1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB converter
4  *  plus:      Software PAL8 to RGB converter
5  *             Software YUV to YUV converter
6  *             Software YUV to RGB converter
7  *  Written by Nick Kurshev.
8  *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9  *  lots of big-endian byte-order fixes by Alex Beregszaszi
10  */
11
12 #include <stddef.h>
13 #include <inttypes.h> /* for __WORDSIZE */
14
15 #ifndef __WORDSIZE
16 // #warning You have a misconfigured system and will probably lose performance!
17 #define __WORDSIZE MP_WORDSIZE
18 #endif
19
20 #undef PREFETCH
21 #undef MOVNTQ
22 #undef EMMS
23 #undef SFENCE
24 #undef MMREG_SIZE
25 #undef PREFETCHW
26 #undef PAVGB
27
28 #ifdef HAVE_SSE2
29 #define MMREG_SIZE 16
30 #else
31 #define MMREG_SIZE 8
32 #endif
33
34 #ifdef HAVE_3DNOW
35 #define PREFETCH  "prefetch"
36 #define PREFETCHW "prefetchw"
37 #define PAVGB     "pavgusb"
38 #elif defined ( HAVE_MMX2 )
39 #define PREFETCH "prefetchnta"
40 #define PREFETCHW "prefetcht0"
41 #define PAVGB     "pavgb"
42 #else
43 #define PREFETCH "/nop"
44 #define PREFETCHW "/nop"
45 #endif
46
47 #ifdef HAVE_3DNOW
48 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
49 #define EMMS     "femms"
50 #else
51 #define EMMS     "emms"
52 #endif
53
54 #ifdef HAVE_MMX2
55 #define MOVNTQ "movntq"
56 #define SFENCE "sfence"
57 #else
58 #define MOVNTQ "movq"
59 #define SFENCE "/nop"
60 #endif
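/*
 * Illustrative note (not part of the original file): these macros are spliced
 * into the inline-asm strings below by plain string concatenation, so the same
 * source builds with either cached or non-temporal stores, e.g.
 *
 *   __asm __volatile(MOVNTQ" %%mm0, %0" : "=m"(*dest) :: "memory");
 *   ...
 *   __asm __volatile(SFENCE:::"memory");   // make the streaming stores globally visible
 *   __asm __volatile(EMMS:::"memory");     // leave MMX state (femms on 3DNow!)
 *
 * With HAVE_MMX2, MOVNTQ/SFENCE expand to movntq/sfence; otherwise they fall
 * back to a plain movq and a "/nop" placeholder.
 */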
61
62 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
63 {
64   uint8_t *dest = dst;
65   const uint8_t *s = src;
66   const uint8_t *end;
67 #ifdef HAVE_MMX
68   const uint8_t *mm_end;
69 #endif
70   end = s + src_size;
71 #ifdef HAVE_MMX
72   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
73   mm_end = end - 23;
74   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
75   while(s < mm_end)
76   {
77     __asm __volatile(
78         PREFETCH"       32%1\n\t"
79         "movd   %1, %%mm0\n\t"
80         "punpckldq 3%1, %%mm0\n\t"
81         "movd   6%1, %%mm1\n\t"
82         "punpckldq 9%1, %%mm1\n\t"
83         "movd   12%1, %%mm2\n\t"
84         "punpckldq 15%1, %%mm2\n\t"
85         "movd   18%1, %%mm3\n\t"
86         "punpckldq 21%1, %%mm3\n\t"
87         "pand   %%mm7, %%mm0\n\t"
88         "pand   %%mm7, %%mm1\n\t"
89         "pand   %%mm7, %%mm2\n\t"
90         "pand   %%mm7, %%mm3\n\t"
91         MOVNTQ" %%mm0, %0\n\t"
92         MOVNTQ" %%mm1, 8%0\n\t"
93         MOVNTQ" %%mm2, 16%0\n\t"
94         MOVNTQ" %%mm3, 24%0"
95         :"=m"(*dest)
96         :"m"(*s)
97         :"memory");
98     dest += 32;
99     s += 24;
100   }
101   __asm __volatile(SFENCE:::"memory");
102   __asm __volatile(EMMS:::"memory");
103 #endif
104   while(s < end)
105   {
106 #ifdef WORDS_BIGENDIAN
107     *dest++ = 0;
108     *dest++ = *s++;
109     *dest++ = *s++;
110     *dest++ = *s++;
111 #else
112     *dest++ = *s++;
113     *dest++ = *s++;
114     *dest++ = *s++;
115     *dest++ = 0;
116 #endif
117   }
118 }
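/*
 * Minimal usage sketch (illustrative only; the buffer names and sizes are
 * hypothetical, not part of this file): src_size is the size of the packed
 * 24bpp input in bytes, so the 32bpp output needs src_size/3*4 bytes. This
 * template is instantiated once per CPU variant through RENAME().
 *
 *   uint8_t in24 [640*480*3];   // hypothetical packed 24bpp frame
 *   uint8_t out32[640*480*4];   // one extra (zeroed) byte per pixel
 *   RENAME(rgb24to32)(in24, out32, sizeof(in24));
 */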
119
120 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
121 {
122   uint8_t *dest = dst;
123   const uint8_t *s = src;
124   const uint8_t *end;
125 #ifdef HAVE_MMX
126   const uint8_t *mm_end;
127 #endif
128   end = s + src_size;
129 #ifdef HAVE_MMX
130   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
131   mm_end = end - 31;
132   while(s < mm_end)
133   {
134     __asm __volatile(
135         PREFETCH"       32%1\n\t"
136         "movq   %1, %%mm0\n\t"
137         "movq   8%1, %%mm1\n\t"
138         "movq   16%1, %%mm4\n\t"
139         "movq   24%1, %%mm5\n\t"
140         "movq   %%mm0, %%mm2\n\t"
141         "movq   %%mm1, %%mm3\n\t"
142         "movq   %%mm4, %%mm6\n\t"
143         "movq   %%mm5, %%mm7\n\t"
144         "psrlq  $8, %%mm2\n\t"
145         "psrlq  $8, %%mm3\n\t"
146         "psrlq  $8, %%mm6\n\t"
147         "psrlq  $8, %%mm7\n\t"
148         "pand   %2, %%mm0\n\t"
149         "pand   %2, %%mm1\n\t"
150         "pand   %2, %%mm4\n\t"
151         "pand   %2, %%mm5\n\t"
152         "pand   %3, %%mm2\n\t"
153         "pand   %3, %%mm3\n\t"
154         "pand   %3, %%mm6\n\t"
155         "pand   %3, %%mm7\n\t"
156         "por    %%mm2, %%mm0\n\t"
157         "por    %%mm3, %%mm1\n\t"
158         "por    %%mm6, %%mm4\n\t"
159         "por    %%mm7, %%mm5\n\t"
160
161         "movq   %%mm1, %%mm2\n\t"
162         "movq   %%mm4, %%mm3\n\t"
163         "psllq  $48, %%mm2\n\t"
164         "psllq  $32, %%mm3\n\t"
165         "pand   %4, %%mm2\n\t"
166         "pand   %5, %%mm3\n\t"
167         "por    %%mm2, %%mm0\n\t"
168         "psrlq  $16, %%mm1\n\t"
169         "psrlq  $32, %%mm4\n\t"
170         "psllq  $16, %%mm5\n\t"
171         "por    %%mm3, %%mm1\n\t"
172         "pand   %6, %%mm5\n\t"
173         "por    %%mm5, %%mm4\n\t"
174
175         MOVNTQ" %%mm0, %0\n\t"
176         MOVNTQ" %%mm1, 8%0\n\t"
177         MOVNTQ" %%mm4, 16%0"
178         :"=m"(*dest)
179         :"m"(*s),"m"(mask24l),
180          "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
181         :"memory");
182     dest += 24;
183     s += 32;
184   }
185   __asm __volatile(SFENCE:::"memory");
186   __asm __volatile(EMMS:::"memory");
187 #endif
188   while(s < end)
189   {
190 #ifdef WORDS_BIGENDIAN
191     s++;
192     *dest++ = *s++;
193     *dest++ = *s++;
194     *dest++ = *s++;
195 #else
196     *dest++ = *s++;
197     *dest++ = *s++;
198     *dest++ = *s++;
199     s++;
200 #endif
201   }
202 }
203
204 /*
205  Original by Strepto/Astral
206  ported to gcc & bug-fixed by A'rpi
207  MMX2, 3DNOW optimization by Nick Kurshev
208  32-bit C version and the and&add trick by Michael Niedermayer
209 */
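/*
 * Worked example of the and&add trick used in rgb15to16 below (illustrative):
 * a 15-bit pixel is 0RRRRRGGGGGBBBBB.  (x & 0x7FFF) keeps the whole pixel and
 * (x & 0x7FE0) selects just the R and G fields; adding the two doubles R and G,
 * i.e. shifts them up by one bit into the 16-bit RRRRRGGGGGGBBBBB layout, while
 * B stays in place and the new low green bit comes out as 0.  For example:
 *
 *   x = 0x7FFF  ->  (x & 0x7FFF) + (x & 0x7FE0) = 0x7FFF + 0x7FE0 = 0xFFDF
 *               =   11111 111110 11111  (R = 0x1F, G = 0x3E, B = 0x1F)
 */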
210 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
211 {
212   register const uint8_t* s=src;
213   register uint8_t* d=dst;
214   register const uint8_t *end;
215   const uint8_t *mm_end;
216   end = s + src_size;
217 #ifdef HAVE_MMX
218   __asm __volatile(PREFETCH"    %0"::"m"(*s));
219   __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
220   mm_end = end - 15;
221   while(s<mm_end)
222   {
223         __asm __volatile(
224                 PREFETCH"       32%1\n\t"
225                 "movq   %1, %%mm0\n\t"
226                 "movq   8%1, %%mm2\n\t"
227                 "movq   %%mm0, %%mm1\n\t"
228                 "movq   %%mm2, %%mm3\n\t"
229                 "pand   %%mm4, %%mm0\n\t"
230                 "pand   %%mm4, %%mm2\n\t"
231                 "paddw  %%mm1, %%mm0\n\t"
232                 "paddw  %%mm3, %%mm2\n\t"
233                 MOVNTQ" %%mm0, %0\n\t"
234                 MOVNTQ" %%mm2, 8%0"
235                 :"=m"(*d)
236                 :"m"(*s)
237                 );
238         d+=16;
239         s+=16;
240   }
241   __asm __volatile(SFENCE:::"memory");
242   __asm __volatile(EMMS:::"memory");
243 #endif
244     mm_end = end - 3;
245     while(s < mm_end)
246     {
247         register unsigned x= *((uint32_t *)s);
248         *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
249         d+=4;
250         s+=4;
251     }
252     if(s < end)
253     {
254         register unsigned short x= *((uint16_t *)s);
255         *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
256     }
257 }
258
259 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
260 {
261   register const uint8_t* s=src;
262   register uint8_t* d=dst;
263   register const uint8_t *end;
264   const uint8_t *mm_end;
265   end = s + src_size;
266 #ifdef HAVE_MMX
267   __asm __volatile(PREFETCH"    %0"::"m"(*s));
268   __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
269   __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
270   mm_end = end - 15;
271   while(s<mm_end)
272   {
273         __asm __volatile(
274                 PREFETCH"       32%1\n\t"
275                 "movq   %1, %%mm0\n\t"
276                 "movq   8%1, %%mm2\n\t"
277                 "movq   %%mm0, %%mm1\n\t"
278                 "movq   %%mm2, %%mm3\n\t"
279                 "psrlq  $1, %%mm0\n\t"
280                 "psrlq  $1, %%mm2\n\t"
281                 "pand   %%mm7, %%mm0\n\t"
282                 "pand   %%mm7, %%mm2\n\t"
283                 "pand   %%mm6, %%mm1\n\t"
284                 "pand   %%mm6, %%mm3\n\t"
285                 "por    %%mm1, %%mm0\n\t"
286                 "por    %%mm3, %%mm2\n\t"
287                 MOVNTQ" %%mm0, %0\n\t"
288                 MOVNTQ" %%mm2, 8%0"
289                 :"=m"(*d)
290                 :"m"(*s)
291                 );
292         d+=16;
293         s+=16;
294   }
295   __asm __volatile(SFENCE:::"memory");
296   __asm __volatile(EMMS:::"memory");
297 #endif
298     mm_end = end - 3;
299     while(s < mm_end)
300     {
301         register uint32_t x= *((uint32_t *)s);
302         *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
303         s+=4;
304         d+=4;
305     }
306     if(s < end)
307     {
308         register uint16_t x= *((uint16_t *)s);
309         *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
310         s+=2;
311         d+=2;
312     }
313 }
314
315 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
316 {
317         const uint8_t *s = src;
318         const uint8_t *end;
319 #ifdef HAVE_MMX
320         const uint8_t *mm_end;
321 #endif
322         uint16_t *d = (uint16_t *)dst;
323         end = s + src_size;
324 #ifdef HAVE_MMX
325         mm_end = end - 15;
326 #if 1 //is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
327         asm volatile(
328                 "movq %3, %%mm5                 \n\t"
329                 "movq %4, %%mm6                 \n\t"
330                 "movq %5, %%mm7                 \n\t"
331                 ".balign 16                     \n\t"
332                 "1:                             \n\t"
333                 PREFETCH" 32(%1)                \n\t"
334                 "movd   (%1), %%mm0             \n\t"
335                 "movd   4(%1), %%mm3            \n\t"
336                 "punpckldq 8(%1), %%mm0         \n\t"
337                 "punpckldq 12(%1), %%mm3        \n\t"
338                 "movq %%mm0, %%mm1              \n\t"
339                 "movq %%mm3, %%mm4              \n\t"
340                 "pand %%mm6, %%mm0              \n\t"
341                 "pand %%mm6, %%mm3              \n\t"
342                 "pmaddwd %%mm7, %%mm0           \n\t"
343                 "pmaddwd %%mm7, %%mm3           \n\t"
344                 "pand %%mm5, %%mm1              \n\t"
345                 "pand %%mm5, %%mm4              \n\t"
346                 "por %%mm1, %%mm0               \n\t"   
347                 "por %%mm4, %%mm3               \n\t"
348                 "psrld $5, %%mm0                \n\t"
349                 "pslld $11, %%mm3               \n\t"
350                 "por %%mm3, %%mm0               \n\t"
351                 MOVNTQ" %%mm0, (%0)             \n\t"
352                 "add $16, %1                    \n\t"
353                 "add $8, %0                     \n\t"
354                 "cmp %2, %1                     \n\t"
355                 " jb 1b                         \n\t"
356                 : "+r" (d), "+r"(s)
357                 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
358         );
359 #else
360         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
361         __asm __volatile(
362             "movq       %0, %%mm7\n\t"
363             "movq       %1, %%mm6\n\t"
364             ::"m"(red_16mask),"m"(green_16mask));
365         while(s < mm_end)
366         {
367             __asm __volatile(
368                 PREFETCH" 32%1\n\t"
369                 "movd   %1, %%mm0\n\t"
370                 "movd   4%1, %%mm3\n\t"
371                 "punpckldq 8%1, %%mm0\n\t"
372                 "punpckldq 12%1, %%mm3\n\t"
373                 "movq   %%mm0, %%mm1\n\t"
374                 "movq   %%mm0, %%mm2\n\t"
375                 "movq   %%mm3, %%mm4\n\t"
376                 "movq   %%mm3, %%mm5\n\t"
377                 "psrlq  $3, %%mm0\n\t"
378                 "psrlq  $3, %%mm3\n\t"
379                 "pand   %2, %%mm0\n\t"
380                 "pand   %2, %%mm3\n\t"
381                 "psrlq  $5, %%mm1\n\t"
382                 "psrlq  $5, %%mm4\n\t"
383                 "pand   %%mm6, %%mm1\n\t"
384                 "pand   %%mm6, %%mm4\n\t"
385                 "psrlq  $8, %%mm2\n\t"
386                 "psrlq  $8, %%mm5\n\t"
387                 "pand   %%mm7, %%mm2\n\t"
388                 "pand   %%mm7, %%mm5\n\t"
389                 "por    %%mm1, %%mm0\n\t"
390                 "por    %%mm4, %%mm3\n\t"
391                 "por    %%mm2, %%mm0\n\t"
392                 "por    %%mm5, %%mm3\n\t"
393                 "psllq  $16, %%mm3\n\t"
394                 "por    %%mm3, %%mm0\n\t"
395                 MOVNTQ" %%mm0, %0\n\t"
396                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
397                 d += 4;
398                 s += 16;
399         }
400 #endif
401         __asm __volatile(SFENCE:::"memory");
402         __asm __volatile(EMMS:::"memory");
403 #endif
404         while(s < end)
405         {
406                 register int rgb = *(uint32_t*)s; s += 4;
407                 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
408         }
409 }
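/*
 * Note on the #if 1 path above (illustrative, not part of the original file):
 * pmaddwd multiplies the two masked 16-bit halves of each pixel by different
 * power-of-two constants and sums the products, so a single multiply-add
 * aligns both colour components at once where the scalar code needs separate
 * shifts.  For one 32-bit pixel as loaded from memory (little-endian), the
 * scalar packing in the tail loop above is simply:
 *
 *   uint32_t rgb = ...;                      // one 32-bit source pixel
 *   uint16_t p = ((rgb & 0x0000FF) >> 3)     // low byte  -> bits  0..4
 *              | ((rgb & 0x00FC00) >> 5)     // mid byte  -> bits  5..10
 *              | ((rgb & 0xF80000) >> 8);    // high byte -> bits 11..15
 */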
410
411 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
412 {
413         const uint8_t *s = src;
414         const uint8_t *end;
415 #ifdef HAVE_MMX
416         const uint8_t *mm_end;
417 #endif
418         uint16_t *d = (uint16_t *)dst;
419         end = s + src_size;
420 #ifdef HAVE_MMX
421         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
422         __asm __volatile(
423             "movq       %0, %%mm7\n\t"
424             "movq       %1, %%mm6\n\t"
425             ::"m"(red_16mask),"m"(green_16mask));
426         mm_end = end - 15;
427         while(s < mm_end)
428         {
429             __asm __volatile(
430                 PREFETCH" 32%1\n\t"
431                 "movd   %1, %%mm0\n\t"
432                 "movd   4%1, %%mm3\n\t"
433                 "punpckldq 8%1, %%mm0\n\t"
434                 "punpckldq 12%1, %%mm3\n\t"
435                 "movq   %%mm0, %%mm1\n\t"
436                 "movq   %%mm0, %%mm2\n\t"
437                 "movq   %%mm3, %%mm4\n\t"
438                 "movq   %%mm3, %%mm5\n\t"
439                 "psllq  $8, %%mm0\n\t"
440                 "psllq  $8, %%mm3\n\t"
441                 "pand   %%mm7, %%mm0\n\t"
442                 "pand   %%mm7, %%mm3\n\t"
443                 "psrlq  $5, %%mm1\n\t"
444                 "psrlq  $5, %%mm4\n\t"
445                 "pand   %%mm6, %%mm1\n\t"
446                 "pand   %%mm6, %%mm4\n\t"
447                 "psrlq  $19, %%mm2\n\t"
448                 "psrlq  $19, %%mm5\n\t"
449                 "pand   %2, %%mm2\n\t"
450                 "pand   %2, %%mm5\n\t"
451                 "por    %%mm1, %%mm0\n\t"
452                 "por    %%mm4, %%mm3\n\t"
453                 "por    %%mm2, %%mm0\n\t"
454                 "por    %%mm5, %%mm3\n\t"
455                 "psllq  $16, %%mm3\n\t"
456                 "por    %%mm3, %%mm0\n\t"
457                 MOVNTQ" %%mm0, %0\n\t"
458                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
459                 d += 4;
460                 s += 16;
461         }
462         __asm __volatile(SFENCE:::"memory");
463         __asm __volatile(EMMS:::"memory");
464 #endif
465         while(s < end)
466         {
467                 // FIXME on bigendian
468                 const int src= *((uint32_t*)s); s += 4;
469                 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
470         }
471 }
472
473 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
474 {
475         const uint8_t *s = src;
476         const uint8_t *end;
477 #ifdef HAVE_MMX
478         const uint8_t *mm_end;
479 #endif
480         uint16_t *d = (uint16_t *)dst;
481         end = s + src_size;
482 #ifdef HAVE_MMX
483         mm_end = end - 15;
484 #if 1 //is faster only if multiplies are reasonably fast (FIXME: figure out on which CPUs this is faster; on Athlon it is slightly faster)
485         asm volatile(
486                 "movq %3, %%mm5                 \n\t"
487                 "movq %4, %%mm6                 \n\t"
488                 "movq %5, %%mm7                 \n\t"
489                 ".balign 16                     \n\t"
490                 "1:                             \n\t"
491                 PREFETCH" 32(%1)                \n\t"
492                 "movd   (%1), %%mm0             \n\t"
493                 "movd   4(%1), %%mm3            \n\t"
494                 "punpckldq 8(%1), %%mm0         \n\t"
495                 "punpckldq 12(%1), %%mm3        \n\t"
496                 "movq %%mm0, %%mm1              \n\t"
497                 "movq %%mm3, %%mm4              \n\t"
498                 "pand %%mm6, %%mm0              \n\t"
499                 "pand %%mm6, %%mm3              \n\t"
500                 "pmaddwd %%mm7, %%mm0           \n\t"
501                 "pmaddwd %%mm7, %%mm3           \n\t"
502                 "pand %%mm5, %%mm1              \n\t"
503                 "pand %%mm5, %%mm4              \n\t"
504                 "por %%mm1, %%mm0               \n\t"   
505                 "por %%mm4, %%mm3               \n\t"
506                 "psrld $6, %%mm0                \n\t"
507                 "pslld $10, %%mm3               \n\t"
508                 "por %%mm3, %%mm0               \n\t"
509                 MOVNTQ" %%mm0, (%0)             \n\t"
510                 "add $16, %1                    \n\t"
511                 "add $8, %0                     \n\t"
512                 "cmp %2, %1                     \n\t"
513                 " jb 1b                         \n\t"
514                 : "+r" (d), "+r"(s)
515                 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
516         );
517 #else
518         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
519         __asm __volatile(
520             "movq       %0, %%mm7\n\t"
521             "movq       %1, %%mm6\n\t"
522             ::"m"(red_15mask),"m"(green_15mask));
523         while(s < mm_end)
524         {
525             __asm __volatile(
526                 PREFETCH" 32%1\n\t"
527                 "movd   %1, %%mm0\n\t"
528                 "movd   4%1, %%mm3\n\t"
529                 "punpckldq 8%1, %%mm0\n\t"
530                 "punpckldq 12%1, %%mm3\n\t"
531                 "movq   %%mm0, %%mm1\n\t"
532                 "movq   %%mm0, %%mm2\n\t"
533                 "movq   %%mm3, %%mm4\n\t"
534                 "movq   %%mm3, %%mm5\n\t"
535                 "psrlq  $3, %%mm0\n\t"
536                 "psrlq  $3, %%mm3\n\t"
537                 "pand   %2, %%mm0\n\t"
538                 "pand   %2, %%mm3\n\t"
539                 "psrlq  $6, %%mm1\n\t"
540                 "psrlq  $6, %%mm4\n\t"
541                 "pand   %%mm6, %%mm1\n\t"
542                 "pand   %%mm6, %%mm4\n\t"
543                 "psrlq  $9, %%mm2\n\t"
544                 "psrlq  $9, %%mm5\n\t"
545                 "pand   %%mm7, %%mm2\n\t"
546                 "pand   %%mm7, %%mm5\n\t"
547                 "por    %%mm1, %%mm0\n\t"
548                 "por    %%mm4, %%mm3\n\t"
549                 "por    %%mm2, %%mm0\n\t"
550                 "por    %%mm5, %%mm3\n\t"
551                 "psllq  $16, %%mm3\n\t"
552                 "por    %%mm3, %%mm0\n\t"
553                 MOVNTQ" %%mm0, %0\n\t"
554                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
555                 d += 4;
556                 s += 16;
557         }
558 #endif
559         __asm __volatile(SFENCE:::"memory");
560         __asm __volatile(EMMS:::"memory");
561 #endif
562         while(s < end)
563         {
564                 // FIXME on bigendian
565                 const int src= *((uint32_t*)s); s += 4;
566                 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
567         }
568 }
569
570 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
571 {
572         const uint8_t *s = src;
573         const uint8_t *end;
574 #ifdef HAVE_MMX
575         const uint8_t *mm_end;
576 #endif
577         uint16_t *d = (uint16_t *)dst;
578         end = s + src_size;
579 #ifdef HAVE_MMX
580         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
581         __asm __volatile(
582             "movq       %0, %%mm7\n\t"
583             "movq       %1, %%mm6\n\t"
584             ::"m"(red_15mask),"m"(green_15mask));
585         mm_end = end - 15;
586         while(s < mm_end)
587         {
588             __asm __volatile(
589                 PREFETCH" 32%1\n\t"
590                 "movd   %1, %%mm0\n\t"
591                 "movd   4%1, %%mm3\n\t"
592                 "punpckldq 8%1, %%mm0\n\t"
593                 "punpckldq 12%1, %%mm3\n\t"
594                 "movq   %%mm0, %%mm1\n\t"
595                 "movq   %%mm0, %%mm2\n\t"
596                 "movq   %%mm3, %%mm4\n\t"
597                 "movq   %%mm3, %%mm5\n\t"
598                 "psllq  $7, %%mm0\n\t"
599                 "psllq  $7, %%mm3\n\t"
600                 "pand   %%mm7, %%mm0\n\t"
601                 "pand   %%mm7, %%mm3\n\t"
602                 "psrlq  $6, %%mm1\n\t"
603                 "psrlq  $6, %%mm4\n\t"
604                 "pand   %%mm6, %%mm1\n\t"
605                 "pand   %%mm6, %%mm4\n\t"
606                 "psrlq  $19, %%mm2\n\t"
607                 "psrlq  $19, %%mm5\n\t"
608                 "pand   %2, %%mm2\n\t"
609                 "pand   %2, %%mm5\n\t"
610                 "por    %%mm1, %%mm0\n\t"
611                 "por    %%mm4, %%mm3\n\t"
612                 "por    %%mm2, %%mm0\n\t"
613                 "por    %%mm5, %%mm3\n\t"
614                 "psllq  $16, %%mm3\n\t"
615                 "por    %%mm3, %%mm0\n\t"
616                 MOVNTQ" %%mm0, %0\n\t"
617                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
618                 d += 4;
619                 s += 16;
620         }
621         __asm __volatile(SFENCE:::"memory");
622         __asm __volatile(EMMS:::"memory");
623 #endif
624         while(s < end)
625         {
626                 // FIXME on bigendian
627                 const int src= *((uint32_t*)s); s += 4;
628                 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
629         }
630 }
631
632 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
633 {
634         const uint8_t *s = src;
635         const uint8_t *end;
636 #ifdef HAVE_MMX
637         const uint8_t *mm_end;
638 #endif
639         uint16_t *d = (uint16_t *)dst;
640         end = s + src_size;
641 #ifdef HAVE_MMX
642         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
643         __asm __volatile(
644             "movq       %0, %%mm7\n\t"
645             "movq       %1, %%mm6\n\t"
646             ::"m"(red_16mask),"m"(green_16mask));
647         mm_end = end - 11;
648         while(s < mm_end)
649         {
650             __asm __volatile(
651                 PREFETCH" 32%1\n\t"
652                 "movd   %1, %%mm0\n\t"
653                 "movd   3%1, %%mm3\n\t"
654                 "punpckldq 6%1, %%mm0\n\t"
655                 "punpckldq 9%1, %%mm3\n\t"
656                 "movq   %%mm0, %%mm1\n\t"
657                 "movq   %%mm0, %%mm2\n\t"
658                 "movq   %%mm3, %%mm4\n\t"
659                 "movq   %%mm3, %%mm5\n\t"
660                 "psrlq  $3, %%mm0\n\t"
661                 "psrlq  $3, %%mm3\n\t"
662                 "pand   %2, %%mm0\n\t"
663                 "pand   %2, %%mm3\n\t"
664                 "psrlq  $5, %%mm1\n\t"
665                 "psrlq  $5, %%mm4\n\t"
666                 "pand   %%mm6, %%mm1\n\t"
667                 "pand   %%mm6, %%mm4\n\t"
668                 "psrlq  $8, %%mm2\n\t"
669                 "psrlq  $8, %%mm5\n\t"
670                 "pand   %%mm7, %%mm2\n\t"
671                 "pand   %%mm7, %%mm5\n\t"
672                 "por    %%mm1, %%mm0\n\t"
673                 "por    %%mm4, %%mm3\n\t"
674                 "por    %%mm2, %%mm0\n\t"
675                 "por    %%mm5, %%mm3\n\t"
676                 "psllq  $16, %%mm3\n\t"
677                 "por    %%mm3, %%mm0\n\t"
678                 MOVNTQ" %%mm0, %0\n\t"
679                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
680                 d += 4;
681                 s += 12;
682         }
683         __asm __volatile(SFENCE:::"memory");
684         __asm __volatile(EMMS:::"memory");
685 #endif
686         while(s < end)
687         {
688                 const int b= *s++;
689                 const int g= *s++;
690                 const int r= *s++;
691                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
692         }
693 }
694
695 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
696 {
697         const uint8_t *s = src;
698         const uint8_t *end;
699 #ifdef HAVE_MMX
700         const uint8_t *mm_end;
701 #endif
702         uint16_t *d = (uint16_t *)dst;
703         end = s + src_size;
704 #ifdef HAVE_MMX
705         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
706         __asm __volatile(
707             "movq       %0, %%mm7\n\t"
708             "movq       %1, %%mm6\n\t"
709             ::"m"(red_16mask),"m"(green_16mask));
710         mm_end = end - 15;
711         while(s < mm_end)
712         {
713             __asm __volatile(
714                 PREFETCH" 32%1\n\t"
715                 "movd   %1, %%mm0\n\t"
716                 "movd   3%1, %%mm3\n\t"
717                 "punpckldq 6%1, %%mm0\n\t"
718                 "punpckldq 9%1, %%mm3\n\t"
719                 "movq   %%mm0, %%mm1\n\t"
720                 "movq   %%mm0, %%mm2\n\t"
721                 "movq   %%mm3, %%mm4\n\t"
722                 "movq   %%mm3, %%mm5\n\t"
723                 "psllq  $8, %%mm0\n\t"
724                 "psllq  $8, %%mm3\n\t"
725                 "pand   %%mm7, %%mm0\n\t"
726                 "pand   %%mm7, %%mm3\n\t"
727                 "psrlq  $5, %%mm1\n\t"
728                 "psrlq  $5, %%mm4\n\t"
729                 "pand   %%mm6, %%mm1\n\t"
730                 "pand   %%mm6, %%mm4\n\t"
731                 "psrlq  $19, %%mm2\n\t"
732                 "psrlq  $19, %%mm5\n\t"
733                 "pand   %2, %%mm2\n\t"
734                 "pand   %2, %%mm5\n\t"
735                 "por    %%mm1, %%mm0\n\t"
736                 "por    %%mm4, %%mm3\n\t"
737                 "por    %%mm2, %%mm0\n\t"
738                 "por    %%mm5, %%mm3\n\t"
739                 "psllq  $16, %%mm3\n\t"
740                 "por    %%mm3, %%mm0\n\t"
741                 MOVNTQ" %%mm0, %0\n\t"
742                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
743                 d += 4;
744                 s += 12;
745         }
746         __asm __volatile(SFENCE:::"memory");
747         __asm __volatile(EMMS:::"memory");
748 #endif
749         while(s < end)
750         {
751                 const int r= *s++;
752                 const int g= *s++;
753                 const int b= *s++;
754                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
755         }
756 }
757
758 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
759 {
760         const uint8_t *s = src;
761         const uint8_t *end;
762 #ifdef HAVE_MMX
763         const uint8_t *mm_end;
764 #endif
765         uint16_t *d = (uint16_t *)dst;
766         end = s + src_size;
767 #ifdef HAVE_MMX
768         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
769         __asm __volatile(
770             "movq       %0, %%mm7\n\t"
771             "movq       %1, %%mm6\n\t"
772             ::"m"(red_15mask),"m"(green_15mask));
773         mm_end = end - 11;
774         while(s < mm_end)
775         {
776             __asm __volatile(
777                 PREFETCH" 32%1\n\t"
778                 "movd   %1, %%mm0\n\t"
779                 "movd   3%1, %%mm3\n\t"
780                 "punpckldq 6%1, %%mm0\n\t"
781                 "punpckldq 9%1, %%mm3\n\t"
782                 "movq   %%mm0, %%mm1\n\t"
783                 "movq   %%mm0, %%mm2\n\t"
784                 "movq   %%mm3, %%mm4\n\t"
785                 "movq   %%mm3, %%mm5\n\t"
786                 "psrlq  $3, %%mm0\n\t"
787                 "psrlq  $3, %%mm3\n\t"
788                 "pand   %2, %%mm0\n\t"
789                 "pand   %2, %%mm3\n\t"
790                 "psrlq  $6, %%mm1\n\t"
791                 "psrlq  $6, %%mm4\n\t"
792                 "pand   %%mm6, %%mm1\n\t"
793                 "pand   %%mm6, %%mm4\n\t"
794                 "psrlq  $9, %%mm2\n\t"
795                 "psrlq  $9, %%mm5\n\t"
796                 "pand   %%mm7, %%mm2\n\t"
797                 "pand   %%mm7, %%mm5\n\t"
798                 "por    %%mm1, %%mm0\n\t"
799                 "por    %%mm4, %%mm3\n\t"
800                 "por    %%mm2, %%mm0\n\t"
801                 "por    %%mm5, %%mm3\n\t"
802                 "psllq  $16, %%mm3\n\t"
803                 "por    %%mm3, %%mm0\n\t"
804                 MOVNTQ" %%mm0, %0\n\t"
805                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
806                 d += 4;
807                 s += 12;
808         }
809         __asm __volatile(SFENCE:::"memory");
810         __asm __volatile(EMMS:::"memory");
811 #endif
812         while(s < end)
813         {
814                 const int b= *s++;
815                 const int g= *s++;
816                 const int r= *s++;
817                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
818         }
819 }
820
821 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
822 {
823         const uint8_t *s = src;
824         const uint8_t *end;
825 #ifdef HAVE_MMX
826         const uint8_t *mm_end;
827 #endif
828         uint16_t *d = (uint16_t *)dst;
829         end = s + src_size;
830 #ifdef HAVE_MMX
831         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
832         __asm __volatile(
833             "movq       %0, %%mm7\n\t"
834             "movq       %1, %%mm6\n\t"
835             ::"m"(red_15mask),"m"(green_15mask));
836         mm_end = end - 15;
837         while(s < mm_end)
838         {
839             __asm __volatile(
840                 PREFETCH" 32%1\n\t"
841                 "movd   %1, %%mm0\n\t"
842                 "movd   3%1, %%mm3\n\t"
843                 "punpckldq 6%1, %%mm0\n\t"
844                 "punpckldq 9%1, %%mm3\n\t"
845                 "movq   %%mm0, %%mm1\n\t"
846                 "movq   %%mm0, %%mm2\n\t"
847                 "movq   %%mm3, %%mm4\n\t"
848                 "movq   %%mm3, %%mm5\n\t"
849                 "psllq  $7, %%mm0\n\t"
850                 "psllq  $7, %%mm3\n\t"
851                 "pand   %%mm7, %%mm0\n\t"
852                 "pand   %%mm7, %%mm3\n\t"
853                 "psrlq  $6, %%mm1\n\t"
854                 "psrlq  $6, %%mm4\n\t"
855                 "pand   %%mm6, %%mm1\n\t"
856                 "pand   %%mm6, %%mm4\n\t"
857                 "psrlq  $19, %%mm2\n\t"
858                 "psrlq  $19, %%mm5\n\t"
859                 "pand   %2, %%mm2\n\t"
860                 "pand   %2, %%mm5\n\t"
861                 "por    %%mm1, %%mm0\n\t"
862                 "por    %%mm4, %%mm3\n\t"
863                 "por    %%mm2, %%mm0\n\t"
864                 "por    %%mm5, %%mm3\n\t"
865                 "psllq  $16, %%mm3\n\t"
866                 "por    %%mm3, %%mm0\n\t"
867                 MOVNTQ" %%mm0, %0\n\t"
868                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
869                 d += 4;
870                 s += 12;
871         }
872         __asm __volatile(SFENCE:::"memory");
873         __asm __volatile(EMMS:::"memory");
874 #endif
875         while(s < end)
876         {
877                 const int r= *s++;
878                 const int g= *s++;
879                 const int b= *s++;
880                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
881         }
882 }
883
884 /*
885   Here a less accurate approximation is used: simply
886   left-shifting the input
887   value and filling the low-order bits with
888   zeroes. This method improves PNG
889   compression, but this scheme cannot reproduce white exactly, since it does not
890   generate an all-ones maximum value; the net effect is to darken the
891   image slightly.
892
893   A better method would be "left bit replication":
894
895    4 3 2 1 0
896    ---------
897    1 1 0 1 1
898
899    7 6 5 4 3  2 1 0
900    ----------------
901    1 1 0 1 1  1 1 0
902    |=======|  |===|
903        |      Leftmost Bits Repeated to Fill Open Bits
904        |
905    Original Bits
906 */
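/*
 * Illustrative sketch of that left bit replication (a hypothetical helper,
 * not used by the code below): expanding a 5-bit channel value v to 8 bits
 * so that 0x1F maps to a full 0xFF would look like
 *
 *   static inline uint8_t expand5to8(uint8_t v)      // hypothetical helper
 *   {
 *       return (uint8_t)((v << 3) | (v >> 2));       // repeat the top 3 bits
 *   }
 *
 * e.g. 11011 -> 11011110, exactly as in the diagram above, whereas the
 * converters below simply use (v << 3) and zero-fill the low bits.
 */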
907 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
908 {
909         const uint16_t *end;
910 #ifdef HAVE_MMX
911         const uint16_t *mm_end;
912 #endif
913         uint8_t *d = (uint8_t *)dst;
914         const uint16_t *s = (uint16_t *)src;
915         end = s + src_size/2;
916 #ifdef HAVE_MMX
917         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
918         mm_end = end - 7;
919         while(s < mm_end)
920         {
921             __asm __volatile(
922                 PREFETCH" 32%1\n\t"
923                 "movq   %1, %%mm0\n\t"
924                 "movq   %1, %%mm1\n\t"
925                 "movq   %1, %%mm2\n\t"
926                 "pand   %2, %%mm0\n\t"
927                 "pand   %3, %%mm1\n\t"
928                 "pand   %4, %%mm2\n\t"
929                 "psllq  $3, %%mm0\n\t"
930                 "psrlq  $2, %%mm1\n\t"
931                 "psrlq  $7, %%mm2\n\t"
932                 "movq   %%mm0, %%mm3\n\t"
933                 "movq   %%mm1, %%mm4\n\t"
934                 "movq   %%mm2, %%mm5\n\t"
935                 "punpcklwd %5, %%mm0\n\t"
936                 "punpcklwd %5, %%mm1\n\t"
937                 "punpcklwd %5, %%mm2\n\t"
938                 "punpckhwd %5, %%mm3\n\t"
939                 "punpckhwd %5, %%mm4\n\t"
940                 "punpckhwd %5, %%mm5\n\t"
941                 "psllq  $8, %%mm1\n\t"
942                 "psllq  $16, %%mm2\n\t"
943                 "por    %%mm1, %%mm0\n\t"
944                 "por    %%mm2, %%mm0\n\t"
945                 "psllq  $8, %%mm4\n\t"
946                 "psllq  $16, %%mm5\n\t"
947                 "por    %%mm4, %%mm3\n\t"
948                 "por    %%mm5, %%mm3\n\t"
949
950                 "movq   %%mm0, %%mm6\n\t"
951                 "movq   %%mm3, %%mm7\n\t"
952                 
953                 "movq   8%1, %%mm0\n\t"
954                 "movq   8%1, %%mm1\n\t"
955                 "movq   8%1, %%mm2\n\t"
956                 "pand   %2, %%mm0\n\t"
957                 "pand   %3, %%mm1\n\t"
958                 "pand   %4, %%mm2\n\t"
959                 "psllq  $3, %%mm0\n\t"
960                 "psrlq  $2, %%mm1\n\t"
961                 "psrlq  $7, %%mm2\n\t"
962                 "movq   %%mm0, %%mm3\n\t"
963                 "movq   %%mm1, %%mm4\n\t"
964                 "movq   %%mm2, %%mm5\n\t"
965                 "punpcklwd %5, %%mm0\n\t"
966                 "punpcklwd %5, %%mm1\n\t"
967                 "punpcklwd %5, %%mm2\n\t"
968                 "punpckhwd %5, %%mm3\n\t"
969                 "punpckhwd %5, %%mm4\n\t"
970                 "punpckhwd %5, %%mm5\n\t"
971                 "psllq  $8, %%mm1\n\t"
972                 "psllq  $16, %%mm2\n\t"
973                 "por    %%mm1, %%mm0\n\t"
974                 "por    %%mm2, %%mm0\n\t"
975                 "psllq  $8, %%mm4\n\t"
976                 "psllq  $16, %%mm5\n\t"
977                 "por    %%mm4, %%mm3\n\t"
978                 "por    %%mm5, %%mm3\n\t"
979
980                 :"=m"(*d)
981                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
982                 :"memory");
983             /* Borrowed from the 32-to-24 bit packing (see rgb32to24 above) */
984             __asm __volatile(
985                 "movq   %%mm0, %%mm4\n\t"
986                 "movq   %%mm3, %%mm5\n\t"
987                 "movq   %%mm6, %%mm0\n\t"
988                 "movq   %%mm7, %%mm1\n\t"
989                 
990                 "movq   %%mm4, %%mm6\n\t"
991                 "movq   %%mm5, %%mm7\n\t"
992                 "movq   %%mm0, %%mm2\n\t"
993                 "movq   %%mm1, %%mm3\n\t"
994
995                 "psrlq  $8, %%mm2\n\t"
996                 "psrlq  $8, %%mm3\n\t"
997                 "psrlq  $8, %%mm6\n\t"
998                 "psrlq  $8, %%mm7\n\t"
999                 "pand   %2, %%mm0\n\t"
1000                 "pand   %2, %%mm1\n\t"
1001                 "pand   %2, %%mm4\n\t"
1002                 "pand   %2, %%mm5\n\t"
1003                 "pand   %3, %%mm2\n\t"
1004                 "pand   %3, %%mm3\n\t"
1005                 "pand   %3, %%mm6\n\t"
1006                 "pand   %3, %%mm7\n\t"
1007                 "por    %%mm2, %%mm0\n\t"
1008                 "por    %%mm3, %%mm1\n\t"
1009                 "por    %%mm6, %%mm4\n\t"
1010                 "por    %%mm7, %%mm5\n\t"
1011
1012                 "movq   %%mm1, %%mm2\n\t"
1013                 "movq   %%mm4, %%mm3\n\t"
1014                 "psllq  $48, %%mm2\n\t"
1015                 "psllq  $32, %%mm3\n\t"
1016                 "pand   %4, %%mm2\n\t"
1017                 "pand   %5, %%mm3\n\t"
1018                 "por    %%mm2, %%mm0\n\t"
1019                 "psrlq  $16, %%mm1\n\t"
1020                 "psrlq  $32, %%mm4\n\t"
1021                 "psllq  $16, %%mm5\n\t"
1022                 "por    %%mm3, %%mm1\n\t"
1023                 "pand   %6, %%mm5\n\t"
1024                 "por    %%mm5, %%mm4\n\t"
1025
1026                 MOVNTQ" %%mm0, %0\n\t"
1027                 MOVNTQ" %%mm1, 8%0\n\t"
1028                 MOVNTQ" %%mm4, 16%0"
1029
1030                 :"=m"(*d)
1031                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1032                 :"memory");
1033                 d += 24;
1034                 s += 8;
1035         }
1036         __asm __volatile(SFENCE:::"memory");
1037         __asm __volatile(EMMS:::"memory");
1038 #endif
1039         while(s < end)
1040         {
1041                 register uint16_t bgr;
1042                 bgr = *s++;
1043                 *d++ = (bgr&0x1F)<<3;
1044                 *d++ = (bgr&0x3E0)>>2;
1045                 *d++ = (bgr&0x7C00)>>7;
1046         }
1047 }
1048
1049 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1050 {
1051         const uint16_t *end;
1052 #ifdef HAVE_MMX
1053         const uint16_t *mm_end;
1054 #endif
1055         uint8_t *d = (uint8_t *)dst;
1056         const uint16_t *s = (const uint16_t *)src;
1057         end = s + src_size/2;
1058 #ifdef HAVE_MMX
1059         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1060         mm_end = end - 7;
1061         while(s < mm_end)
1062         {
1063             __asm __volatile(
1064                 PREFETCH" 32%1\n\t"
1065                 "movq   %1, %%mm0\n\t"
1066                 "movq   %1, %%mm1\n\t"
1067                 "movq   %1, %%mm2\n\t"
1068                 "pand   %2, %%mm0\n\t"
1069                 "pand   %3, %%mm1\n\t"
1070                 "pand   %4, %%mm2\n\t"
1071                 "psllq  $3, %%mm0\n\t"
1072                 "psrlq  $3, %%mm1\n\t"
1073                 "psrlq  $8, %%mm2\n\t"
1074                 "movq   %%mm0, %%mm3\n\t"
1075                 "movq   %%mm1, %%mm4\n\t"
1076                 "movq   %%mm2, %%mm5\n\t"
1077                 "punpcklwd %5, %%mm0\n\t"
1078                 "punpcklwd %5, %%mm1\n\t"
1079                 "punpcklwd %5, %%mm2\n\t"
1080                 "punpckhwd %5, %%mm3\n\t"
1081                 "punpckhwd %5, %%mm4\n\t"
1082                 "punpckhwd %5, %%mm5\n\t"
1083                 "psllq  $8, %%mm1\n\t"
1084                 "psllq  $16, %%mm2\n\t"
1085                 "por    %%mm1, %%mm0\n\t"
1086                 "por    %%mm2, %%mm0\n\t"
1087                 "psllq  $8, %%mm4\n\t"
1088                 "psllq  $16, %%mm5\n\t"
1089                 "por    %%mm4, %%mm3\n\t"
1090                 "por    %%mm5, %%mm3\n\t"
1091                 
1092                 "movq   %%mm0, %%mm6\n\t"
1093                 "movq   %%mm3, %%mm7\n\t"
1094
1095                 "movq   8%1, %%mm0\n\t"
1096                 "movq   8%1, %%mm1\n\t"
1097                 "movq   8%1, %%mm2\n\t"
1098                 "pand   %2, %%mm0\n\t"
1099                 "pand   %3, %%mm1\n\t"
1100                 "pand   %4, %%mm2\n\t"
1101                 "psllq  $3, %%mm0\n\t"
1102                 "psrlq  $3, %%mm1\n\t"
1103                 "psrlq  $8, %%mm2\n\t"
1104                 "movq   %%mm0, %%mm3\n\t"
1105                 "movq   %%mm1, %%mm4\n\t"
1106                 "movq   %%mm2, %%mm5\n\t"
1107                 "punpcklwd %5, %%mm0\n\t"
1108                 "punpcklwd %5, %%mm1\n\t"
1109                 "punpcklwd %5, %%mm2\n\t"
1110                 "punpckhwd %5, %%mm3\n\t"
1111                 "punpckhwd %5, %%mm4\n\t"
1112                 "punpckhwd %5, %%mm5\n\t"
1113                 "psllq  $8, %%mm1\n\t"
1114                 "psllq  $16, %%mm2\n\t"
1115                 "por    %%mm1, %%mm0\n\t"
1116                 "por    %%mm2, %%mm0\n\t"
1117                 "psllq  $8, %%mm4\n\t"
1118                 "psllq  $16, %%mm5\n\t"
1119                 "por    %%mm4, %%mm3\n\t"
1120                 "por    %%mm5, %%mm3\n\t"
1121                 :"=m"(*d)
1122                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)           
1123                 :"memory");
1124             /* Borrowed from the 32-to-24 bit packing (see rgb32to24 above) */
1125             __asm __volatile(
1126                 "movq   %%mm0, %%mm4\n\t"
1127                 "movq   %%mm3, %%mm5\n\t"
1128                 "movq   %%mm6, %%mm0\n\t"
1129                 "movq   %%mm7, %%mm1\n\t"
1130                 
1131                 "movq   %%mm4, %%mm6\n\t"
1132                 "movq   %%mm5, %%mm7\n\t"
1133                 "movq   %%mm0, %%mm2\n\t"
1134                 "movq   %%mm1, %%mm3\n\t"
1135
1136                 "psrlq  $8, %%mm2\n\t"
1137                 "psrlq  $8, %%mm3\n\t"
1138                 "psrlq  $8, %%mm6\n\t"
1139                 "psrlq  $8, %%mm7\n\t"
1140                 "pand   %2, %%mm0\n\t"
1141                 "pand   %2, %%mm1\n\t"
1142                 "pand   %2, %%mm4\n\t"
1143                 "pand   %2, %%mm5\n\t"
1144                 "pand   %3, %%mm2\n\t"
1145                 "pand   %3, %%mm3\n\t"
1146                 "pand   %3, %%mm6\n\t"
1147                 "pand   %3, %%mm7\n\t"
1148                 "por    %%mm2, %%mm0\n\t"
1149                 "por    %%mm3, %%mm1\n\t"
1150                 "por    %%mm6, %%mm4\n\t"
1151                 "por    %%mm7, %%mm5\n\t"
1152
1153                 "movq   %%mm1, %%mm2\n\t"
1154                 "movq   %%mm4, %%mm3\n\t"
1155                 "psllq  $48, %%mm2\n\t"
1156                 "psllq  $32, %%mm3\n\t"
1157                 "pand   %4, %%mm2\n\t"
1158                 "pand   %5, %%mm3\n\t"
1159                 "por    %%mm2, %%mm0\n\t"
1160                 "psrlq  $16, %%mm1\n\t"
1161                 "psrlq  $32, %%mm4\n\t"
1162                 "psllq  $16, %%mm5\n\t"
1163                 "por    %%mm3, %%mm1\n\t"
1164                 "pand   %6, %%mm5\n\t"
1165                 "por    %%mm5, %%mm4\n\t"
1166
1167                 MOVNTQ" %%mm0, %0\n\t"
1168                 MOVNTQ" %%mm1, 8%0\n\t"
1169                 MOVNTQ" %%mm4, 16%0"
1170
1171                 :"=m"(*d)
1172                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1173                 :"memory");
1174                 d += 24;
1175                 s += 8;
1176         }
1177         __asm __volatile(SFENCE:::"memory");
1178         __asm __volatile(EMMS:::"memory");
1179 #endif
1180         while(s < end)
1181         {
1182                 register uint16_t bgr;
1183                 bgr = *s++;
1184                 *d++ = (bgr&0x1F)<<3;
1185                 *d++ = (bgr&0x7E0)>>3;
1186                 *d++ = (bgr&0xF800)>>8;
1187         }
1188 }
1189
1190 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1191 {
1192         const uint16_t *end;
1193 #ifdef HAVE_MMX
1194         const uint16_t *mm_end;
1195 #endif
1196         uint8_t *d = (uint8_t *)dst;
1197         const uint16_t *s = (const uint16_t *)src;
1198         end = s + src_size/2;
1199 #ifdef HAVE_MMX
1200         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1201         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1202         mm_end = end - 3;
1203         while(s < mm_end)
1204         {
1205             __asm __volatile(
1206                 PREFETCH" 32%1\n\t"
1207                 "movq   %1, %%mm0\n\t"
1208                 "movq   %1, %%mm1\n\t"
1209                 "movq   %1, %%mm2\n\t"
1210                 "pand   %2, %%mm0\n\t"
1211                 "pand   %3, %%mm1\n\t"
1212                 "pand   %4, %%mm2\n\t"
1213                 "psllq  $3, %%mm0\n\t"
1214                 "psrlq  $2, %%mm1\n\t"
1215                 "psrlq  $7, %%mm2\n\t"
1216                 "movq   %%mm0, %%mm3\n\t"
1217                 "movq   %%mm1, %%mm4\n\t"
1218                 "movq   %%mm2, %%mm5\n\t"
1219                 "punpcklwd %%mm7, %%mm0\n\t"
1220                 "punpcklwd %%mm7, %%mm1\n\t"
1221                 "punpcklwd %%mm7, %%mm2\n\t"
1222                 "punpckhwd %%mm7, %%mm3\n\t"
1223                 "punpckhwd %%mm7, %%mm4\n\t"
1224                 "punpckhwd %%mm7, %%mm5\n\t"
1225                 "psllq  $8, %%mm1\n\t"
1226                 "psllq  $16, %%mm2\n\t"
1227                 "por    %%mm1, %%mm0\n\t"
1228                 "por    %%mm2, %%mm0\n\t"
1229                 "psllq  $8, %%mm4\n\t"
1230                 "psllq  $16, %%mm5\n\t"
1231                 "por    %%mm4, %%mm3\n\t"
1232                 "por    %%mm5, %%mm3\n\t"
1233                 MOVNTQ" %%mm0, %0\n\t"
1234                 MOVNTQ" %%mm3, 8%0\n\t"
1235                 :"=m"(*d)
1236                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1237                 :"memory");
1238                 d += 16;
1239                 s += 4;
1240         }
1241         __asm __volatile(SFENCE:::"memory");
1242         __asm __volatile(EMMS:::"memory");
1243 #endif
1244         while(s < end)
1245         {
1246 #if 0 //slightly slower on athlon
1247                 int bgr= *s++;
1248                 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1249 #else
1250 //FIXME this is very likely wrong for bigendian (and the following converters too)
1251                 register uint16_t bgr;
1252                 bgr = *s++;
1253 #ifdef WORDS_BIGENDIAN
1254                 *d++ = 0;
1255                 *d++ = (bgr&0x1F)<<3;
1256                 *d++ = (bgr&0x3E0)>>2;
1257                 *d++ = (bgr&0x7C00)>>7;
1258 #else
1259                 *d++ = (bgr&0x1F)<<3;
1260                 *d++ = (bgr&0x3E0)>>2;
1261                 *d++ = (bgr&0x7C00)>>7;
1262                 *d++ = 0;
1263 #endif
1264
1265 #endif
1266         }
1267 }
1268
1269 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1270 {
1271         const uint16_t *end;
1272 #ifdef HAVE_MMX
1273         const uint16_t *mm_end;
1274 #endif
1275         uint8_t *d = (uint8_t *)dst;
1276         const uint16_t *s = (uint16_t *)src;
1277         end = s + src_size/2;
1278 #ifdef HAVE_MMX
1279         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1280         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1281         mm_end = end - 3;
1282         while(s < mm_end)
1283         {
1284             __asm __volatile(
1285                 PREFETCH" 32%1\n\t"
1286                 "movq   %1, %%mm0\n\t"
1287                 "movq   %1, %%mm1\n\t"
1288                 "movq   %1, %%mm2\n\t"
1289                 "pand   %2, %%mm0\n\t"
1290                 "pand   %3, %%mm1\n\t"
1291                 "pand   %4, %%mm2\n\t"
1292                 "psllq  $3, %%mm0\n\t"
1293                 "psrlq  $3, %%mm1\n\t"
1294                 "psrlq  $8, %%mm2\n\t"
1295                 "movq   %%mm0, %%mm3\n\t"
1296                 "movq   %%mm1, %%mm4\n\t"
1297                 "movq   %%mm2, %%mm5\n\t"
1298                 "punpcklwd %%mm7, %%mm0\n\t"
1299                 "punpcklwd %%mm7, %%mm1\n\t"
1300                 "punpcklwd %%mm7, %%mm2\n\t"
1301                 "punpckhwd %%mm7, %%mm3\n\t"
1302                 "punpckhwd %%mm7, %%mm4\n\t"
1303                 "punpckhwd %%mm7, %%mm5\n\t"
1304                 "psllq  $8, %%mm1\n\t"
1305                 "psllq  $16, %%mm2\n\t"
1306                 "por    %%mm1, %%mm0\n\t"
1307                 "por    %%mm2, %%mm0\n\t"
1308                 "psllq  $8, %%mm4\n\t"
1309                 "psllq  $16, %%mm5\n\t"
1310                 "por    %%mm4, %%mm3\n\t"
1311                 "por    %%mm5, %%mm3\n\t"
1312                 MOVNTQ" %%mm0, %0\n\t"
1313                 MOVNTQ" %%mm3, 8%0\n\t"
1314                 :"=m"(*d)
1315                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1316                 :"memory");
1317                 d += 16;
1318                 s += 4;
1319         }
1320         __asm __volatile(SFENCE:::"memory");
1321         __asm __volatile(EMMS:::"memory");
1322 #endif
1323         while(s < end)
1324         {
1325                 register uint16_t bgr;
1326                 bgr = *s++;
1327 #ifdef WORDS_BIGENDIAN
1328                 *d++ = 0;
1329                 *d++ = (bgr&0x1F)<<3;
1330                 *d++ = (bgr&0x7E0)>>3;
1331                 *d++ = (bgr&0xF800)>>8;
1332 #else
1333                 *d++ = (bgr&0x1F)<<3;
1334                 *d++ = (bgr&0x7E0)>>3;
1335                 *d++ = (bgr&0xF800)>>8;
1336                 *d++ = 0;
1337 #endif
1338         }
1339 }
1340
1341 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1342 {
1343 #ifdef HAVE_MMX
1344 /* TODO: unroll this loop */
1345         asm volatile (
1346                 "xor %%"REG_a", %%"REG_a"       \n\t"
1347                 ".balign 16                     \n\t"
1348                 "1:                             \n\t"
1349                 PREFETCH" 32(%0, %%"REG_a")     \n\t"
1350                 "movq (%0, %%"REG_a"), %%mm0    \n\t"
1351                 "movq %%mm0, %%mm1              \n\t"
1352                 "movq %%mm0, %%mm2              \n\t"
1353                 "pslld $16, %%mm0               \n\t"
1354                 "psrld $16, %%mm1               \n\t"
1355                 "pand "MANGLE(mask32r)", %%mm0  \n\t"
1356                 "pand "MANGLE(mask32g)", %%mm2  \n\t"
1357                 "pand "MANGLE(mask32b)", %%mm1  \n\t"
1358                 "por %%mm0, %%mm2               \n\t"
1359                 "por %%mm1, %%mm2               \n\t"
1360                 MOVNTQ" %%mm2, (%1, %%"REG_a")  \n\t"
1361                 "add $8, %%"REG_a"              \n\t"
1362                 "cmp %2, %%"REG_a"              \n\t"
1363                 " jb 1b                         \n\t"
1364                 :: "r" (src), "r"(dst), "r" ((long)src_size-7)
1365                 : "%"REG_a
1366         );
1367
1368         __asm __volatile(SFENCE:::"memory");
1369         __asm __volatile(EMMS:::"memory");
1370 #else
1371         unsigned i;
1372         unsigned num_pixels = src_size >> 2;
1373         for(i=0; i<num_pixels; i++)
1374         {
1375 #ifdef WORDS_BIGENDIAN  
1376           dst[4*i + 1] = src[4*i + 3];
1377           dst[4*i + 2] = src[4*i + 2];
1378           dst[4*i + 3] = src[4*i + 1];
1379 #else
1380           dst[4*i + 0] = src[4*i + 2];
1381           dst[4*i + 1] = src[4*i + 1];
1382           dst[4*i + 2] = src[4*i + 0];
1383 #endif
1384         }
1385 #endif
1386 }
1387
1388 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1389 {
1390         unsigned i;
1391 #ifdef HAVE_MMX
1392         long mmx_size= 23 - src_size;
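        /* The asm below gets src-mmx_size and dst-mmx_size as base pointers and
           walks REG_a (initialized to the negative mmx_size) up towards zero in
           24-byte (8-pixel) steps, so a single "js 1b" both advances and
           terminates the loop; whatever is left (at most 23 bytes) is handled
           by the C loop after the #endif. */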
1393         asm volatile (
1394                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
1395                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
1396                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
1397                 ".balign 16                     \n\t"
1398                 "1:                             \n\t"
1399                 PREFETCH" 32(%1, %%"REG_a")     \n\t"
1400                 "movq   (%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1401                 "movq   (%1, %%"REG_a"), %%mm1  \n\t" // BGR BGR BG
1402                 "movq  2(%1, %%"REG_a"), %%mm2  \n\t" // R BGR BGR B
1403                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
1404                 "pand %%mm5, %%mm0              \n\t"
1405                 "pand %%mm6, %%mm1              \n\t"
1406                 "pand %%mm7, %%mm2              \n\t"
1407                 "por %%mm0, %%mm1               \n\t"
1408                 "por %%mm2, %%mm1               \n\t"                
1409                 "movq  6(%1, %%"REG_a"), %%mm0  \n\t" // BGR BGR BG
1410                 MOVNTQ" %%mm1,   (%2, %%"REG_a")\n\t" // RGB RGB RG
1411                 "movq  8(%1, %%"REG_a"), %%mm1  \n\t" // R BGR BGR B
1412                 "movq 10(%1, %%"REG_a"), %%mm2  \n\t" // GR BGR BGR
1413                 "pand %%mm7, %%mm0              \n\t"
1414                 "pand %%mm5, %%mm1              \n\t"
1415                 "pand %%mm6, %%mm2              \n\t"
1416                 "por %%mm0, %%mm1               \n\t"
1417                 "por %%mm2, %%mm1               \n\t"                
1418                 "movq 14(%1, %%"REG_a"), %%mm0  \n\t" // R BGR BGR B
1419                 MOVNTQ" %%mm1,  8(%2, %%"REG_a")\n\t" // B RGB RGB R
1420                 "movq 16(%1, %%"REG_a"), %%mm1  \n\t" // GR BGR BGR
1421                 "movq 18(%1, %%"REG_a"), %%mm2  \n\t" // BGR BGR BG
1422                 "pand %%mm6, %%mm0              \n\t"
1423                 "pand %%mm7, %%mm1              \n\t"
1424                 "pand %%mm5, %%mm2              \n\t"
1425                 "por %%mm0, %%mm1               \n\t"
1426                 "por %%mm2, %%mm1               \n\t"                
1427                 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1428                 "add $24, %%"REG_a"             \n\t"
1429                 " js 1b                         \n\t"
1430                 : "+a" (mmx_size)
1431                 : "r" (src-mmx_size), "r"(dst-mmx_size)
1432         );
1433
1434         __asm __volatile(SFENCE:::"memory");
1435         __asm __volatile(EMMS:::"memory");
1436
1437         if(mmx_size==23) return; // finished, src_size was a multiple of 8 pixels (24 bytes)
1438
1439         src+= src_size;
1440         dst+= src_size;
1441         src_size= 23-mmx_size;
1442         src-= src_size;
1443         dst-= src_size;
1444 #endif
1445         for(i=0; i<src_size; i+=3)
1446         {
1447                 register uint8_t x;
1448                 x          = src[i + 2];
1449                 dst[i + 1] = src[i + 1];
1450                 dst[i + 2] = src[i + 0];
1451                 dst[i + 0] = x;
1452         }
1453 }
1454
1455 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1456         unsigned int width, unsigned int height,
1457         int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1458 {
1459         unsigned y;
1460         const unsigned chromWidth= width>>1;
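        /* Each pair of horizontal pixels is packed as Y U Y V (4 bytes per 2
           pixels, one shared U/V sample), which is what the punpcklbw/punpckhbw
           sequences below produce. */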
1461         for(y=0; y<height; y++)
1462         {
1463 #ifdef HAVE_MMX
1464 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory bandwidth anyway)
1465                 asm volatile(
1466                         "xor %%"REG_a", %%"REG_a"       \n\t"
1467                         ".balign 16                     \n\t"
1468                         "1:                             \n\t"
1469                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1470                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1471                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1472                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1473                         "movq %%mm0, %%mm2              \n\t" // U(0)
1474                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1475                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1476                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1477
1478                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1479                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1480                         "movq %%mm3, %%mm4              \n\t" // Y(0)
1481                         "movq %%mm5, %%mm6              \n\t" // Y(8)
1482                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
1483                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
1484                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
1485                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
1486
1487                         MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1488                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1489                         MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1490                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1491
1492                         "add $8, %%"REG_a"              \n\t"
1493                         "cmp %4, %%"REG_a"              \n\t"
1494                         " jb 1b                         \n\t"
1495                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
1496                         : "%"REG_a
1497                 );
1498 #else
1499
1500 #if defined ARCH_ALPHA && defined HAVE_MVI
1501 #define pl2yuy2(n)                                      \
1502         y1 = yc[n];                                     \
1503         y2 = yc2[n];                                    \
1504         u = uc[n];                                      \
1505         v = vc[n];                                      \
1506         asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1));      \
1507         asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2));      \
1508         asm("unpkbl %1, %0" : "=r"(u) : "r"(u));        \
1509         asm("unpkbl %1, %0" : "=r"(v) : "r"(v));        \
1510         yuv1 = (u << 8) + (v << 24);                    \
1511         yuv2 = yuv1 + y2;                               \
1512         yuv1 += y1;                                     \
1513         qdst[n] = yuv1;                                 \
1514         qdst2[n] = yuv2;
1515
1516                 int i;
1517                 uint64_t *qdst = (uint64_t *) dst;
1518                 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1519                 const uint32_t *yc = (uint32_t *) ysrc;
1520                 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1521                 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1522                 for(i = 0; i < chromWidth; i += 8){
1523                         uint64_t y1, y2, yuv1, yuv2;
1524                         uint64_t u, v;
1525                         /* Prefetch */
1526                         asm("ldq $31,64(%0)" :: "r"(yc));
1527                         asm("ldq $31,64(%0)" :: "r"(yc2));
1528                         asm("ldq $31,64(%0)" :: "r"(uc));
1529                         asm("ldq $31,64(%0)" :: "r"(vc));
1530
1531                         pl2yuy2(0);
1532                         pl2yuy2(1);
1533                         pl2yuy2(2);
1534                         pl2yuy2(3);
1535
1536                         yc += 4;
1537                         yc2 += 4;
1538                         uc += 4;
1539                         vc += 4;
1540                         qdst += 4;
1541                         qdst2 += 4;
1542                 }
1543                 y++; /* the MVI path above writes two output lines per pass, so advance one extra line here */
1544                 ysrc += lumStride;
1545                 dst += dstStride;
1546
1547 #elif __WORDSIZE >= 64
1548                 int i;
1549                 uint64_t *ldst = (uint64_t *) dst;
1550                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1551                 for(i = 0; i < chromWidth; i += 2){
1552                         uint64_t k, l;
1553                         k = yc[0] + (uc[0] << 8) +
1554                             (yc[1] << 16) + (vc[0] << 24);
1555                         l = yc[2] + (uc[1] << 8) +
1556                             (yc[3] << 16) + (vc[1] << 24);
1557                         *ldst++ = k + (l << 32);
1558                         yc += 4;
1559                         uc += 2;
1560                         vc += 2;
1561                 }
1562
1563 #else
1564                 int i, *idst = (int32_t *) dst;
1565                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1566                 for(i = 0; i < chromWidth; i++){
1567 #ifdef WORDS_BIGENDIAN
1568                         *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1569                             (yc[1] << 8) + (vc[0] << 0);
1570 #else
1571                         *idst++ = yc[0] + (uc[0] << 8) +
1572                             (yc[1] << 16) + (vc[0] << 24);
1573 #endif
1574                         yc += 2;
1575                         uc++;
1576                         vc++;
1577                 }
1578 #endif
1579 #endif
1580                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1581                 {
1582                         usrc += chromStride;
1583                         vsrc += chromStride;
1584                 }
1585                 ysrc += lumStride;
1586                 dst += dstStride;
1587         }
1588 #ifdef HAVE_MMX
1589 asm(    EMMS" \n\t"
1590         SFENCE" \n\t"
1591         :::"memory");
1592 #endif
1593 }
1594
1595 /**
1596  *
1597  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1598  * problem for anyone then tell me, and I'll fix it)
1599  */
1600 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1601         unsigned int width, unsigned int height,
1602         int lumStride, int chromStride, int dstStride)
1603 {
1604         //FIXME interpolate chroma
1605         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1606 }
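/*
 * Illustrative only: a minimal sketch of how a caller might drive yv12toyuy2 for a
 * whole YV12 frame, assuming the usual contiguous plane layout (full-size Y plane
 * followed by quarter-size V and U planes). The buffer names and strides below are
 * assumptions for the example, not something defined by this file.
 */
#if 0 /* usage sketch, not compiled */
static void example_yv12_frame_to_yuy2(const uint8_t *yv12, uint8_t *yuy2,
                                        unsigned width, unsigned height)
{
        /* assumed YV12 layout: Y plane, then V, then U (each chroma plane is width/2 x height/2) */
        const uint8_t *ysrc = yv12;
        const uint8_t *vsrc = yv12 + width*height;
        const uint8_t *usrc = vsrc + (width/2)*(height/2);

        /* width should be a multiple of 16 and height a multiple of 2 (see the comment above) */
        RENAME(yv12toyuy2)(ysrc, usrc, vsrc, yuy2,
                           width, height,
                           width,     /* lumStride   */
                           width/2,   /* chromStride */
                           width*2);  /* dstStride: YUY2 is 2 bytes per pixel */
}
#endif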
1607
1608 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1609         unsigned int width, unsigned int height,
1610         int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1611 {
1612         unsigned y;
1613         const unsigned chromWidth= width>>1;
1614         for(y=0; y<height; y++)
1615         {
1616 #ifdef HAVE_MMX
1617 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by mem anyway)
1618                 asm volatile(
1619                         "xor %%"REG_a", %%"REG_a"       \n\t"
1620                         ".balign 16                     \n\t"
1621                         "1:                             \n\t"
1622                         PREFETCH" 32(%1, %%"REG_a", 2)  \n\t"
1623                         PREFETCH" 32(%2, %%"REG_a")     \n\t"
1624                         PREFETCH" 32(%3, %%"REG_a")     \n\t"
1625                         "movq (%2, %%"REG_a"), %%mm0    \n\t" // U(0)
1626                         "movq %%mm0, %%mm2              \n\t" // U(0)
1627                         "movq (%3, %%"REG_a"), %%mm1    \n\t" // V(0)
1628                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1629                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1630
1631                         "movq (%1, %%"REG_a",2), %%mm3  \n\t" // Y(0)
1632                         "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1633                         "movq %%mm0, %%mm4              \n\t" // Y(0)
1634                         "movq %%mm2, %%mm6              \n\t" // Y(8)
1635                         "punpcklbw %%mm3, %%mm0         \n\t" // YUYV YUYV(0)
1636                         "punpckhbw %%mm3, %%mm4         \n\t" // YUYV YUYV(4)
1637                         "punpcklbw %%mm5, %%mm2         \n\t" // YUYV YUYV(8)
1638                         "punpckhbw %%mm5, %%mm6         \n\t" // YUYV YUYV(12)
1639
1640                         MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1641                         MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1642                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1643                         MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1644
1645                         "add $8, %%"REG_a"              \n\t"
1646                         "cmp %4, %%"REG_a"              \n\t"
1647                         " jb 1b                         \n\t"
1648                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
1649                         : "%"REG_a
1650                 );
1651 #else
1652 //FIXME adapt the alpha asm code from yv12->yuy2
1653
1654 #if __WORDSIZE >= 64
1655                 int i;
1656                 uint64_t *ldst = (uint64_t *) dst;
1657                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1658                 for(i = 0; i < chromWidth; i += 2){
1659                         uint64_t k, l;
1660                         k = uc[0] + (yc[0] << 8) +
1661                             (vc[0] << 16) + (yc[1] << 24);
1662                         l = uc[1] + (yc[2] << 8) +
1663                             (vc[1] << 16) + (yc[3] << 24);
1664                         *ldst++ = k + (l << 32);
1665                         yc += 4;
1666                         uc += 2;
1667                         vc += 2;
1668                 }
1669
1670 #else
1671                 int i, *idst = (int32_t *) dst;
1672                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1673                 for(i = 0; i < chromWidth; i++){
1674 #ifdef WORDS_BIGENDIAN
1675                         *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1676                             (vc[0] << 8) + (yc[1] << 0);
1677 #else
1678                         *idst++ = uc[0] + (yc[0] << 8) +
1679                             (vc[0] << 16) + (yc[1] << 24);
1680 #endif
1681                         yc += 2;
1682                         uc++;
1683                         vc++;
1684                 }
1685 #endif
1686 #endif
1687                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1688                 {
1689                         usrc += chromStride;
1690                         vsrc += chromStride;
1691                 }
1692                 ysrc += lumStride;
1693                 dst += dstStride;
1694         }
1695 #ifdef HAVE_MMX
1696 asm(    EMMS" \n\t"
1697         SFENCE" \n\t"
1698         :::"memory");
1699 #endif
1700 }
1701
1702 /**
1703  *
1704  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1705  * problem for anyone then tell me, and I'll fix it)
1706  */
1707 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1708         unsigned int width, unsigned int height,
1709         int lumStride, int chromStride, int dstStride)
1710 {
1711         //FIXME interpolate chroma
1712         RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1713 }
1714
1715 /**
1716  *
1717  * width should be a multiple of 16
1718  */
1719 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1720         unsigned int width, unsigned int height,
1721         int lumStride, int chromStride, int dstStride)
1722 {
1723         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1724 }
1725
1726 /**
1727  *
1728  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1729  * problem for anyone then tell me, and I'll fix it)
1730  */
1731 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1732         unsigned int width, unsigned int height,
1733         int lumStride, int chromStride, int srcStride)
1734 {
1735         unsigned y;
1736         const unsigned chromWidth= width>>1;
1737         for(y=0; y<height; y+=2)
1738         {
1739 #ifdef HAVE_MMX
1740                 asm volatile(
1741                         "xor %%"REG_a", %%"REG_a"       \n\t"
1742                         "pcmpeqw %%mm7, %%mm7           \n\t"
1743                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1744                         ".balign 16                     \n\t"
1745                         "1:                             \n\t"
1746                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1747                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1748                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1749                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
1750                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
1751                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1752                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1753                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
1754                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
1755                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1756                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1757
1758                         MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1759
1760                         "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1761                         "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1762                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
1763                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
1764                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1765                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1766                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
1767                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
1768                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1769                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1770
1771                         MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1772
1773                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1774                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1775                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1776                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1777                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1778                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1779                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1780                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1781
1782                         MOVNTQ" %%mm0, (%3, %%"REG_a")  \n\t"
1783                         MOVNTQ" %%mm2, (%2, %%"REG_a")  \n\t"
1784
1785                         "add $8, %%"REG_a"              \n\t"
1786                         "cmp %4, %%"REG_a"              \n\t"
1787                         " jb 1b                         \n\t"
1788                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
1789                         : "memory", "%"REG_a
1790                 );
1791
1792                 ydst += lumStride;
1793                 src  += srcStride;
1794
1795                 asm volatile(
1796                         "xor %%"REG_a", %%"REG_a"       \n\t"
1797                         ".balign 16                     \n\t"
1798                         "1:                             \n\t"
1799                         PREFETCH" 64(%0, %%"REG_a", 4)  \n\t"
1800                         "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1801                         "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1802                         "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1803                         "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1804                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
1805                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
1806                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
1807                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
1808                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1809                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1810
1811                         MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1812                         MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1813
1814                         "add $8, %%"REG_a"              \n\t"
1815                         "cmp %4, %%"REG_a"              \n\t"
1816                         " jb 1b                         \n\t"
1817
1818                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
1819                         : "memory", "%"REG_a
1820                 );
1821 #else
1822                 unsigned i;
1823                 for(i=0; i<chromWidth; i++)
1824                 {
1825                         ydst[2*i+0]     = src[4*i+0];
1826                         udst[i]         = src[4*i+1];
1827                         ydst[2*i+1]     = src[4*i+2];
1828                         vdst[i]         = src[4*i+3];
1829                 }
1830                 ydst += lumStride;
1831                 src  += srcStride;
1832
1833                 for(i=0; i<chromWidth; i++)
1834                 {
1835                         ydst[2*i+0]     = src[4*i+0];
1836                         ydst[2*i+1]     = src[4*i+2];
1837                 }
1838 #endif
1839                 udst += chromStride;
1840                 vdst += chromStride;
1841                 ydst += lumStride;
1842                 src  += srcStride;
1843         }
1844 #ifdef HAVE_MMX
1845 asm volatile(   EMMS" \n\t"
1846                 SFENCE" \n\t"
1847                 :::"memory");
1848 #endif
1849 }
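/*
 * Illustrative only: a sketch of how a caller might split a packed YUY2 image into
 * separately allocated Y/U/V planes with yuy2toyv12. The plane pointers and stride
 * choices are assumptions for the example; only the multiple-of-16 width and
 * multiple-of-2 height constraints come from the comment above.
 */
#if 0 /* usage sketch, not compiled */
static void example_yuy2_to_planes(const uint8_t *yuy2, unsigned width, unsigned height,
                                   uint8_t *yplane, uint8_t *uplane, uint8_t *vplane)
{
        RENAME(yuy2toyv12)(yuy2, yplane, uplane, vplane,
                           width, height,
                           width,     /* lumStride   */
                           width/2,   /* chromStride: one U/V sample per 2x2 block */
                           width*2);  /* srcStride: YUY2 is 2 bytes per pixel */
}
#endif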
1850
1851 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1852         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1853         unsigned int width, unsigned int height, int lumStride, int chromStride)
1854 {
1855         /* Y Plane */
1856         memcpy(ydst, ysrc, width*height);
1857
1858         /* XXX: implement upscaling for U,V */
1859 }
1860
1861 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1862 {
1863         int x,y;
1864         
1865         dst[0]= src[0];
1866         
1867         // first line
1868         for(x=0; x<srcWidth-1; x++){
1869                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1870                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1871         }
1872         dst[2*srcWidth-1]= src[srcWidth-1];
1873         
1874         dst+= dstStride;
1875
1876         for(y=1; y<srcHeight; y++){
1877 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1878                 const long mmxSize= srcWidth&~15;
1879                 asm volatile(
1880                         "mov %4, %%"REG_a"              \n\t"
1881                         "1:                             \n\t"
1882                         "movq (%0, %%"REG_a"), %%mm0    \n\t"
1883                         "movq (%1, %%"REG_a"), %%mm1    \n\t"
1884                         "movq 1(%0, %%"REG_a"), %%mm2   \n\t"
1885                         "movq 1(%1, %%"REG_a"), %%mm3   \n\t"
1886                         "movq -1(%0, %%"REG_a"), %%mm4  \n\t"
1887                         "movq -1(%1, %%"REG_a"), %%mm5  \n\t"
1888                         PAVGB" %%mm0, %%mm5             \n\t"
1889                         PAVGB" %%mm0, %%mm3             \n\t"
1890                         PAVGB" %%mm0, %%mm5             \n\t"
1891                         PAVGB" %%mm0, %%mm3             \n\t"
1892                         PAVGB" %%mm1, %%mm4             \n\t"
1893                         PAVGB" %%mm1, %%mm2             \n\t"
1894                         PAVGB" %%mm1, %%mm4             \n\t"
1895                         PAVGB" %%mm1, %%mm2             \n\t"
1896                         "movq %%mm5, %%mm7              \n\t"
1897                         "movq %%mm4, %%mm6              \n\t"
1898                         "punpcklbw %%mm3, %%mm5         \n\t"
1899                         "punpckhbw %%mm3, %%mm7         \n\t"
1900                         "punpcklbw %%mm2, %%mm4         \n\t"
1901                         "punpckhbw %%mm2, %%mm6         \n\t"
1902 #if 1
1903                         MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1904                         MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1905                         MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1906                         MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1907 #else
1908                         "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1909                         "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1910                         "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1911                         "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1912 #endif
1913                         "add $8, %%"REG_a"              \n\t"
1914                         " js 1b                         \n\t"
1915                         :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
1916                            "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1917                            "g" (-mmxSize)
1918                         : "%"REG_a
1919
1920                 );
1921 #else
1922                 const int mmxSize=1;
1923 #endif
1924                 dst[0        ]= (3*src[0] +   src[srcStride])>>2;
1925                 dst[dstStride]= (  src[0] + 3*src[srcStride])>>2;
1926
1927                 for(x=mmxSize-1; x<srcWidth-1; x++){
1928                         dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
1929                         dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
1930                         dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
1931                         dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
1932                 }
1933                 dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
1934                 dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
1935
1936                 dst+=dstStride*2;
1937                 src+=srcStride;
1938         }
1939         
1940         // last line
1941 #if 1
1942         dst[0]= src[0];
1943         
1944         for(x=0; x<srcWidth-1; x++){
1945                 dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
1946                 dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
1947         }
1948         dst[2*srcWidth-1]= src[srcWidth-1];
1949 #else
1950         for(x=0; x<srcWidth; x++){
1951                 dst[2*x+0]=
1952                 dst[2*x+1]= src[x];
1953         }
1954 #endif
1955
1956 #ifdef HAVE_MMX
1957 asm volatile(   EMMS" \n\t"
1958                 SFENCE" \n\t"
1959                 :::"memory");
1960 #endif
1961 }
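/*
 * Worked example, illustrative only: planar2x doubles the plane in both directions,
 * and interior output samples are 3:1 weighted averages of the two nearest source
 * samples, exactly as in the scalar loops above. The concrete values below are just
 * an example input, not data from this file.
 */
#if 0 /* not compiled */
static void example_planar2x_weights(void)
{
        const uint8_t src[2] = {100, 20};
        uint8_t dst[4];
        dst[0] = src[0];                    /* edge sample is copied            */
        dst[1] = (3*src[0] +   src[1])>>2;  /* (3*100 + 20)>>2 = 80, nearer src[0] */
        dst[2] = (  src[0] + 3*src[1])>>2;  /* (100 + 3*20)>>2 = 40, nearer src[1] */
        dst[3] = src[1];                    /* edge sample is copied            */
}
#endif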
1962
1963 /**
1964  *
1965  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1966  * problem for anyone then tell me, and I'll fix it)
1967  * chrominance data is only taken from every second line; others are ignored. FIXME: write HQ version
1968  */
1969 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1970         unsigned int width, unsigned int height,
1971         int lumStride, int chromStride, int srcStride)
1972 {
1973         unsigned y;
1974         const unsigned chromWidth= width>>1;
1975         for(y=0; y<height; y+=2)
1976         {
1977 #ifdef HAVE_MMX
1978                 asm volatile(
1979                         "xorl %%eax, %%eax              \n\t"
1980                         "pcmpeqw %%mm7, %%mm7           \n\t"
1981                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1982                         ".balign 16                     \n\t"
1983                         "1:                             \n\t"
1984                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1985                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1986                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1987                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
1988                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
1989                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
1990                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
1991                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1992                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1993                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1994                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1995
1996                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
1997
1998                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
1999                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
2000                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
2001                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
2002                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
2003                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
2004                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
2005                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
2006                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
2007                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
2008
2009                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
2010
2011                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
2012                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
2013                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
2014                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
2015                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
2016                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
2017                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
2018                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
2019
2020                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
2021                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
2022
2023                         "addl $8, %%eax                 \n\t"
2024                         "cmpl %4, %%eax                 \n\t"
2025                         " jb 1b                         \n\t"
2026                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2027                         : "memory", "%eax"
2028                 );
2029
2030                 ydst += lumStride;
2031                 src  += srcStride;
2032
2033                 asm volatile(
2034                         "xorl %%eax, %%eax              \n\t"
2035                         ".balign 16                     \n\t"
2036                         "1:                             \n\t"
2037                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
2038                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
2039                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
2040                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
2041                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
2042                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
2043                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
2044                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
2045                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
2046                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
2047                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
2048
2049                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
2050                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
2051
2052                         "addl $8, %%eax                 \n\t"
2053                         "cmpl %4, %%eax                 \n\t"
2054                         " jb 1b                         \n\t"
2055
2056                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
2057                         : "memory", "%eax"
2058                 );
2059 #else
2060                 unsigned i;
2061                 for(i=0; i<chromWidth; i++)
2062                 {
2063                         udst[i]         = src[4*i+0];
2064                         ydst[2*i+0]     = src[4*i+1];
2065                         vdst[i]         = src[4*i+2];
2066                         ydst[2*i+1]     = src[4*i+3];
2067                 }
2068                 ydst += lumStride;
2069                 src  += srcStride;
2070
2071                 for(i=0; i<chromWidth; i++)
2072                 {
2073                         ydst[2*i+0]     = src[4*i+1];
2074                         ydst[2*i+1]     = src[4*i+3];
2075                 }
2076 #endif
2077                 udst += chromStride;
2078                 vdst += chromStride;
2079                 ydst += lumStride;
2080                 src  += srcStride;
2081         }
2082 #ifdef HAVE_MMX
2083 asm volatile(   EMMS" \n\t"
2084                 SFENCE" \n\t"
2085                 :::"memory");
2086 #endif
2087 }
2088
2089 /**
2090  *
2091  * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2092  * problem for anyone then tell me, and I'll fix it)
2093  * chrominance data is only taken from every second line; others are ignored in the C version. FIXME: write HQ version
2094  */
2095 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2096         unsigned int width, unsigned int height,
2097         int lumStride, int chromStride, int srcStride)
2098 {
2099         unsigned y;
2100         const unsigned chromWidth= width>>1;
2101 #ifdef HAVE_MMX
2102         for(y=0; y<height-2; y+=2)
2103         {
2104                 unsigned i;
2105                 for(i=0; i<2; i++)
2106                 {
2107                         asm volatile(
2108                                 "mov %2, %%"REG_a"              \n\t"
2109                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
2110                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
2111                                 "pxor %%mm7, %%mm7              \n\t"
2112                                 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2113                                 ".balign 16                     \n\t"
2114                                 "1:                             \n\t"
2115                                 PREFETCH" 64(%0, %%"REG_b")     \n\t"
2116                                 "movd (%0, %%"REG_b"), %%mm0    \n\t"
2117                                 "movd 3(%0, %%"REG_b"), %%mm1   \n\t"
2118                                 "punpcklbw %%mm7, %%mm0         \n\t"
2119                                 "punpcklbw %%mm7, %%mm1         \n\t"
2120                                 "movd 6(%0, %%"REG_b"), %%mm2   \n\t"
2121                                 "movd 9(%0, %%"REG_b"), %%mm3   \n\t"
2122                                 "punpcklbw %%mm7, %%mm2         \n\t"
2123                                 "punpcklbw %%mm7, %%mm3         \n\t"
2124                                 "pmaddwd %%mm6, %%mm0           \n\t"
2125                                 "pmaddwd %%mm6, %%mm1           \n\t"
2126                                 "pmaddwd %%mm6, %%mm2           \n\t"
2127                                 "pmaddwd %%mm6, %%mm3           \n\t"
2128 #ifndef FAST_BGR2YV12
2129                                 "psrad $8, %%mm0                \n\t"
2130                                 "psrad $8, %%mm1                \n\t"
2131                                 "psrad $8, %%mm2                \n\t"
2132                                 "psrad $8, %%mm3                \n\t"
2133 #endif
2134                                 "packssdw %%mm1, %%mm0          \n\t"
2135                                 "packssdw %%mm3, %%mm2          \n\t"
2136                                 "pmaddwd %%mm5, %%mm0           \n\t"
2137                                 "pmaddwd %%mm5, %%mm2           \n\t"
2138                                 "packssdw %%mm2, %%mm0          \n\t"
2139                                 "psraw $7, %%mm0                \n\t"
2140
2141                                 "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
2142                                 "movd 15(%0, %%"REG_b"), %%mm1  \n\t"
2143                                 "punpcklbw %%mm7, %%mm4         \n\t"
2144                                 "punpcklbw %%mm7, %%mm1         \n\t"
2145                                 "movd 18(%0, %%"REG_b"), %%mm2  \n\t"
2146                                 "movd 21(%0, %%"REG_b"), %%mm3  \n\t"
2147                                 "punpcklbw %%mm7, %%mm2         \n\t"
2148                                 "punpcklbw %%mm7, %%mm3         \n\t"
2149                                 "pmaddwd %%mm6, %%mm4           \n\t"
2150                                 "pmaddwd %%mm6, %%mm1           \n\t"
2151                                 "pmaddwd %%mm6, %%mm2           \n\t"
2152                                 "pmaddwd %%mm6, %%mm3           \n\t"
2153 #ifndef FAST_BGR2YV12
2154                                 "psrad $8, %%mm4                \n\t"
2155                                 "psrad $8, %%mm1                \n\t"
2156                                 "psrad $8, %%mm2                \n\t"
2157                                 "psrad $8, %%mm3                \n\t"
2158 #endif
2159                                 "packssdw %%mm1, %%mm4          \n\t"
2160                                 "packssdw %%mm3, %%mm2          \n\t"
2161                                 "pmaddwd %%mm5, %%mm4           \n\t"
2162                                 "pmaddwd %%mm5, %%mm2           \n\t"
2163                                 "add $24, %%"REG_b"             \n\t"
2164                                 "packssdw %%mm2, %%mm4          \n\t"
2165                                 "psraw $7, %%mm4                \n\t"
2166
2167                                 "packuswb %%mm4, %%mm0          \n\t"
2168                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
2169
2170                                 MOVNTQ" %%mm0, (%1, %%"REG_a")  \n\t"
2171                                 "add $8, %%"REG_a"              \n\t"
2172                                 " js 1b                         \n\t"
2173                                 : : "r" (src+width*3), "r" (ydst+width), "g" ((long)-width)
2174                                 : "%"REG_a, "%"REG_b
2175                         );
2176                         ydst += lumStride;
2177                         src  += srcStride;
2178                 }
2179                 src -= srcStride*2;
2180                 asm volatile(
2181                         "mov %4, %%"REG_a"              \n\t"
2182                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2183                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
2184                         "pxor %%mm7, %%mm7              \n\t"
2185                         "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2186                         "add %%"REG_b", %%"REG_b"       \n\t"
2187                         ".balign 16                     \n\t"
2188                         "1:                             \n\t"
2189                         PREFETCH" 64(%0, %%"REG_b")     \n\t"
2190                         PREFETCH" 64(%1, %%"REG_b")     \n\t"
2191 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2192                         "movq (%0, %%"REG_b"), %%mm0    \n\t"
2193                         "movq (%1, %%"REG_b"), %%mm1    \n\t"
2194                         "movq 6(%0, %%"REG_b"), %%mm2   \n\t"
2195                         "movq 6(%1, %%"REG_b"), %%mm3   \n\t"
2196                         PAVGB" %%mm1, %%mm0             \n\t"
2197                         PAVGB" %%mm3, %%mm2             \n\t"
2198                         "movq %%mm0, %%mm1              \n\t"
2199                         "movq %%mm2, %%mm3              \n\t"
2200                         "psrlq $24, %%mm0               \n\t"
2201                         "psrlq $24, %%mm2               \n\t"
2202                         PAVGB" %%mm1, %%mm0             \n\t"
2203                         PAVGB" %%mm3, %%mm2             \n\t"
2204                         "punpcklbw %%mm7, %%mm0         \n\t"
2205                         "punpcklbw %%mm7, %%mm2         \n\t"
2206 #else
2207                         "movd (%0, %%"REG_b"), %%mm0    \n\t"
2208                         "movd (%1, %%"REG_b"), %%mm1    \n\t"
2209                         "movd 3(%0, %%"REG_b"), %%mm2   \n\t"
2210                         "movd 3(%1, %%"REG_b"), %%mm3   \n\t"
2211                         "punpcklbw %%mm7, %%mm0         \n\t"
2212                         "punpcklbw %%mm7, %%mm1         \n\t"
2213                         "punpcklbw %%mm7, %%mm2         \n\t"
2214                         "punpcklbw %%mm7, %%mm3         \n\t"
2215                         "paddw %%mm1, %%mm0             \n\t"
2216                         "paddw %%mm3, %%mm2             \n\t"
2217                         "paddw %%mm2, %%mm0             \n\t"
2218                         "movd 6(%0, %%"REG_b"), %%mm4   \n\t"
2219                         "movd 6(%1, %%"REG_b"), %%mm1   \n\t"
2220                         "movd 9(%0, %%"REG_b"), %%mm2   \n\t"
2221                         "movd 9(%1, %%"REG_b"), %%mm3   \n\t"
2222                         "punpcklbw %%mm7, %%mm4         \n\t"
2223                         "punpcklbw %%mm7, %%mm1         \n\t"
2224                         "punpcklbw %%mm7, %%mm2         \n\t"
2225                         "punpcklbw %%mm7, %%mm3         \n\t"
2226                         "paddw %%mm1, %%mm4             \n\t"
2227                         "paddw %%mm3, %%mm2             \n\t"
2228                         "paddw %%mm4, %%mm2             \n\t"
2229                         "psrlw $2, %%mm0                \n\t"
2230                         "psrlw $2, %%mm2                \n\t"
2231 #endif
2232                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2233                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2234
2235                         "pmaddwd %%mm0, %%mm1           \n\t"
2236                         "pmaddwd %%mm2, %%mm3           \n\t"
2237                         "pmaddwd %%mm6, %%mm0           \n\t"
2238                         "pmaddwd %%mm6, %%mm2           \n\t"
2239 #ifndef FAST_BGR2YV12
2240                         "psrad $8, %%mm0                \n\t"
2241                         "psrad $8, %%mm1                \n\t"
2242                         "psrad $8, %%mm2                \n\t"
2243                         "psrad $8, %%mm3                \n\t"
2244 #endif
2245                         "packssdw %%mm2, %%mm0          \n\t"
2246                         "packssdw %%mm3, %%mm1          \n\t"
2247                         "pmaddwd %%mm5, %%mm0           \n\t"
2248                         "pmaddwd %%mm5, %%mm1           \n\t"
2249                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
2250                         "psraw $7, %%mm0                \n\t"
2251
2252 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2253                         "movq 12(%0, %%"REG_b"), %%mm4  \n\t"
2254                         "movq 12(%1, %%"REG_b"), %%mm1  \n\t"
2255                         "movq 18(%0, %%"REG_b"), %%mm2  \n\t"
2256                         "movq 18(%1, %%"REG_b"), %%mm3  \n\t"
2257                         PAVGB" %%mm1, %%mm4             \n\t"
2258                         PAVGB" %%mm3, %%mm2             \n\t"
2259                         "movq %%mm4, %%mm1              \n\t"
2260                         "movq %%mm2, %%mm3              \n\t"
2261                         "psrlq $24, %%mm4               \n\t"
2262                         "psrlq $24, %%mm2               \n\t"
2263                         PAVGB" %%mm1, %%mm4             \n\t"
2264                         PAVGB" %%mm3, %%mm2             \n\t"
2265                         "punpcklbw %%mm7, %%mm4         \n\t"
2266                         "punpcklbw %%mm7, %%mm2         \n\t"
2267 #else
2268                         "movd 12(%0, %%"REG_b"), %%mm4  \n\t"
2269                         "movd 12(%1, %%"REG_b"), %%mm1  \n\t"
2270                         "movd 15(%0, %%"REG_b"), %%mm2  \n\t"
2271                         "movd 15(%1, %%"REG_b"), %%mm3  \n\t"
2272                         "punpcklbw %%mm7, %%mm4         \n\t"
2273                         "punpcklbw %%mm7, %%mm1         \n\t"
2274                         "punpcklbw %%mm7, %%mm2         \n\t"
2275                         "punpcklbw %%mm7, %%mm3         \n\t"
2276                         "paddw %%mm1, %%mm4             \n\t"
2277                         "paddw %%mm3, %%mm2             \n\t"
2278                         "paddw %%mm2, %%mm4             \n\t"
2279                         "movd 18(%0, %%"REG_b"), %%mm5  \n\t"
2280                         "movd 18(%1, %%"REG_b"), %%mm1  \n\t"
2281                         "movd 21(%0, %%"REG_b"), %%mm2  \n\t"
2282                         "movd 21(%1, %%"REG_b"), %%mm3  \n\t"
2283                         "punpcklbw %%mm7, %%mm5         \n\t"
2284                         "punpcklbw %%mm7, %%mm1         \n\t"
2285                         "punpcklbw %%mm7, %%mm2         \n\t"
2286                         "punpcklbw %%mm7, %%mm3         \n\t"
2287                         "paddw %%mm1, %%mm5             \n\t"
2288                         "paddw %%mm3, %%mm2             \n\t"
2289                         "paddw %%mm5, %%mm2             \n\t"
2290                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2291                         "psrlw $2, %%mm4                \n\t"
2292                         "psrlw $2, %%mm2                \n\t"
2293 #endif
2294                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2295                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2296
2297                         "pmaddwd %%mm4, %%mm1           \n\t"
2298                         "pmaddwd %%mm2, %%mm3           \n\t"
2299                         "pmaddwd %%mm6, %%mm4           \n\t"
2300                         "pmaddwd %%mm6, %%mm2           \n\t"
2301 #ifndef FAST_BGR2YV12
2302                         "psrad $8, %%mm4                \n\t"
2303                         "psrad $8, %%mm1                \n\t"
2304                         "psrad $8, %%mm2                \n\t"
2305                         "psrad $8, %%mm3                \n\t"
2306 #endif
2307                         "packssdw %%mm2, %%mm4          \n\t"
2308                         "packssdw %%mm3, %%mm1          \n\t"
2309                         "pmaddwd %%mm5, %%mm4           \n\t"
2310                         "pmaddwd %%mm5, %%mm1           \n\t"
2311                         "add $24, %%"REG_b"             \n\t"
2312                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2313                         "psraw $7, %%mm4                \n\t"
2314
2315                         "movq %%mm0, %%mm1              \n\t"
2316                         "punpckldq %%mm4, %%mm0         \n\t"
2317                         "punpckhdq %%mm4, %%mm1         \n\t"
2318                         "packsswb %%mm1, %%mm0          \n\t"
2319                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2320                         "movd %%mm0, (%2, %%"REG_a")    \n\t"
2321                         "punpckhdq %%mm0, %%mm0         \n\t"
2322                         "movd %%mm0, (%3, %%"REG_a")    \n\t"
2323                         "add $4, %%"REG_a"              \n\t"
2324                         " js 1b                         \n\t"
2325                         : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" ((long)-chromWidth)
2326                         : "%"REG_a, "%"REG_b
2327                 );
2328
2329                 udst += chromStride;
2330                 vdst += chromStride;
2331                 src  += srcStride*2;
2332         }
2333
2334         asm volatile(   EMMS" \n\t"
2335                         SFENCE" \n\t"
2336                         :::"memory");
2337 #else
2338         y=0;
2339 #endif
2340         for(; y<height; y+=2)
2341         {
2342                 unsigned i;
2343                 for(i=0; i<chromWidth; i++)
2344                 {
2345                         unsigned int b= src[6*i+0];
2346                         unsigned int g= src[6*i+1];
2347                         unsigned int r= src[6*i+2];
2348
2349                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2350                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2351                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2352
2353                         udst[i]         = U;
2354                         vdst[i]         = V;
2355                         ydst[2*i]       = Y;
2356
2357                         b= src[6*i+3];
2358                         g= src[6*i+4];
2359                         r= src[6*i+5];
2360
2361                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2362                         ydst[2*i+1]     = Y;
2363                 }
2364                 ydst += lumStride;
2365                 src  += srcStride;
2366
2367                 for(i=0; i<chromWidth; i++)
2368                 {
2369                         unsigned int b= src[6*i+0];
2370                         unsigned int g= src[6*i+1];
2371                         unsigned int r= src[6*i+2];
2372
2373                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2374
2375                         ydst[2*i]       = Y;
2376
2377                         b= src[6*i+3];
2378                         g= src[6*i+4];
2379                         r= src[6*i+5];
2380
2381                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2382                         ydst[2*i+1]     = Y;
2383                 }
2384                 udst += chromStride;
2385                 vdst += chromStride;
2386                 ydst += lumStride;
2387                 src  += srcStride;
2388         }
2389 }
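/*
 * Illustrative only: the C fallback above uses the RY/GY/BY (and RU/.../BV)
 * coefficients and RGB2YUV_SHIFT defined elsewhere in rgb2rgb. As a hedged sketch of
 * the same luma expression, assuming BT.601-style weights (Y ~= 0.257R + 0.504G +
 * 0.098B + 16) scaled by 2^16; the actual constants used by this file may differ.
 */
#if 0 /* not compiled */
static inline unsigned example_rgb_to_y(unsigned r, unsigned g, unsigned b)
{
        /* assumed coefficients: 0.257, 0.504, 0.098 scaled by 2^16 */
        const unsigned ry = 16843, gy = 33030, by = 6423, shift = 16;
        return ((ry*r + gy*g + by*b) >> shift) + 16;
}
#endif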
2390
2391 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2392                             unsigned width, unsigned height, int src1Stride,
2393                             int src2Stride, int dstStride){
2394         unsigned h;
2395
2396         for(h=0; h < height; h++)
2397         {
2398                 unsigned w;
2399
2400 #ifdef HAVE_MMX
2401 #ifdef HAVE_SSE2
2402                 asm(
2403                         "xor %%"REG_a", %%"REG_a"       \n\t"
2404                         "1:                             \n\t"
2405                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2406                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2407                         "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2408                         "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2409                         "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2410                         "punpcklbw %%xmm2, %%xmm0       \n\t"
2411                         "punpckhbw %%xmm2, %%xmm1       \n\t"
2412                         "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2413                         "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2414                         "add $16, %%"REG_a"             \n\t"
2415                         "cmp %3, %%"REG_a"              \n\t"
2416                         " jb 1b                         \n\t"
2417                         ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
2418                         : "memory", "%"REG_a""
2419                 );
2420 #else
2421                 asm(
2422                         "xor %%"REG_a", %%"REG_a"       \n\t"
2423                         "1:                             \n\t"
2424                         PREFETCH" 64(%1, %%"REG_a")     \n\t"
2425                         PREFETCH" 64(%2, %%"REG_a")     \n\t"
2426                         "movq (%1, %%"REG_a"), %%mm0    \n\t"
2427                         "movq 8(%1, %%"REG_a"), %%mm2   \n\t"
2428                         "movq %%mm0, %%mm1              \n\t"
2429                         "movq %%mm2, %%mm3              \n\t"
2430                         "movq (%2, %%"REG_a"), %%mm4    \n\t"
2431                         "movq 8(%2, %%"REG_a"), %%mm5   \n\t"
2432                         "punpcklbw %%mm4, %%mm0         \n\t"
2433                         "punpckhbw %%mm4, %%mm1         \n\t"
2434                         "punpcklbw %%mm5, %%mm2         \n\t"
2435                         "punpckhbw %%mm5, %%mm3         \n\t"
2436                         MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2437                         MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2438                         MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2439                         MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2440                         "add $16, %%"REG_a"             \n\t"
2441                         "cmp %3, %%"REG_a"              \n\t"
2442                         " jb 1b                         \n\t"
2443                         ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
2444                         : "memory", "%"REG_a
2445                 );
2446 #endif
2447                 for(w= (width&(~15)); w < width; w++)
2448                 {
2449                         dest[2*w+0] = src1[w];
2450                         dest[2*w+1] = src2[w];
2451                 }
2452 #else
2453                 for(w=0; w < width; w++)
2454                 {
2455                         dest[2*w+0] = src1[w];
2456                         dest[2*w+1] = src2[w];
2457                 }
2458 #endif
2459                 dest += dstStride;
2460                 src1 += src1Stride;
2461                 src2 += src2Stride;
2462         }
2463 #ifdef HAVE_MMX
2464         asm(
2465                 EMMS" \n\t"
2466                 SFENCE" \n\t"
2467                 ::: "memory"
2468                 );
2469 #endif
2470 }
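/*
 * Illustrative only: interleaveBytes zips two equally sized planes byte-by-byte into
 * one destination, so it could, for example, merge separate U and V planes into an
 * interleaved UV plane. The buffer names and tightly packed strides are assumptions
 * made for this sketch.
 */
#if 0 /* usage sketch, not compiled */
static void example_merge_uv(uint8_t *uplane, uint8_t *vplane, uint8_t *uvplane,
                             unsigned chromWidth, unsigned chromHeight)
{
        /* each output row becomes U0 V0 U1 V1 ... */
        RENAME(interleaveBytes)(uplane, vplane, uvplane,
                                chromWidth, chromHeight,
                                chromWidth,     /* src1Stride */
                                chromWidth,     /* src2Stride */
                                chromWidth*2);  /* dstStride  */
}
#endif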
2471
2472 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2473                         uint8_t *dst1, uint8_t *dst2,
2474                         unsigned width, unsigned height,
2475                         int srcStride1, int srcStride2,
2476                         int dstStride1, int dstStride2)
2477 {
2478     unsigned int y,x,h;
2479     int w;
2480     w=width/2; h=height/2;
2481 #ifdef HAVE_MMX
2482     asm volatile(
2483         PREFETCH" %0\n\t"
2484         PREFETCH" %1\n\t"
2485         ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
2486 #endif
2487     for(y=0;y<h;y++){
2488         const uint8_t* s1=src1+srcStride1*(y>>1);
2489         uint8_t* d=dst1+dstStride1*y;
2490         x=0;
2491 #ifdef HAVE_MMX
2492         for(;x<w-31;x+=32)
2493         {
2494             asm volatile(
2495                 PREFETCH" 32%1\n\t"
2496                 "movq   %1, %%mm0\n\t"
2497                 "movq   8%1, %%mm2\n\t"
2498                 "movq   16%1, %%mm4\n\t"
2499                 "movq   24%1, %%mm6\n\t"
2500                 "movq   %%mm0, %%mm1\n\t"
2501                 "movq   %%mm2, %%mm3\n\t"
2502                 "movq   %%mm4, %%mm5\n\t"
2503                 "movq   %%mm6, %%mm7\n\t"
2504                 "punpcklbw %%mm0, %%mm0\n\t"
2505                 "punpckhbw %%mm1, %%mm1\n\t"
2506                 "punpcklbw %%mm2, %%mm2\n\t"
2507                 "punpckhbw %%mm3, %%mm3\n\t"
2508                 "punpcklbw %%mm4, %%mm4\n\t"
2509                 "punpckhbw %%mm5, %%mm5\n\t"
2510                 "punpcklbw %%mm6, %%mm6\n\t"
2511                 "punpckhbw %%mm7, %%mm7\n\t"
2512                 MOVNTQ" %%mm0, %0\n\t"
2513                 MOVNTQ" %%mm1, 8%0\n\t"
2514                 MOVNTQ" %%mm2, 16%0\n\t"
2515                 MOVNTQ" %%mm3, 24%0\n\t"
2516                 MOVNTQ" %%mm4, 32%0\n\t"
2517                 MOVNTQ" %%mm5, 40%0\n\t"
2518                 MOVNTQ" %%mm6, 48%0\n\t"
2519                 MOVNTQ" %%mm7, 56%0"
2520                 :"=m"(d[2*x])
2521                 :"m"(s1[x])
2522                 :"memory");
2523         }
2524 #endif
2525         for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
2526     }
2527     for(y=0;y<h;y++){
2528         const uint8_t* s2=src2+srcStride2*(y>>1);
2529         uint8_t* d=dst2+dstStride2*y;
2530         x=0;
2531 #ifdef HAVE_MMX
2532         for(;x<w-31;x+=32)
2533         {
2534             asm volatile(
2535                 PREFETCH" 32%1\n\t"
2536                 "movq   %1, %%mm0\n\t"
2537                 "movq   8%1, %%mm2\n\t"
2538                 "movq   16%1, %%mm4\n\t"
2539                 "movq   24%1, %%mm6\n\t"
2540                 "movq   %%mm0, %%mm1\n\t"
2541                 "movq   %%mm2, %%mm3\n\t"
2542                 "movq   %%mm4, %%mm5\n\t"
2543                 "movq   %%mm6, %%mm7\n\t"
2544                 "punpcklbw %%mm0, %%mm0\n\t"
2545                 "punpckhbw %%mm1, %%mm1\n\t"
2546                 "punpcklbw %%mm2, %%mm2\n\t"
2547                 "punpckhbw %%mm3, %%mm3\n\t"
2548                 "punpcklbw %%mm4, %%mm4\n\t"
2549                 "punpckhbw %%mm5, %%mm5\n\t"
2550                 "punpcklbw %%mm6, %%mm6\n\t"
2551                 "punpckhbw %%mm7, %%mm7\n\t"
2552                 MOVNTQ" %%mm0, %0\n\t"
2553                 MOVNTQ" %%mm1, 8%0\n\t"
2554                 MOVNTQ" %%mm2, 16%0\n\t"
2555                 MOVNTQ" %%mm3, 24%0\n\t"
2556                 MOVNTQ" %%mm4, 32%0\n\t"
2557                 MOVNTQ" %%mm5, 40%0\n\t"
2558                 MOVNTQ" %%mm6, 48%0\n\t"
2559                 MOVNTQ" %%mm7, 56%0"
2560                 :"=m"(d[2*x])
2561                 :"m"(s2[x])
2562                 :"memory");
2563         }
2564 #endif
2565         for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2566     }
2567 #ifdef HAVE_MMX
2568         asm(
2569                 EMMS" \n\t"
2570                 SFENCE" \n\t"
2571                 ::: "memory"
2572                 );
2573 #endif
2574 }
2575
2576 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2577                         uint8_t *dst,
2578                         unsigned width, unsigned height,
2579                         int srcStride1, int srcStride2,
2580                         int srcStride3, int dstStride)
2581 {
2582     unsigned long y,x,w,h;
2583     w=width/2; h=height;
2584     for(y=0;y<h;y++){
2585         const uint8_t* yp=src1+srcStride1*y;
2586         const uint8_t* up=src2+srcStride2*(y>>2);
2587         const uint8_t* vp=src3+srcStride3*(y>>2);
2588         uint8_t* d=dst+dstStride*y;
2589         x=0;
2590 #ifdef HAVE_MMX
2591         for(;x<w-7;x+=8)
2592         {
2593             asm volatile(
2594                 PREFETCH" 32(%1, %0)\n\t"
2595                 PREFETCH" 32(%2, %0)\n\t"
2596                 PREFETCH" 32(%3, %0)\n\t"
2597                 "movq   (%1, %0, 4), %%mm0\n\t"       /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2598                 "movq   (%2, %0), %%mm1\n\t"       /* U0U1U2U3U4U5U6U7 */
2599                 "movq   (%3, %0), %%mm2\n\t"         /* V0V1V2V3V4V5V6V7 */
2600                 "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2601                 "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
2602                 "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
2603                 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2604                 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2605                 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2606                 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2607
2608                 "movq   %%mm1, %%mm6\n\t"
2609                 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2610                 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2611                 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2612                 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2613                 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2614                 
2615                 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2616                 "movq   8(%1, %0, 4), %%mm0\n\t"
2617                 "movq   %%mm0, %%mm3\n\t"
2618                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2619                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2620                 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2621                 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2622
2623                 "movq   %%mm4, %%mm6\n\t"
2624                 "movq   16(%1, %0, 4), %%mm0\n\t"
2625                 "movq   %%mm0, %%mm3\n\t"
2626                 "punpcklbw %%mm5, %%mm4\n\t"
2627                 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2628                 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2629                 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2630                 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2631                 
2632                 "punpckhbw %%mm5, %%mm6\n\t"
2633                 "movq   24(%1, %0, 4), %%mm0\n\t"
2634                 "movq   %%mm0, %%mm3\n\t"
2635                 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2636                 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2637                 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2638                 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2639
2640                 : "+r" (x)
2641                 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2642                 :"memory");
2643         }
2644 #endif
2645         for(; x<w; x++)
2646         {
2647             const int x2= x<<2;
2648             d[8*x+0]=yp[x2];
2649             d[8*x+1]=up[x];
2650             d[8*x+2]=yp[x2+1];
2651             d[8*x+3]=vp[x];
2652             d[8*x+4]=yp[x2+2];
2653             d[8*x+5]=up[x];
2654             d[8*x+6]=yp[x2+3];
2655             d[8*x+7]=vp[x];
2656         }
2657     }
2658 #ifdef HAVE_MMX
2659         asm(
2660                 EMMS" \n\t"
2661                 SFENCE" \n\t"
2662                 ::: "memory"
2663                 );
2664 #endif
2665 }