]> git.sesse.net Git - ffmpeg/blob - postproc/rgb2rgb_template.c
remove unused varibles patch by (Eric Sesterhenn <SnakeByte at gmx dot de>)
[ffmpeg] / postproc / rgb2rgb_template.c
1 /*
2  *
3  *  rgb2rgb.c, Software RGB to RGB convertor
4  *  pluralize by Software PAL8 to RGB convertor
5  *               Software YUV to YUV convertor
6  *               Software YUV to RGB convertor
7  *  Written by Nick Kurshev.
8  *  palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9  */
10
11 #include <stddef.h>
12 #include <inttypes.h> /* for __WORDSIZE */
13
/* __WORDSIZE is expected to be visible after including <inttypes.h>;
   when it is not, only a build-time warning is produced. */
#if !defined(__WORDSIZE)
#warning You have misconfigured system and probably will lose performance!
#endif

/* Discard any earlier definitions of the instruction-selection macros before
   redefining them below (presumably because this template is included more
   than once with different HAVE_* settings -- confirm against the includer). */
#undef PREFETCH
#undef PREFETCHW
#undef PAVGB
#undef MOVNTQ
#undef SFENCE
#undef EMMS
#undef MMREG_SIZE

/* Register size constant: 16 when HAVE_SSE2 is set, 8 otherwise. */
#if defined(HAVE_SSE2)
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 8
#endif

/* Prefetch and byte-average mnemonics.  Without 3DNow! or MMX2 the prefetch
   macros expand to "/nop" and PAVGB stays undefined.
   NOTE(review): "/nop" presumably relies on the assembler treating '/' as a
   comment introducer -- confirm for the target assembler. */
#if defined(HAVE_3DNOW)
#define PREFETCH  "prefetch"
#define PREFETCHW "prefetchw"
#define PAVGB     "pavgusb"
#elif defined(HAVE_MMX2)
#define PREFETCH  "prefetchnta"
#define PREFETCHW "prefetcht0"
#define PAVGB     "pavgb"
#else
#define PREFETCH  "/nop"
#define PREFETCHW "/nop"
#endif

/* Instruction that leaves MMX state. */
#if defined(HAVE_3DNOW)
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/* Store instruction used by the conversion loops and the fence that follows
   them: movntq/sfence with MMX2, plain movq and "/nop" otherwise. */
#if defined(HAVE_MMX2)
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE "/nop"
#endif
59
/**
 * Expand packed 3-byte pixels to 4-byte pixels.
 *
 * Every group of three source bytes is copied unchanged and followed by one
 * zero byte in the destination.
 *
 * @param src       packed 3-byte pixels
 * @param dst       output buffer; 4 bytes are written for every 3 bytes read
 * @param src_size  size of src in bytes (the scalar loop always consumes
 *                  whole 3-byte groups, so a multiple of 3 is expected)
 */
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  uint8_t *dest = dst;
  const uint8_t *s = src;
  const uint8_t *end = s + src_size;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  /* mm7 holds mask32 (defined outside this block) for the whole loop. */
  __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  /* One iteration converts 8 pixels (24 bytes in, 32 bytes out).  Its last
     load, "punpckldq 21%1", fetches the four bytes 21..24, i.e. one byte more
     than the 24 consumed.  Requiring 25 readable bytes keeps that load inside
     the source buffer; a final 24-byte group falls through to the scalar
     loop below, which produces the same output. */
  while(end - s >= 25)
  {
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movd   %1, %%mm0\n\t"
        "punpckldq 3%1, %%mm0\n\t"
        "movd   6%1, %%mm1\n\t"
        "punpckldq 9%1, %%mm1\n\t"
        "movd   12%1, %%mm2\n\t"
        "punpckldq 15%1, %%mm2\n\t"
        "movd   18%1, %%mm3\n\t"
        "punpckldq 21%1, %%mm3\n\t"
        "pand   %%mm7, %%mm0\n\t"
        "pand   %%mm7, %%mm1\n\t"
        "pand   %%mm7, %%mm2\n\t"
        "pand   %%mm7, %%mm3\n\t"
        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm2, 16%0\n\t"
        MOVNTQ" %%mm3, 24%0"
        :"=m"(*dest)
        :"m"(*s)
        :"memory");
    dest += 32;
    s += 24;
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar path: whole buffer without MMX, otherwise the leftover pixels. */
  while(s < end)
  {
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = *s++;
    *dest++ = 0;
  }
}
110
/**
 * Pack 4-byte pixels into 3-byte pixels.
 *
 * The first three bytes of every 4-byte source group are copied to the
 * destination; the fourth byte is dropped.
 *
 * @param src       4-byte pixels
 * @param dst       output buffer; 3 bytes are written for every 4 bytes read
 * @param src_size  size of src in bytes
 */
static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  const uint8_t *in = src;
  const uint8_t *const in_end = src + src_size;
  uint8_t *out = dst;
#ifdef HAVE_MMX
  /* The MMX loop needs a full 32-byte group (8 pixels) per iteration. */
  const uint8_t *const simd_end = in_end - 31;
  __asm __volatile(PREFETCH"    %0"::"m"(*in):"memory");
  for(; in < simd_end; in += 32, out += 24)
  {
    __asm __volatile(
        PREFETCH"       32%1\n\t"
        "movq   %1, %%mm0\n\t"
        "movq   8%1, %%mm1\n\t"
        "movq   16%1, %%mm4\n\t"
        "movq   24%1, %%mm5\n\t"
        "movq   %%mm0, %%mm2\n\t"
        "movq   %%mm1, %%mm3\n\t"
        "movq   %%mm4, %%mm6\n\t"
        "movq   %%mm5, %%mm7\n\t"
        "psrlq  $8, %%mm2\n\t"
        "psrlq  $8, %%mm3\n\t"
        "psrlq  $8, %%mm6\n\t"
        "psrlq  $8, %%mm7\n\t"
        "pand   %2, %%mm0\n\t"
        "pand   %2, %%mm1\n\t"
        "pand   %2, %%mm4\n\t"
        "pand   %2, %%mm5\n\t"
        "pand   %3, %%mm2\n\t"
        "pand   %3, %%mm3\n\t"
        "pand   %3, %%mm6\n\t"
        "pand   %3, %%mm7\n\t"
        "por    %%mm2, %%mm0\n\t"
        "por    %%mm3, %%mm1\n\t"
        "por    %%mm6, %%mm4\n\t"
        "por    %%mm7, %%mm5\n\t"

        "movq   %%mm1, %%mm2\n\t"
        "movq   %%mm4, %%mm3\n\t"
        "psllq  $48, %%mm2\n\t"
        "psllq  $32, %%mm3\n\t"
        "pand   %4, %%mm2\n\t"
        "pand   %5, %%mm3\n\t"
        "por    %%mm2, %%mm0\n\t"
        "psrlq  $16, %%mm1\n\t"
        "psrlq  $32, %%mm4\n\t"
        "psllq  $16, %%mm5\n\t"
        "por    %%mm3, %%mm1\n\t"
        "pand   %6, %%mm5\n\t"
        "por    %%mm5, %%mm4\n\t"

        MOVNTQ" %%mm0, %0\n\t"
        MOVNTQ" %%mm1, 8%0\n\t"
        MOVNTQ" %%mm4, 16%0"
        :"=m"(*out)
        :"m"(*in),"m"(mask24l),
         "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
        :"memory");
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Scalar path: copy three bytes, skip the fourth. */
  for(; in < in_end; in += 4, out += 3)
  {
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
  }
}
187
188 /*
189  Original by Strepto/Astral
190  ported to gcc & bugfixed : A'rpi
191  MMX2, 3DNOW optimization by Nick Kurshev
192  32bit c version, and and&add trick by Michael Niedermayer
193 */
/**
 * Convert 16-bit words from a 15-bit layout to a 16-bit layout.
 *
 * For every 16-bit word, bits 0..4 are kept in place and bits 5..14 are moved
 * up by one position.  This is done by adding the masked upper field to the
 * value: x + (x & 0x7FE0) doubles exactly that field.
 *
 * @param src       input words (read as uint32_t / uint16_t through casts)
 * @param dst       output buffer of the same size
 * @param src_size  size of src in bytes
 */
static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  const uint8_t *in = src;
  uint8_t *out = dst;
  const uint8_t *const in_end = src + src_size;
  const uint8_t *chunk_end;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*in));
  __asm __volatile("movq        %0, %%mm4"::"m"(mask15s));
  /* 16 bytes (8 words) per MMX iteration. */
  chunk_end = in_end - 15;
  for(; in < chunk_end; in += 16, out += 16)
  {
        __asm __volatile(
                PREFETCH"       32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "pand   %%mm4, %%mm0\n\t"
                "pand   %%mm4, %%mm2\n\t"
                "paddw  %%mm1, %%mm0\n\t"
                "paddw  %%mm3, %%mm2\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm2, 8%0"
                :"=m"(*out)
                :"m"(*in)
                );
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Two words at a time through a 32-bit access. */
  chunk_end = in_end - 3;
  for(; in < chunk_end; in += 4, out += 4)
  {
    const uint32_t pair = *(const uint32_t *)in;
    *(uint32_t *)out = (pair & 0x7FFF7FFF) + (pair & 0x7FE07FE0);
  }
  /* At most one word is left over. */
  if(in < in_end)
  {
    const uint16_t word = *(const uint16_t *)in;
    *(uint16_t *)out = (word & 0x7FFF) + (word & 0x7FE0);
  }
}
242
/**
 * Reverse the byte order of every 3-byte pixel (byte 0 <-> byte 2).
 *
 * The previous loop compared a byte index that advanced by 3 against the
 * pixel count, so it stopped after roughly a third of the buffer.  The loop
 * now runs once per pixel.  Each pixel's bytes are read before any of them is
 * written, so the conversion also works when src and dst are the same buffer.
 *
 * @param src       packed 3-byte pixels
 * @param dst       output buffer, at least src_size bytes
 * @param src_size  size of src in bytes; trailing bytes that do not form a
 *                  whole pixel are left untouched
 */
static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const unsigned num_pixels = src_size / 3;
        unsigned px;
        for(px = 0; px < num_pixels; px++)
        {
                const unsigned i = px * 3;
                const uint8_t c0 = src[i+0];
                const uint8_t c1 = src[i+1];
                const uint8_t c2 = src[i+2];
                dst[i+0] = c2;
                dst[i+1] = c1;
                dst[i+2] = c0;
        }
}
253
/**
 * Convert 16-bit words from a 16-bit layout to a 15-bit layout.
 *
 * For every 16-bit word, bits 0..4 are kept and bits 6..15 are moved down by
 * one position into bits 5..14; input bit 5 is discarded and output bit 15
 * is zero.
 *
 * @param src       input words (read as uint32_t / uint16_t through casts)
 * @param dst       output buffer of the same size
 * @param src_size  size of src in bytes
 */
static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
{
  const uint8_t *in = src;
  uint8_t *out = dst;
  const uint8_t *const in_end = src + src_size;
  const uint8_t *chunk_end;
#ifdef HAVE_MMX
  __asm __volatile(PREFETCH"    %0"::"m"(*in));
  __asm __volatile("movq        %0, %%mm7"::"m"(mask15rg));
  __asm __volatile("movq        %0, %%mm6"::"m"(mask15b));
  /* 16 bytes (8 words) per MMX iteration. */
  chunk_end = in_end - 15;
  for(; in < chunk_end; in += 16, out += 16)
  {
        __asm __volatile(
                PREFETCH"       32%1\n\t"
                "movq   %1, %%mm0\n\t"
                "movq   8%1, %%mm2\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "psrlq  $1, %%mm0\n\t"
                "psrlq  $1, %%mm2\n\t"
                "pand   %%mm7, %%mm0\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm3\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm3, %%mm2\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                MOVNTQ" %%mm2, 8%0"
                :"=m"(*out)
                :"m"(*in)
                );
  }
  __asm __volatile(SFENCE:::"memory");
  __asm __volatile(EMMS:::"memory");
#endif
  /* Two words at a time through a 32-bit access. */
  chunk_end = in_end - 3;
  for(; in < chunk_end; in += 4, out += 4)
  {
    const uint32_t pair = *(const uint32_t *)in;
    *(uint32_t *)out = ((pair >> 1) & 0x7FE07FE0) | (pair & 0x001F001F);
  }
  /* At most one word is left over. */
  if(in < in_end)
  {
    const uint16_t word = *(const uint16_t *)in;
    *(uint16_t *)out = ((word >> 1) & 0x7FE0) | (word & 0x001F);
  }
}
309
/**
 * Convert 4-byte pixels to 16-bit 5-6-5 pixels.
 *
 * For each pixel the output word is
 *   (byte0 >> 3) | ((byte1 & 0xFC) << 3) | ((byte2 & 0xF8) << 8);
 * byte 3 is ignored.
 *
 * @param src       4-byte pixels
 * @param dst       output buffer, written as uint16_t (one word per pixel)
 * @param src_size  size of src in bytes
 */
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *in = src;
    const uint8_t *const in_end = src + src_size;
    uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
    /* The MMX loop converts 4 pixels (16 bytes) per iteration. */
    const uint8_t *const simd_end = in_end - 15;
    __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq       %0, %%mm7\n\t"
        "movq       %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    for(; in < simd_end; in += 16, out += 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd   %1, %%mm0\n\t"
            "movd   4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq   %%mm0, %%mm1\n\t"
            "movq   %%mm0, %%mm2\n\t"
            "movq   %%mm3, %%mm4\n\t"
            "movq   %%mm3, %%mm5\n\t"
            "psrlq  $3, %%mm0\n\t"
            "psrlq  $3, %%mm3\n\t"
            "pand   %2, %%mm0\n\t"
            "pand   %2, %%mm3\n\t"
            "psrlq  $5, %%mm1\n\t"
            "psrlq  $5, %%mm4\n\t"
            "pand   %%mm6, %%mm1\n\t"
            "pand   %%mm6, %%mm4\n\t"
            "psrlq  $8, %%mm2\n\t"
            "psrlq  $8, %%mm5\n\t"
            "pand   %%mm7, %%mm2\n\t"
            "pand   %%mm7, %%mm5\n\t"
            "por    %%mm1, %%mm0\n\t"
            "por    %%mm4, %%mm3\n\t"
            "por    %%mm2, %%mm0\n\t"
            "por    %%mm5, %%mm3\n\t"
            "psllq  $16, %%mm3\n\t"
            "por    %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: one pixel per iteration. */
    for(; in < in_end; in += 4)
    {
        const int b = in[0];
        const int g = in[1];
        const int r = in[2];
        *out++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
373
/**
 * Convert 4-byte pixels to 16-bit 5-6-5 pixels with the outer components
 * swapped relative to rgb32to16.
 *
 * For each pixel the output word is
 *   (byte2 >> 3) | ((byte1 & 0xFC) << 3) | ((byte0 & 0xF8) << 8);
 * byte 3 is ignored.
 *
 * @param src       4-byte pixels
 * @param dst       output buffer, written as uint16_t (one word per pixel)
 * @param src_size  size of src in bytes
 */
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint8_t *in = src;
    const uint8_t *const in_end = src + src_size;
    uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
    /* The MMX loop converts 4 pixels (16 bytes) per iteration. */
    const uint8_t *const simd_end = in_end - 15;
    __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq       %0, %%mm7\n\t"
        "movq       %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    for(; in < simd_end; in += 16, out += 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd   %1, %%mm0\n\t"
            "movd   4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq   %%mm0, %%mm1\n\t"
            "movq   %%mm0, %%mm2\n\t"
            "movq   %%mm3, %%mm4\n\t"
            "movq   %%mm3, %%mm5\n\t"
            "psllq  $8, %%mm0\n\t"
            "psllq  $8, %%mm3\n\t"
            "pand   %%mm7, %%mm0\n\t"
            "pand   %%mm7, %%mm3\n\t"
            "psrlq  $5, %%mm1\n\t"
            "psrlq  $5, %%mm4\n\t"
            "pand   %%mm6, %%mm1\n\t"
            "pand   %%mm6, %%mm4\n\t"
            "psrlq  $19, %%mm2\n\t"
            "psrlq  $19, %%mm5\n\t"
            "pand   %2, %%mm2\n\t"
            "pand   %2, %%mm5\n\t"
            "por    %%mm1, %%mm0\n\t"
            "por    %%mm4, %%mm3\n\t"
            "por    %%mm2, %%mm0\n\t"
            "por    %%mm5, %%mm3\n\t"
            "psllq  $16, %%mm3\n\t"
            "por    %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: one pixel per iteration. */
    for(; in < in_end; in += 4)
    {
        const int r = in[0];
        const int g = in[1];
        const int b = in[2];
        *out++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
437
/**
 * Convert 4-byte pixels to 16-bit words holding 5-5-5 pixels.
 *
 * For each pixel the output word is
 *   (byte0 >> 3) | ((byte1 & 0xF8) << 2) | ((byte2 & 0xF8) << 7);
 * byte 3 is ignored and output bit 15 is zero.
 *
 * @param src       4-byte pixels
 * @param dst       output buffer, written as uint16_t (one word per pixel)
 * @param src_size  size of src in bytes
 */
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *in = src;
    const uint8_t *const in_end = src + src_size;
    uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
    /* The MMX loop converts 4 pixels (16 bytes) per iteration. */
    const uint8_t *const simd_end = in_end - 15;
    __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq       %0, %%mm7\n\t"
        "movq       %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    for(; in < simd_end; in += 16, out += 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd   %1, %%mm0\n\t"
            "movd   4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq   %%mm0, %%mm1\n\t"
            "movq   %%mm0, %%mm2\n\t"
            "movq   %%mm3, %%mm4\n\t"
            "movq   %%mm3, %%mm5\n\t"
            "psrlq  $3, %%mm0\n\t"
            "psrlq  $3, %%mm3\n\t"
            "pand   %2, %%mm0\n\t"
            "pand   %2, %%mm3\n\t"
            "psrlq  $6, %%mm1\n\t"
            "psrlq  $6, %%mm4\n\t"
            "pand   %%mm6, %%mm1\n\t"
            "pand   %%mm6, %%mm4\n\t"
            "psrlq  $9, %%mm2\n\t"
            "psrlq  $9, %%mm5\n\t"
            "pand   %%mm7, %%mm2\n\t"
            "pand   %%mm7, %%mm5\n\t"
            "por    %%mm1, %%mm0\n\t"
            "por    %%mm4, %%mm3\n\t"
            "por    %%mm2, %%mm0\n\t"
            "por    %%mm5, %%mm3\n\t"
            "psllq  $16, %%mm3\n\t"
            "por    %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: one pixel per iteration. */
    for(; in < in_end; in += 4)
    {
        const int b = in[0];
        const int g = in[1];
        const int r = in[2];
        *out++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
501
/**
 * Convert 4-byte pixels to 16-bit words holding 5-5-5 pixels with the outer
 * components swapped relative to rgb32to15.
 *
 * For each pixel the output word is
 *   (byte2 >> 3) | ((byte1 & 0xF8) << 2) | ((byte0 & 0xF8) << 7);
 * byte 3 is ignored and output bit 15 is zero.
 *
 * @param src       4-byte pixels
 * @param dst       output buffer, written as uint16_t (one word per pixel)
 * @param src_size  size of src in bytes
 */
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *in = src;
    const uint8_t *const in_end = src + src_size;
    uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
    /* The MMX loop converts 4 pixels (16 bytes) per iteration. */
    const uint8_t *const simd_end = in_end - 15;
    __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq       %0, %%mm7\n\t"
        "movq       %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    for(; in < simd_end; in += 16, out += 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd   %1, %%mm0\n\t"
            "movd   4%1, %%mm3\n\t"
            "punpckldq 8%1, %%mm0\n\t"
            "punpckldq 12%1, %%mm3\n\t"
            "movq   %%mm0, %%mm1\n\t"
            "movq   %%mm0, %%mm2\n\t"
            "movq   %%mm3, %%mm4\n\t"
            "movq   %%mm3, %%mm5\n\t"
            "psllq  $7, %%mm0\n\t"
            "psllq  $7, %%mm3\n\t"
            "pand   %%mm7, %%mm0\n\t"
            "pand   %%mm7, %%mm3\n\t"
            "psrlq  $6, %%mm1\n\t"
            "psrlq  $6, %%mm4\n\t"
            "pand   %%mm6, %%mm1\n\t"
            "pand   %%mm6, %%mm4\n\t"
            "psrlq  $19, %%mm2\n\t"
            "psrlq  $19, %%mm5\n\t"
            "pand   %2, %%mm2\n\t"
            "pand   %2, %%mm5\n\t"
            "por    %%mm1, %%mm0\n\t"
            "por    %%mm4, %%mm3\n\t"
            "por    %%mm2, %%mm0\n\t"
            "por    %%mm5, %%mm3\n\t"
            "psllq  $16, %%mm3\n\t"
            "por    %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: one pixel per iteration. */
    for(; in < in_end; in += 4)
    {
        const int r = in[0];
        const int g = in[1];
        const int b = in[2];
        *out++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
565
/**
 * Convert packed 3-byte pixels to 16-bit 5-6-5 pixels.
 *
 * For each pixel the output word is
 *   (byte0 >> 3) | ((byte1 & 0xFC) << 3) | ((byte2 & 0xF8) << 8).
 *
 * @param src       packed 3-byte pixels
 * @param dst       output buffer, written as uint16_t (one word per pixel)
 * @param src_size  size of src in bytes (the scalar loop always consumes
 *                  whole 3-byte groups, so a multiple of 3 is expected)
 */
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_16mask),"m"(green_16mask));
        /* One iteration converts 4 pixels (12 bytes), but its last load,
           "punpckldq 9%1", fetches the four bytes 9..12.  Requiring 13
           readable bytes keeps that load inside the source buffer (the old
           bound "end - 11" allowed a one-byte read past the end); a final
           12-byte group is handled by the scalar loop below. */
        while(end - s >= 13)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $5, %%mm1\n\t"
                "psrlq  $5, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $8, %%mm2\n\t"
                "psrlq  $8, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: whole buffer without MMX, otherwise the leftovers. */
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
        }
}
628
/**
 * Convert packed 3-byte pixels to 16-bit 5-6-5 pixels with the outer
 * components swapped relative to rgb24to16.
 *
 * For each pixel the output word is
 *   (byte2 >> 3) | ((byte1 & 0xFC) << 3) | ((byte0 & 0xF8) << 8).
 *
 * @param src       packed 3-byte pixels
 * @param dst       output buffer, written as uint16_t (one word per pixel)
 * @param src_size  size of src in bytes
 */
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
{
    const uint8_t *in = src;
    const uint8_t *const in_end = src + src_size;
    uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
    /* The MMX loop converts 4 pixels (12 bytes) per iteration and only runs
       while at least 16 bytes remain. */
    const uint8_t *const simd_end = in_end - 15;
    __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq       %0, %%mm7\n\t"
        "movq       %1, %%mm6\n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    for(; in < simd_end; in += 12, out += 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd   %1, %%mm0\n\t"
            "movd   3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq   %%mm0, %%mm1\n\t"
            "movq   %%mm0, %%mm2\n\t"
            "movq   %%mm3, %%mm4\n\t"
            "movq   %%mm3, %%mm5\n\t"
            "psllq  $8, %%mm0\n\t"
            "psllq  $8, %%mm3\n\t"
            "pand   %%mm7, %%mm0\n\t"
            "pand   %%mm7, %%mm3\n\t"
            "psrlq  $5, %%mm1\n\t"
            "psrlq  $5, %%mm4\n\t"
            "pand   %%mm6, %%mm1\n\t"
            "pand   %%mm6, %%mm4\n\t"
            "psrlq  $19, %%mm2\n\t"
            "psrlq  $19, %%mm5\n\t"
            "pand   %2, %%mm2\n\t"
            "pand   %2, %%mm5\n\t"
            "por    %%mm1, %%mm0\n\t"
            "por    %%mm4, %%mm3\n\t"
            "por    %%mm2, %%mm0\n\t"
            "por    %%mm5, %%mm3\n\t"
            "psllq  $16, %%mm3\n\t"
            "por    %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*out):"m"(*in),"m"(blue_16mask):"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: one pixel per iteration. */
    for(; in < in_end; in += 3)
    {
        const int r = in[0];
        const int g = in[1];
        const int b = in[2];
        *out++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}
691
/**
 * Convert packed 3-byte pixels to 16-bit words holding 5-5-5 pixels.
 *
 * For each pixel the output word is
 *   (byte0 >> 3) | ((byte1 & 0xF8) << 2) | ((byte2 & 0xF8) << 7);
 * output bit 15 is zero.
 *
 * @param src       packed 3-byte pixels
 * @param dst       output buffer, written as uint16_t (one word per pixel)
 * @param src_size  size of src in bytes (the scalar loop always consumes
 *                  whole 3-byte groups, so a multiple of 3 is expected)
 */
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
        const uint8_t *s = src;
        const uint8_t *end;
        uint16_t *d = (uint16_t *)dst;
        end = s + src_size;
#ifdef HAVE_MMX
        __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
        __asm __volatile(
            "movq       %0, %%mm7\n\t"
            "movq       %1, %%mm6\n\t"
            ::"m"(red_15mask),"m"(green_15mask));
        /* One iteration converts 4 pixels (12 bytes), but its last load,
           "punpckldq 9%1", fetches the four bytes 9..12.  Requiring 13
           readable bytes keeps that load inside the source buffer (the old
           bound "end - 11" allowed a one-byte read past the end); a final
           12-byte group is handled by the scalar loop below. */
        while(end - s >= 13)
        {
            __asm __volatile(
                PREFETCH" 32%1\n\t"
                "movd   %1, %%mm0\n\t"
                "movd   3%1, %%mm3\n\t"
                "punpckldq 6%1, %%mm0\n\t"
                "punpckldq 9%1, %%mm3\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm0, %%mm2\n\t"
                "movq   %%mm3, %%mm4\n\t"
                "movq   %%mm3, %%mm5\n\t"
                "psrlq  $3, %%mm0\n\t"
                "psrlq  $3, %%mm3\n\t"
                "pand   %2, %%mm0\n\t"
                "pand   %2, %%mm3\n\t"
                "psrlq  $6, %%mm1\n\t"
                "psrlq  $6, %%mm4\n\t"
                "pand   %%mm6, %%mm1\n\t"
                "pand   %%mm6, %%mm4\n\t"
                "psrlq  $9, %%mm2\n\t"
                "psrlq  $9, %%mm5\n\t"
                "pand   %%mm7, %%mm2\n\t"
                "pand   %%mm7, %%mm5\n\t"
                "por    %%mm1, %%mm0\n\t"
                "por    %%mm4, %%mm3\n\t"
                "por    %%mm2, %%mm0\n\t"
                "por    %%mm5, %%mm3\n\t"
                "psllq  $16, %%mm3\n\t"
                "por    %%mm3, %%mm0\n\t"
                MOVNTQ" %%mm0, %0\n\t"
                :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
                d += 4;
                s += 12;
        }
        __asm __volatile(SFENCE:::"memory");
        __asm __volatile(EMMS:::"memory");
#endif
        /* Scalar path: whole buffer without MMX, otherwise the leftovers. */
        while(s < end)
        {
                const int b= *s++;
                const int g= *s++;
                const int r= *s++;
                *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
        }
}
754
/**
 * Convert packed 3-byte pixels to 16-bit words holding 5-5-5 pixels with the
 * outer components swapped relative to rgb24to15.
 *
 * For each pixel the output word is
 *   (byte2 >> 3) | ((byte1 & 0xF8) << 2) | ((byte0 & 0xF8) << 7);
 * output bit 15 is zero.
 *
 * @param src       packed 3-byte pixels
 * @param dst       output buffer, written as uint16_t (one word per pixel)
 * @param src_size  size of src in bytes
 */
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
{
    const uint8_t *in = src;
    const uint8_t *const in_end = src + src_size;
    uint16_t *out = (uint16_t *)dst;
#ifdef HAVE_MMX
    /* The MMX loop converts 4 pixels (12 bytes) per iteration and only runs
       while at least 16 bytes remain. */
    const uint8_t *const simd_end = in_end - 15;
    __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
    __asm __volatile(
        "movq       %0, %%mm7\n\t"
        "movq       %1, %%mm6\n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    for(; in < simd_end; in += 12, out += 4)
    {
        __asm __volatile(
            PREFETCH" 32%1\n\t"
            "movd   %1, %%mm0\n\t"
            "movd   3%1, %%mm3\n\t"
            "punpckldq 6%1, %%mm0\n\t"
            "punpckldq 9%1, %%mm3\n\t"
            "movq   %%mm0, %%mm1\n\t"
            "movq   %%mm0, %%mm2\n\t"
            "movq   %%mm3, %%mm4\n\t"
            "movq   %%mm3, %%mm5\n\t"
            "psllq  $7, %%mm0\n\t"
            "psllq  $7, %%mm3\n\t"
            "pand   %%mm7, %%mm0\n\t"
            "pand   %%mm7, %%mm3\n\t"
            "psrlq  $6, %%mm1\n\t"
            "psrlq  $6, %%mm4\n\t"
            "pand   %%mm6, %%mm1\n\t"
            "pand   %%mm6, %%mm4\n\t"
            "psrlq  $19, %%mm2\n\t"
            "psrlq  $19, %%mm5\n\t"
            "pand   %2, %%mm2\n\t"
            "pand   %2, %%mm5\n\t"
            "por    %%mm1, %%mm0\n\t"
            "por    %%mm4, %%mm3\n\t"
            "por    %%mm2, %%mm0\n\t"
            "por    %%mm5, %%mm3\n\t"
            "psllq  $16, %%mm3\n\t"
            "por    %%mm3, %%mm0\n\t"
            MOVNTQ" %%mm0, %0\n\t"
            :"=m"(*out):"m"(*in),"m"(blue_15mask):"memory");
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
#endif
    /* Scalar path: one pixel per iteration. */
    for(; in < in_end; in += 3)
    {
        const int r = in[0];
        const int g = in[1];
        const int b = in[2];
        *out++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}
817
818 /*
  This code uses a less accurate approximation: it simply left-shifts the
  input value and fills the low-order bits with zeroes. This method improves
  PNG compression, but it cannot reproduce white exactly, since it does not
  generate an all-ones maximum value; the net effect is to darken the
  image slightly.
826
827   The better method should be "left bit replication":
828
829    4 3 2 1 0
830    ---------
831    1 1 0 1 1
832
833    7 6 5 4 3  2 1 0
834    ----------------
835    1 1 0 1 1  1 1 0
836    |=======|  |===|
837        |      Leftmost Bits Repeated to Fill Open Bits
838        |
839    Original Bits
840 */
841 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
842 {
843         const uint16_t *end;
844 #ifdef HAVE_MMX
845         const uint16_t *mm_end;
846 #endif
847         uint8_t *d = (uint8_t *)dst;
848         const uint16_t *s = (uint16_t *)src;
849         end = s + src_size/2;
850 #ifdef HAVE_MMX
851         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
852         mm_end = end - 7;
853         while(s < mm_end)
854         {
855             __asm __volatile(
856                 PREFETCH" 32%1\n\t"
857                 "movq   %1, %%mm0\n\t"
858                 "movq   %1, %%mm1\n\t"
859                 "movq   %1, %%mm2\n\t"
860                 "pand   %2, %%mm0\n\t"
861                 "pand   %3, %%mm1\n\t"
862                 "pand   %4, %%mm2\n\t"
863                 "psllq  $3, %%mm0\n\t"
864                 "psrlq  $2, %%mm1\n\t"
865                 "psrlq  $7, %%mm2\n\t"
866                 "movq   %%mm0, %%mm3\n\t"
867                 "movq   %%mm1, %%mm4\n\t"
868                 "movq   %%mm2, %%mm5\n\t"
869                 "punpcklwd %5, %%mm0\n\t"
870                 "punpcklwd %5, %%mm1\n\t"
871                 "punpcklwd %5, %%mm2\n\t"
872                 "punpckhwd %5, %%mm3\n\t"
873                 "punpckhwd %5, %%mm4\n\t"
874                 "punpckhwd %5, %%mm5\n\t"
875                 "psllq  $8, %%mm1\n\t"
876                 "psllq  $16, %%mm2\n\t"
877                 "por    %%mm1, %%mm0\n\t"
878                 "por    %%mm2, %%mm0\n\t"
879                 "psllq  $8, %%mm4\n\t"
880                 "psllq  $16, %%mm5\n\t"
881                 "por    %%mm4, %%mm3\n\t"
882                 "por    %%mm5, %%mm3\n\t"
883
884                 "movq   %%mm0, %%mm6\n\t"
885                 "movq   %%mm3, %%mm7\n\t"
886                 
887                 "movq   8%1, %%mm0\n\t"
888                 "movq   8%1, %%mm1\n\t"
889                 "movq   8%1, %%mm2\n\t"
890                 "pand   %2, %%mm0\n\t"
891                 "pand   %3, %%mm1\n\t"
892                 "pand   %4, %%mm2\n\t"
893                 "psllq  $3, %%mm0\n\t"
894                 "psrlq  $2, %%mm1\n\t"
895                 "psrlq  $7, %%mm2\n\t"
896                 "movq   %%mm0, %%mm3\n\t"
897                 "movq   %%mm1, %%mm4\n\t"
898                 "movq   %%mm2, %%mm5\n\t"
899                 "punpcklwd %5, %%mm0\n\t"
900                 "punpcklwd %5, %%mm1\n\t"
901                 "punpcklwd %5, %%mm2\n\t"
902                 "punpckhwd %5, %%mm3\n\t"
903                 "punpckhwd %5, %%mm4\n\t"
904                 "punpckhwd %5, %%mm5\n\t"
905                 "psllq  $8, %%mm1\n\t"
906                 "psllq  $16, %%mm2\n\t"
907                 "por    %%mm1, %%mm0\n\t"
908                 "por    %%mm2, %%mm0\n\t"
909                 "psllq  $8, %%mm4\n\t"
910                 "psllq  $16, %%mm5\n\t"
911                 "por    %%mm4, %%mm3\n\t"
912                 "por    %%mm5, %%mm3\n\t"
913
914                 :"=m"(*d)
915                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
916                 :"memory");
917             /* Borrowed 32 to 24 */
918             __asm __volatile(
919                 "movq   %%mm0, %%mm4\n\t"
920                 "movq   %%mm3, %%mm5\n\t"
921                 "movq   %%mm6, %%mm0\n\t"
922                 "movq   %%mm7, %%mm1\n\t"
923                 
924                 "movq   %%mm4, %%mm6\n\t"
925                 "movq   %%mm5, %%mm7\n\t"
926                 "movq   %%mm0, %%mm2\n\t"
927                 "movq   %%mm1, %%mm3\n\t"
928
929                 "psrlq  $8, %%mm2\n\t"
930                 "psrlq  $8, %%mm3\n\t"
931                 "psrlq  $8, %%mm6\n\t"
932                 "psrlq  $8, %%mm7\n\t"
933                 "pand   %2, %%mm0\n\t"
934                 "pand   %2, %%mm1\n\t"
935                 "pand   %2, %%mm4\n\t"
936                 "pand   %2, %%mm5\n\t"
937                 "pand   %3, %%mm2\n\t"
938                 "pand   %3, %%mm3\n\t"
939                 "pand   %3, %%mm6\n\t"
940                 "pand   %3, %%mm7\n\t"
941                 "por    %%mm2, %%mm0\n\t"
942                 "por    %%mm3, %%mm1\n\t"
943                 "por    %%mm6, %%mm4\n\t"
944                 "por    %%mm7, %%mm5\n\t"
945
946                 "movq   %%mm1, %%mm2\n\t"
947                 "movq   %%mm4, %%mm3\n\t"
948                 "psllq  $48, %%mm2\n\t"
949                 "psllq  $32, %%mm3\n\t"
950                 "pand   %4, %%mm2\n\t"
951                 "pand   %5, %%mm3\n\t"
952                 "por    %%mm2, %%mm0\n\t"
953                 "psrlq  $16, %%mm1\n\t"
954                 "psrlq  $32, %%mm4\n\t"
955                 "psllq  $16, %%mm5\n\t"
956                 "por    %%mm3, %%mm1\n\t"
957                 "pand   %6, %%mm5\n\t"
958                 "por    %%mm5, %%mm4\n\t"
959
960                 MOVNTQ" %%mm0, %0\n\t"
961                 MOVNTQ" %%mm1, 8%0\n\t"
962                 MOVNTQ" %%mm4, 16%0"
963
964                 :"=m"(*d)
965                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
966                 :"memory");
967                 d += 24;
968                 s += 8;
969         }
970         __asm __volatile(SFENCE:::"memory");
971         __asm __volatile(EMMS:::"memory");
972 #endif
973         while(s < end)
974         {
975                 register uint16_t bgr;
976                 bgr = *s++;
977                 *d++ = (bgr&0x1F)<<3;
978                 *d++ = (bgr&0x3E0)>>2;
979                 *d++ = (bgr&0x7C00)>>7;
980         }
981 }
982
/*
 * rgb16to24: expands 16-bit pixels to 3 bytes per pixel.
 *
 * For every 16-bit source word w the scalar loop stores, in this order,
 * (w & 0x001F) << 3, (w & 0x07E0) >> 3 and (w & 0xF800) >> 8.
 *
 * src      - source pixels, read as uint16_t words
 * dst      - destination buffer, 3 bytes are written per source word
 * src_size - size of the source in bytes (src_size/2 words are converted)
 *
 * With HAVE_MMX the bulk is converted 8 words (16 source bytes, 24
 * destination bytes) per iteration; the scalar loop at the end converts
 * whatever the MMX loop left over.
 */
983 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
984 {
985         const uint16_t *end;
986 #ifdef HAVE_MMX
987         const uint16_t *mm_end;
988 #endif
989         uint8_t *d = (uint8_t *)dst;
990         const uint16_t *s = (const uint16_t *)src;
            /* src_size counts bytes, s and end count 16-bit words */
991         end = s + src_size/2;
992 #ifdef HAVE_MMX
993         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
            /* the MMX loop is entered only while 8 whole words remain */
994         mm_end = end - 7;
995         while(s < mm_end)
996         {
                /*
                 * Step 1: isolate the three fields with the masks in %2..%4
                 * (mask16b/mask16g/mask16r, defined elsewhere; presumably the
                 * same fields as in the scalar loop), shift them by 3 left,
                 * 3 right and 8 right, and widen every pixel to 32 bits by
                 * interleaving with %5 (mmx_null, defined elsewhere;
                 * presumably all zero).
                 * Words 0-3 are parked in mm6/mm7, words 4-7 (loaded from
                 * 8%1) stay in mm0/mm3.
                 */
997             __asm __volatile(
998                 PREFETCH" 32%1\n\t"
999                 "movq   %1, %%mm0\n\t"
1000                 "movq   %1, %%mm1\n\t"
1001                 "movq   %1, %%mm2\n\t"
1002                 "pand   %2, %%mm0\n\t"
1003                 "pand   %3, %%mm1\n\t"
1004                 "pand   %4, %%mm2\n\t"
1005                 "psllq  $3, %%mm0\n\t"
1006                 "psrlq  $3, %%mm1\n\t"
1007                 "psrlq  $8, %%mm2\n\t"
1008                 "movq   %%mm0, %%mm3\n\t"
1009                 "movq   %%mm1, %%mm4\n\t"
1010                 "movq   %%mm2, %%mm5\n\t"
1011                 "punpcklwd %5, %%mm0\n\t"
1012                 "punpcklwd %5, %%mm1\n\t"
1013                 "punpcklwd %5, %%mm2\n\t"
1014                 "punpckhwd %5, %%mm3\n\t"
1015                 "punpckhwd %5, %%mm4\n\t"
1016                 "punpckhwd %5, %%mm5\n\t"
1017                 "psllq  $8, %%mm1\n\t"
1018                 "psllq  $16, %%mm2\n\t"
1019                 "por    %%mm1, %%mm0\n\t"
1020                 "por    %%mm2, %%mm0\n\t"
1021                 "psllq  $8, %%mm4\n\t"
1022                 "psllq  $16, %%mm5\n\t"
1023                 "por    %%mm4, %%mm3\n\t"
1024                 "por    %%mm5, %%mm3\n\t"
1025                 
1026                 "movq   %%mm0, %%mm6\n\t"
1027                 "movq   %%mm3, %%mm7\n\t"
1028
1029                 "movq   8%1, %%mm0\n\t"
1030                 "movq   8%1, %%mm1\n\t"
1031                 "movq   8%1, %%mm2\n\t"
1032                 "pand   %2, %%mm0\n\t"
1033                 "pand   %3, %%mm1\n\t"
1034                 "pand   %4, %%mm2\n\t"
1035                 "psllq  $3, %%mm0\n\t"
1036                 "psrlq  $3, %%mm1\n\t"
1037                 "psrlq  $8, %%mm2\n\t"
1038                 "movq   %%mm0, %%mm3\n\t"
1039                 "movq   %%mm1, %%mm4\n\t"
1040                 "movq   %%mm2, %%mm5\n\t"
1041                 "punpcklwd %5, %%mm0\n\t"
1042                 "punpcklwd %5, %%mm1\n\t"
1043                 "punpcklwd %5, %%mm2\n\t"
1044                 "punpckhwd %5, %%mm3\n\t"
1045                 "punpckhwd %5, %%mm4\n\t"
1046                 "punpckhwd %5, %%mm5\n\t"
1047                 "psllq  $8, %%mm1\n\t"
1048                 "psllq  $16, %%mm2\n\t"
1049                 "por    %%mm1, %%mm0\n\t"
1050                 "por    %%mm2, %%mm0\n\t"
1051                 "psllq  $8, %%mm4\n\t"
1052                 "psllq  $16, %%mm5\n\t"
1053                 "por    %%mm4, %%mm3\n\t"
1054                 "por    %%mm5, %%mm3\n\t"
1055                 :"=m"(*d)
1056                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)           
1057                 :"memory");
1058             /* Borrowed 32 to 24 */
                /*
                 * Step 2: consumes the MMX registers left behind by the asm
                 * statement above (mm6/mm7 and mm0/mm3; nothing is reloaded
                 * from memory) and stores three quadwords, i.e. 24 bytes, at d.
                 * MOVNTQ is "movntq" with HAVE_MMX2 and plain "movq" otherwise.
                 */
1059             __asm __volatile(
1060                 "movq   %%mm0, %%mm4\n\t"
1061                 "movq   %%mm3, %%mm5\n\t"
1062                 "movq   %%mm6, %%mm0\n\t"
1063                 "movq   %%mm7, %%mm1\n\t"
1064                 
1065                 "movq   %%mm4, %%mm6\n\t"
1066                 "movq   %%mm5, %%mm7\n\t"
1067                 "movq   %%mm0, %%mm2\n\t"
1068                 "movq   %%mm1, %%mm3\n\t"
1069
1070                 "psrlq  $8, %%mm2\n\t"
1071                 "psrlq  $8, %%mm3\n\t"
1072                 "psrlq  $8, %%mm6\n\t"
1073                 "psrlq  $8, %%mm7\n\t"
1074                 "pand   %2, %%mm0\n\t"
1075                 "pand   %2, %%mm1\n\t"
1076                 "pand   %2, %%mm4\n\t"
1077                 "pand   %2, %%mm5\n\t"
1078                 "pand   %3, %%mm2\n\t"
1079                 "pand   %3, %%mm3\n\t"
1080                 "pand   %3, %%mm6\n\t"
1081                 "pand   %3, %%mm7\n\t"
1082                 "por    %%mm2, %%mm0\n\t"
1083                 "por    %%mm3, %%mm1\n\t"
1084                 "por    %%mm6, %%mm4\n\t"
1085                 "por    %%mm7, %%mm5\n\t"
1086
1087                 "movq   %%mm1, %%mm2\n\t"
1088                 "movq   %%mm4, %%mm3\n\t"
1089                 "psllq  $48, %%mm2\n\t"
1090                 "psllq  $32, %%mm3\n\t"
1091                 "pand   %4, %%mm2\n\t"
1092                 "pand   %5, %%mm3\n\t"
1093                 "por    %%mm2, %%mm0\n\t"
1094                 "psrlq  $16, %%mm1\n\t"
1095                 "psrlq  $32, %%mm4\n\t"
1096                 "psllq  $16, %%mm5\n\t"
1097                 "por    %%mm3, %%mm1\n\t"
1098                 "pand   %6, %%mm5\n\t"
1099                 "por    %%mm5, %%mm4\n\t"
1100
1101                 MOVNTQ" %%mm0, %0\n\t"
1102                 MOVNTQ" %%mm1, 8%0\n\t"
1103                 MOVNTQ" %%mm4, 16%0"
1104
1105                 :"=m"(*d)
1106                 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1107                 :"memory");
                    /* 24 destination bytes written, 8 source words consumed */
1108                 d += 24;
1109                 s += 8;
1110         }
            /* SFENCE and EMMS expand to the instruction names chosen at the top of the file */
1111         __asm __volatile(SFENCE:::"memory");
1112         __asm __volatile(EMMS:::"memory");
1113 #endif
            /* scalar conversion of the remaining words (of all words without HAVE_MMX) */
1114         while(s < end)
1115         {
1116                 register uint16_t bgr;
1117                 bgr = *s++;
1118                 *d++ = (bgr&0x1F)<<3;
1119                 *d++ = (bgr&0x7E0)>>3;
1120                 *d++ = (bgr&0xF800)>>8;
1121         }
1122 }
1123
/*
 * rgb15to32: expands 16-bit pixels to 4 bytes per pixel.
 *
 * For every 16-bit source word w the scalar loop stores, in this order,
 * (w & 0x001F) << 3, (w & 0x03E0) >> 2, (w & 0x7C00) >> 7 and a 0 byte.
 *
 * src      - source pixels, read as uint16_t words
 * dst      - destination buffer, 4 bytes are written per source word
 * src_size - size of the source in bytes (src_size/2 words are converted)
 *
 * With HAVE_MMX the bulk is converted 4 words (8 source bytes, 16
 * destination bytes) per iteration; the scalar loop at the end converts
 * whatever the MMX loop left over.
 */
1124 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1125 {
1126         const uint16_t *end;
1127 #ifdef HAVE_MMX
1128         const uint16_t *mm_end;
1129 #endif
1130         uint8_t *d = (uint8_t *)dst;
1131         const uint16_t *s = (const uint16_t *)src;
             /* src_size counts bytes, s and end count 16-bit words */
1132         end = s + src_size/2;
1133 #ifdef HAVE_MMX
1134         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
             /*
              * mm7 is cleared once here and used as the zero operand of the
              * punpck instructions below; the loop body only reads it.
              */
1135         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
             /* the MMX loop is entered only while 4 whole words remain */
1136         mm_end = end - 3;
1137         while(s < mm_end)
1138         {
                 /*
                  * Isolate the three fields with the masks in %2..%4
                  * (mask15b/mask15g/mask15r, defined elsewhere), shift them by
                  * 3 left, 2 right and 7 right as the scalar loop does, widen
                  * each word to 32 bits against mm7 and store two quadwords.
                  * MOVNTQ is "movntq" with HAVE_MMX2 and plain "movq" otherwise.
                  */
1139             __asm __volatile(
1140                 PREFETCH" 32%1\n\t"
1141                 "movq   %1, %%mm0\n\t"
1142                 "movq   %1, %%mm1\n\t"
1143                 "movq   %1, %%mm2\n\t"
1144                 "pand   %2, %%mm0\n\t"
1145                 "pand   %3, %%mm1\n\t"
1146                 "pand   %4, %%mm2\n\t"
1147                 "psllq  $3, %%mm0\n\t"
1148                 "psrlq  $2, %%mm1\n\t"
1149                 "psrlq  $7, %%mm2\n\t"
1150                 "movq   %%mm0, %%mm3\n\t"
1151                 "movq   %%mm1, %%mm4\n\t"
1152                 "movq   %%mm2, %%mm5\n\t"
1153                 "punpcklwd %%mm7, %%mm0\n\t"
1154                 "punpcklwd %%mm7, %%mm1\n\t"
1155                 "punpcklwd %%mm7, %%mm2\n\t"
1156                 "punpckhwd %%mm7, %%mm3\n\t"
1157                 "punpckhwd %%mm7, %%mm4\n\t"
1158                 "punpckhwd %%mm7, %%mm5\n\t"
1159                 "psllq  $8, %%mm1\n\t"
1160                 "psllq  $16, %%mm2\n\t"
1161                 "por    %%mm1, %%mm0\n\t"
1162                 "por    %%mm2, %%mm0\n\t"
1163                 "psllq  $8, %%mm4\n\t"
1164                 "psllq  $16, %%mm5\n\t"
1165                 "por    %%mm4, %%mm3\n\t"
1166                 "por    %%mm5, %%mm3\n\t"
1167                 MOVNTQ" %%mm0, %0\n\t"
1168                 MOVNTQ" %%mm3, 8%0\n\t"
1169                 :"=m"(*d)
1170                 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1171                 :"memory");
                     /* 16 destination bytes written, 4 source words consumed */
1172                 d += 16;
1173                 s += 4;
1174         }
             /* SFENCE and EMMS expand to the instruction names chosen at the top of the file */
1175         __asm __volatile(SFENCE:::"memory");
1176         __asm __volatile(EMMS:::"memory");
1177 #endif
             /* scalar conversion of the remaining words (of all words without HAVE_MMX) */
1178         while(s < end)
1179         {
1180                 register uint16_t bgr;
1181                 bgr = *s++;
1182                 *d++ = (bgr&0x1F)<<3;
1183                 *d++ = (bgr&0x3E0)>>2;
1184                 *d++ = (bgr&0x7C00)>>7;
1185                 *d++ = 0;
1186         }
1187 }
1188
1189 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1190 {
1191         const uint16_t *end;
1192 #ifdef HAVE_MMX
1193         const uint16_t *mm_end;
1194 #endif
1195         uint8_t *d = (uint8_t *)dst;
1196         const uint16_t *s = (uint16_t *)src;
1197         end = s + src_size/2;
1198 #ifdef HAVE_MMX
1199         __asm __volatile(PREFETCH"      %0"::"m"(*s):"memory");
1200         __asm __volatile("pxor  %%mm7,%%mm7\n\t":::"memory");
1201         mm_end = end - 3;
1202         while(s < mm_end)
1203         {
1204             __asm __volatile(
1205                 PREFETCH" 32%1\n\t"
1206                 "movq   %1, %%mm0\n\t"
1207                 "movq   %1, %%mm1\n\t"
1208                 "movq   %1, %%mm2\n\t"
1209                 "pand   %2, %%mm0\n\t"
1210                 "pand   %3, %%mm1\n\t"
1211                 "pand   %4, %%mm2\n\t"
1212                 "psllq  $3, %%mm0\n\t"
1213                 "psrlq  $3, %%mm1\n\t"
1214                 "psrlq  $8, %%mm2\n\t"
1215                 "movq   %%mm0, %%mm3\n\t"
1216                 "movq   %%mm1, %%mm4\n\t"
1217                 "movq   %%mm2, %%mm5\n\t"
1218                 "punpcklwd %%mm7, %%mm0\n\t"
1219                 "punpcklwd %%mm7, %%mm1\n\t"
1220                 "punpcklwd %%mm7, %%mm2\n\t"
1221                 "punpckhwd %%mm7, %%mm3\n\t"
1222                 "punpckhwd %%mm7, %%mm4\n\t"
1223                 "punpckhwd %%mm7, %%mm5\n\t"
1224                 "psllq  $8, %%mm1\n\t"
1225                 "psllq  $16, %%mm2\n\t"
1226                 "por    %%mm1, %%mm0\n\t"
1227                 "por    %%mm2, %%mm0\n\t"
1228                 "psllq  $8, %%mm4\n\t"
1229                 "psllq  $16, %%mm5\n\t"
1230                 "por    %%mm4, %%mm3\n\t"
1231                 "por    %%mm5, %%mm3\n\t"
1232                 MOVNTQ" %%mm0, %0\n\t"
1233                 MOVNTQ" %%mm3, 8%0\n\t"
1234                 :"=m"(*d)
1235                 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1236                 :"memory");
1237                 d += 16;
1238                 s += 4;
1239         }
1240         __asm __volatile(SFENCE:::"memory");
1241         __asm __volatile(EMMS:::"memory");
1242 #endif
1243         while(s < end)
1244         {
1245                 register uint16_t bgr;
1246                 bgr = *s++;
1247                 *d++ = (bgr&0x1F)<<3;
1248                 *d++ = (bgr&0x7E0)>>3;
1249                 *d++ = (bgr&0xF800)>>8;
1250                 *d++ = 0;
1251         }
1252 }
1253
1254 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1255 {
1256 #ifdef HAVE_MMX
1257 /* TODO: unroll this loop */
1258         asm volatile (
1259                 "xorl %%eax, %%eax              \n\t"
1260                 ".balign 16                     \n\t"
1261                 "1:                             \n\t"
1262                 PREFETCH" 32(%0, %%eax)         \n\t"
1263                 "movq (%0, %%eax), %%mm0        \n\t"
1264                 "movq %%mm0, %%mm1              \n\t"
1265                 "movq %%mm0, %%mm2              \n\t"
1266                 "pslld $16, %%mm0               \n\t"
1267                 "psrld $16, %%mm1               \n\t"
1268                 "pand "MANGLE(mask32r)", %%mm0  \n\t"
1269                 "pand "MANGLE(mask32g)", %%mm2  \n\t"
1270                 "pand "MANGLE(mask32b)", %%mm1  \n\t"
1271                 "por %%mm0, %%mm2               \n\t"
1272                 "por %%mm1, %%mm2               \n\t"
1273                 MOVNTQ" %%mm2, (%1, %%eax)      \n\t"
1274                 "addl $8, %%eax                 \n\t"
1275                 "cmpl %2, %%eax                 \n\t"
1276                 " jb 1b                         \n\t"
1277                 :: "r" (src), "r"(dst), "r" (src_size-7)
1278                 : "%eax"
1279         );
1280
1281         __asm __volatile(SFENCE:::"memory");
1282         __asm __volatile(EMMS:::"memory");
1283 #else
1284         unsigned i;
1285         unsigned num_pixels = src_size >> 2;
1286         for(i=0; i<num_pixels; i++)
1287         {
1288                 dst[4*i + 0] = src[4*i + 2];
1289                 dst[4*i + 1] = src[4*i + 1];
1290                 dst[4*i + 2] = src[4*i + 0];
1291         }
1292 #endif
1293 }
1294
1295 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1296 {
1297         unsigned i;
1298 #ifdef HAVE_MMX
1299         int mmx_size= 23 - src_size;
1300         asm volatile (
1301                 "movq "MANGLE(mask24r)", %%mm5  \n\t"
1302                 "movq "MANGLE(mask24g)", %%mm6  \n\t"
1303                 "movq "MANGLE(mask24b)", %%mm7  \n\t"
1304                 ".balign 16                     \n\t"
1305                 "1:                             \n\t"
1306                 PREFETCH" 32(%1, %%eax)         \n\t"
1307                 "movq   (%1, %%eax), %%mm0      \n\t" // BGR BGR BG
1308                 "movq   (%1, %%eax), %%mm1      \n\t" // BGR BGR BG
1309                 "movq  2(%1, %%eax), %%mm2      \n\t" // R BGR BGR B
1310                 "psllq $16, %%mm0               \n\t" // 00 BGR BGR
1311                 "pand %%mm5, %%mm0              \n\t"
1312                 "pand %%mm6, %%mm1              \n\t"
1313                 "pand %%mm7, %%mm2              \n\t"
1314                 "por %%mm0, %%mm1               \n\t"
1315                 "por %%mm2, %%mm1               \n\t"                
1316                 "movq  6(%1, %%eax), %%mm0      \n\t" // BGR BGR BG
1317                 MOVNTQ" %%mm1,   (%2, %%eax)    \n\t" // RGB RGB RG
1318                 "movq  8(%1, %%eax), %%mm1      \n\t" // R BGR BGR B
1319                 "movq 10(%1, %%eax), %%mm2      \n\t" // GR BGR BGR
1320                 "pand %%mm7, %%mm0              \n\t"
1321                 "pand %%mm5, %%mm1              \n\t"
1322                 "pand %%mm6, %%mm2              \n\t"
1323                 "por %%mm0, %%mm1               \n\t"
1324                 "por %%mm2, %%mm1               \n\t"                
1325                 "movq 14(%1, %%eax), %%mm0      \n\t" // R BGR BGR B
1326                 MOVNTQ" %%mm1,  8(%2, %%eax)    \n\t" // B RGB RGB R
1327                 "movq 16(%1, %%eax), %%mm1      \n\t" // GR BGR BGR
1328                 "movq 18(%1, %%eax), %%mm2      \n\t" // BGR BGR BG
1329                 "pand %%mm6, %%mm0              \n\t"
1330                 "pand %%mm7, %%mm1              \n\t"
1331                 "pand %%mm5, %%mm2              \n\t"
1332                 "por %%mm0, %%mm1               \n\t"
1333                 "por %%mm2, %%mm1               \n\t"                
1334                 MOVNTQ" %%mm1, 16(%2, %%eax)    \n\t"
1335                 "addl $24, %%eax                \n\t"
1336                 " js 1b                         \n\t"
1337                 : "+a" (mmx_size)
1338                 : "r" (src-mmx_size), "r"(dst-mmx_size)
1339         );
1340
1341         __asm __volatile(SFENCE:::"memory");
1342         __asm __volatile(EMMS:::"memory");
1343
1344         if(mmx_size==23) return; //finihsed, was multiple of 8
1345
1346         src+= src_size;
1347         dst+= src_size;
1348         src_size= 23-mmx_size;
1349         src-= src_size;
1350         dst-= src_size;
1351 #endif
1352         for(i=0; i<src_size; i+=3)
1353         {
1354                 register uint8_t x;
1355                 x          = src[i + 2];
1356                 dst[i + 1] = src[i + 1];
1357                 dst[i + 2] = src[i + 0];
1358                 dst[i + 0] = x;
1359         }
1360 }
1361
1362 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1363         unsigned int width, unsigned int height,
1364         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
1365 {
1366         unsigned y;
1367         const unsigned chromWidth= width>>1;
1368         for(y=0; y<height; y++)
1369         {
1370 #ifdef HAVE_MMX
1371 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1372                 asm volatile(
1373                         "xorl %%eax, %%eax              \n\t"
1374                         ".balign 16                     \n\t"
1375                         "1:                             \n\t"
1376                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
1377                         PREFETCH" 32(%2, %%eax)         \n\t"
1378                         PREFETCH" 32(%3, %%eax)         \n\t"
1379                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
1380                         "movq %%mm0, %%mm2              \n\t" // U(0)
1381                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
1382                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
1383                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
1384
1385                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
1386                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
1387                         "movq %%mm3, %%mm4              \n\t" // Y(0)
1388                         "movq %%mm5, %%mm6              \n\t" // Y(8)
1389                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
1390                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
1391                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
1392                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
1393
1394                         MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
1395                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
1396                         MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
1397                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1398
1399                         "addl $8, %%eax                 \n\t"
1400                         "cmpl %4, %%eax                 \n\t"
1401                         " jb 1b                         \n\t"
1402                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
1403                         : "%eax"
1404                 );
1405 #else
1406 #if __WORDSIZE >= 64
1407                 int i;
1408                 uint64_t *ldst = (uint64_t *) dst;
1409                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1410                 for(i = 0; i < chromWidth; i += 2){
1411                         uint64_t k, l;
1412                         k = yc[0] + (uc[0] << 8) +
1413                             (yc[1] << 16) + (vc[0] << 24);
1414                         l = yc[2] + (uc[1] << 8) +
1415                             (yc[3] << 16) + (vc[1] << 24);
1416                         *ldst++ = k + (l << 32);
1417                         yc += 4;
1418                         uc += 2;
1419                         vc += 2;
1420                 }
1421
1422 #else
1423                 int i, *idst = (int32_t *) dst;
1424                 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1425                 for(i = 0; i < chromWidth; i++){
1426                         *idst++ = yc[0] + (uc[0] << 8) +
1427                             (yc[1] << 16) + (vc[0] << 24);
1428                         yc += 2;
1429                         uc++;
1430                         vc++;
1431                 }
1432 #endif
1433 #endif
1434                 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1435                 {
1436                         usrc += chromStride;
1437                         vsrc += chromStride;
1438                 }
1439                 ysrc += lumStride;
1440                 dst += dstStride;
1441         }
1442 #ifdef HAVE_MMX
1443 asm(    EMMS" \n\t"
1444         SFENCE" \n\t"
1445         :::"memory");
1446 #endif
1447 }
1448
1449 /**
1450  *
1451  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1452  * problem for anyone then tell me, and ill fix it)
1453  */
1454 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1455         unsigned int width, unsigned int height,
1456         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
1457 {
1458         //FIXME interpolate chroma
1459         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1460 }
1461
1462 /**
1463  *
1464  * width should be a multiple of 16
1465  */
1466 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1467         unsigned int width, unsigned int height,
1468         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
1469 {
1470         RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1471 }
1472
1473 /**
1474  *
1475  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1476  * problem for anyone then tell me, and I'll fix it)
 *
 * Splits packed 4-byte groups (Y0 U Y1 V, as read by the scalar loop) into
 * separate Y, U and V planes.  Rows are handled in pairs: the first row of
 * a pair delivers luma and chroma, the second row delivers luma only and
 * its chroma bytes are dropped.
 *
 * src         - packed source
 * ydst        - luma destination, `width` bytes per row
 * udst, vdst  - chroma destinations, width/2 bytes per row pair
 * width       - luma samples per row
 * height      - number of source rows; consumed two at a time
 * lumStride   - byte step between rows of ydst
 * chromStride - byte step between rows of udst and vdst
 * srcStride   - byte step between rows of src
 *
 * Each MMX loop converts 8 chroma samples (16 luma samples) per step and
 * checks its bound only after a step.
1477  */
1478 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1479         unsigned int width, unsigned int height,
1480         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1481 {
1482         unsigned y;
1483         const unsigned chromWidth= width>>1;
             /* two source rows per iteration */
1484         for(y=0; y<height; y+=2)
1485         {
1486 #ifdef HAVE_MMX
                     /*
                      * First row of the pair: luma to ydst, chroma to udst/vdst.
                      * mm7 is set up here as the low-byte mask of every word and
                      * is used again, without being reloaded, by the second asm
                      * statement below.
                      */
1487                 asm volatile(
1488                         "xorl %%eax, %%eax              \n\t"
1489                         "pcmpeqw %%mm7, %%mm7           \n\t"
1490                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1491                         ".balign 16                     \n\t"
1492                         "1:                             \n\t"
1493                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1494                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
1495                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
1496                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
1497                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
1498                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
1499                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
1500                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
1501                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
1502                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1503                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1504
1505                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
1506
1507                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
1508                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
1509                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
1510                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
1511                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
1512                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
1513                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
1514                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
1515                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1516                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1517
1518                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
1519
1520                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1521                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1522                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1523                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1524                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1525                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1526                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1527                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1528
1529                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
1530                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
1531
1532                         "addl $8, %%eax                 \n\t"
1533                         "cmpl %4, %%eax                 \n\t"
1534                         " jb 1b                         \n\t"
1535                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1536                         : "memory", "%eax"
1537                 );
1538
                     /* move on to the second row of the pair */
1539                 ydst += lumStride;
1540                 src  += srcStride;
1541
                     /*
                      * Second row of the pair: only the luma bytes are kept.
                      * Relies on mm7 still holding the mask built above.
                      */
1542                 asm volatile(
1543                         "xorl %%eax, %%eax              \n\t"
1544                         ".balign 16                     \n\t"
1545                         "1:                             \n\t"
1546                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1547                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
1548                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
1549                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
1550                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
1551                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
1552                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
1553                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
1554                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
1555                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1556                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1557
1558                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
1559                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
1560
1561                         "addl $8, %%eax                 \n\t"
1562                         "cmpl %4, %%eax                 \n\t"
1563                         " jb 1b                         \n\t"
1564
1565                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1566                         : "memory", "%eax"
1567                 );
1568 #else
1569                 unsigned i;
                     /* first row of the pair: luma and chroma */
1570                 for(i=0; i<chromWidth; i++)
1571                 {
1572                         ydst[2*i+0]     = src[4*i+0];
1573                         udst[i]         = src[4*i+1];
1574                         ydst[2*i+1]     = src[4*i+2];
1575                         vdst[i]         = src[4*i+3];
1576                 }
1577                 ydst += lumStride;
1578                 src  += srcStride;
1579
                     /* second row of the pair: luma only */
1580                 for(i=0; i<chromWidth; i++)
1581                 {
1582                         ydst[2*i+0]     = src[4*i+0];
1583                         ydst[2*i+1]     = src[4*i+2];
1584                 }
1585 #endif
                     /* one chroma row and the second luma/source row are done */
1586                 udst += chromStride;
1587                 vdst += chromStride;
1588                 ydst += lumStride;
1589                 src  += srcStride;
1590         }
1591 #ifdef HAVE_MMX
1592 asm volatile(   EMMS" \n\t"
1593                 SFENCE" \n\t"
1594                 :::"memory");
1595 #endif
1596 }
1597
1598 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1599         uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1600         unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
1601 {
1602         /* Y Plane */
1603         memcpy(ydst, ysrc, width*height);
1604
1605         /* XXX: implement upscaling for U,V */
1606 }
1607
/**
 * Upscales one 8-bit plane by a factor of two in both directions.
 *
 * The destination receives 2*srcWidth bytes per row and 2*srcHeight rows:
 * the first output row replicates the first source row, the last output row
 * replicates the last source row, and every pair of rows in between is
 * interpolated from two neighbouring source rows with 3:1 weights.
 * The first and last column of the interpolated rows are plain copies.
 *
 * @param src        source plane
 * @param dst        destination plane
 * @param srcWidth   number of bytes per source row
 * @param srcHeight  number of source rows
 * @param srcStride  distance in bytes between two source rows
 * @param dstStride  distance in bytes between two destination rows
 *
 * NOTE(review): the MMX2/3DNow loop starts loading at src[-1] and works in
 * steps of 8 source bytes, so it presumably expects srcWidth to be a multiple
 * of 8 and one readable byte in front of every row -- TODO confirm.
 */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
        int x,y;

        // first line: horizontal pixel doubling only
        for(x=0; x<srcWidth; x++){
                dst[2*x+0]=
                dst[2*x+1]= src[x];
        }
        dst+= dstStride;

        // each iteration blends source rows y-1 and y into two output rows
        for(y=1; y<srcHeight; y++){
#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
                const int mmxSize= srcWidth;
                asm volatile(
                        "movl %4, %%eax                 \n\t"
                        "1:                             \n\t"
                        "movq (%0, %%eax), %%mm0        \n\t"
                        "movq (%1, %%eax), %%mm1        \n\t"
                        "movq 1(%0, %%eax), %%mm2       \n\t"
                        "movq 1(%1, %%eax), %%mm3       \n\t"
                        "movq %%mm0, %%mm4              \n\t"
                        "movq %%mm1, %%mm5              \n\t"
                        // averaging twice with the same operand yields the 1:3 weighting
                        PAVGB" %%mm3, %%mm0             \n\t"
                        PAVGB" %%mm3, %%mm0             \n\t"
                        PAVGB" %%mm4, %%mm3             \n\t"
                        PAVGB" %%mm4, %%mm3             \n\t"
                        PAVGB" %%mm2, %%mm1             \n\t"
                        PAVGB" %%mm2, %%mm1             \n\t"
                        PAVGB" %%mm5, %%mm2             \n\t"
                        PAVGB" %%mm5, %%mm2             \n\t"
                        "movq %%mm3, %%mm4              \n\t"
                        "movq %%mm2, %%mm5              \n\t"
                        "punpcklbw %%mm1, %%mm3         \n\t"
                        "punpckhbw %%mm1, %%mm4         \n\t"
                        "punpcklbw %%mm0, %%mm2         \n\t"
                        "punpckhbw %%mm0, %%mm5         \n\t"
#if 1
                        MOVNTQ" %%mm3, (%2, %%eax, 2)   \n\t"
                        MOVNTQ" %%mm4, 8(%2, %%eax, 2)  \n\t"
                        MOVNTQ" %%mm2, (%3, %%eax, 2)   \n\t"
                        MOVNTQ" %%mm5, 8(%3, %%eax, 2)  \n\t"
#else
                        "movq %%mm3, (%2, %%eax, 2)     \n\t"
                        "movq %%mm4, 8(%2, %%eax, 2)    \n\t"
                        "movq %%mm2, (%3, %%eax, 2)     \n\t"
                        "movq %%mm5, 8(%3, %%eax, 2)    \n\t"
#endif
                        "addl $8, %%eax                 \n\t"
                        " js 1b                         \n\t"
                        :: "r" (src + mmxSize-1), "r" (src + srcStride + mmxSize-1),
                           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
                           "g" (-mmxSize)
                        // "memory": the asm reads src and stores to dst through pointer
                        // operands, and the C stores to dst[0] below must stay after it
                        : "%eax", "memory"

                );
                dst[0]=
                dst[dstStride]= src[0];
#else
                dst[0]=
                dst[dstStride]= src[0];

                for(x=0; x<srcWidth-1; x++){
                        dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
                        dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
                        dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
                        dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
                }
#endif
                dst[srcWidth*2 -1]=
                dst[srcWidth*2 -1 + dstStride]= src[srcWidth-1];

                dst+=dstStride*2;
                src+=srcStride;
        }

        // last line: src already points at the final source row here (it was
        // advanced srcHeight-1 times), so it must not be stepped back
        for(x=0; x<srcWidth; x++){
                dst[2*x+0]=
                dst[2*x+1]= src[x];
        }
#ifdef HAVE_MMX
asm volatile(   EMMS" \n\t"
                SFENCE" \n\t"
                :::"memory");
#endif
}
1696
1697 /**
1698  * Converts packed UYVY data (byte order U Y0 V Y1) into separate Y, U and V planes.
1699  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1700  * problem for anyone then tell me, and I'll fix it)
1701  * chrominance data is only taken from every second line, others are ignored FIXME write HQ version
      *
      * Rows are processed in pairs: the first row of a pair yields Y, U and V, the
      * second row yields Y only. src advances by srcStride and ydst by lumStride
      * per row; udst and vdst advance by chromStride per row pair.
      * The MMX loops consume 32 source bytes (16 pixels) per iteration.
1702  */
1703 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1704         unsigned int width, unsigned int height,
1705         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1706 {
1707         unsigned y;
1708         const unsigned chromWidth= width>>1; // one U and one V sample per two luma samples
1709         for(y=0; y<height; y+=2) // two source rows per iteration
1710         {
1711 #ifdef HAVE_MMX
                 /* first row of the pair: split into Y, U and V; eax counts chroma samples */
1712                 asm volatile(
1713                         "xorl %%eax, %%eax              \n\t"
1714                         "pcmpeqw %%mm7, %%mm7           \n\t"
1715                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
1716                         ".balign 16                     \n\t"
1717                         "1:                             \n\t"
1718                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1719                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1720                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1721                         "movq %%mm0, %%mm2              \n\t" // UYVY UYVY(0)
1722                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(4)
1723                         "pand %%mm7, %%mm0              \n\t" // U0V0 U0V0(0)
1724                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(4)
1725                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(0)
1726                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(4)
1727                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
1728                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
1729
1730                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
1731
1732                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // UYVY UYVY(8)
1733                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(12)
1734                         "movq %%mm1, %%mm3              \n\t" // UYVY UYVY(8)
1735                         "movq %%mm2, %%mm4              \n\t" // UYVY UYVY(12)
1736                         "pand %%mm7, %%mm1              \n\t" // U0V0 U0V0(8)
1737                         "pand %%mm7, %%mm2              \n\t" // U0V0 U0V0(12)
1738                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(8)
1739                         "psrlw $8, %%mm4                \n\t" // Y0Y0 Y0Y0(12)
1740                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
1741                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
1742
1743                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
1744
1745                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
1746                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
1747                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
1748                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
1749                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
1750                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
1751                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
1752                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
1753
1754                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
1755                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
1756
1757                         "addl $8, %%eax                 \n\t" // 8 chroma samples = 32 source bytes done
1758                         "cmpl %4, %%eax                 \n\t"
1759                         " jb 1b                         \n\t"
1760                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1761                         : "memory", "%eax"
1762                 );
1763
1764                 ydst += lumStride;
1765                 src  += srcStride;
1766
                 /* second row of the pair: only the luma bytes are extracted */
1767                 asm volatile(
1768                         "xorl %%eax, %%eax              \n\t"
1769                         ".balign 16                     \n\t"
1770                         "1:                             \n\t"
1771                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
1772                         "movq (%0, %%eax, 4), %%mm0     \n\t" // UYVY UYVY(0)
1773                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // UYVY UYVY(4)
1774                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // UYVY UYVY(8)
1775                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // UYVY UYVY(12)
1776                         "psrlw $8, %%mm0                \n\t" // Y0Y0 Y0Y0(0)
1777                         "psrlw $8, %%mm1                \n\t" // Y0Y0 Y0Y0(4)
1778                         "psrlw $8, %%mm2                \n\t" // Y0Y0 Y0Y0(8)
1779                         "psrlw $8, %%mm3                \n\t" // Y0Y0 Y0Y0(12)
1780                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
1781                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
1782
1783                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
1784                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
1785
1786                         "addl $8, %%eax                 \n\t"
1787                         "cmpl %4, %%eax                 \n\t"
1788                         " jb 1b                         \n\t"
1789
1790                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1791                         : "memory", "%eax"
1792                 );
1793 #else
1794                 unsigned i;
1795                 for(i=0; i<chromWidth; i++) // first row of the pair: Y, U and V
1796                 {
1797                         udst[i]         = src[4*i+0];
1798                         ydst[2*i+0]     = src[4*i+1];
1799                         vdst[i]         = src[4*i+2];
1800                         ydst[2*i+1]     = src[4*i+3];
1801                 }
1802                 ydst += lumStride;
1803                 src  += srcStride;
1804
1805                 for(i=0; i<chromWidth; i++) // second row of the pair: Y only, chroma bytes are skipped
1806                 {
1807                         ydst[2*i+0]     = src[4*i+1];
1808                         ydst[2*i+1]     = src[4*i+3];
1809                 }
1810 #endif
1811                 udst += chromStride;
1812                 vdst += chromStride;
1813                 ydst += lumStride;
1814                 src  += srcStride;
1815         }
1816 #ifdef HAVE_MMX
1817 asm volatile(   EMMS" \n\t"
1818                 SFENCE" \n\t"
1819                 :::"memory");
1820 #endif
1821 }
1822
1823 /**
1824  *
1825  * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
1826  * problem for anyone then tell me, and ill fix it)
1827  * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
1828  */
1829 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1830         unsigned int width, unsigned int height,
1831         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1832 {
1833         unsigned y;
1834         const unsigned chromWidth= width>>1;
1835 #ifdef HAVE_MMX
1836         for(y=0; y<height-2; y+=2)
1837         {
1838                 unsigned i;
1839                 for(i=0; i<2; i++)
1840                 {
1841                         asm volatile(
1842                                 "movl %2, %%eax                 \n\t"
1843                                 "movq "MANGLE(bgr2YCoeff)", %%mm6               \n\t"
1844                                 "movq "MANGLE(w1111)", %%mm5            \n\t"
1845                                 "pxor %%mm7, %%mm7              \n\t"
1846                                 "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1847                                 ".balign 16                     \n\t"
1848                                 "1:                             \n\t"
1849                                 PREFETCH" 64(%0, %%ebx)         \n\t"
1850                                 "movd (%0, %%ebx), %%mm0        \n\t"
1851                                 "movd 3(%0, %%ebx), %%mm1       \n\t"
1852                                 "punpcklbw %%mm7, %%mm0         \n\t"
1853                                 "punpcklbw %%mm7, %%mm1         \n\t"
1854                                 "movd 6(%0, %%ebx), %%mm2       \n\t"
1855                                 "movd 9(%0, %%ebx), %%mm3       \n\t"
1856                                 "punpcklbw %%mm7, %%mm2         \n\t"
1857                                 "punpcklbw %%mm7, %%mm3         \n\t"
1858                                 "pmaddwd %%mm6, %%mm0           \n\t"
1859                                 "pmaddwd %%mm6, %%mm1           \n\t"
1860                                 "pmaddwd %%mm6, %%mm2           \n\t"
1861                                 "pmaddwd %%mm6, %%mm3           \n\t"
1862 #ifndef FAST_BGR2YV12
1863                                 "psrad $8, %%mm0                \n\t"
1864                                 "psrad $8, %%mm1                \n\t"
1865                                 "psrad $8, %%mm2                \n\t"
1866                                 "psrad $8, %%mm3                \n\t"
1867 #endif
1868                                 "packssdw %%mm1, %%mm0          \n\t"
1869                                 "packssdw %%mm3, %%mm2          \n\t"
1870                                 "pmaddwd %%mm5, %%mm0           \n\t"
1871                                 "pmaddwd %%mm5, %%mm2           \n\t"
1872                                 "packssdw %%mm2, %%mm0          \n\t"
1873                                 "psraw $7, %%mm0                \n\t"
1874
1875                                 "movd 12(%0, %%ebx), %%mm4      \n\t"
1876                                 "movd 15(%0, %%ebx), %%mm1      \n\t"
1877                                 "punpcklbw %%mm7, %%mm4         \n\t"
1878                                 "punpcklbw %%mm7, %%mm1         \n\t"
1879                                 "movd 18(%0, %%ebx), %%mm2      \n\t"
1880                                 "movd 21(%0, %%ebx), %%mm3      \n\t"
1881                                 "punpcklbw %%mm7, %%mm2         \n\t"
1882                                 "punpcklbw %%mm7, %%mm3         \n\t"
1883                                 "pmaddwd %%mm6, %%mm4           \n\t"
1884                                 "pmaddwd %%mm6, %%mm1           \n\t"
1885                                 "pmaddwd %%mm6, %%mm2           \n\t"
1886                                 "pmaddwd %%mm6, %%mm3           \n\t"
1887 #ifndef FAST_BGR2YV12
1888                                 "psrad $8, %%mm4                \n\t"
1889                                 "psrad $8, %%mm1                \n\t"
1890                                 "psrad $8, %%mm2                \n\t"
1891                                 "psrad $8, %%mm3                \n\t"
1892 #endif
1893                                 "packssdw %%mm1, %%mm4          \n\t"
1894                                 "packssdw %%mm3, %%mm2          \n\t"
1895                                 "pmaddwd %%mm5, %%mm4           \n\t"
1896                                 "pmaddwd %%mm5, %%mm2           \n\t"
1897                                 "addl $24, %%ebx                \n\t"
1898                                 "packssdw %%mm2, %%mm4          \n\t"
1899                                 "psraw $7, %%mm4                \n\t"
1900
1901                                 "packuswb %%mm4, %%mm0          \n\t"
1902                                 "paddusb "MANGLE(bgr2YOffset)", %%mm0   \n\t"
1903
1904                                 MOVNTQ" %%mm0, (%1, %%eax)      \n\t"
1905                                 "addl $8, %%eax                 \n\t"
1906                                 " js 1b                         \n\t"
1907                                 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
1908                                 : "%eax", "%ebx"
1909                         );
1910                         ydst += lumStride;
1911                         src  += srcStride;
1912                 }
1913                 src -= srcStride*2;
1914                 asm volatile(
1915                         "movl %4, %%eax                 \n\t"
1916                         "movq "MANGLE(w1111)", %%mm5            \n\t"
1917                         "movq "MANGLE(bgr2UCoeff)", %%mm6               \n\t"
1918                         "pxor %%mm7, %%mm7              \n\t"
1919                         "leal (%%eax, %%eax, 2), %%ebx  \n\t"
1920                         "addl %%ebx, %%ebx              \n\t"
1921                         ".balign 16                     \n\t"
1922                         "1:                             \n\t"
1923                         PREFETCH" 64(%0, %%ebx)         \n\t"
1924                         PREFETCH" 64(%1, %%ebx)         \n\t"
1925 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1926                         "movq (%0, %%ebx), %%mm0        \n\t"
1927                         "movq (%1, %%ebx), %%mm1        \n\t"
1928                         "movq 6(%0, %%ebx), %%mm2       \n\t"
1929                         "movq 6(%1, %%ebx), %%mm3       \n\t"
1930                         PAVGB" %%mm1, %%mm0             \n\t"
1931                         PAVGB" %%mm3, %%mm2             \n\t"
1932                         "movq %%mm0, %%mm1              \n\t"
1933                         "movq %%mm2, %%mm3              \n\t"
1934                         "psrlq $24, %%mm0               \n\t"
1935                         "psrlq $24, %%mm2               \n\t"
1936                         PAVGB" %%mm1, %%mm0             \n\t"
1937                         PAVGB" %%mm3, %%mm2             \n\t"
1938                         "punpcklbw %%mm7, %%mm0         \n\t"
1939                         "punpcklbw %%mm7, %%mm2         \n\t"
1940 #else
1941                         "movd (%0, %%ebx), %%mm0        \n\t"
1942                         "movd (%1, %%ebx), %%mm1        \n\t"
1943                         "movd 3(%0, %%ebx), %%mm2       \n\t"
1944                         "movd 3(%1, %%ebx), %%mm3       \n\t"
1945                         "punpcklbw %%mm7, %%mm0         \n\t"
1946                         "punpcklbw %%mm7, %%mm1         \n\t"
1947                         "punpcklbw %%mm7, %%mm2         \n\t"
1948                         "punpcklbw %%mm7, %%mm3         \n\t"
1949                         "paddw %%mm1, %%mm0             \n\t"
1950                         "paddw %%mm3, %%mm2             \n\t"
1951                         "paddw %%mm2, %%mm0             \n\t"
1952                         "movd 6(%0, %%ebx), %%mm4       \n\t"
1953                         "movd 6(%1, %%ebx), %%mm1       \n\t"
1954                         "movd 9(%0, %%ebx), %%mm2       \n\t"
1955                         "movd 9(%1, %%ebx), %%mm3       \n\t"
1956                         "punpcklbw %%mm7, %%mm4         \n\t"
1957                         "punpcklbw %%mm7, %%mm1         \n\t"
1958                         "punpcklbw %%mm7, %%mm2         \n\t"
1959                         "punpcklbw %%mm7, %%mm3         \n\t"
1960                         "paddw %%mm1, %%mm4             \n\t"
1961                         "paddw %%mm3, %%mm2             \n\t"
1962                         "paddw %%mm4, %%mm2             \n\t"
1963                         "psrlw $2, %%mm0                \n\t"
1964                         "psrlw $2, %%mm2                \n\t"
1965 #endif
1966                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
1967                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
1968
1969                         "pmaddwd %%mm0, %%mm1           \n\t"
1970                         "pmaddwd %%mm2, %%mm3           \n\t"
1971                         "pmaddwd %%mm6, %%mm0           \n\t"
1972                         "pmaddwd %%mm6, %%mm2           \n\t"
1973 #ifndef FAST_BGR2YV12
1974                         "psrad $8, %%mm0                \n\t"
1975                         "psrad $8, %%mm1                \n\t"
1976                         "psrad $8, %%mm2                \n\t"
1977                         "psrad $8, %%mm3                \n\t"
1978 #endif
1979                         "packssdw %%mm2, %%mm0          \n\t"
1980                         "packssdw %%mm3, %%mm1          \n\t"
1981                         "pmaddwd %%mm5, %%mm0           \n\t"
1982                         "pmaddwd %%mm5, %%mm1           \n\t"
1983                         "packssdw %%mm1, %%mm0          \n\t" // V1 V0 U1 U0
1984                         "psraw $7, %%mm0                \n\t"
1985
1986 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1987                         "movq 12(%0, %%ebx), %%mm4      \n\t"
1988                         "movq 12(%1, %%ebx), %%mm1      \n\t"
1989                         "movq 18(%0, %%ebx), %%mm2      \n\t"
1990                         "movq 18(%1, %%ebx), %%mm3      \n\t"
1991                         PAVGB" %%mm1, %%mm4             \n\t"
1992                         PAVGB" %%mm3, %%mm2             \n\t"
1993                         "movq %%mm4, %%mm1              \n\t"
1994                         "movq %%mm2, %%mm3              \n\t"
1995                         "psrlq $24, %%mm4               \n\t"
1996                         "psrlq $24, %%mm2               \n\t"
1997                         PAVGB" %%mm1, %%mm4             \n\t"
1998                         PAVGB" %%mm3, %%mm2             \n\t"
1999                         "punpcklbw %%mm7, %%mm4         \n\t"
2000                         "punpcklbw %%mm7, %%mm2         \n\t"
2001 #else
2002                         "movd 12(%0, %%ebx), %%mm4      \n\t"
2003                         "movd 12(%1, %%ebx), %%mm1      \n\t"
2004                         "movd 15(%0, %%ebx), %%mm2      \n\t"
2005                         "movd 15(%1, %%ebx), %%mm3      \n\t"
2006                         "punpcklbw %%mm7, %%mm4         \n\t"
2007                         "punpcklbw %%mm7, %%mm1         \n\t"
2008                         "punpcklbw %%mm7, %%mm2         \n\t"
2009                         "punpcklbw %%mm7, %%mm3         \n\t"
2010                         "paddw %%mm1, %%mm4             \n\t"
2011                         "paddw %%mm3, %%mm2             \n\t"
2012                         "paddw %%mm2, %%mm4             \n\t"
2013                         "movd 18(%0, %%ebx), %%mm5      \n\t"
2014                         "movd 18(%1, %%ebx), %%mm1      \n\t"
2015                         "movd 21(%0, %%ebx), %%mm2      \n\t"
2016                         "movd 21(%1, %%ebx), %%mm3      \n\t"
2017                         "punpcklbw %%mm7, %%mm5         \n\t"
2018                         "punpcklbw %%mm7, %%mm1         \n\t"
2019                         "punpcklbw %%mm7, %%mm2         \n\t"
2020                         "punpcklbw %%mm7, %%mm3         \n\t"
2021                         "paddw %%mm1, %%mm5             \n\t"
2022                         "paddw %%mm3, %%mm2             \n\t"
2023                         "paddw %%mm5, %%mm2             \n\t"
2024                         "movq "MANGLE(w1111)", %%mm5            \n\t"
2025                         "psrlw $2, %%mm4                \n\t"
2026                         "psrlw $2, %%mm2                \n\t"
2027 #endif
2028                         "movq "MANGLE(bgr2VCoeff)", %%mm1               \n\t"
2029                         "movq "MANGLE(bgr2VCoeff)", %%mm3               \n\t"
2030
2031                         "pmaddwd %%mm4, %%mm1           \n\t"
2032                         "pmaddwd %%mm2, %%mm3           \n\t"
2033                         "pmaddwd %%mm6, %%mm4           \n\t"
2034                         "pmaddwd %%mm6, %%mm2           \n\t"
2035 #ifndef FAST_BGR2YV12
2036                         "psrad $8, %%mm4                \n\t"
2037                         "psrad $8, %%mm1                \n\t"
2038                         "psrad $8, %%mm2                \n\t"
2039                         "psrad $8, %%mm3                \n\t"
2040 #endif
2041                         "packssdw %%mm2, %%mm4          \n\t"
2042                         "packssdw %%mm3, %%mm1          \n\t"
2043                         "pmaddwd %%mm5, %%mm4           \n\t"
2044                         "pmaddwd %%mm5, %%mm1           \n\t"
2045                         "addl $24, %%ebx                \n\t"
2046                         "packssdw %%mm1, %%mm4          \n\t" // V3 V2 U3 U2
2047                         "psraw $7, %%mm4                \n\t"
2048
2049                         "movq %%mm0, %%mm1              \n\t"
2050                         "punpckldq %%mm4, %%mm0         \n\t"
2051                         "punpckhdq %%mm4, %%mm1         \n\t"
2052                         "packsswb %%mm1, %%mm0          \n\t"
2053                         "paddb "MANGLE(bgr2UVOffset)", %%mm0    \n\t"
2054
2055                         "movd %%mm0, (%2, %%eax)        \n\t"
2056                         "punpckhdq %%mm0, %%mm0         \n\t"
2057                         "movd %%mm0, (%3, %%eax)        \n\t"
2058                         "addl $4, %%eax                 \n\t"
2059                         " js 1b                         \n\t"
2060                         : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2061                         : "%eax", "%ebx"
2062                 );
2063
2064                 udst += chromStride;
2065                 vdst += chromStride;
2066                 src  += srcStride*2;
2067         }
2068
2069         asm volatile(   EMMS" \n\t"
2070                         SFENCE" \n\t"
2071                         :::"memory");
2072 #else
2073         y=0;
2074 #endif
2075         for(; y<height; y+=2)
2076         {
2077                 unsigned i;
2078                 for(i=0; i<chromWidth; i++)
2079                 {
2080                         unsigned int b= src[6*i+0];
2081                         unsigned int g= src[6*i+1];
2082                         unsigned int r= src[6*i+2];
2083
2084                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2085                         unsigned int V  =  ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2086                         unsigned int U  =  ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2087
2088                         udst[i]         = U;
2089                         vdst[i]         = V;
2090                         ydst[2*i]       = Y;
2091
2092                         b= src[6*i+3];
2093                         g= src[6*i+4];
2094                         r= src[6*i+5];
2095
2096                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2097                         ydst[2*i+1]     = Y;
2098                 }
2099                 ydst += lumStride;
2100                 src  += srcStride;
2101
2102                 for(i=0; i<chromWidth; i++)
2103                 {
2104                         unsigned int b= src[6*i+0];
2105                         unsigned int g= src[6*i+1];
2106                         unsigned int r= src[6*i+2];
2107
2108                         unsigned int Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2109
2110                         ydst[2*i]       = Y;
2111
2112                         b= src[6*i+3];
2113                         g= src[6*i+4];
2114                         r= src[6*i+5];
2115
2116                         Y  =  ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2117                         ydst[2*i+1]     = Y;
2118                 }
2119                 udst += chromStride;
2120                 vdst += chromStride;
2121                 ydst += lumStride;
2122                 src  += srcStride;
2123         }
2124 }
2125
/**
 * Interleaves two byte planes: for every row, dest[2*i] = src1[i] and
 * dest[2*i+1] = src2[i] for i in [0, width).
 *
 * @param src1        plane providing the even destination bytes
 * @param src2        plane providing the odd destination bytes
 * @param dest        destination, 2*width bytes are written per row
 * @param width       bytes per source row
 * @param height      number of rows
 * @param src1Stride  distance in bytes between two rows of src1
 * @param src2Stride  distance in bytes between two rows of src2
 * @param dstStride   distance in bytes between two rows of dest
 *
 * With HAVE_MMX the bulk of each row is handled 16 source bytes at a time
 * and the remaining width%16 bytes are handled by the C loop.
 */
void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
                            unsigned width, unsigned height, unsigned src1Stride,
                            unsigned src2Stride, unsigned dstStride){
        unsigned h;

        for(h=0; h < height; h++)
        {
                unsigned w;

#ifdef HAVE_MMX
                // The asm loops always run at least once and their bound is
                // width-15 (unsigned), so they must be skipped for rows shorter
                // than one 16-byte chunk.
                if(width >= 16)
                {
#ifdef HAVE_SSE2
                        // movdqa/movntdq fault on addresses that are not 16-byte
                        // aligned; unaligned rows take the MMX loop instead
                        if(!((((uintptr_t)src1) | ((uintptr_t)src2) | ((uintptr_t)dest)) & 15))
                        {
                                asm(
                                        "xorl %%eax, %%eax              \n\t"
                                        "1:                             \n\t"
                                        PREFETCH" 64(%1, %%eax)         \n\t"
                                        PREFETCH" 64(%2, %%eax)         \n\t"
                                        "movdqa (%1, %%eax), %%xmm0     \n\t"
                                        "movdqa (%1, %%eax), %%xmm1     \n\t"
                                        "movdqa (%2, %%eax), %%xmm2     \n\t"
                                        "punpcklbw %%xmm2, %%xmm0       \n\t"
                                        "punpckhbw %%xmm2, %%xmm1       \n\t"
                                        "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
                                        "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
                                        "addl $16, %%eax                        \n\t"
                                        "cmpl %3, %%eax                 \n\t"
                                        " jb 1b                         \n\t"
                                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
                                        : "memory", "%eax"
                                );
                        }
                        else
#endif
                        {
                                asm(
                                        "xorl %%eax, %%eax              \n\t"
                                        "1:                             \n\t"
                                        PREFETCH" 64(%1, %%eax)         \n\t"
                                        PREFETCH" 64(%2, %%eax)         \n\t"
                                        "movq (%1, %%eax), %%mm0        \n\t"
                                        "movq 8(%1, %%eax), %%mm2       \n\t"
                                        "movq %%mm0, %%mm1              \n\t"
                                        "movq %%mm2, %%mm3              \n\t"
                                        "movq (%2, %%eax), %%mm4        \n\t"
                                        "movq 8(%2, %%eax), %%mm5       \n\t"
                                        "punpcklbw %%mm4, %%mm0         \n\t"
                                        "punpckhbw %%mm4, %%mm1         \n\t"
                                        "punpcklbw %%mm5, %%mm2         \n\t"
                                        "punpckhbw %%mm5, %%mm3         \n\t"
                                        MOVNTQ" %%mm0, (%0, %%eax, 2)   \n\t"
                                        MOVNTQ" %%mm1, 8(%0, %%eax, 2)  \n\t"
                                        MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
                                        MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
                                        "addl $16, %%eax                        \n\t"
                                        "cmpl %3, %%eax                 \n\t"
                                        " jb 1b                         \n\t"
                                        ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
                                        : "memory", "%eax"
                                );
                        }
                }
                // tail: the bytes past the last full 16-byte chunk
                for(w= (width&(~15)); w < width; w++)
                {
                        dest[2*w+0] = src1[w];
                        dest[2*w+1] = src2[w];
                }
#else
                for(w=0; w < width; w++)
                {
                        dest[2*w+0] = src1[w];
                        dest[2*w+1] = src2[w];
                }
#endif
                dest += dstStride;
                src1 += src1Stride;
                src2 += src2Stride;
        }
#ifdef HAVE_MMX
        asm(
                EMMS" \n\t"
                SFENCE" \n\t"
                ::: "memory"
                );
#endif
}
2206
/*
 * Doubles one 8-bit plane in both directions by sample replication.
 *
 * For every output row y in [0, h) the source row (y >> 1) is read and each
 * of its first w bytes is written twice, so an output row is 2*w bytes wide.
 *
 *   src, srcStride  source plane and the byte distance between its rows
 *   dst, dstStride  destination plane and the byte distance between its rows
 *   w               number of source bytes consumed per row
 *   h               number of destination rows produced
 *
 * The caller is responsible for issuing EMMS/SFENCE after the MMX path.
 */
static inline void RENAME(vu9_to_vu12_plane)(const uint8_t *src, uint8_t *dst,
                        unsigned w, unsigned h,
                        unsigned srcStride, unsigned dstStride)
{
    unsigned x, y;

    for(y=0;y<h;y++){
        const uint8_t *s=src+srcStride*(y>>1);
        uint8_t *d=dst+dstStride*y;
        x=0;
#ifdef HAVE_MMX
        /*
         * One iteration consumes 32 source bytes and stores 64 destination
         * bytes, so it may only run while a complete group of 32 is left;
         * bounding the loop by x<w alone would run past the end of the row
         * whenever w is not a multiple of 32. The scalar loop below handles
         * the remainder.
         */
        for(;x+32<=w;x+=32)
        {
            /*
             * The addresses are passed in registers and the displacements are
             * written explicitly. Prefixing a displacement to an "m" operand
             * (e.g. "8%1") is only valid when the compiler happens to print
             * that operand without a displacement of its own.
             */
            asm volatile(
                PREFETCH" 32(%1)\n\t"
                "movq   (%1), %%mm0\n\t"
                "movq   8(%1), %%mm2\n\t"
                "movq   16(%1), %%mm4\n\t"
                "movq   24(%1), %%mm6\n\t"
                "movq   %%mm0, %%mm1\n\t"
                "movq   %%mm2, %%mm3\n\t"
                "movq   %%mm4, %%mm5\n\t"
                "movq   %%mm6, %%mm7\n\t"
                /* unpacking a register with itself duplicates every byte */
                "punpcklbw %%mm0, %%mm0\n\t"
                "punpckhbw %%mm1, %%mm1\n\t"
                "punpcklbw %%mm2, %%mm2\n\t"
                "punpckhbw %%mm3, %%mm3\n\t"
                "punpcklbw %%mm4, %%mm4\n\t"
                "punpckhbw %%mm5, %%mm5\n\t"
                "punpcklbw %%mm6, %%mm6\n\t"
                "punpckhbw %%mm7, %%mm7\n\t"
                MOVNTQ" %%mm0, (%0)\n\t"
                MOVNTQ" %%mm1, 8(%0)\n\t"
                MOVNTQ" %%mm2, 16(%0)\n\t"
                MOVNTQ" %%mm3, 24(%0)\n\t"
                MOVNTQ" %%mm4, 32(%0)\n\t"
                MOVNTQ" %%mm5, 40(%0)\n\t"
                MOVNTQ" %%mm6, 48(%0)\n\t"
                MOVNTQ" %%mm7, 56(%0)"
                : /* no outputs: the stores are covered by the "memory" clobber */
                :"r"(d+2*x), "r"(s+x)
                :"memory");
        }
#endif
        for(;x<w;x++) d[2*x]=d[2*x+1]=s[x];
    }
}

/*
 * Upsamples two independent 8-bit planes by a factor of two horizontally and
 * vertically (sample replication).
 *
 * Each destination plane receives height/2 rows of 2*(width/2) bytes; each
 * source plane supplies width/2 bytes per row, and source row (y >> 1) feeds
 * destination row y.
 * NOTE(review): judging by the name these are the V and U planes of a
 * YVU9 -> YV12 conversion - confirm against the callers.
 *
 *   src1, src2              source planes
 *   dst1, dst2              destination planes (dst1 from src1, dst2 from src2)
 *   width, height           halved internally to get the per-row byte count
 *                           and the number of destination rows
 *   srcStride1, srcStride2  byte distance between rows of src1 / src2
 *   dstStride1, dstStride2  byte distance between rows of dst1 / dst2
 */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                        uint8_t *dst1, uint8_t *dst2,
                        unsigned width, unsigned height,
                        unsigned srcStride1, unsigned srcStride2,
                        unsigned dstStride1, unsigned dstStride2)
{
    const unsigned w=width/2;
    const unsigned h=height/2;
#ifdef HAVE_MMX
    /* prefetch hint for the second row of each source plane */
    asm volatile(
        PREFETCH" %0\n\t"
        PREFETCH" %1\n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
#endif
    RENAME(vu9_to_vu12_plane)(src1, dst1, w, h, srcStride1, dstStride1);
    RENAME(vu9_to_vu12_plane)(src2, dst2, w, h, srcStride2, dstStride2);
#ifdef HAVE_MMX
    /* leave MMX state and order the MOVNTQ stores issued above */
    asm volatile(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
        );
#endif
}
2311
/*
 * Interleaves three 8-bit planes into packed Y U Y V byte order, where one
 * byte of src2/src3 is shared by four consecutive bytes of src1 and by four
 * consecutive rows.
 *
 * With w = width/2, every destination row y in [0, height) is built from
 * src1 row y and from row (y >> 2) of src2 and src3. For each x in [0, w)
 * eight bytes are written:
 *     Y[4x] U[x] Y[4x+1] V[x] Y[4x+2] U[x] Y[4x+3] V[x]
 * so a row reads 4*w bytes of src1, w bytes of src2 and of src3, and writes
 * 8*w bytes to dst.
 * NOTE(review): the unit of `width` is not evident from this function alone
 * (8*(width/2) bytes are written per row) - confirm against the callers.
 *
 *   src1                    plane copied to the even output bytes (Y)
 *   src2, src3              planes copied to output bytes 1,5 (U) and 3,7 (V)
 *   dst                     packed destination
 *   width, height           see above
 *   srcStride1..srcStride3  byte distance between rows of src1..src3
 *   dstStride               byte distance between rows of dst
 */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                        uint8_t *dst,
                        unsigned width, unsigned height,
                        unsigned srcStride1, unsigned srcStride2,
                        unsigned srcStride3, unsigned dstStride)
{
    unsigned y,x,w,h;
    w=width/2; h=height;
#ifdef HAVE_MMX
    /* prefetch hint for the second row of each source plane */
    asm volatile(
        PREFETCH" %0\n\t"
        PREFETCH" %1\n\t"
        PREFETCH" %2\n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)),"m"(*(src3+srcStride3)):"memory");
#endif
    for(y=0;y<h;y++){
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
#ifdef HAVE_MMX
        /*
         * One iteration consumes 8 U, 8 V and 32 Y bytes and stores 64 bytes,
         * so it may only run while 8 chroma samples are left; bounding the
         * loop by x<w alone would run past the end of the row whenever w is
         * not a multiple of 8. The scalar loop below handles the remainder.
         */
        for(;x+8<=w;x+=8)
        {
            /*
             * The addresses are passed in registers and the displacements are
             * written explicitly. Prefixing a displacement to an "m" operand
             * (e.g. "8%1") is only valid when the compiler happens to print
             * that operand without a displacement of its own.
             * %0 = dst, %1 = Y, %2 = U, %3 = V
             */
            asm volatile(
                PREFETCH" 32(%1)\n\t"
                PREFETCH" 32(%2)\n\t"
                PREFETCH" 32(%3)\n\t"
                "movq   (%1), %%mm0\n\t"     /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq   (%2), %%mm1\n\t"     /* U0U1U2U3U4U5U6U7 */
                "movq   (%3), %%mm2\n\t"     /* V0V1V2V3V4V5V6V7 */
                "movq   %%mm0, %%mm3\n\t"    /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq   %%mm1, %%mm4\n\t"    /* U0U1U2U3U4U5U6U7 */
                "movq   %%mm2, %%mm5\n\t"    /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq   %%mm1, %%mm6\n\t"
                "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1 */
                "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0 */
                "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1 */
                MOVNTQ" %%mm0, (%0)\n\t"
                MOVNTQ" %%mm3, 8(%0)\n\t"

                "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3 */
                "movq   8(%1), %%mm0\n\t"    /* Y8 .. Y15 */
                "movq   %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2 */
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3 */
                MOVNTQ" %%mm0, 16(%0)\n\t"
                MOVNTQ" %%mm3, 24(%0)\n\t"

                "movq   %%mm4, %%mm6\n\t"
                "movq   16(%1), %%mm0\n\t"   /* Y16 .. Y23 */
                "movq   %%mm0, %%mm3\n\t"
                "punpcklbw %%mm5, %%mm4\n\t" /* U4V4 U4V4 U5V5 U5V5 */
                "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4 */
                "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5 */
                MOVNTQ" %%mm0, 32(%0)\n\t"
                MOVNTQ" %%mm3, 40(%0)\n\t"

                "punpckhbw %%mm5, %%mm6\n\t" /* U6V6 U6V6 U7V7 U7V7 */
                "movq   24(%1), %%mm0\n\t"   /* Y24 .. Y31 */
                "movq   %%mm0, %%mm3\n\t"
                "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6 */
                "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7 */
                MOVNTQ" %%mm0, 48(%0)\n\t"
                MOVNTQ" %%mm3, 56(%0)\n\t"

                : /* no outputs: the stores are covered by the "memory" clobber */
                :"r"(d+8*x),"r"(yp+4*x),"r"(up+x),"r"(vp+x)
                :"memory");
        }
#endif
        for(;x<w;x++)
        {
            /* four luma bytes share the chroma pair at index x */
            const unsigned x2=4*x;
            d[8*x+0]=yp[x2];
            d[8*x+1]=up[x];
            d[8*x+2]=yp[x2+1];
            d[8*x+3]=vp[x];
            d[8*x+4]=yp[x2+2];
            d[8*x+5]=up[x];
            d[8*x+6]=yp[x2+3];
            d[8*x+7]=vp[x];
        }
    }
#ifdef HAVE_MMX
    /* leave MMX state and order the MOVNTQ stores issued above */
    asm volatile(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
        );
#endif
}