3 * rgb2rgb.c, Software RGB to RGB convertor
4 * pluralize by Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
12 #include <inttypes.h> /* for __WORDSIZE */
15 // #warning You have misconfigured system and probably will lose performance!
16 #define __WORDSIZE MP_WORDSIZE
34 #define PREFETCH "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB "pavgusb"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
47 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
61 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
64 const uint8_t *s = src;
67 const uint8_t *mm_end;
71 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
73 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
79 "punpckldq 3%1, %%mm0\n\t"
81 "punpckldq 9%1, %%mm1\n\t"
82 "movd 12%1, %%mm2\n\t"
83 "punpckldq 15%1, %%mm2\n\t"
84 "movd 18%1, %%mm3\n\t"
85 "punpckldq 21%1, %%mm3\n\t"
86 "pand %%mm7, %%mm0\n\t"
87 "pand %%mm7, %%mm1\n\t"
88 "pand %%mm7, %%mm2\n\t"
89 "pand %%mm7, %%mm3\n\t"
90 MOVNTQ" %%mm0, %0\n\t"
91 MOVNTQ" %%mm1, 8%0\n\t"
92 MOVNTQ" %%mm2, 16%0\n\t"
100 __asm __volatile(SFENCE:::"memory");
101 __asm __volatile(EMMS:::"memory");
112 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
115 const uint8_t *s = src;
118 const uint8_t *mm_end;
122 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
129 "movq 8%1, %%mm1\n\t"
130 "movq 16%1, %%mm4\n\t"
131 "movq 24%1, %%mm5\n\t"
132 "movq %%mm0, %%mm2\n\t"
133 "movq %%mm1, %%mm3\n\t"
134 "movq %%mm4, %%mm6\n\t"
135 "movq %%mm5, %%mm7\n\t"
136 "psrlq $8, %%mm2\n\t"
137 "psrlq $8, %%mm3\n\t"
138 "psrlq $8, %%mm6\n\t"
139 "psrlq $8, %%mm7\n\t"
148 "por %%mm2, %%mm0\n\t"
149 "por %%mm3, %%mm1\n\t"
150 "por %%mm6, %%mm4\n\t"
151 "por %%mm7, %%mm5\n\t"
153 "movq %%mm1, %%mm2\n\t"
154 "movq %%mm4, %%mm3\n\t"
155 "psllq $48, %%mm2\n\t"
156 "psllq $32, %%mm3\n\t"
159 "por %%mm2, %%mm0\n\t"
160 "psrlq $16, %%mm1\n\t"
161 "psrlq $32, %%mm4\n\t"
162 "psllq $16, %%mm5\n\t"
163 "por %%mm3, %%mm1\n\t"
165 "por %%mm5, %%mm4\n\t"
167 MOVNTQ" %%mm0, %0\n\t"
168 MOVNTQ" %%mm1, 8%0\n\t"
171 :"m"(*s),"m"(mask24l),
172 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
177 __asm __volatile(SFENCE:::"memory");
178 __asm __volatile(EMMS:::"memory");
190 Original by Strepto/Astral
191 ported to gcc & bugfixed : A'rpi
192 MMX2, 3DNOW optimization by Nick Kurshev
193 32bit c version, and and&add trick by Michael Niedermayer
195 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
197 register const uint8_t* s=src;
198 register uint8_t* d=dst;
199 register const uint8_t *end;
200 const uint8_t *mm_end;
203 __asm __volatile(PREFETCH" %0"::"m"(*s));
204 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
211 "movq 8%1, %%mm2\n\t"
212 "movq %%mm0, %%mm1\n\t"
213 "movq %%mm2, %%mm3\n\t"
214 "pand %%mm4, %%mm0\n\t"
215 "pand %%mm4, %%mm2\n\t"
216 "paddw %%mm1, %%mm0\n\t"
217 "paddw %%mm3, %%mm2\n\t"
218 MOVNTQ" %%mm0, %0\n\t"
226 __asm __volatile(SFENCE:::"memory");
227 __asm __volatile(EMMS:::"memory");
232 register unsigned x= *((uint32_t *)s);
233 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
239 register unsigned short x= *((uint16_t *)s);
240 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
244 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
246 register const uint8_t* s=src;
247 register uint8_t* d=dst;
248 register const uint8_t *end;
249 const uint8_t *mm_end;
252 __asm __volatile(PREFETCH" %0"::"m"(*s));
253 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
254 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
261 "movq 8%1, %%mm2\n\t"
262 "movq %%mm0, %%mm1\n\t"
263 "movq %%mm2, %%mm3\n\t"
264 "psrlq $1, %%mm0\n\t"
265 "psrlq $1, %%mm2\n\t"
266 "pand %%mm7, %%mm0\n\t"
267 "pand %%mm7, %%mm2\n\t"
268 "pand %%mm6, %%mm1\n\t"
269 "pand %%mm6, %%mm3\n\t"
270 "por %%mm1, %%mm0\n\t"
271 "por %%mm3, %%mm2\n\t"
272 MOVNTQ" %%mm0, %0\n\t"
280 __asm __volatile(SFENCE:::"memory");
281 __asm __volatile(EMMS:::"memory");
286 register uint32_t x= *((uint32_t *)s);
287 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
293 register uint16_t x= *((uint16_t *)s);
294 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
300 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
302 const uint8_t *s = src;
305 const uint8_t *mm_end;
307 uint16_t *d = (uint16_t *)dst;
311 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
313 "movq %3, %%mm5 \n\t"
314 "movq %4, %%mm6 \n\t"
315 "movq %5, %%mm7 \n\t"
318 PREFETCH" 32(%1) \n\t"
319 "movd (%1), %%mm0 \n\t"
320 "movd 4(%1), %%mm3 \n\t"
321 "punpckldq 8(%1), %%mm0 \n\t"
322 "punpckldq 12(%1), %%mm3 \n\t"
323 "movq %%mm0, %%mm1 \n\t"
324 "movq %%mm3, %%mm4 \n\t"
325 "pand %%mm6, %%mm0 \n\t"
326 "pand %%mm6, %%mm3 \n\t"
327 "pmaddwd %%mm7, %%mm0 \n\t"
328 "pmaddwd %%mm7, %%mm3 \n\t"
329 "pand %%mm5, %%mm1 \n\t"
330 "pand %%mm5, %%mm4 \n\t"
331 "por %%mm1, %%mm0 \n\t"
332 "por %%mm4, %%mm3 \n\t"
333 "psrld $5, %%mm0 \n\t"
334 "pslld $11, %%mm3 \n\t"
335 "por %%mm3, %%mm0 \n\t"
336 MOVNTQ" %%mm0, (%0) \n\t"
342 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
345 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
349 ::"m"(red_16mask),"m"(green_16mask));
355 "movd 4%1, %%mm3\n\t"
356 "punpckldq 8%1, %%mm0\n\t"
357 "punpckldq 12%1, %%mm3\n\t"
358 "movq %%mm0, %%mm1\n\t"
359 "movq %%mm0, %%mm2\n\t"
360 "movq %%mm3, %%mm4\n\t"
361 "movq %%mm3, %%mm5\n\t"
362 "psrlq $3, %%mm0\n\t"
363 "psrlq $3, %%mm3\n\t"
366 "psrlq $5, %%mm1\n\t"
367 "psrlq $5, %%mm4\n\t"
368 "pand %%mm6, %%mm1\n\t"
369 "pand %%mm6, %%mm4\n\t"
370 "psrlq $8, %%mm2\n\t"
371 "psrlq $8, %%mm5\n\t"
372 "pand %%mm7, %%mm2\n\t"
373 "pand %%mm7, %%mm5\n\t"
374 "por %%mm1, %%mm0\n\t"
375 "por %%mm4, %%mm3\n\t"
376 "por %%mm2, %%mm0\n\t"
377 "por %%mm5, %%mm3\n\t"
378 "psllq $16, %%mm3\n\t"
379 "por %%mm3, %%mm0\n\t"
380 MOVNTQ" %%mm0, %0\n\t"
381 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
386 __asm __volatile(SFENCE:::"memory");
387 __asm __volatile(EMMS:::"memory");
391 const int src= *((uint32_t*)s)++;
392 *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
393 // *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
397 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
399 const uint8_t *s = src;
402 const uint8_t *mm_end;
404 uint16_t *d = (uint16_t *)dst;
407 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
411 ::"m"(red_16mask),"m"(green_16mask));
418 "movd 4%1, %%mm3\n\t"
419 "punpckldq 8%1, %%mm0\n\t"
420 "punpckldq 12%1, %%mm3\n\t"
421 "movq %%mm0, %%mm1\n\t"
422 "movq %%mm0, %%mm2\n\t"
423 "movq %%mm3, %%mm4\n\t"
424 "movq %%mm3, %%mm5\n\t"
425 "psllq $8, %%mm0\n\t"
426 "psllq $8, %%mm3\n\t"
427 "pand %%mm7, %%mm0\n\t"
428 "pand %%mm7, %%mm3\n\t"
429 "psrlq $5, %%mm1\n\t"
430 "psrlq $5, %%mm4\n\t"
431 "pand %%mm6, %%mm1\n\t"
432 "pand %%mm6, %%mm4\n\t"
433 "psrlq $19, %%mm2\n\t"
434 "psrlq $19, %%mm5\n\t"
437 "por %%mm1, %%mm0\n\t"
438 "por %%mm4, %%mm3\n\t"
439 "por %%mm2, %%mm0\n\t"
440 "por %%mm5, %%mm3\n\t"
441 "psllq $16, %%mm3\n\t"
442 "por %%mm3, %%mm0\n\t"
443 MOVNTQ" %%mm0, %0\n\t"
444 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
448 __asm __volatile(SFENCE:::"memory");
449 __asm __volatile(EMMS:::"memory");
453 const int src= *((uint32_t*)s)++;
454 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
458 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
460 const uint8_t *s = src;
463 const uint8_t *mm_end;
465 uint16_t *d = (uint16_t *)dst;
469 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
471 "movq %3, %%mm5 \n\t"
472 "movq %4, %%mm6 \n\t"
473 "movq %5, %%mm7 \n\t"
476 PREFETCH" 32(%1) \n\t"
477 "movd (%1), %%mm0 \n\t"
478 "movd 4(%1), %%mm3 \n\t"
479 "punpckldq 8(%1), %%mm0 \n\t"
480 "punpckldq 12(%1), %%mm3 \n\t"
481 "movq %%mm0, %%mm1 \n\t"
482 "movq %%mm3, %%mm4 \n\t"
483 "pand %%mm6, %%mm0 \n\t"
484 "pand %%mm6, %%mm3 \n\t"
485 "pmaddwd %%mm7, %%mm0 \n\t"
486 "pmaddwd %%mm7, %%mm3 \n\t"
487 "pand %%mm5, %%mm1 \n\t"
488 "pand %%mm5, %%mm4 \n\t"
489 "por %%mm1, %%mm0 \n\t"
490 "por %%mm4, %%mm3 \n\t"
491 "psrld $6, %%mm0 \n\t"
492 "pslld $10, %%mm3 \n\t"
493 "por %%mm3, %%mm0 \n\t"
494 MOVNTQ" %%mm0, (%0) \n\t"
500 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
503 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
507 ::"m"(red_15mask),"m"(green_15mask));
513 "movd 4%1, %%mm3\n\t"
514 "punpckldq 8%1, %%mm0\n\t"
515 "punpckldq 12%1, %%mm3\n\t"
516 "movq %%mm0, %%mm1\n\t"
517 "movq %%mm0, %%mm2\n\t"
518 "movq %%mm3, %%mm4\n\t"
519 "movq %%mm3, %%mm5\n\t"
520 "psrlq $3, %%mm0\n\t"
521 "psrlq $3, %%mm3\n\t"
524 "psrlq $6, %%mm1\n\t"
525 "psrlq $6, %%mm4\n\t"
526 "pand %%mm6, %%mm1\n\t"
527 "pand %%mm6, %%mm4\n\t"
528 "psrlq $9, %%mm2\n\t"
529 "psrlq $9, %%mm5\n\t"
530 "pand %%mm7, %%mm2\n\t"
531 "pand %%mm7, %%mm5\n\t"
532 "por %%mm1, %%mm0\n\t"
533 "por %%mm4, %%mm3\n\t"
534 "por %%mm2, %%mm0\n\t"
535 "por %%mm5, %%mm3\n\t"
536 "psllq $16, %%mm3\n\t"
537 "por %%mm3, %%mm0\n\t"
538 MOVNTQ" %%mm0, %0\n\t"
539 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
544 __asm __volatile(SFENCE:::"memory");
545 __asm __volatile(EMMS:::"memory");
549 const int src= *((uint32_t*)s)++;
550 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
554 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
556 const uint8_t *s = src;
559 const uint8_t *mm_end;
561 uint16_t *d = (uint16_t *)dst;
564 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
568 ::"m"(red_15mask),"m"(green_15mask));
575 "movd 4%1, %%mm3\n\t"
576 "punpckldq 8%1, %%mm0\n\t"
577 "punpckldq 12%1, %%mm3\n\t"
578 "movq %%mm0, %%mm1\n\t"
579 "movq %%mm0, %%mm2\n\t"
580 "movq %%mm3, %%mm4\n\t"
581 "movq %%mm3, %%mm5\n\t"
582 "psllq $7, %%mm0\n\t"
583 "psllq $7, %%mm3\n\t"
584 "pand %%mm7, %%mm0\n\t"
585 "pand %%mm7, %%mm3\n\t"
586 "psrlq $6, %%mm1\n\t"
587 "psrlq $6, %%mm4\n\t"
588 "pand %%mm6, %%mm1\n\t"
589 "pand %%mm6, %%mm4\n\t"
590 "psrlq $19, %%mm2\n\t"
591 "psrlq $19, %%mm5\n\t"
594 "por %%mm1, %%mm0\n\t"
595 "por %%mm4, %%mm3\n\t"
596 "por %%mm2, %%mm0\n\t"
597 "por %%mm5, %%mm3\n\t"
598 "psllq $16, %%mm3\n\t"
599 "por %%mm3, %%mm0\n\t"
600 MOVNTQ" %%mm0, %0\n\t"
601 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
605 __asm __volatile(SFENCE:::"memory");
606 __asm __volatile(EMMS:::"memory");
610 const int src= *((uint32_t*)s)++;
611 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
615 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
617 const uint8_t *s = src;
620 const uint8_t *mm_end;
622 uint16_t *d = (uint16_t *)dst;
625 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
629 ::"m"(red_16mask),"m"(green_16mask));
636 "movd 3%1, %%mm3\n\t"
637 "punpckldq 6%1, %%mm0\n\t"
638 "punpckldq 9%1, %%mm3\n\t"
639 "movq %%mm0, %%mm1\n\t"
640 "movq %%mm0, %%mm2\n\t"
641 "movq %%mm3, %%mm4\n\t"
642 "movq %%mm3, %%mm5\n\t"
643 "psrlq $3, %%mm0\n\t"
644 "psrlq $3, %%mm3\n\t"
647 "psrlq $5, %%mm1\n\t"
648 "psrlq $5, %%mm4\n\t"
649 "pand %%mm6, %%mm1\n\t"
650 "pand %%mm6, %%mm4\n\t"
651 "psrlq $8, %%mm2\n\t"
652 "psrlq $8, %%mm5\n\t"
653 "pand %%mm7, %%mm2\n\t"
654 "pand %%mm7, %%mm5\n\t"
655 "por %%mm1, %%mm0\n\t"
656 "por %%mm4, %%mm3\n\t"
657 "por %%mm2, %%mm0\n\t"
658 "por %%mm5, %%mm3\n\t"
659 "psllq $16, %%mm3\n\t"
660 "por %%mm3, %%mm0\n\t"
661 MOVNTQ" %%mm0, %0\n\t"
662 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
666 __asm __volatile(SFENCE:::"memory");
667 __asm __volatile(EMMS:::"memory");
674 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
678 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
680 const uint8_t *s = src;
683 const uint8_t *mm_end;
685 uint16_t *d = (uint16_t *)dst;
688 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
692 ::"m"(red_16mask),"m"(green_16mask));
699 "movd 3%1, %%mm3\n\t"
700 "punpckldq 6%1, %%mm0\n\t"
701 "punpckldq 9%1, %%mm3\n\t"
702 "movq %%mm0, %%mm1\n\t"
703 "movq %%mm0, %%mm2\n\t"
704 "movq %%mm3, %%mm4\n\t"
705 "movq %%mm3, %%mm5\n\t"
706 "psllq $8, %%mm0\n\t"
707 "psllq $8, %%mm3\n\t"
708 "pand %%mm7, %%mm0\n\t"
709 "pand %%mm7, %%mm3\n\t"
710 "psrlq $5, %%mm1\n\t"
711 "psrlq $5, %%mm4\n\t"
712 "pand %%mm6, %%mm1\n\t"
713 "pand %%mm6, %%mm4\n\t"
714 "psrlq $19, %%mm2\n\t"
715 "psrlq $19, %%mm5\n\t"
718 "por %%mm1, %%mm0\n\t"
719 "por %%mm4, %%mm3\n\t"
720 "por %%mm2, %%mm0\n\t"
721 "por %%mm5, %%mm3\n\t"
722 "psllq $16, %%mm3\n\t"
723 "por %%mm3, %%mm0\n\t"
724 MOVNTQ" %%mm0, %0\n\t"
725 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
729 __asm __volatile(SFENCE:::"memory");
730 __asm __volatile(EMMS:::"memory");
737 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
741 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
743 const uint8_t *s = src;
746 const uint8_t *mm_end;
748 uint16_t *d = (uint16_t *)dst;
751 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
755 ::"m"(red_15mask),"m"(green_15mask));
762 "movd 3%1, %%mm3\n\t"
763 "punpckldq 6%1, %%mm0\n\t"
764 "punpckldq 9%1, %%mm3\n\t"
765 "movq %%mm0, %%mm1\n\t"
766 "movq %%mm0, %%mm2\n\t"
767 "movq %%mm3, %%mm4\n\t"
768 "movq %%mm3, %%mm5\n\t"
769 "psrlq $3, %%mm0\n\t"
770 "psrlq $3, %%mm3\n\t"
773 "psrlq $6, %%mm1\n\t"
774 "psrlq $6, %%mm4\n\t"
775 "pand %%mm6, %%mm1\n\t"
776 "pand %%mm6, %%mm4\n\t"
777 "psrlq $9, %%mm2\n\t"
778 "psrlq $9, %%mm5\n\t"
779 "pand %%mm7, %%mm2\n\t"
780 "pand %%mm7, %%mm5\n\t"
781 "por %%mm1, %%mm0\n\t"
782 "por %%mm4, %%mm3\n\t"
783 "por %%mm2, %%mm0\n\t"
784 "por %%mm5, %%mm3\n\t"
785 "psllq $16, %%mm3\n\t"
786 "por %%mm3, %%mm0\n\t"
787 MOVNTQ" %%mm0, %0\n\t"
788 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
792 __asm __volatile(SFENCE:::"memory");
793 __asm __volatile(EMMS:::"memory");
800 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
804 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
806 const uint8_t *s = src;
809 const uint8_t *mm_end;
811 uint16_t *d = (uint16_t *)dst;
814 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
818 ::"m"(red_15mask),"m"(green_15mask));
825 "movd 3%1, %%mm3\n\t"
826 "punpckldq 6%1, %%mm0\n\t"
827 "punpckldq 9%1, %%mm3\n\t"
828 "movq %%mm0, %%mm1\n\t"
829 "movq %%mm0, %%mm2\n\t"
830 "movq %%mm3, %%mm4\n\t"
831 "movq %%mm3, %%mm5\n\t"
832 "psllq $7, %%mm0\n\t"
833 "psllq $7, %%mm3\n\t"
834 "pand %%mm7, %%mm0\n\t"
835 "pand %%mm7, %%mm3\n\t"
836 "psrlq $6, %%mm1\n\t"
837 "psrlq $6, %%mm4\n\t"
838 "pand %%mm6, %%mm1\n\t"
839 "pand %%mm6, %%mm4\n\t"
840 "psrlq $19, %%mm2\n\t"
841 "psrlq $19, %%mm5\n\t"
844 "por %%mm1, %%mm0\n\t"
845 "por %%mm4, %%mm3\n\t"
846 "por %%mm2, %%mm0\n\t"
847 "por %%mm5, %%mm3\n\t"
848 "psllq $16, %%mm3\n\t"
849 "por %%mm3, %%mm0\n\t"
850 MOVNTQ" %%mm0, %0\n\t"
851 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
855 __asm __volatile(SFENCE:::"memory");
856 __asm __volatile(EMMS:::"memory");
863 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
868 I use here less accurate approximation by simply
869 left-shifting the input
870 value and filling the low order bits with
871 zeroes. This method improves png's
872 compression but this scheme cannot reproduce white exactly, since it does not
873 generate an all-ones maximum value; the net effect is to darken the
876 The better method should be "left bit replication":
886 | Leftmost Bits Repeated to Fill Open Bits
890 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
894 const uint16_t *mm_end;
896 uint8_t *d = (uint8_t *)dst;
897 const uint16_t *s = (uint16_t *)src;
898 end = s + src_size/2;
900 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
912 "psllq $3, %%mm0\n\t"
913 "psrlq $2, %%mm1\n\t"
914 "psrlq $7, %%mm2\n\t"
915 "movq %%mm0, %%mm3\n\t"
916 "movq %%mm1, %%mm4\n\t"
917 "movq %%mm2, %%mm5\n\t"
918 "punpcklwd %5, %%mm0\n\t"
919 "punpcklwd %5, %%mm1\n\t"
920 "punpcklwd %5, %%mm2\n\t"
921 "punpckhwd %5, %%mm3\n\t"
922 "punpckhwd %5, %%mm4\n\t"
923 "punpckhwd %5, %%mm5\n\t"
924 "psllq $8, %%mm1\n\t"
925 "psllq $16, %%mm2\n\t"
926 "por %%mm1, %%mm0\n\t"
927 "por %%mm2, %%mm0\n\t"
928 "psllq $8, %%mm4\n\t"
929 "psllq $16, %%mm5\n\t"
930 "por %%mm4, %%mm3\n\t"
931 "por %%mm5, %%mm3\n\t"
933 "movq %%mm0, %%mm6\n\t"
934 "movq %%mm3, %%mm7\n\t"
936 "movq 8%1, %%mm0\n\t"
937 "movq 8%1, %%mm1\n\t"
938 "movq 8%1, %%mm2\n\t"
942 "psllq $3, %%mm0\n\t"
943 "psrlq $2, %%mm1\n\t"
944 "psrlq $7, %%mm2\n\t"
945 "movq %%mm0, %%mm3\n\t"
946 "movq %%mm1, %%mm4\n\t"
947 "movq %%mm2, %%mm5\n\t"
948 "punpcklwd %5, %%mm0\n\t"
949 "punpcklwd %5, %%mm1\n\t"
950 "punpcklwd %5, %%mm2\n\t"
951 "punpckhwd %5, %%mm3\n\t"
952 "punpckhwd %5, %%mm4\n\t"
953 "punpckhwd %5, %%mm5\n\t"
954 "psllq $8, %%mm1\n\t"
955 "psllq $16, %%mm2\n\t"
956 "por %%mm1, %%mm0\n\t"
957 "por %%mm2, %%mm0\n\t"
958 "psllq $8, %%mm4\n\t"
959 "psllq $16, %%mm5\n\t"
960 "por %%mm4, %%mm3\n\t"
961 "por %%mm5, %%mm3\n\t"
964 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
966 /* Borrowed 32 to 24 */
968 "movq %%mm0, %%mm4\n\t"
969 "movq %%mm3, %%mm5\n\t"
970 "movq %%mm6, %%mm0\n\t"
971 "movq %%mm7, %%mm1\n\t"
973 "movq %%mm4, %%mm6\n\t"
974 "movq %%mm5, %%mm7\n\t"
975 "movq %%mm0, %%mm2\n\t"
976 "movq %%mm1, %%mm3\n\t"
978 "psrlq $8, %%mm2\n\t"
979 "psrlq $8, %%mm3\n\t"
980 "psrlq $8, %%mm6\n\t"
981 "psrlq $8, %%mm7\n\t"
990 "por %%mm2, %%mm0\n\t"
991 "por %%mm3, %%mm1\n\t"
992 "por %%mm6, %%mm4\n\t"
993 "por %%mm7, %%mm5\n\t"
995 "movq %%mm1, %%mm2\n\t"
996 "movq %%mm4, %%mm3\n\t"
997 "psllq $48, %%mm2\n\t"
998 "psllq $32, %%mm3\n\t"
1000 "pand %5, %%mm3\n\t"
1001 "por %%mm2, %%mm0\n\t"
1002 "psrlq $16, %%mm1\n\t"
1003 "psrlq $32, %%mm4\n\t"
1004 "psllq $16, %%mm5\n\t"
1005 "por %%mm3, %%mm1\n\t"
1006 "pand %6, %%mm5\n\t"
1007 "por %%mm5, %%mm4\n\t"
1009 MOVNTQ" %%mm0, %0\n\t"
1010 MOVNTQ" %%mm1, 8%0\n\t"
1011 MOVNTQ" %%mm4, 16%0"
1014 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1019 __asm __volatile(SFENCE:::"memory");
1020 __asm __volatile(EMMS:::"memory");
1024 register uint16_t bgr;
1026 *d++ = (bgr&0x1F)<<3;
1027 *d++ = (bgr&0x3E0)>>2;
1028 *d++ = (bgr&0x7C00)>>7;
1032 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1034 const uint16_t *end;
1036 const uint16_t *mm_end;
1038 uint8_t *d = (uint8_t *)dst;
1039 const uint16_t *s = (const uint16_t *)src;
1040 end = s + src_size/2;
1042 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1048 "movq %1, %%mm0\n\t"
1049 "movq %1, %%mm1\n\t"
1050 "movq %1, %%mm2\n\t"
1051 "pand %2, %%mm0\n\t"
1052 "pand %3, %%mm1\n\t"
1053 "pand %4, %%mm2\n\t"
1054 "psllq $3, %%mm0\n\t"
1055 "psrlq $3, %%mm1\n\t"
1056 "psrlq $8, %%mm2\n\t"
1057 "movq %%mm0, %%mm3\n\t"
1058 "movq %%mm1, %%mm4\n\t"
1059 "movq %%mm2, %%mm5\n\t"
1060 "punpcklwd %5, %%mm0\n\t"
1061 "punpcklwd %5, %%mm1\n\t"
1062 "punpcklwd %5, %%mm2\n\t"
1063 "punpckhwd %5, %%mm3\n\t"
1064 "punpckhwd %5, %%mm4\n\t"
1065 "punpckhwd %5, %%mm5\n\t"
1066 "psllq $8, %%mm1\n\t"
1067 "psllq $16, %%mm2\n\t"
1068 "por %%mm1, %%mm0\n\t"
1069 "por %%mm2, %%mm0\n\t"
1070 "psllq $8, %%mm4\n\t"
1071 "psllq $16, %%mm5\n\t"
1072 "por %%mm4, %%mm3\n\t"
1073 "por %%mm5, %%mm3\n\t"
1075 "movq %%mm0, %%mm6\n\t"
1076 "movq %%mm3, %%mm7\n\t"
1078 "movq 8%1, %%mm0\n\t"
1079 "movq 8%1, %%mm1\n\t"
1080 "movq 8%1, %%mm2\n\t"
1081 "pand %2, %%mm0\n\t"
1082 "pand %3, %%mm1\n\t"
1083 "pand %4, %%mm2\n\t"
1084 "psllq $3, %%mm0\n\t"
1085 "psrlq $3, %%mm1\n\t"
1086 "psrlq $8, %%mm2\n\t"
1087 "movq %%mm0, %%mm3\n\t"
1088 "movq %%mm1, %%mm4\n\t"
1089 "movq %%mm2, %%mm5\n\t"
1090 "punpcklwd %5, %%mm0\n\t"
1091 "punpcklwd %5, %%mm1\n\t"
1092 "punpcklwd %5, %%mm2\n\t"
1093 "punpckhwd %5, %%mm3\n\t"
1094 "punpckhwd %5, %%mm4\n\t"
1095 "punpckhwd %5, %%mm5\n\t"
1096 "psllq $8, %%mm1\n\t"
1097 "psllq $16, %%mm2\n\t"
1098 "por %%mm1, %%mm0\n\t"
1099 "por %%mm2, %%mm0\n\t"
1100 "psllq $8, %%mm4\n\t"
1101 "psllq $16, %%mm5\n\t"
1102 "por %%mm4, %%mm3\n\t"
1103 "por %%mm5, %%mm3\n\t"
1105 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1107 /* Borrowed 32 to 24 */
1109 "movq %%mm0, %%mm4\n\t"
1110 "movq %%mm3, %%mm5\n\t"
1111 "movq %%mm6, %%mm0\n\t"
1112 "movq %%mm7, %%mm1\n\t"
1114 "movq %%mm4, %%mm6\n\t"
1115 "movq %%mm5, %%mm7\n\t"
1116 "movq %%mm0, %%mm2\n\t"
1117 "movq %%mm1, %%mm3\n\t"
1119 "psrlq $8, %%mm2\n\t"
1120 "psrlq $8, %%mm3\n\t"
1121 "psrlq $8, %%mm6\n\t"
1122 "psrlq $8, %%mm7\n\t"
1123 "pand %2, %%mm0\n\t"
1124 "pand %2, %%mm1\n\t"
1125 "pand %2, %%mm4\n\t"
1126 "pand %2, %%mm5\n\t"
1127 "pand %3, %%mm2\n\t"
1128 "pand %3, %%mm3\n\t"
1129 "pand %3, %%mm6\n\t"
1130 "pand %3, %%mm7\n\t"
1131 "por %%mm2, %%mm0\n\t"
1132 "por %%mm3, %%mm1\n\t"
1133 "por %%mm6, %%mm4\n\t"
1134 "por %%mm7, %%mm5\n\t"
1136 "movq %%mm1, %%mm2\n\t"
1137 "movq %%mm4, %%mm3\n\t"
1138 "psllq $48, %%mm2\n\t"
1139 "psllq $32, %%mm3\n\t"
1140 "pand %4, %%mm2\n\t"
1141 "pand %5, %%mm3\n\t"
1142 "por %%mm2, %%mm0\n\t"
1143 "psrlq $16, %%mm1\n\t"
1144 "psrlq $32, %%mm4\n\t"
1145 "psllq $16, %%mm5\n\t"
1146 "por %%mm3, %%mm1\n\t"
1147 "pand %6, %%mm5\n\t"
1148 "por %%mm5, %%mm4\n\t"
1150 MOVNTQ" %%mm0, %0\n\t"
1151 MOVNTQ" %%mm1, 8%0\n\t"
1152 MOVNTQ" %%mm4, 16%0"
1155 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1160 __asm __volatile(SFENCE:::"memory");
1161 __asm __volatile(EMMS:::"memory");
1165 register uint16_t bgr;
1167 *d++ = (bgr&0x1F)<<3;
1168 *d++ = (bgr&0x7E0)>>3;
1169 *d++ = (bgr&0xF800)>>8;
1173 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1175 const uint16_t *end;
1177 const uint16_t *mm_end;
1179 uint8_t *d = (uint8_t *)dst;
1180 const uint16_t *s = (const uint16_t *)src;
1181 end = s + src_size/2;
1183 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1184 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1190 "movq %1, %%mm0\n\t"
1191 "movq %1, %%mm1\n\t"
1192 "movq %1, %%mm2\n\t"
1193 "pand %2, %%mm0\n\t"
1194 "pand %3, %%mm1\n\t"
1195 "pand %4, %%mm2\n\t"
1196 "psllq $3, %%mm0\n\t"
1197 "psrlq $2, %%mm1\n\t"
1198 "psrlq $7, %%mm2\n\t"
1199 "movq %%mm0, %%mm3\n\t"
1200 "movq %%mm1, %%mm4\n\t"
1201 "movq %%mm2, %%mm5\n\t"
1202 "punpcklwd %%mm7, %%mm0\n\t"
1203 "punpcklwd %%mm7, %%mm1\n\t"
1204 "punpcklwd %%mm7, %%mm2\n\t"
1205 "punpckhwd %%mm7, %%mm3\n\t"
1206 "punpckhwd %%mm7, %%mm4\n\t"
1207 "punpckhwd %%mm7, %%mm5\n\t"
1208 "psllq $8, %%mm1\n\t"
1209 "psllq $16, %%mm2\n\t"
1210 "por %%mm1, %%mm0\n\t"
1211 "por %%mm2, %%mm0\n\t"
1212 "psllq $8, %%mm4\n\t"
1213 "psllq $16, %%mm5\n\t"
1214 "por %%mm4, %%mm3\n\t"
1215 "por %%mm5, %%mm3\n\t"
1216 MOVNTQ" %%mm0, %0\n\t"
1217 MOVNTQ" %%mm3, 8%0\n\t"
1219 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1224 __asm __volatile(SFENCE:::"memory");
1225 __asm __volatile(EMMS:::"memory");
1229 #if 0 //slightly slower on athlon
1231 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1233 //FIXME this is very likely wrong for bigendian (and the following converters too)
1234 register uint16_t bgr;
1236 *d++ = (bgr&0x1F)<<3;
1237 *d++ = (bgr&0x3E0)>>2;
1238 *d++ = (bgr&0x7C00)>>7;
1244 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1246 const uint16_t *end;
1248 const uint16_t *mm_end;
1250 uint8_t *d = (uint8_t *)dst;
1251 const uint16_t *s = (uint16_t *)src;
1252 end = s + src_size/2;
1254 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1255 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1261 "movq %1, %%mm0\n\t"
1262 "movq %1, %%mm1\n\t"
1263 "movq %1, %%mm2\n\t"
1264 "pand %2, %%mm0\n\t"
1265 "pand %3, %%mm1\n\t"
1266 "pand %4, %%mm2\n\t"
1267 "psllq $3, %%mm0\n\t"
1268 "psrlq $3, %%mm1\n\t"
1269 "psrlq $8, %%mm2\n\t"
1270 "movq %%mm0, %%mm3\n\t"
1271 "movq %%mm1, %%mm4\n\t"
1272 "movq %%mm2, %%mm5\n\t"
1273 "punpcklwd %%mm7, %%mm0\n\t"
1274 "punpcklwd %%mm7, %%mm1\n\t"
1275 "punpcklwd %%mm7, %%mm2\n\t"
1276 "punpckhwd %%mm7, %%mm3\n\t"
1277 "punpckhwd %%mm7, %%mm4\n\t"
1278 "punpckhwd %%mm7, %%mm5\n\t"
1279 "psllq $8, %%mm1\n\t"
1280 "psllq $16, %%mm2\n\t"
1281 "por %%mm1, %%mm0\n\t"
1282 "por %%mm2, %%mm0\n\t"
1283 "psllq $8, %%mm4\n\t"
1284 "psllq $16, %%mm5\n\t"
1285 "por %%mm4, %%mm3\n\t"
1286 "por %%mm5, %%mm3\n\t"
1287 MOVNTQ" %%mm0, %0\n\t"
1288 MOVNTQ" %%mm3, 8%0\n\t"
1290 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1295 __asm __volatile(SFENCE:::"memory");
1296 __asm __volatile(EMMS:::"memory");
1300 register uint16_t bgr;
1302 *d++ = (bgr&0x1F)<<3;
1303 *d++ = (bgr&0x7E0)>>3;
1304 *d++ = (bgr&0xF800)>>8;
1309 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1312 /* TODO: unroll this loop */
1314 "xorl %%eax, %%eax \n\t"
1317 PREFETCH" 32(%0, %%eax) \n\t"
1318 "movq (%0, %%eax), %%mm0 \n\t"
1319 "movq %%mm0, %%mm1 \n\t"
1320 "movq %%mm0, %%mm2 \n\t"
1321 "pslld $16, %%mm0 \n\t"
1322 "psrld $16, %%mm1 \n\t"
1323 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1324 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1325 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1326 "por %%mm0, %%mm2 \n\t"
1327 "por %%mm1, %%mm2 \n\t"
1328 MOVNTQ" %%mm2, (%1, %%eax) \n\t"
1329 "addl $8, %%eax \n\t"
1330 "cmpl %2, %%eax \n\t"
1332 :: "r" (src), "r"(dst), "r" (src_size-7)
1336 __asm __volatile(SFENCE:::"memory");
1337 __asm __volatile(EMMS:::"memory");
1340 unsigned num_pixels = src_size >> 2;
1341 for(i=0; i<num_pixels; i++)
1343 dst[4*i + 0] = src[4*i + 2];
1344 dst[4*i + 1] = src[4*i + 1];
1345 dst[4*i + 2] = src[4*i + 0];
1350 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1354 int mmx_size= 23 - src_size;
1356 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1357 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1358 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1361 PREFETCH" 32(%1, %%eax) \n\t"
1362 "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1363 "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
1364 "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
1365 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1366 "pand %%mm5, %%mm0 \n\t"
1367 "pand %%mm6, %%mm1 \n\t"
1368 "pand %%mm7, %%mm2 \n\t"
1369 "por %%mm0, %%mm1 \n\t"
1370 "por %%mm2, %%mm1 \n\t"
1371 "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1372 MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
1373 "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
1374 "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
1375 "pand %%mm7, %%mm0 \n\t"
1376 "pand %%mm5, %%mm1 \n\t"
1377 "pand %%mm6, %%mm2 \n\t"
1378 "por %%mm0, %%mm1 \n\t"
1379 "por %%mm2, %%mm1 \n\t"
1380 "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
1381 MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
1382 "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
1383 "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
1384 "pand %%mm6, %%mm0 \n\t"
1385 "pand %%mm7, %%mm1 \n\t"
1386 "pand %%mm5, %%mm2 \n\t"
1387 "por %%mm0, %%mm1 \n\t"
1388 "por %%mm2, %%mm1 \n\t"
1389 MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
1390 "addl $24, %%eax \n\t"
1393 : "r" (src-mmx_size), "r"(dst-mmx_size)
1396 __asm __volatile(SFENCE:::"memory");
1397 __asm __volatile(EMMS:::"memory");
1399 if(mmx_size==23) return; //finihsed, was multiple of 8
1403 src_size= 23-mmx_size;
1407 for(i=0; i<src_size; i+=3)
1411 dst[i + 1] = src[i + 1];
1412 dst[i + 2] = src[i + 0];
1417 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1418 unsigned int width, unsigned int height,
1419 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1422 const unsigned chromWidth= width>>1;
1423 for(y=0; y<height; y++)
1426 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1428 "xorl %%eax, %%eax \n\t"
1431 PREFETCH" 32(%1, %%eax, 2) \n\t"
1432 PREFETCH" 32(%2, %%eax) \n\t"
1433 PREFETCH" 32(%3, %%eax) \n\t"
1434 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
1435 "movq %%mm0, %%mm2 \n\t" // U(0)
1436 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
1437 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1438 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1440 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
1441 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
1442 "movq %%mm3, %%mm4 \n\t" // Y(0)
1443 "movq %%mm5, %%mm6 \n\t" // Y(8)
1444 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1445 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1446 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1447 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1449 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
1450 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
1451 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
1452 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1454 "addl $8, %%eax \n\t"
1455 "cmpl %4, %%eax \n\t"
1457 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1462 #if defined ARCH_ALPHA && defined HAVE_MVI
1463 #define pl2yuy2(n) \
1468 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1469 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1470 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1471 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1472 yuv1 = (u << 8) + (v << 24); \
1479 uint64_t *qdst = (uint64_t *) dst;
1480 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1481 const uint32_t *yc = (uint32_t *) ysrc;
1482 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1483 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1484 for(i = 0; i < chromWidth; i += 8){
1485 uint64_t y1, y2, yuv1, yuv2;
1488 asm("ldq $31,64(%0)" :: "r"(yc));
1489 asm("ldq $31,64(%0)" :: "r"(yc2));
1490 asm("ldq $31,64(%0)" :: "r"(uc));
1491 asm("ldq $31,64(%0)" :: "r"(vc));
1509 #elif __WORDSIZE >= 64
1511 uint64_t *ldst = (uint64_t *) dst;
1512 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1513 for(i = 0; i < chromWidth; i += 2){
1515 k = yc[0] + (uc[0] << 8) +
1516 (yc[1] << 16) + (vc[0] << 24);
1517 l = yc[2] + (uc[1] << 8) +
1518 (yc[3] << 16) + (vc[1] << 24);
1519 *ldst++ = k + (l << 32);
1526 int i, *idst = (int32_t *) dst;
1527 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1528 for(i = 0; i < chromWidth; i++){
1529 *idst++ = yc[0] + (uc[0] << 8) +
1530 (yc[1] << 16) + (vc[0] << 24);
1537 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1539 usrc += chromStride;
1540 vsrc += chromStride;
1554 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1555 * problem for anyone then tell me, and ill fix it)
1557 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1558 unsigned int width, unsigned int height,
1559 int lumStride, int chromStride, int dstStride)
1561 //FIXME interpolate chroma
1562 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1567 * width should be a multiple of 16
1569 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1570 unsigned int width, unsigned int height,
1571 int lumStride, int chromStride, int dstStride)
1573 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1578 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1579 * problem for anyone then tell me, and ill fix it)
/*
 * Unpack interleaved YUY2 (Y U Y V byte order) into planar YV12.
 * Lines are processed in pairs: the even line yields a luma row plus the
 * subsampled U/V rows; the odd line yields luma only (its chroma samples
 * are discarded). NOTE(review): the MMX fast path and the plain-C fallback
 * below are selected by preprocessor conditionals not visible in this
 * excerpt — confirm against the full file.
 */
1581 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1582 unsigned int width, unsigned int height,
1583 int lumStride, int chromStride, int srcStride)
1586 const unsigned chromWidth= width>>1;
1587 for(y=0; y<height; y+=2)
/* Even line: mm7 holds the 0x00FF word mask; Y bytes are the masked low
 * bytes, U/V the shifted-down high bytes of each YUYV word pair. */
1591 "xorl %%eax, %%eax \n\t"
1592 "pcmpeqw %%mm7, %%mm7 \n\t"
1593 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1596 PREFETCH" 64(%0, %%eax, 4) \n\t"
1597 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1598 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1599 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1600 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1601 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1602 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1603 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1604 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1605 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1606 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1608 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1610 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
1611 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
1612 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1613 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1614 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1615 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1616 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1617 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1618 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1619 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1621 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
/* Split the interleaved UV bytes into separate U and V plane rows. */
1623 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1624 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1625 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1626 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1627 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1628 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1629 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1630 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1632 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1633 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1635 "addl $8, %%eax \n\t"
1636 "cmpl %4, %%eax \n\t"
1638 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Odd line: extract luma only; the chroma samples of this line are dropped. */
1646 "xorl %%eax, %%eax \n\t"
1649 PREFETCH" 64(%0, %%eax, 4) \n\t"
1650 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1651 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1652 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
1653 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
1654 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1655 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1656 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1657 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1658 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1659 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1661 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1662 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1664 "addl $8, %%eax \n\t"
1665 "cmpl %4, %%eax \n\t"
1668 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Plain-C reference: even line (Y + U + V)... */
1673 for(i=0; i<chromWidth; i++)
1675 ydst[2*i+0] = src[4*i+0];
1676 udst[i] = src[4*i+1];
1677 ydst[2*i+1] = src[4*i+2];
1678 vdst[i] = src[4*i+3];
/* ...then odd line (Y only). */
1683 for(i=0; i<chromWidth; i++)
1685 ydst[2*i+0] = src[4*i+0];
1686 ydst[2*i+1] = src[4*i+2];
/* Chroma planes advance once per line pair. */
1689 udst += chromStride;
1690 vdst += chromStride;
/* Flush MMX state so the FPU is usable again by callers. */
1695 asm volatile( EMMS" \n\t"
/*
 * YVU9 (4:1:0) -> YV12 (4:2:0) conversion — incomplete.
 * Only the luma plane is copied; chroma upscaling is not implemented
 * (see the XXX below).
 * NOTE(review): the memcpy assumes lumStride == width for both source and
 * destination — confirm callers guarantee this, otherwise rows interleave
 * incorrectly.
 */
1701 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1702 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1703 unsigned int width, unsigned int height, int lumStride, int chromStride)
1706 memcpy(ydst, ysrc, width*height);
1708 /* XXX: implement upscaling for U,V */
/*
 * Upscale one plane by 2x in both directions using bilinear-style 3:1
 * weighted averaging ((3*a + b) >> 2), writing two output rows per source
 * row pair. First/last rows and columns are handled as special cases.
 * NOTE(review): several structural lines (asm volatile openers, #else/#endif,
 * loop braces) are elided in this excerpt; the MMX2/3DNow path and the C
 * path are selected by the #if below.
 */
1711 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
/* First output row: horizontal 3:1 interpolation only. */
1718 for(x=0; x<srcWidth-1; x++){
1719 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1720 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1722 dst[2*srcWidth-1]= src[srcWidth-1];
1726 for(y=1; y<srcHeight; y++){
1727 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* mmxSize: widest multiple-of-16 prefix handled by the asm loop below;
 * the scalar loop afterwards finishes the remaining columns. */
1728 const int mmxSize= srcWidth&~15;
1730 "movl %4, %%eax \n\t"
1732 "movq (%0, %%eax), %%mm0 \n\t"
1733 "movq (%1, %%eax), %%mm1 \n\t"
1734 "movq 1(%0, %%eax), %%mm2 \n\t"
1735 "movq 1(%1, %%eax), %%mm3 \n\t"
1736 "movq -1(%0, %%eax), %%mm4 \n\t"
1737 "movq -1(%1, %%eax), %%mm5 \n\t"
/* Applying PAVGB twice against the same register yields the 3:1 weighting
 * (avg(avg(a,b),b)) — the repetition below is intentional. */
1738 PAVGB" %%mm0, %%mm5 \n\t"
1739 PAVGB" %%mm0, %%mm3 \n\t"
1740 PAVGB" %%mm0, %%mm5 \n\t"
1741 PAVGB" %%mm0, %%mm3 \n\t"
1742 PAVGB" %%mm1, %%mm4 \n\t"
1743 PAVGB" %%mm1, %%mm2 \n\t"
1744 PAVGB" %%mm1, %%mm4 \n\t"
1745 PAVGB" %%mm1, %%mm2 \n\t"
1746 "movq %%mm5, %%mm7 \n\t"
1747 "movq %%mm4, %%mm6 \n\t"
1748 "punpcklbw %%mm3, %%mm5 \n\t"
1749 "punpckhbw %%mm3, %%mm7 \n\t"
1750 "punpcklbw %%mm2, %%mm4 \n\t"
1751 "punpckhbw %%mm2, %%mm6 \n\t"
1753 MOVNTQ" %%mm5, (%2, %%eax, 2) \n\t"
1754 MOVNTQ" %%mm7, 8(%2, %%eax, 2) \n\t"
1755 MOVNTQ" %%mm4, (%3, %%eax, 2) \n\t"
1756 MOVNTQ" %%mm6, 8(%3, %%eax, 2) \n\t"
/* Non-MOVNTQ variant of the same four stores (alternate build branch). */
1758 "movq %%mm5, (%2, %%eax, 2) \n\t"
1759 "movq %%mm7, 8(%2, %%eax, 2) \n\t"
1760 "movq %%mm4, (%3, %%eax, 2) \n\t"
1761 "movq %%mm6, 8(%3, %%eax, 2) \n\t"
1763 "addl $8, %%eax \n\t"
1765 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1766 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
/* C-only build: no asm prefix was processed, start the scalar loop at 1. */
1772 const int mmxSize=1;
/* First column of this row pair: vertical interpolation only. */
1774 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1775 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
/* Interior: diagonal 3:1 blends producing 2x2 output per source pixel. */
1777 for(x=mmxSize-1; x<srcWidth-1; x++){
1778 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1779 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1780 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1781 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
/* Last column of this row pair. */
1783 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1784 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* Last output row: horizontal interpolation only (mirrors the first row). */
1794 for(x=0; x<srcWidth-1; x++){
1795 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1796 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1798 dst[2*srcWidth-1]= src[srcWidth-1];
1800 for(x=0; x<srcWidth; x++){
/* Flush MMX state before returning. */
1807 asm volatile( EMMS" \n\t"
1815 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1816 * problem for anyone then tell me, and I'll fix it)
1817 * chrominance data is only taken from every second line; others are ignored. FIXME: write HQ version
/*
 * Unpack interleaved UYVY (U Y V Y byte order) into planar YV12.
 * Structurally identical to yuy2toyv12() above, but with the mask/shift
 * roles swapped because luma occupies the high byte of each word here.
 * Even lines yield Y + U + V, odd lines yield Y only.
 * NOTE(review): the MMX path and the C fallback are selected by
 * preprocessor conditionals elided from this excerpt.
 */
1819 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1820 unsigned int width, unsigned int height,
1821 int lumStride, int chromStride, int srcStride)
1824 const unsigned chromWidth= width>>1;
1825 for(y=0; y<height; y+=2)
/* Even line: mm7 = 0x00FF word mask; U/V are the masked low bytes, Y the
 * shifted-down high bytes (opposite of the YUY2 variant). */
1829 "xorl %%eax, %%eax \n\t"
1830 "pcmpeqw %%mm7, %%mm7 \n\t"
1831 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1834 PREFETCH" 64(%0, %%eax, 4) \n\t"
1835 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1836 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1837 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1838 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1839 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1840 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1841 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1842 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1843 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1844 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1846 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1848 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
1849 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
1850 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1851 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1852 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1853 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1854 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1855 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1856 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1857 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1859 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
/* Split the interleaved UV bytes into the separate U and V plane rows. */
1861 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1862 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1863 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1864 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1865 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1866 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1867 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1868 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1870 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1871 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1873 "addl $8, %%eax \n\t"
1874 "cmpl %4, %%eax \n\t"
1876 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Odd line: luma only; this line's chroma samples are discarded. */
1884 "xorl %%eax, %%eax \n\t"
1887 PREFETCH" 64(%0, %%eax, 4) \n\t"
1888 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1889 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1890 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
1891 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
1892 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1893 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1894 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1895 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1896 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1897 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1899 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1900 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1902 "addl $8, %%eax \n\t"
1903 "cmpl %4, %%eax \n\t"
1906 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Plain-C reference: even line (U Y V Y byte order)... */
1911 for(i=0; i<chromWidth; i++)
1913 udst[i] = src[4*i+0];
1914 ydst[2*i+0] = src[4*i+1];
1915 vdst[i] = src[4*i+2];
1916 ydst[2*i+1] = src[4*i+3];
/* ...then odd line (Y only). */
1921 for(i=0; i<chromWidth; i++)
1923 ydst[2*i+0] = src[4*i+1];
1924 ydst[2*i+1] = src[4*i+3];
/* Chroma planes advance once per line pair. */
1927 udst += chromStride;
1928 vdst += chromStride;
/* Flush MMX state so the FPU is usable again by callers. */
1933 asm volatile( EMMS" \n\t"
1941 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
1942 * problem for anyone then tell me, and I'll fix it)
1943 * chrominance data is only taken from every second line (others are ignored) in the C version. FIXME: write HQ version
/*
 * Convert packed 24-bit BGR to planar YV12.
 * MMX path: per line pair, a first asm loop computes the Y plane row via
 * pmaddwd against the bgr2YCoeff table; a second asm loop averages 2x2
 * pixel blocks across two source lines and produces the subsampled U/V
 * rows via the bgr2UCoeff/bgr2VCoeff tables. A plain-C tail then handles
 * the remaining lines using the RY/GY/BY... integer coefficients.
 * NOTE(review): asm statement openers, clobber lists, #else/#endif and
 * loop braces are elided from this excerpt; register roles below are
 * inferred from the visible instruction stream.
 */
1945 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1946 unsigned int width, unsigned int height,
1947 int lumStride, int chromStride, int srcStride)
1950 const unsigned chromWidth= width>>1;
1952 for(y=0; y<height-2; y+=2)
/* --- Luma pass: eax counts pixels (negative, toward 0); ebx = 3*eax is
 * the byte offset into the BGR data; mm6 = Y coefficients, mm5 = w1111,
 * mm7 = 0 (for byte->word unpacking). --- */
1958 "movl %2, %%eax \n\t"
1959 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1960 "movq "MANGLE(w1111)", %%mm5 \n\t"
1961 "pxor %%mm7, %%mm7 \n\t"
1962 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1965 PREFETCH" 64(%0, %%ebx) \n\t"
/* Pixels 0-3: load 4 BGR triplets, widen to words, dot with Y coeffs. */
1966 "movd (%0, %%ebx), %%mm0 \n\t"
1967 "movd 3(%0, %%ebx), %%mm1 \n\t"
1968 "punpcklbw %%mm7, %%mm0 \n\t"
1969 "punpcklbw %%mm7, %%mm1 \n\t"
1970 "movd 6(%0, %%ebx), %%mm2 \n\t"
1971 "movd 9(%0, %%ebx), %%mm3 \n\t"
1972 "punpcklbw %%mm7, %%mm2 \n\t"
1973 "punpcklbw %%mm7, %%mm3 \n\t"
1974 "pmaddwd %%mm6, %%mm0 \n\t"
1975 "pmaddwd %%mm6, %%mm1 \n\t"
1976 "pmaddwd %%mm6, %%mm2 \n\t"
1977 "pmaddwd %%mm6, %%mm3 \n\t"
1978 #ifndef FAST_BGR2YV12
/* Precise mode: drop 8 fractional bits before re-packing. */
1979 "psrad $8, %%mm0 \n\t"
1980 "psrad $8, %%mm1 \n\t"
1981 "psrad $8, %%mm2 \n\t"
1982 "psrad $8, %%mm3 \n\t"
1984 "packssdw %%mm1, %%mm0 \n\t"
1985 "packssdw %%mm3, %%mm2 \n\t"
1986 "pmaddwd %%mm5, %%mm0 \n\t"
1987 "pmaddwd %%mm5, %%mm2 \n\t"
1988 "packssdw %%mm2, %%mm0 \n\t"
1989 "psraw $7, %%mm0 \n\t"
/* Pixels 4-7: same computation into mm4. */
1991 "movd 12(%0, %%ebx), %%mm4 \n\t"
1992 "movd 15(%0, %%ebx), %%mm1 \n\t"
1993 "punpcklbw %%mm7, %%mm4 \n\t"
1994 "punpcklbw %%mm7, %%mm1 \n\t"
1995 "movd 18(%0, %%ebx), %%mm2 \n\t"
1996 "movd 21(%0, %%ebx), %%mm3 \n\t"
1997 "punpcklbw %%mm7, %%mm2 \n\t"
1998 "punpcklbw %%mm7, %%mm3 \n\t"
1999 "pmaddwd %%mm6, %%mm4 \n\t"
2000 "pmaddwd %%mm6, %%mm1 \n\t"
2001 "pmaddwd %%mm6, %%mm2 \n\t"
2002 "pmaddwd %%mm6, %%mm3 \n\t"
2003 #ifndef FAST_BGR2YV12
2004 "psrad $8, %%mm4 \n\t"
2005 "psrad $8, %%mm1 \n\t"
2006 "psrad $8, %%mm2 \n\t"
2007 "psrad $8, %%mm3 \n\t"
2009 "packssdw %%mm1, %%mm4 \n\t"
2010 "packssdw %%mm3, %%mm2 \n\t"
2011 "pmaddwd %%mm5, %%mm4 \n\t"
2012 "pmaddwd %%mm5, %%mm2 \n\t"
2013 "addl $24, %%ebx \n\t"
2014 "packssdw %%mm2, %%mm4 \n\t"
2015 "psraw $7, %%mm4 \n\t"
/* Pack 8 Y results to bytes, add the +16 luma offset, stream out. */
2017 "packuswb %%mm4, %%mm0 \n\t"
2018 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2020 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
2021 "addl $8, %%eax \n\t"
2023 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
/* --- Chroma pass: %0/%1 are the two source lines of the pair; each
 * iteration averages 2x2 blocks and emits 4 U and 4 V samples. --- */
2031 "movl %4, %%eax \n\t"
2032 "movq "MANGLE(w1111)", %%mm5 \n\t"
2033 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2034 "pxor %%mm7, %%mm7 \n\t"
2035 "leal (%%eax, %%eax, 2), %%ebx \n\t"
2036 "addl %%ebx, %%ebx \n\t"
2039 PREFETCH" 64(%0, %%ebx) \n\t"
2040 PREFETCH" 64(%1, %%ebx) \n\t"
2041 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* Fast 2x2 average of blocks 0-1 via PAVGB (vertical, then horizontal
 * using the 24-bit-shifted copy). */
2042 "movq (%0, %%ebx), %%mm0 \n\t"
2043 "movq (%1, %%ebx), %%mm1 \n\t"
2044 "movq 6(%0, %%ebx), %%mm2 \n\t"
2045 "movq 6(%1, %%ebx), %%mm3 \n\t"
2046 PAVGB" %%mm1, %%mm0 \n\t"
2047 PAVGB" %%mm3, %%mm2 \n\t"
2048 "movq %%mm0, %%mm1 \n\t"
2049 "movq %%mm2, %%mm3 \n\t"
2050 "psrlq $24, %%mm0 \n\t"
2051 "psrlq $24, %%mm2 \n\t"
2052 PAVGB" %%mm1, %%mm0 \n\t"
2053 PAVGB" %%mm3, %%mm2 \n\t"
2054 "punpcklbw %%mm7, %%mm0 \n\t"
2055 "punpcklbw %%mm7, %%mm2 \n\t"
/* Plain-MMX fallback: sum the four pixels of each 2x2 block in words. */
2057 "movd (%0, %%ebx), %%mm0 \n\t"
2058 "movd (%1, %%ebx), %%mm1 \n\t"
2059 "movd 3(%0, %%ebx), %%mm2 \n\t"
2060 "movd 3(%1, %%ebx), %%mm3 \n\t"
2061 "punpcklbw %%mm7, %%mm0 \n\t"
2062 "punpcklbw %%mm7, %%mm1 \n\t"
2063 "punpcklbw %%mm7, %%mm2 \n\t"
2064 "punpcklbw %%mm7, %%mm3 \n\t"
2065 "paddw %%mm1, %%mm0 \n\t"
2066 "paddw %%mm3, %%mm2 \n\t"
2067 "paddw %%mm2, %%mm0 \n\t"
2068 "movd 6(%0, %%ebx), %%mm4 \n\t"
2069 "movd 6(%1, %%ebx), %%mm1 \n\t"
2070 "movd 9(%0, %%ebx), %%mm2 \n\t"
2071 "movd 9(%1, %%ebx), %%mm3 \n\t"
2072 "punpcklbw %%mm7, %%mm4 \n\t"
2073 "punpcklbw %%mm7, %%mm1 \n\t"
2074 "punpcklbw %%mm7, %%mm2 \n\t"
2075 "punpcklbw %%mm7, %%mm3 \n\t"
2076 "paddw %%mm1, %%mm4 \n\t"
2077 "paddw %%mm3, %%mm2 \n\t"
2078 "paddw %%mm4, %%mm2 \n\t"
/* Divide the 4-pixel sums by 4 to get the averages. */
2079 "psrlw $2, %%mm0 \n\t"
2080 "psrlw $2, %%mm2 \n\t"
/* Dot the averaged pixels with the U (mm6) and V (bgr2VCoeff) tables. */
2082 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2083 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2085 "pmaddwd %%mm0, %%mm1 \n\t"
2086 "pmaddwd %%mm2, %%mm3 \n\t"
2087 "pmaddwd %%mm6, %%mm0 \n\t"
2088 "pmaddwd %%mm6, %%mm2 \n\t"
2089 #ifndef FAST_BGR2YV12
2090 "psrad $8, %%mm0 \n\t"
2091 "psrad $8, %%mm1 \n\t"
2092 "psrad $8, %%mm2 \n\t"
2093 "psrad $8, %%mm3 \n\t"
2095 "packssdw %%mm2, %%mm0 \n\t"
2096 "packssdw %%mm3, %%mm1 \n\t"
2097 "pmaddwd %%mm5, %%mm0 \n\t"
2098 "pmaddwd %%mm5, %%mm1 \n\t"
2099 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2100 "psraw $7, %%mm0 \n\t"
/* Second half: blocks 2-3, same two-path structure as above. */
2102 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2103 "movq 12(%0, %%ebx), %%mm4 \n\t"
2104 "movq 12(%1, %%ebx), %%mm1 \n\t"
2105 "movq 18(%0, %%ebx), %%mm2 \n\t"
2106 "movq 18(%1, %%ebx), %%mm3 \n\t"
2107 PAVGB" %%mm1, %%mm4 \n\t"
2108 PAVGB" %%mm3, %%mm2 \n\t"
2109 "movq %%mm4, %%mm1 \n\t"
2110 "movq %%mm2, %%mm3 \n\t"
2111 "psrlq $24, %%mm4 \n\t"
2112 "psrlq $24, %%mm2 \n\t"
2113 PAVGB" %%mm1, %%mm4 \n\t"
2114 PAVGB" %%mm3, %%mm2 \n\t"
2115 "punpcklbw %%mm7, %%mm4 \n\t"
2116 "punpcklbw %%mm7, %%mm2 \n\t"
2118 "movd 12(%0, %%ebx), %%mm4 \n\t"
2119 "movd 12(%1, %%ebx), %%mm1 \n\t"
2120 "movd 15(%0, %%ebx), %%mm2 \n\t"
2121 "movd 15(%1, %%ebx), %%mm3 \n\t"
2122 "punpcklbw %%mm7, %%mm4 \n\t"
2123 "punpcklbw %%mm7, %%mm1 \n\t"
2124 "punpcklbw %%mm7, %%mm2 \n\t"
2125 "punpcklbw %%mm7, %%mm3 \n\t"
2126 "paddw %%mm1, %%mm4 \n\t"
2127 "paddw %%mm3, %%mm2 \n\t"
2128 "paddw %%mm2, %%mm4 \n\t"
2129 "movd 18(%0, %%ebx), %%mm5 \n\t"
2130 "movd 18(%1, %%ebx), %%mm1 \n\t"
2131 "movd 21(%0, %%ebx), %%mm2 \n\t"
2132 "movd 21(%1, %%ebx), %%mm3 \n\t"
2133 "punpcklbw %%mm7, %%mm5 \n\t"
2134 "punpcklbw %%mm7, %%mm1 \n\t"
2135 "punpcklbw %%mm7, %%mm2 \n\t"
2136 "punpcklbw %%mm7, %%mm3 \n\t"
2137 "paddw %%mm1, %%mm5 \n\t"
2138 "paddw %%mm3, %%mm2 \n\t"
2139 "paddw %%mm5, %%mm2 \n\t"
/* mm5 was clobbered as scratch above; reload the w1111 constant. */
2140 "movq "MANGLE(w1111)", %%mm5 \n\t"
2141 "psrlw $2, %%mm4 \n\t"
2142 "psrlw $2, %%mm2 \n\t"
2144 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2145 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2147 "pmaddwd %%mm4, %%mm1 \n\t"
2148 "pmaddwd %%mm2, %%mm3 \n\t"
2149 "pmaddwd %%mm6, %%mm4 \n\t"
2150 "pmaddwd %%mm6, %%mm2 \n\t"
2151 #ifndef FAST_BGR2YV12
2152 "psrad $8, %%mm4 \n\t"
2153 "psrad $8, %%mm1 \n\t"
2154 "psrad $8, %%mm2 \n\t"
2155 "psrad $8, %%mm3 \n\t"
2157 "packssdw %%mm2, %%mm4 \n\t"
2158 "packssdw %%mm3, %%mm1 \n\t"
2159 "pmaddwd %%mm5, %%mm4 \n\t"
2160 "pmaddwd %%mm5, %%mm1 \n\t"
2161 "addl $24, %%ebx \n\t"
2162 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2163 "psraw $7, %%mm4 \n\t"
/* Regroup U0-U3 / V0-V3, pack to bytes, add the +128 chroma offset, and
 * write 4 bytes each into the U and V plane rows. */
2165 "movq %%mm0, %%mm1 \n\t"
2166 "punpckldq %%mm4, %%mm0 \n\t"
2167 "punpckhdq %%mm4, %%mm1 \n\t"
2168 "packsswb %%mm1, %%mm0 \n\t"
2169 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2171 "movd %%mm0, (%2, %%eax) \n\t"
2172 "punpckhdq %%mm0, %%mm0 \n\t"
2173 "movd %%mm0, (%3, %%eax) \n\t"
2174 "addl $4, %%eax \n\t"
2176 : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2180 udst += chromStride;
2181 vdst += chromStride;
/* Flush MMX state before the scalar tail. */
2185 asm volatile( EMMS" \n\t"
/* --- C tail: remaining line pairs, using the integer RGB->YUV macros.
 * Even line computes Y, U and V; chroma is taken from this line only. --- */
2191 for(; y<height; y+=2)
2194 for(i=0; i<chromWidth; i++)
2196 unsigned int b= src[6*i+0];
2197 unsigned int g= src[6*i+1];
2198 unsigned int r= src[6*i+2];
2200 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2201 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2202 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2212 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/* Odd line: luma only. */
2218 for(i=0; i<chromWidth; i++)
2220 unsigned int b= src[6*i+0];
2221 unsigned int g= src[6*i+1];
2222 unsigned int r= src[6*i+2];
2224 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2232 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2235 udst += chromStride;
2236 vdst += chromStride;
/*
 * Interleave two byte planes: dest[2i] = src1[i], dest[2i+1] = src2[i],
 * line by line. Three paths (selected by conditionals elided from this
 * excerpt): SSE2 (movdqa/movntdq, 16 bytes per source per iteration),
 * MMX (punpcklbw/punpckhbw), and a scalar loop. The vector paths handle
 * width & ~15 bytes; a scalar loop finishes the remainder.
 * NOTE(review): the SSE2 path's movdqa requires 16-byte-aligned source
 * rows — presumably guaranteed by callers/strides; confirm.
 */
2242 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2243 unsigned width, unsigned height, int src1Stride,
2244 int src2Stride, int dstStride){
2247 for(h=0; h < height; h++)
/* SSE2 path: load 16 bytes from each plane, interleave low/high halves. */
2254 "xorl %%eax, %%eax \n\t"
2256 PREFETCH" 64(%1, %%eax) \n\t"
2257 PREFETCH" 64(%2, %%eax) \n\t"
2258 "movdqa (%1, %%eax), %%xmm0 \n\t"
2259 "movdqa (%1, %%eax), %%xmm1 \n\t"
2260 "movdqa (%2, %%eax), %%xmm2 \n\t"
2261 "punpcklbw %%xmm2, %%xmm0 \n\t"
2262 "punpckhbw %%xmm2, %%xmm1 \n\t"
2263 "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2264 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2265 "addl $16, %%eax \n\t"
2266 "cmpl %3, %%eax \n\t"
2268 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
/* MMX path: same interleave with 8-byte registers, two quads per pass. */
2273 "xorl %%eax, %%eax \n\t"
2275 PREFETCH" 64(%1, %%eax) \n\t"
2276 PREFETCH" 64(%2, %%eax) \n\t"
2277 "movq (%1, %%eax), %%mm0 \n\t"
2278 "movq 8(%1, %%eax), %%mm2 \n\t"
2279 "movq %%mm0, %%mm1 \n\t"
2280 "movq %%mm2, %%mm3 \n\t"
2281 "movq (%2, %%eax), %%mm4 \n\t"
2282 "movq 8(%2, %%eax), %%mm5 \n\t"
2283 "punpcklbw %%mm4, %%mm0 \n\t"
2284 "punpckhbw %%mm4, %%mm1 \n\t"
2285 "punpcklbw %%mm5, %%mm2 \n\t"
2286 "punpckhbw %%mm5, %%mm3 \n\t"
2287 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
2288 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
2289 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2290 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2291 "addl $16, %%eax \n\t"
2292 "cmpl %3, %%eax \n\t"
2294 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
/* Scalar remainder after the vector path (from the last multiple of 16). */
2298 for(w= (width&(~15)); w < width; w++)
2300 dest[2*w+0] = src1[w];
2301 dest[2*w+1] = src2[w];
/* Pure-C fallback: whole row. */
2304 for(w=0; w < width; w++)
2306 dest[2*w+0] = src1[w];
2307 dest[2*w+1] = src2[w];
/*
 * Upscale two chroma planes by 2x in both directions using simple pixel
 * doubling (no interpolation): each source byte is replicated horizontally
 * via punpcklbw/punpckhbw with itself, and each source row is read for two
 * output rows (srcStride*(y>>1)). The two planes are processed by two
 * near-identical loops (src1->dst1, src2->dst2); a scalar loop finishes
 * the columns the 32-byte asm blocks did not cover.
 * NOTE(review): asm statement openers and loop braces are elided from this
 * excerpt.
 */
2323 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2324 uint8_t *dst1, uint8_t *dst2,
2325 unsigned width, unsigned height,
2326 int srcStride1, int srcStride2,
2327 int dstStride1, int dstStride2)
2331 w=width/2; h=height/2;
2336 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* Plane 1: each output row y reads source row y/2. */
2339 const uint8_t* s1=src1+srcStride1*(y>>1);
2340 uint8_t* d=dst1+dstStride1*y;
/* Duplicate 32 source bytes into 64 output bytes per iteration. */
2347 "movq %1, %%mm0\n\t"
2348 "movq 8%1, %%mm2\n\t"
2349 "movq 16%1, %%mm4\n\t"
2350 "movq 24%1, %%mm6\n\t"
2351 "movq %%mm0, %%mm1\n\t"
2352 "movq %%mm2, %%mm3\n\t"
2353 "movq %%mm4, %%mm5\n\t"
2354 "movq %%mm6, %%mm7\n\t"
2355 "punpcklbw %%mm0, %%mm0\n\t"
2356 "punpckhbw %%mm1, %%mm1\n\t"
2357 "punpcklbw %%mm2, %%mm2\n\t"
2358 "punpckhbw %%mm3, %%mm3\n\t"
2359 "punpcklbw %%mm4, %%mm4\n\t"
2360 "punpckhbw %%mm5, %%mm5\n\t"
2361 "punpcklbw %%mm6, %%mm6\n\t"
2362 "punpckhbw %%mm7, %%mm7\n\t"
2363 MOVNTQ" %%mm0, %0\n\t"
2364 MOVNTQ" %%mm1, 8%0\n\t"
2365 MOVNTQ" %%mm2, 16%0\n\t"
2366 MOVNTQ" %%mm3, 24%0\n\t"
2367 MOVNTQ" %%mm4, 32%0\n\t"
2368 MOVNTQ" %%mm5, 40%0\n\t"
2369 MOVNTQ" %%mm6, 48%0\n\t"
2370 MOVNTQ" %%mm7, 56%0"
/* Scalar remainder for plane 1. */
2376 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* Plane 2: identical duplication loop. */
2379 const uint8_t* s2=src2+srcStride2*(y>>1);
2380 uint8_t* d=dst2+dstStride2*y;
2387 "movq %1, %%mm0\n\t"
2388 "movq 8%1, %%mm2\n\t"
2389 "movq 16%1, %%mm4\n\t"
2390 "movq 24%1, %%mm6\n\t"
2391 "movq %%mm0, %%mm1\n\t"
2392 "movq %%mm2, %%mm3\n\t"
2393 "movq %%mm4, %%mm5\n\t"
2394 "movq %%mm6, %%mm7\n\t"
2395 "punpcklbw %%mm0, %%mm0\n\t"
2396 "punpckhbw %%mm1, %%mm1\n\t"
2397 "punpcklbw %%mm2, %%mm2\n\t"
2398 "punpckhbw %%mm3, %%mm3\n\t"
2399 "punpcklbw %%mm4, %%mm4\n\t"
2400 "punpckhbw %%mm5, %%mm5\n\t"
2401 "punpcklbw %%mm6, %%mm6\n\t"
2402 "punpckhbw %%mm7, %%mm7\n\t"
2403 MOVNTQ" %%mm0, %0\n\t"
2404 MOVNTQ" %%mm1, 8%0\n\t"
2405 MOVNTQ" %%mm2, 16%0\n\t"
2406 MOVNTQ" %%mm3, 24%0\n\t"
2407 MOVNTQ" %%mm4, 32%0\n\t"
2408 MOVNTQ" %%mm5, 40%0\n\t"
2409 MOVNTQ" %%mm6, 48%0\n\t"
2410 MOVNTQ" %%mm7, 56%0"
/* Scalar remainder for plane 2. */
2416 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2427 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2429 unsigned width, unsigned height,
2430 int srcStride1, int srcStride2,
2431 int srcStride3, int dstStride)
2434 w=width/2; h=height;
2436 const uint8_t* yp=src1+srcStride1*y;
2437 const uint8_t* up=src2+srcStride2*(y>>2);
2438 const uint8_t* vp=src3+srcStride3*(y>>2);
2439 uint8_t* d=dst+dstStride*y;
2445 PREFETCH" 32(%1, %0)\n\t"
2446 PREFETCH" 32(%2, %0)\n\t"
2447 PREFETCH" 32(%3, %0)\n\t"
2448 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2449 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2450 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2451 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2452 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2453 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2454 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2455 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2456 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2457 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2459 "movq %%mm1, %%mm6\n\t"
2460 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2461 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2462 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2463 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2464 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2466 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2467 "movq 8(%1, %0, 4), %%mm0\n\t"
2468 "movq %%mm0, %%mm3\n\t"
2469 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2470 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2471 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2472 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2474 "movq %%mm4, %%mm6\n\t"
2475 "movq 16(%1, %0, 4), %%mm0\n\t"
2476 "movq %%mm0, %%mm3\n\t"
2477 "punpcklbw %%mm5, %%mm4\n\t"
2478 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2479 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2480 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2481 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2483 "punpckhbw %%mm5, %%mm6\n\t"
2484 "movq 24(%1, %0, 4), %%mm0\n\t"
2485 "movq %%mm0, %%mm3\n\t"
2486 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2487 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2488 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2489 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2492 : "r"(yp), "r" (up), "r"(vp), "r"(d)