3 * rgb2rgb.c, Software RGB to RGB convertor
4 * pluralize by Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
12 #include <inttypes.h> /* for __WORDSIZE */
15 #warning You have misconfigured system and probably will lose performance!
/*
 * Instruction-name strings that are pasted into the inline assembly used by
 * the converters in this file.  Three alternative PREFETCH/PREFETCHW sets are
 * visible; only the "#elif defined ( HAVE_MMX2 )" guard appears in this
 * excerpt, so the conditions selecting the first and the last set are not
 * shown here.
 */
33 #define PREFETCH "prefetch"
34 #define PREFETCHW "prefetchw"
35 #define PAVGB "pavgusb"
36 #elif defined ( HAVE_MMX2 )
37 #define PREFETCH "prefetchnta"
38 #define PREFETCHW "prefetcht0"
/* NOTE(review): "/nop" presumably relies on the assembler treating '/' as a
 * comment start so that the prefetch operand is ignored -- confirm for the
 * target assembler. */
41 #define PREFETCH "/nop"
42 #define PREFETCHW "/nop"
46 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
/* Store and fence mnemonics used by the MMX loops (one visible definition;
 * any alternative definitions are outside this excerpt). */
53 #define MOVNTQ "movntq"
54 #define SFENCE "sfence"
/*
 * rgb24to32 -- judging by the name, expands 24-bit pixels to 32-bit pixels
 * (TODO confirm against the parts of the body not visible here).
 *
 * Visible MMX path: mask32 is loaded into mm7; source dwords are fetched at
 * 3-byte steps (offsets 3, 9, 12, 15, 18 and 21 are visible), every register
 * is AND-ed with mm7 and the results are written with MOVNTQ.  SFENCE and
 * EMMS are issued afterwards.
 *
 * src, dst  : input and output buffers.
 * src_size  : presumably the number of source bytes -- TODO confirm.
 */
60 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
63 const uint8_t *s = src;
66 const uint8_t *mm_end;
70 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
72 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
78 "punpckldq 3%1, %%mm0\n\t"
80 "punpckldq 9%1, %%mm1\n\t"
81 "movd 12%1, %%mm2\n\t"
82 "punpckldq 15%1, %%mm2\n\t"
83 "movd 18%1, %%mm3\n\t"
84 "punpckldq 21%1, %%mm3\n\t"
85 "pand %%mm7, %%mm0\n\t"
86 "pand %%mm7, %%mm1\n\t"
87 "pand %%mm7, %%mm2\n\t"
88 "pand %%mm7, %%mm3\n\t"
89 MOVNTQ" %%mm0, %0\n\t"
90 MOVNTQ" %%mm1, 8%0\n\t"
91 MOVNTQ" %%mm2, 16%0\n\t"
99 __asm __volatile(SFENCE:::"memory");
100 __asm __volatile(EMMS:::"memory");
/*
 * rgb32to24 -- judging by the name, repacks 32-bit pixels as 24-bit pixels
 * (TODO confirm against the parts of the body not visible here).
 *
 * Visible MMX path: source quadwords are loaded (offsets 8, 16 and 24 are
 * visible), each is duplicated and the copy shifted right by 8 bits, the
 * pairs are OR-ed together, and the results are then shifted by 48/32/16
 * bits and merged before being written with MOVNTQ.  mask24l .. mask24hhhh
 * are passed as memory operands; the instructions that use them are not part
 * of this excerpt.  SFENCE and EMMS follow.
 */
111 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
114 const uint8_t *s = src;
117 const uint8_t *mm_end;
121 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
128 "movq 8%1, %%mm1\n\t"
129 "movq 16%1, %%mm4\n\t"
130 "movq 24%1, %%mm5\n\t"
131 "movq %%mm0, %%mm2\n\t"
132 "movq %%mm1, %%mm3\n\t"
133 "movq %%mm4, %%mm6\n\t"
134 "movq %%mm5, %%mm7\n\t"
135 "psrlq $8, %%mm2\n\t"
136 "psrlq $8, %%mm3\n\t"
137 "psrlq $8, %%mm6\n\t"
138 "psrlq $8, %%mm7\n\t"
147 "por %%mm2, %%mm0\n\t"
148 "por %%mm3, %%mm1\n\t"
149 "por %%mm6, %%mm4\n\t"
150 "por %%mm7, %%mm5\n\t"
152 "movq %%mm1, %%mm2\n\t"
153 "movq %%mm4, %%mm3\n\t"
154 "psllq $48, %%mm2\n\t"
155 "psllq $32, %%mm3\n\t"
158 "por %%mm2, %%mm0\n\t"
159 "psrlq $16, %%mm1\n\t"
160 "psrlq $32, %%mm4\n\t"
161 "psllq $16, %%mm5\n\t"
162 "por %%mm3, %%mm1\n\t"
164 "por %%mm5, %%mm4\n\t"
166 MOVNTQ" %%mm0, %0\n\t"
167 MOVNTQ" %%mm1, 8%0\n\t"
170 :"m"(*s),"m"(mask24l),
171 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
176 __asm __volatile(SFENCE:::"memory");
177 __asm __volatile(EMMS:::"memory");
189 Original by Strepto/Astral
190 ported to gcc & bugfixed : A'rpi
191 MMX2, 3DNOW optimization by Nick Kurshev
192 32bit c version, and and&add trick by Michael Niedermayer
/*
 * rgb15to16 -- converts 15-bit pixels to 16-bit pixels using the "and & add"
 * trick: the bits selected by 0x7FE0 are added to the pixel once more, which
 * moves that upper field one bit to the left while the low 5 bits stay put.
 *
 * Visible paths:
 *  - MMX: mask15s is loaded into mm4; each quadword is AND-ed with it and
 *    added (paddw) to an unmasked copy, then stored with MOVNTQ.
 *  - C, 32 bits at a time: (x&0x7FFF7FFF) + (x&0x7FE07FE0).
 *  - C, 16 bits at a time: (x&0x7FFF) + (x&0x7FE0).
 * The loop conditions choosing between these paths are not visible here.
 */
194 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
196 register const uint8_t* s=src;
197 register uint8_t* d=dst;
198 register const uint8_t *end;
199 const uint8_t *mm_end;
202 __asm __volatile(PREFETCH" %0"::"m"(*s));
203 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
210 "movq 8%1, %%mm2\n\t"
211 "movq %%mm0, %%mm1\n\t"
212 "movq %%mm2, %%mm3\n\t"
213 "pand %%mm4, %%mm0\n\t"
214 "pand %%mm4, %%mm2\n\t"
215 "paddw %%mm1, %%mm0\n\t"
216 "paddw %%mm3, %%mm2\n\t"
217 MOVNTQ" %%mm0, %0\n\t"
225 __asm __volatile(SFENCE:::"memory");
226 __asm __volatile(EMMS:::"memory");
231 register unsigned x= *((uint32_t *)s);
232 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
238 register unsigned short x= *((uint16_t *)s);
239 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/*
 * bgr24torgb24 -- judging by the name, swaps the channel order of 24-bit
 * pixels; the loop body is not visible in this excerpt.
 *
 * NOTE(review): num_pixels is src_size/3, yet the loop counter j advances by
 * 3 per iteration and is compared against num_pixels.  If j is used as a
 * byte offset in the (unseen) body, the loop would stop after roughly a
 * third of the buffer -- confirm against the full body.
 */
243 static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
245 unsigned j,i,num_pixels=src_size/3;
246 for(i=0,j=0; j<num_pixels; i+=3,j+=3)
/*
 * rgb16to15 -- converts 16-bit pixels to 15-bit pixels: the upper fields are
 * shifted right by one bit and the low 5 bits are kept unchanged.
 *
 * Visible paths:
 *  - MMX: mask15rg is loaded into mm7 and mask15b into mm6; the result is
 *    ((x >> 1) & mm7) | (x & mm6), stored with MOVNTQ.
 *  - C, 32 bits at a time: ((x>>1)&0x7FE07FE0) | (x&0x001F001F).
 *  - C, 16 bits at a time: ((x>>1)&0x7FE0) | (x&0x001F).
 * The loop conditions choosing between these paths are not visible here.
 */
254 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
256 register const uint8_t* s=src;
257 register uint8_t* d=dst;
258 register const uint8_t *end;
259 const uint8_t *mm_end;
262 __asm __volatile(PREFETCH" %0"::"m"(*s));
263 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
264 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
271 "movq 8%1, %%mm2\n\t"
272 "movq %%mm0, %%mm1\n\t"
273 "movq %%mm2, %%mm3\n\t"
274 "psrlq $1, %%mm0\n\t"
275 "psrlq $1, %%mm2\n\t"
276 "pand %%mm7, %%mm0\n\t"
277 "pand %%mm7, %%mm2\n\t"
278 "pand %%mm6, %%mm1\n\t"
279 "pand %%mm6, %%mm3\n\t"
280 "por %%mm1, %%mm0\n\t"
281 "por %%mm3, %%mm2\n\t"
282 MOVNTQ" %%mm0, %0\n\t"
290 __asm __volatile(SFENCE:::"memory");
291 __asm __volatile(EMMS:::"memory");
296 register uint32_t x= *((uint32_t *)s);
297 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
303 register uint16_t x= *((uint16_t *)s);
304 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/*
 * rgb32to16 -- packs 32-bit pixels into 16-bit pixels (dst is written
 * through a uint16_t pointer).
 *
 * Visible MMX path: red_16mask and green_16mask are passed to a setup asm
 * (its instructions are not visible); source dwords at offsets 4, 8 and 12
 * are gathered into mm0/mm3; three copies are shifted right by 3, 5 and 8
 * bits, the 5- and 8-bit copies are masked with mm6 and mm7, everything is
 * OR-ed, mm3 is shifted left by 16 and merged into mm0, and the quadword is
 * stored with MOVNTQ.  blue_16mask is an operand whose use is not visible.
 *
 * Visible C fallback: (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); how b, g and r
 * are read from the source is not shown in this excerpt.
 */
310 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
312 const uint8_t *s = src;
315 const uint8_t *mm_end;
317 uint16_t *d = (uint16_t *)dst;
320 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
324 ::"m"(red_16mask),"m"(green_16mask));
331 "movd 4%1, %%mm3\n\t"
332 "punpckldq 8%1, %%mm0\n\t"
333 "punpckldq 12%1, %%mm3\n\t"
334 "movq %%mm0, %%mm1\n\t"
335 "movq %%mm0, %%mm2\n\t"
336 "movq %%mm3, %%mm4\n\t"
337 "movq %%mm3, %%mm5\n\t"
338 "psrlq $3, %%mm0\n\t"
339 "psrlq $3, %%mm3\n\t"
342 "psrlq $5, %%mm1\n\t"
343 "psrlq $5, %%mm4\n\t"
344 "pand %%mm6, %%mm1\n\t"
345 "pand %%mm6, %%mm4\n\t"
346 "psrlq $8, %%mm2\n\t"
347 "psrlq $8, %%mm5\n\t"
348 "pand %%mm7, %%mm2\n\t"
349 "pand %%mm7, %%mm5\n\t"
350 "por %%mm1, %%mm0\n\t"
351 "por %%mm4, %%mm3\n\t"
352 "por %%mm2, %%mm0\n\t"
353 "por %%mm5, %%mm3\n\t"
354 "psllq $16, %%mm3\n\t"
355 "por %%mm3, %%mm0\n\t"
356 MOVNTQ" %%mm0, %0\n\t"
357 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
361 __asm __volatile(SFENCE:::"memory");
362 __asm __volatile(EMMS:::"memory");
369 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb32tobgr16 -- packs 32-bit pixels into 16-bit pixels with, per the name,
 * the opposite channel order to rgb32to16 (TODO confirm).
 *
 * Visible MMX path: same gather pattern as the other 32-bit converters
 * (dwords at offsets 4, 8, 12); one copy is shifted LEFT by 8 and masked
 * with mm7, one is shifted right by 5 and masked with mm6, one is shifted
 * right by 19 (its mask instruction is not visible); the parts are OR-ed,
 * mm3 is shifted left by 16 and merged into mm0, then stored with MOVNTQ.
 *
 * Visible C fallback: (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 * NOTE(review): this expression is identical to the one in rgb32to16, so the
 * channel swap presumably happens where b, g and r are loaded (not visible
 * here) -- confirm.
 */
374 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
376 const uint8_t *s = src;
379 const uint8_t *mm_end;
381 uint16_t *d = (uint16_t *)dst;
384 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
388 ::"m"(red_16mask),"m"(green_16mask));
395 "movd 4%1, %%mm3\n\t"
396 "punpckldq 8%1, %%mm0\n\t"
397 "punpckldq 12%1, %%mm3\n\t"
398 "movq %%mm0, %%mm1\n\t"
399 "movq %%mm0, %%mm2\n\t"
400 "movq %%mm3, %%mm4\n\t"
401 "movq %%mm3, %%mm5\n\t"
402 "psllq $8, %%mm0\n\t"
403 "psllq $8, %%mm3\n\t"
404 "pand %%mm7, %%mm0\n\t"
405 "pand %%mm7, %%mm3\n\t"
406 "psrlq $5, %%mm1\n\t"
407 "psrlq $5, %%mm4\n\t"
408 "pand %%mm6, %%mm1\n\t"
409 "pand %%mm6, %%mm4\n\t"
410 "psrlq $19, %%mm2\n\t"
411 "psrlq $19, %%mm5\n\t"
414 "por %%mm1, %%mm0\n\t"
415 "por %%mm4, %%mm3\n\t"
416 "por %%mm2, %%mm0\n\t"
417 "por %%mm5, %%mm3\n\t"
418 "psllq $16, %%mm3\n\t"
419 "por %%mm3, %%mm0\n\t"
420 MOVNTQ" %%mm0, %0\n\t"
421 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
425 __asm __volatile(SFENCE:::"memory");
426 __asm __volatile(EMMS:::"memory");
433 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb32to15 -- packs 32-bit pixels into 15-bit pixels stored in 16-bit words
 * (dst is written through a uint16_t pointer).
 *
 * Visible MMX path: red_15mask and green_15mask are passed to a setup asm
 * (its instructions are not visible); source dwords at offsets 4, 8 and 12
 * are gathered into mm0/mm3; three copies are shifted right by 3, 6 and 9
 * bits, the 6- and 9-bit copies are masked with mm6 and mm7, everything is
 * OR-ed, mm3 is shifted left by 16 and merged into mm0, then stored with
 * MOVNTQ.  blue_15mask is an operand whose use is not visible.
 *
 * Visible C fallback: (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 */
438 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
440 const uint8_t *s = src;
443 const uint8_t *mm_end;
445 uint16_t *d = (uint16_t *)dst;
448 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
452 ::"m"(red_15mask),"m"(green_15mask));
459 "movd 4%1, %%mm3\n\t"
460 "punpckldq 8%1, %%mm0\n\t"
461 "punpckldq 12%1, %%mm3\n\t"
462 "movq %%mm0, %%mm1\n\t"
463 "movq %%mm0, %%mm2\n\t"
464 "movq %%mm3, %%mm4\n\t"
465 "movq %%mm3, %%mm5\n\t"
466 "psrlq $3, %%mm0\n\t"
467 "psrlq $3, %%mm3\n\t"
470 "psrlq $6, %%mm1\n\t"
471 "psrlq $6, %%mm4\n\t"
472 "pand %%mm6, %%mm1\n\t"
473 "pand %%mm6, %%mm4\n\t"
474 "psrlq $9, %%mm2\n\t"
475 "psrlq $9, %%mm5\n\t"
476 "pand %%mm7, %%mm2\n\t"
477 "pand %%mm7, %%mm5\n\t"
478 "por %%mm1, %%mm0\n\t"
479 "por %%mm4, %%mm3\n\t"
480 "por %%mm2, %%mm0\n\t"
481 "por %%mm5, %%mm3\n\t"
482 "psllq $16, %%mm3\n\t"
483 "por %%mm3, %%mm0\n\t"
484 MOVNTQ" %%mm0, %0\n\t"
485 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
489 __asm __volatile(SFENCE:::"memory");
490 __asm __volatile(EMMS:::"memory");
497 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * rgb32tobgr15 -- packs 32-bit pixels into 15-bit pixels with, per the name,
 * the opposite channel order to rgb32to15 (TODO confirm).
 *
 * Visible MMX path: dwords at offsets 4, 8 and 12 are gathered into mm0/mm3;
 * one copy is shifted LEFT by 7 and masked with mm7, one is shifted right by
 * 6 and masked with mm6, one is shifted right by 19 (its mask instruction is
 * not visible); the parts are OR-ed, mm3 is shifted left by 16 and merged
 * into mm0, then stored with MOVNTQ.
 *
 * Visible C fallback: (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7) -- the same
 * expression as in rgb32to15; the loads of b, g and r are not visible here.
 */
502 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
504 const uint8_t *s = src;
507 const uint8_t *mm_end;
509 uint16_t *d = (uint16_t *)dst;
512 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
516 ::"m"(red_15mask),"m"(green_15mask));
523 "movd 4%1, %%mm3\n\t"
524 "punpckldq 8%1, %%mm0\n\t"
525 "punpckldq 12%1, %%mm3\n\t"
526 "movq %%mm0, %%mm1\n\t"
527 "movq %%mm0, %%mm2\n\t"
528 "movq %%mm3, %%mm4\n\t"
529 "movq %%mm3, %%mm5\n\t"
530 "psllq $7, %%mm0\n\t"
531 "psllq $7, %%mm3\n\t"
532 "pand %%mm7, %%mm0\n\t"
533 "pand %%mm7, %%mm3\n\t"
534 "psrlq $6, %%mm1\n\t"
535 "psrlq $6, %%mm4\n\t"
536 "pand %%mm6, %%mm1\n\t"
537 "pand %%mm6, %%mm4\n\t"
538 "psrlq $19, %%mm2\n\t"
539 "psrlq $19, %%mm5\n\t"
542 "por %%mm1, %%mm0\n\t"
543 "por %%mm4, %%mm3\n\t"
544 "por %%mm2, %%mm0\n\t"
545 "por %%mm5, %%mm3\n\t"
546 "psllq $16, %%mm3\n\t"
547 "por %%mm3, %%mm0\n\t"
548 MOVNTQ" %%mm0, %0\n\t"
549 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
553 __asm __volatile(SFENCE:::"memory");
554 __asm __volatile(EMMS:::"memory");
561 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * rgb24to16 -- packs 24-bit pixels into 16-bit pixels (dst is written
 * through a uint16_t pointer).
 *
 * Visible MMX path: identical arithmetic to rgb32to16 (shifts right by 3, 5
 * and 8; masks in mm6/mm7; merge via a 16-bit left shift of mm3), but the
 * source dwords are gathered at 3-byte steps (offsets 3, 6 and 9 visible).
 *
 * Visible C fallback: (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 */
566 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
568 const uint8_t *s = src;
571 const uint8_t *mm_end;
573 uint16_t *d = (uint16_t *)dst;
576 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
580 ::"m"(red_16mask),"m"(green_16mask));
587 "movd 3%1, %%mm3\n\t"
588 "punpckldq 6%1, %%mm0\n\t"
589 "punpckldq 9%1, %%mm3\n\t"
590 "movq %%mm0, %%mm1\n\t"
591 "movq %%mm0, %%mm2\n\t"
592 "movq %%mm3, %%mm4\n\t"
593 "movq %%mm3, %%mm5\n\t"
594 "psrlq $3, %%mm0\n\t"
595 "psrlq $3, %%mm3\n\t"
598 "psrlq $5, %%mm1\n\t"
599 "psrlq $5, %%mm4\n\t"
600 "pand %%mm6, %%mm1\n\t"
601 "pand %%mm6, %%mm4\n\t"
602 "psrlq $8, %%mm2\n\t"
603 "psrlq $8, %%mm5\n\t"
604 "pand %%mm7, %%mm2\n\t"
605 "pand %%mm7, %%mm5\n\t"
606 "por %%mm1, %%mm0\n\t"
607 "por %%mm4, %%mm3\n\t"
608 "por %%mm2, %%mm0\n\t"
609 "por %%mm5, %%mm3\n\t"
610 "psllq $16, %%mm3\n\t"
611 "por %%mm3, %%mm0\n\t"
612 MOVNTQ" %%mm0, %0\n\t"
613 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
617 __asm __volatile(SFENCE:::"memory");
618 __asm __volatile(EMMS:::"memory");
625 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb24tobgr16 -- packs 24-bit pixels into 16-bit pixels with, per the name,
 * the opposite channel order to rgb24to16 (TODO confirm).
 *
 * Visible MMX path: identical arithmetic to rgb32tobgr16 (left shift by 8
 * masked with mm7, right shift by 5 masked with mm6, right shift by 19), but
 * the source dwords are gathered at 3-byte steps (offsets 3, 6 and 9).
 *
 * Visible C fallback: (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); the loads of
 * b, g and r are not visible here.
 */
629 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
631 const uint8_t *s = src;
634 const uint8_t *mm_end;
636 uint16_t *d = (uint16_t *)dst;
639 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
643 ::"m"(red_16mask),"m"(green_16mask));
650 "movd 3%1, %%mm3\n\t"
651 "punpckldq 6%1, %%mm0\n\t"
652 "punpckldq 9%1, %%mm3\n\t"
653 "movq %%mm0, %%mm1\n\t"
654 "movq %%mm0, %%mm2\n\t"
655 "movq %%mm3, %%mm4\n\t"
656 "movq %%mm3, %%mm5\n\t"
657 "psllq $8, %%mm0\n\t"
658 "psllq $8, %%mm3\n\t"
659 "pand %%mm7, %%mm0\n\t"
660 "pand %%mm7, %%mm3\n\t"
661 "psrlq $5, %%mm1\n\t"
662 "psrlq $5, %%mm4\n\t"
663 "pand %%mm6, %%mm1\n\t"
664 "pand %%mm6, %%mm4\n\t"
665 "psrlq $19, %%mm2\n\t"
666 "psrlq $19, %%mm5\n\t"
669 "por %%mm1, %%mm0\n\t"
670 "por %%mm4, %%mm3\n\t"
671 "por %%mm2, %%mm0\n\t"
672 "por %%mm5, %%mm3\n\t"
673 "psllq $16, %%mm3\n\t"
674 "por %%mm3, %%mm0\n\t"
675 MOVNTQ" %%mm0, %0\n\t"
676 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
680 __asm __volatile(SFENCE:::"memory");
681 __asm __volatile(EMMS:::"memory");
688 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb24to15 -- packs 24-bit pixels into 15-bit pixels stored in 16-bit words
 * (dst is written through a uint16_t pointer).
 *
 * Visible MMX path: identical arithmetic to rgb32to15 (shifts right by 3, 6
 * and 9; masks in mm6/mm7; merge via a 16-bit left shift of mm3), but the
 * source dwords are gathered at 3-byte steps (offsets 3, 6 and 9 visible).
 *
 * Visible C fallback: (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 */
692 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
694 const uint8_t *s = src;
697 const uint8_t *mm_end;
699 uint16_t *d = (uint16_t *)dst;
702 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
706 ::"m"(red_15mask),"m"(green_15mask));
713 "movd 3%1, %%mm3\n\t"
714 "punpckldq 6%1, %%mm0\n\t"
715 "punpckldq 9%1, %%mm3\n\t"
716 "movq %%mm0, %%mm1\n\t"
717 "movq %%mm0, %%mm2\n\t"
718 "movq %%mm3, %%mm4\n\t"
719 "movq %%mm3, %%mm5\n\t"
720 "psrlq $3, %%mm0\n\t"
721 "psrlq $3, %%mm3\n\t"
724 "psrlq $6, %%mm1\n\t"
725 "psrlq $6, %%mm4\n\t"
726 "pand %%mm6, %%mm1\n\t"
727 "pand %%mm6, %%mm4\n\t"
728 "psrlq $9, %%mm2\n\t"
729 "psrlq $9, %%mm5\n\t"
730 "pand %%mm7, %%mm2\n\t"
731 "pand %%mm7, %%mm5\n\t"
732 "por %%mm1, %%mm0\n\t"
733 "por %%mm4, %%mm3\n\t"
734 "por %%mm2, %%mm0\n\t"
735 "por %%mm5, %%mm3\n\t"
736 "psllq $16, %%mm3\n\t"
737 "por %%mm3, %%mm0\n\t"
738 MOVNTQ" %%mm0, %0\n\t"
739 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
743 __asm __volatile(SFENCE:::"memory");
744 __asm __volatile(EMMS:::"memory");
751 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * rgb24tobgr15 -- packs 24-bit pixels into 15-bit pixels with, per the name,
 * the opposite channel order to rgb24to15 (TODO confirm).
 *
 * Visible MMX path: identical arithmetic to rgb32tobgr15 (left shift by 7
 * masked with mm7, right shift by 6 masked with mm6, right shift by 19), but
 * the source dwords are gathered at 3-byte steps (offsets 3, 6 and 9).
 *
 * Visible C fallback: (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); the loads of
 * b, g and r are not visible here.
 */
755 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
757 const uint8_t *s = src;
760 const uint8_t *mm_end;
762 uint16_t *d = (uint16_t *)dst;
765 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
769 ::"m"(red_15mask),"m"(green_15mask));
776 "movd 3%1, %%mm3\n\t"
777 "punpckldq 6%1, %%mm0\n\t"
778 "punpckldq 9%1, %%mm3\n\t"
779 "movq %%mm0, %%mm1\n\t"
780 "movq %%mm0, %%mm2\n\t"
781 "movq %%mm3, %%mm4\n\t"
782 "movq %%mm3, %%mm5\n\t"
783 "psllq $7, %%mm0\n\t"
784 "psllq $7, %%mm3\n\t"
785 "pand %%mm7, %%mm0\n\t"
786 "pand %%mm7, %%mm3\n\t"
787 "psrlq $6, %%mm1\n\t"
788 "psrlq $6, %%mm4\n\t"
789 "pand %%mm6, %%mm1\n\t"
790 "pand %%mm6, %%mm4\n\t"
791 "psrlq $19, %%mm2\n\t"
792 "psrlq $19, %%mm5\n\t"
795 "por %%mm1, %%mm0\n\t"
796 "por %%mm4, %%mm3\n\t"
797 "por %%mm2, %%mm0\n\t"
798 "por %%mm5, %%mm3\n\t"
799 "psllq $16, %%mm3\n\t"
800 "por %%mm3, %%mm0\n\t"
801 MOVNTQ" %%mm0, %0\n\t"
802 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
806 __asm __volatile(SFENCE:::"memory");
807 __asm __volatile(EMMS:::"memory");
814 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
819 Here I use a less accurate approximation by simply
820 left-shifting the input
821 value and filling the low order bits with
822 zeroes. This method improves png's
823 compression but this scheme cannot reproduce white exactly, since it does not
824 generate an all-ones maximum value; the net effect is to darken the
827 The better method should be "left bit replication":
837 | Leftmost Bits Repeated to Fill Open Bits
/*
 * rgb15to24 -- expands 15-bit pixels (read as uint16_t) to 24-bit pixels
 * (written byte by byte).  The end pointer is s + src_size/2, i.e. src_size
 * is divided by two to get the number of 16-bit source pixels.
 *
 * Visible MMX path, two asm statements per iteration:
 *  1. The three fields are moved into place with shifts of <<3, >>2 and >>7,
 *     unpacked word-wise against operand %5 (mmx_null is the last operand in
 *     the visible list) and combined with <<8 / <<16 shifts.  This is done
 *     for the quadword at %1 and again for the one at 8%1; the first result
 *     pair is parked in mm6/mm7 in between.  The mask15b/g/r operands are
 *     listed, but the instructions applying them are not visible here.
 *  2. "Borrowed 32 to 24": the four intermediate registers are squeezed
 *     together (shifts by 8, then 48/32/16) and stored with MOVNTQ.
 *
 * Visible C fallback: the three output bytes are (bgr&0x1F)<<3,
 * (bgr&0x3E0)>>2 and (bgr&0x7C00)>>7, i.e. the low bits of each output byte
 * are left at zero.
 *
 * NOTE(review): the cast "(uint16_t *)src" drops the const qualifier of src,
 * unlike the "(const uint16_t *)" cast used in rgb16to24.
 */
841 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
845 const uint16_t *mm_end;
847 uint8_t *d = (uint8_t *)dst;
848 const uint16_t *s = (uint16_t *)src;
849 end = s + src_size/2;
851 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
863 "psllq $3, %%mm0\n\t"
864 "psrlq $2, %%mm1\n\t"
865 "psrlq $7, %%mm2\n\t"
866 "movq %%mm0, %%mm3\n\t"
867 "movq %%mm1, %%mm4\n\t"
868 "movq %%mm2, %%mm5\n\t"
869 "punpcklwd %5, %%mm0\n\t"
870 "punpcklwd %5, %%mm1\n\t"
871 "punpcklwd %5, %%mm2\n\t"
872 "punpckhwd %5, %%mm3\n\t"
873 "punpckhwd %5, %%mm4\n\t"
874 "punpckhwd %5, %%mm5\n\t"
875 "psllq $8, %%mm1\n\t"
876 "psllq $16, %%mm2\n\t"
877 "por %%mm1, %%mm0\n\t"
878 "por %%mm2, %%mm0\n\t"
879 "psllq $8, %%mm4\n\t"
880 "psllq $16, %%mm5\n\t"
881 "por %%mm4, %%mm3\n\t"
882 "por %%mm5, %%mm3\n\t"
884 "movq %%mm0, %%mm6\n\t"
885 "movq %%mm3, %%mm7\n\t"
887 "movq 8%1, %%mm0\n\t"
888 "movq 8%1, %%mm1\n\t"
889 "movq 8%1, %%mm2\n\t"
893 "psllq $3, %%mm0\n\t"
894 "psrlq $2, %%mm1\n\t"
895 "psrlq $7, %%mm2\n\t"
896 "movq %%mm0, %%mm3\n\t"
897 "movq %%mm1, %%mm4\n\t"
898 "movq %%mm2, %%mm5\n\t"
899 "punpcklwd %5, %%mm0\n\t"
900 "punpcklwd %5, %%mm1\n\t"
901 "punpcklwd %5, %%mm2\n\t"
902 "punpckhwd %5, %%mm3\n\t"
903 "punpckhwd %5, %%mm4\n\t"
904 "punpckhwd %5, %%mm5\n\t"
905 "psllq $8, %%mm1\n\t"
906 "psllq $16, %%mm2\n\t"
907 "por %%mm1, %%mm0\n\t"
908 "por %%mm2, %%mm0\n\t"
909 "psllq $8, %%mm4\n\t"
910 "psllq $16, %%mm5\n\t"
911 "por %%mm4, %%mm3\n\t"
912 "por %%mm5, %%mm3\n\t"
915 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
917 /* Borrowed 32 to 24 */
919 "movq %%mm0, %%mm4\n\t"
920 "movq %%mm3, %%mm5\n\t"
921 "movq %%mm6, %%mm0\n\t"
922 "movq %%mm7, %%mm1\n\t"
924 "movq %%mm4, %%mm6\n\t"
925 "movq %%mm5, %%mm7\n\t"
926 "movq %%mm0, %%mm2\n\t"
927 "movq %%mm1, %%mm3\n\t"
929 "psrlq $8, %%mm2\n\t"
930 "psrlq $8, %%mm3\n\t"
931 "psrlq $8, %%mm6\n\t"
932 "psrlq $8, %%mm7\n\t"
941 "por %%mm2, %%mm0\n\t"
942 "por %%mm3, %%mm1\n\t"
943 "por %%mm6, %%mm4\n\t"
944 "por %%mm7, %%mm5\n\t"
946 "movq %%mm1, %%mm2\n\t"
947 "movq %%mm4, %%mm3\n\t"
948 "psllq $48, %%mm2\n\t"
949 "psllq $32, %%mm3\n\t"
952 "por %%mm2, %%mm0\n\t"
953 "psrlq $16, %%mm1\n\t"
954 "psrlq $32, %%mm4\n\t"
955 "psllq $16, %%mm5\n\t"
956 "por %%mm3, %%mm1\n\t"
958 "por %%mm5, %%mm4\n\t"
960 MOVNTQ" %%mm0, %0\n\t"
961 MOVNTQ" %%mm1, 8%0\n\t"
965 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
970 __asm __volatile(SFENCE:::"memory");
971 __asm __volatile(EMMS:::"memory");
975 register uint16_t bgr;
977 *d++ = (bgr&0x1F)<<3;
978 *d++ = (bgr&0x3E0)>>2;
979 *d++ = (bgr&0x7C00)>>7;
/*
 * rgb16to24 -- expands 16-bit pixels (read as uint16_t) to 24-bit pixels
 * (written byte by byte).  The end pointer is s + src_size/2, i.e. src_size
 * is divided by two to get the number of 16-bit source pixels.
 *
 * Visible MMX path, two asm statements per iteration:
 *  1. Each quadword is masked with operands %2/%3/%4 (mask16b, mask16g,
 *     mask16r in the operand list), the fields are moved with shifts of <<3,
 *     >>3 and >>8, unpacked word-wise against operand %5 (mmx_null) and
 *     combined with <<8 / <<16 shifts.  This is done for %1 and again for
 *     8%1; the first result pair is parked in mm6/mm7 in between.
 *  2. "Borrowed 32 to 24": the four intermediate registers are masked with
 *     the mask24* operands, squeezed together (shifts by 8, then 48/32/16)
 *     and written as three quadwords with MOVNTQ.
 *
 * Visible C fallback: the three output bytes are (bgr&0x1F)<<3,
 * (bgr&0x7E0)>>3 and (bgr&0xF800)>>8.
 */
983 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
987 const uint16_t *mm_end;
989 uint8_t *d = (uint8_t *)dst;
990 const uint16_t *s = (const uint16_t *)src;
991 end = s + src_size/2;
993 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1000 "movq %1, %%mm1\n\t"
1001 "movq %1, %%mm2\n\t"
1002 "pand %2, %%mm0\n\t"
1003 "pand %3, %%mm1\n\t"
1004 "pand %4, %%mm2\n\t"
1005 "psllq $3, %%mm0\n\t"
1006 "psrlq $3, %%mm1\n\t"
1007 "psrlq $8, %%mm2\n\t"
1008 "movq %%mm0, %%mm3\n\t"
1009 "movq %%mm1, %%mm4\n\t"
1010 "movq %%mm2, %%mm5\n\t"
1011 "punpcklwd %5, %%mm0\n\t"
1012 "punpcklwd %5, %%mm1\n\t"
1013 "punpcklwd %5, %%mm2\n\t"
1014 "punpckhwd %5, %%mm3\n\t"
1015 "punpckhwd %5, %%mm4\n\t"
1016 "punpckhwd %5, %%mm5\n\t"
1017 "psllq $8, %%mm1\n\t"
1018 "psllq $16, %%mm2\n\t"
1019 "por %%mm1, %%mm0\n\t"
1020 "por %%mm2, %%mm0\n\t"
1021 "psllq $8, %%mm4\n\t"
1022 "psllq $16, %%mm5\n\t"
1023 "por %%mm4, %%mm3\n\t"
1024 "por %%mm5, %%mm3\n\t"
1026 "movq %%mm0, %%mm6\n\t"
1027 "movq %%mm3, %%mm7\n\t"
1029 "movq 8%1, %%mm0\n\t"
1030 "movq 8%1, %%mm1\n\t"
1031 "movq 8%1, %%mm2\n\t"
1032 "pand %2, %%mm0\n\t"
1033 "pand %3, %%mm1\n\t"
1034 "pand %4, %%mm2\n\t"
1035 "psllq $3, %%mm0\n\t"
1036 "psrlq $3, %%mm1\n\t"
1037 "psrlq $8, %%mm2\n\t"
1038 "movq %%mm0, %%mm3\n\t"
1039 "movq %%mm1, %%mm4\n\t"
1040 "movq %%mm2, %%mm5\n\t"
1041 "punpcklwd %5, %%mm0\n\t"
1042 "punpcklwd %5, %%mm1\n\t"
1043 "punpcklwd %5, %%mm2\n\t"
1044 "punpckhwd %5, %%mm3\n\t"
1045 "punpckhwd %5, %%mm4\n\t"
1046 "punpckhwd %5, %%mm5\n\t"
1047 "psllq $8, %%mm1\n\t"
1048 "psllq $16, %%mm2\n\t"
1049 "por %%mm1, %%mm0\n\t"
1050 "por %%mm2, %%mm0\n\t"
1051 "psllq $8, %%mm4\n\t"
1052 "psllq $16, %%mm5\n\t"
1053 "por %%mm4, %%mm3\n\t"
1054 "por %%mm5, %%mm3\n\t"
1056 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1058 /* Borrowed 32 to 24 */
1060 "movq %%mm0, %%mm4\n\t"
1061 "movq %%mm3, %%mm5\n\t"
1062 "movq %%mm6, %%mm0\n\t"
1063 "movq %%mm7, %%mm1\n\t"
1065 "movq %%mm4, %%mm6\n\t"
1066 "movq %%mm5, %%mm7\n\t"
1067 "movq %%mm0, %%mm2\n\t"
1068 "movq %%mm1, %%mm3\n\t"
1070 "psrlq $8, %%mm2\n\t"
1071 "psrlq $8, %%mm3\n\t"
1072 "psrlq $8, %%mm6\n\t"
1073 "psrlq $8, %%mm7\n\t"
1074 "pand %2, %%mm0\n\t"
1075 "pand %2, %%mm1\n\t"
1076 "pand %2, %%mm4\n\t"
1077 "pand %2, %%mm5\n\t"
1078 "pand %3, %%mm2\n\t"
1079 "pand %3, %%mm3\n\t"
1080 "pand %3, %%mm6\n\t"
1081 "pand %3, %%mm7\n\t"
1082 "por %%mm2, %%mm0\n\t"
1083 "por %%mm3, %%mm1\n\t"
1084 "por %%mm6, %%mm4\n\t"
1085 "por %%mm7, %%mm5\n\t"
1087 "movq %%mm1, %%mm2\n\t"
1088 "movq %%mm4, %%mm3\n\t"
1089 "psllq $48, %%mm2\n\t"
1090 "psllq $32, %%mm3\n\t"
1091 "pand %4, %%mm2\n\t"
1092 "pand %5, %%mm3\n\t"
1093 "por %%mm2, %%mm0\n\t"
1094 "psrlq $16, %%mm1\n\t"
1095 "psrlq $32, %%mm4\n\t"
1096 "psllq $16, %%mm5\n\t"
1097 "por %%mm3, %%mm1\n\t"
1098 "pand %6, %%mm5\n\t"
1099 "por %%mm5, %%mm4\n\t"
1101 MOVNTQ" %%mm0, %0\n\t"
1102 MOVNTQ" %%mm1, 8%0\n\t"
1103 MOVNTQ" %%mm4, 16%0"
1106 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1111 __asm __volatile(SFENCE:::"memory");
1112 __asm __volatile(EMMS:::"memory");
1116 register uint16_t bgr;
1118 *d++ = (bgr&0x1F)<<3;
1119 *d++ = (bgr&0x7E0)>>3;
1120 *d++ = (bgr&0xF800)>>8;
/*
 * rgb15to32 -- expands 15-bit pixels (read as uint16_t) to 32-bit pixels.
 * The end pointer is s + src_size/2, i.e. src_size is divided by two to get
 * the number of 16-bit source pixels.
 *
 * Visible MMX path: mm7 is cleared with pxor and used as the zero operand
 * for the word unpacks.  Each source quadword is masked with mask15b/g/r,
 * the fields are moved with shifts of <<3, >>2 and >>7, unpacked to dwords
 * and combined with <<8 / <<16 shifts; the two result quadwords are stored
 * with MOVNTQ at offsets 0 and 8.
 *
 * Visible C fallback: writes (bgr&0x1F)<<3, (bgr&0x3E0)>>2 and
 * (bgr&0x7C00)>>7; whether a fourth byte is written per pixel is not visible
 * in this excerpt.
 */
1124 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1126 const uint16_t *end;
1128 const uint16_t *mm_end;
1130 uint8_t *d = (uint8_t *)dst;
1131 const uint16_t *s = (const uint16_t *)src;
1132 end = s + src_size/2;
1134 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1135 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1141 "movq %1, %%mm0\n\t"
1142 "movq %1, %%mm1\n\t"
1143 "movq %1, %%mm2\n\t"
1144 "pand %2, %%mm0\n\t"
1145 "pand %3, %%mm1\n\t"
1146 "pand %4, %%mm2\n\t"
1147 "psllq $3, %%mm0\n\t"
1148 "psrlq $2, %%mm1\n\t"
1149 "psrlq $7, %%mm2\n\t"
1150 "movq %%mm0, %%mm3\n\t"
1151 "movq %%mm1, %%mm4\n\t"
1152 "movq %%mm2, %%mm5\n\t"
1153 "punpcklwd %%mm7, %%mm0\n\t"
1154 "punpcklwd %%mm7, %%mm1\n\t"
1155 "punpcklwd %%mm7, %%mm2\n\t"
1156 "punpckhwd %%mm7, %%mm3\n\t"
1157 "punpckhwd %%mm7, %%mm4\n\t"
1158 "punpckhwd %%mm7, %%mm5\n\t"
1159 "psllq $8, %%mm1\n\t"
1160 "psllq $16, %%mm2\n\t"
1161 "por %%mm1, %%mm0\n\t"
1162 "por %%mm2, %%mm0\n\t"
1163 "psllq $8, %%mm4\n\t"
1164 "psllq $16, %%mm5\n\t"
1165 "por %%mm4, %%mm3\n\t"
1166 "por %%mm5, %%mm3\n\t"
1167 MOVNTQ" %%mm0, %0\n\t"
1168 MOVNTQ" %%mm3, 8%0\n\t"
1170 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1175 __asm __volatile(SFENCE:::"memory");
1176 __asm __volatile(EMMS:::"memory");
1180 register uint16_t bgr;
1182 *d++ = (bgr&0x1F)<<3;
1183 *d++ = (bgr&0x3E0)>>2;
1184 *d++ = (bgr&0x7C00)>>7;
/*
 * rgb16to32 -- expands 16-bit pixels (read as uint16_t) to 32-bit pixels.
 * The end pointer is s + src_size/2, i.e. src_size is divided by two to get
 * the number of 16-bit source pixels.
 *
 * Visible MMX path: mm7 is cleared with pxor and used as the zero operand
 * for the word unpacks.  Each source quadword is masked with mask16b/g/r,
 * the fields are moved with shifts of <<3, >>3 and >>8, unpacked to dwords
 * and combined with <<8 / <<16 shifts; the two result quadwords are stored
 * with MOVNTQ at offsets 0 and 8.
 *
 * Visible C fallback: writes (bgr&0x1F)<<3, (bgr&0x7E0)>>3 and
 * (bgr&0xF800)>>8; whether a fourth byte is written per pixel is not visible
 * in this excerpt.
 *
 * NOTE(review): the cast "(uint16_t *)src" drops the const qualifier of src,
 * unlike the "(const uint16_t *)" cast used in rgb15to32.
 */
1189 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1191 const uint16_t *end;
1193 const uint16_t *mm_end;
1195 uint8_t *d = (uint8_t *)dst;
1196 const uint16_t *s = (uint16_t *)src;
1197 end = s + src_size/2;
1199 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1200 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1206 "movq %1, %%mm0\n\t"
1207 "movq %1, %%mm1\n\t"
1208 "movq %1, %%mm2\n\t"
1209 "pand %2, %%mm0\n\t"
1210 "pand %3, %%mm1\n\t"
1211 "pand %4, %%mm2\n\t"
1212 "psllq $3, %%mm0\n\t"
1213 "psrlq $3, %%mm1\n\t"
1214 "psrlq $8, %%mm2\n\t"
1215 "movq %%mm0, %%mm3\n\t"
1216 "movq %%mm1, %%mm4\n\t"
1217 "movq %%mm2, %%mm5\n\t"
1218 "punpcklwd %%mm7, %%mm0\n\t"
1219 "punpcklwd %%mm7, %%mm1\n\t"
1220 "punpcklwd %%mm7, %%mm2\n\t"
1221 "punpckhwd %%mm7, %%mm3\n\t"
1222 "punpckhwd %%mm7, %%mm4\n\t"
1223 "punpckhwd %%mm7, %%mm5\n\t"
1224 "psllq $8, %%mm1\n\t"
1225 "psllq $16, %%mm2\n\t"
1226 "por %%mm1, %%mm0\n\t"
1227 "por %%mm2, %%mm0\n\t"
1228 "psllq $8, %%mm4\n\t"
1229 "psllq $16, %%mm5\n\t"
1230 "por %%mm4, %%mm3\n\t"
1231 "por %%mm5, %%mm3\n\t"
1232 MOVNTQ" %%mm0, %0\n\t"
1233 MOVNTQ" %%mm3, 8%0\n\t"
1235 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1240 __asm __volatile(SFENCE:::"memory");
1241 __asm __volatile(EMMS:::"memory");
1245 register uint16_t bgr;
1247 *d++ = (bgr&0x1F)<<3;
1248 *d++ = (bgr&0x7E0)>>3;
1249 *d++ = (bgr&0xF800)>>8;
/*
 * rgb32tobgr32 -- swaps bytes 0 and 2 of every 32-bit pixel and keeps
 * byte 1, as the C fallback shows (dst[4*i+0]=src[4*i+2], dst[4*i+1]=
 * src[4*i+1], dst[4*i+2]=src[4*i+0]).  A write to byte 3 is not visible in
 * this excerpt.
 *
 * Visible MMX path: eax is a byte offset starting at 0 and advancing by 8
 * (two pixels) per iteration; each quadword is combined from a copy shifted
 * left by 16 and masked with mask32r, a copy shifted right by 16 and masked
 * with mask32b, and an unshifted copy masked with mask32g, then stored with
 * MOVNTQ.  eax is compared against src_size-7; the branch instruction itself
 * is not visible here.
 *
 * NOTE(review): src_size is unsigned, so "src_size-7" wraps to a huge value
 * when src_size < 7 -- confirm that callers never pass such sizes or that
 * the unseen code guards against it.
 */
1254 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1257 /* TODO: unroll this loop */
1259 "xorl %%eax, %%eax \n\t"
1262 PREFETCH" 32(%0, %%eax) \n\t"
1263 "movq (%0, %%eax), %%mm0 \n\t"
1264 "movq %%mm0, %%mm1 \n\t"
1265 "movq %%mm0, %%mm2 \n\t"
1266 "pslld $16, %%mm0 \n\t"
1267 "psrld $16, %%mm1 \n\t"
1268 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1269 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1270 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1271 "por %%mm0, %%mm2 \n\t"
1272 "por %%mm1, %%mm2 \n\t"
1273 MOVNTQ" %%mm2, (%1, %%eax) \n\t"
1274 "addl $8, %%eax \n\t"
1275 "cmpl %2, %%eax \n\t"
1277 :: "r" (src), "r"(dst), "r" (src_size-7)
1281 __asm __volatile(SFENCE:::"memory");
1282 __asm __volatile(EMMS:::"memory");
1285 unsigned num_pixels = src_size >> 2;
1286 for(i=0; i<num_pixels; i++)
1288 dst[4*i + 0] = src[4*i + 2];
1289 dst[4*i + 1] = src[4*i + 1];
1290 dst[4*i + 2] = src[4*i + 0];
/*
 * rgb24tobgr24 -- swaps bytes 0 and 2 of every 24-bit pixel and keeps
 * byte 1 (the visible tail loop does dst[i+1]=src[i+1], dst[i+2]=src[i+0];
 * the dst[i+0] store is not visible in this excerpt).
 *
 * Visible MMX path: mmx_size starts as 23 - src_size and the base pointers
 * handed to the asm are src-mmx_size and dst-mmx_size.  mask24r/g/b are
 * loaded into mm5/mm6/mm7.  Each iteration handles 24 bytes (eax += 24):
 * for each of the three output quadwords, three differently-offset loads of
 * the source are masked with the three masks and OR-ed together, then stored
 * with MOVNTQ.  The output operand list and the loop branch are not visible.
 *
 * After the asm, the function returns early when mmx_size==23; otherwise
 * src_size is recomputed as 23-mmx_size and a byte-wise loop (step 3)
 * processes what is left.
 */
1295 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1299 int mmx_size= 23 - src_size;
1301 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1302 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1303 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1306 PREFETCH" 32(%1, %%eax) \n\t"
1307 "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1308 "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
1309 "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
1310 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1311 "pand %%mm5, %%mm0 \n\t"
1312 "pand %%mm6, %%mm1 \n\t"
1313 "pand %%mm7, %%mm2 \n\t"
1314 "por %%mm0, %%mm1 \n\t"
1315 "por %%mm2, %%mm1 \n\t"
1316 "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1317 MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
1318 "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
1319 "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
1320 "pand %%mm7, %%mm0 \n\t"
1321 "pand %%mm5, %%mm1 \n\t"
1322 "pand %%mm6, %%mm2 \n\t"
1323 "por %%mm0, %%mm1 \n\t"
1324 "por %%mm2, %%mm1 \n\t"
1325 "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
1326 MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
1327 "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
1328 "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
1329 "pand %%mm6, %%mm0 \n\t"
1330 "pand %%mm7, %%mm1 \n\t"
1331 "pand %%mm5, %%mm2 \n\t"
1332 "por %%mm0, %%mm1 \n\t"
1333 "por %%mm2, %%mm1 \n\t"
1334 MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
1335 "addl $24, %%eax \n\t"
1338 : "r" (src-mmx_size), "r"(dst-mmx_size)
1341 __asm __volatile(SFENCE:::"memory");
1342 __asm __volatile(EMMS:::"memory");
1344 if(mmx_size==23) return; //finished, was multiple of 8 (NOTE(review): the MMX loop steps by 24 bytes -- confirm the intended multiple)
1348 src_size= 23-mmx_size;
1352 for(i=0; i<src_size; i+=3)
1356 dst[i + 1] = src[i + 1];
1357 dst[i + 2] = src[i + 0];
/*
 * yuvPlanartoyuy2 -- interleaves separate Y, U and V planes into packed
 * Y U Y V output, one row per iteration of the y loop (0 .. height-1).
 * chromWidth is width>>1, i.e. one U/V pair is shared by two luma samples.
 *
 * vertLumPerChroma: the chroma pointers advance by chromStride only on rows
 * where (y & (vertLumPerChroma-1)) == vertLumPerChroma-1, so a chroma row is
 * reused for vertLumPerChroma luma rows (the bit test presumably expects a
 * power of two -- TODO confirm).
 *
 * Visible paths:
 *  - MMX: 8 U and 8 V bytes are interleaved into UV pairs, then interleaved
 *    with 16 Y bytes and written as four quadwords; eax counts chroma
 *    samples in steps of 8 and is compared against chromWidth.
 *  - C, __WORDSIZE >= 64: two pixel pairs are assembled into one 64-bit
 *    store per step (i += 2).
 *  - C, otherwise: one pixel pair is assembled into one 32-bit store.
 *
 * NOTE(review): yc/uc/vc point to uint8_t, so "vc[0] << 24" is evaluated in
 * int and exceeds INT_MAX for values >= 128.  In the 64-bit path, if k and l
 * are 64-bit (their declaration is not visible) the resulting sign extension
 * would spill into the upper half combined by "k + (l << 32)" -- confirm.
 */
1362 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1363 unsigned int width, unsigned int height,
1364 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
1367 const unsigned chromWidth= width>>1;
1368 for(y=0; y<height; y++)
1371 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1373 "xorl %%eax, %%eax \n\t"
1376 PREFETCH" 32(%1, %%eax, 2) \n\t"
1377 PREFETCH" 32(%2, %%eax) \n\t"
1378 PREFETCH" 32(%3, %%eax) \n\t"
1379 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
1380 "movq %%mm0, %%mm2 \n\t" // U(0)
1381 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
1382 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1383 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1385 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
1386 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
1387 "movq %%mm3, %%mm4 \n\t" // Y(0)
1388 "movq %%mm5, %%mm6 \n\t" // Y(8)
1389 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1390 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1391 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1392 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1394 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
1395 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
1396 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
1397 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1399 "addl $8, %%eax \n\t"
1400 "cmpl %4, %%eax \n\t"
1402 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
1406 #if __WORDSIZE >= 64
1408 uint64_t *ldst = (uint64_t *) dst;
1409 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1410 for(i = 0; i < chromWidth; i += 2){
1412 k = yc[0] + (uc[0] << 8) +
1413 (yc[1] << 16) + (vc[0] << 24);
1414 l = yc[2] + (uc[1] << 8) +
1415 (yc[3] << 16) + (vc[1] << 24);
1416 *ldst++ = k + (l << 32);
1423 int i, *idst = (int32_t *) dst;
1424 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1425 for(i = 0; i < chromWidth; i++){
1426 *idst++ = yc[0] + (uc[0] << 8) +
1427 (yc[1] << 16) + (vc[0] << 24);
1434 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1436 usrc += chromStride;
1437 vsrc += chromStride;
1451 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1452 * problem for anyone then tell me, and I'll fix it)
/*
 * yv12toyuy2 -- thin wrapper: forwards all arguments to yuvPlanartoyuy2 with
 * vertLumPerChroma = 2, i.e. each chroma row is used for two luma rows.
 * The chroma is repeated, not interpolated (see the FIXME below).
 */
1454 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1455 unsigned int width, unsigned int height,
1456 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
1458 //FIXME interpolate chroma
1459 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1464 * width should be a multiple of 16
/*
 * yuv422ptoyuy2 -- thin wrapper: forwards all arguments to yuvPlanartoyuy2
 * with vertLumPerChroma = 1, i.e. the chroma pointers advance on every row.
 */
1466 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1467 unsigned int width, unsigned int height,
1468 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
1470 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1475 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1476 * problem for anyone then tell me, and I'll fix it)
/*
 * yuy2toyv12 -- splits packed Y U Y V input into separate Y, U and V planes.
 * Rows are handled in pairs (y += 2) and chromWidth is width>>1.
 *
 * Visible structure per row pair:
 *  - First row: luma and chroma are both extracted.  In the MMX path mm7 is
 *    built as a 0x00FF-per-word mask; the even bytes (Y) are packed and
 *    stored to ydst, the odd bytes (UV) are split once more into V (stored
 *    through operand %3, vdst) and U (stored through operand %2, udst).
 *    The C path writes ydst[2*i], udst[i], ydst[2*i+1], vdst[i] from
 *    src[4*i+0..3].
 *  - Second row: only luma is extracted (MMX: mask with mm7 and pack;
 *    C: ydst[2*i+0] and ydst[2*i+1]); the chroma bytes of that row are not
 *    read by the visible code.
 *  - udst and vdst then advance by chromStride.  The src/ydst pointer
 *    updates are not visible in this excerpt.
 * EMMS is issued in a final asm statement.
 */
1478 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1479 unsigned int width, unsigned int height,
1480 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1483 const unsigned chromWidth= width>>1;
1484 for(y=0; y<height; y+=2)
1488 "xorl %%eax, %%eax \n\t"
1489 "pcmpeqw %%mm7, %%mm7 \n\t"
1490 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1493 PREFETCH" 64(%0, %%eax, 4) \n\t"
1494 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1495 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1496 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1497 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1498 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1499 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1500 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1501 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1502 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1503 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1505 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1507 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
1508 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
1509 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1510 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1511 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1512 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1513 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1514 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1515 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1516 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1518 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
1520 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1521 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1522 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1523 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1524 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1525 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1526 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1527 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1529 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1530 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1532 "addl $8, %%eax \n\t"
1533 "cmpl %4, %%eax \n\t"
1535 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1543 "xorl %%eax, %%eax \n\t"
1546 PREFETCH" 64(%0, %%eax, 4) \n\t"
1547 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1548 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1549 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
1550 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
1551 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1552 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1553 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1554 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1555 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1556 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1558 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1559 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1561 "addl $8, %%eax \n\t"
1562 "cmpl %4, %%eax \n\t"
1565 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1570 for(i=0; i<chromWidth; i++)
1572 ydst[2*i+0] = src[4*i+0];
1573 udst[i] = src[4*i+1];
1574 ydst[2*i+1] = src[4*i+2];
1575 vdst[i] = src[4*i+3];
1580 for(i=0; i<chromWidth; i++)
1582 ydst[2*i+0] = src[4*i+0];
1583 ydst[2*i+1] = src[4*i+2];
1586 udst += chromStride;
1587 vdst += chromStride;
1592 asm volatile( EMMS" \n\t"
/*
 * yvu9toyv12 -- in the visible code only the luma plane is handled: it is
 * copied with a single memcpy of width*height bytes.  The U and V planes are
 * not converted yet (see the XXX note below).
 *
 * NOTE(review): the memcpy treats the luma plane as one contiguous block and
 * does not use lumStride -- presumably only correct when the stride equals
 * the width; confirm against callers.
 */
1598 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1599 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1600 unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
1603 memcpy(ydst, ysrc, width*height);
1605 /* XXX: implement upscaling for U,V */
/**
 * Upscales one 8-bit plane by a factor of two horizontally and vertically.
 * For every pair of adjacent source rows, two output rows (dst and dst+dstStride)
 * of 2*srcWidth samples are written. New samples are interpolated with 3:1 weights
 * between diagonal neighbours of the two source rows (see the C loop below).
 *
 * src/srcStride describe the input plane, dst/dstStride the output plane.
 * When HAVE_MMX2 or HAVE_3DNOW is defined an assembly loop is compiled in that
 * approximates the same 3:1 weighting by applying PAVGB twice.
 */
1608 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1613 for(x=0; x<srcWidth; x++){
1619 for(y=1; y<srcHeight; y++){
1620 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1621 const int mmxSize= srcWidth;
1623 "movl %4, %%eax \n\t"
1625 "movq (%0, %%eax), %%mm0 \n\t"
1626 "movq (%1, %%eax), %%mm1 \n\t"
1627 "movq 1(%0, %%eax), %%mm2 \n\t"
1628 "movq 1(%1, %%eax), %%mm3 \n\t"
1629 "movq %%mm0, %%mm4 \n\t"
1630 "movq %%mm1, %%mm5 \n\t"
/* Each PAVGB pair computes avg(avg(x, y), y), i.e. roughly (x + 3*y)/4. */
1631 PAVGB" %%mm3, %%mm0 \n\t"
1632 PAVGB" %%mm3, %%mm0 \n\t"
1633 PAVGB" %%mm4, %%mm3 \n\t"
1634 PAVGB" %%mm4, %%mm3 \n\t"
1635 PAVGB" %%mm2, %%mm1 \n\t"
1636 PAVGB" %%mm2, %%mm1 \n\t"
1637 PAVGB" %%mm5, %%mm2 \n\t"
1638 PAVGB" %%mm5, %%mm2 \n\t"
1639 "movq %%mm3, %%mm4 \n\t"
1640 "movq %%mm2, %%mm5 \n\t"
/* Byte-interleave the interpolated vectors: mm3/mm4 go to the upper output row, mm2/mm5 to the lower one. */
1641 "punpcklbw %%mm1, %%mm3 \n\t"
1642 "punpckhbw %%mm1, %%mm4 \n\t"
1643 "punpcklbw %%mm0, %%mm2 \n\t"
1644 "punpckhbw %%mm0, %%mm5 \n\t"
1646 MOVNTQ" %%mm3, (%2, %%eax, 2) \n\t"
1647 MOVNTQ" %%mm4, 8(%2, %%eax, 2) \n\t"
1648 MOVNTQ" %%mm2, (%3, %%eax, 2) \n\t"
1649 MOVNTQ" %%mm5, 8(%3, %%eax, 2) \n\t"
/* NOTE(review): these movq stores write the same registers to the same addresses as the
   MOVNTQ stores above; presumably the two groups are preprocessor alternatives -- confirm. */
1651 "movq %%mm3, (%2, %%eax, 2) \n\t"
1652 "movq %%mm4, 8(%2, %%eax, 2) \n\t"
1653 "movq %%mm2, (%3, %%eax, 2) \n\t"
1654 "movq %%mm5, 8(%3, %%eax, 2) \n\t"
1656 "addl $8, %%eax \n\t"
1658 :: "r" (src + mmxSize-1), "r" (src + srcStride + mmxSize-1),
1659 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
/* The first sample of the lower output row is copied unfiltered from src[0]. */
1665 dst[dstStride]= src[0];
1668 dst[dstStride]= src[0];
/* C interpolation: each output sample is (3*near + far)>>2 of two diagonal neighbours. */
1670 for(x=0; x<srcWidth-1; x++){
1671 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1672 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1673 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1674 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
/* The last sample of the lower output row is copied unfiltered from the last source sample. */
1678 dst[srcWidth*2 -1 + dstStride]= src[srcWidth-1];
1686 for(x=0; x<srcWidth; x++){
1691 asm volatile( EMMS" \n\t"
1699 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1700 * problem for anyone then tell me, and I'll fix it)
1701 * chrominance data is only taken from every second line; others are ignored. FIXME: write HQ version
/**
 * Splits packed UYVY data (byte order U, Y, V, Y, as read by the C loops below)
 * into separate Y, U and V planes.
 *
 * Lines are handled in pairs (y advances by 2): the first line of a pair yields
 * luma and chroma, the second line yields luma only, and udst/vdst advance by
 * chromStride once per pair. chromWidth is width/2.
 *
 * The assembly loops consume 32 source bytes per iteration, i.e. 16 luma samples
 * and 8 U/V samples (eax counts chroma samples and is compared against chromWidth).
 * mm7 holds the word mask 0x00FF: "pand mm7" keeps the low byte of every 16-bit
 * word (U/V in UYVY), "psrlw $8" keeps the high byte (Y).
 */
1703 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1704 unsigned int width, unsigned int height,
1705 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1708 const unsigned chromWidth= width>>1;
1709 for(y=0; y<height; y+=2)
/* First line of the pair: extract Y, U and V. */
1713 "xorl %%eax, %%eax \n\t"
1714 "pcmpeqw %%mm7, %%mm7 \n\t"
1715 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1718 PREFETCH" 64(%0, %%eax, 4) \n\t"
1719 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1720 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1721 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1722 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1723 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1724 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1725 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1726 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1727 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1728 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1730 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1732 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
1733 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
1734 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1735 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1736 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1737 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1738 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1739 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1740 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1741 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1743 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
1745 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1746 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1747 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1748 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1749 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1750 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1751 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1752 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1754 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1755 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1757 "addl $8, %%eax \n\t"
1758 "cmpl %4, %%eax \n\t"
1760 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
/* Second line of the pair: only Y is extracted, chroma bytes are dropped by the shift. */
1768 "xorl %%eax, %%eax \n\t"
1771 PREFETCH" 64(%0, %%eax, 4) \n\t"
1772 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1773 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1774 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
1775 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
1776 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1777 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1778 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1779 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1780 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1781 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1783 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1784 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1786 "addl $8, %%eax \n\t"
1787 "cmpl %4, %%eax \n\t"
1790 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
/* C version, first line of the pair: bytes 0..3 of every group are U, Y, V, Y. */
1795 for(i=0; i<chromWidth; i++)
1797 udst[i] = src[4*i+0];
1798 ydst[2*i+0] = src[4*i+1];
1799 vdst[i] = src[4*i+2];
1800 ydst[2*i+1] = src[4*i+3];
/* C version, second line of the pair: luma only. */
1805 for(i=0; i<chromWidth; i++)
1807 ydst[2*i+0] = src[4*i+1];
1808 ydst[2*i+1] = src[4*i+3];
1811 udst += chromStride;
1812 vdst += chromStride;
1817 asm volatile( EMMS" \n\t"
1825 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
1826 * problem for anyone then tell me, and I'll fix it)
1827 * chrominance data is only taken from every second line; others are ignored in the C version. FIXME: write HQ version
/**
 * Converts packed 24-bit pixels to planar Y, U and V.
 * The C loops read each pixel as b, g, r from consecutive bytes, i.e. the source
 * byte order is B, G, R. Two source lines are consumed per outer iteration and
 * udst/vdst advance by chromStride once per pair; chromWidth is width/2.
 *
 * Assembly path (rows y < height-2):
 *  - luma: pixels are widened to 16 bit, multiplied by bgr2YCoeff with pmaddwd,
 *    summed with the w1111 constant, shifted, packed and offset by bgr2YOffset;
 *    8 luma samples are stored per iteration.
 *  - chroma: pixels of the two lines (src and src+srcStride) are first averaged
 *    (PAVGB with MMX2/3DNow, otherwise paddw of four pixels and >>2), then
 *    multiplied by bgr2UCoeff / bgr2VCoeff and offset by bgr2UVOffset;
 *    4 U and 4 V samples are stored per iteration.
 * C path (remaining rows): Y is computed for every pixel, U and V only while
 * processing the first line of each pair.
 *
 * NOTE(review): height is unsigned, so "height-2" wraps around when height < 2.
 * NOTE(review): the chroma asm operands are built from width (src+width*6,
 * udst+width, vdst+width, -width) while a chroma row is only chromWidth samples;
 * with eax stepping by 4 from -width this looks like twice the intended range.
 * The loop terminator is not visible here -- confirm before relying on it.
 */
1829 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1830 unsigned int width, unsigned int height,
1831 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1834 const unsigned chromWidth= width>>1;
1836 for(y=0; y<height-2; y+=2)
1842 "movl %2, %%eax \n\t"
1843 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1844 "movq "MANGLE(w1111)", %%mm5 \n\t"
1845 "pxor %%mm7, %%mm7 \n\t"
1846 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1849 PREFETCH" 64(%0, %%ebx) \n\t"
1850 "movd (%0, %%ebx), %%mm0 \n\t"
1851 "movd 3(%0, %%ebx), %%mm1 \n\t"
1852 "punpcklbw %%mm7, %%mm0 \n\t"
1853 "punpcklbw %%mm7, %%mm1 \n\t"
1854 "movd 6(%0, %%ebx), %%mm2 \n\t"
1855 "movd 9(%0, %%ebx), %%mm3 \n\t"
1856 "punpcklbw %%mm7, %%mm2 \n\t"
1857 "punpcklbw %%mm7, %%mm3 \n\t"
1858 "pmaddwd %%mm6, %%mm0 \n\t"
1859 "pmaddwd %%mm6, %%mm1 \n\t"
1860 "pmaddwd %%mm6, %%mm2 \n\t"
1861 "pmaddwd %%mm6, %%mm3 \n\t"
1862 #ifndef FAST_BGR2YV12
1863 "psrad $8, %%mm0 \n\t"
1864 "psrad $8, %%mm1 \n\t"
1865 "psrad $8, %%mm2 \n\t"
1866 "psrad $8, %%mm3 \n\t"
1868 "packssdw %%mm1, %%mm0 \n\t"
1869 "packssdw %%mm3, %%mm2 \n\t"
1870 "pmaddwd %%mm5, %%mm0 \n\t"
1871 "pmaddwd %%mm5, %%mm2 \n\t"
1872 "packssdw %%mm2, %%mm0 \n\t"
1873 "psraw $7, %%mm0 \n\t"
1875 "movd 12(%0, %%ebx), %%mm4 \n\t"
1876 "movd 15(%0, %%ebx), %%mm1 \n\t"
1877 "punpcklbw %%mm7, %%mm4 \n\t"
1878 "punpcklbw %%mm7, %%mm1 \n\t"
1879 "movd 18(%0, %%ebx), %%mm2 \n\t"
1880 "movd 21(%0, %%ebx), %%mm3 \n\t"
1881 "punpcklbw %%mm7, %%mm2 \n\t"
1882 "punpcklbw %%mm7, %%mm3 \n\t"
1883 "pmaddwd %%mm6, %%mm4 \n\t"
1884 "pmaddwd %%mm6, %%mm1 \n\t"
1885 "pmaddwd %%mm6, %%mm2 \n\t"
1886 "pmaddwd %%mm6, %%mm3 \n\t"
1887 #ifndef FAST_BGR2YV12
1888 "psrad $8, %%mm4 \n\t"
1889 "psrad $8, %%mm1 \n\t"
1890 "psrad $8, %%mm2 \n\t"
1891 "psrad $8, %%mm3 \n\t"
1893 "packssdw %%mm1, %%mm4 \n\t"
1894 "packssdw %%mm3, %%mm2 \n\t"
1895 "pmaddwd %%mm5, %%mm4 \n\t"
1896 "pmaddwd %%mm5, %%mm2 \n\t"
1897 "addl $24, %%ebx \n\t"
1898 "packssdw %%mm2, %%mm4 \n\t"
1899 "psraw $7, %%mm4 \n\t"
1901 "packuswb %%mm4, %%mm0 \n\t"
1902 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1904 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
1905 "addl $8, %%eax \n\t"
1907 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
1915 "movl %4, %%eax \n\t"
1916 "movq "MANGLE(w1111)", %%mm5 \n\t"
1917 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1918 "pxor %%mm7, %%mm7 \n\t"
1919 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1920 "addl %%ebx, %%ebx \n\t"
1923 PREFETCH" 64(%0, %%ebx) \n\t"
1924 PREFETCH" 64(%1, %%ebx) \n\t"
1925 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1926 "movq (%0, %%ebx), %%mm0 \n\t"
1927 "movq (%1, %%ebx), %%mm1 \n\t"
1928 "movq 6(%0, %%ebx), %%mm2 \n\t"
1929 "movq 6(%1, %%ebx), %%mm3 \n\t"
1930 PAVGB" %%mm1, %%mm0 \n\t"
1931 PAVGB" %%mm3, %%mm2 \n\t"
1932 "movq %%mm0, %%mm1 \n\t"
1933 "movq %%mm2, %%mm3 \n\t"
1934 "psrlq $24, %%mm0 \n\t"
1935 "psrlq $24, %%mm2 \n\t"
1936 PAVGB" %%mm1, %%mm0 \n\t"
1937 PAVGB" %%mm3, %%mm2 \n\t"
1938 "punpcklbw %%mm7, %%mm0 \n\t"
1939 "punpcklbw %%mm7, %%mm2 \n\t"
1941 "movd (%0, %%ebx), %%mm0 \n\t"
1942 "movd (%1, %%ebx), %%mm1 \n\t"
1943 "movd 3(%0, %%ebx), %%mm2 \n\t"
1944 "movd 3(%1, %%ebx), %%mm3 \n\t"
1945 "punpcklbw %%mm7, %%mm0 \n\t"
1946 "punpcklbw %%mm7, %%mm1 \n\t"
1947 "punpcklbw %%mm7, %%mm2 \n\t"
1948 "punpcklbw %%mm7, %%mm3 \n\t"
1949 "paddw %%mm1, %%mm0 \n\t"
1950 "paddw %%mm3, %%mm2 \n\t"
1951 "paddw %%mm2, %%mm0 \n\t"
1952 "movd 6(%0, %%ebx), %%mm4 \n\t"
1953 "movd 6(%1, %%ebx), %%mm1 \n\t"
1954 "movd 9(%0, %%ebx), %%mm2 \n\t"
1955 "movd 9(%1, %%ebx), %%mm3 \n\t"
1956 "punpcklbw %%mm7, %%mm4 \n\t"
1957 "punpcklbw %%mm7, %%mm1 \n\t"
1958 "punpcklbw %%mm7, %%mm2 \n\t"
1959 "punpcklbw %%mm7, %%mm3 \n\t"
1960 "paddw %%mm1, %%mm4 \n\t"
1961 "paddw %%mm3, %%mm2 \n\t"
1962 "paddw %%mm4, %%mm2 \n\t"
1963 "psrlw $2, %%mm0 \n\t"
1964 "psrlw $2, %%mm2 \n\t"
1966 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1967 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1969 "pmaddwd %%mm0, %%mm1 \n\t"
1970 "pmaddwd %%mm2, %%mm3 \n\t"
1971 "pmaddwd %%mm6, %%mm0 \n\t"
1972 "pmaddwd %%mm6, %%mm2 \n\t"
1973 #ifndef FAST_BGR2YV12
1974 "psrad $8, %%mm0 \n\t"
1975 "psrad $8, %%mm1 \n\t"
1976 "psrad $8, %%mm2 \n\t"
1977 "psrad $8, %%mm3 \n\t"
1979 "packssdw %%mm2, %%mm0 \n\t"
1980 "packssdw %%mm3, %%mm1 \n\t"
1981 "pmaddwd %%mm5, %%mm0 \n\t"
1982 "pmaddwd %%mm5, %%mm1 \n\t"
1983 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1984 "psraw $7, %%mm0 \n\t"
1986 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1987 "movq 12(%0, %%ebx), %%mm4 \n\t"
1988 "movq 12(%1, %%ebx), %%mm1 \n\t"
1989 "movq 18(%0, %%ebx), %%mm2 \n\t"
1990 "movq 18(%1, %%ebx), %%mm3 \n\t"
1991 PAVGB" %%mm1, %%mm4 \n\t"
1992 PAVGB" %%mm3, %%mm2 \n\t"
1993 "movq %%mm4, %%mm1 \n\t"
1994 "movq %%mm2, %%mm3 \n\t"
1995 "psrlq $24, %%mm4 \n\t"
1996 "psrlq $24, %%mm2 \n\t"
1997 PAVGB" %%mm1, %%mm4 \n\t"
1998 PAVGB" %%mm3, %%mm2 \n\t"
1999 "punpcklbw %%mm7, %%mm4 \n\t"
2000 "punpcklbw %%mm7, %%mm2 \n\t"
2002 "movd 12(%0, %%ebx), %%mm4 \n\t"
2003 "movd 12(%1, %%ebx), %%mm1 \n\t"
2004 "movd 15(%0, %%ebx), %%mm2 \n\t"
2005 "movd 15(%1, %%ebx), %%mm3 \n\t"
2006 "punpcklbw %%mm7, %%mm4 \n\t"
2007 "punpcklbw %%mm7, %%mm1 \n\t"
2008 "punpcklbw %%mm7, %%mm2 \n\t"
2009 "punpcklbw %%mm7, %%mm3 \n\t"
2010 "paddw %%mm1, %%mm4 \n\t"
2011 "paddw %%mm3, %%mm2 \n\t"
2012 "paddw %%mm2, %%mm4 \n\t"
2013 "movd 18(%0, %%ebx), %%mm5 \n\t"
2014 "movd 18(%1, %%ebx), %%mm1 \n\t"
2015 "movd 21(%0, %%ebx), %%mm2 \n\t"
2016 "movd 21(%1, %%ebx), %%mm3 \n\t"
2017 "punpcklbw %%mm7, %%mm5 \n\t"
2018 "punpcklbw %%mm7, %%mm1 \n\t"
2019 "punpcklbw %%mm7, %%mm2 \n\t"
2020 "punpcklbw %%mm7, %%mm3 \n\t"
2021 "paddw %%mm1, %%mm5 \n\t"
2022 "paddw %%mm3, %%mm2 \n\t"
2023 "paddw %%mm5, %%mm2 \n\t"
2024 "movq "MANGLE(w1111)", %%mm5 \n\t"
2025 "psrlw $2, %%mm4 \n\t"
2026 "psrlw $2, %%mm2 \n\t"
2028 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2029 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2031 "pmaddwd %%mm4, %%mm1 \n\t"
2032 "pmaddwd %%mm2, %%mm3 \n\t"
2033 "pmaddwd %%mm6, %%mm4 \n\t"
2034 "pmaddwd %%mm6, %%mm2 \n\t"
2035 #ifndef FAST_BGR2YV12
2036 "psrad $8, %%mm4 \n\t"
2037 "psrad $8, %%mm1 \n\t"
2038 "psrad $8, %%mm2 \n\t"
2039 "psrad $8, %%mm3 \n\t"
2041 "packssdw %%mm2, %%mm4 \n\t"
2042 "packssdw %%mm3, %%mm1 \n\t"
2043 "pmaddwd %%mm5, %%mm4 \n\t"
2044 "pmaddwd %%mm5, %%mm1 \n\t"
2045 "addl $24, %%ebx \n\t"
2046 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2047 "psraw $7, %%mm4 \n\t"
2049 "movq %%mm0, %%mm1 \n\t"
2050 "punpckldq %%mm4, %%mm0 \n\t"
2051 "punpckhdq %%mm4, %%mm1 \n\t"
2052 "packsswb %%mm1, %%mm0 \n\t"
2053 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2055 "movd %%mm0, (%2, %%eax) \n\t"
2056 "punpckhdq %%mm0, %%mm0 \n\t"
2057 "movd %%mm0, (%3, %%eax) \n\t"
2058 "addl $4, %%eax \n\t"
2060 : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2064 udst += chromStride;
2065 vdst += chromStride;
2069 asm volatile( EMMS" \n\t"
/* C version for the rows the assembly loop did not handle. */
2075 for(; y<height; y+=2)
2078 for(i=0; i<chromWidth; i++)
2080 unsigned int b= src[6*i+0];
2081 unsigned int g= src[6*i+1];
2082 unsigned int r= src[6*i+2];
2084 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2085 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2086 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2096 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2102 for(i=0; i<chromWidth; i++)
2104 unsigned int b= src[6*i+0];
2105 unsigned int g= src[6*i+1];
2106 unsigned int r= src[6*i+2];
2108 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2116 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2119 udst += chromStride;
2120 vdst += chromStride;
/**
 * Interleaves two byte planes row by row: for every column w,
 * dest[2*w] = src1[w] and dest[2*w+1] = src2[w]; height rows are processed.
 *
 * Two assembly variants are visible, each handling 16 input bytes from both
 * sources per iteration (32 output bytes): one using XMM registers with
 * movdqa/movntdq and one using MMX registers with movq/MOVNTQ. The columns from
 * width&~15 onwards are finished by a C loop, and a plain C loop covers the
 * whole row in the non-assembly build.
 *
 * NOTE(review): movdqa and movntdq fault on addresses that are not 16-byte
 * aligned; nothing visible here checks the alignment of src1, src2 or dest.
 * NOTE(review): width is unsigned, so the "width-15" loop bound wraps around
 * when width < 15 -- confirm callers never pass such widths to the asm paths.
 */
2126 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2127 unsigned width, unsigned height, unsigned src1Stride,
2128 unsigned src2Stride, unsigned dstStride){
2131 for(h=0; h < height; h++)
2138 "xorl %%eax, %%eax \n\t"
2140 PREFETCH" 64(%1, %%eax) \n\t"
2141 PREFETCH" 64(%2, %%eax) \n\t"
2142 "movdqa (%1, %%eax), %%xmm0 \n\t"
2143 "movdqa (%1, %%eax), %%xmm1 \n\t"
2144 "movdqa (%2, %%eax), %%xmm2 \n\t"
2145 "punpcklbw %%xmm2, %%xmm0 \n\t"
2146 "punpckhbw %%xmm2, %%xmm1 \n\t"
2147 "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2148 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2149 "addl $16, %%eax \n\t"
2150 "cmpl %3, %%eax \n\t"
2152 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2157 "xorl %%eax, %%eax \n\t"
2159 PREFETCH" 64(%1, %%eax) \n\t"
2160 PREFETCH" 64(%2, %%eax) \n\t"
2161 "movq (%1, %%eax), %%mm0 \n\t"
2162 "movq 8(%1, %%eax), %%mm2 \n\t"
2163 "movq %%mm0, %%mm1 \n\t"
2164 "movq %%mm2, %%mm3 \n\t"
2165 "movq (%2, %%eax), %%mm4 \n\t"
2166 "movq 8(%2, %%eax), %%mm5 \n\t"
2167 "punpcklbw %%mm4, %%mm0 \n\t"
2168 "punpckhbw %%mm4, %%mm1 \n\t"
2169 "punpcklbw %%mm5, %%mm2 \n\t"
2170 "punpckhbw %%mm5, %%mm3 \n\t"
2171 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
2172 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
2173 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2174 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2175 "addl $16, %%eax \n\t"
2176 "cmpl %3, %%eax \n\t"
2178 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
/* Finish the columns past the last full group of 16 in C. */
2182 for(w= (width&(~15)); w < width; w++)
2184 dest[2*w+0] = src1[w];
2185 dest[2*w+1] = src2[w];
/* Plain C version covering the whole row. */
2188 for(w=0; w < width; w++)
2190 dest[2*w+0] = src1[w];
2191 dest[2*w+1] = src2[w];
/**
 * Enlarges two byte planes (src1 -> dst1, src2 -> dst2) by a factor of two in
 * both directions using plain sample replication.
 *
 * Destination row y is built from source row y>>1, so every source row is used
 * for two output rows, and every source byte is written twice horizontally
 * (d[2*x] = d[2*x+1] = s[x]). The working size is w = width/2, h = height/2.
 *
 * The assembly loops read 32 source bytes per iteration and write 64:
 * "punpcklbw/punpckhbw reg, reg" with the same register on both sides doubles
 * each byte. The trailing C loop handles the columns left over.
 */
2207 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2208 uint8_t *dst1, uint8_t *dst2,
2209 unsigned width, unsigned height,
2210 unsigned srcStride1, unsigned srcStride2,
2211 unsigned dstStride1, unsigned dstStride2)
2214 w=width/2; h=height/2;
2219 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* First plane: src1 -> dst1. */
2222 const uint8_t* s1=src1+srcStride1*(y>>1);
2223 uint8_t* d=dst1+dstStride1*y;
2231 "movq %1, %%mm0\n\t"
2232 "movq 8%1, %%mm2\n\t"
2233 "movq 16%1, %%mm4\n\t"
2234 "movq 24%1, %%mm6\n\t"
2235 "movq %%mm0, %%mm1\n\t"
2236 "movq %%mm2, %%mm3\n\t"
2237 "movq %%mm4, %%mm5\n\t"
2238 "movq %%mm6, %%mm7\n\t"
2239 "punpcklbw %%mm0, %%mm0\n\t"
2240 "punpckhbw %%mm1, %%mm1\n\t"
2241 "punpcklbw %%mm2, %%mm2\n\t"
2242 "punpckhbw %%mm3, %%mm3\n\t"
2243 "punpcklbw %%mm4, %%mm4\n\t"
2244 "punpckhbw %%mm5, %%mm5\n\t"
2245 "punpcklbw %%mm6, %%mm6\n\t"
2246 "punpckhbw %%mm7, %%mm7\n\t"
2247 MOVNTQ" %%mm0, %0\n\t"
2248 MOVNTQ" %%mm1, 8%0\n\t"
2249 MOVNTQ" %%mm2, 16%0\n\t"
2250 MOVNTQ" %%mm3, 24%0\n\t"
2251 MOVNTQ" %%mm4, 32%0\n\t"
2252 MOVNTQ" %%mm5, 40%0\n\t"
2253 MOVNTQ" %%mm6, 48%0\n\t"
2254 MOVNTQ" %%mm7, 56%0"
2260 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* Second plane: src2 -> dst2, same procedure. */
2263 const uint8_t* s2=src2+srcStride2*(y>>1);
2264 uint8_t* d=dst2+dstStride2*y;
2272 "movq %1, %%mm0\n\t"
2273 "movq 8%1, %%mm2\n\t"
2274 "movq 16%1, %%mm4\n\t"
2275 "movq 24%1, %%mm6\n\t"
2276 "movq %%mm0, %%mm1\n\t"
2277 "movq %%mm2, %%mm3\n\t"
2278 "movq %%mm4, %%mm5\n\t"
2279 "movq %%mm6, %%mm7\n\t"
2280 "punpcklbw %%mm0, %%mm0\n\t"
2281 "punpckhbw %%mm1, %%mm1\n\t"
2282 "punpcklbw %%mm2, %%mm2\n\t"
2283 "punpckhbw %%mm3, %%mm3\n\t"
2284 "punpcklbw %%mm4, %%mm4\n\t"
2285 "punpckhbw %%mm5, %%mm5\n\t"
2286 "punpcklbw %%mm6, %%mm6\n\t"
2287 "punpckhbw %%mm7, %%mm7\n\t"
2288 MOVNTQ" %%mm0, %0\n\t"
2289 MOVNTQ" %%mm1, 8%0\n\t"
2290 MOVNTQ" %%mm2, 16%0\n\t"
2291 MOVNTQ" %%mm3, 24%0\n\t"
2292 MOVNTQ" %%mm4, 32%0\n\t"
2293 MOVNTQ" %%mm5, 40%0\n\t"
2294 MOVNTQ" %%mm6, 48%0\n\t"
2295 MOVNTQ" %%mm7, 56%0"
2301 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2312 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2314 unsigned width, unsigned height,
2315 unsigned srcStride1, unsigned srcStride2,
2316 unsigned srcStride3, unsigned dstStride)
2318 unsigned y,x,x2,w,h;
2319 w=width/2; h=height;
2325 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)),"m"(*(src3+srcStride3)):"memory");
2328 const uint8_t* yp=src1+srcStride1*y;
2329 const uint8_t* up=src2+srcStride2*(y>>2);
2330 const uint8_t* vp=src3+srcStride3*(y>>2);
2331 uint8_t* d=dst+dstStride*y;
2335 for(;x<w;x+=8,x2+=32)
2341 "movq %1, %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2342 "movq %2, %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2343 "movq %3, %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2344 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2345 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2346 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2347 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2348 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2349 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2350 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2352 "movq %%mm1, %%mm6\n\t"
2353 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2354 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2355 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2356 MOVNTQ" %%mm0, %0\n\t"
2357 MOVNTQ" %%mm3, 8%0\n\t"
2359 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2360 "movq 8%1, %%mm0\n\t"
2361 "movq %%mm0, %%mm3\n\t"
2362 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2363 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2364 MOVNTQ" %%mm0, 16%0\n\t"
2365 MOVNTQ" %%mm3, 24%0\n\t"
2367 "movq %%mm4, %%mm6\n\t"
2368 "movq 16%1, %%mm0\n\t"
2369 "movq %%mm0, %%mm3\n\t"
2370 "punpcklbw %%mm5, %%mm4\n\t"
2371 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2372 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2373 MOVNTQ" %%mm0, 32%0\n\t"
2374 MOVNTQ" %%mm3, 40%0\n\t"
2376 "punpckhbw %%mm5, %%mm6\n\t"
2377 "movq 24%1, %%mm0\n\t"
2378 "movq %%mm0, %%mm3\n\t"
2379 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2380 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2381 MOVNTQ" %%mm0, 48%0\n\t"
2382 MOVNTQ" %%mm3, 56%0\n\t"
2385 :"m"(yp[x2]),"m"(up[x]),"m"(vp[x])