3 * rgb2rgb.c, Software RGB to RGB convertor
4 * pluralize by Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
12 #include <inttypes.h> /* for __WORDSIZE */
15 // #warning You have a misconfigured system and will probably lose performance!
16 #define __WORDSIZE MP_WORDSIZE
34 #define PREFETCH "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB "pavgusb"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
47 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
61 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
64 const uint8_t *s = src;
67 const uint8_t *mm_end;
71 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
73 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
79 "punpckldq 3%1, %%mm0\n\t"
81 "punpckldq 9%1, %%mm1\n\t"
82 "movd 12%1, %%mm2\n\t"
83 "punpckldq 15%1, %%mm2\n\t"
84 "movd 18%1, %%mm3\n\t"
85 "punpckldq 21%1, %%mm3\n\t"
86 "pand %%mm7, %%mm0\n\t"
87 "pand %%mm7, %%mm1\n\t"
88 "pand %%mm7, %%mm2\n\t"
89 "pand %%mm7, %%mm3\n\t"
90 MOVNTQ" %%mm0, %0\n\t"
91 MOVNTQ" %%mm1, 8%0\n\t"
92 MOVNTQ" %%mm2, 16%0\n\t"
100 __asm __volatile(SFENCE:::"memory");
101 __asm __volatile(EMMS:::"memory");
112 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
115 const uint8_t *s = src;
118 const uint8_t *mm_end;
122 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
129 "movq 8%1, %%mm1\n\t"
130 "movq 16%1, %%mm4\n\t"
131 "movq 24%1, %%mm5\n\t"
132 "movq %%mm0, %%mm2\n\t"
133 "movq %%mm1, %%mm3\n\t"
134 "movq %%mm4, %%mm6\n\t"
135 "movq %%mm5, %%mm7\n\t"
136 "psrlq $8, %%mm2\n\t"
137 "psrlq $8, %%mm3\n\t"
138 "psrlq $8, %%mm6\n\t"
139 "psrlq $8, %%mm7\n\t"
148 "por %%mm2, %%mm0\n\t"
149 "por %%mm3, %%mm1\n\t"
150 "por %%mm6, %%mm4\n\t"
151 "por %%mm7, %%mm5\n\t"
153 "movq %%mm1, %%mm2\n\t"
154 "movq %%mm4, %%mm3\n\t"
155 "psllq $48, %%mm2\n\t"
156 "psllq $32, %%mm3\n\t"
159 "por %%mm2, %%mm0\n\t"
160 "psrlq $16, %%mm1\n\t"
161 "psrlq $32, %%mm4\n\t"
162 "psllq $16, %%mm5\n\t"
163 "por %%mm3, %%mm1\n\t"
165 "por %%mm5, %%mm4\n\t"
167 MOVNTQ" %%mm0, %0\n\t"
168 MOVNTQ" %%mm1, 8%0\n\t"
171 :"m"(*s),"m"(mask24l),
172 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
177 __asm __volatile(SFENCE:::"memory");
178 __asm __volatile(EMMS:::"memory");
190 Original by Strepto/Astral
191 ported to gcc & bugfixed : A'rpi
192 MMX2, 3DNOW optimization by Nick Kurshev
193 32bit c version, and and&add trick by Michael Niedermayer
195 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
197 register const uint8_t* s=src;
198 register uint8_t* d=dst;
199 register const uint8_t *end;
200 const uint8_t *mm_end;
203 __asm __volatile(PREFETCH" %0"::"m"(*s));
204 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
211 "movq 8%1, %%mm2\n\t"
212 "movq %%mm0, %%mm1\n\t"
213 "movq %%mm2, %%mm3\n\t"
214 "pand %%mm4, %%mm0\n\t"
215 "pand %%mm4, %%mm2\n\t"
216 "paddw %%mm1, %%mm0\n\t"
217 "paddw %%mm3, %%mm2\n\t"
218 MOVNTQ" %%mm0, %0\n\t"
226 __asm __volatile(SFENCE:::"memory");
227 __asm __volatile(EMMS:::"memory");
232 register unsigned x= *((uint32_t *)s);
233 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
239 register unsigned short x= *((uint16_t *)s);
240 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
244 static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
246 unsigned j,i,num_pixels=src_size/3;
247 for(i=0,j=0; j<num_pixels; i+=3,j+=3)
255 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
257 register const uint8_t* s=src;
258 register uint8_t* d=dst;
259 register const uint8_t *end;
260 const uint8_t *mm_end;
263 __asm __volatile(PREFETCH" %0"::"m"(*s));
264 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
265 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
272 "movq 8%1, %%mm2\n\t"
273 "movq %%mm0, %%mm1\n\t"
274 "movq %%mm2, %%mm3\n\t"
275 "psrlq $1, %%mm0\n\t"
276 "psrlq $1, %%mm2\n\t"
277 "pand %%mm7, %%mm0\n\t"
278 "pand %%mm7, %%mm2\n\t"
279 "pand %%mm6, %%mm1\n\t"
280 "pand %%mm6, %%mm3\n\t"
281 "por %%mm1, %%mm0\n\t"
282 "por %%mm3, %%mm2\n\t"
283 MOVNTQ" %%mm0, %0\n\t"
291 __asm __volatile(SFENCE:::"memory");
292 __asm __volatile(EMMS:::"memory");
297 register uint32_t x= *((uint32_t *)s);
298 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
304 register uint16_t x= *((uint16_t *)s);
305 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
311 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
313 const uint8_t *s = src;
316 const uint8_t *mm_end;
318 uint16_t *d = (uint16_t *)dst;
321 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
325 ::"m"(red_16mask),"m"(green_16mask));
332 "movd 4%1, %%mm3\n\t"
333 "punpckldq 8%1, %%mm0\n\t"
334 "punpckldq 12%1, %%mm3\n\t"
335 "movq %%mm0, %%mm1\n\t"
336 "movq %%mm0, %%mm2\n\t"
337 "movq %%mm3, %%mm4\n\t"
338 "movq %%mm3, %%mm5\n\t"
339 "psrlq $3, %%mm0\n\t"
340 "psrlq $3, %%mm3\n\t"
343 "psrlq $5, %%mm1\n\t"
344 "psrlq $5, %%mm4\n\t"
345 "pand %%mm6, %%mm1\n\t"
346 "pand %%mm6, %%mm4\n\t"
347 "psrlq $8, %%mm2\n\t"
348 "psrlq $8, %%mm5\n\t"
349 "pand %%mm7, %%mm2\n\t"
350 "pand %%mm7, %%mm5\n\t"
351 "por %%mm1, %%mm0\n\t"
352 "por %%mm4, %%mm3\n\t"
353 "por %%mm2, %%mm0\n\t"
354 "por %%mm5, %%mm3\n\t"
355 "psllq $16, %%mm3\n\t"
356 "por %%mm3, %%mm0\n\t"
357 MOVNTQ" %%mm0, %0\n\t"
358 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
362 __asm __volatile(SFENCE:::"memory");
363 __asm __volatile(EMMS:::"memory");
367 #ifndef WORDS_BIGENDIAN
372 const int a= *s++; /*skip*/
377 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
378 #ifndef WORDS_BIGENDIAN
384 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
386 const uint8_t *s = src;
389 const uint8_t *mm_end;
391 uint16_t *d = (uint16_t *)dst;
394 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
398 ::"m"(red_16mask),"m"(green_16mask));
405 "movd 4%1, %%mm3\n\t"
406 "punpckldq 8%1, %%mm0\n\t"
407 "punpckldq 12%1, %%mm3\n\t"
408 "movq %%mm0, %%mm1\n\t"
409 "movq %%mm0, %%mm2\n\t"
410 "movq %%mm3, %%mm4\n\t"
411 "movq %%mm3, %%mm5\n\t"
412 "psllq $8, %%mm0\n\t"
413 "psllq $8, %%mm3\n\t"
414 "pand %%mm7, %%mm0\n\t"
415 "pand %%mm7, %%mm3\n\t"
416 "psrlq $5, %%mm1\n\t"
417 "psrlq $5, %%mm4\n\t"
418 "pand %%mm6, %%mm1\n\t"
419 "pand %%mm6, %%mm4\n\t"
420 "psrlq $19, %%mm2\n\t"
421 "psrlq $19, %%mm5\n\t"
424 "por %%mm1, %%mm0\n\t"
425 "por %%mm4, %%mm3\n\t"
426 "por %%mm2, %%mm0\n\t"
427 "por %%mm5, %%mm3\n\t"
428 "psllq $16, %%mm3\n\t"
429 "por %%mm3, %%mm0\n\t"
430 MOVNTQ" %%mm0, %0\n\t"
431 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
435 __asm __volatile(SFENCE:::"memory");
436 __asm __volatile(EMMS:::"memory");
443 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
448 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
450 const uint8_t *s = src;
453 const uint8_t *mm_end;
455 uint16_t *d = (uint16_t *)dst;
458 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
462 ::"m"(red_15mask),"m"(green_15mask));
469 "movd 4%1, %%mm3\n\t"
470 "punpckldq 8%1, %%mm0\n\t"
471 "punpckldq 12%1, %%mm3\n\t"
472 "movq %%mm0, %%mm1\n\t"
473 "movq %%mm0, %%mm2\n\t"
474 "movq %%mm3, %%mm4\n\t"
475 "movq %%mm3, %%mm5\n\t"
476 "psrlq $3, %%mm0\n\t"
477 "psrlq $3, %%mm3\n\t"
480 "psrlq $6, %%mm1\n\t"
481 "psrlq $6, %%mm4\n\t"
482 "pand %%mm6, %%mm1\n\t"
483 "pand %%mm6, %%mm4\n\t"
484 "psrlq $9, %%mm2\n\t"
485 "psrlq $9, %%mm5\n\t"
486 "pand %%mm7, %%mm2\n\t"
487 "pand %%mm7, %%mm5\n\t"
488 "por %%mm1, %%mm0\n\t"
489 "por %%mm4, %%mm3\n\t"
490 "por %%mm2, %%mm0\n\t"
491 "por %%mm5, %%mm3\n\t"
492 "psllq $16, %%mm3\n\t"
493 "por %%mm3, %%mm0\n\t"
494 MOVNTQ" %%mm0, %0\n\t"
495 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
499 __asm __volatile(SFENCE:::"memory");
500 __asm __volatile(EMMS:::"memory");
507 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
512 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
514 const uint8_t *s = src;
517 const uint8_t *mm_end;
519 uint16_t *d = (uint16_t *)dst;
522 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
526 ::"m"(red_15mask),"m"(green_15mask));
533 "movd 4%1, %%mm3\n\t"
534 "punpckldq 8%1, %%mm0\n\t"
535 "punpckldq 12%1, %%mm3\n\t"
536 "movq %%mm0, %%mm1\n\t"
537 "movq %%mm0, %%mm2\n\t"
538 "movq %%mm3, %%mm4\n\t"
539 "movq %%mm3, %%mm5\n\t"
540 "psllq $7, %%mm0\n\t"
541 "psllq $7, %%mm3\n\t"
542 "pand %%mm7, %%mm0\n\t"
543 "pand %%mm7, %%mm3\n\t"
544 "psrlq $6, %%mm1\n\t"
545 "psrlq $6, %%mm4\n\t"
546 "pand %%mm6, %%mm1\n\t"
547 "pand %%mm6, %%mm4\n\t"
548 "psrlq $19, %%mm2\n\t"
549 "psrlq $19, %%mm5\n\t"
552 "por %%mm1, %%mm0\n\t"
553 "por %%mm4, %%mm3\n\t"
554 "por %%mm2, %%mm0\n\t"
555 "por %%mm5, %%mm3\n\t"
556 "psllq $16, %%mm3\n\t"
557 "por %%mm3, %%mm0\n\t"
558 MOVNTQ" %%mm0, %0\n\t"
559 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
563 __asm __volatile(SFENCE:::"memory");
564 __asm __volatile(EMMS:::"memory");
571 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
576 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
578 const uint8_t *s = src;
581 const uint8_t *mm_end;
583 uint16_t *d = (uint16_t *)dst;
586 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
590 ::"m"(red_16mask),"m"(green_16mask));
597 "movd 3%1, %%mm3\n\t"
598 "punpckldq 6%1, %%mm0\n\t"
599 "punpckldq 9%1, %%mm3\n\t"
600 "movq %%mm0, %%mm1\n\t"
601 "movq %%mm0, %%mm2\n\t"
602 "movq %%mm3, %%mm4\n\t"
603 "movq %%mm3, %%mm5\n\t"
604 "psrlq $3, %%mm0\n\t"
605 "psrlq $3, %%mm3\n\t"
608 "psrlq $5, %%mm1\n\t"
609 "psrlq $5, %%mm4\n\t"
610 "pand %%mm6, %%mm1\n\t"
611 "pand %%mm6, %%mm4\n\t"
612 "psrlq $8, %%mm2\n\t"
613 "psrlq $8, %%mm5\n\t"
614 "pand %%mm7, %%mm2\n\t"
615 "pand %%mm7, %%mm5\n\t"
616 "por %%mm1, %%mm0\n\t"
617 "por %%mm4, %%mm3\n\t"
618 "por %%mm2, %%mm0\n\t"
619 "por %%mm5, %%mm3\n\t"
620 "psllq $16, %%mm3\n\t"
621 "por %%mm3, %%mm0\n\t"
622 MOVNTQ" %%mm0, %0\n\t"
623 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
627 __asm __volatile(SFENCE:::"memory");
628 __asm __volatile(EMMS:::"memory");
635 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
639 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
641 const uint8_t *s = src;
644 const uint8_t *mm_end;
646 uint16_t *d = (uint16_t *)dst;
649 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
653 ::"m"(red_16mask),"m"(green_16mask));
660 "movd 3%1, %%mm3\n\t"
661 "punpckldq 6%1, %%mm0\n\t"
662 "punpckldq 9%1, %%mm3\n\t"
663 "movq %%mm0, %%mm1\n\t"
664 "movq %%mm0, %%mm2\n\t"
665 "movq %%mm3, %%mm4\n\t"
666 "movq %%mm3, %%mm5\n\t"
667 "psllq $8, %%mm0\n\t"
668 "psllq $8, %%mm3\n\t"
669 "pand %%mm7, %%mm0\n\t"
670 "pand %%mm7, %%mm3\n\t"
671 "psrlq $5, %%mm1\n\t"
672 "psrlq $5, %%mm4\n\t"
673 "pand %%mm6, %%mm1\n\t"
674 "pand %%mm6, %%mm4\n\t"
675 "psrlq $19, %%mm2\n\t"
676 "psrlq $19, %%mm5\n\t"
679 "por %%mm1, %%mm0\n\t"
680 "por %%mm4, %%mm3\n\t"
681 "por %%mm2, %%mm0\n\t"
682 "por %%mm5, %%mm3\n\t"
683 "psllq $16, %%mm3\n\t"
684 "por %%mm3, %%mm0\n\t"
685 MOVNTQ" %%mm0, %0\n\t"
686 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
690 __asm __volatile(SFENCE:::"memory");
691 __asm __volatile(EMMS:::"memory");
698 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
702 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
704 const uint8_t *s = src;
707 const uint8_t *mm_end;
709 uint16_t *d = (uint16_t *)dst;
712 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
716 ::"m"(red_15mask),"m"(green_15mask));
723 "movd 3%1, %%mm3\n\t"
724 "punpckldq 6%1, %%mm0\n\t"
725 "punpckldq 9%1, %%mm3\n\t"
726 "movq %%mm0, %%mm1\n\t"
727 "movq %%mm0, %%mm2\n\t"
728 "movq %%mm3, %%mm4\n\t"
729 "movq %%mm3, %%mm5\n\t"
730 "psrlq $3, %%mm0\n\t"
731 "psrlq $3, %%mm3\n\t"
734 "psrlq $6, %%mm1\n\t"
735 "psrlq $6, %%mm4\n\t"
736 "pand %%mm6, %%mm1\n\t"
737 "pand %%mm6, %%mm4\n\t"
738 "psrlq $9, %%mm2\n\t"
739 "psrlq $9, %%mm5\n\t"
740 "pand %%mm7, %%mm2\n\t"
741 "pand %%mm7, %%mm5\n\t"
742 "por %%mm1, %%mm0\n\t"
743 "por %%mm4, %%mm3\n\t"
744 "por %%mm2, %%mm0\n\t"
745 "por %%mm5, %%mm3\n\t"
746 "psllq $16, %%mm3\n\t"
747 "por %%mm3, %%mm0\n\t"
748 MOVNTQ" %%mm0, %0\n\t"
749 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
753 __asm __volatile(SFENCE:::"memory");
754 __asm __volatile(EMMS:::"memory");
761 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
765 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
767 const uint8_t *s = src;
770 const uint8_t *mm_end;
772 uint16_t *d = (uint16_t *)dst;
775 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
779 ::"m"(red_15mask),"m"(green_15mask));
786 "movd 3%1, %%mm3\n\t"
787 "punpckldq 6%1, %%mm0\n\t"
788 "punpckldq 9%1, %%mm3\n\t"
789 "movq %%mm0, %%mm1\n\t"
790 "movq %%mm0, %%mm2\n\t"
791 "movq %%mm3, %%mm4\n\t"
792 "movq %%mm3, %%mm5\n\t"
793 "psllq $7, %%mm0\n\t"
794 "psllq $7, %%mm3\n\t"
795 "pand %%mm7, %%mm0\n\t"
796 "pand %%mm7, %%mm3\n\t"
797 "psrlq $6, %%mm1\n\t"
798 "psrlq $6, %%mm4\n\t"
799 "pand %%mm6, %%mm1\n\t"
800 "pand %%mm6, %%mm4\n\t"
801 "psrlq $19, %%mm2\n\t"
802 "psrlq $19, %%mm5\n\t"
805 "por %%mm1, %%mm0\n\t"
806 "por %%mm4, %%mm3\n\t"
807 "por %%mm2, %%mm0\n\t"
808 "por %%mm5, %%mm3\n\t"
809 "psllq $16, %%mm3\n\t"
810 "por %%mm3, %%mm0\n\t"
811 MOVNTQ" %%mm0, %0\n\t"
812 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
816 __asm __volatile(SFENCE:::"memory");
817 __asm __volatile(EMMS:::"memory");
824 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
829 Here I use a less accurate approximation by simply
830 left-shifting the input
831 value and filling the low order bits with
832 zeroes. This method improves png's
833 compression but this scheme cannot reproduce white exactly, since it does not
834 generate an all-ones maximum value; the net effect is to darken the
837 The better method should be "left bit replication":
847 | Leftmost Bits Repeated to Fill Open Bits
851 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
855 const uint16_t *mm_end;
857 uint8_t *d = (uint8_t *)dst;
858 const uint16_t *s = (uint16_t *)src;
859 end = s + src_size/2;
861 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
873 "psllq $3, %%mm0\n\t"
874 "psrlq $2, %%mm1\n\t"
875 "psrlq $7, %%mm2\n\t"
876 "movq %%mm0, %%mm3\n\t"
877 "movq %%mm1, %%mm4\n\t"
878 "movq %%mm2, %%mm5\n\t"
879 "punpcklwd %5, %%mm0\n\t"
880 "punpcklwd %5, %%mm1\n\t"
881 "punpcklwd %5, %%mm2\n\t"
882 "punpckhwd %5, %%mm3\n\t"
883 "punpckhwd %5, %%mm4\n\t"
884 "punpckhwd %5, %%mm5\n\t"
885 "psllq $8, %%mm1\n\t"
886 "psllq $16, %%mm2\n\t"
887 "por %%mm1, %%mm0\n\t"
888 "por %%mm2, %%mm0\n\t"
889 "psllq $8, %%mm4\n\t"
890 "psllq $16, %%mm5\n\t"
891 "por %%mm4, %%mm3\n\t"
892 "por %%mm5, %%mm3\n\t"
894 "movq %%mm0, %%mm6\n\t"
895 "movq %%mm3, %%mm7\n\t"
897 "movq 8%1, %%mm0\n\t"
898 "movq 8%1, %%mm1\n\t"
899 "movq 8%1, %%mm2\n\t"
903 "psllq $3, %%mm0\n\t"
904 "psrlq $2, %%mm1\n\t"
905 "psrlq $7, %%mm2\n\t"
906 "movq %%mm0, %%mm3\n\t"
907 "movq %%mm1, %%mm4\n\t"
908 "movq %%mm2, %%mm5\n\t"
909 "punpcklwd %5, %%mm0\n\t"
910 "punpcklwd %5, %%mm1\n\t"
911 "punpcklwd %5, %%mm2\n\t"
912 "punpckhwd %5, %%mm3\n\t"
913 "punpckhwd %5, %%mm4\n\t"
914 "punpckhwd %5, %%mm5\n\t"
915 "psllq $8, %%mm1\n\t"
916 "psllq $16, %%mm2\n\t"
917 "por %%mm1, %%mm0\n\t"
918 "por %%mm2, %%mm0\n\t"
919 "psllq $8, %%mm4\n\t"
920 "psllq $16, %%mm5\n\t"
921 "por %%mm4, %%mm3\n\t"
922 "por %%mm5, %%mm3\n\t"
925 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
927 /* Borrowed 32 to 24 */
929 "movq %%mm0, %%mm4\n\t"
930 "movq %%mm3, %%mm5\n\t"
931 "movq %%mm6, %%mm0\n\t"
932 "movq %%mm7, %%mm1\n\t"
934 "movq %%mm4, %%mm6\n\t"
935 "movq %%mm5, %%mm7\n\t"
936 "movq %%mm0, %%mm2\n\t"
937 "movq %%mm1, %%mm3\n\t"
939 "psrlq $8, %%mm2\n\t"
940 "psrlq $8, %%mm3\n\t"
941 "psrlq $8, %%mm6\n\t"
942 "psrlq $8, %%mm7\n\t"
951 "por %%mm2, %%mm0\n\t"
952 "por %%mm3, %%mm1\n\t"
953 "por %%mm6, %%mm4\n\t"
954 "por %%mm7, %%mm5\n\t"
956 "movq %%mm1, %%mm2\n\t"
957 "movq %%mm4, %%mm3\n\t"
958 "psllq $48, %%mm2\n\t"
959 "psllq $32, %%mm3\n\t"
962 "por %%mm2, %%mm0\n\t"
963 "psrlq $16, %%mm1\n\t"
964 "psrlq $32, %%mm4\n\t"
965 "psllq $16, %%mm5\n\t"
966 "por %%mm3, %%mm1\n\t"
968 "por %%mm5, %%mm4\n\t"
970 MOVNTQ" %%mm0, %0\n\t"
971 MOVNTQ" %%mm1, 8%0\n\t"
975 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
980 __asm __volatile(SFENCE:::"memory");
981 __asm __volatile(EMMS:::"memory");
985 register uint16_t bgr;
987 *d++ = (bgr&0x1F)<<3;
988 *d++ = (bgr&0x3E0)>>2;
989 *d++ = (bgr&0x7C00)>>7;
993 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
997 const uint16_t *mm_end;
999 uint8_t *d = (uint8_t *)dst;
1000 const uint16_t *s = (const uint16_t *)src;
1001 end = s + src_size/2;
1003 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1009 "movq %1, %%mm0\n\t"
1010 "movq %1, %%mm1\n\t"
1011 "movq %1, %%mm2\n\t"
1012 "pand %2, %%mm0\n\t"
1013 "pand %3, %%mm1\n\t"
1014 "pand %4, %%mm2\n\t"
1015 "psllq $3, %%mm0\n\t"
1016 "psrlq $3, %%mm1\n\t"
1017 "psrlq $8, %%mm2\n\t"
1018 "movq %%mm0, %%mm3\n\t"
1019 "movq %%mm1, %%mm4\n\t"
1020 "movq %%mm2, %%mm5\n\t"
1021 "punpcklwd %5, %%mm0\n\t"
1022 "punpcklwd %5, %%mm1\n\t"
1023 "punpcklwd %5, %%mm2\n\t"
1024 "punpckhwd %5, %%mm3\n\t"
1025 "punpckhwd %5, %%mm4\n\t"
1026 "punpckhwd %5, %%mm5\n\t"
1027 "psllq $8, %%mm1\n\t"
1028 "psllq $16, %%mm2\n\t"
1029 "por %%mm1, %%mm0\n\t"
1030 "por %%mm2, %%mm0\n\t"
1031 "psllq $8, %%mm4\n\t"
1032 "psllq $16, %%mm5\n\t"
1033 "por %%mm4, %%mm3\n\t"
1034 "por %%mm5, %%mm3\n\t"
1036 "movq %%mm0, %%mm6\n\t"
1037 "movq %%mm3, %%mm7\n\t"
1039 "movq 8%1, %%mm0\n\t"
1040 "movq 8%1, %%mm1\n\t"
1041 "movq 8%1, %%mm2\n\t"
1042 "pand %2, %%mm0\n\t"
1043 "pand %3, %%mm1\n\t"
1044 "pand %4, %%mm2\n\t"
1045 "psllq $3, %%mm0\n\t"
1046 "psrlq $3, %%mm1\n\t"
1047 "psrlq $8, %%mm2\n\t"
1048 "movq %%mm0, %%mm3\n\t"
1049 "movq %%mm1, %%mm4\n\t"
1050 "movq %%mm2, %%mm5\n\t"
1051 "punpcklwd %5, %%mm0\n\t"
1052 "punpcklwd %5, %%mm1\n\t"
1053 "punpcklwd %5, %%mm2\n\t"
1054 "punpckhwd %5, %%mm3\n\t"
1055 "punpckhwd %5, %%mm4\n\t"
1056 "punpckhwd %5, %%mm5\n\t"
1057 "psllq $8, %%mm1\n\t"
1058 "psllq $16, %%mm2\n\t"
1059 "por %%mm1, %%mm0\n\t"
1060 "por %%mm2, %%mm0\n\t"
1061 "psllq $8, %%mm4\n\t"
1062 "psllq $16, %%mm5\n\t"
1063 "por %%mm4, %%mm3\n\t"
1064 "por %%mm5, %%mm3\n\t"
1066 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1068 /* Borrowed 32 to 24 */
1070 "movq %%mm0, %%mm4\n\t"
1071 "movq %%mm3, %%mm5\n\t"
1072 "movq %%mm6, %%mm0\n\t"
1073 "movq %%mm7, %%mm1\n\t"
1075 "movq %%mm4, %%mm6\n\t"
1076 "movq %%mm5, %%mm7\n\t"
1077 "movq %%mm0, %%mm2\n\t"
1078 "movq %%mm1, %%mm3\n\t"
1080 "psrlq $8, %%mm2\n\t"
1081 "psrlq $8, %%mm3\n\t"
1082 "psrlq $8, %%mm6\n\t"
1083 "psrlq $8, %%mm7\n\t"
1084 "pand %2, %%mm0\n\t"
1085 "pand %2, %%mm1\n\t"
1086 "pand %2, %%mm4\n\t"
1087 "pand %2, %%mm5\n\t"
1088 "pand %3, %%mm2\n\t"
1089 "pand %3, %%mm3\n\t"
1090 "pand %3, %%mm6\n\t"
1091 "pand %3, %%mm7\n\t"
1092 "por %%mm2, %%mm0\n\t"
1093 "por %%mm3, %%mm1\n\t"
1094 "por %%mm6, %%mm4\n\t"
1095 "por %%mm7, %%mm5\n\t"
1097 "movq %%mm1, %%mm2\n\t"
1098 "movq %%mm4, %%mm3\n\t"
1099 "psllq $48, %%mm2\n\t"
1100 "psllq $32, %%mm3\n\t"
1101 "pand %4, %%mm2\n\t"
1102 "pand %5, %%mm3\n\t"
1103 "por %%mm2, %%mm0\n\t"
1104 "psrlq $16, %%mm1\n\t"
1105 "psrlq $32, %%mm4\n\t"
1106 "psllq $16, %%mm5\n\t"
1107 "por %%mm3, %%mm1\n\t"
1108 "pand %6, %%mm5\n\t"
1109 "por %%mm5, %%mm4\n\t"
1111 MOVNTQ" %%mm0, %0\n\t"
1112 MOVNTQ" %%mm1, 8%0\n\t"
1113 MOVNTQ" %%mm4, 16%0"
1116 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1121 __asm __volatile(SFENCE:::"memory");
1122 __asm __volatile(EMMS:::"memory");
1126 register uint16_t bgr;
1128 *d++ = (bgr&0x1F)<<3;
1129 *d++ = (bgr&0x7E0)>>3;
1130 *d++ = (bgr&0xF800)>>8;
1134 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1136 const uint16_t *end;
1138 const uint16_t *mm_end;
1140 uint8_t *d = (uint8_t *)dst;
1141 const uint16_t *s = (const uint16_t *)src;
1142 end = s + src_size/2;
1144 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1145 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1151 "movq %1, %%mm0\n\t"
1152 "movq %1, %%mm1\n\t"
1153 "movq %1, %%mm2\n\t"
1154 "pand %2, %%mm0\n\t"
1155 "pand %3, %%mm1\n\t"
1156 "pand %4, %%mm2\n\t"
1157 "psllq $3, %%mm0\n\t"
1158 "psrlq $2, %%mm1\n\t"
1159 "psrlq $7, %%mm2\n\t"
1160 "movq %%mm0, %%mm3\n\t"
1161 "movq %%mm1, %%mm4\n\t"
1162 "movq %%mm2, %%mm5\n\t"
1163 "punpcklwd %%mm7, %%mm0\n\t"
1164 "punpcklwd %%mm7, %%mm1\n\t"
1165 "punpcklwd %%mm7, %%mm2\n\t"
1166 "punpckhwd %%mm7, %%mm3\n\t"
1167 "punpckhwd %%mm7, %%mm4\n\t"
1168 "punpckhwd %%mm7, %%mm5\n\t"
1169 "psllq $8, %%mm1\n\t"
1170 "psllq $16, %%mm2\n\t"
1171 "por %%mm1, %%mm0\n\t"
1172 "por %%mm2, %%mm0\n\t"
1173 "psllq $8, %%mm4\n\t"
1174 "psllq $16, %%mm5\n\t"
1175 "por %%mm4, %%mm3\n\t"
1176 "por %%mm5, %%mm3\n\t"
1177 MOVNTQ" %%mm0, %0\n\t"
1178 MOVNTQ" %%mm3, 8%0\n\t"
1180 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1185 __asm __volatile(SFENCE:::"memory");
1186 __asm __volatile(EMMS:::"memory");
1190 register uint16_t bgr;
1192 *d++ = (bgr&0x1F)<<3;
1193 *d++ = (bgr&0x3E0)>>2;
1194 *d++ = (bgr&0x7C00)>>7;
1199 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1201 const uint16_t *end;
1203 const uint16_t *mm_end;
1205 uint8_t *d = (uint8_t *)dst;
1206 const uint16_t *s = (uint16_t *)src;
1207 end = s + src_size/2;
1209 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1210 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1216 "movq %1, %%mm0\n\t"
1217 "movq %1, %%mm1\n\t"
1218 "movq %1, %%mm2\n\t"
1219 "pand %2, %%mm0\n\t"
1220 "pand %3, %%mm1\n\t"
1221 "pand %4, %%mm2\n\t"
1222 "psllq $3, %%mm0\n\t"
1223 "psrlq $3, %%mm1\n\t"
1224 "psrlq $8, %%mm2\n\t"
1225 "movq %%mm0, %%mm3\n\t"
1226 "movq %%mm1, %%mm4\n\t"
1227 "movq %%mm2, %%mm5\n\t"
1228 "punpcklwd %%mm7, %%mm0\n\t"
1229 "punpcklwd %%mm7, %%mm1\n\t"
1230 "punpcklwd %%mm7, %%mm2\n\t"
1231 "punpckhwd %%mm7, %%mm3\n\t"
1232 "punpckhwd %%mm7, %%mm4\n\t"
1233 "punpckhwd %%mm7, %%mm5\n\t"
1234 "psllq $8, %%mm1\n\t"
1235 "psllq $16, %%mm2\n\t"
1236 "por %%mm1, %%mm0\n\t"
1237 "por %%mm2, %%mm0\n\t"
1238 "psllq $8, %%mm4\n\t"
1239 "psllq $16, %%mm5\n\t"
1240 "por %%mm4, %%mm3\n\t"
1241 "por %%mm5, %%mm3\n\t"
1242 MOVNTQ" %%mm0, %0\n\t"
1243 MOVNTQ" %%mm3, 8%0\n\t"
1245 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1250 __asm __volatile(SFENCE:::"memory");
1251 __asm __volatile(EMMS:::"memory");
1255 register uint16_t bgr;
1257 *d++ = (bgr&0x1F)<<3;
1258 *d++ = (bgr&0x7E0)>>3;
1259 *d++ = (bgr&0xF800)>>8;
1264 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1267 /* TODO: unroll this loop */
1269 "xorl %%eax, %%eax \n\t"
1272 PREFETCH" 32(%0, %%eax) \n\t"
1273 "movq (%0, %%eax), %%mm0 \n\t"
1274 "movq %%mm0, %%mm1 \n\t"
1275 "movq %%mm0, %%mm2 \n\t"
1276 "pslld $16, %%mm0 \n\t"
1277 "psrld $16, %%mm1 \n\t"
1278 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1279 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1280 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1281 "por %%mm0, %%mm2 \n\t"
1282 "por %%mm1, %%mm2 \n\t"
1283 MOVNTQ" %%mm2, (%1, %%eax) \n\t"
1284 "addl $8, %%eax \n\t"
1285 "cmpl %2, %%eax \n\t"
1287 :: "r" (src), "r"(dst), "r" (src_size-7)
1291 __asm __volatile(SFENCE:::"memory");
1292 __asm __volatile(EMMS:::"memory");
1295 unsigned num_pixels = src_size >> 2;
1296 for(i=0; i<num_pixels; i++)
1298 dst[4*i + 0] = src[4*i + 2];
1299 dst[4*i + 1] = src[4*i + 1];
1300 dst[4*i + 2] = src[4*i + 0];
1305 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1309 int mmx_size= 23 - src_size;
1311 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1312 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1313 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1316 PREFETCH" 32(%1, %%eax) \n\t"
1317 "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1318 "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
1319 "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
1320 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1321 "pand %%mm5, %%mm0 \n\t"
1322 "pand %%mm6, %%mm1 \n\t"
1323 "pand %%mm7, %%mm2 \n\t"
1324 "por %%mm0, %%mm1 \n\t"
1325 "por %%mm2, %%mm1 \n\t"
1326 "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1327 MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
1328 "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
1329 "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
1330 "pand %%mm7, %%mm0 \n\t"
1331 "pand %%mm5, %%mm1 \n\t"
1332 "pand %%mm6, %%mm2 \n\t"
1333 "por %%mm0, %%mm1 \n\t"
1334 "por %%mm2, %%mm1 \n\t"
1335 "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
1336 MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
1337 "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
1338 "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
1339 "pand %%mm6, %%mm0 \n\t"
1340 "pand %%mm7, %%mm1 \n\t"
1341 "pand %%mm5, %%mm2 \n\t"
1342 "por %%mm0, %%mm1 \n\t"
1343 "por %%mm2, %%mm1 \n\t"
1344 MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
1345 "addl $24, %%eax \n\t"
1348 : "r" (src-mmx_size), "r"(dst-mmx_size)
1351 __asm __volatile(SFENCE:::"memory");
1352 __asm __volatile(EMMS:::"memory");
1354 if(mmx_size==23) return; //finihsed, was multiple of 8
1358 src_size= 23-mmx_size;
1362 for(i=0; i<src_size; i+=3)
1366 dst[i + 1] = src[i + 1];
1367 dst[i + 2] = src[i + 0];
1372 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1373 unsigned int width, unsigned int height,
1374 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1377 const unsigned chromWidth= width>>1;
1378 for(y=0; y<height; y++)
1381 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1383 "xorl %%eax, %%eax \n\t"
1386 PREFETCH" 32(%1, %%eax, 2) \n\t"
1387 PREFETCH" 32(%2, %%eax) \n\t"
1388 PREFETCH" 32(%3, %%eax) \n\t"
1389 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
1390 "movq %%mm0, %%mm2 \n\t" // U(0)
1391 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
1392 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1393 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1395 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
1396 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
1397 "movq %%mm3, %%mm4 \n\t" // Y(0)
1398 "movq %%mm5, %%mm6 \n\t" // Y(8)
1399 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1400 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1401 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1402 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1404 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
1405 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
1406 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
1407 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1409 "addl $8, %%eax \n\t"
1410 "cmpl %4, %%eax \n\t"
1412 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
1416 #if __WORDSIZE >= 64
1418 uint64_t *ldst = (uint64_t *) dst;
1419 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1420 for(i = 0; i < chromWidth; i += 2){
1422 k = yc[0] + (uc[0] << 8) +
1423 (yc[1] << 16) + (vc[0] << 24);
1424 l = yc[2] + (uc[1] << 8) +
1425 (yc[3] << 16) + (vc[1] << 24);
1426 *ldst++ = k + (l << 32);
1433 int i, *idst = (int32_t *) dst;
1434 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1435 for(i = 0; i < chromWidth; i++){
1436 *idst++ = yc[0] + (uc[0] << 8) +
1437 (yc[1] << 16) + (vc[0] << 24);
1444 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1446 usrc += chromStride;
1447 vsrc += chromStride;
1461 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
462 * problem for anyone then tell me, and I'll fix it)
1464 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1465 unsigned int width, unsigned int height,
1466 int lumStride, int chromStride, int dstStride)
1468 //FIXME interpolate chroma
1469 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1474 * width should be a multiple of 16
1476 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1477 unsigned int width, unsigned int height,
1478 int lumStride, int chromStride, int dstStride)
1480 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1485 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1486 * problem for anyone then tell me, and I'll fix it)
1488 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1489 unsigned int width, unsigned int height,
1490 int lumStride, int chromStride, int srcStride)
1493 const unsigned chromWidth= width>>1;
1494 for(y=0; y<height; y+=2)
1498 "xorl %%eax, %%eax \n\t"
1499 "pcmpeqw %%mm7, %%mm7 \n\t"
1500 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1503 PREFETCH" 64(%0, %%eax, 4) \n\t"
1504 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1505 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1506 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1507 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1508 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1509 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1510 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1511 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1512 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1513 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1515 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1517 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
1518 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
1519 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1520 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1521 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1522 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1523 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1524 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1525 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1526 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1528 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
1530 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1531 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1532 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1533 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1534 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1535 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1536 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1537 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1539 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1540 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1542 "addl $8, %%eax \n\t"
1543 "cmpl %4, %%eax \n\t"
1545 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1553 "xorl %%eax, %%eax \n\t"
1556 PREFETCH" 64(%0, %%eax, 4) \n\t"
1557 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1558 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1559 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
1560 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
1561 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1562 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1563 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1564 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1565 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1566 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1568 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1569 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1571 "addl $8, %%eax \n\t"
1572 "cmpl %4, %%eax \n\t"
1575 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
1580 for(i=0; i<chromWidth; i++)
1582 ydst[2*i+0] = src[4*i+0];
1583 udst[i] = src[4*i+1];
1584 ydst[2*i+1] = src[4*i+2];
1585 vdst[i] = src[4*i+3];
1590 for(i=0; i<chromWidth; i++)
1592 ydst[2*i+0] = src[4*i+0];
1593 ydst[2*i+1] = src[4*i+2];
1596 udst += chromStride;
1597 vdst += chromStride;
1602 asm volatile( EMMS" \n\t"
/* YVU9 -> YV12: only the luma plane is converted (copied verbatim).
 * NOTE(review): the copy is width*height bytes, which assumes the Y plane is
 * contiguous (lumStride == width) -- confirm callers guarantee this.
 * Chroma (4x4 -> 2x2 subsampling) upscaling is not implemented; U/V output
 * planes are left untouched. */
1608 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1609 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1610 unsigned int width, unsigned int height, int lumStride, int chromStride)
1613 memcpy(ydst, ysrc, width*height);
1615 /* XXX: implement upscaling for U,V */
/* Doubles a planar 8-bit image in both dimensions (srcWidth x srcHeight ->
 * 2*srcWidth x 2*srcHeight) using bilinear-style 3:1 weighted averages.
 * MMX2/3DNow path: PAVGB applied twice approximates the (3*a+b)/4 weighting;
 * the scalar loops below handle the first/last rows and the non-MMX leftovers. */
1618 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
/* First output row: horizontal 3:1 interpolation only (no row above to blend). */
1625 for(x=0; x<srcWidth-1; x++){
1626 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1627 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1629 dst[2*srcWidth-1]= src[srcWidth-1];
1633 for(y=1; y<srcHeight; y++){
1634 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* Process the widest multiple of 16 with MMX; remainder handled in C below. */
1635 const int mmxSize= srcWidth&~15;
1637 "movl %4, %%eax \n\t"
1639 "movq (%0, %%eax), %%mm0 \n\t"
1640 "movq (%1, %%eax), %%mm1 \n\t"
1641 "movq 1(%0, %%eax), %%mm2 \n\t"
1642 "movq 1(%1, %%eax), %%mm3 \n\t"
1643 "movq -1(%0, %%eax), %%mm4 \n\t"
1644 "movq -1(%1, %%eax), %%mm5 \n\t"
/* Double PAVGB gives approx. (3*mm0 + other)/4 weighting. */
1645 PAVGB" %%mm0, %%mm5 \n\t"
1646 PAVGB" %%mm0, %%mm3 \n\t"
1647 PAVGB" %%mm0, %%mm5 \n\t"
1648 PAVGB" %%mm0, %%mm3 \n\t"
1649 PAVGB" %%mm1, %%mm4 \n\t"
1650 PAVGB" %%mm1, %%mm2 \n\t"
1651 PAVGB" %%mm1, %%mm4 \n\t"
1652 PAVGB" %%mm1, %%mm2 \n\t"
1653 "movq %%mm5, %%mm7 \n\t"
1654 "movq %%mm4, %%mm6 \n\t"
/* Interleave to produce the horizontally doubled output pixels. */
1655 "punpcklbw %%mm3, %%mm5 \n\t"
1656 "punpckhbw %%mm3, %%mm7 \n\t"
1657 "punpcklbw %%mm2, %%mm4 \n\t"
1658 "punpckhbw %%mm2, %%mm6 \n\t"
1660 MOVNTQ" %%mm5, (%2, %%eax, 2) \n\t"
1661 MOVNTQ" %%mm7, 8(%2, %%eax, 2) \n\t"
1662 MOVNTQ" %%mm4, (%3, %%eax, 2) \n\t"
1663 MOVNTQ" %%mm6, 8(%3, %%eax, 2) \n\t"
/* Non-MOVNTQ variant of the four stores above (plain movq). */
1665 "movq %%mm5, (%2, %%eax, 2) \n\t"
1666 "movq %%mm7, 8(%2, %%eax, 2) \n\t"
1667 "movq %%mm4, (%3, %%eax, 2) \n\t"
1668 "movq %%mm6, 8(%3, %%eax, 2) \n\t"
1670 "addl $8, %%eax \n\t"
1672 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1673 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
/* Pure-C fallback starts at column 1 (column 0 done explicitly below). */
1679 const int mmxSize=1;
1681 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1682 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
/* Scalar tail: 3:1 bilinear blend of the 2x2 source neighborhood. */
1684 for(x=mmxSize-1; x<srcWidth-1; x++){
1685 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1686 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1687 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1688 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1690 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1691 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* Last output row: horizontal interpolation only, like the first row. */
1701 for(x=0; x<srcWidth-1; x++){
1702 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1703 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1705 dst[2*srcWidth-1]= src[srcWidth-1];
1707 for(x=0; x<srcWidth; x++){
/* Leave MMX state clean for subsequent FPU code. */
1714 asm volatile( EMMS" \n\t"
1722 * Height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1723 * problem for anyone then tell me, and I'll fix it).
1724 * Chrominance data is taken only from every second line; the other lines are ignored. FIXME: write an HQ version.
/* UYVY (packed U0 Y0 V0 Y1 ...) -> planar YV12. Even lines: split Y and
 * deinterleave U/V; odd lines: extract Y only (chroma of odd lines is dropped,
 * see the C fallback below which reads Y from bytes 1 and 3 of each 4-byte group). */
1726 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1727 unsigned int width, unsigned int height,
1728 int lumStride, int chromStride, int srcStride)
1731 const unsigned chromWidth= width>>1;
1732 for(y=0; y<height; y+=2)
1736 "xorl %%eax, %%eax \n\t"
/* mm7 = 0x00FF repeated: mask selecting the low byte of each 16-bit word. */
1737 "pcmpeqw %%mm7, %%mm7 \n\t"
1738 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1741 PREFETCH" 64(%0, %%eax, 4) \n\t"
1742 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1743 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1744 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1745 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1746 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1747 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1748 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1749 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1750 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1751 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1753 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1755 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
1756 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
1757 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1758 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1759 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1760 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1761 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1762 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1763 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1764 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1766 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
/* Split the packed UV stream into separate U and V planes. */
1768 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1769 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1770 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1771 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1772 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1773 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1774 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1775 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1777 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1778 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1780 "addl $8, %%eax \n\t"
1781 "cmpl %4, %%eax \n\t"
1783 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
/* Second (odd) line: keep only the luma; chroma of this line is discarded. */
1791 "xorl %%eax, %%eax \n\t"
1794 PREFETCH" 64(%0, %%eax, 4) \n\t"
1795 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1796 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1797 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
1798 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
1799 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1800 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1801 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1802 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1803 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1804 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1806 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1807 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1809 "addl $8, %%eax \n\t"
1810 "cmpl %4, %%eax \n\t"
1813 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
/* Plain-C fallback, even line: U,Y,V,Y byte order. */
1818 for(i=0; i<chromWidth; i++)
1820 udst[i] = src[4*i+0];
1821 ydst[2*i+0] = src[4*i+1];
1822 vdst[i] = src[4*i+2];
1823 ydst[2*i+1] = src[4*i+3];
/* Plain-C fallback, odd line: luma only. */
1828 for(i=0; i<chromWidth; i++)
1830 ydst[2*i+0] = src[4*i+1];
1831 ydst[2*i+1] = src[4*i+3];
1834 udst += chromStride;
1835 vdst += chromStride;
1840 asm volatile( EMMS" \n\t"
1848 * Height should be a multiple of 2 and width should be a multiple of 2 (if this is a
1849 * problem for anyone then tell me, and I'll fix it).
1850 * Chrominance data is taken only from every second line; the other lines are ignored in the C version. FIXME: write an HQ version.
/* BGR24 -> planar YV12 using pmaddwd against the bgr2YCoeff/bgr2UCoeff/
 * bgr2VCoeff constant tables; chroma is computed from a 2x2 average of
 * source pixels. The MMX main loop covers rows [0, height-2); the trailing
 * rows fall through to the C loops at the bottom. */
1852 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1853 unsigned int width, unsigned int height,
1854 int lumStride, int chromStride, int srcStride)
1857 const unsigned chromWidth= width>>1;
1859 for(y=0; y<height-2; y+=2)
/* --- Luma pass: 8 output Y values per iteration; index counts up from -width
 * to 0, so %0/%1 point at the END of the row and %ebx = 3*%eax (BGR24). --- */
1865 "movl %2, %%eax \n\t"
1866 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1867 "movq "MANGLE(w1111)", %%mm5 \n\t"
1868 "pxor %%mm7, %%mm7 \n\t"
1869 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1872 PREFETCH" 64(%0, %%ebx) \n\t"
1873 "movd (%0, %%ebx), %%mm0 \n\t"
1874 "movd 3(%0, %%ebx), %%mm1 \n\t"
1875 "punpcklbw %%mm7, %%mm0 \n\t"
1876 "punpcklbw %%mm7, %%mm1 \n\t"
1877 "movd 6(%0, %%ebx), %%mm2 \n\t"
1878 "movd 9(%0, %%ebx), %%mm3 \n\t"
1879 "punpcklbw %%mm7, %%mm2 \n\t"
1880 "punpcklbw %%mm7, %%mm3 \n\t"
/* Multiply-accumulate B,G,R against the Y coefficients. */
1881 "pmaddwd %%mm6, %%mm0 \n\t"
1882 "pmaddwd %%mm6, %%mm1 \n\t"
1883 "pmaddwd %%mm6, %%mm2 \n\t"
1884 "pmaddwd %%mm6, %%mm3 \n\t"
1885 #ifndef FAST_BGR2YV12
1886 "psrad $8, %%mm0 \n\t"
1887 "psrad $8, %%mm1 \n\t"
1888 "psrad $8, %%mm2 \n\t"
1889 "psrad $8, %%mm3 \n\t"
1891 "packssdw %%mm1, %%mm0 \n\t"
1892 "packssdw %%mm3, %%mm2 \n\t"
/* pmaddwd with w1111 sums the horizontal pairs. */
1893 "pmaddwd %%mm5, %%mm0 \n\t"
1894 "pmaddwd %%mm5, %%mm2 \n\t"
1895 "packssdw %%mm2, %%mm0 \n\t"
1896 "psraw $7, %%mm0 \n\t"
/* Second group of 4 pixels of this iteration. */
1898 "movd 12(%0, %%ebx), %%mm4 \n\t"
1899 "movd 15(%0, %%ebx), %%mm1 \n\t"
1900 "punpcklbw %%mm7, %%mm4 \n\t"
1901 "punpcklbw %%mm7, %%mm1 \n\t"
1902 "movd 18(%0, %%ebx), %%mm2 \n\t"
1903 "movd 21(%0, %%ebx), %%mm3 \n\t"
1904 "punpcklbw %%mm7, %%mm2 \n\t"
1905 "punpcklbw %%mm7, %%mm3 \n\t"
1906 "pmaddwd %%mm6, %%mm4 \n\t"
1907 "pmaddwd %%mm6, %%mm1 \n\t"
1908 "pmaddwd %%mm6, %%mm2 \n\t"
1909 "pmaddwd %%mm6, %%mm3 \n\t"
1910 #ifndef FAST_BGR2YV12
1911 "psrad $8, %%mm4 \n\t"
1912 "psrad $8, %%mm1 \n\t"
1913 "psrad $8, %%mm2 \n\t"
1914 "psrad $8, %%mm3 \n\t"
1916 "packssdw %%mm1, %%mm4 \n\t"
1917 "packssdw %%mm3, %%mm2 \n\t"
1918 "pmaddwd %%mm5, %%mm4 \n\t"
1919 "pmaddwd %%mm5, %%mm2 \n\t"
1920 "addl $24, %%ebx \n\t"
1921 "packssdw %%mm2, %%mm4 \n\t"
1922 "psraw $7, %%mm4 \n\t"
/* Pack 8 Y bytes, add the Y offset (levels shift) and store. */
1924 "packuswb %%mm4, %%mm0 \n\t"
1925 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1927 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
1928 "addl $8, %%eax \n\t"
1930 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
/* --- Chroma pass: reads TWO source rows (%0 and %1) and averages 2x2 blocks;
 * produces 4 U and 4 V bytes per iteration. %ebx = 6*%eax (2 BGR pixels). --- */
1938 "movl %4, %%eax \n\t"
1939 "movq "MANGLE(w1111)", %%mm5 \n\t"
1940 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1941 "pxor %%mm7, %%mm7 \n\t"
1942 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1943 "addl %%ebx, %%ebx \n\t"
1946 PREFETCH" 64(%0, %%ebx) \n\t"
1947 PREFETCH" 64(%1, %%ebx) \n\t"
1948 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* Fast 2x2 average via PAVGB (vertical, then horizontal via psrlq $24). */
1949 "movq (%0, %%ebx), %%mm0 \n\t"
1950 "movq (%1, %%ebx), %%mm1 \n\t"
1951 "movq 6(%0, %%ebx), %%mm2 \n\t"
1952 "movq 6(%1, %%ebx), %%mm3 \n\t"
1953 PAVGB" %%mm1, %%mm0 \n\t"
1954 PAVGB" %%mm3, %%mm2 \n\t"
1955 "movq %%mm0, %%mm1 \n\t"
1956 "movq %%mm2, %%mm3 \n\t"
1957 "psrlq $24, %%mm0 \n\t"
1958 "psrlq $24, %%mm2 \n\t"
1959 PAVGB" %%mm1, %%mm0 \n\t"
1960 PAVGB" %%mm3, %%mm2 \n\t"
1961 "punpcklbw %%mm7, %%mm0 \n\t"
1962 "punpcklbw %%mm7, %%mm2 \n\t"
/* Plain-MMX 2x2 average: sum four pixels in 16-bit lanes, then >>2. */
1964 "movd (%0, %%ebx), %%mm0 \n\t"
1965 "movd (%1, %%ebx), %%mm1 \n\t"
1966 "movd 3(%0, %%ebx), %%mm2 \n\t"
1967 "movd 3(%1, %%ebx), %%mm3 \n\t"
1968 "punpcklbw %%mm7, %%mm0 \n\t"
1969 "punpcklbw %%mm7, %%mm1 \n\t"
1970 "punpcklbw %%mm7, %%mm2 \n\t"
1971 "punpcklbw %%mm7, %%mm3 \n\t"
1972 "paddw %%mm1, %%mm0 \n\t"
1973 "paddw %%mm3, %%mm2 \n\t"
1974 "paddw %%mm2, %%mm0 \n\t"
1975 "movd 6(%0, %%ebx), %%mm4 \n\t"
1976 "movd 6(%1, %%ebx), %%mm1 \n\t"
1977 "movd 9(%0, %%ebx), %%mm2 \n\t"
1978 "movd 9(%1, %%ebx), %%mm3 \n\t"
1979 "punpcklbw %%mm7, %%mm4 \n\t"
1980 "punpcklbw %%mm7, %%mm1 \n\t"
1981 "punpcklbw %%mm7, %%mm2 \n\t"
1982 "punpcklbw %%mm7, %%mm3 \n\t"
1983 "paddw %%mm1, %%mm4 \n\t"
1984 "paddw %%mm3, %%mm2 \n\t"
1985 "paddw %%mm4, %%mm2 \n\t"
1986 "psrlw $2, %%mm0 \n\t"
1987 "psrlw $2, %%mm2 \n\t"
/* Apply U (mm6) and V (bgr2VCoeff) coefficient sets to the averaged pixels. */
1989 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1990 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1992 "pmaddwd %%mm0, %%mm1 \n\t"
1993 "pmaddwd %%mm2, %%mm3 \n\t"
1994 "pmaddwd %%mm6, %%mm0 \n\t"
1995 "pmaddwd %%mm6, %%mm2 \n\t"
1996 #ifndef FAST_BGR2YV12
1997 "psrad $8, %%mm0 \n\t"
1998 "psrad $8, %%mm1 \n\t"
1999 "psrad $8, %%mm2 \n\t"
2000 "psrad $8, %%mm3 \n\t"
2002 "packssdw %%mm2, %%mm0 \n\t"
2003 "packssdw %%mm3, %%mm1 \n\t"
2004 "pmaddwd %%mm5, %%mm0 \n\t"
2005 "pmaddwd %%mm5, %%mm1 \n\t"
2006 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2007 "psraw $7, %%mm0 \n\t"
/* Second pair of 2x2 blocks (pixels 4..7 of the two rows). */
2009 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2010 "movq 12(%0, %%ebx), %%mm4 \n\t"
2011 "movq 12(%1, %%ebx), %%mm1 \n\t"
2012 "movq 18(%0, %%ebx), %%mm2 \n\t"
2013 "movq 18(%1, %%ebx), %%mm3 \n\t"
2014 PAVGB" %%mm1, %%mm4 \n\t"
2015 PAVGB" %%mm3, %%mm2 \n\t"
2016 "movq %%mm4, %%mm1 \n\t"
2017 "movq %%mm2, %%mm3 \n\t"
2018 "psrlq $24, %%mm4 \n\t"
2019 "psrlq $24, %%mm2 \n\t"
2020 PAVGB" %%mm1, %%mm4 \n\t"
2021 PAVGB" %%mm3, %%mm2 \n\t"
2022 "punpcklbw %%mm7, %%mm4 \n\t"
2023 "punpcklbw %%mm7, %%mm2 \n\t"
2025 "movd 12(%0, %%ebx), %%mm4 \n\t"
2026 "movd 12(%1, %%ebx), %%mm1 \n\t"
2027 "movd 15(%0, %%ebx), %%mm2 \n\t"
2028 "movd 15(%1, %%ebx), %%mm3 \n\t"
2029 "punpcklbw %%mm7, %%mm4 \n\t"
2030 "punpcklbw %%mm7, %%mm1 \n\t"
2031 "punpcklbw %%mm7, %%mm2 \n\t"
2032 "punpcklbw %%mm7, %%mm3 \n\t"
2033 "paddw %%mm1, %%mm4 \n\t"
2034 "paddw %%mm3, %%mm2 \n\t"
2035 "paddw %%mm2, %%mm4 \n\t"
2036 "movd 18(%0, %%ebx), %%mm5 \n\t"
2037 "movd 18(%1, %%ebx), %%mm1 \n\t"
2038 "movd 21(%0, %%ebx), %%mm2 \n\t"
2039 "movd 21(%1, %%ebx), %%mm3 \n\t"
2040 "punpcklbw %%mm7, %%mm5 \n\t"
2041 "punpcklbw %%mm7, %%mm1 \n\t"
2042 "punpcklbw %%mm7, %%mm2 \n\t"
2043 "punpcklbw %%mm7, %%mm3 \n\t"
2044 "paddw %%mm1, %%mm5 \n\t"
2045 "paddw %%mm3, %%mm2 \n\t"
2046 "paddw %%mm5, %%mm2 \n\t"
/* mm5 was clobbered above in the non-MMX2 path; reload w1111. */
2047 "movq "MANGLE(w1111)", %%mm5 \n\t"
2048 "psrlw $2, %%mm4 \n\t"
2049 "psrlw $2, %%mm2 \n\t"
2051 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2052 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2054 "pmaddwd %%mm4, %%mm1 \n\t"
2055 "pmaddwd %%mm2, %%mm3 \n\t"
2056 "pmaddwd %%mm6, %%mm4 \n\t"
2057 "pmaddwd %%mm6, %%mm2 \n\t"
2058 #ifndef FAST_BGR2YV12
2059 "psrad $8, %%mm4 \n\t"
2060 "psrad $8, %%mm1 \n\t"
2061 "psrad $8, %%mm2 \n\t"
2062 "psrad $8, %%mm3 \n\t"
2064 "packssdw %%mm2, %%mm4 \n\t"
2065 "packssdw %%mm3, %%mm1 \n\t"
2066 "pmaddwd %%mm5, %%mm4 \n\t"
2067 "pmaddwd %%mm5, %%mm1 \n\t"
2068 "addl $24, %%ebx \n\t"
2069 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2070 "psraw $7, %%mm4 \n\t"
/* Reorder to UUUU/VVVV, bias with the UV offset, and store 4 bytes to each plane. */
2072 "movq %%mm0, %%mm1 \n\t"
2073 "punpckldq %%mm4, %%mm0 \n\t"
2074 "punpckhdq %%mm4, %%mm1 \n\t"
2075 "packsswb %%mm1, %%mm0 \n\t"
2076 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2078 "movd %%mm0, (%2, %%eax) \n\t"
2079 "punpckhdq %%mm0, %%mm0 \n\t"
2080 "movd %%mm0, (%3, %%eax) \n\t"
2081 "addl $4, %%eax \n\t"
2083 : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2087 udst += chromStride;
2088 vdst += chromStride;
2092 asm volatile( EMMS" \n\t"
/* C fallback for the remaining rows (and the whole image without MMX). */
2098 for(; y<height; y+=2)
2101 for(i=0; i<chromWidth; i++)
2103 unsigned int b= src[6*i+0];
2104 unsigned int g= src[6*i+1];
2105 unsigned int r= src[6*i+2];
2107 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2108 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2109 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2119 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/* Second line of the pair: luma only (chroma taken from the first line). */
2125 for(i=0; i<chromWidth; i++)
2127 unsigned int b= src[6*i+0];
2128 unsigned int g= src[6*i+1];
2129 unsigned int r= src[6*i+2];
2131 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2139 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2142 udst += chromStride;
2143 vdst += chromStride;
/* Interleaves two byte planes: dest[2i]=src1[i], dest[2i+1]=src2[i], row by row.
 * SSE2 path uses movdqa/movntdq (16 bytes per source per iteration); MMX path
 * uses punpcklbw/punpckhbw with MOVNTQ. Both process width&~15 bytes, then the
 * scalar loop finishes the remainder.
 * NOTE(review): `width` is unsigned, so for width < 16 the "width-15" loop bound
 * wraps around to a huge value -- confirm callers never pass width < 16. */
2149 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2150 unsigned width, unsigned height, int src1Stride,
2151 int src2Stride, int dstStride){
2154 for(h=0; h < height; h++)
2161 "xorl %%eax, %%eax \n\t"
2163 PREFETCH" 64(%1, %%eax) \n\t"
2164 PREFETCH" 64(%2, %%eax) \n\t"
/* Two copies of src1 so the low and high halves can each be interleaved. */
2165 "movdqa (%1, %%eax), %%xmm0 \n\t"
2166 "movdqa (%1, %%eax), %%xmm1 \n\t"
2167 "movdqa (%2, %%eax), %%xmm2 \n\t"
2168 "punpcklbw %%xmm2, %%xmm0 \n\t"
2169 "punpckhbw %%xmm2, %%xmm1 \n\t"
2170 "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2171 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2172 "addl $16, %%eax \n\t"
2173 "cmpl %3, %%eax \n\t"
2175 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
/* MMX variant: same interleave, 8 bytes per register. */
2180 "xorl %%eax, %%eax \n\t"
2182 PREFETCH" 64(%1, %%eax) \n\t"
2183 PREFETCH" 64(%2, %%eax) \n\t"
2184 "movq (%1, %%eax), %%mm0 \n\t"
2185 "movq 8(%1, %%eax), %%mm2 \n\t"
2186 "movq %%mm0, %%mm1 \n\t"
2187 "movq %%mm2, %%mm3 \n\t"
2188 "movq (%2, %%eax), %%mm4 \n\t"
2189 "movq 8(%2, %%eax), %%mm5 \n\t"
2190 "punpcklbw %%mm4, %%mm0 \n\t"
2191 "punpckhbw %%mm4, %%mm1 \n\t"
2192 "punpcklbw %%mm5, %%mm2 \n\t"
2193 "punpckhbw %%mm5, %%mm3 \n\t"
2194 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
2195 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
2196 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2197 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2198 "addl $16, %%eax \n\t"
2199 "cmpl %3, %%eax \n\t"
2201 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
/* Scalar tail for the last width%16 bytes of each row. */
2205 for(w= (width&(~15)); w < width; w++)
2207 dest[2*w+0] = src1[w];
2208 dest[2*w+1] = src2[w];
/* Pure-C path: whole row. */
2211 for(w=0; w < width; w++)
2213 dest[2*w+0] = src1[w];
2214 dest[2*w+1] = src2[w];
/* Upscales two chroma planes 2x in each direction: horizontally by duplicating
 * each byte (punpcklbw/punpckhbw of a register with itself), vertically by
 * reading the same source row for each pair of output rows (srcStride*(y>>1)).
 * The same unrolled 32-byte kernel is applied to src1->dst1 and src2->dst2. */
2230 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2231 uint8_t *dst1, uint8_t *dst2,
2232 unsigned width, unsigned height,
2233 int srcStride1, int srcStride2,
2234 int dstStride1, int dstStride2)
2238 w=width/2; h=height/2;
2243 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* First plane: every output row y reads source row y/2. */
2246 const uint8_t* s1=src1+srcStride1*(y>>1);
2247 uint8_t* d=dst1+dstStride1*y;
2254 "movq %1, %%mm0\n\t"
2255 "movq 8%1, %%mm2\n\t"
2256 "movq 16%1, %%mm4\n\t"
2257 "movq 24%1, %%mm6\n\t"
2258 "movq %%mm0, %%mm1\n\t"
2259 "movq %%mm2, %%mm3\n\t"
2260 "movq %%mm4, %%mm5\n\t"
2261 "movq %%mm6, %%mm7\n\t"
/* unpack a register with itself => each source byte appears twice. */
2262 "punpcklbw %%mm0, %%mm0\n\t"
2263 "punpckhbw %%mm1, %%mm1\n\t"
2264 "punpcklbw %%mm2, %%mm2\n\t"
2265 "punpckhbw %%mm3, %%mm3\n\t"
2266 "punpcklbw %%mm4, %%mm4\n\t"
2267 "punpckhbw %%mm5, %%mm5\n\t"
2268 "punpcklbw %%mm6, %%mm6\n\t"
2269 "punpckhbw %%mm7, %%mm7\n\t"
2270 MOVNTQ" %%mm0, %0\n\t"
2271 MOVNTQ" %%mm1, 8%0\n\t"
2272 MOVNTQ" %%mm2, 16%0\n\t"
2273 MOVNTQ" %%mm3, 24%0\n\t"
2274 MOVNTQ" %%mm4, 32%0\n\t"
2275 MOVNTQ" %%mm5, 40%0\n\t"
2276 MOVNTQ" %%mm6, 48%0\n\t"
2277 MOVNTQ" %%mm7, 56%0"
/* Scalar tail: duplicate remaining bytes one at a time. */
2283 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* Second plane: identical processing for src2/dst2. */
2286 const uint8_t* s2=src2+srcStride2*(y>>1);
2287 uint8_t* d=dst2+dstStride2*y;
2294 "movq %1, %%mm0\n\t"
2295 "movq 8%1, %%mm2\n\t"
2296 "movq 16%1, %%mm4\n\t"
2297 "movq 24%1, %%mm6\n\t"
2298 "movq %%mm0, %%mm1\n\t"
2299 "movq %%mm2, %%mm3\n\t"
2300 "movq %%mm4, %%mm5\n\t"
2301 "movq %%mm6, %%mm7\n\t"
2302 "punpcklbw %%mm0, %%mm0\n\t"
2303 "punpckhbw %%mm1, %%mm1\n\t"
2304 "punpcklbw %%mm2, %%mm2\n\t"
2305 "punpckhbw %%mm3, %%mm3\n\t"
2306 "punpcklbw %%mm4, %%mm4\n\t"
2307 "punpckhbw %%mm5, %%mm5\n\t"
2308 "punpcklbw %%mm6, %%mm6\n\t"
2309 "punpckhbw %%mm7, %%mm7\n\t"
2310 MOVNTQ" %%mm0, %0\n\t"
2311 MOVNTQ" %%mm1, 8%0\n\t"
2312 MOVNTQ" %%mm2, 16%0\n\t"
2313 MOVNTQ" %%mm3, 24%0\n\t"
2314 MOVNTQ" %%mm4, 32%0\n\t"
2315 MOVNTQ" %%mm5, 40%0\n\t"
2316 MOVNTQ" %%mm6, 48%0\n\t"
2317 MOVNTQ" %%mm7, 56%0"
2323 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2334 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2336 unsigned width, unsigned height,
2337 int srcStride1, int srcStride2,
2338 int srcStride3, int dstStride)
2340 unsigned y,x,x2,w,h;
2341 w=width/2; h=height;
2347 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)),"m"(*(src3+srcStride3)):"memory");
2350 const uint8_t* yp=src1+srcStride1*y;
2351 const uint8_t* up=src2+srcStride2*(y>>2);
2352 const uint8_t* vp=src3+srcStride3*(y>>2);
2353 uint8_t* d=dst+dstStride*y;
2357 for(;x<w;x+=8,x2+=32)
2363 "movq %1, %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2364 "movq %2, %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2365 "movq %3, %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2366 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2367 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2368 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2369 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2370 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2371 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2372 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2374 "movq %%mm1, %%mm6\n\t"
2375 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2376 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2377 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2378 MOVNTQ" %%mm0, %0\n\t"
2379 MOVNTQ" %%mm3, 8%0\n\t"
2381 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2382 "movq 8%1, %%mm0\n\t"
2383 "movq %%mm0, %%mm3\n\t"
2384 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2385 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2386 MOVNTQ" %%mm0, 16%0\n\t"
2387 MOVNTQ" %%mm3, 24%0\n\t"
2389 "movq %%mm4, %%mm6\n\t"
2390 "movq 16%1, %%mm0\n\t"
2391 "movq %%mm0, %%mm3\n\t"
2392 "punpcklbw %%mm5, %%mm4\n\t"
2393 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2394 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2395 MOVNTQ" %%mm0, 32%0\n\t"
2396 MOVNTQ" %%mm3, 40%0\n\t"
2398 "punpckhbw %%mm5, %%mm6\n\t"
2399 "movq 24%1, %%mm0\n\t"
2400 "movq %%mm0, %%mm3\n\t"
2401 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2402 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2403 MOVNTQ" %%mm0, 48%0\n\t"
2404 MOVNTQ" %%mm3, 56%0\n\t"
2407 :"m"(yp[x2]),"m"(up[x]),"m"(vp[x])