3 * rgb2rgb.c, Software RGB to RGB convertor
4 * pluralize by Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
12 #include <inttypes.h> /* for __WORDSIZE */
15 // #warning You have misconfigured system and probably will lose performance!
16 #define __WORDSIZE MP_WORDSIZE
// CPU-specific instruction selection for the asm templates below.
// NOTE(review): this excerpt elides the opening #if (3DNow! branch guard)
// and the surrounding conditionals — the embedded numbering skips lines.
// 3DNow! (K6-2/K7): AMD prefetch/prefetchw, pavgusb for byte averaging.
34 #define PREFETCH "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB "pavgusb"
37 #elif defined ( HAVE_MMX2 )
// MMX2/SSE: non-temporal read hint and prefetch-into-all-levels for writes.
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
// Plain-MMX fallback: "/nop" keeps the asm strings syntactically valid
// while emitting no prefetch at all.
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
47 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
// MOVNTQ = non-temporal (cache-bypassing) store; SFENCE orders those stores
// before the function returns.
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
/*
 * RENAME(rgb24to32): packed 24bpp -> 32bpp.
 * MMX path: movd+punpckldq gather 3-byte pixels at byte offsets 0/3, 6/9,
 * 12/15, 18/21 (8 pixels per iteration), mask32 (held in mm7) clears the
 * stray 4th byte, and results are stored with non-temporal MOVNTQ.
 * NOTE(review): excerpt elides interior lines (loop control, the mm0/mm1
 * initial movd loads, mm3 store, scalar tail) — embedded numbering skips.
 */
61 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
64 const uint8_t *s = src;
67 const uint8_t *mm_end;
71 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
// mm7 = mask32 for the whole loop (clears every 4th byte after the gather).
73 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
79 "punpckldq 3%1, %%mm0\n\t"
81 "punpckldq 9%1, %%mm1\n\t"
82 "movd 12%1, %%mm2\n\t"
83 "punpckldq 15%1, %%mm2\n\t"
84 "movd 18%1, %%mm3\n\t"
85 "punpckldq 21%1, %%mm3\n\t"
86 "pand %%mm7, %%mm0\n\t"
87 "pand %%mm7, %%mm1\n\t"
88 "pand %%mm7, %%mm2\n\t"
89 "pand %%mm7, %%mm3\n\t"
90 MOVNTQ" %%mm0, %0\n\t"
91 MOVNTQ" %%mm1, 8%0\n\t"
92 MOVNTQ" %%mm2, 16%0\n\t"
// Flush write-combining buffers and leave MMX state before returning.
100 __asm __volatile(SFENCE:::"memory");
101 __asm __volatile(EMMS:::"memory");
/*
 * RENAME(rgb32to24): packed 32bpp -> 24bpp (drops every 4th byte).
 * MMX path: loads 8 pixels (32 bytes), closes the 1-byte-per-pixel gap with
 * shift/mask/por recombination (the mask24* constants appear as asm
 * operands), then emits 24 packed bytes via MOVNTQ.
 * NOTE(review): excerpt elides interior lines (initial movq of %1, the pand
 * lines using mask24*, the third MOVNTQ, loop control, scalar tail).
 */
112 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
115 const uint8_t *s = src;
118 const uint8_t *mm_end;
122 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
129 "movq 8%1, %%mm1\n\t"
130 "movq 16%1, %%mm4\n\t"
131 "movq 24%1, %%mm5\n\t"
132 "movq %%mm0, %%mm2\n\t"
133 "movq %%mm1, %%mm3\n\t"
134 "movq %%mm4, %%mm6\n\t"
135 "movq %%mm5, %%mm7\n\t"
// Shift copies right 8 bits so pixel payloads can be merged over the gaps.
136 "psrlq $8, %%mm2\n\t"
137 "psrlq $8, %%mm3\n\t"
138 "psrlq $8, %%mm6\n\t"
139 "psrlq $8, %%mm7\n\t"
148 "por %%mm2, %%mm0\n\t"
149 "por %%mm3, %%mm1\n\t"
150 "por %%mm6, %%mm4\n\t"
151 "por %%mm7, %%mm5\n\t"
// Re-align the 6-byte groups across quadword boundaries.
153 "movq %%mm1, %%mm2\n\t"
154 "movq %%mm4, %%mm3\n\t"
155 "psllq $48, %%mm2\n\t"
156 "psllq $32, %%mm3\n\t"
159 "por %%mm2, %%mm0\n\t"
160 "psrlq $16, %%mm1\n\t"
161 "psrlq $32, %%mm4\n\t"
162 "psllq $16, %%mm5\n\t"
163 "por %%mm3, %%mm1\n\t"
165 "por %%mm5, %%mm4\n\t"
167 MOVNTQ" %%mm0, %0\n\t"
168 MOVNTQ" %%mm1, 8%0\n\t"
171 :"m"(*s),"m"(mask24l),
172 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
177 __asm __volatile(SFENCE:::"memory");
178 __asm __volatile(EMMS:::"memory");
190 Original by Strepto/Astral
191 ported to gcc & bugfixed : A'rpi
192 MMX2, 3DNOW optimization by Nick Kurshev
193 32-bit C version, and the and&add trick by Michael Niedermayer
/*
 * RENAME(rgb15to16): 15bpp (0RRRRRGG GGGBBBBB) -> 16bpp (RRRRRGGG GGGBBBBB).
 * Uses the and&add trick: x + (x & 0x7FE0) adds the G/R fields to
 * themselves, i.e. shifts bits 5..14 left by one while blue stays put —
 * visible verbatim in the scalar tails below.
 * NOTE(review): excerpt elides interior lines (loop control, second MOVNTQ,
 * braces); embedded numbering skips.
 */
195 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
197 register const uint8_t* s=src;
198 register uint8_t* d=dst;
199 register const uint8_t *end;
200 const uint8_t *mm_end;
203 __asm __volatile(PREFETCH" %0"::"m"(*s));
// mm4 = mask15s, the G/R field mask used by the paddw trick below.
204 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
211 "movq 8%1, %%mm2\n\t"
212 "movq %%mm0, %%mm1\n\t"
213 "movq %%mm2, %%mm3\n\t"
214 "pand %%mm4, %%mm0\n\t"
215 "pand %%mm4, %%mm2\n\t"
// paddw = the "add masked field to itself" shift-by-one.
216 "paddw %%mm1, %%mm0\n\t"
217 "paddw %%mm3, %%mm2\n\t"
218 MOVNTQ" %%mm0, %0\n\t"
226 __asm __volatile(SFENCE:::"memory");
227 __asm __volatile(EMMS:::"memory");
// 32-bit scalar tail: two pixels at a time.
232 register unsigned x= *((uint32_t *)s);
233 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
// 16-bit scalar tail: final odd pixel.
239 register unsigned short x= *((uint16_t *)s);
240 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/*
 * RENAME(bgr24torgb24): swaps the R and B bytes of packed 24bpp data.
 * NOTE(review): only the loop header is visible in this excerpt — the loop
 * body performing the actual byte swap is elided; confirm against the full
 * source before relying on details.
 */
244 static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
246 unsigned j,i,num_pixels=src_size/3;
247 for(i=0,j=0; j<num_pixels; i+=3,j+=3)
/*
 * RENAME(rgb16to15): 16bpp (565) -> 15bpp (555); drops the low green bit:
 * result = ((x>>1) & 0x7FE0) | (x & 0x001F) — visible in the scalar tails.
 * MMX path does the same on 8 pixels per iteration with mask15rg (mm7,
 * shifted G/R fields) and mask15b (mm6, untouched blue).
 * NOTE(review): excerpt elides interior lines (loop control, second MOVNTQ).
 */
255 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
257 register const uint8_t* s=src;
258 register uint8_t* d=dst;
259 register const uint8_t *end;
260 const uint8_t *mm_end;
263 __asm __volatile(PREFETCH" %0"::"m"(*s));
264 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
265 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
272 "movq 8%1, %%mm2\n\t"
273 "movq %%mm0, %%mm1\n\t"
274 "movq %%mm2, %%mm3\n\t"
275 "psrlq $1, %%mm0\n\t"
276 "psrlq $1, %%mm2\n\t"
277 "pand %%mm7, %%mm0\n\t"
278 "pand %%mm7, %%mm2\n\t"
279 "pand %%mm6, %%mm1\n\t"
280 "pand %%mm6, %%mm3\n\t"
281 "por %%mm1, %%mm0\n\t"
282 "por %%mm3, %%mm2\n\t"
283 MOVNTQ" %%mm0, %0\n\t"
291 __asm __volatile(SFENCE:::"memory");
292 __asm __volatile(EMMS:::"memory");
// 32-bit scalar tail: two pixels at a time.
297 register uint32_t x= *((uint32_t *)s);
298 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
// 16-bit scalar tail: final odd pixel.
304 register uint16_t x= *((uint16_t *)s);
305 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/*
 * RENAME(rgb32to16): packed 32bpp -> 16bpp (565).
 * MMX path packs 4 pixels per iteration: blue >>3, green >>5 then masked
 * (mm6 — presumably green_16mask, its movq load is elided), red >>8 then
 * masked (mm7 — presumably red_16mask), or'd and MOVNTQ'd.
 * Scalar tail: *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 * NOTE(review): excerpt elides interior lines (mask register loads, loop
 * control, the b/g/r byte reads of the tail) — embedded numbering skips.
 */
311 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
313 const uint8_t *s = src;
316 const uint8_t *mm_end;
318 uint16_t *d = (uint16_t *)dst;
321 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
325 ::"m"(red_16mask),"m"(green_16mask));
332 "movd 4%1, %%mm3\n\t"
333 "punpckldq 8%1, %%mm0\n\t"
334 "punpckldq 12%1, %%mm3\n\t"
335 "movq %%mm0, %%mm1\n\t"
336 "movq %%mm0, %%mm2\n\t"
337 "movq %%mm3, %%mm4\n\t"
338 "movq %%mm3, %%mm5\n\t"
// Blue: >>3 puts the top 5 blue bits at bit 0.
339 "psrlq $3, %%mm0\n\t"
340 "psrlq $3, %%mm3\n\t"
// Green: >>5 aligns the 6 green bits at bit 5.
343 "psrlq $5, %%mm1\n\t"
344 "psrlq $5, %%mm4\n\t"
345 "pand %%mm6, %%mm1\n\t"
346 "pand %%mm6, %%mm4\n\t"
// Red: >>8 aligns the 5 red bits at bit 11.
347 "psrlq $8, %%mm2\n\t"
348 "psrlq $8, %%mm5\n\t"
349 "pand %%mm7, %%mm2\n\t"
350 "pand %%mm7, %%mm5\n\t"
351 "por %%mm1, %%mm0\n\t"
352 "por %%mm4, %%mm3\n\t"
353 "por %%mm2, %%mm0\n\t"
354 "por %%mm5, %%mm3\n\t"
355 "psllq $16, %%mm3\n\t"
356 "por %%mm3, %%mm0\n\t"
357 MOVNTQ" %%mm0, %0\n\t"
358 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
362 __asm __volatile(SFENCE:::"memory");
363 __asm __volatile(EMMS:::"memory");
// Scalar tail; alpha-byte position differs by endianness.
367 #ifndef WORDS_BIGENDIAN
372 const int a= *s++; /*skip*/
377 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
378 #ifndef WORDS_BIGENDIAN
/*
 * RENAME(rgb32tobgr16): packed 32bpp -> 16bpp (565) with R/B swapped.
 * Differs from rgb32to16 by shifting blue UP (<<8, masked by mm7) into the
 * top field and red DOWN (>>19) into the low field.
 * Scalar tail (b/g/r reads elided): (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 * NOTE(review): excerpt elides interior lines (mask register loads, the
 * pand after the >>19 shifts, loop control) — embedded numbering skips.
 */
384 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
386 const uint8_t *s = src;
389 const uint8_t *mm_end;
391 uint16_t *d = (uint16_t *)dst;
394 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
398 ::"m"(red_16mask),"m"(green_16mask));
405 "movd 4%1, %%mm3\n\t"
406 "punpckldq 8%1, %%mm0\n\t"
407 "punpckldq 12%1, %%mm3\n\t"
408 "movq %%mm0, %%mm1\n\t"
409 "movq %%mm0, %%mm2\n\t"
410 "movq %%mm3, %%mm4\n\t"
411 "movq %%mm3, %%mm5\n\t"
412 "psllq $8, %%mm0\n\t"
413 "psllq $8, %%mm3\n\t"
414 "pand %%mm7, %%mm0\n\t"
415 "pand %%mm7, %%mm3\n\t"
416 "psrlq $5, %%mm1\n\t"
417 "psrlq $5, %%mm4\n\t"
418 "pand %%mm6, %%mm1\n\t"
419 "pand %%mm6, %%mm4\n\t"
420 "psrlq $19, %%mm2\n\t"
421 "psrlq $19, %%mm5\n\t"
424 "por %%mm1, %%mm0\n\t"
425 "por %%mm4, %%mm3\n\t"
426 "por %%mm2, %%mm0\n\t"
427 "por %%mm5, %%mm3\n\t"
428 "psllq $16, %%mm3\n\t"
429 "por %%mm3, %%mm0\n\t"
430 MOVNTQ" %%mm0, %0\n\t"
431 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
435 __asm __volatile(SFENCE:::"memory");
436 __asm __volatile(EMMS:::"memory");
443 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * RENAME(rgb32to15): packed 32bpp -> 15bpp (555).
 * Same layout as rgb32to16 but with 5-bit green: green >>6 (masked via
 * mm6), red >>9 (masked via mm7), blue >>3.
 * Scalar tail: *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 * NOTE(review): excerpt elides interior lines (mask register loads, loop
 * control, the b/g/r reads of the tail) — embedded numbering skips.
 */
448 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
450 const uint8_t *s = src;
453 const uint8_t *mm_end;
455 uint16_t *d = (uint16_t *)dst;
458 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
462 ::"m"(red_15mask),"m"(green_15mask));
469 "movd 4%1, %%mm3\n\t"
470 "punpckldq 8%1, %%mm0\n\t"
471 "punpckldq 12%1, %%mm3\n\t"
472 "movq %%mm0, %%mm1\n\t"
473 "movq %%mm0, %%mm2\n\t"
474 "movq %%mm3, %%mm4\n\t"
475 "movq %%mm3, %%mm5\n\t"
476 "psrlq $3, %%mm0\n\t"
477 "psrlq $3, %%mm3\n\t"
480 "psrlq $6, %%mm1\n\t"
481 "psrlq $6, %%mm4\n\t"
482 "pand %%mm6, %%mm1\n\t"
483 "pand %%mm6, %%mm4\n\t"
484 "psrlq $9, %%mm2\n\t"
485 "psrlq $9, %%mm5\n\t"
486 "pand %%mm7, %%mm2\n\t"
487 "pand %%mm7, %%mm5\n\t"
488 "por %%mm1, %%mm0\n\t"
489 "por %%mm4, %%mm3\n\t"
490 "por %%mm2, %%mm0\n\t"
491 "por %%mm5, %%mm3\n\t"
492 "psllq $16, %%mm3\n\t"
493 "por %%mm3, %%mm0\n\t"
494 MOVNTQ" %%mm0, %0\n\t"
495 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
499 __asm __volatile(SFENCE:::"memory");
500 __asm __volatile(EMMS:::"memory");
507 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * RENAME(rgb32tobgr15): packed 32bpp -> 15bpp (555) with R/B swapped.
 * Blue is shifted UP (<<7, masked by mm7) into the top field, green >>6
 * (masked by mm6), red >>19 into the low field.
 * Scalar tail: *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 * NOTE(review): excerpt elides interior lines (mask register loads, the
 * pand after >>19, loop control) — embedded numbering skips.
 */
512 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
514 const uint8_t *s = src;
517 const uint8_t *mm_end;
519 uint16_t *d = (uint16_t *)dst;
522 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
526 ::"m"(red_15mask),"m"(green_15mask));
533 "movd 4%1, %%mm3\n\t"
534 "punpckldq 8%1, %%mm0\n\t"
535 "punpckldq 12%1, %%mm3\n\t"
536 "movq %%mm0, %%mm1\n\t"
537 "movq %%mm0, %%mm2\n\t"
538 "movq %%mm3, %%mm4\n\t"
539 "movq %%mm3, %%mm5\n\t"
540 "psllq $7, %%mm0\n\t"
541 "psllq $7, %%mm3\n\t"
542 "pand %%mm7, %%mm0\n\t"
543 "pand %%mm7, %%mm3\n\t"
544 "psrlq $6, %%mm1\n\t"
545 "psrlq $6, %%mm4\n\t"
546 "pand %%mm6, %%mm1\n\t"
547 "pand %%mm6, %%mm4\n\t"
548 "psrlq $19, %%mm2\n\t"
549 "psrlq $19, %%mm5\n\t"
552 "por %%mm1, %%mm0\n\t"
553 "por %%mm4, %%mm3\n\t"
554 "por %%mm2, %%mm0\n\t"
555 "por %%mm5, %%mm3\n\t"
556 "psllq $16, %%mm3\n\t"
557 "por %%mm3, %%mm0\n\t"
558 MOVNTQ" %%mm0, %0\n\t"
559 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
563 __asm __volatile(SFENCE:::"memory");
564 __asm __volatile(EMMS:::"memory");
571 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * RENAME(rgb24to16): packed 24bpp -> 16bpp (565).
 * Same shift/mask scheme as rgb32to16 but gathers 3-byte pixels at offsets
 * 0/3 and 6/9 via movd+punpckldq (no alpha byte to skip).
 * Scalar tail: *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 * NOTE(review): excerpt elides interior lines (mask register loads, loop
 * control, the b/g/r reads of the tail) — embedded numbering skips.
 */
576 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
578 const uint8_t *s = src;
581 const uint8_t *mm_end;
583 uint16_t *d = (uint16_t *)dst;
586 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
590 ::"m"(red_16mask),"m"(green_16mask));
597 "movd 3%1, %%mm3\n\t"
598 "punpckldq 6%1, %%mm0\n\t"
599 "punpckldq 9%1, %%mm3\n\t"
600 "movq %%mm0, %%mm1\n\t"
601 "movq %%mm0, %%mm2\n\t"
602 "movq %%mm3, %%mm4\n\t"
603 "movq %%mm3, %%mm5\n\t"
604 "psrlq $3, %%mm0\n\t"
605 "psrlq $3, %%mm3\n\t"
608 "psrlq $5, %%mm1\n\t"
609 "psrlq $5, %%mm4\n\t"
610 "pand %%mm6, %%mm1\n\t"
611 "pand %%mm6, %%mm4\n\t"
612 "psrlq $8, %%mm2\n\t"
613 "psrlq $8, %%mm5\n\t"
614 "pand %%mm7, %%mm2\n\t"
615 "pand %%mm7, %%mm5\n\t"
616 "por %%mm1, %%mm0\n\t"
617 "por %%mm4, %%mm3\n\t"
618 "por %%mm2, %%mm0\n\t"
619 "por %%mm5, %%mm3\n\t"
620 "psllq $16, %%mm3\n\t"
621 "por %%mm3, %%mm0\n\t"
622 MOVNTQ" %%mm0, %0\n\t"
623 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
627 __asm __volatile(SFENCE:::"memory");
628 __asm __volatile(EMMS:::"memory");
635 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * RENAME(rgb24tobgr16): packed 24bpp -> 16bpp (565) with R/B swapped.
 * As rgb24to16 but blue is shifted UP (<<8, masked by mm7) and red DOWN
 * (>>19) to exchange the two 5-bit fields.
 * Scalar tail: *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 * NOTE(review): excerpt elides interior lines (mask register loads, the
 * pand after >>19, loop control) — embedded numbering skips.
 */
639 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
641 const uint8_t *s = src;
644 const uint8_t *mm_end;
646 uint16_t *d = (uint16_t *)dst;
649 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
653 ::"m"(red_16mask),"m"(green_16mask));
660 "movd 3%1, %%mm3\n\t"
661 "punpckldq 6%1, %%mm0\n\t"
662 "punpckldq 9%1, %%mm3\n\t"
663 "movq %%mm0, %%mm1\n\t"
664 "movq %%mm0, %%mm2\n\t"
665 "movq %%mm3, %%mm4\n\t"
666 "movq %%mm3, %%mm5\n\t"
667 "psllq $8, %%mm0\n\t"
668 "psllq $8, %%mm3\n\t"
669 "pand %%mm7, %%mm0\n\t"
670 "pand %%mm7, %%mm3\n\t"
671 "psrlq $5, %%mm1\n\t"
672 "psrlq $5, %%mm4\n\t"
673 "pand %%mm6, %%mm1\n\t"
674 "pand %%mm6, %%mm4\n\t"
675 "psrlq $19, %%mm2\n\t"
676 "psrlq $19, %%mm5\n\t"
679 "por %%mm1, %%mm0\n\t"
680 "por %%mm4, %%mm3\n\t"
681 "por %%mm2, %%mm0\n\t"
682 "por %%mm5, %%mm3\n\t"
683 "psllq $16, %%mm3\n\t"
684 "por %%mm3, %%mm0\n\t"
685 MOVNTQ" %%mm0, %0\n\t"
686 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
690 __asm __volatile(SFENCE:::"memory");
691 __asm __volatile(EMMS:::"memory");
698 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * RENAME(rgb24to15): packed 24bpp -> 15bpp (555).
 * As rgb24to16 but with 5-bit green: green >>6 (masked by mm6), red >>9
 * (masked by mm7), blue >>3.
 * Scalar tail: *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 * NOTE(review): excerpt elides interior lines (mask register loads, loop
 * control, the b/g/r reads of the tail) — embedded numbering skips.
 */
702 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
704 const uint8_t *s = src;
707 const uint8_t *mm_end;
709 uint16_t *d = (uint16_t *)dst;
712 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
716 ::"m"(red_15mask),"m"(green_15mask));
723 "movd 3%1, %%mm3\n\t"
724 "punpckldq 6%1, %%mm0\n\t"
725 "punpckldq 9%1, %%mm3\n\t"
726 "movq %%mm0, %%mm1\n\t"
727 "movq %%mm0, %%mm2\n\t"
728 "movq %%mm3, %%mm4\n\t"
729 "movq %%mm3, %%mm5\n\t"
730 "psrlq $3, %%mm0\n\t"
731 "psrlq $3, %%mm3\n\t"
734 "psrlq $6, %%mm1\n\t"
735 "psrlq $6, %%mm4\n\t"
736 "pand %%mm6, %%mm1\n\t"
737 "pand %%mm6, %%mm4\n\t"
738 "psrlq $9, %%mm2\n\t"
739 "psrlq $9, %%mm5\n\t"
740 "pand %%mm7, %%mm2\n\t"
741 "pand %%mm7, %%mm5\n\t"
742 "por %%mm1, %%mm0\n\t"
743 "por %%mm4, %%mm3\n\t"
744 "por %%mm2, %%mm0\n\t"
745 "por %%mm5, %%mm3\n\t"
746 "psllq $16, %%mm3\n\t"
747 "por %%mm3, %%mm0\n\t"
748 MOVNTQ" %%mm0, %0\n\t"
749 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
753 __asm __volatile(SFENCE:::"memory");
754 __asm __volatile(EMMS:::"memory");
761 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * RENAME(rgb24tobgr15): packed 24bpp -> 15bpp (555) with R/B swapped.
 * Blue shifted UP (<<7, masked by mm7), green >>6 (masked by mm6), red
 * >>19 into the low field.
 * Scalar tail: *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 * NOTE(review): excerpt elides interior lines (mask register loads, the
 * pand after >>19, loop control) — embedded numbering skips.
 */
765 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
767 const uint8_t *s = src;
770 const uint8_t *mm_end;
772 uint16_t *d = (uint16_t *)dst;
775 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
779 ::"m"(red_15mask),"m"(green_15mask));
786 "movd 3%1, %%mm3\n\t"
787 "punpckldq 6%1, %%mm0\n\t"
788 "punpckldq 9%1, %%mm3\n\t"
789 "movq %%mm0, %%mm1\n\t"
790 "movq %%mm0, %%mm2\n\t"
791 "movq %%mm3, %%mm4\n\t"
792 "movq %%mm3, %%mm5\n\t"
793 "psllq $7, %%mm0\n\t"
794 "psllq $7, %%mm3\n\t"
795 "pand %%mm7, %%mm0\n\t"
796 "pand %%mm7, %%mm3\n\t"
797 "psrlq $6, %%mm1\n\t"
798 "psrlq $6, %%mm4\n\t"
799 "pand %%mm6, %%mm1\n\t"
800 "pand %%mm6, %%mm4\n\t"
801 "psrlq $19, %%mm2\n\t"
802 "psrlq $19, %%mm5\n\t"
805 "por %%mm1, %%mm0\n\t"
806 "por %%mm4, %%mm3\n\t"
807 "por %%mm2, %%mm0\n\t"
808 "por %%mm5, %%mm3\n\t"
809 "psllq $16, %%mm3\n\t"
810 "por %%mm3, %%mm0\n\t"
811 MOVNTQ" %%mm0, %0\n\t"
812 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
816 __asm __volatile(SFENCE:::"memory");
817 __asm __volatile(EMMS:::"memory");
824 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
829 I use here a less accurate approximation by simply
830 left-shifting the input
831 value and filling the low order bits with
832 zeroes. This method improves png's
833 compression but this scheme cannot reproduce white exactly, since it does not
834 generate an all-ones maximum value; the net effect is to darken the
837 The better method should be "left bit replication":
847 | Leftmost Bits Repeated to Fill Open Bits
/*
 * RENAME(rgb15to24): 15bpp (555) -> packed 24bpp.
 * MMX path: isolates B/G/R fields per 16-bit pixel (shifts <<3, >>2, >>7 —
 * the pand lines with mask15b/g/r are elided here), widens each word to a
 * byte lane by unpacking against mmx_null, recombines into 32bpp-style
 * quadwords, then repacks 32->24 using the "Borrowed 32 to 24" sequence.
 * Low bits are zero-filled (see the comment above this function), so full
 * white is not exactly reproducible.
 * Scalar tail: B=(bgr&0x1F)<<3, G=(bgr&0x3E0)>>2, R=(bgr&0x7C00)>>7.
 * NOTE(review): excerpt elides many interior lines (loop control, pand
 * with mask15*, third MOVNTQ, braces) — embedded numbering skips.
 */
851 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
855 const uint16_t *mm_end;
857 uint8_t *d = (uint8_t *)dst;
858 const uint16_t *s = (uint16_t *)src;
859 end = s + src_size/2;
861 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
// First 4 pixels: shift each channel field to its 8-bit position.
873 "psllq $3, %%mm0\n\t"
874 "psrlq $2, %%mm1\n\t"
875 "psrlq $7, %%mm2\n\t"
876 "movq %%mm0, %%mm3\n\t"
877 "movq %%mm1, %%mm4\n\t"
878 "movq %%mm2, %%mm5\n\t"
// Widen 16-bit lanes to 32-bit by unpacking against zero (%5 = mmx_null).
879 "punpcklwd %5, %%mm0\n\t"
880 "punpcklwd %5, %%mm1\n\t"
881 "punpcklwd %5, %%mm2\n\t"
882 "punpckhwd %5, %%mm3\n\t"
883 "punpckhwd %5, %%mm4\n\t"
884 "punpckhwd %5, %%mm5\n\t"
885 "psllq $8, %%mm1\n\t"
886 "psllq $16, %%mm2\n\t"
887 "por %%mm1, %%mm0\n\t"
888 "por %%mm2, %%mm0\n\t"
889 "psllq $8, %%mm4\n\t"
890 "psllq $16, %%mm5\n\t"
891 "por %%mm4, %%mm3\n\t"
892 "por %%mm5, %%mm3\n\t"
// Stash first 8 expanded pixels in mm6/mm7 while the next 4 are built.
894 "movq %%mm0, %%mm6\n\t"
895 "movq %%mm3, %%mm7\n\t"
// Second 4 pixels (bytes 8..15), same expansion.
897 "movq 8%1, %%mm0\n\t"
898 "movq 8%1, %%mm1\n\t"
899 "movq 8%1, %%mm2\n\t"
903 "psllq $3, %%mm0\n\t"
904 "psrlq $2, %%mm1\n\t"
905 "psrlq $7, %%mm2\n\t"
906 "movq %%mm0, %%mm3\n\t"
907 "movq %%mm1, %%mm4\n\t"
908 "movq %%mm2, %%mm5\n\t"
909 "punpcklwd %5, %%mm0\n\t"
910 "punpcklwd %5, %%mm1\n\t"
911 "punpcklwd %5, %%mm2\n\t"
912 "punpckhwd %5, %%mm3\n\t"
913 "punpckhwd %5, %%mm4\n\t"
914 "punpckhwd %5, %%mm5\n\t"
915 "psllq $8, %%mm1\n\t"
916 "psllq $16, %%mm2\n\t"
917 "por %%mm1, %%mm0\n\t"
918 "por %%mm2, %%mm0\n\t"
919 "psllq $8, %%mm4\n\t"
920 "psllq $16, %%mm5\n\t"
921 "por %%mm4, %%mm3\n\t"
922 "por %%mm5, %%mm3\n\t"
925 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
927 /* Borrowed 32 to 24 */
929 "movq %%mm0, %%mm4\n\t"
930 "movq %%mm3, %%mm5\n\t"
931 "movq %%mm6, %%mm0\n\t"
932 "movq %%mm7, %%mm1\n\t"
934 "movq %%mm4, %%mm6\n\t"
935 "movq %%mm5, %%mm7\n\t"
936 "movq %%mm0, %%mm2\n\t"
937 "movq %%mm1, %%mm3\n\t"
939 "psrlq $8, %%mm2\n\t"
940 "psrlq $8, %%mm3\n\t"
941 "psrlq $8, %%mm6\n\t"
942 "psrlq $8, %%mm7\n\t"
951 "por %%mm2, %%mm0\n\t"
952 "por %%mm3, %%mm1\n\t"
953 "por %%mm6, %%mm4\n\t"
954 "por %%mm7, %%mm5\n\t"
956 "movq %%mm1, %%mm2\n\t"
957 "movq %%mm4, %%mm3\n\t"
958 "psllq $48, %%mm2\n\t"
959 "psllq $32, %%mm3\n\t"
962 "por %%mm2, %%mm0\n\t"
963 "psrlq $16, %%mm1\n\t"
964 "psrlq $32, %%mm4\n\t"
965 "psllq $16, %%mm5\n\t"
966 "por %%mm3, %%mm1\n\t"
968 "por %%mm5, %%mm4\n\t"
970 MOVNTQ" %%mm0, %0\n\t"
971 MOVNTQ" %%mm1, 8%0\n\t"
975 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
980 __asm __volatile(SFENCE:::"memory");
981 __asm __volatile(EMMS:::"memory");
// Scalar tail: expand one 555 pixel to three bytes (low bits zero-filled).
985 register uint16_t bgr;
987 *d++ = (bgr&0x1F)<<3;
988 *d++ = (bgr&0x3E0)>>2;
989 *d++ = (bgr&0x7C00)>>7;
/*
 * RENAME(rgb16to24): 16bpp (565) -> packed 24bpp.
 * Same structure as rgb15to24 but with 565 field masks (mask16b/g/r) and
 * shifts <<3, >>3, >>8; widens each channel to a byte via punpck against
 * mmx_null, then repacks 32->24 ("Borrowed 32 to 24") with MOVNTQ stores.
 * Scalar tail: B=(bgr&0x1F)<<3, G=(bgr&0x7E0)>>3, R=(bgr&0xF800)>>8.
 * NOTE(review): excerpt elides interior lines (loop control, braces) —
 * embedded numbering skips.
 */
993 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
997 const uint16_t *mm_end;
999 uint8_t *d = (uint8_t *)dst;
1000 const uint16_t *s = (const uint16_t *)src;
1001 end = s + src_size/2;
1003 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
// First 4 pixels: isolate B/G/R fields and shift to 8-bit positions.
1009 "movq %1, %%mm0\n\t"
1010 "movq %1, %%mm1\n\t"
1011 "movq %1, %%mm2\n\t"
1012 "pand %2, %%mm0\n\t"
1013 "pand %3, %%mm1\n\t"
1014 "pand %4, %%mm2\n\t"
1015 "psllq $3, %%mm0\n\t"
1016 "psrlq $3, %%mm1\n\t"
1017 "psrlq $8, %%mm2\n\t"
1018 "movq %%mm0, %%mm3\n\t"
1019 "movq %%mm1, %%mm4\n\t"
1020 "movq %%mm2, %%mm5\n\t"
// Widen words to dwords against zero (%5 = mmx_null), merge channels.
1021 "punpcklwd %5, %%mm0\n\t"
1022 "punpcklwd %5, %%mm1\n\t"
1023 "punpcklwd %5, %%mm2\n\t"
1024 "punpckhwd %5, %%mm3\n\t"
1025 "punpckhwd %5, %%mm4\n\t"
1026 "punpckhwd %5, %%mm5\n\t"
1027 "psllq $8, %%mm1\n\t"
1028 "psllq $16, %%mm2\n\t"
1029 "por %%mm1, %%mm0\n\t"
1030 "por %%mm2, %%mm0\n\t"
1031 "psllq $8, %%mm4\n\t"
1032 "psllq $16, %%mm5\n\t"
1033 "por %%mm4, %%mm3\n\t"
1034 "por %%mm5, %%mm3\n\t"
// Stash first 8 expanded pixels in mm6/mm7.
1036 "movq %%mm0, %%mm6\n\t"
1037 "movq %%mm3, %%mm7\n\t"
// Second 4 pixels (bytes 8..15), same expansion.
1039 "movq 8%1, %%mm0\n\t"
1040 "movq 8%1, %%mm1\n\t"
1041 "movq 8%1, %%mm2\n\t"
1042 "pand %2, %%mm0\n\t"
1043 "pand %3, %%mm1\n\t"
1044 "pand %4, %%mm2\n\t"
1045 "psllq $3, %%mm0\n\t"
1046 "psrlq $3, %%mm1\n\t"
1047 "psrlq $8, %%mm2\n\t"
1048 "movq %%mm0, %%mm3\n\t"
1049 "movq %%mm1, %%mm4\n\t"
1050 "movq %%mm2, %%mm5\n\t"
1051 "punpcklwd %5, %%mm0\n\t"
1052 "punpcklwd %5, %%mm1\n\t"
1053 "punpcklwd %5, %%mm2\n\t"
1054 "punpckhwd %5, %%mm3\n\t"
1055 "punpckhwd %5, %%mm4\n\t"
1056 "punpckhwd %5, %%mm5\n\t"
1057 "psllq $8, %%mm1\n\t"
1058 "psllq $16, %%mm2\n\t"
1059 "por %%mm1, %%mm0\n\t"
1060 "por %%mm2, %%mm0\n\t"
1061 "psllq $8, %%mm4\n\t"
1062 "psllq $16, %%mm5\n\t"
1063 "por %%mm4, %%mm3\n\t"
1064 "por %%mm5, %%mm3\n\t"
1066 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1068 /* Borrowed 32 to 24 */
1070 "movq %%mm0, %%mm4\n\t"
1071 "movq %%mm3, %%mm5\n\t"
1072 "movq %%mm6, %%mm0\n\t"
1073 "movq %%mm7, %%mm1\n\t"
1075 "movq %%mm4, %%mm6\n\t"
1076 "movq %%mm5, %%mm7\n\t"
1077 "movq %%mm0, %%mm2\n\t"
1078 "movq %%mm1, %%mm3\n\t"
1080 "psrlq $8, %%mm2\n\t"
1081 "psrlq $8, %%mm3\n\t"
1082 "psrlq $8, %%mm6\n\t"
1083 "psrlq $8, %%mm7\n\t"
1084 "pand %2, %%mm0\n\t"
1085 "pand %2, %%mm1\n\t"
1086 "pand %2, %%mm4\n\t"
1087 "pand %2, %%mm5\n\t"
1088 "pand %3, %%mm2\n\t"
1089 "pand %3, %%mm3\n\t"
1090 "pand %3, %%mm6\n\t"
1091 "pand %3, %%mm7\n\t"
1092 "por %%mm2, %%mm0\n\t"
1093 "por %%mm3, %%mm1\n\t"
1094 "por %%mm6, %%mm4\n\t"
1095 "por %%mm7, %%mm5\n\t"
1097 "movq %%mm1, %%mm2\n\t"
1098 "movq %%mm4, %%mm3\n\t"
1099 "psllq $48, %%mm2\n\t"
1100 "psllq $32, %%mm3\n\t"
1101 "pand %4, %%mm2\n\t"
1102 "pand %5, %%mm3\n\t"
1103 "por %%mm2, %%mm0\n\t"
1104 "psrlq $16, %%mm1\n\t"
1105 "psrlq $32, %%mm4\n\t"
1106 "psllq $16, %%mm5\n\t"
1107 "por %%mm3, %%mm1\n\t"
1108 "pand %6, %%mm5\n\t"
1109 "por %%mm5, %%mm4\n\t"
1111 MOVNTQ" %%mm0, %0\n\t"
1112 MOVNTQ" %%mm1, 8%0\n\t"
1113 MOVNTQ" %%mm4, 16%0"
1116 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1121 __asm __volatile(SFENCE:::"memory");
1122 __asm __volatile(EMMS:::"memory");
// Scalar tail: expand one 565 pixel to three bytes (low bits zero-filled).
1126 register uint16_t bgr;
1128 *d++ = (bgr&0x1F)<<3;
1129 *d++ = (bgr&0x7E0)>>3;
1130 *d++ = (bgr&0xF800)>>8;
/*
 * RENAME(rgb15to32): 15bpp (555) -> 32bpp.
 * MMX path: isolates B/G/R fields (mask15b/g/r), shifts each to its 8-bit
 * position (<<3, >>2, >>7), widens words to dwords by unpacking against
 * mm7 (zeroed with pxor), and stores 4 expanded pixels per MOVNTQ pair.
 * Low bits are zero-filled, so full-brightness channels are slightly dark.
 * NOTE(review): excerpt elides interior lines (loop control, scalar-tail
 * loop header/braces) — embedded numbering skips.
 */
1134 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1136 const uint16_t *end;
1138 const uint16_t *mm_end;
1140 uint8_t *d = (uint8_t *)dst;
1141 const uint16_t *s = (const uint16_t *)src;
1142 end = s + src_size/2;
1144 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
// mm7 = 0 for the word->dword unpacks below.
1145 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1151 "movq %1, %%mm0\n\t"
1152 "movq %1, %%mm1\n\t"
1153 "movq %1, %%mm2\n\t"
1154 "pand %2, %%mm0\n\t"
1155 "pand %3, %%mm1\n\t"
1156 "pand %4, %%mm2\n\t"
1157 "psllq $3, %%mm0\n\t"
1158 "psrlq $2, %%mm1\n\t"
1159 "psrlq $7, %%mm2\n\t"
1160 "movq %%mm0, %%mm3\n\t"
1161 "movq %%mm1, %%mm4\n\t"
1162 "movq %%mm2, %%mm5\n\t"
1163 "punpcklwd %%mm7, %%mm0\n\t"
1164 "punpcklwd %%mm7, %%mm1\n\t"
1165 "punpcklwd %%mm7, %%mm2\n\t"
1166 "punpckhwd %%mm7, %%mm3\n\t"
1167 "punpckhwd %%mm7, %%mm4\n\t"
1168 "punpckhwd %%mm7, %%mm5\n\t"
1169 "psllq $8, %%mm1\n\t"
1170 "psllq $16, %%mm2\n\t"
1171 "por %%mm1, %%mm0\n\t"
1172 "por %%mm2, %%mm0\n\t"
1173 "psllq $8, %%mm4\n\t"
1174 "psllq $16, %%mm5\n\t"
1175 "por %%mm4, %%mm3\n\t"
1176 "por %%mm5, %%mm3\n\t"
1177 MOVNTQ" %%mm0, %0\n\t"
1178 MOVNTQ" %%mm3, 8%0\n\t"
1180 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1185 __asm __volatile(SFENCE:::"memory");
1186 __asm __volatile(EMMS:::"memory");
// Scalar tail: one 555 pixel -> B,G,R bytes (4th byte handled in elided code).
1190 register uint16_t bgr;
1192 *d++ = (bgr&0x1F)<<3;
1193 *d++ = (bgr&0x3E0)>>2;
1194 *d++ = (bgr&0x7C00)>>7;
/*
 * RENAME(rgb16to32): 16bpp (565) -> 32bpp.
 * Identical structure to rgb15to32 but with 565 masks (mask16b/g/r) and
 * shifts <<3, >>3, >>8 for the 6-bit green / shifted red fields.
 * NOTE(review): excerpt elides interior lines (loop control, scalar-tail
 * loop header/braces) — embedded numbering skips.
 */
1199 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1201 const uint16_t *end;
1203 const uint16_t *mm_end;
1205 uint8_t *d = (uint8_t *)dst;
1206 const uint16_t *s = (uint16_t *)src;
1207 end = s + src_size/2;
1209 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
// mm7 = 0 for the word->dword unpacks below.
1210 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1216 "movq %1, %%mm0\n\t"
1217 "movq %1, %%mm1\n\t"
1218 "movq %1, %%mm2\n\t"
1219 "pand %2, %%mm0\n\t"
1220 "pand %3, %%mm1\n\t"
1221 "pand %4, %%mm2\n\t"
1222 "psllq $3, %%mm0\n\t"
1223 "psrlq $3, %%mm1\n\t"
1224 "psrlq $8, %%mm2\n\t"
1225 "movq %%mm0, %%mm3\n\t"
1226 "movq %%mm1, %%mm4\n\t"
1227 "movq %%mm2, %%mm5\n\t"
1228 "punpcklwd %%mm7, %%mm0\n\t"
1229 "punpcklwd %%mm7, %%mm1\n\t"
1230 "punpcklwd %%mm7, %%mm2\n\t"
1231 "punpckhwd %%mm7, %%mm3\n\t"
1232 "punpckhwd %%mm7, %%mm4\n\t"
1233 "punpckhwd %%mm7, %%mm5\n\t"
1234 "psllq $8, %%mm1\n\t"
1235 "psllq $16, %%mm2\n\t"
1236 "por %%mm1, %%mm0\n\t"
1237 "por %%mm2, %%mm0\n\t"
1238 "psllq $8, %%mm4\n\t"
1239 "psllq $16, %%mm5\n\t"
1240 "por %%mm4, %%mm3\n\t"
1241 "por %%mm5, %%mm3\n\t"
1242 MOVNTQ" %%mm0, %0\n\t"
1243 MOVNTQ" %%mm3, 8%0\n\t"
1245 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1250 __asm __volatile(SFENCE:::"memory");
1251 __asm __volatile(EMMS:::"memory");
// Scalar tail: one 565 pixel -> B,G,R bytes (4th byte handled in elided code).
1255 register uint16_t bgr;
1257 *d++ = (bgr&0x1F)<<3;
1258 *d++ = (bgr&0x7E0)>>3;
1259 *d++ = (bgr&0xF800)>>8;
/*
 * RENAME(rgb32tobgr32): swaps the R and B bytes of packed 32bpp pixels.
 * MMX path: 2 pixels per iteration; pslld/psrld by 16 move R and B past
 * each other, mask32r/g/b isolate the channels, por recombines, MOVNTQ
 * stores. Loops eax from 0 to src_size-7 in steps of 8.
 * Scalar fallback: straightforward per-pixel byte swap (alpha byte copy is
 * elided in this excerpt).
 */
1264 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1267 /* TODO: unroll this loop */
1269 "xorl %%eax, %%eax \n\t"
1272 PREFETCH" 32(%0, %%eax) \n\t"
1273 "movq (%0, %%eax), %%mm0 \n\t"
1274 "movq %%mm0, %%mm1 \n\t"
1275 "movq %%mm0, %%mm2 \n\t"
1276 "pslld $16, %%mm0 \n\t"
1277 "psrld $16, %%mm1 \n\t"
1278 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1279 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1280 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1281 "por %%mm0, %%mm2 \n\t"
1282 "por %%mm1, %%mm2 \n\t"
1283 MOVNTQ" %%mm2, (%1, %%eax) \n\t"
1284 "addl $8, %%eax \n\t"
1285 "cmpl %2, %%eax \n\t"
1287 :: "r" (src), "r"(dst), "r" (src_size-7)
1291 __asm __volatile(SFENCE:::"memory");
1292 __asm __volatile(EMMS:::"memory");
// Scalar fallback: swap bytes 0 and 2 of each 4-byte pixel.
1295 unsigned num_pixels = src_size >> 2;
1296 for(i=0; i<num_pixels; i++)
1298 dst[4*i + 0] = src[4*i + 2];
1299 dst[4*i + 1] = src[4*i + 1];
1300 dst[4*i + 2] = src[4*i + 0];
/*
 * RENAME(rgb24tobgr24): swaps R and B in packed 24bpp data.
 * MMX path: processes 24 bytes (8 pixels) per iteration using rotating
 * mask24r/g/b register assignments (mm5/mm6/mm7) to reorder channels
 * across the 3-byte pixel phase. eax runs from mmx_size = 23 - src_size
 * up toward 0, with src/dst biased by -mmx_size so negative offsets index
 * the buffers — presumably so the loop can compare against a constant;
 * confirm against the full source (loop-control lines are elided here).
 * Scalar tail handles the <8-pixel remainder.
 */
1305 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1309 int mmx_size= 23 - src_size;
1311 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1312 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1313 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1316 PREFETCH" 32(%1, %%eax) \n\t"
1317 "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1318 "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
1319 "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
1320 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1321 "pand %%mm5, %%mm0 \n\t"
1322 "pand %%mm6, %%mm1 \n\t"
1323 "pand %%mm7, %%mm2 \n\t"
1324 "por %%mm0, %%mm1 \n\t"
1325 "por %%mm2, %%mm1 \n\t"
1326 "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1327 MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
1328 "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
1329 "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
1330 "pand %%mm7, %%mm0 \n\t"
1331 "pand %%mm5, %%mm1 \n\t"
1332 "pand %%mm6, %%mm2 \n\t"
1333 "por %%mm0, %%mm1 \n\t"
1334 "por %%mm2, %%mm1 \n\t"
1335 "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
1336 MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
1337 "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
1338 "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
1339 "pand %%mm6, %%mm0 \n\t"
1340 "pand %%mm7, %%mm1 \n\t"
1341 "pand %%mm5, %%mm2 \n\t"
1342 "por %%mm0, %%mm1 \n\t"
1343 "por %%mm2, %%mm1 \n\t"
1344 MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
1345 "addl $24, %%eax \n\t"
1348 : "r" (src-mmx_size), "r"(dst-mmx_size)
1351 __asm __volatile(SFENCE:::"memory");
1352 __asm __volatile(EMMS:::"memory");
1354 if(mmx_size==23) return; //finished, was multiple of 8
// Scalar tail over the remaining (src_size mod 8 pixels) bytes.
1358 src_size= 23-mmx_size;
1362 for(i=0; i<src_size; i+=3)
1366 dst[i + 1] = src[i + 1];
1367 dst[i + 2] = src[i + 0];
/*
 * RENAME(yuvPlanartoyuy2): interleaves planar Y, U, V into packed YUY2.
 * vertLumPerChroma = number of luma lines sharing one chroma line (2 for
 * YV12, 1 for YUV422P); chroma pointers advance only every
 * vertLumPerChroma-th row (see the test near the bottom).
 * MMX path emits 16 Y (+8 U, +8 V) per inner iteration; C fallbacks build
 * 64-bit or 32-bit packed words depending on __WORDSIZE.
 * NOTE(review): excerpt elides loop braces/labels and some control lines —
 * embedded numbering skips.
 */
1372 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1373 unsigned int width, unsigned int height,
1374 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
1377 const unsigned chromWidth= width>>1;
1378 for(y=0; y<height; y++)
1381 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1383 "xorl %%eax, %%eax \n\t"
1386 PREFETCH" 32(%1, %%eax, 2) \n\t"
1387 PREFETCH" 32(%2, %%eax) \n\t"
1388 PREFETCH" 32(%3, %%eax) \n\t"
1389 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
1390 "movq %%mm0, %%mm2 \n\t" // U(0)
1391 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
1392 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1393 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1395 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
1396 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
1397 "movq %%mm3, %%mm4 \n\t" // Y(0)
1398 "movq %%mm5, %%mm6 \n\t" // Y(8)
1399 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1400 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1401 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1402 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1404 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
1405 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
1406 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
1407 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1409 "addl $8, %%eax \n\t"
1410 "cmpl %4, %%eax \n\t"
1412 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
// 64-bit C fallback: pack two YUYV dwords into one qword store.
1416 #if __WORDSIZE >= 64
1418 uint64_t *ldst = (uint64_t *) dst;
1419 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1420 for(i = 0; i < chromWidth; i += 2){
1422 k = yc[0] + (uc[0] << 8) +
1423 (yc[1] << 16) + (vc[0] << 24);
1424 l = yc[2] + (uc[1] << 8) +
1425 (yc[3] << 16) + (vc[1] << 24);
1426 *ldst++ = k + (l << 32);
// 32-bit C fallback: one YUYV dword per iteration.
1433 int i, *idst = (int32_t *) dst;
1434 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1435 for(i = 0; i < chromWidth; i++){
1436 *idst++ = yc[0] + (uc[0] << 8) +
1437 (yc[1] << 16) + (vc[0] << 24);
// Advance chroma only after every vertLumPerChroma luma rows.
1444 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1446 usrc += chromStride;
1447 vsrc += chromStride;
1461 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1462 * problem for anyone then tell me, and I'll fix it)
/*
 * RENAME(yv12toyuy2): YV12 (4:2:0 planar) -> packed YUY2.
 * Thin wrapper: vertLumPerChroma = 2 (each chroma line serves 2 luma
 * lines). Chroma is replicated, not interpolated (see FIXME).
 */
1464 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1465 unsigned int width, unsigned int height,
1466 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
1468 //FIXME interpolate chroma
1469 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1474 * width should be a multiple of 16
/*
 * RENAME(yuv422ptoyuy2): YUV422P (4:2:2 planar) -> packed YUY2.
 * Thin wrapper: vertLumPerChroma = 1 (one chroma line per luma line).
 */
1476 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1477 unsigned int width, unsigned int height,
1478 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
1480 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1485 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1486 * problem for anyone then tell me, and I'll fix it)
/*
 * RENAME(yuy2toyv12): packed YUY2 -> planar YV12 (4:2:0).
 * Rows are processed in pairs (y += 2): the first line of each pair yields
 * Y plus the (vertically subsampled) U/V planes; the second line yields Y
 * only — so chroma comes from even lines without averaging.
 * MMX path: psrlw/pand with an 0x00FF word mask (mm7) split Y from UV,
 * packuswb re-packs bytes; second pass splits UV into U and V the same way.
 * NOTE(review): excerpt elides loop labels/braces, pointer advances and
 * the scalar tails' loop braces — embedded numbering skips.
 */
1488 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1489 unsigned int width, unsigned int height,
1490 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1493 const unsigned chromWidth= width>>1;
1494 for(y=0; y<height; y+=2)
// First line of the pair: extract Y, U and V.
1498 "xorl %%eax, %%eax \n\t"
1499 "pcmpeqw %%mm7, %%mm7 \n\t"
1500 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1503 PREFETCH" 64(%0, %%eax, 4) \n\t"
1504 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1505 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1506 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1507 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1508 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1509 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1510 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1511 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1512 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1513 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1515 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1517 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
1518 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
1519 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1520 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1521 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1522 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1523 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1524 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1525 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1526 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1528 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
// Split interleaved UV into separate U and V planes.
1530 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1531 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1532 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1533 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1534 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1535 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1536 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1537 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1539 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1540 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1542 "addl $8, %%eax \n\t"
1543 "cmpl %4, %%eax \n\t"
1545 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// Second line of the pair: Y only (chroma already taken from line above).
1553 "xorl %%eax, %%eax \n\t"
1556 PREFETCH" 64(%0, %%eax, 4) \n\t"
1557 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1558 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1559 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
1560 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
1561 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1562 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1563 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1564 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1565 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1566 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1568 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1569 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1571 "addl $8, %%eax \n\t"
1572 "cmpl %4, %%eax \n\t"
1575 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// Scalar fallback, first line: Y + U + V.
1580 for(i=0; i<chromWidth; i++)
1582 ydst[2*i+0] = src[4*i+0];
1583 udst[i] = src[4*i+1];
1584 ydst[2*i+1] = src[4*i+2];
1585 vdst[i] = src[4*i+3];
// Scalar fallback, second line: Y only.
1590 for(i=0; i<chromWidth; i++)
1592 ydst[2*i+0] = src[4*i+0];
1593 ydst[2*i+1] = src[4*i+2];
1596 udst += chromStride;
1597 vdst += chromStride;
1602 asm volatile( EMMS" \n\t"
/*
 * yvu9toyv12: convert planar YVU9 to planar YV12.
 * In the code visible here only the luma plane is handled: Y is copied
 * verbatim as one contiguous memcpy of width*height bytes.
 * NOTE(review): the memcpy ignores lumStride, so it assumes
 * lumStride == width for both source and destination -- confirm callers.
 * Chroma upscaling (YVU9's 4x4 subsampling -> YV12's 2x2) is not
 * implemented (see XXX below); usrc/vsrc/udst/vdst are unused here.
 */
1608 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1609 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1610 unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
1613 memcpy(ydst, ysrc, width*height);
1615 /* XXX: implement upscaling for U,V */
/*
 * planar2x: upscale a single 8-bit plane to 2x width and 2x height.
 * The MMX2/3DNow path produces two interpolated output rows per source
 * row pair; the C fallback below blends neighbours with 3:1 weights
 * ((3*a + b) >> 2). The doubled PAVGB instructions in the asm
 * presumably approximate that same 3:1 weighting via two rounding
 * averages -- verify against the C path if exactness matters.
 * NOTE(review): mmxSize is the full srcWidth here, not rounded down to
 * a multiple of 8 as the 8-bytes-per-iteration loop would suggest --
 * confirm how the (elided) tail/edge code compensates.
 */
1618 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
1623 for(x=0; x<srcWidth; x++){
1629 for(y=1; y<srcHeight; y++){
1630 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1631 const int mmxSize= srcWidth;
1633 "movl %4, %%eax \n\t"
/* load 8 pixels from the current row (%0) and the next row (%1),
 * plus the same pair shifted right by one pixel */
1635 "movq (%0, %%eax), %%mm0 \n\t"
1636 "movq (%1, %%eax), %%mm1 \n\t"
1637 "movq 1(%0, %%eax), %%mm2 \n\t"
1638 "movq 1(%1, %%eax), %%mm3 \n\t"
1639 "movq %%mm0, %%mm4 \n\t"
1640 "movq %%mm1, %%mm5 \n\t"
/* repeated PAVGB = weighted blends of row/next-row and pixel/next-pixel */
1641 PAVGB" %%mm3, %%mm0 \n\t"
1642 PAVGB" %%mm3, %%mm0 \n\t"
1643 PAVGB" %%mm4, %%mm3 \n\t"
1644 PAVGB" %%mm4, %%mm3 \n\t"
1645 PAVGB" %%mm2, %%mm1 \n\t"
1646 PAVGB" %%mm2, %%mm1 \n\t"
1647 PAVGB" %%mm5, %%mm2 \n\t"
1648 PAVGB" %%mm5, %%mm2 \n\t"
1649 "movq %%mm3, %%mm4 \n\t"
1650 "movq %%mm2, %%mm5 \n\t"
/* interleave the blended values into the two double-width output rows */
1651 "punpcklbw %%mm1, %%mm3 \n\t"
1652 "punpckhbw %%mm1, %%mm4 \n\t"
1653 "punpcklbw %%mm0, %%mm2 \n\t"
1654 "punpckhbw %%mm0, %%mm5 \n\t"
/* MMX2/3DNow build: non-temporal stores to the two output rows (%2, %3) */
1656 MOVNTQ" %%mm3, (%2, %%eax, 2) \n\t"
1657 MOVNTQ" %%mm4, 8(%2, %%eax, 2) \n\t"
1658 MOVNTQ" %%mm2, (%3, %%eax, 2) \n\t"
1659 MOVNTQ" %%mm5, 8(%3, %%eax, 2) \n\t"
/* plain-MMX build: ordinary stores */
1661 "movq %%mm3, (%2, %%eax, 2) \n\t"
1662 "movq %%mm4, 8(%2, %%eax, 2) \n\t"
1663 "movq %%mm2, (%3, %%eax, 2) \n\t"
1664 "movq %%mm5, 8(%3, %%eax, 2) \n\t"
1666 "addl $8, %%eax \n\t"
/* operands: %0/%1 = current/next source row, %2/%3 = two output rows */
1668 :: "r" (src + mmxSize-1), "r" (src + srcStride + mmxSize-1),
1669 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
/* edge columns: first/last source pixel is replicated into both rows */
1675 dst[dstStride]= src[0];
1678 dst[dstStride]= src[0];
/* C fallback: bilinear-style 3:1 blends of horizontal/vertical neighbours */
1680 for(x=0; x<srcWidth-1; x++){
1681 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1682 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1683 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1684 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1688 dst[srcWidth*2 -1 + dstStride]= src[srcWidth-1];
1696 for(x=0; x<srcWidth; x++){
1701 asm volatile( EMMS" \n\t"
1709 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1710 * problem for anyone then tell me, and I'll fix it)
1711 * chrominance data is only taken from every second line; others are ignored. FIXME: write HQ version
/*
 * uyvytoyv12: convert packed UYVY 4:2:2 into planar YV12 (separate Y, U, V
 * planes, chroma subsampled 2x2).  Rows are processed in pairs: the first
 * (even) row supplies both luma and chroma, the second (odd) row supplies
 * luma only -- its chroma samples are discarded.
 * chromWidth = width/2 is the number of U (or V) samples per output row.
 */
1713 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1714 unsigned int width, unsigned int height,
1715 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1718 const unsigned chromWidth= width>>1;
1719 for(y=0; y<height; y+=2)
/* even row: split UYVY into Y plane plus interleaved UV, then deinterleave UV */
1723 "xorl %%eax, %%eax \n\t"
1724 "pcmpeqw %%mm7, %%mm7 \n\t"
1725 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00... word mask for the low (U/V) bytes
1728 PREFETCH" 64(%0, %%eax, 4) \n\t"
1729 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1730 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1731 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1732 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1733 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1734 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1735 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1736 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1737 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1738 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1740 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1742 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
1743 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
1744 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1745 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1746 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1747 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1748 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1749 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1750 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1751 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1753 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
/* deinterleave the UVUV words into the separate U and V planes */
1755 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1756 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1757 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1758 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1759 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1760 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1761 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1762 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1764 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1765 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1766 "addl $8, %%eax \n\t"
1768 "cmpl %4, %%eax \n\t"
1770 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
/* odd row: extract luma only (high byte of each UYVY word); chroma dropped */
1778 "xorl %%eax, %%eax \n\t"
1781 PREFETCH" 64(%0, %%eax, 4) \n\t"
1782 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1783 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1784 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
1785 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
1786 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1787 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1788 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1789 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1790 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1791 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1793 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1794 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1796 "addl $8, %%eax \n\t"
1797 "cmpl %4, %%eax \n\t"
1800 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
/* C fallback, even row: UYVY byte order is U,Y,V,Y */
1805 for(i=0; i<chromWidth; i++)
1807 udst[i] = src[4*i+0];
1808 ydst[2*i+0] = src[4*i+1];
1809 vdst[i] = src[4*i+2];
1810 ydst[2*i+1] = src[4*i+3];
/* C fallback, odd row: luma only */
1815 for(i=0; i<chromWidth; i++)
1817 ydst[2*i+0] = src[4*i+1];
1818 ydst[2*i+1] = src[4*i+3];
1821 udst += chromStride;
1822 vdst += chromStride;
1827 asm volatile( EMMS" \n\t"
1835 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
1836 * problem for anyone then tell me, and I'll fix it)
1837 * chrominance data is only taken from every second line; others are ignored in the C version. FIXME: write HQ version
1839 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1840 unsigned int width, unsigned int height,
1841 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1844 const unsigned chromWidth= width>>1;
1846 for(y=0; y<height-2; y+=2)
1852 "movl %2, %%eax \n\t"
1853 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1854 "movq "MANGLE(w1111)", %%mm5 \n\t"
1855 "pxor %%mm7, %%mm7 \n\t"
1856 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1859 PREFETCH" 64(%0, %%ebx) \n\t"
1860 "movd (%0, %%ebx), %%mm0 \n\t"
1861 "movd 3(%0, %%ebx), %%mm1 \n\t"
1862 "punpcklbw %%mm7, %%mm0 \n\t"
1863 "punpcklbw %%mm7, %%mm1 \n\t"
1864 "movd 6(%0, %%ebx), %%mm2 \n\t"
1865 "movd 9(%0, %%ebx), %%mm3 \n\t"
1866 "punpcklbw %%mm7, %%mm2 \n\t"
1867 "punpcklbw %%mm7, %%mm3 \n\t"
1868 "pmaddwd %%mm6, %%mm0 \n\t"
1869 "pmaddwd %%mm6, %%mm1 \n\t"
1870 "pmaddwd %%mm6, %%mm2 \n\t"
1871 "pmaddwd %%mm6, %%mm3 \n\t"
1872 #ifndef FAST_BGR2YV12
1873 "psrad $8, %%mm0 \n\t"
1874 "psrad $8, %%mm1 \n\t"
1875 "psrad $8, %%mm2 \n\t"
1876 "psrad $8, %%mm3 \n\t"
1878 "packssdw %%mm1, %%mm0 \n\t"
1879 "packssdw %%mm3, %%mm2 \n\t"
1880 "pmaddwd %%mm5, %%mm0 \n\t"
1881 "pmaddwd %%mm5, %%mm2 \n\t"
1882 "packssdw %%mm2, %%mm0 \n\t"
1883 "psraw $7, %%mm0 \n\t"
1885 "movd 12(%0, %%ebx), %%mm4 \n\t"
1886 "movd 15(%0, %%ebx), %%mm1 \n\t"
1887 "punpcklbw %%mm7, %%mm4 \n\t"
1888 "punpcklbw %%mm7, %%mm1 \n\t"
1889 "movd 18(%0, %%ebx), %%mm2 \n\t"
1890 "movd 21(%0, %%ebx), %%mm3 \n\t"
1891 "punpcklbw %%mm7, %%mm2 \n\t"
1892 "punpcklbw %%mm7, %%mm3 \n\t"
1893 "pmaddwd %%mm6, %%mm4 \n\t"
1894 "pmaddwd %%mm6, %%mm1 \n\t"
1895 "pmaddwd %%mm6, %%mm2 \n\t"
1896 "pmaddwd %%mm6, %%mm3 \n\t"
1897 #ifndef FAST_BGR2YV12
1898 "psrad $8, %%mm4 \n\t"
1899 "psrad $8, %%mm1 \n\t"
1900 "psrad $8, %%mm2 \n\t"
1901 "psrad $8, %%mm3 \n\t"
1903 "packssdw %%mm1, %%mm4 \n\t"
1904 "packssdw %%mm3, %%mm2 \n\t"
1905 "pmaddwd %%mm5, %%mm4 \n\t"
1906 "pmaddwd %%mm5, %%mm2 \n\t"
1907 "addl $24, %%ebx \n\t"
1908 "packssdw %%mm2, %%mm4 \n\t"
1909 "psraw $7, %%mm4 \n\t"
1911 "packuswb %%mm4, %%mm0 \n\t"
1912 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1914 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
1915 "addl $8, %%eax \n\t"
1917 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
1925 "movl %4, %%eax \n\t"
1926 "movq "MANGLE(w1111)", %%mm5 \n\t"
1927 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1928 "pxor %%mm7, %%mm7 \n\t"
1929 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1930 "addl %%ebx, %%ebx \n\t"
1933 PREFETCH" 64(%0, %%ebx) \n\t"
1934 PREFETCH" 64(%1, %%ebx) \n\t"
1935 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1936 "movq (%0, %%ebx), %%mm0 \n\t"
1937 "movq (%1, %%ebx), %%mm1 \n\t"
1938 "movq 6(%0, %%ebx), %%mm2 \n\t"
1939 "movq 6(%1, %%ebx), %%mm3 \n\t"
1940 PAVGB" %%mm1, %%mm0 \n\t"
1941 PAVGB" %%mm3, %%mm2 \n\t"
1942 "movq %%mm0, %%mm1 \n\t"
1943 "movq %%mm2, %%mm3 \n\t"
1944 "psrlq $24, %%mm0 \n\t"
1945 "psrlq $24, %%mm2 \n\t"
1946 PAVGB" %%mm1, %%mm0 \n\t"
1947 PAVGB" %%mm3, %%mm2 \n\t"
1948 "punpcklbw %%mm7, %%mm0 \n\t"
1949 "punpcklbw %%mm7, %%mm2 \n\t"
1951 "movd (%0, %%ebx), %%mm0 \n\t"
1952 "movd (%1, %%ebx), %%mm1 \n\t"
1953 "movd 3(%0, %%ebx), %%mm2 \n\t"
1954 "movd 3(%1, %%ebx), %%mm3 \n\t"
1955 "punpcklbw %%mm7, %%mm0 \n\t"
1956 "punpcklbw %%mm7, %%mm1 \n\t"
1957 "punpcklbw %%mm7, %%mm2 \n\t"
1958 "punpcklbw %%mm7, %%mm3 \n\t"
1959 "paddw %%mm1, %%mm0 \n\t"
1960 "paddw %%mm3, %%mm2 \n\t"
1961 "paddw %%mm2, %%mm0 \n\t"
1962 "movd 6(%0, %%ebx), %%mm4 \n\t"
1963 "movd 6(%1, %%ebx), %%mm1 \n\t"
1964 "movd 9(%0, %%ebx), %%mm2 \n\t"
1965 "movd 9(%1, %%ebx), %%mm3 \n\t"
1966 "punpcklbw %%mm7, %%mm4 \n\t"
1967 "punpcklbw %%mm7, %%mm1 \n\t"
1968 "punpcklbw %%mm7, %%mm2 \n\t"
1969 "punpcklbw %%mm7, %%mm3 \n\t"
1970 "paddw %%mm1, %%mm4 \n\t"
1971 "paddw %%mm3, %%mm2 \n\t"
1972 "paddw %%mm4, %%mm2 \n\t"
1973 "psrlw $2, %%mm0 \n\t"
1974 "psrlw $2, %%mm2 \n\t"
1976 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1977 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1979 "pmaddwd %%mm0, %%mm1 \n\t"
1980 "pmaddwd %%mm2, %%mm3 \n\t"
1981 "pmaddwd %%mm6, %%mm0 \n\t"
1982 "pmaddwd %%mm6, %%mm2 \n\t"
1983 #ifndef FAST_BGR2YV12
1984 "psrad $8, %%mm0 \n\t"
1985 "psrad $8, %%mm1 \n\t"
1986 "psrad $8, %%mm2 \n\t"
1987 "psrad $8, %%mm3 \n\t"
1989 "packssdw %%mm2, %%mm0 \n\t"
1990 "packssdw %%mm3, %%mm1 \n\t"
1991 "pmaddwd %%mm5, %%mm0 \n\t"
1992 "pmaddwd %%mm5, %%mm1 \n\t"
1993 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1994 "psraw $7, %%mm0 \n\t"
1996 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1997 "movq 12(%0, %%ebx), %%mm4 \n\t"
1998 "movq 12(%1, %%ebx), %%mm1 \n\t"
1999 "movq 18(%0, %%ebx), %%mm2 \n\t"
2000 "movq 18(%1, %%ebx), %%mm3 \n\t"
2001 PAVGB" %%mm1, %%mm4 \n\t"
2002 PAVGB" %%mm3, %%mm2 \n\t"
2003 "movq %%mm4, %%mm1 \n\t"
2004 "movq %%mm2, %%mm3 \n\t"
2005 "psrlq $24, %%mm4 \n\t"
2006 "psrlq $24, %%mm2 \n\t"
2007 PAVGB" %%mm1, %%mm4 \n\t"
2008 PAVGB" %%mm3, %%mm2 \n\t"
2009 "punpcklbw %%mm7, %%mm4 \n\t"
2010 "punpcklbw %%mm7, %%mm2 \n\t"
2012 "movd 12(%0, %%ebx), %%mm4 \n\t"
2013 "movd 12(%1, %%ebx), %%mm1 \n\t"
2014 "movd 15(%0, %%ebx), %%mm2 \n\t"
2015 "movd 15(%1, %%ebx), %%mm3 \n\t"
2016 "punpcklbw %%mm7, %%mm4 \n\t"
2017 "punpcklbw %%mm7, %%mm1 \n\t"
2018 "punpcklbw %%mm7, %%mm2 \n\t"
2019 "punpcklbw %%mm7, %%mm3 \n\t"
2020 "paddw %%mm1, %%mm4 \n\t"
2021 "paddw %%mm3, %%mm2 \n\t"
2022 "paddw %%mm2, %%mm4 \n\t"
2023 "movd 18(%0, %%ebx), %%mm5 \n\t"
2024 "movd 18(%1, %%ebx), %%mm1 \n\t"
2025 "movd 21(%0, %%ebx), %%mm2 \n\t"
2026 "movd 21(%1, %%ebx), %%mm3 \n\t"
2027 "punpcklbw %%mm7, %%mm5 \n\t"
2028 "punpcklbw %%mm7, %%mm1 \n\t"
2029 "punpcklbw %%mm7, %%mm2 \n\t"
2030 "punpcklbw %%mm7, %%mm3 \n\t"
2031 "paddw %%mm1, %%mm5 \n\t"
2032 "paddw %%mm3, %%mm2 \n\t"
2033 "paddw %%mm5, %%mm2 \n\t"
2034 "movq "MANGLE(w1111)", %%mm5 \n\t"
2035 "psrlw $2, %%mm4 \n\t"
2036 "psrlw $2, %%mm2 \n\t"
2038 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2039 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2041 "pmaddwd %%mm4, %%mm1 \n\t"
2042 "pmaddwd %%mm2, %%mm3 \n\t"
2043 "pmaddwd %%mm6, %%mm4 \n\t"
2044 "pmaddwd %%mm6, %%mm2 \n\t"
2045 #ifndef FAST_BGR2YV12
2046 "psrad $8, %%mm4 \n\t"
2047 "psrad $8, %%mm1 \n\t"
2048 "psrad $8, %%mm2 \n\t"
2049 "psrad $8, %%mm3 \n\t"
2051 "packssdw %%mm2, %%mm4 \n\t"
2052 "packssdw %%mm3, %%mm1 \n\t"
2053 "pmaddwd %%mm5, %%mm4 \n\t"
2054 "pmaddwd %%mm5, %%mm1 \n\t"
2055 "addl $24, %%ebx \n\t"
2056 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2057 "psraw $7, %%mm4 \n\t"
2059 "movq %%mm0, %%mm1 \n\t"
2060 "punpckldq %%mm4, %%mm0 \n\t"
2061 "punpckhdq %%mm4, %%mm1 \n\t"
2062 "packsswb %%mm1, %%mm0 \n\t"
2063 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2065 "movd %%mm0, (%2, %%eax) \n\t"
2066 "punpckhdq %%mm0, %%mm0 \n\t"
2067 "movd %%mm0, (%3, %%eax) \n\t"
2068 "addl $4, %%eax \n\t"
2070 : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2074 udst += chromStride;
2075 vdst += chromStride;
2079 asm volatile( EMMS" \n\t"
2085 for(; y<height; y+=2)
2088 for(i=0; i<chromWidth; i++)
2090 unsigned int b= src[6*i+0];
2091 unsigned int g= src[6*i+1];
2092 unsigned int r= src[6*i+2];
2094 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2095 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2096 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2106 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2112 for(i=0; i<chromWidth; i++)
2114 unsigned int b= src[6*i+0];
2115 unsigned int g= src[6*i+1];
2116 unsigned int r= src[6*i+2];
2118 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2126 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2129 udst += chromStride;
2130 vdst += chromStride;
/*
 * interleaveBytes: interleave two byte planes into one packed plane,
 * row by row: dest[2*w] = src1[w], dest[2*w+1] = src2[w].
 * SSE2 path handles 16 bytes per iteration, MMX path likewise (two
 * 8-byte loads), and a scalar loop covers the tail / non-SIMD build.
 * NOTE(review): the movdqa loads require 16-byte-aligned src1/src2 --
 * confirm callers guarantee this, otherwise this faults.
 * NOTE(review): the SIMD bound "width-15" is unsigned; for width < 15
 * it wraps to a huge value -- confirm the (elided) guard around the
 * asm prevents entry in that case.
 */
2136 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2137 unsigned width, unsigned height, unsigned src1Stride,
2138 unsigned src2Stride, unsigned dstStride){
2141 for(h=0; h < height; h++)
/* SSE2 path: unpack low/high halves of 16 bytes from each source */
2148 "xorl %%eax, %%eax \n\t"
2150 PREFETCH" 64(%1, %%eax) \n\t"
2151 PREFETCH" 64(%2, %%eax) \n\t"
2152 "movdqa (%1, %%eax), %%xmm0 \n\t"
2153 "movdqa (%1, %%eax), %%xmm1 \n\t"
2154 "movdqa (%2, %%eax), %%xmm2 \n\t"
2155 "punpcklbw %%xmm2, %%xmm0 \n\t"
2156 "punpckhbw %%xmm2, %%xmm1 \n\t"
2157 "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2158 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2159 "addl $16, %%eax \n\t"
2160 "cmpl %3, %%eax \n\t"
2162 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
/* MMX path: same interleave, 16 bytes per iteration via two movq pairs */
2167 "xorl %%eax, %%eax \n\t"
2169 PREFETCH" 64(%1, %%eax) \n\t"
2170 PREFETCH" 64(%2, %%eax) \n\t"
2171 "movq (%1, %%eax), %%mm0 \n\t"
2172 "movq 8(%1, %%eax), %%mm2 \n\t"
2173 "movq %%mm0, %%mm1 \n\t"
2174 "movq %%mm2, %%mm3 \n\t"
2175 "movq (%2, %%eax), %%mm4 \n\t"
2176 "movq 8(%2, %%eax), %%mm5 \n\t"
2177 "punpcklbw %%mm4, %%mm0 \n\t"
2178 "punpckhbw %%mm4, %%mm1 \n\t"
2179 "punpcklbw %%mm5, %%mm2 \n\t"
2180 "punpckhbw %%mm5, %%mm3 \n\t"
2181 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
2182 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
2183 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2184 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2185 "addl $16, %%eax \n\t"
2186 "cmpl %3, %%eax \n\t"
2188 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
/* scalar tail after the SIMD loop (starts at the last multiple of 16) */
2192 for(w= (width&(~15)); w < width; w++)
2194 dest[2*w+0] = src1[w];
2195 dest[2*w+1] = src2[w];
/* pure-C fallback: whole row */
2198 for(w=0; w < width; w++)
2200 dest[2*w+0] = src1[w];
2201 dest[2*w+1] = src2[w];
/*
 * vu9_to_vu12: double each chroma plane horizontally and vertically.
 * Each source row (sampled at y>>1, i.e. reused for two output rows)
 * has every byte duplicated horizontally: d[2*x] = d[2*x+1] = s[x].
 * The two near-identical loops below process src1->dst1 and src2->dst2;
 * the MMX body expands 32 source bytes into 64 output bytes per
 * iteration using punpcklbw/punpckhbw with a register against itself,
 * and a scalar loop handles the remaining columns.
 */
2217 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2218 uint8_t *dst1, uint8_t *dst2,
2219 unsigned width, unsigned height,
2220 unsigned srcStride1, unsigned srcStride2,
2221 unsigned dstStride1, unsigned dstStride2)
2224 w=width/2; h=height/2;
2229 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* first plane: src1 -> dst1 */
2232 const uint8_t* s1=src1+srcStride1*(y>>1);
2233 uint8_t* d=dst1+dstStride1*y;
2241 "movq %1, %%mm0\n\t"
2242 "movq 8%1, %%mm2\n\t"
2243 "movq 16%1, %%mm4\n\t"
2244 "movq 24%1, %%mm6\n\t"
2245 "movq %%mm0, %%mm1\n\t"
2246 "movq %%mm2, %%mm3\n\t"
2247 "movq %%mm4, %%mm5\n\t"
2248 "movq %%mm6, %%mm7\n\t"
/* unpack each register with itself: AABB... byte duplication */
2249 "punpcklbw %%mm0, %%mm0\n\t"
2250 "punpckhbw %%mm1, %%mm1\n\t"
2251 "punpcklbw %%mm2, %%mm2\n\t"
2252 "punpckhbw %%mm3, %%mm3\n\t"
2253 "punpcklbw %%mm4, %%mm4\n\t"
2254 "punpckhbw %%mm5, %%mm5\n\t"
2255 "punpcklbw %%mm6, %%mm6\n\t"
2256 "punpckhbw %%mm7, %%mm7\n\t"
2257 MOVNTQ" %%mm0, %0\n\t"
2258 MOVNTQ" %%mm1, 8%0\n\t"
2259 MOVNTQ" %%mm2, 16%0\n\t"
2260 MOVNTQ" %%mm3, 24%0\n\t"
2261 MOVNTQ" %%mm4, 32%0\n\t"
2262 MOVNTQ" %%mm5, 40%0\n\t"
2263 MOVNTQ" %%mm6, 48%0\n\t"
2264 MOVNTQ" %%mm7, 56%0"
/* scalar tail for columns the MMX loop did not cover */
2270 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* second plane: src2 -> dst2, identical expansion */
2273 const uint8_t* s2=src2+srcStride2*(y>>1);
2274 uint8_t* d=dst2+dstStride2*y;
2282 "movq %1, %%mm0\n\t"
2283 "movq 8%1, %%mm2\n\t"
2284 "movq 16%1, %%mm4\n\t"
2285 "movq 24%1, %%mm6\n\t"
2286 "movq %%mm0, %%mm1\n\t"
2287 "movq %%mm2, %%mm3\n\t"
2288 "movq %%mm4, %%mm5\n\t"
2289 "movq %%mm6, %%mm7\n\t"
2290 "punpcklbw %%mm0, %%mm0\n\t"
2291 "punpckhbw %%mm1, %%mm1\n\t"
2292 "punpcklbw %%mm2, %%mm2\n\t"
2293 "punpckhbw %%mm3, %%mm3\n\t"
2294 "punpcklbw %%mm4, %%mm4\n\t"
2295 "punpckhbw %%mm5, %%mm5\n\t"
2296 "punpcklbw %%mm6, %%mm6\n\t"
2297 "punpckhbw %%mm7, %%mm7\n\t"
2298 MOVNTQ" %%mm0, %0\n\t"
2299 MOVNTQ" %%mm1, 8%0\n\t"
2300 MOVNTQ" %%mm2, 16%0\n\t"
2301 MOVNTQ" %%mm3, 24%0\n\t"
2302 MOVNTQ" %%mm4, 32%0\n\t"
2303 MOVNTQ" %%mm5, 40%0\n\t"
2304 MOVNTQ" %%mm6, 48%0\n\t"
2305 MOVNTQ" %%mm7, 56%0"
2311 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2322 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2324 unsigned width, unsigned height,
2325 unsigned srcStride1, unsigned srcStride2,
2326 unsigned srcStride3, unsigned dstStride)
2328 unsigned y,x,x2,w,h;
2329 w=width/2; h=height;
2335 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)),"m"(*(src3+srcStride3)):"memory");
2338 const uint8_t* yp=src1+srcStride1*y;
2339 const uint8_t* up=src2+srcStride2*(y>>2);
2340 const uint8_t* vp=src3+srcStride3*(y>>2);
2341 uint8_t* d=dst+dstStride*y;
2345 for(;x<w;x+=8,x2+=32)
2351 "movq %1, %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2352 "movq %2, %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2353 "movq %3, %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2354 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2355 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2356 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2357 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2358 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2359 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2360 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2362 "movq %%mm1, %%mm6\n\t"
2363 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2364 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2365 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2366 MOVNTQ" %%mm0, %0\n\t"
2367 MOVNTQ" %%mm3, 8%0\n\t"
2369 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2370 "movq 8%1, %%mm0\n\t"
2371 "movq %%mm0, %%mm3\n\t"
2372 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2373 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2374 MOVNTQ" %%mm0, 16%0\n\t"
2375 MOVNTQ" %%mm3, 24%0\n\t"
2377 "movq %%mm4, %%mm6\n\t"
2378 "movq 16%1, %%mm0\n\t"
2379 "movq %%mm0, %%mm3\n\t"
2380 "punpcklbw %%mm5, %%mm4\n\t"
2381 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2382 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2383 MOVNTQ" %%mm0, 32%0\n\t"
2384 MOVNTQ" %%mm3, 40%0\n\t"
2386 "punpckhbw %%mm5, %%mm6\n\t"
2387 "movq 24%1, %%mm0\n\t"
2388 "movq %%mm0, %%mm3\n\t"
2389 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2390 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2391 MOVNTQ" %%mm0, 48%0\n\t"
2392 MOVNTQ" %%mm3, 56%0\n\t"
2395 :"m"(yp[x2]),"m"(up[x]),"m"(vp[x])