3 * rgb2rgb.c, Software RGB to RGB convertor
4 * pluralize by Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9 * lot of big-endian byteorder fixes by Alex Beregszaszi
13 #include <inttypes.h> /* for __WORDSIZE */
18 // #warning You have misconfigured system and probably will lose performance!
19 #define __WORDSIZE MP_WORDSIZE
37 #define PREFETCH "prefetch"
38 #define PREFETCHW "prefetchw"
39 #define PAVGB "pavgusb"
40 #elif defined ( HAVE_MMX2 )
41 #define PREFETCH "prefetchnta"
42 #define PREFETCHW "prefetcht0"
49 #define PREFETCH "/nop"
50 #define PREFETCHW "/nop"
55 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
62 #define MOVNTQ "movntq"
63 #define SFENCE "sfence"
/* rgb24to32: expand packed 24-bit RGB to 32-bit pixels (one pad byte each).
 * MMX path: load two 3-byte pixels per register via movd+punpckldq at
 * byte offsets 0/3, 6/9, 12/15, 18/21, mask off the overlap byte with
 * mask32 (preloaded into mm7), and stream four qwords out with MOVNTQ.
 * NOTE(review): elided view - loop headers, the scalar fallback and
 * closing braces between the numbered lines are not visible here. */
73 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
76 const uint8_t *s = src;
79 const uint8_t *mm_end;
/* Warm the cache for the first source block before entering the loop. */
83 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 holds the 0x00FFFFFF-style mask used to clear the pad byte. */
85 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
91 "punpckldq 3%1, %%mm0\n\t"
93 "punpckldq 9%1, %%mm1\n\t"
94 "movd 12%1, %%mm2\n\t"
95 "punpckldq 15%1, %%mm2\n\t"
96 "movd 18%1, %%mm3\n\t"
97 "punpckldq 21%1, %%mm3\n\t"
98 "pand %%mm7, %%mm0\n\t"
99 "pand %%mm7, %%mm1\n\t"
100 "pand %%mm7, %%mm2\n\t"
101 "pand %%mm7, %%mm3\n\t"
102 MOVNTQ" %%mm0, %0\n\t"
103 MOVNTQ" %%mm1, 8%0\n\t"
104 MOVNTQ" %%mm2, 16%0\n\t"
/* Flush non-temporal stores and leave MMX state before the C tail. */
112 __asm __volatile(SFENCE:::"memory");
113 __asm __volatile(EMMS:::"memory");
117 #ifdef WORDS_BIGENDIAN
118 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
/* rgb32to24: drop the pad byte of 32-bit pixels, packing to 24-bit RGB.
 * MMX path: load 4 qwords (8 pixels), shift copies right by 8 to slide
 * components over the pad byte, recombine with the mask24* constants, and
 * emit 3 qwords of packed output per iteration.
 * NOTE(review): elided view - the asm statement opening, the loop
 * control and the scalar fallback are not visible between these lines. */
133 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
136 const uint8_t *s = src;
139 const uint8_t *mm_end;
143 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
150 "movq 8%1, %%mm1\n\t"
151 "movq 16%1, %%mm4\n\t"
152 "movq 24%1, %%mm5\n\t"
153 "movq %%mm0, %%mm2\n\t"
154 "movq %%mm1, %%mm3\n\t"
155 "movq %%mm4, %%mm6\n\t"
156 "movq %%mm5, %%mm7\n\t"
157 "psrlq $8, %%mm2\n\t"
158 "psrlq $8, %%mm3\n\t"
159 "psrlq $8, %%mm6\n\t"
160 "psrlq $8, %%mm7\n\t"
169 "por %%mm2, %%mm0\n\t"
170 "por %%mm3, %%mm1\n\t"
171 "por %%mm6, %%mm4\n\t"
172 "por %%mm7, %%mm5\n\t"
/* Splice the 4 partially-packed qwords into 3 contiguous output qwords. */
174 "movq %%mm1, %%mm2\n\t"
175 "movq %%mm4, %%mm3\n\t"
176 "psllq $48, %%mm2\n\t"
177 "psllq $32, %%mm3\n\t"
180 "por %%mm2, %%mm0\n\t"
181 "psrlq $16, %%mm1\n\t"
182 "psrlq $32, %%mm4\n\t"
183 "psllq $16, %%mm5\n\t"
184 "por %%mm3, %%mm1\n\t"
186 "por %%mm5, %%mm4\n\t"
188 MOVNTQ" %%mm0, %0\n\t"
189 MOVNTQ" %%mm1, 8%0\n\t"
192 :"m"(*s),"m"(mask24l),
193 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
198 __asm __volatile(SFENCE:::"memory");
199 __asm __volatile(EMMS:::"memory");
203 #ifdef WORDS_BIGENDIAN
204 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
220 Original by Strepto/Astral
221 ported to gcc & bugfixed : A'rpi
222 MMX2, 3DNOW optimization by Nick Kurshev
223 32bit c version, and and&add trick by Michael Niedermayer
/* rgb15to16: convert RGB555 to RGB565 in place order.
 * Uses the and&add trick: x + (x & 0x7FE0) shifts the red+green fields
 * left one bit while leaving the 5-bit blue field untouched
 * (mask15s = 0x7FE07FE0 pattern, preloaded into mm4).
 * NOTE(review): elided view - loop headers and braces are missing here. */
225 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
227 register const uint8_t* s=src;
228 register uint8_t* d=dst;
229 register const uint8_t *end;
230 const uint8_t *mm_end;
233 __asm __volatile(PREFETCH" %0"::"m"(*s));
234 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
241 "movq 8%1, %%mm2\n\t"
242 "movq %%mm0, %%mm1\n\t"
243 "movq %%mm2, %%mm3\n\t"
244 "pand %%mm4, %%mm0\n\t"
245 "pand %%mm4, %%mm2\n\t"
/* paddw of the masked copy onto the original == shift R/G up one bit. */
246 "paddw %%mm1, %%mm0\n\t"
247 "paddw %%mm3, %%mm2\n\t"
248 MOVNTQ" %%mm0, %0\n\t"
256 __asm __volatile(SFENCE:::"memory");
257 __asm __volatile(EMMS:::"memory");
/* Scalar fallbacks: two pixels at a time (32-bit), then one (16-bit). */
262 register unsigned x= *((uint32_t *)s);
263 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
269 register unsigned short x= *((uint16_t *)s);
270 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/* rgb16to15: convert RGB565 back to RGB555.
 * Shift red+green right one bit (mask15rg in mm7 keeps the shifted R/G
 * fields), then OR back the untouched 5-bit blue field (mask15b in mm6).
 * The low bit of the 6-bit green channel is discarded.
 * NOTE(review): elided view - loop headers and braces are missing here. */
274 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
276 register const uint8_t* s=src;
277 register uint8_t* d=dst;
278 register const uint8_t *end;
279 const uint8_t *mm_end;
282 __asm __volatile(PREFETCH" %0"::"m"(*s));
283 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
284 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
291 "movq 8%1, %%mm2\n\t"
292 "movq %%mm0, %%mm1\n\t"
293 "movq %%mm2, %%mm3\n\t"
294 "psrlq $1, %%mm0\n\t"
295 "psrlq $1, %%mm2\n\t"
296 "pand %%mm7, %%mm0\n\t"
297 "pand %%mm7, %%mm2\n\t"
298 "pand %%mm6, %%mm1\n\t"
299 "pand %%mm6, %%mm3\n\t"
300 "por %%mm1, %%mm0\n\t"
301 "por %%mm3, %%mm2\n\t"
302 MOVNTQ" %%mm0, %0\n\t"
310 __asm __volatile(SFENCE:::"memory");
311 __asm __volatile(EMMS:::"memory");
/* Scalar fallbacks: two pixels (32-bit), then a single pixel (16-bit). */
316 register uint32_t x= *((uint32_t *)s);
317 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
323 register uint16_t x= *((uint16_t *)s);
324 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/* rgb32to16: pack 32-bit RGB down to RGB565 (16 bpp).
 * Two MMX variants are present: the "#if 1" branch uses a pmaddwd
 * multiply trick (mm5=mask3216g, mm6=mask3216br, mm7=mul3216, per the
 * operand list), the other uses plain shift/mask with the
 * blue/green/red_16mask constants. A scalar tail loop finishes up.
 * NOTE(review): elided view - loop control, braces, some asm lines and
 * the #else/#endif structure are not visible between these lines. */
330 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
332 const uint8_t *s = src;
335 const uint8_t *mm_end;
337 uint16_t *d = (uint16_t *)dst;
341 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
343 "movq %3, %%mm5 \n\t"
344 "movq %4, %%mm6 \n\t"
345 "movq %5, %%mm7 \n\t"
348 PREFETCH" 32(%1) \n\t"
349 "movd (%1), %%mm0 \n\t"
350 "movd 4(%1), %%mm3 \n\t"
351 "punpckldq 8(%1), %%mm0 \n\t"
352 "punpckldq 12(%1), %%mm3 \n\t"
353 "movq %%mm0, %%mm1 \n\t"
354 "movq %%mm3, %%mm4 \n\t"
355 "pand %%mm6, %%mm0 \n\t"
356 "pand %%mm6, %%mm3 \n\t"
/* pmaddwd by mul3216 aligns blue and red into their 565 positions. */
357 "pmaddwd %%mm7, %%mm0 \n\t"
358 "pmaddwd %%mm7, %%mm3 \n\t"
359 "pand %%mm5, %%mm1 \n\t"
360 "pand %%mm5, %%mm4 \n\t"
361 "por %%mm1, %%mm0 \n\t"
362 "por %%mm4, %%mm3 \n\t"
363 "psrld $5, %%mm0 \n\t"
364 "pslld $11, %%mm3 \n\t"
365 "por %%mm3, %%mm0 \n\t"
366 MOVNTQ" %%mm0, (%0) \n\t"
372 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
/* Alternate (non-multiply) MMX path below. */
375 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
379 ::"m"(red_16mask),"m"(green_16mask));
385 "movd 4%1, %%mm3\n\t"
386 "punpckldq 8%1, %%mm0\n\t"
387 "punpckldq 12%1, %%mm3\n\t"
388 "movq %%mm0, %%mm1\n\t"
389 "movq %%mm0, %%mm2\n\t"
390 "movq %%mm3, %%mm4\n\t"
391 "movq %%mm3, %%mm5\n\t"
392 "psrlq $3, %%mm0\n\t"
393 "psrlq $3, %%mm3\n\t"
396 "psrlq $5, %%mm1\n\t"
397 "psrlq $5, %%mm4\n\t"
398 "pand %%mm6, %%mm1\n\t"
399 "pand %%mm6, %%mm4\n\t"
400 "psrlq $8, %%mm2\n\t"
401 "psrlq $8, %%mm5\n\t"
402 "pand %%mm7, %%mm2\n\t"
403 "pand %%mm7, %%mm5\n\t"
404 "por %%mm1, %%mm0\n\t"
405 "por %%mm4, %%mm3\n\t"
406 "por %%mm2, %%mm0\n\t"
407 "por %%mm5, %%mm3\n\t"
408 "psllq $16, %%mm3\n\t"
409 "por %%mm3, %%mm0\n\t"
410 MOVNTQ" %%mm0, %0\n\t"
411 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
416 __asm __volatile(SFENCE:::"memory");
417 __asm __volatile(EMMS:::"memory");
/* Scalar tail: pack one 32-bit pixel into 5-6-5 bits. */
421 register int rgb = *(uint32_t*)s; s += 4;
422 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/* rgb32tobgr16: pack 32-bit RGB into BGR565 (channel order swapped
 * relative to rgb32to16 - note blue is shifted *left* 8 here, red right
 * 19). Shift/mask MMX path plus scalar tail.
 * NOTE(review): elided view - loop control and braces are missing. */
426 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
428 const uint8_t *s = src;
431 const uint8_t *mm_end;
433 uint16_t *d = (uint16_t *)dst;
436 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
440 ::"m"(red_16mask),"m"(green_16mask));
447 "movd 4%1, %%mm3\n\t"
448 "punpckldq 8%1, %%mm0\n\t"
449 "punpckldq 12%1, %%mm3\n\t"
450 "movq %%mm0, %%mm1\n\t"
451 "movq %%mm0, %%mm2\n\t"
452 "movq %%mm3, %%mm4\n\t"
453 "movq %%mm3, %%mm5\n\t"
454 "psllq $8, %%mm0\n\t"
455 "psllq $8, %%mm3\n\t"
456 "pand %%mm7, %%mm0\n\t"
457 "pand %%mm7, %%mm3\n\t"
458 "psrlq $5, %%mm1\n\t"
459 "psrlq $5, %%mm4\n\t"
460 "pand %%mm6, %%mm1\n\t"
461 "pand %%mm6, %%mm4\n\t"
462 "psrlq $19, %%mm2\n\t"
463 "psrlq $19, %%mm5\n\t"
466 "por %%mm1, %%mm0\n\t"
467 "por %%mm4, %%mm3\n\t"
468 "por %%mm2, %%mm0\n\t"
469 "por %%mm5, %%mm3\n\t"
470 "psllq $16, %%mm3\n\t"
471 "por %%mm3, %%mm0\n\t"
472 MOVNTQ" %%mm0, %0\n\t"
473 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
477 __asm __volatile(SFENCE:::"memory");
478 __asm __volatile(EMMS:::"memory");
/* Scalar tail mirrors the asm: B<<8 | G>>5 | R>>19. */
482 register int rgb = *(uint32_t*)s; s += 4;
483 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
/* rgb32to15: pack 32-bit RGB down to RGB555 (15 bpp).
 * Same structure as rgb32to16 but with 5-bit green: the pmaddwd branch
 * uses mask3215g/mul3215 and shifts of 6/10, the shift/mask branch uses
 * the *_15mask constants and shifts of 3/6/9.
 * NOTE(review): elided view - loop control, braces and the
 * #else/#endif structure are not visible between these lines. */
487 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
489 const uint8_t *s = src;
492 const uint8_t *mm_end;
494 uint16_t *d = (uint16_t *)dst;
498 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
500 "movq %3, %%mm5 \n\t"
501 "movq %4, %%mm6 \n\t"
502 "movq %5, %%mm7 \n\t"
505 PREFETCH" 32(%1) \n\t"
506 "movd (%1), %%mm0 \n\t"
507 "movd 4(%1), %%mm3 \n\t"
508 "punpckldq 8(%1), %%mm0 \n\t"
509 "punpckldq 12(%1), %%mm3 \n\t"
510 "movq %%mm0, %%mm1 \n\t"
511 "movq %%mm3, %%mm4 \n\t"
512 "pand %%mm6, %%mm0 \n\t"
513 "pand %%mm6, %%mm3 \n\t"
514 "pmaddwd %%mm7, %%mm0 \n\t"
515 "pmaddwd %%mm7, %%mm3 \n\t"
516 "pand %%mm5, %%mm1 \n\t"
517 "pand %%mm5, %%mm4 \n\t"
518 "por %%mm1, %%mm0 \n\t"
519 "por %%mm4, %%mm3 \n\t"
520 "psrld $6, %%mm0 \n\t"
521 "pslld $10, %%mm3 \n\t"
522 "por %%mm3, %%mm0 \n\t"
523 MOVNTQ" %%mm0, (%0) \n\t"
529 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
/* Alternate (non-multiply) MMX path below. */
532 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
536 ::"m"(red_15mask),"m"(green_15mask));
542 "movd 4%1, %%mm3\n\t"
543 "punpckldq 8%1, %%mm0\n\t"
544 "punpckldq 12%1, %%mm3\n\t"
545 "movq %%mm0, %%mm1\n\t"
546 "movq %%mm0, %%mm2\n\t"
547 "movq %%mm3, %%mm4\n\t"
548 "movq %%mm3, %%mm5\n\t"
549 "psrlq $3, %%mm0\n\t"
550 "psrlq $3, %%mm3\n\t"
553 "psrlq $6, %%mm1\n\t"
554 "psrlq $6, %%mm4\n\t"
555 "pand %%mm6, %%mm1\n\t"
556 "pand %%mm6, %%mm4\n\t"
557 "psrlq $9, %%mm2\n\t"
558 "psrlq $9, %%mm5\n\t"
559 "pand %%mm7, %%mm2\n\t"
560 "pand %%mm7, %%mm5\n\t"
561 "por %%mm1, %%mm0\n\t"
562 "por %%mm4, %%mm3\n\t"
563 "por %%mm2, %%mm0\n\t"
564 "por %%mm5, %%mm3\n\t"
565 "psllq $16, %%mm3\n\t"
566 "por %%mm3, %%mm0\n\t"
567 MOVNTQ" %%mm0, %0\n\t"
568 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
573 __asm __volatile(SFENCE:::"memory");
574 __asm __volatile(EMMS:::"memory");
/* Scalar tail: pack one 32-bit pixel into 5-5-5 bits. */
578 register int rgb = *(uint32_t*)s; s += 4;
579 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
/* rgb32tobgr15: pack 32-bit RGB into BGR555 (channel order swapped:
 * blue shifted left 7, red right 19). Shift/mask MMX path + scalar tail.
 * NOTE(review): elided view - loop control and braces are missing. */
583 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
585 const uint8_t *s = src;
588 const uint8_t *mm_end;
590 uint16_t *d = (uint16_t *)dst;
593 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
597 ::"m"(red_15mask),"m"(green_15mask));
604 "movd 4%1, %%mm3\n\t"
605 "punpckldq 8%1, %%mm0\n\t"
606 "punpckldq 12%1, %%mm3\n\t"
607 "movq %%mm0, %%mm1\n\t"
608 "movq %%mm0, %%mm2\n\t"
609 "movq %%mm3, %%mm4\n\t"
610 "movq %%mm3, %%mm5\n\t"
611 "psllq $7, %%mm0\n\t"
612 "psllq $7, %%mm3\n\t"
613 "pand %%mm7, %%mm0\n\t"
614 "pand %%mm7, %%mm3\n\t"
615 "psrlq $6, %%mm1\n\t"
616 "psrlq $6, %%mm4\n\t"
617 "pand %%mm6, %%mm1\n\t"
618 "pand %%mm6, %%mm4\n\t"
619 "psrlq $19, %%mm2\n\t"
620 "psrlq $19, %%mm5\n\t"
623 "por %%mm1, %%mm0\n\t"
624 "por %%mm4, %%mm3\n\t"
625 "por %%mm2, %%mm0\n\t"
626 "por %%mm5, %%mm3\n\t"
627 "psllq $16, %%mm3\n\t"
628 "por %%mm3, %%mm0\n\t"
629 MOVNTQ" %%mm0, %0\n\t"
630 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
634 __asm __volatile(SFENCE:::"memory");
635 __asm __volatile(EMMS:::"memory");
/* Scalar tail mirrors the asm: B<<7 | G>>6 | R>>19. */
639 register int rgb = *(uint32_t*)s; s += 4;
640 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
/* rgb24to16: pack 24-bit RGB to RGB565. Same shift/mask scheme as the
 * 32->16 path, but pixels are loaded at 3-byte strides (offsets 0/3/6/9).
 * NOTE(review): elided view - loop control, braces and the scalar loop
 * header (b/g/r byte loads) are not visible between these lines. */
644 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
646 const uint8_t *s = src;
649 const uint8_t *mm_end;
651 uint16_t *d = (uint16_t *)dst;
654 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
658 ::"m"(red_16mask),"m"(green_16mask));
665 "movd 3%1, %%mm3\n\t"
666 "punpckldq 6%1, %%mm0\n\t"
667 "punpckldq 9%1, %%mm3\n\t"
668 "movq %%mm0, %%mm1\n\t"
669 "movq %%mm0, %%mm2\n\t"
670 "movq %%mm3, %%mm4\n\t"
671 "movq %%mm3, %%mm5\n\t"
672 "psrlq $3, %%mm0\n\t"
673 "psrlq $3, %%mm3\n\t"
676 "psrlq $5, %%mm1\n\t"
677 "psrlq $5, %%mm4\n\t"
678 "pand %%mm6, %%mm1\n\t"
679 "pand %%mm6, %%mm4\n\t"
680 "psrlq $8, %%mm2\n\t"
681 "psrlq $8, %%mm5\n\t"
682 "pand %%mm7, %%mm2\n\t"
683 "pand %%mm7, %%mm5\n\t"
684 "por %%mm1, %%mm0\n\t"
685 "por %%mm4, %%mm3\n\t"
686 "por %%mm2, %%mm0\n\t"
687 "por %%mm5, %%mm3\n\t"
688 "psllq $16, %%mm3\n\t"
689 "por %%mm3, %%mm0\n\t"
690 MOVNTQ" %%mm0, %0\n\t"
691 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
695 __asm __volatile(SFENCE:::"memory");
696 __asm __volatile(EMMS:::"memory");
/* Scalar tail: b, g, r are per-byte loads (elided above this line). */
703 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/* rgb24tobgr16: pack 24-bit pixels to 565 with swapped channel order in
 * the MMX path (psllq $8 / psrlq $19 like rgb32tobgr16) while loading at
 * 3-byte strides.
 * NOTE(review): elided view - loop control, braces and the scalar loop
 * header are not visible between these lines; the scalar tail shown
 * here matches rgb24to16's - confirm against the full file. */
707 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
709 const uint8_t *s = src;
712 const uint8_t *mm_end;
714 uint16_t *d = (uint16_t *)dst;
717 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
721 ::"m"(red_16mask),"m"(green_16mask));
728 "movd 3%1, %%mm3\n\t"
729 "punpckldq 6%1, %%mm0\n\t"
730 "punpckldq 9%1, %%mm3\n\t"
731 "movq %%mm0, %%mm1\n\t"
732 "movq %%mm0, %%mm2\n\t"
733 "movq %%mm3, %%mm4\n\t"
734 "movq %%mm3, %%mm5\n\t"
735 "psllq $8, %%mm0\n\t"
736 "psllq $8, %%mm3\n\t"
737 "pand %%mm7, %%mm0\n\t"
738 "pand %%mm7, %%mm3\n\t"
739 "psrlq $5, %%mm1\n\t"
740 "psrlq $5, %%mm4\n\t"
741 "pand %%mm6, %%mm1\n\t"
742 "pand %%mm6, %%mm4\n\t"
743 "psrlq $19, %%mm2\n\t"
744 "psrlq $19, %%mm5\n\t"
747 "por %%mm1, %%mm0\n\t"
748 "por %%mm4, %%mm3\n\t"
749 "por %%mm2, %%mm0\n\t"
750 "por %%mm5, %%mm3\n\t"
751 "psllq $16, %%mm3\n\t"
752 "por %%mm3, %%mm0\n\t"
753 MOVNTQ" %%mm0, %0\n\t"
754 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
758 __asm __volatile(SFENCE:::"memory");
759 __asm __volatile(EMMS:::"memory");
766 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/* rgb24to15: pack 24-bit RGB to RGB555 - 3-byte-stride loads with the
 * 15-bit shift set (3/6/9) and the *_15mask constants.
 * NOTE(review): elided view - loop control, braces and the scalar loop
 * header are not visible between these lines. */
770 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
772 const uint8_t *s = src;
775 const uint8_t *mm_end;
777 uint16_t *d = (uint16_t *)dst;
780 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
784 ::"m"(red_15mask),"m"(green_15mask));
791 "movd 3%1, %%mm3\n\t"
792 "punpckldq 6%1, %%mm0\n\t"
793 "punpckldq 9%1, %%mm3\n\t"
794 "movq %%mm0, %%mm1\n\t"
795 "movq %%mm0, %%mm2\n\t"
796 "movq %%mm3, %%mm4\n\t"
797 "movq %%mm3, %%mm5\n\t"
798 "psrlq $3, %%mm0\n\t"
799 "psrlq $3, %%mm3\n\t"
802 "psrlq $6, %%mm1\n\t"
803 "psrlq $6, %%mm4\n\t"
804 "pand %%mm6, %%mm1\n\t"
805 "pand %%mm6, %%mm4\n\t"
806 "psrlq $9, %%mm2\n\t"
807 "psrlq $9, %%mm5\n\t"
808 "pand %%mm7, %%mm2\n\t"
809 "pand %%mm7, %%mm5\n\t"
810 "por %%mm1, %%mm0\n\t"
811 "por %%mm4, %%mm3\n\t"
812 "por %%mm2, %%mm0\n\t"
813 "por %%mm5, %%mm3\n\t"
814 "psllq $16, %%mm3\n\t"
815 "por %%mm3, %%mm0\n\t"
816 MOVNTQ" %%mm0, %0\n\t"
817 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
821 __asm __volatile(SFENCE:::"memory");
822 __asm __volatile(EMMS:::"memory");
/* Scalar tail: b, g, r are per-byte loads (elided above this line). */
829 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/* rgb24tobgr15: 24-bit to 555 with swapped channel order in the MMX path
 * (psllq $7 / psrlq $19 like rgb32tobgr15), 3-byte-stride loads.
 * NOTE(review): elided view - loop control, braces and the scalar loop
 * header are not visible between these lines; the scalar tail shown
 * here matches rgb24to15's - confirm against the full file. */
833 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
835 const uint8_t *s = src;
838 const uint8_t *mm_end;
840 uint16_t *d = (uint16_t *)dst;
843 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
847 ::"m"(red_15mask),"m"(green_15mask));
854 "movd 3%1, %%mm3\n\t"
855 "punpckldq 6%1, %%mm0\n\t"
856 "punpckldq 9%1, %%mm3\n\t"
857 "movq %%mm0, %%mm1\n\t"
858 "movq %%mm0, %%mm2\n\t"
859 "movq %%mm3, %%mm4\n\t"
860 "movq %%mm3, %%mm5\n\t"
861 "psllq $7, %%mm0\n\t"
862 "psllq $7, %%mm3\n\t"
863 "pand %%mm7, %%mm0\n\t"
864 "pand %%mm7, %%mm3\n\t"
865 "psrlq $6, %%mm1\n\t"
866 "psrlq $6, %%mm4\n\t"
867 "pand %%mm6, %%mm1\n\t"
868 "pand %%mm6, %%mm4\n\t"
869 "psrlq $19, %%mm2\n\t"
870 "psrlq $19, %%mm5\n\t"
873 "por %%mm1, %%mm0\n\t"
874 "por %%mm4, %%mm3\n\t"
875 "por %%mm2, %%mm0\n\t"
876 "por %%mm5, %%mm3\n\t"
877 "psllq $16, %%mm3\n\t"
878 "por %%mm3, %%mm0\n\t"
879 MOVNTQ" %%mm0, %0\n\t"
880 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
884 __asm __volatile(SFENCE:::"memory");
885 __asm __volatile(EMMS:::"memory");
892 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
897 Here I use a less accurate approximation by simply
898 left-shifting the input
899 value and filling the low order bits with
900 zeroes. This method improves png's
901 compression but this scheme cannot reproduce white exactly, since it does not
902 generate an all-ones maximum value; the net effect is to darken the
905 The better method should be "left bit replication":
915 | Leftmost Bits Repeated to Fill Open Bits
/* rgb15to24: expand RGB555 to 24-bit RGB. Each 555 field is isolated
 * with mask15b/mask15g/mask15r, shifted into byte position (left-shift
 * fill - low bits are zero, see the comment block above), spread to
 * 16-bit lanes via punpck*wd against mmx_null, then repacked to 24 bpp
 * with the same mask24* splice used by rgb32to24 ("Borrowed 32 to 24").
 * NOTE(review): elided view - asm statement openings, loop control and
 * braces between the numbered lines are not visible here. */
919 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
923 const uint16_t *mm_end;
925 uint8_t *d = (uint8_t *)dst;
926 const uint16_t *s = (uint16_t *)src;
927 end = s + src_size/2;
929 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* First 4 pixels: shift B up 3, G down 2, R down 7 into byte lanes. */
941 "psllq $3, %%mm0\n\t"
942 "psrlq $2, %%mm1\n\t"
943 "psrlq $7, %%mm2\n\t"
944 "movq %%mm0, %%mm3\n\t"
945 "movq %%mm1, %%mm4\n\t"
946 "movq %%mm2, %%mm5\n\t"
947 "punpcklwd %5, %%mm0\n\t"
948 "punpcklwd %5, %%mm1\n\t"
949 "punpcklwd %5, %%mm2\n\t"
950 "punpckhwd %5, %%mm3\n\t"
951 "punpckhwd %5, %%mm4\n\t"
952 "punpckhwd %5, %%mm5\n\t"
953 "psllq $8, %%mm1\n\t"
954 "psllq $16, %%mm2\n\t"
955 "por %%mm1, %%mm0\n\t"
956 "por %%mm2, %%mm0\n\t"
957 "psllq $8, %%mm4\n\t"
958 "psllq $16, %%mm5\n\t"
959 "por %%mm4, %%mm3\n\t"
960 "por %%mm5, %%mm3\n\t"
/* Park the first 4 expanded pixels in mm6/mm7 for the repack stage. */
962 "movq %%mm0, %%mm6\n\t"
963 "movq %%mm3, %%mm7\n\t"
/* Second 4 pixels, same expansion. */
965 "movq 8%1, %%mm0\n\t"
966 "movq 8%1, %%mm1\n\t"
967 "movq 8%1, %%mm2\n\t"
971 "psllq $3, %%mm0\n\t"
972 "psrlq $2, %%mm1\n\t"
973 "psrlq $7, %%mm2\n\t"
974 "movq %%mm0, %%mm3\n\t"
975 "movq %%mm1, %%mm4\n\t"
976 "movq %%mm2, %%mm5\n\t"
977 "punpcklwd %5, %%mm0\n\t"
978 "punpcklwd %5, %%mm1\n\t"
979 "punpcklwd %5, %%mm2\n\t"
980 "punpckhwd %5, %%mm3\n\t"
981 "punpckhwd %5, %%mm4\n\t"
982 "punpckhwd %5, %%mm5\n\t"
983 "psllq $8, %%mm1\n\t"
984 "psllq $16, %%mm2\n\t"
985 "por %%mm1, %%mm0\n\t"
986 "por %%mm2, %%mm0\n\t"
987 "psllq $8, %%mm4\n\t"
988 "psllq $16, %%mm5\n\t"
989 "por %%mm4, %%mm3\n\t"
990 "por %%mm5, %%mm3\n\t"
993 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
995 /* Borrowed 32 to 24 */
997 "movq %%mm0, %%mm4\n\t"
998 "movq %%mm3, %%mm5\n\t"
999 "movq %%mm6, %%mm0\n\t"
1000 "movq %%mm7, %%mm1\n\t"
1002 "movq %%mm4, %%mm6\n\t"
1003 "movq %%mm5, %%mm7\n\t"
1004 "movq %%mm0, %%mm2\n\t"
1005 "movq %%mm1, %%mm3\n\t"
1007 "psrlq $8, %%mm2\n\t"
1008 "psrlq $8, %%mm3\n\t"
1009 "psrlq $8, %%mm6\n\t"
1010 "psrlq $8, %%mm7\n\t"
1011 "pand %2, %%mm0\n\t"
1012 "pand %2, %%mm1\n\t"
1013 "pand %2, %%mm4\n\t"
1014 "pand %2, %%mm5\n\t"
1015 "pand %3, %%mm2\n\t"
1016 "pand %3, %%mm3\n\t"
1017 "pand %3, %%mm6\n\t"
1018 "pand %3, %%mm7\n\t"
1019 "por %%mm2, %%mm0\n\t"
1020 "por %%mm3, %%mm1\n\t"
1021 "por %%mm6, %%mm4\n\t"
1022 "por %%mm7, %%mm5\n\t"
1024 "movq %%mm1, %%mm2\n\t"
1025 "movq %%mm4, %%mm3\n\t"
1026 "psllq $48, %%mm2\n\t"
1027 "psllq $32, %%mm3\n\t"
1028 "pand %4, %%mm2\n\t"
1029 "pand %5, %%mm3\n\t"
1030 "por %%mm2, %%mm0\n\t"
1031 "psrlq $16, %%mm1\n\t"
1032 "psrlq $32, %%mm4\n\t"
1033 "psllq $16, %%mm5\n\t"
1034 "por %%mm3, %%mm1\n\t"
1035 "pand %6, %%mm5\n\t"
1036 "por %%mm5, %%mm4\n\t"
1038 MOVNTQ" %%mm0, %0\n\t"
1039 MOVNTQ" %%mm1, 8%0\n\t"
1040 MOVNTQ" %%mm4, 16%0"
1043 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1048 __asm __volatile(SFENCE:::"memory");
1049 __asm __volatile(EMMS:::"memory");
/* Scalar tail: unpack one 555 pixel into three bytes (B, G, R order). */
1053 register uint16_t bgr;
1055 *d++ = (bgr&0x1F)<<3;
1056 *d++ = (bgr&0x3E0)>>2;
1057 *d++ = (bgr&0x7C00)>>7;
/* rgb16to24: expand RGB565 to 24-bit RGB. Identical structure to
 * rgb15to24 but with the 565 masks (mask16b/g/r) and 565 shift set
 * (B<<3, G>>3, R>>8), followed by the shared "Borrowed 32 to 24" repack.
 * NOTE(review): elided view - asm statement openings, loop control and
 * braces between the numbered lines are not visible here. */
1061 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1063 const uint16_t *end;
1065 const uint16_t *mm_end;
1067 uint8_t *d = (uint8_t *)dst;
1068 const uint16_t *s = (const uint16_t *)src;
1069 end = s + src_size/2;
1071 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* First 4 pixels: isolate B/G/R fields and shift into byte lanes. */
1077 "movq %1, %%mm0\n\t"
1078 "movq %1, %%mm1\n\t"
1079 "movq %1, %%mm2\n\t"
1080 "pand %2, %%mm0\n\t"
1081 "pand %3, %%mm1\n\t"
1082 "pand %4, %%mm2\n\t"
1083 "psllq $3, %%mm0\n\t"
1084 "psrlq $3, %%mm1\n\t"
1085 "psrlq $8, %%mm2\n\t"
1086 "movq %%mm0, %%mm3\n\t"
1087 "movq %%mm1, %%mm4\n\t"
1088 "movq %%mm2, %%mm5\n\t"
1089 "punpcklwd %5, %%mm0\n\t"
1090 "punpcklwd %5, %%mm1\n\t"
1091 "punpcklwd %5, %%mm2\n\t"
1092 "punpckhwd %5, %%mm3\n\t"
1093 "punpckhwd %5, %%mm4\n\t"
1094 "punpckhwd %5, %%mm5\n\t"
1095 "psllq $8, %%mm1\n\t"
1096 "psllq $16, %%mm2\n\t"
1097 "por %%mm1, %%mm0\n\t"
1098 "por %%mm2, %%mm0\n\t"
1099 "psllq $8, %%mm4\n\t"
1100 "psllq $16, %%mm5\n\t"
1101 "por %%mm4, %%mm3\n\t"
1102 "por %%mm5, %%mm3\n\t"
/* Park first 4 expanded pixels in mm6/mm7 for the repack stage. */
1104 "movq %%mm0, %%mm6\n\t"
1105 "movq %%mm3, %%mm7\n\t"
/* Second 4 pixels, same expansion. */
1107 "movq 8%1, %%mm0\n\t"
1108 "movq 8%1, %%mm1\n\t"
1109 "movq 8%1, %%mm2\n\t"
1110 "pand %2, %%mm0\n\t"
1111 "pand %3, %%mm1\n\t"
1112 "pand %4, %%mm2\n\t"
1113 "psllq $3, %%mm0\n\t"
1114 "psrlq $3, %%mm1\n\t"
1115 "psrlq $8, %%mm2\n\t"
1116 "movq %%mm0, %%mm3\n\t"
1117 "movq %%mm1, %%mm4\n\t"
1118 "movq %%mm2, %%mm5\n\t"
1119 "punpcklwd %5, %%mm0\n\t"
1120 "punpcklwd %5, %%mm1\n\t"
1121 "punpcklwd %5, %%mm2\n\t"
1122 "punpckhwd %5, %%mm3\n\t"
1123 "punpckhwd %5, %%mm4\n\t"
1124 "punpckhwd %5, %%mm5\n\t"
1125 "psllq $8, %%mm1\n\t"
1126 "psllq $16, %%mm2\n\t"
1127 "por %%mm1, %%mm0\n\t"
1128 "por %%mm2, %%mm0\n\t"
1129 "psllq $8, %%mm4\n\t"
1130 "psllq $16, %%mm5\n\t"
1131 "por %%mm4, %%mm3\n\t"
1132 "por %%mm5, %%mm3\n\t"
1134 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1136 /* Borrowed 32 to 24 */
1138 "movq %%mm0, %%mm4\n\t"
1139 "movq %%mm3, %%mm5\n\t"
1140 "movq %%mm6, %%mm0\n\t"
1141 "movq %%mm7, %%mm1\n\t"
1143 "movq %%mm4, %%mm6\n\t"
1144 "movq %%mm5, %%mm7\n\t"
1145 "movq %%mm0, %%mm2\n\t"
1146 "movq %%mm1, %%mm3\n\t"
1148 "psrlq $8, %%mm2\n\t"
1149 "psrlq $8, %%mm3\n\t"
1150 "psrlq $8, %%mm6\n\t"
1151 "psrlq $8, %%mm7\n\t"
1152 "pand %2, %%mm0\n\t"
1153 "pand %2, %%mm1\n\t"
1154 "pand %2, %%mm4\n\t"
1155 "pand %2, %%mm5\n\t"
1156 "pand %3, %%mm2\n\t"
1157 "pand %3, %%mm3\n\t"
1158 "pand %3, %%mm6\n\t"
1159 "pand %3, %%mm7\n\t"
1160 "por %%mm2, %%mm0\n\t"
1161 "por %%mm3, %%mm1\n\t"
1162 "por %%mm6, %%mm4\n\t"
1163 "por %%mm7, %%mm5\n\t"
1165 "movq %%mm1, %%mm2\n\t"
1166 "movq %%mm4, %%mm3\n\t"
1167 "psllq $48, %%mm2\n\t"
1168 "psllq $32, %%mm3\n\t"
1169 "pand %4, %%mm2\n\t"
1170 "pand %5, %%mm3\n\t"
1171 "por %%mm2, %%mm0\n\t"
1172 "psrlq $16, %%mm1\n\t"
1173 "psrlq $32, %%mm4\n\t"
1174 "psllq $16, %%mm5\n\t"
1175 "por %%mm3, %%mm1\n\t"
1176 "pand %6, %%mm5\n\t"
1177 "por %%mm5, %%mm4\n\t"
1179 MOVNTQ" %%mm0, %0\n\t"
1180 MOVNTQ" %%mm1, 8%0\n\t"
1181 MOVNTQ" %%mm4, 16%0"
1184 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1189 __asm __volatile(SFENCE:::"memory");
1190 __asm __volatile(EMMS:::"memory");
/* Scalar tail: unpack one 565 pixel into three bytes (B, G, R order). */
1194 register uint16_t bgr;
1196 *d++ = (bgr&0x1F)<<3;
1197 *d++ = (bgr&0x7E0)>>3;
1198 *d++ = (bgr&0xF800)>>8;
/* rgb15to32: expand RGB555 to 32-bit pixels. Same field expansion as
 * rgb15to24 but interleaving against a zeroed mm7 (pxor) so each pixel
 * gets a zero pad byte; output is 2 qwords (4 pixels) per iteration.
 * NOTE(review): elided view - asm openings, loop control and braces
 * between the numbered lines are not visible here. */
1202 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1204 const uint16_t *end;
1206 const uint16_t *mm_end;
1208 uint8_t *d = (uint8_t *)dst;
1209 const uint16_t *s = (const uint16_t *)src;
1210 end = s + src_size/2;
1212 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = 0: used as the zero source for the pad byte in punpck*wd. */
1213 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1219 "movq %1, %%mm0\n\t"
1220 "movq %1, %%mm1\n\t"
1221 "movq %1, %%mm2\n\t"
1222 "pand %2, %%mm0\n\t"
1223 "pand %3, %%mm1\n\t"
1224 "pand %4, %%mm2\n\t"
1225 "psllq $3, %%mm0\n\t"
1226 "psrlq $2, %%mm1\n\t"
1227 "psrlq $7, %%mm2\n\t"
1228 "movq %%mm0, %%mm3\n\t"
1229 "movq %%mm1, %%mm4\n\t"
1230 "movq %%mm2, %%mm5\n\t"
1231 "punpcklwd %%mm7, %%mm0\n\t"
1232 "punpcklwd %%mm7, %%mm1\n\t"
1233 "punpcklwd %%mm7, %%mm2\n\t"
1234 "punpckhwd %%mm7, %%mm3\n\t"
1235 "punpckhwd %%mm7, %%mm4\n\t"
1236 "punpckhwd %%mm7, %%mm5\n\t"
1237 "psllq $8, %%mm1\n\t"
1238 "psllq $16, %%mm2\n\t"
1239 "por %%mm1, %%mm0\n\t"
1240 "por %%mm2, %%mm0\n\t"
1241 "psllq $8, %%mm4\n\t"
1242 "psllq $16, %%mm5\n\t"
1243 "por %%mm4, %%mm3\n\t"
1244 "por %%mm5, %%mm3\n\t"
1245 MOVNTQ" %%mm0, %0\n\t"
1246 MOVNTQ" %%mm3, 8%0\n\t"
1248 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1253 __asm __volatile(SFENCE:::"memory");
1254 __asm __volatile(EMMS:::"memory");
1258 #if 0 //slightly slower on athlon
1260 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
/* Scalar tail, endian-aware byte order. */
1262 register uint16_t bgr;
1264 #ifdef WORDS_BIGENDIAN
1266 *d++ = (bgr&0x7C00)>>7;
1267 *d++ = (bgr&0x3E0)>>2;
1268 *d++ = (bgr&0x1F)<<3;
1270 *d++ = (bgr&0x1F)<<3;
1271 *d++ = (bgr&0x3E0)>>2;
1272 *d++ = (bgr&0x7C00)>>7;
/* rgb16to32: expand RGB565 to 32-bit pixels. Same structure as
 * rgb15to32 with the 565 masks and 565 shifts (B<<3, G>>3, R>>8).
 * NOTE(review): elided view - asm openings, loop control and braces
 * between the numbered lines are not visible here. */
1280 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1282 const uint16_t *end;
1284 const uint16_t *mm_end;
1286 uint8_t *d = (uint8_t *)dst;
1287 const uint16_t *s = (uint16_t *)src;
1288 end = s + src_size/2;
1290 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = 0: zero source for the pad byte in punpck*wd. */
1291 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1297 "movq %1, %%mm0\n\t"
1298 "movq %1, %%mm1\n\t"
1299 "movq %1, %%mm2\n\t"
1300 "pand %2, %%mm0\n\t"
1301 "pand %3, %%mm1\n\t"
1302 "pand %4, %%mm2\n\t"
1303 "psllq $3, %%mm0\n\t"
1304 "psrlq $3, %%mm1\n\t"
1305 "psrlq $8, %%mm2\n\t"
1306 "movq %%mm0, %%mm3\n\t"
1307 "movq %%mm1, %%mm4\n\t"
1308 "movq %%mm2, %%mm5\n\t"
1309 "punpcklwd %%mm7, %%mm0\n\t"
1310 "punpcklwd %%mm7, %%mm1\n\t"
1311 "punpcklwd %%mm7, %%mm2\n\t"
1312 "punpckhwd %%mm7, %%mm3\n\t"
1313 "punpckhwd %%mm7, %%mm4\n\t"
1314 "punpckhwd %%mm7, %%mm5\n\t"
1315 "psllq $8, %%mm1\n\t"
1316 "psllq $16, %%mm2\n\t"
1317 "por %%mm1, %%mm0\n\t"
1318 "por %%mm2, %%mm0\n\t"
1319 "psllq $8, %%mm4\n\t"
1320 "psllq $16, %%mm5\n\t"
1321 "por %%mm4, %%mm3\n\t"
1322 "por %%mm5, %%mm3\n\t"
1323 MOVNTQ" %%mm0, %0\n\t"
1324 MOVNTQ" %%mm3, 8%0\n\t"
1326 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1331 __asm __volatile(SFENCE:::"memory");
1332 __asm __volatile(EMMS:::"memory");
/* Scalar tail, endian-aware byte order. */
1336 register uint16_t bgr;
1338 #ifdef WORDS_BIGENDIAN
1340 *d++ = (bgr&0xF800)>>8;
1341 *d++ = (bgr&0x7E0)>>3;
1342 *d++ = (bgr&0x1F)<<3;
1344 *d++ = (bgr&0x1F)<<3;
1345 *d++ = (bgr&0x7E0)>>3;
1346 *d++ = (bgr&0xF800)>>8;
/* rgb32tobgr32: swap R and B in 32-bit pixels (byte order reversal of
 * the color channels; the pad byte position follows endianness, see the
 * scalar fallback). MMX path swaps via pslld/psrld $16 plus the
 * mask32r/g/b constants, 2 pixels per iteration.
 * NOTE(review): elided view - asm opening, loop label, and braces
 * between the numbered lines are not visible here. */
1352 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1355 /* TODO: unroll this loop */
1357 "xor %%"REG_a", %%"REG_a" \n\t"
1360 PREFETCH" 32(%0, %%"REG_a") \n\t"
1361 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1362 "movq %%mm0, %%mm1 \n\t"
1363 "movq %%mm0, %%mm2 \n\t"
1364 "pslld $16, %%mm0 \n\t"
1365 "psrld $16, %%mm1 \n\t"
1366 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1367 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1368 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1369 "por %%mm0, %%mm2 \n\t"
1370 "por %%mm1, %%mm2 \n\t"
1371 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
1372 "add $8, %%"REG_a" \n\t"
/* src_size-7 bound: the last partial qword is left to the C fallback. */
1373 "cmp %2, %%"REG_a" \n\t"
1375 :: "r" (src), "r"(dst), "r" (src_size-7)
1379 __asm __volatile(SFENCE:::"memory");
1380 __asm __volatile(EMMS:::"memory");
/* Scalar fallback: per-pixel byte swap, endian-aware. */
1383 unsigned num_pixels = src_size >> 2;
1384 for(i=0; i<num_pixels; i++)
1386 #ifdef WORDS_BIGENDIAN
1387 dst[4*i + 1] = src[4*i + 3];
1388 dst[4*i + 2] = src[4*i + 2];
1389 dst[4*i + 3] = src[4*i + 1];
1391 dst[4*i + 0] = src[4*i + 2];
1392 dst[4*i + 1] = src[4*i + 1];
1393 dst[4*i + 2] = src[4*i + 0];
/* rgb24tobgr24: swap R and B in packed 24-bit pixels, 8 pixels (24
 * bytes) per MMX iteration using the mask24r/g/b constants and
 * misaligned reloads at +2/+6/+8/+10/+14/+16/+18.
 * mmx_size is negative-biased (23 - src_size) so the loop counter runs
 * up toward zero; src/dst are pre-offset by -mmx_size to compensate.
 * NOTE(review): elided view - the asm opening, loop label and braces
 * between the numbered lines are not visible here. */
1399 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1403 long mmx_size= 23 - src_size;
1405 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1406 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1407 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1410 PREFETCH" 32(%1, %%"REG_a") \n\t"
1411 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1412 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1413 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1414 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1415 "pand %%mm5, %%mm0 \n\t"
1416 "pand %%mm6, %%mm1 \n\t"
1417 "pand %%mm7, %%mm2 \n\t"
1418 "por %%mm0, %%mm1 \n\t"
1419 "por %%mm2, %%mm1 \n\t"
1420 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1421 MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
1422 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1423 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1424 "pand %%mm7, %%mm0 \n\t"
1425 "pand %%mm5, %%mm1 \n\t"
1426 "pand %%mm6, %%mm2 \n\t"
1427 "por %%mm0, %%mm1 \n\t"
1428 "por %%mm2, %%mm1 \n\t"
1429 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1430 MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
1431 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1432 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1433 "pand %%mm6, %%mm0 \n\t"
1434 "pand %%mm7, %%mm1 \n\t"
1435 "pand %%mm5, %%mm2 \n\t"
1436 "por %%mm0, %%mm1 \n\t"
1437 "por %%mm2, %%mm1 \n\t"
1438 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1439 "add $24, %%"REG_a" \n\t"
1442 : "r" (src-mmx_size), "r"(dst-mmx_size)
1445 __asm __volatile(SFENCE:::"memory");
1446 __asm __volatile(EMMS:::"memory");
1448 if(mmx_size==23) return; //finished, was multiple of 8
/* Scalar tail: recompute the remaining byte count and swap per pixel. */
1452 src_size= 23-mmx_size;
1456 for(i=0; i<src_size; i+=3)
1460 dst[i + 1] = src[i + 1];
1461 dst[i + 2] = src[i + 0];
/* yuvPlanartoyuy2: interleave planar Y/U/V into packed YUY2 (Y U Y V).
 * One output row per input luma row; chroma rows advance only every
 * vertLumPerChroma luma rows (the mask test near the end requires
 * vertLumPerChroma to be a power of two). Paths: MMX interleave via
 * punpcklbw/punpckhbw, an Alpha/MVI path, a 64-bit scalar path, and a
 * generic 32-bit scalar path (endian-aware).
 * NOTE(review): elided view - asm/loop openings, the #else/#endif
 * structure and parts of the Alpha macro body are not visible here. */
1466 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1467 long width, long height,
1468 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1471 const long chromWidth= width>>1;
1472 for(y=0; y<height; y++)
1475 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1477 "xor %%"REG_a", %%"REG_a" \n\t"
1480 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1481 PREFETCH" 32(%2, %%"REG_a") \n\t"
1482 PREFETCH" 32(%3, %%"REG_a") \n\t"
1483 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1484 "movq %%mm0, %%mm2 \n\t" // U(0)
1485 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1486 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1487 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1489 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1490 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1491 "movq %%mm3, %%mm4 \n\t" // Y(0)
1492 "movq %%mm5, %%mm6 \n\t" // Y(8)
1493 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1494 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1495 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1496 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1498 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1499 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1500 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1501 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1503 "add $8, %%"REG_a" \n\t"
1504 "cmp %4, %%"REG_a" \n\t"
1506 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
/* Alpha/MVI path: unpkbw/unpkbl spread bytes, ldq $31 is a prefetch. */
1511 #if defined ARCH_ALPHA && defined HAVE_MVI
1512 #define pl2yuy2(n) \
1517 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1518 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1519 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1520 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1521 yuv1 = (u << 8) + (v << 24); \
1528 uint64_t *qdst = (uint64_t *) dst;
1529 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1530 const uint32_t *yc = (uint32_t *) ysrc;
1531 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1532 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1533 for(i = 0; i < chromWidth; i += 8){
1534 uint64_t y1, y2, yuv1, yuv2;
1537 asm("ldq $31,64(%0)" :: "r"(yc));
1538 asm("ldq $31,64(%0)" :: "r"(yc2));
1539 asm("ldq $31,64(%0)" :: "r"(uc));
1540 asm("ldq $31,64(%0)" :: "r"(vc));
/* 64-bit scalar path: build two YUYV pixels per 64-bit store. */
1558 #elif __WORDSIZE >= 64
1560 uint64_t *ldst = (uint64_t *) dst;
1561 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1562 for(i = 0; i < chromWidth; i += 2){
1564 k = yc[0] + (uc[0] << 8) +
1565 (yc[1] << 16) + (vc[0] << 24);
1566 l = yc[2] + (uc[1] << 8) +
1567 (yc[3] << 16) + (vc[1] << 24);
1568 *ldst++ = k + (l << 32);
/* Generic 32-bit scalar path, endian-aware. */
1575 int i, *idst = (int32_t *) dst;
1576 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1577 for(i = 0; i < chromWidth; i++){
1578 #ifdef WORDS_BIGENDIAN
1579 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1580 (yc[1] << 8) + (vc[0] << 0);
1582 *idst++ = yc[0] + (uc[0] << 8) +
1583 (yc[1] << 16) + (vc[0] << 24);
/* Advance chroma planes only once per vertLumPerChroma luma rows. */
1591 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1593 usrc += chromStride;
1594 vsrc += chromStride;
1608 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1609 * problem for anyone then tell me, and I'll fix it)
/* yv12toyuy2: thin wrapper - YV12 has one chroma row per 2 luma rows,
 * so delegate to yuvPlanartoyuy2 with vertLumPerChroma = 2. */
1611 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1612 long width, long height,
1613 long lumStride, long chromStride, long dstStride)
1615 //FIXME interpolate chroma
1616 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Planar YUV -> packed UYVY converter.
 * Every vertLumPerChroma luma lines share one chroma line (caller controls
 * 4:2:0 vs 4:2:2 behaviour). An MMX path packs 16 pixels per iteration;
 * C fallbacks pack via integer shifts for 64-bit and 32-bit word sizes.
 * NOTE(review): this listing is sampled — loop labels, braces and some
 * clobber lines of the original are not shown here.
 */
1619 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1620 long width, long height,
1621 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1624 const long chromWidth= width>>1;
1625 for(y=0; y<height; y++)
1628 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
/* MMX path: interleave 8 U/V pairs with 16 Y samples per iteration. */
1630 "xor %%"REG_a", %%"REG_a" \n\t"
1633 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1634 PREFETCH" 32(%2, %%"REG_a") \n\t"
1635 PREFETCH" 32(%3, %%"REG_a") \n\t"
1636 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1637 "movq %%mm0, %%mm2 \n\t" // U(0)
1638 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1639 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1640 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1642 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1643 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1644 "movq %%mm0, %%mm4 \n\t" // Y(0)
1645 "movq %%mm2, %%mm6 \n\t" // Y(8)
1646 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1647 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1648 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1649 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
/* Non-temporal stores: 32 output bytes per iteration. */
1651 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1652 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1653 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1654 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1656 "add $8, %%"REG_a" \n\t"
1657 "cmp %4, %%"REG_a" \n\t"
1659 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1663 //FIXME adapt the alpha asm code from yv12->yuy2
/* C fallback, 64-bit words: pack two UYVY pixel pairs per store. */
1665 #if __WORDSIZE >= 64
1667 uint64_t *ldst = (uint64_t *) dst;
1668 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1669 for(i = 0; i < chromWidth; i += 2){
1671 k = uc[0] + (yc[0] << 8) +
1672 (vc[0] << 16) + (yc[1] << 24);
1673 l = uc[1] + (yc[2] << 8) +
1674 (vc[1] << 16) + (yc[3] << 24);
1675 *ldst++ = k + (l << 32);
/* C fallback, 32-bit words: one UYVY pixel pair per store. */
1682 int i, *idst = (int32_t *) dst;
1683 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1684 for(i = 0; i < chromWidth; i++){
1685 #ifdef WORDS_BIGENDIAN
1686 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1687 (vc[0] << 8) + (yc[1] << 0);
1689 *idst++ = uc[0] + (yc[0] << 8) +
1690 (vc[0] << 16) + (yc[1] << 24);
/* Advance chroma pointers only once per vertLumPerChroma luma lines. */
1698 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1700 usrc += chromStride;
1701 vsrc += chromStride;
1715 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1716 * problem for anyone then tell me, and I'll fix it)
1718 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1719 long width, long height,
1720 long lumStride, long chromStride, long dstStride)
1722 //FIXME interpolate chroma
1723 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1728 * width should be a multiple of 16
1730 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1731 long width, long height,
1732 long lumStride, long chromStride, long dstStride)
1734 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1739 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1740 * problem for anyone then tell me, and I'll fix it)
/*
 * Packed YUY2 -> planar YV12 deinterleaver.
 * Processes two source lines per outer iteration: line 0 yields Y + U + V,
 * line 1 (second asm block) yields Y only, since 4:2:0 keeps chroma from
 * every second line. mm7 holds the 0x00FF word mask used to split bytes.
 * NOTE(review): sampled listing — loop labels, braces and a few lines of
 * the original are not shown here.
 */
1742 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1743 long width, long height,
1744 long lumStride, long chromStride, long srcStride)
1747 const long chromWidth= width>>1;
1748 for(y=0; y<height; y+=2)
/* Even line: extract Y into (%1), U into (%2), V into (%3). */
1752 "xor %%"REG_a", %%"REG_a" \n\t"
1753 "pcmpeqw %%mm7, %%mm7 \n\t"
1754 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1757 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1758 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1759 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1760 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1761 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1762 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1763 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1764 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1765 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1766 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1767 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1769 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1771 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1772 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1773 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1774 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1775 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1776 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1777 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1778 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1779 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1780 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1782 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
/* Split the interleaved UV words into separate U and V planes. */
1784 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1785 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1786 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1787 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1788 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1789 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1790 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1791 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1793 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1794 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1796 "add $8, %%"REG_a" \n\t"
1797 "cmp %4, %%"REG_a" \n\t"
1799 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1800 : "memory", "%"REG_a
/* Odd line: keep Y only, chroma of this line is discarded (4:2:0). */
1807 "xor %%"REG_a", %%"REG_a" \n\t"
1810 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1811 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1812 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1813 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1814 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1815 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1816 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1817 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1818 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1819 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1820 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1822 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1823 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1825 "add $8, %%"REG_a" \n\t"
1826 "cmp %4, %%"REG_a" \n\t"
1829 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1830 : "memory", "%"REG_a
/* C fallback: even line takes Y, U and V ... */
1834 for(i=0; i<chromWidth; i++)
1836 ydst[2*i+0] = src[4*i+0];
1837 udst[i] = src[4*i+1];
1838 ydst[2*i+1] = src[4*i+2];
1839 vdst[i] = src[4*i+3];
/* ... odd line takes Y only. */
1844 for(i=0; i<chromWidth; i++)
1846 ydst[2*i+0] = src[4*i+0];
1847 ydst[2*i+1] = src[4*i+2];
1850 udst += chromStride;
1851 vdst += chromStride;
/* Flush the MMX state after the per-frame loop. */
1856 asm volatile( EMMS" \n\t"
/*
 * YVU9 -> YV12: the luma plane is copied verbatim; chroma upscaling
 * (4x fewer samples in YVU9) is not implemented in the visible portion
 * of this listing — see the XXX below.
 */
1862 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1863 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1864 long width, long height, long lumStride, long chromStride)
/* NOTE(review): assumes lumStride == width for both planes here — confirm. */
1867 memcpy(ydst, ysrc, width*height);
1869 /* XXX: implement upscaling for U,V */
/*
 * Upscale one plane by 2x in both directions using a [3 1; 1 3]/4
 * bilinear kernel. First/last rows and columns are handled specially;
 * the MMX2/3DNow path builds the 3:1 weighting from repeated pavgb
 * (avg(avg(a,b),b) == (a+3b+rounding)/4). The C tail covers the
 * non-multiple-of-16 remainder.
 * NOTE(review): sampled listing — braces, loop labels and some clobber
 * lines of the original are not shown here.
 */
1872 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
/* First output row: horizontal 3:1 interpolation only. */
1879 for(x=0; x<srcWidth-1; x++){
1880 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1881 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1883 dst[2*srcWidth-1]= src[srcWidth-1];
1887 for(y=1; y<srcHeight; y++){
1888 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1889 const long mmxSize= srcWidth&~15;
1891 "mov %4, %%"REG_a" \n\t"
1893 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1894 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1895 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1896 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1897 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1898 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
/* Two chained pavgb per operand give the 3:1 weights of the kernel. */
1899 PAVGB" %%mm0, %%mm5 \n\t"
1900 PAVGB" %%mm0, %%mm3 \n\t"
1901 PAVGB" %%mm0, %%mm5 \n\t"
1902 PAVGB" %%mm0, %%mm3 \n\t"
1903 PAVGB" %%mm1, %%mm4 \n\t"
1904 PAVGB" %%mm1, %%mm2 \n\t"
1905 PAVGB" %%mm1, %%mm4 \n\t"
1906 PAVGB" %%mm1, %%mm2 \n\t"
1907 "movq %%mm5, %%mm7 \n\t"
1908 "movq %%mm4, %%mm6 \n\t"
1909 "punpcklbw %%mm3, %%mm5 \n\t"
1910 "punpckhbw %%mm3, %%mm7 \n\t"
1911 "punpcklbw %%mm2, %%mm4 \n\t"
1912 "punpckhbw %%mm2, %%mm6 \n\t"
1914 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1915 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1916 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1917 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1919 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1920 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1921 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1922 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1924 "add $8, %%"REG_a" \n\t"
/* Operands are biased by mmxSize so REG_a can count up from -mmxSize. */
1926 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1927 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1933 const long mmxSize=1;
/* Leftmost column: vertical interpolation only. */
1935 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1936 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
/* C tail: full 2-D 3:1 interpolation for the remaining columns. */
1938 for(x=mmxSize-1; x<srcWidth-1; x++){
1939 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1940 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1941 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1942 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1944 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1945 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* Last output row: horizontal interpolation only. */
1955 for(x=0; x<srcWidth-1; x++){
1956 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1957 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1959 dst[2*srcWidth-1]= src[srcWidth-1];
1961 for(x=0; x<srcWidth; x++){
1968 asm volatile( EMMS" \n\t"
1976 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1977 * problem for anyone then tell me, and I'll fix it)
1978 * chrominance data is only taken from every second line, others are ignored. FIXME write HQ version
/*
 * Packed UYVY -> planar YV12 deinterleaver (mirror of yuy2toyv12 with
 * swapped byte roles: pand keeps chroma, psrlw keeps luma).
 *
 * FIX: the asm used the 32-bit register %eax (xorl/addl/cmpl) as the
 * index in memory operands while every sibling function in this file
 * (e.g. yuy2toyv12 above) uses the width-correct REG_a macro. On x86_64
 * the base pointers are 64-bit, so a 32-bit index register is invalid;
 * switched all uses to REG_a and the untyped add/cmp/xor mnemonics,
 * matching yuy2toyv12.
 */
1980 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1981 long width, long height,
1982 long lumStride, long chromStride, long srcStride)
1985 const long chromWidth= width>>1;
1986 for(y=0; y<height; y+=2)
/* Even line: extract Y, U and V planes. */
1990 "xor %%"REG_a", %%"REG_a" \n\t"
1991 "pcmpeqw %%mm7, %%mm7 \n\t"
1992 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1995 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1996 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1997 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1998 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1999 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2000 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2001 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2002 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2003 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2004 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2005 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2007 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
2009 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
2010 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
2011 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2012 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2013 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2014 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2015 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2016 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2017 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2018 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2020 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
/* Split the interleaved UV words into separate U and V planes. */
2022 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2023 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2024 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2025 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2026 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2027 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2028 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2029 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2031 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
2032 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
2034 "add $8, %%"REG_a" \n\t"
2035 "cmp %4, %%"REG_a" \n\t"
2037 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Odd line: keep Y only (chroma taken from every second line). */
2045 "xor %%"REG_a", %%"REG_a" \n\t"
2048 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2049 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
2050 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
2051 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
2052 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
2053 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2054 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2055 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2056 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2057 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2058 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2060 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
2061 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
2063 "add $8, %%"REG_a" \n\t"
2064 "cmp %4, %%"REG_a" \n\t"
2067 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* C fallback: even line takes U, Y, V, Y quadruples ... */
2072 for(i=0; i<chromWidth; i++)
2074 udst[i] = src[4*i+0];
2075 ydst[2*i+0] = src[4*i+1];
2076 vdst[i] = src[4*i+2];
2077 ydst[2*i+1] = src[4*i+3];
/* ... odd line takes Y only. */
2082 for(i=0; i<chromWidth; i++)
2084 ydst[2*i+0] = src[4*i+1];
2085 ydst[2*i+1] = src[4*i+3];
2088 udst += chromStride;
2089 vdst += chromStride;
2094 asm volatile( EMMS" \n\t"
2102 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2103 * problem for anyone then tell me, and I'll fix it)
2104 * chrominance data is only taken from every second line, others are ignored in the C version. FIXME write HQ version
/*
 * BGR24 -> planar YV12 converter.
 * Per pair of source lines: a first asm block computes the Y plane for
 * one line (pmaddwd against the bgr2YCoeff table, then horizontal add
 * via w1111 and a final >>7 with bgr2YOffset bias); a second asm block
 * averages 2x2 pixel groups and produces U and V (bgr2UCoeff/bgr2VCoeff,
 * bgr2UVOffset bias). The C fallback at the bottom uses the RY..BV
 * integer coefficients directly. Despite the rgb24 name, byte order
 * b,g,r is read (see the C fallback) — i.e. BGR memory layout.
 * NOTE(review): sampled listing — loop labels, braces, #else/#endif
 * partners and some lines of the original are not shown here.
 */
2106 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2107 long width, long height,
2108 long lumStride, long chromStride, long srcStride)
2111 const long chromWidth= width>>1;
2113 for(y=0; y<height-2; y+=2)
/* Luma pass: REG_a counts pixels from -width up to 0, REG_b = 3*REG_a
   indexes the 3-byte-per-pixel source. */
2119 "mov %2, %%"REG_a" \n\t"
2120 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2121 "movq "MANGLE(w1111)", %%mm5 \n\t"
2122 "pxor %%mm7, %%mm7 \n\t"
2123 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2126 PREFETCH" 64(%0, %%"REG_b") \n\t"
2127 "movd (%0, %%"REG_b"), %%mm0 \n\t"
2128 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
2129 "punpcklbw %%mm7, %%mm0 \n\t"
2130 "punpcklbw %%mm7, %%mm1 \n\t"
2131 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
2132 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
2133 "punpcklbw %%mm7, %%mm2 \n\t"
2134 "punpcklbw %%mm7, %%mm3 \n\t"
2135 "pmaddwd %%mm6, %%mm0 \n\t"
2136 "pmaddwd %%mm6, %%mm1 \n\t"
2137 "pmaddwd %%mm6, %%mm2 \n\t"
2138 "pmaddwd %%mm6, %%mm3 \n\t"
2139 #ifndef FAST_BGR2YV12
2140 "psrad $8, %%mm0 \n\t"
2141 "psrad $8, %%mm1 \n\t"
2142 "psrad $8, %%mm2 \n\t"
2143 "psrad $8, %%mm3 \n\t"
2145 "packssdw %%mm1, %%mm0 \n\t"
2146 "packssdw %%mm3, %%mm2 \n\t"
2147 "pmaddwd %%mm5, %%mm0 \n\t"
2148 "pmaddwd %%mm5, %%mm2 \n\t"
2149 "packssdw %%mm2, %%mm0 \n\t"
2150 "psraw $7, %%mm0 \n\t"
/* Second group of 4 pixels for this 8-pixel batch. */
2152 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2153 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
2154 "punpcklbw %%mm7, %%mm4 \n\t"
2155 "punpcklbw %%mm7, %%mm1 \n\t"
2156 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
2157 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
2158 "punpcklbw %%mm7, %%mm2 \n\t"
2159 "punpcklbw %%mm7, %%mm3 \n\t"
2160 "pmaddwd %%mm6, %%mm4 \n\t"
2161 "pmaddwd %%mm6, %%mm1 \n\t"
2162 "pmaddwd %%mm6, %%mm2 \n\t"
2163 "pmaddwd %%mm6, %%mm3 \n\t"
2164 #ifndef FAST_BGR2YV12
2165 "psrad $8, %%mm4 \n\t"
2166 "psrad $8, %%mm1 \n\t"
2167 "psrad $8, %%mm2 \n\t"
2168 "psrad $8, %%mm3 \n\t"
2170 "packssdw %%mm1, %%mm4 \n\t"
2171 "packssdw %%mm3, %%mm2 \n\t"
2172 "pmaddwd %%mm5, %%mm4 \n\t"
2173 "pmaddwd %%mm5, %%mm2 \n\t"
2174 "add $24, %%"REG_b" \n\t"
2175 "packssdw %%mm2, %%mm4 \n\t"
2176 "psraw $7, %%mm4 \n\t"
2178 "packuswb %%mm4, %%mm0 \n\t"
2179 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2181 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2182 "add $8, %%"REG_a" \n\t"
2184 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2185 : "%"REG_a, "%"REG_b
/* Chroma pass: operands %0/%1 are two adjacent source lines; pixels are
   averaged 2x2 before the U/V matrix multiply. */
2192 "mov %4, %%"REG_a" \n\t"
2193 "movq "MANGLE(w1111)", %%mm5 \n\t"
2194 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2195 "pxor %%mm7, %%mm7 \n\t"
2196 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2197 "add %%"REG_b", %%"REG_b" \n\t"
2200 PREFETCH" 64(%0, %%"REG_b") \n\t"
2201 PREFETCH" 64(%1, %%"REG_b") \n\t"
2202 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2203 "movq (%0, %%"REG_b"), %%mm0 \n\t"
2204 "movq (%1, %%"REG_b"), %%mm1 \n\t"
2205 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
2206 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
2207 PAVGB" %%mm1, %%mm0 \n\t"
2208 PAVGB" %%mm3, %%mm2 \n\t"
2209 "movq %%mm0, %%mm1 \n\t"
2210 "movq %%mm2, %%mm3 \n\t"
2211 "psrlq $24, %%mm0 \n\t"
2212 "psrlq $24, %%mm2 \n\t"
2213 PAVGB" %%mm1, %%mm0 \n\t"
2214 PAVGB" %%mm3, %%mm2 \n\t"
2215 "punpcklbw %%mm7, %%mm0 \n\t"
2216 "punpcklbw %%mm7, %%mm2 \n\t"
/* Plain-MMX variant: sum four samples per component, then >>2. */
2218 "movd (%0, %%"REG_b"), %%mm0 \n\t"
2219 "movd (%1, %%"REG_b"), %%mm1 \n\t"
2220 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
2221 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
2222 "punpcklbw %%mm7, %%mm0 \n\t"
2223 "punpcklbw %%mm7, %%mm1 \n\t"
2224 "punpcklbw %%mm7, %%mm2 \n\t"
2225 "punpcklbw %%mm7, %%mm3 \n\t"
2226 "paddw %%mm1, %%mm0 \n\t"
2227 "paddw %%mm3, %%mm2 \n\t"
2228 "paddw %%mm2, %%mm0 \n\t"
2229 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
2230 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
2231 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
2232 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
2233 "punpcklbw %%mm7, %%mm4 \n\t"
2234 "punpcklbw %%mm7, %%mm1 \n\t"
2235 "punpcklbw %%mm7, %%mm2 \n\t"
2236 "punpcklbw %%mm7, %%mm3 \n\t"
2237 "paddw %%mm1, %%mm4 \n\t"
2238 "paddw %%mm3, %%mm2 \n\t"
2239 "paddw %%mm4, %%mm2 \n\t"
2240 "psrlw $2, %%mm0 \n\t"
2241 "psrlw $2, %%mm2 \n\t"
2243 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2244 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2246 "pmaddwd %%mm0, %%mm1 \n\t"
2247 "pmaddwd %%mm2, %%mm3 \n\t"
2248 "pmaddwd %%mm6, %%mm0 \n\t"
2249 "pmaddwd %%mm6, %%mm2 \n\t"
2250 #ifndef FAST_BGR2YV12
2251 "psrad $8, %%mm0 \n\t"
2252 "psrad $8, %%mm1 \n\t"
2253 "psrad $8, %%mm2 \n\t"
2254 "psrad $8, %%mm3 \n\t"
2256 "packssdw %%mm2, %%mm0 \n\t"
2257 "packssdw %%mm3, %%mm1 \n\t"
2258 "pmaddwd %%mm5, %%mm0 \n\t"
2259 "pmaddwd %%mm5, %%mm1 \n\t"
2260 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2261 "psraw $7, %%mm0 \n\t"
/* Second pair of 2x2 groups (pixels 4..7 of the batch). */
2263 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2264 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
2265 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
2266 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
2267 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
2268 PAVGB" %%mm1, %%mm4 \n\t"
2269 PAVGB" %%mm3, %%mm2 \n\t"
2270 "movq %%mm4, %%mm1 \n\t"
2271 "movq %%mm2, %%mm3 \n\t"
2272 "psrlq $24, %%mm4 \n\t"
2273 "psrlq $24, %%mm2 \n\t"
2274 PAVGB" %%mm1, %%mm4 \n\t"
2275 PAVGB" %%mm3, %%mm2 \n\t"
2276 "punpcklbw %%mm7, %%mm4 \n\t"
2277 "punpcklbw %%mm7, %%mm2 \n\t"
2279 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2280 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
2281 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
2282 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
2283 "punpcklbw %%mm7, %%mm4 \n\t"
2284 "punpcklbw %%mm7, %%mm1 \n\t"
2285 "punpcklbw %%mm7, %%mm2 \n\t"
2286 "punpcklbw %%mm7, %%mm3 \n\t"
2287 "paddw %%mm1, %%mm4 \n\t"
2288 "paddw %%mm3, %%mm2 \n\t"
2289 "paddw %%mm2, %%mm4 \n\t"
2290 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
2291 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
2292 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
2293 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
2294 "punpcklbw %%mm7, %%mm5 \n\t"
2295 "punpcklbw %%mm7, %%mm1 \n\t"
2296 "punpcklbw %%mm7, %%mm2 \n\t"
2297 "punpcklbw %%mm7, %%mm3 \n\t"
2298 "paddw %%mm1, %%mm5 \n\t"
2299 "paddw %%mm3, %%mm2 \n\t"
2300 "paddw %%mm5, %%mm2 \n\t"
2301 "movq "MANGLE(w1111)", %%mm5 \n\t"
2302 "psrlw $2, %%mm4 \n\t"
2303 "psrlw $2, %%mm2 \n\t"
2305 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2306 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2308 "pmaddwd %%mm4, %%mm1 \n\t"
2309 "pmaddwd %%mm2, %%mm3 \n\t"
2310 "pmaddwd %%mm6, %%mm4 \n\t"
2311 "pmaddwd %%mm6, %%mm2 \n\t"
2312 #ifndef FAST_BGR2YV12
2313 "psrad $8, %%mm4 \n\t"
2314 "psrad $8, %%mm1 \n\t"
2315 "psrad $8, %%mm2 \n\t"
2316 "psrad $8, %%mm3 \n\t"
2318 "packssdw %%mm2, %%mm4 \n\t"
2319 "packssdw %%mm3, %%mm1 \n\t"
2320 "pmaddwd %%mm5, %%mm4 \n\t"
2321 "pmaddwd %%mm5, %%mm1 \n\t"
2322 "add $24, %%"REG_b" \n\t"
2323 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2324 "psraw $7, %%mm4 \n\t"
/* Regroup U/V halves, bias, and store 4 bytes to each plane. */
2326 "movq %%mm0, %%mm1 \n\t"
2327 "punpckldq %%mm4, %%mm0 \n\t"
2328 "punpckhdq %%mm4, %%mm1 \n\t"
2329 "packsswb %%mm1, %%mm0 \n\t"
2330 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2331 "movd %%mm0, (%2, %%"REG_a") \n\t"
2332 "punpckhdq %%mm0, %%mm0 \n\t"
2333 "movd %%mm0, (%3, %%"REG_a") \n\t"
2334 "add $4, %%"REG_a" \n\t"
2336 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2337 : "%"REG_a, "%"REG_b
2340 udst += chromStride;
2341 vdst += chromStride;
2345 asm volatile( EMMS" \n\t"
/* C fallback for the remaining (or all, without MMX) line pairs. */
2351 for(; y<height; y+=2)
2354 for(i=0; i<chromWidth; i++)
2356 unsigned int b= src[6*i+0];
2357 unsigned int g= src[6*i+1];
2358 unsigned int r= src[6*i+2];
2360 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2361 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2362 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2372 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/* Second line of the pair: luma only, chroma taken from the first line. */
2378 for(i=0; i<chromWidth; i++)
2380 unsigned int b= src[6*i+0];
2381 unsigned int g= src[6*i+1];
2382 unsigned int r= src[6*i+2];
2384 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2392 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2395 udst += chromStride;
2396 vdst += chromStride;
/*
 * Byte-interleave two planes: dest[2i] = src1[i], dest[2i+1] = src2[i],
 * line by line. SSE2 path handles 16 bytes per iteration, MMX path uses
 * two 8-byte registers; a scalar loop covers the width % 16 remainder.
 * NOTE(review): the SSE2 path uses movdqa (aligned loads) on src1/src2 —
 * presumably the (not shown) selection logic guarantees 16-byte-aligned
 * pointers and strides; confirm against the full source.
 */
2402 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2403 long width, long height, long src1Stride,
2404 long src2Stride, long dstStride){
2407 for(h=0; h < height; h++)
/* SSE2 path: 16 source bytes from each plane -> 32 interleaved bytes. */
2414 "xor %%"REG_a", %%"REG_a" \n\t"
2416 PREFETCH" 64(%1, %%"REG_a") \n\t"
2417 PREFETCH" 64(%2, %%"REG_a") \n\t"
2418 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2419 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2420 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2421 "punpcklbw %%xmm2, %%xmm0 \n\t"
2422 "punpckhbw %%xmm2, %%xmm1 \n\t"
2423 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2424 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2425 "add $16, %%"REG_a" \n\t"
2426 "cmp %3, %%"REG_a" \n\t"
2428 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2429 : "memory", "%"REG_a""
/* MMX path: same interleave with 8-byte registers. */
2433 "xor %%"REG_a", %%"REG_a" \n\t"
2435 PREFETCH" 64(%1, %%"REG_a") \n\t"
2436 PREFETCH" 64(%2, %%"REG_a") \n\t"
2437 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2438 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2439 "movq %%mm0, %%mm1 \n\t"
2440 "movq %%mm2, %%mm3 \n\t"
2441 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2442 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2443 "punpcklbw %%mm4, %%mm0 \n\t"
2444 "punpckhbw %%mm4, %%mm1 \n\t"
2445 "punpcklbw %%mm5, %%mm2 \n\t"
2446 "punpckhbw %%mm5, %%mm3 \n\t"
2447 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2448 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2449 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2450 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2451 "add $16, %%"REG_a" \n\t"
2452 "cmp %3, %%"REG_a" \n\t"
2454 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2455 : "memory", "%"REG_a
/* Scalar tail for the last width % 16 bytes of the line. */
2458 for(w= (width&(~15)); w < width; w++)
2460 dest[2*w+0] = src1[w];
2461 dest[2*w+1] = src2[w];
/* Pure-C fallback: whole line. */
2464 for(w=0; w < width; w++)
2466 dest[2*w+0] = src1[w];
2467 dest[2*w+1] = src2[w];
/*
 * Upsample two quarter-resolution chroma planes by pixel duplication:
 * each source byte is doubled horizontally (punpcklbw/punpckhbw a
 * register with itself) and each source line is used for two output
 * lines (srcStride * (y>>1)). One identical pass per plane; the scalar
 * loop duplicates the width % 32 remainder.
 * NOTE(review): sampled listing — loop headers, braces and operand
 * lines of the asm statements are not fully shown here.
 */
2483 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2484 uint8_t *dst1, uint8_t *dst2,
2485 long width, long height,
2486 long srcStride1, long srcStride2,
2487 long dstStride1, long dstStride2)
2490 w=width/2; h=height/2;
2495 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* First plane: src1 -> dst1. */
2498 const uint8_t* s1=src1+srcStride1*(y>>1);
2499 uint8_t* d=dst1+dstStride1*y;
2506 "movq %1, %%mm0\n\t"
2507 "movq 8%1, %%mm2\n\t"
2508 "movq 16%1, %%mm4\n\t"
2509 "movq 24%1, %%mm6\n\t"
2510 "movq %%mm0, %%mm1\n\t"
2511 "movq %%mm2, %%mm3\n\t"
2512 "movq %%mm4, %%mm5\n\t"
2513 "movq %%mm6, %%mm7\n\t"
/* Unpacking a register with itself doubles every byte. */
2514 "punpcklbw %%mm0, %%mm0\n\t"
2515 "punpckhbw %%mm1, %%mm1\n\t"
2516 "punpcklbw %%mm2, %%mm2\n\t"
2517 "punpckhbw %%mm3, %%mm3\n\t"
2518 "punpcklbw %%mm4, %%mm4\n\t"
2519 "punpckhbw %%mm5, %%mm5\n\t"
2520 "punpcklbw %%mm6, %%mm6\n\t"
2521 "punpckhbw %%mm7, %%mm7\n\t"
2522 MOVNTQ" %%mm0, %0\n\t"
2523 MOVNTQ" %%mm1, 8%0\n\t"
2524 MOVNTQ" %%mm2, 16%0\n\t"
2525 MOVNTQ" %%mm3, 24%0\n\t"
2526 MOVNTQ" %%mm4, 32%0\n\t"
2527 MOVNTQ" %%mm5, 40%0\n\t"
2528 MOVNTQ" %%mm6, 48%0\n\t"
2529 MOVNTQ" %%mm7, 56%0"
2535 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* Second plane: src2 -> dst2, identical procedure. */
2538 const uint8_t* s2=src2+srcStride2*(y>>1);
2539 uint8_t* d=dst2+dstStride2*y;
2546 "movq %1, %%mm0\n\t"
2547 "movq 8%1, %%mm2\n\t"
2548 "movq 16%1, %%mm4\n\t"
2549 "movq 24%1, %%mm6\n\t"
2550 "movq %%mm0, %%mm1\n\t"
2551 "movq %%mm2, %%mm3\n\t"
2552 "movq %%mm4, %%mm5\n\t"
2553 "movq %%mm6, %%mm7\n\t"
2554 "punpcklbw %%mm0, %%mm0\n\t"
2555 "punpckhbw %%mm1, %%mm1\n\t"
2556 "punpcklbw %%mm2, %%mm2\n\t"
2557 "punpckhbw %%mm3, %%mm3\n\t"
2558 "punpcklbw %%mm4, %%mm4\n\t"
2559 "punpckhbw %%mm5, %%mm5\n\t"
2560 "punpcklbw %%mm6, %%mm6\n\t"
2561 "punpckhbw %%mm7, %%mm7\n\t"
2562 MOVNTQ" %%mm0, %0\n\t"
2563 MOVNTQ" %%mm1, 8%0\n\t"
2564 MOVNTQ" %%mm2, 16%0\n\t"
2565 MOVNTQ" %%mm3, 24%0\n\t"
2566 MOVNTQ" %%mm4, 32%0\n\t"
2567 MOVNTQ" %%mm5, 40%0\n\t"
2568 MOVNTQ" %%mm6, 48%0\n\t"
2569 MOVNTQ" %%mm7, 56%0"
2575 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2586 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2588 long width, long height,
2589 long srcStride1, long srcStride2,
2590 long srcStride3, long dstStride)
2593 w=width/2; h=height;
2595 const uint8_t* yp=src1+srcStride1*y;
2596 const uint8_t* up=src2+srcStride2*(y>>2);
2597 const uint8_t* vp=src3+srcStride3*(y>>2);
2598 uint8_t* d=dst+dstStride*y;
2604 PREFETCH" 32(%1, %0)\n\t"
2605 PREFETCH" 32(%2, %0)\n\t"
2606 PREFETCH" 32(%3, %0)\n\t"
2607 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2608 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2609 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2610 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2611 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2612 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2613 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2614 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2615 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2616 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2618 "movq %%mm1, %%mm6\n\t"
2619 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2620 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2621 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2622 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2623 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2625 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2626 "movq 8(%1, %0, 4), %%mm0\n\t"
2627 "movq %%mm0, %%mm3\n\t"
2628 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2629 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2630 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2631 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2633 "movq %%mm4, %%mm6\n\t"
2634 "movq 16(%1, %0, 4), %%mm0\n\t"
2635 "movq %%mm0, %%mm3\n\t"
2636 "punpcklbw %%mm5, %%mm4\n\t"
2637 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2638 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2639 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2640 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2642 "punpckhbw %%mm5, %%mm6\n\t"
2643 "movq 24(%1, %0, 4), %%mm0\n\t"
2644 "movq %%mm0, %%mm3\n\t"
2645 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2646 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2647 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2648 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2651 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2657 const long x2= x<<2;