3 * rgb2rgb.c, Software RGB to RGB convertor
4 * pluralize by Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9 * lot of big-endian byteorder fixes by Alex Beregszaszi
13 #include <inttypes.h> /* for __WORDSIZE */
16 // #warning You have misconfigured system and probably will lose performance!
17 #define __WORDSIZE MP_WORDSIZE
35 #define PREFETCH "prefetch"
36 #define PREFETCHW "prefetchw"
37 #define PAVGB "pavgusb"
38 #elif defined ( HAVE_MMX2 )
39 #define PREFETCH "prefetchnta"
40 #define PREFETCHW "prefetcht0"
43 #define PREFETCH "/nop"
44 #define PREFETCHW "/nop"
48 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
55 #define MOVNTQ "movntq"
56 #define SFENCE "sfence"
/* rgb24to32: expand packed 24-bit RGB to 32-bit (one pad byte per pixel).
 * MMX path shown: movd+punpckldq assemble two 3-byte pixels per quadword,
 * pand with the mask32 constant (declared elsewhere) clears the pad byte,
 * MOVNTQ streams the result to dst without polluting the cache.
 * NOTE(review): fragmentary listing -- loop header, pointer advance and the
 * scalar C fallback are missing between the numbered statements. */
62 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
65 const uint8_t *s = src;
68 const uint8_t *mm_end;
/* Prefetch the source and preload mask32 into %%mm7 before the loop. */
72 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
74 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
80 "punpckldq 3%1, %%mm0\n\t"
82 "punpckldq 9%1, %%mm1\n\t"
83 "movd 12%1, %%mm2\n\t"
84 "punpckldq 15%1, %%mm2\n\t"
85 "movd 18%1, %%mm3\n\t"
86 "punpckldq 21%1, %%mm3\n\t"
87 "pand %%mm7, %%mm0\n\t"
88 "pand %%mm7, %%mm1\n\t"
89 "pand %%mm7, %%mm2\n\t"
90 "pand %%mm7, %%mm3\n\t"
91 MOVNTQ" %%mm0, %0\n\t"
92 MOVNTQ" %%mm1, 8%0\n\t"
93 MOVNTQ" %%mm2, 16%0\n\t"
/* Flush the non-temporal stores and leave MMX state. */
101 __asm __volatile(SFENCE:::"memory");
102 __asm __volatile(EMMS:::"memory");
106 #ifdef WORDS_BIGENDIAN
107 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
/* rgb32to24: drop the pad byte of 32-bit pixels, repacking the remaining
 * 3-byte pixels densely (four input quadwords are shifted/merged through the
 * mask24* constants into three output quadwords).
 * NOTE(review): fragmentary listing -- loop header, several shift/mask steps
 * and the C fallback are missing between the numbered statements. */
122 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
125 const uint8_t *s = src;
128 const uint8_t *mm_end;
132 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
139 "movq 8%1, %%mm1\n\t"
140 "movq 16%1, %%mm4\n\t"
141 "movq 24%1, %%mm5\n\t"
142 "movq %%mm0, %%mm2\n\t"
143 "movq %%mm1, %%mm3\n\t"
144 "movq %%mm4, %%mm6\n\t"
145 "movq %%mm5, %%mm7\n\t"
/* Shift out the low pad byte in each copy. */
146 "psrlq $8, %%mm2\n\t"
147 "psrlq $8, %%mm3\n\t"
148 "psrlq $8, %%mm6\n\t"
149 "psrlq $8, %%mm7\n\t"
158 "por %%mm2, %%mm0\n\t"
159 "por %%mm3, %%mm1\n\t"
160 "por %%mm6, %%mm4\n\t"
161 "por %%mm7, %%mm5\n\t"
163 "movq %%mm1, %%mm2\n\t"
164 "movq %%mm4, %%mm3\n\t"
165 "psllq $48, %%mm2\n\t"
166 "psllq $32, %%mm3\n\t"
169 "por %%mm2, %%mm0\n\t"
170 "psrlq $16, %%mm1\n\t"
171 "psrlq $32, %%mm4\n\t"
172 "psllq $16, %%mm5\n\t"
173 "por %%mm3, %%mm1\n\t"
175 "por %%mm5, %%mm4\n\t"
177 MOVNTQ" %%mm0, %0\n\t"
178 MOVNTQ" %%mm1, 8%0\n\t"
181 :"m"(*s),"m"(mask24l),
182 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
187 __asm __volatile(SFENCE:::"memory");
188 __asm __volatile(EMMS:::"memory");
192 #ifdef WORDS_BIGENDIAN
193 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
209 Original by Strepto/Astral
210 ported to gcc & bugfixed : A'rpi
211 MMX2, 3DNOW optimization by Nick Kurshev
212 32-bit C version, and the and&add trick by Michael Niedermayer
/* rgb15to16: convert RGB555 to RGB565 with the and&add trick: keeping the
 * whole 15-bit value and adding its G+R part (mask 0x7FE0) shifts G and R up
 * one bit while leaving B in place. MMX path processes 16 bytes per step;
 * the scalar fallbacks below apply the identical formula to 4 and 2 bytes.
 * NOTE(review): fragmentary listing -- the loop headers and the second MOVNTQ
 * are missing between the numbered statements. */
214 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
216 register const uint8_t* s=src;
217 register uint8_t* d=dst;
218 register const uint8_t *end;
219 const uint8_t *mm_end;
/* mask15s (in %%mm4) selects the bits that get doubled by the paddw. */
222 __asm __volatile(PREFETCH" %0"::"m"(*s));
223 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
230 "movq 8%1, %%mm2\n\t"
231 "movq %%mm0, %%mm1\n\t"
232 "movq %%mm2, %%mm3\n\t"
233 "pand %%mm4, %%mm0\n\t"
234 "pand %%mm4, %%mm2\n\t"
235 "paddw %%mm1, %%mm0\n\t"
236 "paddw %%mm3, %%mm2\n\t"
237 MOVNTQ" %%mm0, %0\n\t"
245 __asm __volatile(SFENCE:::"memory");
246 __asm __volatile(EMMS:::"memory");
/* 32-bit scalar tail: two pixels at a time. */
251 register unsigned x= *((uint32_t *)s);
252 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
/* 16-bit scalar tail: final odd pixel. */
258 register unsigned short x= *((uint16_t *)s);
259 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/* rgb16to15: convert RGB565 back to RGB555: shift the value right by one so
 * R and G drop one bit (masked by mask15rg), then OR the untouched 5-bit
 * blue field back in (mask15b). Scalar fallbacks use the same >>1/mask form.
 * NOTE(review): fragmentary listing -- loop headers and the second MOVNTQ
 * are missing between the numbered statements. */
263 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
265 register const uint8_t* s=src;
266 register uint8_t* d=dst;
267 register const uint8_t *end;
268 const uint8_t *mm_end;
271 __asm __volatile(PREFETCH" %0"::"m"(*s));
272 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
273 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
280 "movq 8%1, %%mm2\n\t"
281 "movq %%mm0, %%mm1\n\t"
282 "movq %%mm2, %%mm3\n\t"
283 "psrlq $1, %%mm0\n\t"
284 "psrlq $1, %%mm2\n\t"
285 "pand %%mm7, %%mm0\n\t"
286 "pand %%mm7, %%mm2\n\t"
287 "pand %%mm6, %%mm1\n\t"
288 "pand %%mm6, %%mm3\n\t"
289 "por %%mm1, %%mm0\n\t"
290 "por %%mm3, %%mm2\n\t"
291 MOVNTQ" %%mm0, %0\n\t"
299 __asm __volatile(SFENCE:::"memory");
300 __asm __volatile(EMMS:::"memory");
/* 32-bit scalar tail: two pixels per iteration. */
305 register uint32_t x= *((uint32_t *)s);
306 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
/* 16-bit scalar tail: final odd pixel. */
312 register uint16_t x= *((uint16_t *)s);
313 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/* rgb32to16: pack 32-bit RGB down to RGB565. Two MMX variants:
 *  - the "#if 1" variant uses pmaddwd with the mul3216 constant to combine
 *    the B and G/R contributions in one multiply-add;
 *  - the alternate variant shifts and masks each channel separately
 *    (red_16mask/green_16mask/blue_16mask in %%mm7/%%mm6).
 * The scalar tail builds the 565 word from one 32-bit pixel.
 * NOTE(review): fragmentary listing -- loop headers, #else/#endif and some
 * statements are missing between the numbered lines. */
319 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
321 const uint8_t *s = src;
324 const uint8_t *mm_end;
326 uint16_t *d = (uint16_t *)dst;
330 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it's slightly faster)
332 "movq %3, %%mm5 \n\t"
333 "movq %4, %%mm6 \n\t"
334 "movq %5, %%mm7 \n\t"
337 PREFETCH" 32(%1) \n\t"
338 "movd (%1), %%mm0 \n\t"
339 "movd 4(%1), %%mm3 \n\t"
340 "punpckldq 8(%1), %%mm0 \n\t"
341 "punpckldq 12(%1), %%mm3 \n\t"
342 "movq %%mm0, %%mm1 \n\t"
343 "movq %%mm3, %%mm4 \n\t"
344 "pand %%mm6, %%mm0 \n\t"
345 "pand %%mm6, %%mm3 \n\t"
346 "pmaddwd %%mm7, %%mm0 \n\t"
347 "pmaddwd %%mm7, %%mm3 \n\t"
348 "pand %%mm5, %%mm1 \n\t"
349 "pand %%mm5, %%mm4 \n\t"
350 "por %%mm1, %%mm0 \n\t"
351 "por %%mm4, %%mm3 \n\t"
352 "psrld $5, %%mm0 \n\t"
353 "pslld $11, %%mm3 \n\t"
354 "por %%mm3, %%mm0 \n\t"
355 MOVNTQ" %%mm0, (%0) \n\t"
361 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
/* Alternate (shift/mask) MMX variant. */
364 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
368 ::"m"(red_16mask),"m"(green_16mask));
374 "movd 4%1, %%mm3\n\t"
375 "punpckldq 8%1, %%mm0\n\t"
376 "punpckldq 12%1, %%mm3\n\t"
377 "movq %%mm0, %%mm1\n\t"
378 "movq %%mm0, %%mm2\n\t"
379 "movq %%mm3, %%mm4\n\t"
380 "movq %%mm3, %%mm5\n\t"
381 "psrlq $3, %%mm0\n\t"
382 "psrlq $3, %%mm3\n\t"
385 "psrlq $5, %%mm1\n\t"
386 "psrlq $5, %%mm4\n\t"
387 "pand %%mm6, %%mm1\n\t"
388 "pand %%mm6, %%mm4\n\t"
389 "psrlq $8, %%mm2\n\t"
390 "psrlq $8, %%mm5\n\t"
391 "pand %%mm7, %%mm2\n\t"
392 "pand %%mm7, %%mm5\n\t"
393 "por %%mm1, %%mm0\n\t"
394 "por %%mm4, %%mm3\n\t"
395 "por %%mm2, %%mm0\n\t"
396 "por %%mm5, %%mm3\n\t"
397 "psllq $16, %%mm3\n\t"
398 "por %%mm3, %%mm0\n\t"
399 MOVNTQ" %%mm0, %0\n\t"
400 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
405 __asm __volatile(SFENCE:::"memory");
406 __asm __volatile(EMMS:::"memory");
/* Scalar tail: one pixel -> one RGB565 word. */
410 register int rgb = *(uint32_t*)s; s += 4;
411 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/* rgb32tobgr16: pack 32-bit pixels to 16-bit 565 with R and B swapped
 * (note psllq $8 / psrlq $19 instead of the psrlq $3/$8 of rgb32to16).
 * Scalar tail mirrors the same channel swap.
 * NOTE(review): fragmentary listing -- loop headers and some statements are
 * missing between the numbered lines. */
415 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
417 const uint8_t *s = src;
420 const uint8_t *mm_end;
422 uint16_t *d = (uint16_t *)dst;
425 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
429 ::"m"(red_16mask),"m"(green_16mask));
436 "movd 4%1, %%mm3\n\t"
437 "punpckldq 8%1, %%mm0\n\t"
438 "punpckldq 12%1, %%mm3\n\t"
439 "movq %%mm0, %%mm1\n\t"
440 "movq %%mm0, %%mm2\n\t"
441 "movq %%mm3, %%mm4\n\t"
442 "movq %%mm3, %%mm5\n\t"
443 "psllq $8, %%mm0\n\t"
444 "psllq $8, %%mm3\n\t"
445 "pand %%mm7, %%mm0\n\t"
446 "pand %%mm7, %%mm3\n\t"
447 "psrlq $5, %%mm1\n\t"
448 "psrlq $5, %%mm4\n\t"
449 "pand %%mm6, %%mm1\n\t"
450 "pand %%mm6, %%mm4\n\t"
451 "psrlq $19, %%mm2\n\t"
452 "psrlq $19, %%mm5\n\t"
455 "por %%mm1, %%mm0\n\t"
456 "por %%mm4, %%mm3\n\t"
457 "por %%mm2, %%mm0\n\t"
458 "por %%mm5, %%mm3\n\t"
459 "psllq $16, %%mm3\n\t"
460 "por %%mm3, %%mm0\n\t"
461 MOVNTQ" %%mm0, %0\n\t"
462 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
466 __asm __volatile(SFENCE:::"memory");
467 __asm __volatile(EMMS:::"memory");
/* Scalar tail: swapped-channel 565 packing. */
471 const int src= *s; s += 4;
472 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
/* rgb32to15: pack 32-bit RGB down to RGB555. Same two-variant structure as
 * rgb32to16 (pmaddwd with mul3215, or per-channel shift/mask with the
 * *_15mask constants); shift amounts differ because green is 5 bits here.
 * NOTE(review): fragmentary listing -- loop headers, #else/#endif and some
 * statements are missing between the numbered lines. */
476 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
478 const uint8_t *s = src;
481 const uint8_t *mm_end;
483 uint16_t *d = (uint16_t *)dst;
487 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it's slightly faster)
489 "movq %3, %%mm5 \n\t"
490 "movq %4, %%mm6 \n\t"
491 "movq %5, %%mm7 \n\t"
494 PREFETCH" 32(%1) \n\t"
495 "movd (%1), %%mm0 \n\t"
496 "movd 4(%1), %%mm3 \n\t"
497 "punpckldq 8(%1), %%mm0 \n\t"
498 "punpckldq 12(%1), %%mm3 \n\t"
499 "movq %%mm0, %%mm1 \n\t"
500 "movq %%mm3, %%mm4 \n\t"
501 "pand %%mm6, %%mm0 \n\t"
502 "pand %%mm6, %%mm3 \n\t"
503 "pmaddwd %%mm7, %%mm0 \n\t"
504 "pmaddwd %%mm7, %%mm3 \n\t"
505 "pand %%mm5, %%mm1 \n\t"
506 "pand %%mm5, %%mm4 \n\t"
507 "por %%mm1, %%mm0 \n\t"
508 "por %%mm4, %%mm3 \n\t"
509 "psrld $6, %%mm0 \n\t"
510 "pslld $10, %%mm3 \n\t"
511 "por %%mm3, %%mm0 \n\t"
512 MOVNTQ" %%mm0, (%0) \n\t"
518 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
/* Alternate (shift/mask) MMX variant. */
521 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
525 ::"m"(red_15mask),"m"(green_15mask));
531 "movd 4%1, %%mm3\n\t"
532 "punpckldq 8%1, %%mm0\n\t"
533 "punpckldq 12%1, %%mm3\n\t"
534 "movq %%mm0, %%mm1\n\t"
535 "movq %%mm0, %%mm2\n\t"
536 "movq %%mm3, %%mm4\n\t"
537 "movq %%mm3, %%mm5\n\t"
538 "psrlq $3, %%mm0\n\t"
539 "psrlq $3, %%mm3\n\t"
542 "psrlq $6, %%mm1\n\t"
543 "psrlq $6, %%mm4\n\t"
544 "pand %%mm6, %%mm1\n\t"
545 "pand %%mm6, %%mm4\n\t"
546 "psrlq $9, %%mm2\n\t"
547 "psrlq $9, %%mm5\n\t"
548 "pand %%mm7, %%mm2\n\t"
549 "pand %%mm7, %%mm5\n\t"
550 "por %%mm1, %%mm0\n\t"
551 "por %%mm4, %%mm3\n\t"
552 "por %%mm2, %%mm0\n\t"
553 "por %%mm5, %%mm3\n\t"
554 "psllq $16, %%mm3\n\t"
555 "por %%mm3, %%mm0\n\t"
556 MOVNTQ" %%mm0, %0\n\t"
557 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
562 __asm __volatile(SFENCE:::"memory");
563 __asm __volatile(EMMS:::"memory");
/* Scalar tail: one pixel -> one RGB555 word. */
567 const int src= *s; s += 4;
568 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
/* rgb32tobgr15: pack 32-bit pixels to 15-bit 555 with R and B swapped
 * (psllq $7 / psrlq $19 instead of rgb32to15's psrlq $3/$9).
 * NOTE(review): fragmentary listing -- loop headers and some statements are
 * missing between the numbered lines. */
572 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
574 const uint8_t *s = src;
577 const uint8_t *mm_end;
579 uint16_t *d = (uint16_t *)dst;
582 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
586 ::"m"(red_15mask),"m"(green_15mask));
593 "movd 4%1, %%mm3\n\t"
594 "punpckldq 8%1, %%mm0\n\t"
595 "punpckldq 12%1, %%mm3\n\t"
596 "movq %%mm0, %%mm1\n\t"
597 "movq %%mm0, %%mm2\n\t"
598 "movq %%mm3, %%mm4\n\t"
599 "movq %%mm3, %%mm5\n\t"
600 "psllq $7, %%mm0\n\t"
601 "psllq $7, %%mm3\n\t"
602 "pand %%mm7, %%mm0\n\t"
603 "pand %%mm7, %%mm3\n\t"
604 "psrlq $6, %%mm1\n\t"
605 "psrlq $6, %%mm4\n\t"
606 "pand %%mm6, %%mm1\n\t"
607 "pand %%mm6, %%mm4\n\t"
608 "psrlq $19, %%mm2\n\t"
609 "psrlq $19, %%mm5\n\t"
612 "por %%mm1, %%mm0\n\t"
613 "por %%mm4, %%mm3\n\t"
614 "por %%mm2, %%mm0\n\t"
615 "por %%mm5, %%mm3\n\t"
616 "psllq $16, %%mm3\n\t"
617 "por %%mm3, %%mm0\n\t"
618 MOVNTQ" %%mm0, %0\n\t"
619 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
623 __asm __volatile(SFENCE:::"memory");
624 __asm __volatile(EMMS:::"memory");
/* Scalar tail: swapped-channel 555 packing. */
628 const int src= *s; s += 4;
629 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
/* rgb24to16: pack 24-bit RGB to RGB565. Same shift/mask scheme as the
 * rgb32to16 alternate variant, but the source pixels are 3 bytes apart
 * (movd/punpckldq at offsets 3, 6, 9). Scalar tail uses b/g/r bytes read
 * in code not visible here.
 * NOTE(review): fragmentary listing -- loop headers and the b/g/r loads of
 * the scalar tail are missing between the numbered lines. */
633 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
635 const uint8_t *s = src;
638 const uint8_t *mm_end;
640 uint16_t *d = (uint16_t *)dst;
643 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
647 ::"m"(red_16mask),"m"(green_16mask));
654 "movd 3%1, %%mm3\n\t"
655 "punpckldq 6%1, %%mm0\n\t"
656 "punpckldq 9%1, %%mm3\n\t"
657 "movq %%mm0, %%mm1\n\t"
658 "movq %%mm0, %%mm2\n\t"
659 "movq %%mm3, %%mm4\n\t"
660 "movq %%mm3, %%mm5\n\t"
661 "psrlq $3, %%mm0\n\t"
662 "psrlq $3, %%mm3\n\t"
665 "psrlq $5, %%mm1\n\t"
666 "psrlq $5, %%mm4\n\t"
667 "pand %%mm6, %%mm1\n\t"
668 "pand %%mm6, %%mm4\n\t"
669 "psrlq $8, %%mm2\n\t"
670 "psrlq $8, %%mm5\n\t"
671 "pand %%mm7, %%mm2\n\t"
672 "pand %%mm7, %%mm5\n\t"
673 "por %%mm1, %%mm0\n\t"
674 "por %%mm4, %%mm3\n\t"
675 "por %%mm2, %%mm0\n\t"
676 "por %%mm5, %%mm3\n\t"
677 "psllq $16, %%mm3\n\t"
678 "por %%mm3, %%mm0\n\t"
679 MOVNTQ" %%mm0, %0\n\t"
680 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
684 __asm __volatile(SFENCE:::"memory");
685 __asm __volatile(EMMS:::"memory");
692 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/* rgb24tobgr16: pack 24-bit pixels to 565 with R and B swapped relative to
 * rgb24to16 (psllq $8 / psrlq $19 channel placement).
 * NOTE(review): fragmentary listing -- loop headers and the b/g/r loads of
 * the scalar tail are missing between the numbered lines. */
696 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
698 const uint8_t *s = src;
701 const uint8_t *mm_end;
703 uint16_t *d = (uint16_t *)dst;
706 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
710 ::"m"(red_16mask),"m"(green_16mask));
717 "movd 3%1, %%mm3\n\t"
718 "punpckldq 6%1, %%mm0\n\t"
719 "punpckldq 9%1, %%mm3\n\t"
720 "movq %%mm0, %%mm1\n\t"
721 "movq %%mm0, %%mm2\n\t"
722 "movq %%mm3, %%mm4\n\t"
723 "movq %%mm3, %%mm5\n\t"
724 "psllq $8, %%mm0\n\t"
725 "psllq $8, %%mm3\n\t"
726 "pand %%mm7, %%mm0\n\t"
727 "pand %%mm7, %%mm3\n\t"
728 "psrlq $5, %%mm1\n\t"
729 "psrlq $5, %%mm4\n\t"
730 "pand %%mm6, %%mm1\n\t"
731 "pand %%mm6, %%mm4\n\t"
732 "psrlq $19, %%mm2\n\t"
733 "psrlq $19, %%mm5\n\t"
736 "por %%mm1, %%mm0\n\t"
737 "por %%mm4, %%mm3\n\t"
738 "por %%mm2, %%mm0\n\t"
739 "por %%mm5, %%mm3\n\t"
740 "psllq $16, %%mm3\n\t"
741 "por %%mm3, %%mm0\n\t"
742 MOVNTQ" %%mm0, %0\n\t"
743 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
747 __asm __volatile(SFENCE:::"memory");
748 __asm __volatile(EMMS:::"memory");
755 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/* rgb24to15: pack 24-bit RGB to RGB555 (5-bit green, hence the $6/$9 shifts
 * instead of rgb24to16's $5/$8).
 * NOTE(review): fragmentary listing -- loop headers and the b/g/r loads of
 * the scalar tail are missing between the numbered lines. */
759 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
761 const uint8_t *s = src;
764 const uint8_t *mm_end;
766 uint16_t *d = (uint16_t *)dst;
769 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
773 ::"m"(red_15mask),"m"(green_15mask));
780 "movd 3%1, %%mm3\n\t"
781 "punpckldq 6%1, %%mm0\n\t"
782 "punpckldq 9%1, %%mm3\n\t"
783 "movq %%mm0, %%mm1\n\t"
784 "movq %%mm0, %%mm2\n\t"
785 "movq %%mm3, %%mm4\n\t"
786 "movq %%mm3, %%mm5\n\t"
787 "psrlq $3, %%mm0\n\t"
788 "psrlq $3, %%mm3\n\t"
791 "psrlq $6, %%mm1\n\t"
792 "psrlq $6, %%mm4\n\t"
793 "pand %%mm6, %%mm1\n\t"
794 "pand %%mm6, %%mm4\n\t"
795 "psrlq $9, %%mm2\n\t"
796 "psrlq $9, %%mm5\n\t"
797 "pand %%mm7, %%mm2\n\t"
798 "pand %%mm7, %%mm5\n\t"
799 "por %%mm1, %%mm0\n\t"
800 "por %%mm4, %%mm3\n\t"
801 "por %%mm2, %%mm0\n\t"
802 "por %%mm5, %%mm3\n\t"
803 "psllq $16, %%mm3\n\t"
804 "por %%mm3, %%mm0\n\t"
805 MOVNTQ" %%mm0, %0\n\t"
806 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
810 __asm __volatile(SFENCE:::"memory");
811 __asm __volatile(EMMS:::"memory");
818 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/* rgb24tobgr15: pack 24-bit pixels to 555 with R and B swapped relative to
 * rgb24to15 (psllq $7 / psrlq $19 channel placement).
 * NOTE(review): fragmentary listing -- loop headers and the b/g/r loads of
 * the scalar tail are missing between the numbered lines. */
822 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
824 const uint8_t *s = src;
827 const uint8_t *mm_end;
829 uint16_t *d = (uint16_t *)dst;
832 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
836 ::"m"(red_15mask),"m"(green_15mask));
843 "movd 3%1, %%mm3\n\t"
844 "punpckldq 6%1, %%mm0\n\t"
845 "punpckldq 9%1, %%mm3\n\t"
846 "movq %%mm0, %%mm1\n\t"
847 "movq %%mm0, %%mm2\n\t"
848 "movq %%mm3, %%mm4\n\t"
849 "movq %%mm3, %%mm5\n\t"
850 "psllq $7, %%mm0\n\t"
851 "psllq $7, %%mm3\n\t"
852 "pand %%mm7, %%mm0\n\t"
853 "pand %%mm7, %%mm3\n\t"
854 "psrlq $6, %%mm1\n\t"
855 "psrlq $6, %%mm4\n\t"
856 "pand %%mm6, %%mm1\n\t"
857 "pand %%mm6, %%mm4\n\t"
858 "psrlq $19, %%mm2\n\t"
859 "psrlq $19, %%mm5\n\t"
862 "por %%mm1, %%mm0\n\t"
863 "por %%mm4, %%mm3\n\t"
864 "por %%mm2, %%mm0\n\t"
865 "por %%mm5, %%mm3\n\t"
866 "psllq $16, %%mm3\n\t"
867 "por %%mm3, %%mm0\n\t"
868 MOVNTQ" %%mm0, %0\n\t"
869 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
873 __asm __volatile(SFENCE:::"memory");
874 __asm __volatile(EMMS:::"memory");
881 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
886 I use a less accurate approximation here by simply
887 left-shifting the input
888 value and filling the low order bits with
889 zeroes. This method improves png's
890 compression but this scheme cannot reproduce white exactly, since it does not
891 generate an all-ones maximum value; the net effect is to darken the
894 The better method should be "left bit replication":
904 | Leftmost Bits Repeated to Fill Open Bits
/* rgb15to24: expand RGB555 to 24-bit RGB. Each 16-bit pixel is split into
 * B/G/R fields (mask15b/g/r), scaled to 8 bits by shifting (low bits are
 * zero-filled -- see the accuracy note above this function), widened to words
 * against mmx_null, recombined to 32-bit-style pixels, then compacted to
 * 3 bytes/pixel by the "Borrowed 32 to 24" sequence using the mask24*
 * constants. Scalar tail handles the remaining pixels.
 * NOTE(review): fragmentary listing -- loop headers, several pand lines and
 * the bgr load of the scalar tail are missing between the numbered lines. */
908 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
912 const uint16_t *mm_end;
914 uint8_t *d = (uint8_t *)dst;
915 const uint16_t *s = (uint16_t *)src;
916 end = s + src_size/2;
918 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* Scale each channel to the top of an 8-bit field. */
930 "psllq $3, %%mm0\n\t"
931 "psrlq $2, %%mm1\n\t"
932 "psrlq $7, %%mm2\n\t"
933 "movq %%mm0, %%mm3\n\t"
934 "movq %%mm1, %%mm4\n\t"
935 "movq %%mm2, %%mm5\n\t"
/* Widen words to dwords against the zero constant (%5 = mmx_null). */
936 "punpcklwd %5, %%mm0\n\t"
937 "punpcklwd %5, %%mm1\n\t"
938 "punpcklwd %5, %%mm2\n\t"
939 "punpckhwd %5, %%mm3\n\t"
940 "punpckhwd %5, %%mm4\n\t"
941 "punpckhwd %5, %%mm5\n\t"
942 "psllq $8, %%mm1\n\t"
943 "psllq $16, %%mm2\n\t"
944 "por %%mm1, %%mm0\n\t"
945 "por %%mm2, %%mm0\n\t"
946 "psllq $8, %%mm4\n\t"
947 "psllq $16, %%mm5\n\t"
948 "por %%mm4, %%mm3\n\t"
949 "por %%mm5, %%mm3\n\t"
/* Stash the first 8 expanded pixels while the next 8 are built. */
951 "movq %%mm0, %%mm6\n\t"
952 "movq %%mm3, %%mm7\n\t"
954 "movq 8%1, %%mm0\n\t"
955 "movq 8%1, %%mm1\n\t"
956 "movq 8%1, %%mm2\n\t"
960 "psllq $3, %%mm0\n\t"
961 "psrlq $2, %%mm1\n\t"
962 "psrlq $7, %%mm2\n\t"
963 "movq %%mm0, %%mm3\n\t"
964 "movq %%mm1, %%mm4\n\t"
965 "movq %%mm2, %%mm5\n\t"
966 "punpcklwd %5, %%mm0\n\t"
967 "punpcklwd %5, %%mm1\n\t"
968 "punpcklwd %5, %%mm2\n\t"
969 "punpckhwd %5, %%mm3\n\t"
970 "punpckhwd %5, %%mm4\n\t"
971 "punpckhwd %5, %%mm5\n\t"
972 "psllq $8, %%mm1\n\t"
973 "psllq $16, %%mm2\n\t"
974 "por %%mm1, %%mm0\n\t"
975 "por %%mm2, %%mm0\n\t"
976 "psllq $8, %%mm4\n\t"
977 "psllq $16, %%mm5\n\t"
978 "por %%mm4, %%mm3\n\t"
979 "por %%mm5, %%mm3\n\t"
982 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
/* Borrowed 32 to 24 */
986 "movq %%mm0, %%mm4\n\t"
987 "movq %%mm3, %%mm5\n\t"
988 "movq %%mm6, %%mm0\n\t"
989 "movq %%mm7, %%mm1\n\t"
991 "movq %%mm4, %%mm6\n\t"
992 "movq %%mm5, %%mm7\n\t"
993 "movq %%mm0, %%mm2\n\t"
994 "movq %%mm1, %%mm3\n\t"
996 "psrlq $8, %%mm2\n\t"
997 "psrlq $8, %%mm3\n\t"
998 "psrlq $8, %%mm6\n\t"
999 "psrlq $8, %%mm7\n\t"
1000 "pand %2, %%mm0\n\t"
1001 "pand %2, %%mm1\n\t"
1002 "pand %2, %%mm4\n\t"
1003 "pand %2, %%mm5\n\t"
1004 "pand %3, %%mm2\n\t"
1005 "pand %3, %%mm3\n\t"
1006 "pand %3, %%mm6\n\t"
1007 "pand %3, %%mm7\n\t"
1008 "por %%mm2, %%mm0\n\t"
1009 "por %%mm3, %%mm1\n\t"
1010 "por %%mm6, %%mm4\n\t"
1011 "por %%mm7, %%mm5\n\t"
1013 "movq %%mm1, %%mm2\n\t"
1014 "movq %%mm4, %%mm3\n\t"
1015 "psllq $48, %%mm2\n\t"
1016 "psllq $32, %%mm3\n\t"
1017 "pand %4, %%mm2\n\t"
1018 "pand %5, %%mm3\n\t"
1019 "por %%mm2, %%mm0\n\t"
1020 "psrlq $16, %%mm1\n\t"
1021 "psrlq $32, %%mm4\n\t"
1022 "psllq $16, %%mm5\n\t"
1023 "por %%mm3, %%mm1\n\t"
1024 "pand %6, %%mm5\n\t"
1025 "por %%mm5, %%mm4\n\t"
1027 MOVNTQ" %%mm0, %0\n\t"
1028 MOVNTQ" %%mm1, 8%0\n\t"
1029 MOVNTQ" %%mm4, 16%0"
1032 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1037 __asm __volatile(SFENCE:::"memory");
1038 __asm __volatile(EMMS:::"memory");
/* Scalar tail: expand one 555 pixel to three bytes (B, G, R order here). */
1042 register uint16_t bgr;
1044 *d++ = (bgr&0x1F)<<3;
1045 *d++ = (bgr&0x3E0)>>2;
1046 *d++ = (bgr&0x7C00)>>7;
/* rgb16to24: expand RGB565 to 24-bit RGB. Same structure as rgb15to24 but
 * with the 565 masks (mask16b/g/r) and 6-bit-green shift amounts; the
 * expanded pixels are compacted by the shared "Borrowed 32 to 24" sequence.
 * NOTE(review): fragmentary listing -- loop headers and the bgr load of the
 * scalar tail are missing between the numbered lines. */
1050 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1052 const uint16_t *end;
1054 const uint16_t *mm_end;
1056 uint8_t *d = (uint8_t *)dst;
1057 const uint16_t *s = (const uint16_t *)src;
1058 end = s + src_size/2;
1060 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1066 "movq %1, %%mm0\n\t"
1067 "movq %1, %%mm1\n\t"
1068 "movq %1, %%mm2\n\t"
/* Isolate B, G, R (%2..%4 are the mask16* constants). */
1069 "pand %2, %%mm0\n\t"
1070 "pand %3, %%mm1\n\t"
1071 "pand %4, %%mm2\n\t"
1072 "psllq $3, %%mm0\n\t"
1073 "psrlq $3, %%mm1\n\t"
1074 "psrlq $8, %%mm2\n\t"
1075 "movq %%mm0, %%mm3\n\t"
1076 "movq %%mm1, %%mm4\n\t"
1077 "movq %%mm2, %%mm5\n\t"
1078 "punpcklwd %5, %%mm0\n\t"
1079 "punpcklwd %5, %%mm1\n\t"
1080 "punpcklwd %5, %%mm2\n\t"
1081 "punpckhwd %5, %%mm3\n\t"
1082 "punpckhwd %5, %%mm4\n\t"
1083 "punpckhwd %5, %%mm5\n\t"
1084 "psllq $8, %%mm1\n\t"
1085 "psllq $16, %%mm2\n\t"
1086 "por %%mm1, %%mm0\n\t"
1087 "por %%mm2, %%mm0\n\t"
1088 "psllq $8, %%mm4\n\t"
1089 "psllq $16, %%mm5\n\t"
1090 "por %%mm4, %%mm3\n\t"
1091 "por %%mm5, %%mm3\n\t"
/* Stash the first half while the second half is expanded. */
1093 "movq %%mm0, %%mm6\n\t"
1094 "movq %%mm3, %%mm7\n\t"
1096 "movq 8%1, %%mm0\n\t"
1097 "movq 8%1, %%mm1\n\t"
1098 "movq 8%1, %%mm2\n\t"
1099 "pand %2, %%mm0\n\t"
1100 "pand %3, %%mm1\n\t"
1101 "pand %4, %%mm2\n\t"
1102 "psllq $3, %%mm0\n\t"
1103 "psrlq $3, %%mm1\n\t"
1104 "psrlq $8, %%mm2\n\t"
1105 "movq %%mm0, %%mm3\n\t"
1106 "movq %%mm1, %%mm4\n\t"
1107 "movq %%mm2, %%mm5\n\t"
1108 "punpcklwd %5, %%mm0\n\t"
1109 "punpcklwd %5, %%mm1\n\t"
1110 "punpcklwd %5, %%mm2\n\t"
1111 "punpckhwd %5, %%mm3\n\t"
1112 "punpckhwd %5, %%mm4\n\t"
1113 "punpckhwd %5, %%mm5\n\t"
1114 "psllq $8, %%mm1\n\t"
1115 "psllq $16, %%mm2\n\t"
1116 "por %%mm1, %%mm0\n\t"
1117 "por %%mm2, %%mm0\n\t"
1118 "psllq $8, %%mm4\n\t"
1119 "psllq $16, %%mm5\n\t"
1120 "por %%mm4, %%mm3\n\t"
1121 "por %%mm5, %%mm3\n\t"
1123 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
/* Borrowed 32 to 24 */
1127 "movq %%mm0, %%mm4\n\t"
1128 "movq %%mm3, %%mm5\n\t"
1129 "movq %%mm6, %%mm0\n\t"
1130 "movq %%mm7, %%mm1\n\t"
1132 "movq %%mm4, %%mm6\n\t"
1133 "movq %%mm5, %%mm7\n\t"
1134 "movq %%mm0, %%mm2\n\t"
1135 "movq %%mm1, %%mm3\n\t"
1137 "psrlq $8, %%mm2\n\t"
1138 "psrlq $8, %%mm3\n\t"
1139 "psrlq $8, %%mm6\n\t"
1140 "psrlq $8, %%mm7\n\t"
1141 "pand %2, %%mm0\n\t"
1142 "pand %2, %%mm1\n\t"
1143 "pand %2, %%mm4\n\t"
1144 "pand %2, %%mm5\n\t"
1145 "pand %3, %%mm2\n\t"
1146 "pand %3, %%mm3\n\t"
1147 "pand %3, %%mm6\n\t"
1148 "pand %3, %%mm7\n\t"
1149 "por %%mm2, %%mm0\n\t"
1150 "por %%mm3, %%mm1\n\t"
1151 "por %%mm6, %%mm4\n\t"
1152 "por %%mm7, %%mm5\n\t"
1154 "movq %%mm1, %%mm2\n\t"
1155 "movq %%mm4, %%mm3\n\t"
1156 "psllq $48, %%mm2\n\t"
1157 "psllq $32, %%mm3\n\t"
1158 "pand %4, %%mm2\n\t"
1159 "pand %5, %%mm3\n\t"
1160 "por %%mm2, %%mm0\n\t"
1161 "psrlq $16, %%mm1\n\t"
1162 "psrlq $32, %%mm4\n\t"
1163 "psllq $16, %%mm5\n\t"
1164 "por %%mm3, %%mm1\n\t"
1165 "pand %6, %%mm5\n\t"
1166 "por %%mm5, %%mm4\n\t"
1168 MOVNTQ" %%mm0, %0\n\t"
1169 MOVNTQ" %%mm1, 8%0\n\t"
1170 MOVNTQ" %%mm4, 16%0"
1173 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1178 __asm __volatile(SFENCE:::"memory");
1179 __asm __volatile(EMMS:::"memory");
/* Scalar tail: expand one 565 pixel to three bytes. */
1183 register uint16_t bgr;
1185 *d++ = (bgr&0x1F)<<3;
1186 *d++ = (bgr&0x7E0)>>3;
1187 *d++ = (bgr&0xF800)>>8;
/* rgb15to32: expand RGB555 to 32-bit pixels. Channels are isolated with
 * mask15b/g/r, scaled to 8 bits, widened against a zeroed %%mm7 and merged;
 * the pad byte of the output is therefore zero. Scalar tail honours
 * WORDS_BIGENDIAN for byte order.
 * NOTE(review): fragmentary listing -- loop headers and the bgr load of the
 * scalar tail are missing between the numbered lines. */
1191 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1193 const uint16_t *end;
1195 const uint16_t *mm_end;
1197 uint8_t *d = (uint8_t *)dst;
1198 const uint16_t *s = (const uint16_t *)src;
1199 end = s + src_size/2;
1201 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1202 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1208 "movq %1, %%mm0\n\t"
1209 "movq %1, %%mm1\n\t"
1210 "movq %1, %%mm2\n\t"
1211 "pand %2, %%mm0\n\t"
1212 "pand %3, %%mm1\n\t"
1213 "pand %4, %%mm2\n\t"
1214 "psllq $3, %%mm0\n\t"
1215 "psrlq $2, %%mm1\n\t"
1216 "psrlq $7, %%mm2\n\t"
1217 "movq %%mm0, %%mm3\n\t"
1218 "movq %%mm1, %%mm4\n\t"
1219 "movq %%mm2, %%mm5\n\t"
1220 "punpcklwd %%mm7, %%mm0\n\t"
1221 "punpcklwd %%mm7, %%mm1\n\t"
1222 "punpcklwd %%mm7, %%mm2\n\t"
1223 "punpckhwd %%mm7, %%mm3\n\t"
1224 "punpckhwd %%mm7, %%mm4\n\t"
1225 "punpckhwd %%mm7, %%mm5\n\t"
1226 "psllq $8, %%mm1\n\t"
1227 "psllq $16, %%mm2\n\t"
1228 "por %%mm1, %%mm0\n\t"
1229 "por %%mm2, %%mm0\n\t"
1230 "psllq $8, %%mm4\n\t"
1231 "psllq $16, %%mm5\n\t"
1232 "por %%mm4, %%mm3\n\t"
1233 "por %%mm5, %%mm3\n\t"
1234 MOVNTQ" %%mm0, %0\n\t"
1235 MOVNTQ" %%mm3, 8%0\n\t"
1237 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1242 __asm __volatile(SFENCE:::"memory");
1243 __asm __volatile(EMMS:::"memory");
1247 #if 0 //slightly slower on athlon
1249 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1251 register uint16_t bgr;
1253 #ifdef WORDS_BIGENDIAN
1255 *d++ = (bgr&0x7C00)>>7;
1256 *d++ = (bgr&0x3E0)>>2;
1257 *d++ = (bgr&0x1F)<<3;
1259 *d++ = (bgr&0x1F)<<3;
1260 *d++ = (bgr&0x3E0)>>2;
1261 *d++ = (bgr&0x7C00)>>7;
/* rgb16to32: expand RGB565 to 32-bit pixels. Same structure as rgb15to32
 * with the mask16* constants and 6-bit-green shifts ($3/$8 instead of $2/$7).
 * NOTE(review): fragmentary listing -- loop headers and the bgr load of the
 * scalar tail are missing between the numbered lines. */
1269 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1271 const uint16_t *end;
1273 const uint16_t *mm_end;
1275 uint8_t *d = (uint8_t *)dst;
1276 const uint16_t *s = (uint16_t *)src;
1277 end = s + src_size/2;
1279 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1280 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1286 "movq %1, %%mm0\n\t"
1287 "movq %1, %%mm1\n\t"
1288 "movq %1, %%mm2\n\t"
1289 "pand %2, %%mm0\n\t"
1290 "pand %3, %%mm1\n\t"
1291 "pand %4, %%mm2\n\t"
1292 "psllq $3, %%mm0\n\t"
1293 "psrlq $3, %%mm1\n\t"
1294 "psrlq $8, %%mm2\n\t"
1295 "movq %%mm0, %%mm3\n\t"
1296 "movq %%mm1, %%mm4\n\t"
1297 "movq %%mm2, %%mm5\n\t"
1298 "punpcklwd %%mm7, %%mm0\n\t"
1299 "punpcklwd %%mm7, %%mm1\n\t"
1300 "punpcklwd %%mm7, %%mm2\n\t"
1301 "punpckhwd %%mm7, %%mm3\n\t"
1302 "punpckhwd %%mm7, %%mm4\n\t"
1303 "punpckhwd %%mm7, %%mm5\n\t"
1304 "psllq $8, %%mm1\n\t"
1305 "psllq $16, %%mm2\n\t"
1306 "por %%mm1, %%mm0\n\t"
1307 "por %%mm2, %%mm0\n\t"
1308 "psllq $8, %%mm4\n\t"
1309 "psllq $16, %%mm5\n\t"
1310 "por %%mm4, %%mm3\n\t"
1311 "por %%mm5, %%mm3\n\t"
1312 MOVNTQ" %%mm0, %0\n\t"
1313 MOVNTQ" %%mm3, 8%0\n\t"
1315 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1320 __asm __volatile(SFENCE:::"memory");
1321 __asm __volatile(EMMS:::"memory");
/* Scalar tail: byte order depends on endianness. */
1325 register uint16_t bgr;
1327 #ifdef WORDS_BIGENDIAN
1329 *d++ = (bgr&0xF800)>>8;
1330 *d++ = (bgr&0x7E0)>>3;
1331 *d++ = (bgr&0x1F)<<3;
1333 *d++ = (bgr&0x1F)<<3;
1334 *d++ = (bgr&0x7E0)>>3;
1335 *d++ = (bgr&0xF800)>>8;
/* rgb32tobgr32: swap the R and B bytes of 32-bit pixels in place order.
 * MMX path shifts copies of each quadword by 16 bits each way and merges the
 * three channel masks (mask32r/g/b); the C fallback swaps bytes explicitly,
 * with offsets depending on endianness.
 * NOTE(review): fragmentary listing -- loop labels/branches and some lines
 * of the C fallback are missing between the numbered statements. */
1341 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
/* TODO: unroll this loop */
1346 "xor %%"REG_a", %%"REG_a" \n\t"
1349 PREFETCH" 32(%0, %%"REG_a") \n\t"
1350 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1351 "movq %%mm0, %%mm1 \n\t"
1352 "movq %%mm0, %%mm2 \n\t"
1353 "pslld $16, %%mm0 \n\t"
1354 "psrld $16, %%mm1 \n\t"
1355 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1356 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1357 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1358 "por %%mm0, %%mm2 \n\t"
1359 "por %%mm1, %%mm2 \n\t"
1360 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
1361 "add $8, %%"REG_a" \n\t"
1362 "cmp %2, %%"REG_a" \n\t"
1364 :: "r" (src), "r"(dst), "r" (src_size-7)
1368 __asm __volatile(SFENCE:::"memory");
1369 __asm __volatile(EMMS:::"memory");
/* Scalar path: 4 bytes per pixel, R<->B swapped. */
1372 unsigned num_pixels = src_size >> 2;
1373 for(i=0; i<num_pixels; i++)
1375 #ifdef WORDS_BIGENDIAN
1376 dst[4*i + 1] = src[4*i + 3];
1377 dst[4*i + 2] = src[4*i + 2];
1378 dst[4*i + 3] = src[4*i + 1];
1380 dst[4*i + 0] = src[4*i + 2];
1381 dst[4*i + 1] = src[4*i + 1];
1382 dst[4*i + 2] = src[4*i + 0];
/* rgb24tobgr24: swap R and B in packed 24-bit pixels. The MMX loop handles
 * 8 pixels (24 bytes) per iteration using the mask24r/g/b constants and
 * overlapping loads at byte offsets 2/6/8/10/14/16/18; mmx_size is chosen so
 * src-mmx_size/dst-mmx_size can be indexed by a negative-to-zero counter.
 * The scalar loop below finishes the (at most 23-byte) remainder.
 * NOTE(review): fragmentary listing -- loop labels, the counter branch and
 * parts of the scalar tail are missing between the numbered statements. */
1388 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1392 long mmx_size= 23 - src_size;
1394 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1395 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1396 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1399 PREFETCH" 32(%1, %%"REG_a") \n\t"
1400 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1401 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1402 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1403 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1404 "pand %%mm5, %%mm0 \n\t"
1405 "pand %%mm6, %%mm1 \n\t"
1406 "pand %%mm7, %%mm2 \n\t"
1407 "por %%mm0, %%mm1 \n\t"
1408 "por %%mm2, %%mm1 \n\t"
1409 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1410 MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
1411 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1412 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1413 "pand %%mm7, %%mm0 \n\t"
1414 "pand %%mm5, %%mm1 \n\t"
1415 "pand %%mm6, %%mm2 \n\t"
1416 "por %%mm0, %%mm1 \n\t"
1417 "por %%mm2, %%mm1 \n\t"
1418 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1419 MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
1420 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1421 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1422 "pand %%mm6, %%mm0 \n\t"
1423 "pand %%mm7, %%mm1 \n\t"
1424 "pand %%mm5, %%mm2 \n\t"
1425 "por %%mm0, %%mm1 \n\t"
1426 "por %%mm2, %%mm1 \n\t"
1427 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1428 "add $24, %%"REG_a" \n\t"
1431 : "r" (src-mmx_size), "r"(dst-mmx_size)
1434 __asm __volatile(SFENCE:::"memory");
1435 __asm __volatile(EMMS:::"memory");
1437 if(mmx_size==23) return; //finished, was a multiple of 8
/* Scalar remainder: swap bytes 0 and 2 of each 3-byte pixel. */
1441 src_size= 23-mmx_size;
1445 for(i=0; i<src_size; i+=3)
1449 dst[i + 1] = src[i + 1];
1450 dst[i + 2] = src[i + 0];
/* yuvPlanartoyuy2: interleave planar Y, U, V into packed YUY2 (Y U Y V),
 * one output line per input luma line; chroma lines advance only every
 * vertLumPerChroma luma lines (see the bottom of the loop). Paths:
 * MMX (punpcklbw/punpckhbw interleave, MOVNTQ store), Alpha MVI, a
 * 64-bit generic C path, and a 32-bit C path with big-endian handling.
 * NOTE(review): fragmentary listing -- loop labels/branches, parts of the
 * Alpha macro and the per-line pointer advances are missing between the
 * numbered statements. */
1455 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1456 long width, long height,
1457 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1460 const long chromWidth= width>>1;
1461 for(y=0; y<height; y++)
1464 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1466 "xor %%"REG_a", %%"REG_a" \n\t"
1469 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1470 PREFETCH" 32(%2, %%"REG_a") \n\t"
1471 PREFETCH" 32(%3, %%"REG_a") \n\t"
1472 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1473 "movq %%mm0, %%mm2 \n\t" // U(0)
1474 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1475 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1476 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1478 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1479 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1480 "movq %%mm3, %%mm4 \n\t" // Y(0)
1481 "movq %%mm5, %%mm6 \n\t" // Y(8)
1482 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1483 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1484 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1485 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1487 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1488 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1489 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1490 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1492 "add $8, %%"REG_a" \n\t"
1493 "cmp %4, %%"REG_a" \n\t"
1495 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
/* Alpha path: MVI unpack instructions build two YUY2 lines at once. */
1500 #if defined ARCH_ALPHA && defined HAVE_MVI
1501 #define pl2yuy2(n) \
1506 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1507 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1508 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1509 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1510 yuv1 = (u << 8) + (v << 24); \
1517 uint64_t *qdst = (uint64_t *) dst;
1518 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1519 const uint32_t *yc = (uint32_t *) ysrc;
1520 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1521 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1522 for(i = 0; i < chromWidth; i += 8){
1523 uint64_t y1, y2, yuv1, yuv2;
/* Software prefetch (load to the zero register). */
1526 asm("ldq $31,64(%0)" :: "r"(yc));
1527 asm("ldq $31,64(%0)" :: "r"(yc2));
1528 asm("ldq $31,64(%0)" :: "r"(uc));
1529 asm("ldq $31,64(%0)" :: "r"(vc));
/* Generic 64-bit C path: build two YUYV dwords per store. */
1547 #elif __WORDSIZE >= 64
1549 uint64_t *ldst = (uint64_t *) dst;
1550 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1551 for(i = 0; i < chromWidth; i += 2){
1553 k = yc[0] + (uc[0] << 8) +
1554 (yc[1] << 16) + (vc[0] << 24);
1555 l = yc[2] + (uc[1] << 8) +
1556 (yc[3] << 16) + (vc[1] << 24);
1557 *ldst++ = k + (l << 32);
/* 32-bit C path. */
1564 int i, *idst = (int32_t *) dst;
1565 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1566 for(i = 0; i < chromWidth; i++){
1567 #ifdef WORDS_BIGENDIAN
1568 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1569 (yc[1] << 8) + (vc[0] << 0);
1571 *idst++ = yc[0] + (uc[0] << 8) +
1572 (yc[1] << 16) + (vc[0] << 24);
/* Advance chroma pointers only once per vertLumPerChroma luma lines. */
1580 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1582 usrc += chromStride;
1583 vsrc += chromStride;
1597 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1598 * problem for anyone then tell me, and I'll fix it)
/* yv12toyuy2: YV12 (4:2:0 planar) -> YUY2; thin wrapper that forwards to
 * yuvPlanartoyuy2 with vertLumPerChroma=2 (one chroma line per two luma
 * lines). Chroma is duplicated, not interpolated (see FIXME). */
1600 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1601 long width, long height,
1602 long lumStride, long chromStride, long dstStride)
1604 //FIXME interpolate chroma
1605 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Convert planar YUV to packed UYVY.
 * vertLumPerChroma = number of luma lines sharing one chroma line
 * (2 for YV12-style input, 1 for YUV422P).
 * NOTE(review): width is presumably a multiple of 16 (asm path processes
 * 16 output pixels per iteration) — confirm with callers.
 */
1608 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1609 long width, long height,
1610 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1613 const long chromWidth= width>>1;
1614 for(y=0; y<height; y++)
1617 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
/* MMX path: per iteration interleave 8 U/V pairs with 16 luma bytes
 * into 32 output bytes, written with non-temporal stores. */
1619 "xor %%"REG_a", %%"REG_a" \n\t"
1622 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1623 PREFETCH" 32(%2, %%"REG_a") \n\t"
1624 PREFETCH" 32(%3, %%"REG_a") \n\t"
1625 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1626 "movq %%mm0, %%mm2 \n\t" // U(0)
1627 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1628 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1629 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1631 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1632 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1633 "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0) copy
1634 "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8) copy
/* Interleaving Y below the UV bytes yields U,Y,V,Y byte order = UYVY. */
1635 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1636 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1637 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1638 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1640 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1641 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1642 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1643 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1645 "add $8, %%"REG_a" \n\t"
1646 "cmp %4, %%"REG_a" \n\t"
1648 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1652 //FIXME adapt the alpha asm code from yv12->yuy2
/* Portable C fallback: pack 64 bits (two UYVY pixel pairs) at a time
 * when the machine word is wide enough. */
1654 #if __WORDSIZE >= 64
1656 uint64_t *ldst = (uint64_t *) dst;
1657 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1658 for(i = 0; i < chromWidth; i += 2){
1660 k = uc[0] + (yc[0] << 8) +
1661 (vc[0] << 16) + (yc[1] << 24);
1662 l = uc[1] + (yc[2] << 8) +
1663 (vc[1] << 16) + (yc[3] << 24);
1664 *ldst++ = k + (l << 32);
/* 32-bit fallback: one UYVY pixel pair per store, byte order fixed
 * per endianness so memory layout is U,Y,V,Y either way. */
1671 int i, *idst = (int32_t *) dst;
1672 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1673 for(i = 0; i < chromWidth; i++){
1674 #ifdef WORDS_BIGENDIAN
1675 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1676 (vc[0] << 8) + (yc[1] << 0);
1678 *idst++ = uc[0] + (yc[0] << 8) +
1679 (vc[0] << 16) + (yc[1] << 24);
/* Advance chroma pointers only once per vertLumPerChroma luma lines. */
1687 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1689 usrc += chromStride;
1690 vsrc += chromStride;
1704 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1705 * problem for anyone then tell me, and I'll fix it)
/*
 * Convert planar YV12 to packed UYVY.
 * Thin wrapper: chroma lines are repeated for every 2 luma lines
 * (vertLumPerChroma == 2), not interpolated.
 */
1707 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1708 long width, long height,
1709 long lumStride, long chromStride, long dstStride)
1711 //FIXME interpolate chroma
1712 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1717 * width should be a multiple of 16
/*
 * Convert planar YUV422P to packed YUY2.
 * Wrapper with vertLumPerChroma == 1: every luma line has its own chroma line.
 */
1719 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1720 long width, long height,
1721 long lumStride, long chromStride, long dstStride)
1723 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1728 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1729 * problem for anyone then tell me, and I'll fix it)
/*
 * Convert packed YUY2 to planar YV12.
 * Processes 2 source lines per outer iteration: the first line yields luma
 * AND chroma, the second line yields luma only (chroma of odd lines is
 * discarded, not averaged).
 */
1731 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1732 long width, long height,
1733 long lumStride, long chromStride, long srcStride)
1736 const long chromWidth= width>>1;
1737 for(y=0; y<height; y+=2)
/* First line: split 16 YUYV pixels per iteration into Y, U and V planes. */
1741 "xor %%"REG_a", %%"REG_a" \n\t"
1742 "pcmpeqw %%mm7, %%mm7 \n\t"
1743 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1746 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1747 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1748 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1749 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1750 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
/* Shift isolates the chroma bytes, mask (mm7) isolates the luma bytes. */
1751 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1752 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1753 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1754 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1755 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1756 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1758 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1760 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1761 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1762 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1763 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1764 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1765 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1766 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1767 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1768 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1769 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1771 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
/* Second stage: split the interleaved UV into separate U and V planes. */
1773 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1774 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1775 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1776 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1777 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1778 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1779 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1780 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1782 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1783 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1785 "add $8, %%"REG_a" \n\t"
1786 "cmp %4, %%"REG_a" \n\t"
1788 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1789 : "memory", "%"REG_a
/* Second line: extract luma only; this line's chroma is dropped. */
1796 "xor %%"REG_a", %%"REG_a" \n\t"
1799 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1800 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1801 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1802 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1803 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1804 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1805 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1806 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1807 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1808 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1809 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1811 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1812 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1814 "add $8, %%"REG_a" \n\t"
1815 "cmp %4, %%"REG_a" \n\t"
1818 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1819 : "memory", "%"REG_a
/* C fallback, first line: de-interleave Y/U/V byte by byte. */
1823 for(i=0; i<chromWidth; i++)
1825 ydst[2*i+0] = src[4*i+0];
1826 udst[i] = src[4*i+1];
1827 ydst[2*i+1] = src[4*i+2];
1828 vdst[i] = src[4*i+3];
/* C fallback, second line: luma only. */
1833 for(i=0; i<chromWidth; i++)
1835 ydst[2*i+0] = src[4*i+0];
1836 ydst[2*i+1] = src[4*i+2];
1839 udst += chromStride;
1840 vdst += chromStride;
/* Flush MMX state / non-temporal stores before returning. */
1845 asm volatile( EMMS" \n\t"
/*
 * Convert planar YVU9 to YV12.
 * Currently only copies the luma plane; U/V upscaling (4x fewer chroma
 * samples in YVU9) is not implemented yet — chroma output is left as-is.
 */
1851 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1852 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1853 long width, long height, long lumStride, long chromStride)
/* NOTE(review): assumes lumStride == width for both src and dst — confirm. */
1856 memcpy(ydst, ysrc, width*height);
1858 /* XXX: implement upscaling for U,V */
/*
 * Upscale one plane by 2x in both directions using bilinear weighting
 * (3:1 / 1:3 taps). First/last rows and columns are handled specially
 * at the borders.
 */
1861 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
/* First output row: horizontal-only 2x interpolation of src row 0. */
1868 for(x=0; x<srcWidth-1; x++){
1869 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1870 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1872 dst[2*srcWidth-1]= src[srcWidth-1];
1876 for(y=1; y<srcHeight; y++){
1877 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1878 const long mmxSize= srcWidth&~15;
1880 "mov %4, %%"REG_a" \n\t"
1882 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1883 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1884 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1885 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1886 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1887 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
/* Repeated PAVGB approximates the (3a+b)/4 weighting with two averages. */
1888 PAVGB" %%mm0, %%mm5 \n\t"
1889 PAVGB" %%mm0, %%mm3 \n\t"
1890 PAVGB" %%mm0, %%mm5 \n\t"
1891 PAVGB" %%mm0, %%mm3 \n\t"
1892 PAVGB" %%mm1, %%mm4 \n\t"
1893 PAVGB" %%mm1, %%mm2 \n\t"
1894 PAVGB" %%mm1, %%mm4 \n\t"
1895 PAVGB" %%mm1, %%mm2 \n\t"
1896 "movq %%mm5, %%mm7 \n\t"
1897 "movq %%mm4, %%mm6 \n\t"
1898 "punpcklbw %%mm3, %%mm5 \n\t"
1899 "punpckhbw %%mm3, %%mm7 \n\t"
1900 "punpcklbw %%mm2, %%mm4 \n\t"
1901 "punpckhbw %%mm2, %%mm6 \n\t"
1903 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1904 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1905 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1906 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
/* Non-MOVNTQ variant of the same stores (selected by surrounding #if). */
1908 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1909 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1910 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1911 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1913 "add $8, %%"REG_a" \n\t"
/* Pointers are pre-offset by mmxSize; REG_a counts up from -mmxSize. */
1915 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1916 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
/* Pure C path: no MMX, handle column 0 explicitly then start at x=0. */
1922 const long mmxSize=1;
1924 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1925 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
/* Scalar tail (and full row in the C path): diagonal bilinear taps. */
1927 for(x=mmxSize-1; x<srcWidth-1; x++){
1928 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1929 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1930 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1931 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1933 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1934 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* Last output row: horizontal-only interpolation of the last src row. */
1944 for(x=0; x<srcWidth-1; x++){
1945 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1946 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1948 dst[2*srcWidth-1]= src[srcWidth-1];
1950 for(x=0; x<srcWidth; x++){
1957 asm volatile( EMMS" \n\t"
1965 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1966 * problem for anyone then tell me, and I'll fix it)
1967 * chrominance data is only taken from every second line; others are ignored FIXME write HQ version
/*
 * Convert packed UYVY to planar YV12.
 * Processes 2 source lines per outer iteration: the first line yields luma
 * AND chroma, the second line yields luma only (chroma of odd lines is
 * discarded, not averaged).
 *
 * Fix: use REG_a / add / cmp (like the sibling yuy2toyv12) instead of the
 * hard-coded 32-bit %%eax / addl / cmpl.  On x86-64, "(%0, %%eax, 4)" is
 * not a valid address form (32-bit index with 64-bit base) and cmpl
 * against the long-typed chromWidth operand is a width mismatch.
 */
1969 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1970 long width, long height,
1971 long lumStride, long chromStride, long srcStride)
1974 const long chromWidth= width>>1;
1975 for(y=0; y<height; y+=2)
/* First line: split 16 UYVY pixels per iteration into Y, U and V planes. */
1979 "xor %%"REG_a", %%"REG_a" \n\t"
1980 "pcmpeqw %%mm7, %%mm7 \n\t"
1981 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1984 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1985 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1986 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1987 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1988 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
/* In UYVY the mask isolates chroma and the shift isolates luma
 * (opposite of the YUY2 converter). */
1989 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1990 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1991 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1992 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1993 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1994 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1996 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
1998 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
1999 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
2000 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2001 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2002 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2003 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2004 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2005 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2006 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2007 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2009 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
/* Second stage: split the interleaved UV into separate U and V planes. */
2011 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2012 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2013 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2014 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2015 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2016 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2017 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2018 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2020 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
2021 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
2023 "add $8, %%"REG_a" \n\t"
2024 "cmp %4, %%"REG_a" \n\t"
2026 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Second line: extract luma only; this line's chroma is dropped. */
2034 "xor %%"REG_a", %%"REG_a" \n\t"
2037 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2038 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
2039 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
2040 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
2041 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
2042 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2043 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2044 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2045 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2046 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2047 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2049 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
2050 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
2052 "add $8, %%"REG_a" \n\t"
2053 "cmp %4, %%"REG_a" \n\t"
2056 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* C fallback, first line: de-interleave U/Y/V byte by byte. */
2061 for(i=0; i<chromWidth; i++)
2063 udst[i] = src[4*i+0];
2064 ydst[2*i+0] = src[4*i+1];
2065 vdst[i] = src[4*i+2];
2066 ydst[2*i+1] = src[4*i+3];
/* C fallback, second line: luma only. */
2071 for(i=0; i<chromWidth; i++)
2073 ydst[2*i+0] = src[4*i+1];
2074 ydst[2*i+1] = src[4*i+3];
2077 udst += chromStride;
2078 vdst += chromStride;
/* Flush MMX state / non-temporal stores before returning. */
2083 asm volatile( EMMS" \n\t"
2091 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2092 * problem for anyone then tell me, and I'll fix it)
2093 * chrominance data is only taken from every second line; others are ignored in the C version FIXME write HQ version
/*
 * Convert packed 24-bit BGR (B,G,R byte order, as the bgr2* coefficient
 * tables imply) to planar YV12.
 * Luma is computed for every pixel; chroma is computed per 2x2 block by
 * averaging two source lines (MMX path) / sampled from even lines (C path).
 */
2095 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2096 long width, long height,
2097 long lumStride, long chromStride, long srcStride)
2100 const long chromWidth= width>>1;
2102 for(y=0; y<height-2; y+=2)
/* Luma pass: 8 output Y bytes per iteration via pmaddwd dot products
 * with the bgr2YCoeff table. Pointers are pre-offset by width so
 * REG_a can count up from -width to 0. */
2108 "mov %2, %%"REG_a" \n\t"
2109 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2110 "movq "MANGLE(w1111)", %%mm5 \n\t"
2111 "pxor %%mm7, %%mm7 \n\t"
2112 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2115 PREFETCH" 64(%0, %%"REG_b") \n\t"
2116 "movd (%0, %%"REG_b"), %%mm0 \n\t"
2117 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
2118 "punpcklbw %%mm7, %%mm0 \n\t"
2119 "punpcklbw %%mm7, %%mm1 \n\t"
2120 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
2121 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
2122 "punpcklbw %%mm7, %%mm2 \n\t"
2123 "punpcklbw %%mm7, %%mm3 \n\t"
2124 "pmaddwd %%mm6, %%mm0 \n\t"
2125 "pmaddwd %%mm6, %%mm1 \n\t"
2126 "pmaddwd %%mm6, %%mm2 \n\t"
2127 "pmaddwd %%mm6, %%mm3 \n\t"
2128 #ifndef FAST_BGR2YV12
/* Extra precision step skipped when FAST_BGR2YV12 is defined. */
2129 "psrad $8, %%mm0 \n\t"
2130 "psrad $8, %%mm1 \n\t"
2131 "psrad $8, %%mm2 \n\t"
2132 "psrad $8, %%mm3 \n\t"
2134 "packssdw %%mm1, %%mm0 \n\t"
2135 "packssdw %%mm3, %%mm2 \n\t"
2136 "pmaddwd %%mm5, %%mm0 \n\t"
2137 "pmaddwd %%mm5, %%mm2 \n\t"
2138 "packssdw %%mm2, %%mm0 \n\t"
2139 "psraw $7, %%mm0 \n\t"
/* Second group of 4 pixels (bytes 12..23 of the 8-pixel block). */
2141 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2142 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
2143 "punpcklbw %%mm7, %%mm4 \n\t"
2144 "punpcklbw %%mm7, %%mm1 \n\t"
2145 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
2146 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
2147 "punpcklbw %%mm7, %%mm2 \n\t"
2148 "punpcklbw %%mm7, %%mm3 \n\t"
2149 "pmaddwd %%mm6, %%mm4 \n\t"
2150 "pmaddwd %%mm6, %%mm1 \n\t"
2151 "pmaddwd %%mm6, %%mm2 \n\t"
2152 "pmaddwd %%mm6, %%mm3 \n\t"
2153 #ifndef FAST_BGR2YV12
2154 "psrad $8, %%mm4 \n\t"
2155 "psrad $8, %%mm1 \n\t"
2156 "psrad $8, %%mm2 \n\t"
2157 "psrad $8, %%mm3 \n\t"
2159 "packssdw %%mm1, %%mm4 \n\t"
2160 "packssdw %%mm3, %%mm2 \n\t"
2161 "pmaddwd %%mm5, %%mm4 \n\t"
2162 "pmaddwd %%mm5, %%mm2 \n\t"
2163 "add $24, %%"REG_b" \n\t"
2164 "packssdw %%mm2, %%mm4 \n\t"
2165 "psraw $7, %%mm4 \n\t"
2167 "packuswb %%mm4, %%mm0 \n\t"
2168 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2170 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2171 "add $8, %%"REG_a" \n\t"
2173 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2174 : "%"REG_a, "%"REG_b
/* Chroma pass: average pixels of 2 lines (%0/%1), 4 U + 4 V outputs
 * per iteration. REG_b indexes BGR bytes (6 * chroma index). */
2181 "mov %4, %%"REG_a" \n\t"
2182 "movq "MANGLE(w1111)", %%mm5 \n\t"
2183 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2184 "pxor %%mm7, %%mm7 \n\t"
2185 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2186 "add %%"REG_b", %%"REG_b" \n\t"
2189 PREFETCH" 64(%0, %%"REG_b") \n\t"
2190 PREFETCH" 64(%1, %%"REG_b") \n\t"
2191 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* PAVGB variant: average the two lines, then average horizontal
 * neighbours via a 24-bit shift. */
2192 "movq (%0, %%"REG_b"), %%mm0 \n\t"
2193 "movq (%1, %%"REG_b"), %%mm1 \n\t"
2194 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
2195 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
2196 PAVGB" %%mm1, %%mm0 \n\t"
2197 PAVGB" %%mm3, %%mm2 \n\t"
2198 "movq %%mm0, %%mm1 \n\t"
2199 "movq %%mm2, %%mm3 \n\t"
2200 "psrlq $24, %%mm0 \n\t"
2201 "psrlq $24, %%mm2 \n\t"
2202 PAVGB" %%mm1, %%mm0 \n\t"
2203 PAVGB" %%mm3, %%mm2 \n\t"
2204 "punpcklbw %%mm7, %%mm0 \n\t"
2205 "punpcklbw %%mm7, %%mm2 \n\t"
/* Plain-MMX variant: sum the 4 contributing pixels in 16-bit lanes
 * and divide by 4 at the end. */
2207 "movd (%0, %%"REG_b"), %%mm0 \n\t"
2208 "movd (%1, %%"REG_b"), %%mm1 \n\t"
2209 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
2210 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
2211 "punpcklbw %%mm7, %%mm0 \n\t"
2212 "punpcklbw %%mm7, %%mm1 \n\t"
2213 "punpcklbw %%mm7, %%mm2 \n\t"
2214 "punpcklbw %%mm7, %%mm3 \n\t"
2215 "paddw %%mm1, %%mm0 \n\t"
2216 "paddw %%mm3, %%mm2 \n\t"
2217 "paddw %%mm2, %%mm0 \n\t"
2218 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
2219 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
2220 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
2221 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
2222 "punpcklbw %%mm7, %%mm4 \n\t"
2223 "punpcklbw %%mm7, %%mm1 \n\t"
2224 "punpcklbw %%mm7, %%mm2 \n\t"
2225 "punpcklbw %%mm7, %%mm3 \n\t"
2226 "paddw %%mm1, %%mm4 \n\t"
2227 "paddw %%mm3, %%mm2 \n\t"
2228 "paddw %%mm4, %%mm2 \n\t"
2229 "psrlw $2, %%mm0 \n\t"
2230 "psrlw $2, %%mm2 \n\t"
/* U via mm6 (bgr2UCoeff), V via the bgr2VCoeff table loads below. */
2232 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2233 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2235 "pmaddwd %%mm0, %%mm1 \n\t"
2236 "pmaddwd %%mm2, %%mm3 \n\t"
2237 "pmaddwd %%mm6, %%mm0 \n\t"
2238 "pmaddwd %%mm6, %%mm2 \n\t"
2239 #ifndef FAST_BGR2YV12
2240 "psrad $8, %%mm0 \n\t"
2241 "psrad $8, %%mm1 \n\t"
2242 "psrad $8, %%mm2 \n\t"
2243 "psrad $8, %%mm3 \n\t"
2245 "packssdw %%mm2, %%mm0 \n\t"
2246 "packssdw %%mm3, %%mm1 \n\t"
2247 "pmaddwd %%mm5, %%mm0 \n\t"
2248 "pmaddwd %%mm5, %%mm1 \n\t"
2249 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2250 "psraw $7, %%mm0 \n\t"
2252 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* Same two variants for the next 2 chroma samples (bytes 12..23). */
2253 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
2254 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
2255 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
2256 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
2257 PAVGB" %%mm1, %%mm4 \n\t"
2258 PAVGB" %%mm3, %%mm2 \n\t"
2259 "movq %%mm4, %%mm1 \n\t"
2260 "movq %%mm2, %%mm3 \n\t"
2261 "psrlq $24, %%mm4 \n\t"
2262 "psrlq $24, %%mm2 \n\t"
2263 PAVGB" %%mm1, %%mm4 \n\t"
2264 PAVGB" %%mm3, %%mm2 \n\t"
2265 "punpcklbw %%mm7, %%mm4 \n\t"
2266 "punpcklbw %%mm7, %%mm2 \n\t"
2268 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2269 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
2270 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
2271 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
2272 "punpcklbw %%mm7, %%mm4 \n\t"
2273 "punpcklbw %%mm7, %%mm1 \n\t"
2274 "punpcklbw %%mm7, %%mm2 \n\t"
2275 "punpcklbw %%mm7, %%mm3 \n\t"
2276 "paddw %%mm1, %%mm4 \n\t"
2277 "paddw %%mm3, %%mm2 \n\t"
2278 "paddw %%mm2, %%mm4 \n\t"
2279 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
2280 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
2281 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
2282 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
2283 "punpcklbw %%mm7, %%mm5 \n\t"
2284 "punpcklbw %%mm7, %%mm1 \n\t"
2285 "punpcklbw %%mm7, %%mm2 \n\t"
2286 "punpcklbw %%mm7, %%mm3 \n\t"
2287 "paddw %%mm1, %%mm5 \n\t"
2288 "paddw %%mm3, %%mm2 \n\t"
2289 "paddw %%mm5, %%mm2 \n\t"
/* mm5 was clobbered above in this variant; reload the w1111 constant. */
2290 "movq "MANGLE(w1111)", %%mm5 \n\t"
2291 "psrlw $2, %%mm4 \n\t"
2292 "psrlw $2, %%mm2 \n\t"
2294 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2295 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2297 "pmaddwd %%mm4, %%mm1 \n\t"
2298 "pmaddwd %%mm2, %%mm3 \n\t"
2299 "pmaddwd %%mm6, %%mm4 \n\t"
2300 "pmaddwd %%mm6, %%mm2 \n\t"
2301 #ifndef FAST_BGR2YV12
2302 "psrad $8, %%mm4 \n\t"
2303 "psrad $8, %%mm1 \n\t"
2304 "psrad $8, %%mm2 \n\t"
2305 "psrad $8, %%mm3 \n\t"
2307 "packssdw %%mm2, %%mm4 \n\t"
2308 "packssdw %%mm3, %%mm1 \n\t"
2309 "pmaddwd %%mm5, %%mm4 \n\t"
2310 "pmaddwd %%mm5, %%mm1 \n\t"
2311 "add $24, %%"REG_b" \n\t"
2312 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2313 "psraw $7, %%mm4 \n\t"
/* Pack 4 U and 4 V samples, bias to unsigned and store 4 bytes each. */
2315 "movq %%mm0, %%mm1 \n\t"
2316 "punpckldq %%mm4, %%mm0 \n\t"
2317 "punpckhdq %%mm4, %%mm1 \n\t"
2318 "packsswb %%mm1, %%mm0 \n\t"
2319 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2320 "movd %%mm0, (%2, %%"REG_a") \n\t"
2321 "punpckhdq %%mm0, %%mm0 \n\t"
2322 "movd %%mm0, (%3, %%"REG_a") \n\t"
2323 "add $4, %%"REG_a" \n\t"
2325 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2326 : "%"REG_a, "%"REG_b
2329 udst += chromStride;
2330 vdst += chromStride;
/* Flush MMX state / non-temporal stores before the scalar remainder. */
2334 asm volatile( EMMS" \n\t"
/* C path / remaining lines: chroma from the even line only (no 2-line
 * averaging here, unlike the MMX path). */
2340 for(; y<height; y+=2)
2343 for(i=0; i<chromWidth; i++)
2345 unsigned int b= src[6*i+0];
2346 unsigned int g= src[6*i+1];
2347 unsigned int r= src[6*i+2];
2349 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2350 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2351 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2361 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/* Second line of the pair: luma only. */
2367 for(i=0; i<chromWidth; i++)
2369 unsigned int b= src[6*i+0];
2370 unsigned int g= src[6*i+1];
2371 unsigned int r= src[6*i+2];
2373 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2381 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2384 udst += chromStride;
2385 vdst += chromStride;
/*
 * Interleave two byte planes: dest[2i] = src1[i], dest[2i+1] = src2[i],
 * line by line. SSE2 path handles 16 bytes per iteration, MMX path 16
 * bytes via two 8-byte halves, with a scalar tail for width % 16.
 * NOTE(review): the SSE2 path uses movdqa, which requires 16-byte
 * aligned sources — presumably guaranteed by the (hidden) path
 * selection; confirm before relying on it.
 */
2391 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2392 long width, long height, long src1Stride,
2393 long src2Stride, long dstStride){
2396 for(h=0; h < height; h++)
/* SSE2: load 16 bytes of src1 twice (low/high halves are needed in two
 * registers for punpcklbw/punpckhbw) and merge with 16 bytes of src2. */
2403 "xor %%"REG_a", %%"REG_a" \n\t"
2405 PREFETCH" 64(%1, %%"REG_a") \n\t"
2406 PREFETCH" 64(%2, %%"REG_a") \n\t"
2407 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2408 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2409 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2410 "punpcklbw %%xmm2, %%xmm0 \n\t"
2411 "punpckhbw %%xmm2, %%xmm1 \n\t"
2412 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2413 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2414 "add $16, %%"REG_a" \n\t"
2415 "cmp %3, %%"REG_a" \n\t"
2417 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2418 : "memory", "%"REG_a""
/* MMX: same interleave, 8 source bytes per register pair. */
2422 "xor %%"REG_a", %%"REG_a" \n\t"
2424 PREFETCH" 64(%1, %%"REG_a") \n\t"
2425 PREFETCH" 64(%2, %%"REG_a") \n\t"
2426 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2427 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2428 "movq %%mm0, %%mm1 \n\t"
2429 "movq %%mm2, %%mm3 \n\t"
2430 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2431 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2432 "punpcklbw %%mm4, %%mm0 \n\t"
2433 "punpckhbw %%mm4, %%mm1 \n\t"
2434 "punpcklbw %%mm5, %%mm2 \n\t"
2435 "punpckhbw %%mm5, %%mm3 \n\t"
2436 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2437 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2438 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2439 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2440 "add $16, %%"REG_a" \n\t"
2441 "cmp %3, %%"REG_a" \n\t"
2443 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2444 : "memory", "%"REG_a
/* Scalar tail for the last width&15 bytes after the SIMD loop. */
2447 for(w= (width&(~15)); w < width; w++)
2449 dest[2*w+0] = src1[w];
2450 dest[2*w+1] = src2[w];
/* Pure C path: interleave the whole line byte by byte. */
2453 for(w=0; w < width; w++)
2455 dest[2*w+0] = src1[w];
2456 dest[2*w+1] = src2[w];
/*
 * Upscale two chroma planes from 1/4 resolution (VU9) to 1/2 resolution
 * (VU12): each source byte is duplicated horizontally (punpcklbw/hbw of a
 * register with itself) and each source line is used for 2 output lines
 * (srcStride * (y>>1)). The two loops below are identical except for
 * operating on src1/dst1 vs src2/dst2.
 */
2472 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2473 uint8_t *dst1, uint8_t *dst2,
2474 long width, long height,
2475 long srcStride1, long srcStride2,
2476 long dstStride1, long dstStride2)
2479 w=width/2; h=height/2;
2484 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* Plane 1: process 32 source bytes -> 64 output bytes per asm block. */
2487 const uint8_t* s1=src1+srcStride1*(y>>1);
2488 uint8_t* d=dst1+dstStride1*y;
2495 "movq %1, %%mm0\n\t"
2496 "movq 8%1, %%mm2\n\t"
2497 "movq 16%1, %%mm4\n\t"
2498 "movq 24%1, %%mm6\n\t"
2499 "movq %%mm0, %%mm1\n\t"
2500 "movq %%mm2, %%mm3\n\t"
2501 "movq %%mm4, %%mm5\n\t"
2502 "movq %%mm6, %%mm7\n\t"
/* Unpacking a register with itself doubles every byte. */
2503 "punpcklbw %%mm0, %%mm0\n\t"
2504 "punpckhbw %%mm1, %%mm1\n\t"
2505 "punpcklbw %%mm2, %%mm2\n\t"
2506 "punpckhbw %%mm3, %%mm3\n\t"
2507 "punpcklbw %%mm4, %%mm4\n\t"
2508 "punpckhbw %%mm5, %%mm5\n\t"
2509 "punpcklbw %%mm6, %%mm6\n\t"
2510 "punpckhbw %%mm7, %%mm7\n\t"
2511 MOVNTQ" %%mm0, %0\n\t"
2512 MOVNTQ" %%mm1, 8%0\n\t"
2513 MOVNTQ" %%mm2, 16%0\n\t"
2514 MOVNTQ" %%mm3, 24%0\n\t"
2515 MOVNTQ" %%mm4, 32%0\n\t"
2516 MOVNTQ" %%mm5, 40%0\n\t"
2517 MOVNTQ" %%mm6, 48%0\n\t"
2518 MOVNTQ" %%mm7, 56%0"
/* Scalar tail: duplicate remaining bytes. */
2524 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* Plane 2: identical processing for src2/dst2. */
2527 const uint8_t* s2=src2+srcStride2*(y>>1);
2528 uint8_t* d=dst2+dstStride2*y;
2535 "movq %1, %%mm0\n\t"
2536 "movq 8%1, %%mm2\n\t"
2537 "movq 16%1, %%mm4\n\t"
2538 "movq 24%1, %%mm6\n\t"
2539 "movq %%mm0, %%mm1\n\t"
2540 "movq %%mm2, %%mm3\n\t"
2541 "movq %%mm4, %%mm5\n\t"
2542 "movq %%mm6, %%mm7\n\t"
2543 "punpcklbw %%mm0, %%mm0\n\t"
2544 "punpckhbw %%mm1, %%mm1\n\t"
2545 "punpcklbw %%mm2, %%mm2\n\t"
2546 "punpckhbw %%mm3, %%mm3\n\t"
2547 "punpcklbw %%mm4, %%mm4\n\t"
2548 "punpckhbw %%mm5, %%mm5\n\t"
2549 "punpcklbw %%mm6, %%mm6\n\t"
2550 "punpckhbw %%mm7, %%mm7\n\t"
2551 MOVNTQ" %%mm0, %0\n\t"
2552 MOVNTQ" %%mm1, 8%0\n\t"
2553 MOVNTQ" %%mm2, 16%0\n\t"
2554 MOVNTQ" %%mm3, 24%0\n\t"
2555 MOVNTQ" %%mm4, 32%0\n\t"
2556 MOVNTQ" %%mm5, 40%0\n\t"
2557 MOVNTQ" %%mm6, 48%0\n\t"
2558 MOVNTQ" %%mm7, 56%0"
2564 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2575 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2577 long width, long height,
2578 long srcStride1, long srcStride2,
2579 long srcStride3, long dstStride)
2582 w=width/2; h=height;
2584 const uint8_t* yp=src1+srcStride1*y;
2585 const uint8_t* up=src2+srcStride2*(y>>2);
2586 const uint8_t* vp=src3+srcStride3*(y>>2);
2587 uint8_t* d=dst+dstStride*y;
2593 PREFETCH" 32(%1, %0)\n\t"
2594 PREFETCH" 32(%2, %0)\n\t"
2595 PREFETCH" 32(%3, %0)\n\t"
2596 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2597 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2598 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2599 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2600 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2601 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2602 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2603 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2604 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2605 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2607 "movq %%mm1, %%mm6\n\t"
2608 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2609 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2610 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2611 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2612 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2614 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2615 "movq 8(%1, %0, 4), %%mm0\n\t"
2616 "movq %%mm0, %%mm3\n\t"
2617 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2618 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2619 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2620 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2622 "movq %%mm4, %%mm6\n\t"
2623 "movq 16(%1, %0, 4), %%mm0\n\t"
2624 "movq %%mm0, %%mm3\n\t"
2625 "punpcklbw %%mm5, %%mm4\n\t"
2626 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2627 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2628 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2629 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2631 "punpckhbw %%mm5, %%mm6\n\t"
2632 "movq 24(%1, %0, 4), %%mm0\n\t"
2633 "movq %%mm0, %%mm3\n\t"
2634 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2635 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2636 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2637 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2640 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2646 const long x2= x<<2;