3 * rgb2rgb.c, Software RGB to RGB converter
4 * pluralize by Software PAL8 to RGB converter
5 * Software YUV to YUV converter
6 * Software YUV to RGB converter
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9 * lots of big-endian byte-order fixes by Alex Beregszaszi
13 #include <inttypes.h> /* for __WORDSIZE */
16 // #warning You have misconfigured system and probably will lose performance!
17 #define __WORDSIZE MP_WORDSIZE
35 #define PREFETCH "prefetch"
36 #define PREFETCHW "prefetchw"
37 #define PAVGB "pavgusb"
38 #elif defined ( HAVE_MMX2 )
39 #define PREFETCH "prefetchnta"
40 #define PREFETCHW "prefetcht0"
43 #define PREFETCH "/nop"
44 #define PREFETCHW "/nop"
48 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
55 #define MOVNTQ "movntq"
56 #define SFENCE "sfence"
62 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
65 const uint8_t *s = src;
68 const uint8_t *mm_end;
72 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
74 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
80 "punpckldq 3%1, %%mm0\n\t"
82 "punpckldq 9%1, %%mm1\n\t"
83 "movd 12%1, %%mm2\n\t"
84 "punpckldq 15%1, %%mm2\n\t"
85 "movd 18%1, %%mm3\n\t"
86 "punpckldq 21%1, %%mm3\n\t"
87 "pand %%mm7, %%mm0\n\t"
88 "pand %%mm7, %%mm1\n\t"
89 "pand %%mm7, %%mm2\n\t"
90 "pand %%mm7, %%mm3\n\t"
91 MOVNTQ" %%mm0, %0\n\t"
92 MOVNTQ" %%mm1, 8%0\n\t"
93 MOVNTQ" %%mm2, 16%0\n\t"
101 __asm __volatile(SFENCE:::"memory");
102 __asm __volatile(EMMS:::"memory");
106 #ifdef WORDS_BIGENDIAN
120 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
123 const uint8_t *s = src;
126 const uint8_t *mm_end;
130 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
137 "movq 8%1, %%mm1\n\t"
138 "movq 16%1, %%mm4\n\t"
139 "movq 24%1, %%mm5\n\t"
140 "movq %%mm0, %%mm2\n\t"
141 "movq %%mm1, %%mm3\n\t"
142 "movq %%mm4, %%mm6\n\t"
143 "movq %%mm5, %%mm7\n\t"
144 "psrlq $8, %%mm2\n\t"
145 "psrlq $8, %%mm3\n\t"
146 "psrlq $8, %%mm6\n\t"
147 "psrlq $8, %%mm7\n\t"
156 "por %%mm2, %%mm0\n\t"
157 "por %%mm3, %%mm1\n\t"
158 "por %%mm6, %%mm4\n\t"
159 "por %%mm7, %%mm5\n\t"
161 "movq %%mm1, %%mm2\n\t"
162 "movq %%mm4, %%mm3\n\t"
163 "psllq $48, %%mm2\n\t"
164 "psllq $32, %%mm3\n\t"
167 "por %%mm2, %%mm0\n\t"
168 "psrlq $16, %%mm1\n\t"
169 "psrlq $32, %%mm4\n\t"
170 "psllq $16, %%mm5\n\t"
171 "por %%mm3, %%mm1\n\t"
173 "por %%mm5, %%mm4\n\t"
175 MOVNTQ" %%mm0, %0\n\t"
176 MOVNTQ" %%mm1, 8%0\n\t"
179 :"m"(*s),"m"(mask24l),
180 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
185 __asm __volatile(SFENCE:::"memory");
186 __asm __volatile(EMMS:::"memory");
190 #ifdef WORDS_BIGENDIAN
205 Original by Strepto/Astral
206 ported to gcc & bugfixed : A'rpi
207 MMX2, 3DNOW optimization by Nick Kurshev
208 32bit c version, and and&add trick by Michael Niedermayer
210 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
212 register const uint8_t* s=src;
213 register uint8_t* d=dst;
214 register const uint8_t *end;
215 const uint8_t *mm_end;
218 __asm __volatile(PREFETCH" %0"::"m"(*s));
219 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
226 "movq 8%1, %%mm2\n\t"
227 "movq %%mm0, %%mm1\n\t"
228 "movq %%mm2, %%mm3\n\t"
229 "pand %%mm4, %%mm0\n\t"
230 "pand %%mm4, %%mm2\n\t"
231 "paddw %%mm1, %%mm0\n\t"
232 "paddw %%mm3, %%mm2\n\t"
233 MOVNTQ" %%mm0, %0\n\t"
241 __asm __volatile(SFENCE:::"memory");
242 __asm __volatile(EMMS:::"memory");
247 register unsigned x= *((uint32_t *)s);
248 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
254 register unsigned short x= *((uint16_t *)s);
255 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
259 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
261 register const uint8_t* s=src;
262 register uint8_t* d=dst;
263 register const uint8_t *end;
264 const uint8_t *mm_end;
267 __asm __volatile(PREFETCH" %0"::"m"(*s));
268 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
269 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
276 "movq 8%1, %%mm2\n\t"
277 "movq %%mm0, %%mm1\n\t"
278 "movq %%mm2, %%mm3\n\t"
279 "psrlq $1, %%mm0\n\t"
280 "psrlq $1, %%mm2\n\t"
281 "pand %%mm7, %%mm0\n\t"
282 "pand %%mm7, %%mm2\n\t"
283 "pand %%mm6, %%mm1\n\t"
284 "pand %%mm6, %%mm3\n\t"
285 "por %%mm1, %%mm0\n\t"
286 "por %%mm3, %%mm2\n\t"
287 MOVNTQ" %%mm0, %0\n\t"
295 __asm __volatile(SFENCE:::"memory");
296 __asm __volatile(EMMS:::"memory");
301 register uint32_t x= *((uint32_t *)s);
302 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
308 register uint16_t x= *((uint16_t *)s);
309 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
315 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
317 const uint8_t *s = src;
320 const uint8_t *mm_end;
322 uint16_t *d = (uint16_t *)dst;
326 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
328 "movq %3, %%mm5 \n\t"
329 "movq %4, %%mm6 \n\t"
330 "movq %5, %%mm7 \n\t"
333 PREFETCH" 32(%1) \n\t"
334 "movd (%1), %%mm0 \n\t"
335 "movd 4(%1), %%mm3 \n\t"
336 "punpckldq 8(%1), %%mm0 \n\t"
337 "punpckldq 12(%1), %%mm3 \n\t"
338 "movq %%mm0, %%mm1 \n\t"
339 "movq %%mm3, %%mm4 \n\t"
340 "pand %%mm6, %%mm0 \n\t"
341 "pand %%mm6, %%mm3 \n\t"
342 "pmaddwd %%mm7, %%mm0 \n\t"
343 "pmaddwd %%mm7, %%mm3 \n\t"
344 "pand %%mm5, %%mm1 \n\t"
345 "pand %%mm5, %%mm4 \n\t"
346 "por %%mm1, %%mm0 \n\t"
347 "por %%mm4, %%mm3 \n\t"
348 "psrld $5, %%mm0 \n\t"
349 "pslld $11, %%mm3 \n\t"
350 "por %%mm3, %%mm0 \n\t"
351 MOVNTQ" %%mm0, (%0) \n\t"
357 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
360 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
364 ::"m"(red_16mask),"m"(green_16mask));
370 "movd 4%1, %%mm3\n\t"
371 "punpckldq 8%1, %%mm0\n\t"
372 "punpckldq 12%1, %%mm3\n\t"
373 "movq %%mm0, %%mm1\n\t"
374 "movq %%mm0, %%mm2\n\t"
375 "movq %%mm3, %%mm4\n\t"
376 "movq %%mm3, %%mm5\n\t"
377 "psrlq $3, %%mm0\n\t"
378 "psrlq $3, %%mm3\n\t"
381 "psrlq $5, %%mm1\n\t"
382 "psrlq $5, %%mm4\n\t"
383 "pand %%mm6, %%mm1\n\t"
384 "pand %%mm6, %%mm4\n\t"
385 "psrlq $8, %%mm2\n\t"
386 "psrlq $8, %%mm5\n\t"
387 "pand %%mm7, %%mm2\n\t"
388 "pand %%mm7, %%mm5\n\t"
389 "por %%mm1, %%mm0\n\t"
390 "por %%mm4, %%mm3\n\t"
391 "por %%mm2, %%mm0\n\t"
392 "por %%mm5, %%mm3\n\t"
393 "psllq $16, %%mm3\n\t"
394 "por %%mm3, %%mm0\n\t"
395 MOVNTQ" %%mm0, %0\n\t"
396 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
401 __asm __volatile(SFENCE:::"memory");
402 __asm __volatile(EMMS:::"memory");
406 register int rgb = *(uint32_t*)s; s += 4;
407 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
411 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
413 const uint8_t *s = src;
416 const uint8_t *mm_end;
418 uint16_t *d = (uint16_t *)dst;
421 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
425 ::"m"(red_16mask),"m"(green_16mask));
432 "movd 4%1, %%mm3\n\t"
433 "punpckldq 8%1, %%mm0\n\t"
434 "punpckldq 12%1, %%mm3\n\t"
435 "movq %%mm0, %%mm1\n\t"
436 "movq %%mm0, %%mm2\n\t"
437 "movq %%mm3, %%mm4\n\t"
438 "movq %%mm3, %%mm5\n\t"
439 "psllq $8, %%mm0\n\t"
440 "psllq $8, %%mm3\n\t"
441 "pand %%mm7, %%mm0\n\t"
442 "pand %%mm7, %%mm3\n\t"
443 "psrlq $5, %%mm1\n\t"
444 "psrlq $5, %%mm4\n\t"
445 "pand %%mm6, %%mm1\n\t"
446 "pand %%mm6, %%mm4\n\t"
447 "psrlq $19, %%mm2\n\t"
448 "psrlq $19, %%mm5\n\t"
451 "por %%mm1, %%mm0\n\t"
452 "por %%mm4, %%mm3\n\t"
453 "por %%mm2, %%mm0\n\t"
454 "por %%mm5, %%mm3\n\t"
455 "psllq $16, %%mm3\n\t"
456 "por %%mm3, %%mm0\n\t"
457 MOVNTQ" %%mm0, %0\n\t"
458 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
462 __asm __volatile(SFENCE:::"memory");
463 __asm __volatile(EMMS:::"memory");
467 // FIXME on bigendian
468 const int src= *s; s += 4;
469 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
473 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
475 const uint8_t *s = src;
478 const uint8_t *mm_end;
480 uint16_t *d = (uint16_t *)dst;
484 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
486 "movq %3, %%mm5 \n\t"
487 "movq %4, %%mm6 \n\t"
488 "movq %5, %%mm7 \n\t"
491 PREFETCH" 32(%1) \n\t"
492 "movd (%1), %%mm0 \n\t"
493 "movd 4(%1), %%mm3 \n\t"
494 "punpckldq 8(%1), %%mm0 \n\t"
495 "punpckldq 12(%1), %%mm3 \n\t"
496 "movq %%mm0, %%mm1 \n\t"
497 "movq %%mm3, %%mm4 \n\t"
498 "pand %%mm6, %%mm0 \n\t"
499 "pand %%mm6, %%mm3 \n\t"
500 "pmaddwd %%mm7, %%mm0 \n\t"
501 "pmaddwd %%mm7, %%mm3 \n\t"
502 "pand %%mm5, %%mm1 \n\t"
503 "pand %%mm5, %%mm4 \n\t"
504 "por %%mm1, %%mm0 \n\t"
505 "por %%mm4, %%mm3 \n\t"
506 "psrld $6, %%mm0 \n\t"
507 "pslld $10, %%mm3 \n\t"
508 "por %%mm3, %%mm0 \n\t"
509 MOVNTQ" %%mm0, (%0) \n\t"
515 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
518 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
522 ::"m"(red_15mask),"m"(green_15mask));
528 "movd 4%1, %%mm3\n\t"
529 "punpckldq 8%1, %%mm0\n\t"
530 "punpckldq 12%1, %%mm3\n\t"
531 "movq %%mm0, %%mm1\n\t"
532 "movq %%mm0, %%mm2\n\t"
533 "movq %%mm3, %%mm4\n\t"
534 "movq %%mm3, %%mm5\n\t"
535 "psrlq $3, %%mm0\n\t"
536 "psrlq $3, %%mm3\n\t"
539 "psrlq $6, %%mm1\n\t"
540 "psrlq $6, %%mm4\n\t"
541 "pand %%mm6, %%mm1\n\t"
542 "pand %%mm6, %%mm4\n\t"
543 "psrlq $9, %%mm2\n\t"
544 "psrlq $9, %%mm5\n\t"
545 "pand %%mm7, %%mm2\n\t"
546 "pand %%mm7, %%mm5\n\t"
547 "por %%mm1, %%mm0\n\t"
548 "por %%mm4, %%mm3\n\t"
549 "por %%mm2, %%mm0\n\t"
550 "por %%mm5, %%mm3\n\t"
551 "psllq $16, %%mm3\n\t"
552 "por %%mm3, %%mm0\n\t"
553 MOVNTQ" %%mm0, %0\n\t"
554 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
559 __asm __volatile(SFENCE:::"memory");
560 __asm __volatile(EMMS:::"memory");
564 // FIXME on bigendian
565 const int src= *s; s += 4;
566 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
570 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
572 const uint8_t *s = src;
575 const uint8_t *mm_end;
577 uint16_t *d = (uint16_t *)dst;
580 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
584 ::"m"(red_15mask),"m"(green_15mask));
591 "movd 4%1, %%mm3\n\t"
592 "punpckldq 8%1, %%mm0\n\t"
593 "punpckldq 12%1, %%mm3\n\t"
594 "movq %%mm0, %%mm1\n\t"
595 "movq %%mm0, %%mm2\n\t"
596 "movq %%mm3, %%mm4\n\t"
597 "movq %%mm3, %%mm5\n\t"
598 "psllq $7, %%mm0\n\t"
599 "psllq $7, %%mm3\n\t"
600 "pand %%mm7, %%mm0\n\t"
601 "pand %%mm7, %%mm3\n\t"
602 "psrlq $6, %%mm1\n\t"
603 "psrlq $6, %%mm4\n\t"
604 "pand %%mm6, %%mm1\n\t"
605 "pand %%mm6, %%mm4\n\t"
606 "psrlq $19, %%mm2\n\t"
607 "psrlq $19, %%mm5\n\t"
610 "por %%mm1, %%mm0\n\t"
611 "por %%mm4, %%mm3\n\t"
612 "por %%mm2, %%mm0\n\t"
613 "por %%mm5, %%mm3\n\t"
614 "psllq $16, %%mm3\n\t"
615 "por %%mm3, %%mm0\n\t"
616 MOVNTQ" %%mm0, %0\n\t"
617 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
621 __asm __volatile(SFENCE:::"memory");
622 __asm __volatile(EMMS:::"memory");
626 // FIXME on bigendian
627 const int src= *s; s += 4;
628 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
632 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
634 const uint8_t *s = src;
637 const uint8_t *mm_end;
639 uint16_t *d = (uint16_t *)dst;
642 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
646 ::"m"(red_16mask),"m"(green_16mask));
653 "movd 3%1, %%mm3\n\t"
654 "punpckldq 6%1, %%mm0\n\t"
655 "punpckldq 9%1, %%mm3\n\t"
656 "movq %%mm0, %%mm1\n\t"
657 "movq %%mm0, %%mm2\n\t"
658 "movq %%mm3, %%mm4\n\t"
659 "movq %%mm3, %%mm5\n\t"
660 "psrlq $3, %%mm0\n\t"
661 "psrlq $3, %%mm3\n\t"
664 "psrlq $5, %%mm1\n\t"
665 "psrlq $5, %%mm4\n\t"
666 "pand %%mm6, %%mm1\n\t"
667 "pand %%mm6, %%mm4\n\t"
668 "psrlq $8, %%mm2\n\t"
669 "psrlq $8, %%mm5\n\t"
670 "pand %%mm7, %%mm2\n\t"
671 "pand %%mm7, %%mm5\n\t"
672 "por %%mm1, %%mm0\n\t"
673 "por %%mm4, %%mm3\n\t"
674 "por %%mm2, %%mm0\n\t"
675 "por %%mm5, %%mm3\n\t"
676 "psllq $16, %%mm3\n\t"
677 "por %%mm3, %%mm0\n\t"
678 MOVNTQ" %%mm0, %0\n\t"
679 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
683 __asm __volatile(SFENCE:::"memory");
684 __asm __volatile(EMMS:::"memory");
691 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
695 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
697 const uint8_t *s = src;
700 const uint8_t *mm_end;
702 uint16_t *d = (uint16_t *)dst;
705 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
709 ::"m"(red_16mask),"m"(green_16mask));
716 "movd 3%1, %%mm3\n\t"
717 "punpckldq 6%1, %%mm0\n\t"
718 "punpckldq 9%1, %%mm3\n\t"
719 "movq %%mm0, %%mm1\n\t"
720 "movq %%mm0, %%mm2\n\t"
721 "movq %%mm3, %%mm4\n\t"
722 "movq %%mm3, %%mm5\n\t"
723 "psllq $8, %%mm0\n\t"
724 "psllq $8, %%mm3\n\t"
725 "pand %%mm7, %%mm0\n\t"
726 "pand %%mm7, %%mm3\n\t"
727 "psrlq $5, %%mm1\n\t"
728 "psrlq $5, %%mm4\n\t"
729 "pand %%mm6, %%mm1\n\t"
730 "pand %%mm6, %%mm4\n\t"
731 "psrlq $19, %%mm2\n\t"
732 "psrlq $19, %%mm5\n\t"
735 "por %%mm1, %%mm0\n\t"
736 "por %%mm4, %%mm3\n\t"
737 "por %%mm2, %%mm0\n\t"
738 "por %%mm5, %%mm3\n\t"
739 "psllq $16, %%mm3\n\t"
740 "por %%mm3, %%mm0\n\t"
741 MOVNTQ" %%mm0, %0\n\t"
742 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
746 __asm __volatile(SFENCE:::"memory");
747 __asm __volatile(EMMS:::"memory");
754 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
758 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
760 const uint8_t *s = src;
763 const uint8_t *mm_end;
765 uint16_t *d = (uint16_t *)dst;
768 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
772 ::"m"(red_15mask),"m"(green_15mask));
779 "movd 3%1, %%mm3\n\t"
780 "punpckldq 6%1, %%mm0\n\t"
781 "punpckldq 9%1, %%mm3\n\t"
782 "movq %%mm0, %%mm1\n\t"
783 "movq %%mm0, %%mm2\n\t"
784 "movq %%mm3, %%mm4\n\t"
785 "movq %%mm3, %%mm5\n\t"
786 "psrlq $3, %%mm0\n\t"
787 "psrlq $3, %%mm3\n\t"
790 "psrlq $6, %%mm1\n\t"
791 "psrlq $6, %%mm4\n\t"
792 "pand %%mm6, %%mm1\n\t"
793 "pand %%mm6, %%mm4\n\t"
794 "psrlq $9, %%mm2\n\t"
795 "psrlq $9, %%mm5\n\t"
796 "pand %%mm7, %%mm2\n\t"
797 "pand %%mm7, %%mm5\n\t"
798 "por %%mm1, %%mm0\n\t"
799 "por %%mm4, %%mm3\n\t"
800 "por %%mm2, %%mm0\n\t"
801 "por %%mm5, %%mm3\n\t"
802 "psllq $16, %%mm3\n\t"
803 "por %%mm3, %%mm0\n\t"
804 MOVNTQ" %%mm0, %0\n\t"
805 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
809 __asm __volatile(SFENCE:::"memory");
810 __asm __volatile(EMMS:::"memory");
817 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
821 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
823 const uint8_t *s = src;
826 const uint8_t *mm_end;
828 uint16_t *d = (uint16_t *)dst;
831 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
835 ::"m"(red_15mask),"m"(green_15mask));
842 "movd 3%1, %%mm3\n\t"
843 "punpckldq 6%1, %%mm0\n\t"
844 "punpckldq 9%1, %%mm3\n\t"
845 "movq %%mm0, %%mm1\n\t"
846 "movq %%mm0, %%mm2\n\t"
847 "movq %%mm3, %%mm4\n\t"
848 "movq %%mm3, %%mm5\n\t"
849 "psllq $7, %%mm0\n\t"
850 "psllq $7, %%mm3\n\t"
851 "pand %%mm7, %%mm0\n\t"
852 "pand %%mm7, %%mm3\n\t"
853 "psrlq $6, %%mm1\n\t"
854 "psrlq $6, %%mm4\n\t"
855 "pand %%mm6, %%mm1\n\t"
856 "pand %%mm6, %%mm4\n\t"
857 "psrlq $19, %%mm2\n\t"
858 "psrlq $19, %%mm5\n\t"
861 "por %%mm1, %%mm0\n\t"
862 "por %%mm4, %%mm3\n\t"
863 "por %%mm2, %%mm0\n\t"
864 "por %%mm5, %%mm3\n\t"
865 "psllq $16, %%mm3\n\t"
866 "por %%mm3, %%mm0\n\t"
867 MOVNTQ" %%mm0, %0\n\t"
868 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
872 __asm __volatile(SFENCE:::"memory");
873 __asm __volatile(EMMS:::"memory");
880 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
885 I use here less accurate approximation by simply
886 left-shifting the input
887 value and filling the low order bits with
888 zeroes. This method improves png's
889 compression but this scheme cannot reproduce white exactly, since it does not
890 generate an all-ones maximum value; the net effect is to darken the
893 The better method should be "left bit replication":
903 | Leftmost Bits Repeated to Fill Open Bits
907 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
911 const uint16_t *mm_end;
913 uint8_t *d = (uint8_t *)dst;
914 const uint16_t *s = (uint16_t *)src;
915 end = s + src_size/2;
917 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
929 "psllq $3, %%mm0\n\t"
930 "psrlq $2, %%mm1\n\t"
931 "psrlq $7, %%mm2\n\t"
932 "movq %%mm0, %%mm3\n\t"
933 "movq %%mm1, %%mm4\n\t"
934 "movq %%mm2, %%mm5\n\t"
935 "punpcklwd %5, %%mm0\n\t"
936 "punpcklwd %5, %%mm1\n\t"
937 "punpcklwd %5, %%mm2\n\t"
938 "punpckhwd %5, %%mm3\n\t"
939 "punpckhwd %5, %%mm4\n\t"
940 "punpckhwd %5, %%mm5\n\t"
941 "psllq $8, %%mm1\n\t"
942 "psllq $16, %%mm2\n\t"
943 "por %%mm1, %%mm0\n\t"
944 "por %%mm2, %%mm0\n\t"
945 "psllq $8, %%mm4\n\t"
946 "psllq $16, %%mm5\n\t"
947 "por %%mm4, %%mm3\n\t"
948 "por %%mm5, %%mm3\n\t"
950 "movq %%mm0, %%mm6\n\t"
951 "movq %%mm3, %%mm7\n\t"
953 "movq 8%1, %%mm0\n\t"
954 "movq 8%1, %%mm1\n\t"
955 "movq 8%1, %%mm2\n\t"
959 "psllq $3, %%mm0\n\t"
960 "psrlq $2, %%mm1\n\t"
961 "psrlq $7, %%mm2\n\t"
962 "movq %%mm0, %%mm3\n\t"
963 "movq %%mm1, %%mm4\n\t"
964 "movq %%mm2, %%mm5\n\t"
965 "punpcklwd %5, %%mm0\n\t"
966 "punpcklwd %5, %%mm1\n\t"
967 "punpcklwd %5, %%mm2\n\t"
968 "punpckhwd %5, %%mm3\n\t"
969 "punpckhwd %5, %%mm4\n\t"
970 "punpckhwd %5, %%mm5\n\t"
971 "psllq $8, %%mm1\n\t"
972 "psllq $16, %%mm2\n\t"
973 "por %%mm1, %%mm0\n\t"
974 "por %%mm2, %%mm0\n\t"
975 "psllq $8, %%mm4\n\t"
976 "psllq $16, %%mm5\n\t"
977 "por %%mm4, %%mm3\n\t"
978 "por %%mm5, %%mm3\n\t"
981 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
983 /* Borrowed 32 to 24 */
985 "movq %%mm0, %%mm4\n\t"
986 "movq %%mm3, %%mm5\n\t"
987 "movq %%mm6, %%mm0\n\t"
988 "movq %%mm7, %%mm1\n\t"
990 "movq %%mm4, %%mm6\n\t"
991 "movq %%mm5, %%mm7\n\t"
992 "movq %%mm0, %%mm2\n\t"
993 "movq %%mm1, %%mm3\n\t"
995 "psrlq $8, %%mm2\n\t"
996 "psrlq $8, %%mm3\n\t"
997 "psrlq $8, %%mm6\n\t"
998 "psrlq $8, %%mm7\n\t"
1000 "pand %2, %%mm1\n\t"
1001 "pand %2, %%mm4\n\t"
1002 "pand %2, %%mm5\n\t"
1003 "pand %3, %%mm2\n\t"
1004 "pand %3, %%mm3\n\t"
1005 "pand %3, %%mm6\n\t"
1006 "pand %3, %%mm7\n\t"
1007 "por %%mm2, %%mm0\n\t"
1008 "por %%mm3, %%mm1\n\t"
1009 "por %%mm6, %%mm4\n\t"
1010 "por %%mm7, %%mm5\n\t"
1012 "movq %%mm1, %%mm2\n\t"
1013 "movq %%mm4, %%mm3\n\t"
1014 "psllq $48, %%mm2\n\t"
1015 "psllq $32, %%mm3\n\t"
1016 "pand %4, %%mm2\n\t"
1017 "pand %5, %%mm3\n\t"
1018 "por %%mm2, %%mm0\n\t"
1019 "psrlq $16, %%mm1\n\t"
1020 "psrlq $32, %%mm4\n\t"
1021 "psllq $16, %%mm5\n\t"
1022 "por %%mm3, %%mm1\n\t"
1023 "pand %6, %%mm5\n\t"
1024 "por %%mm5, %%mm4\n\t"
1026 MOVNTQ" %%mm0, %0\n\t"
1027 MOVNTQ" %%mm1, 8%0\n\t"
1028 MOVNTQ" %%mm4, 16%0"
1031 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1036 __asm __volatile(SFENCE:::"memory");
1037 __asm __volatile(EMMS:::"memory");
1041 register uint16_t bgr;
1043 *d++ = (bgr&0x1F)<<3;
1044 *d++ = (bgr&0x3E0)>>2;
1045 *d++ = (bgr&0x7C00)>>7;
1049 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1051 const uint16_t *end;
1053 const uint16_t *mm_end;
1055 uint8_t *d = (uint8_t *)dst;
1056 const uint16_t *s = (const uint16_t *)src;
1057 end = s + src_size/2;
1059 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1065 "movq %1, %%mm0\n\t"
1066 "movq %1, %%mm1\n\t"
1067 "movq %1, %%mm2\n\t"
1068 "pand %2, %%mm0\n\t"
1069 "pand %3, %%mm1\n\t"
1070 "pand %4, %%mm2\n\t"
1071 "psllq $3, %%mm0\n\t"
1072 "psrlq $3, %%mm1\n\t"
1073 "psrlq $8, %%mm2\n\t"
1074 "movq %%mm0, %%mm3\n\t"
1075 "movq %%mm1, %%mm4\n\t"
1076 "movq %%mm2, %%mm5\n\t"
1077 "punpcklwd %5, %%mm0\n\t"
1078 "punpcklwd %5, %%mm1\n\t"
1079 "punpcklwd %5, %%mm2\n\t"
1080 "punpckhwd %5, %%mm3\n\t"
1081 "punpckhwd %5, %%mm4\n\t"
1082 "punpckhwd %5, %%mm5\n\t"
1083 "psllq $8, %%mm1\n\t"
1084 "psllq $16, %%mm2\n\t"
1085 "por %%mm1, %%mm0\n\t"
1086 "por %%mm2, %%mm0\n\t"
1087 "psllq $8, %%mm4\n\t"
1088 "psllq $16, %%mm5\n\t"
1089 "por %%mm4, %%mm3\n\t"
1090 "por %%mm5, %%mm3\n\t"
1092 "movq %%mm0, %%mm6\n\t"
1093 "movq %%mm3, %%mm7\n\t"
1095 "movq 8%1, %%mm0\n\t"
1096 "movq 8%1, %%mm1\n\t"
1097 "movq 8%1, %%mm2\n\t"
1098 "pand %2, %%mm0\n\t"
1099 "pand %3, %%mm1\n\t"
1100 "pand %4, %%mm2\n\t"
1101 "psllq $3, %%mm0\n\t"
1102 "psrlq $3, %%mm1\n\t"
1103 "psrlq $8, %%mm2\n\t"
1104 "movq %%mm0, %%mm3\n\t"
1105 "movq %%mm1, %%mm4\n\t"
1106 "movq %%mm2, %%mm5\n\t"
1107 "punpcklwd %5, %%mm0\n\t"
1108 "punpcklwd %5, %%mm1\n\t"
1109 "punpcklwd %5, %%mm2\n\t"
1110 "punpckhwd %5, %%mm3\n\t"
1111 "punpckhwd %5, %%mm4\n\t"
1112 "punpckhwd %5, %%mm5\n\t"
1113 "psllq $8, %%mm1\n\t"
1114 "psllq $16, %%mm2\n\t"
1115 "por %%mm1, %%mm0\n\t"
1116 "por %%mm2, %%mm0\n\t"
1117 "psllq $8, %%mm4\n\t"
1118 "psllq $16, %%mm5\n\t"
1119 "por %%mm4, %%mm3\n\t"
1120 "por %%mm5, %%mm3\n\t"
1122 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1124 /* Borrowed 32 to 24 */
1126 "movq %%mm0, %%mm4\n\t"
1127 "movq %%mm3, %%mm5\n\t"
1128 "movq %%mm6, %%mm0\n\t"
1129 "movq %%mm7, %%mm1\n\t"
1131 "movq %%mm4, %%mm6\n\t"
1132 "movq %%mm5, %%mm7\n\t"
1133 "movq %%mm0, %%mm2\n\t"
1134 "movq %%mm1, %%mm3\n\t"
1136 "psrlq $8, %%mm2\n\t"
1137 "psrlq $8, %%mm3\n\t"
1138 "psrlq $8, %%mm6\n\t"
1139 "psrlq $8, %%mm7\n\t"
1140 "pand %2, %%mm0\n\t"
1141 "pand %2, %%mm1\n\t"
1142 "pand %2, %%mm4\n\t"
1143 "pand %2, %%mm5\n\t"
1144 "pand %3, %%mm2\n\t"
1145 "pand %3, %%mm3\n\t"
1146 "pand %3, %%mm6\n\t"
1147 "pand %3, %%mm7\n\t"
1148 "por %%mm2, %%mm0\n\t"
1149 "por %%mm3, %%mm1\n\t"
1150 "por %%mm6, %%mm4\n\t"
1151 "por %%mm7, %%mm5\n\t"
1153 "movq %%mm1, %%mm2\n\t"
1154 "movq %%mm4, %%mm3\n\t"
1155 "psllq $48, %%mm2\n\t"
1156 "psllq $32, %%mm3\n\t"
1157 "pand %4, %%mm2\n\t"
1158 "pand %5, %%mm3\n\t"
1159 "por %%mm2, %%mm0\n\t"
1160 "psrlq $16, %%mm1\n\t"
1161 "psrlq $32, %%mm4\n\t"
1162 "psllq $16, %%mm5\n\t"
1163 "por %%mm3, %%mm1\n\t"
1164 "pand %6, %%mm5\n\t"
1165 "por %%mm5, %%mm4\n\t"
1167 MOVNTQ" %%mm0, %0\n\t"
1168 MOVNTQ" %%mm1, 8%0\n\t"
1169 MOVNTQ" %%mm4, 16%0"
1172 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1177 __asm __volatile(SFENCE:::"memory");
1178 __asm __volatile(EMMS:::"memory");
1182 register uint16_t bgr;
1184 *d++ = (bgr&0x1F)<<3;
1185 *d++ = (bgr&0x7E0)>>3;
1186 *d++ = (bgr&0xF800)>>8;
1190 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1192 const uint16_t *end;
1194 const uint16_t *mm_end;
1196 uint8_t *d = (uint8_t *)dst;
1197 const uint16_t *s = (const uint16_t *)src;
1198 end = s + src_size/2;
1200 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1201 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1207 "movq %1, %%mm0\n\t"
1208 "movq %1, %%mm1\n\t"
1209 "movq %1, %%mm2\n\t"
1210 "pand %2, %%mm0\n\t"
1211 "pand %3, %%mm1\n\t"
1212 "pand %4, %%mm2\n\t"
1213 "psllq $3, %%mm0\n\t"
1214 "psrlq $2, %%mm1\n\t"
1215 "psrlq $7, %%mm2\n\t"
1216 "movq %%mm0, %%mm3\n\t"
1217 "movq %%mm1, %%mm4\n\t"
1218 "movq %%mm2, %%mm5\n\t"
1219 "punpcklwd %%mm7, %%mm0\n\t"
1220 "punpcklwd %%mm7, %%mm1\n\t"
1221 "punpcklwd %%mm7, %%mm2\n\t"
1222 "punpckhwd %%mm7, %%mm3\n\t"
1223 "punpckhwd %%mm7, %%mm4\n\t"
1224 "punpckhwd %%mm7, %%mm5\n\t"
1225 "psllq $8, %%mm1\n\t"
1226 "psllq $16, %%mm2\n\t"
1227 "por %%mm1, %%mm0\n\t"
1228 "por %%mm2, %%mm0\n\t"
1229 "psllq $8, %%mm4\n\t"
1230 "psllq $16, %%mm5\n\t"
1231 "por %%mm4, %%mm3\n\t"
1232 "por %%mm5, %%mm3\n\t"
1233 MOVNTQ" %%mm0, %0\n\t"
1234 MOVNTQ" %%mm3, 8%0\n\t"
1236 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1241 __asm __volatile(SFENCE:::"memory");
1242 __asm __volatile(EMMS:::"memory");
1246 #if 0 //slightly slower on athlon
1248 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1250 //FIXME this is very likely wrong for bigendian (and the following converters too)
1251 register uint16_t bgr;
1253 #ifdef WORDS_BIGENDIAN
1255 *d++ = (bgr&0x1F)<<3;
1256 *d++ = (bgr&0x3E0)>>2;
1257 *d++ = (bgr&0x7C00)>>7;
1259 *d++ = (bgr&0x1F)<<3;
1260 *d++ = (bgr&0x3E0)>>2;
1261 *d++ = (bgr&0x7C00)>>7;
1269 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1271 const uint16_t *end;
1273 const uint16_t *mm_end;
1275 uint8_t *d = (uint8_t *)dst;
1276 const uint16_t *s = (uint16_t *)src;
1277 end = s + src_size/2;
1279 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1280 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1286 "movq %1, %%mm0\n\t"
1287 "movq %1, %%mm1\n\t"
1288 "movq %1, %%mm2\n\t"
1289 "pand %2, %%mm0\n\t"
1290 "pand %3, %%mm1\n\t"
1291 "pand %4, %%mm2\n\t"
1292 "psllq $3, %%mm0\n\t"
1293 "psrlq $3, %%mm1\n\t"
1294 "psrlq $8, %%mm2\n\t"
1295 "movq %%mm0, %%mm3\n\t"
1296 "movq %%mm1, %%mm4\n\t"
1297 "movq %%mm2, %%mm5\n\t"
1298 "punpcklwd %%mm7, %%mm0\n\t"
1299 "punpcklwd %%mm7, %%mm1\n\t"
1300 "punpcklwd %%mm7, %%mm2\n\t"
1301 "punpckhwd %%mm7, %%mm3\n\t"
1302 "punpckhwd %%mm7, %%mm4\n\t"
1303 "punpckhwd %%mm7, %%mm5\n\t"
1304 "psllq $8, %%mm1\n\t"
1305 "psllq $16, %%mm2\n\t"
1306 "por %%mm1, %%mm0\n\t"
1307 "por %%mm2, %%mm0\n\t"
1308 "psllq $8, %%mm4\n\t"
1309 "psllq $16, %%mm5\n\t"
1310 "por %%mm4, %%mm3\n\t"
1311 "por %%mm5, %%mm3\n\t"
1312 MOVNTQ" %%mm0, %0\n\t"
1313 MOVNTQ" %%mm3, 8%0\n\t"
1315 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1320 __asm __volatile(SFENCE:::"memory");
1321 __asm __volatile(EMMS:::"memory");
1325 register uint16_t bgr;
1327 #ifdef WORDS_BIGENDIAN
1329 *d++ = (bgr&0x1F)<<3;
1330 *d++ = (bgr&0x7E0)>>3;
1331 *d++ = (bgr&0xF800)>>8;
1333 *d++ = (bgr&0x1F)<<3;
1334 *d++ = (bgr&0x7E0)>>3;
1335 *d++ = (bgr&0xF800)>>8;
1341 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1344 /* TODO: unroll this loop */
1346 "xor %%"REG_a", %%"REG_a" \n\t"
1349 PREFETCH" 32(%0, %%"REG_a") \n\t"
1350 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1351 "movq %%mm0, %%mm1 \n\t"
1352 "movq %%mm0, %%mm2 \n\t"
1353 "pslld $16, %%mm0 \n\t"
1354 "psrld $16, %%mm1 \n\t"
1355 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1356 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1357 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1358 "por %%mm0, %%mm2 \n\t"
1359 "por %%mm1, %%mm2 \n\t"
1360 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
1361 "add $8, %%"REG_a" \n\t"
1362 "cmp %2, %%"REG_a" \n\t"
1364 :: "r" (src), "r"(dst), "r" ((long)src_size-7)
1368 __asm __volatile(SFENCE:::"memory");
1369 __asm __volatile(EMMS:::"memory");
1372 unsigned num_pixels = src_size >> 2;
1373 for(i=0; i<num_pixels; i++)
1375 #ifdef WORDS_BIGENDIAN
1376 dst[4*i + 1] = src[4*i + 3];
1377 dst[4*i + 2] = src[4*i + 2];
1378 dst[4*i + 3] = src[4*i + 1];
1380 dst[4*i + 0] = src[4*i + 2];
1381 dst[4*i + 1] = src[4*i + 1];
1382 dst[4*i + 2] = src[4*i + 0];
1388 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1392 long mmx_size= 23 - src_size;
1394 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1395 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1396 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1399 PREFETCH" 32(%1, %%"REG_a") \n\t"
1400 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1401 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1402 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1403 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1404 "pand %%mm5, %%mm0 \n\t"
1405 "pand %%mm6, %%mm1 \n\t"
1406 "pand %%mm7, %%mm2 \n\t"
1407 "por %%mm0, %%mm1 \n\t"
1408 "por %%mm2, %%mm1 \n\t"
1409 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1410 MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
1411 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1412 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1413 "pand %%mm7, %%mm0 \n\t"
1414 "pand %%mm5, %%mm1 \n\t"
1415 "pand %%mm6, %%mm2 \n\t"
1416 "por %%mm0, %%mm1 \n\t"
1417 "por %%mm2, %%mm1 \n\t"
1418 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1419 MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
1420 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1421 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1422 "pand %%mm6, %%mm0 \n\t"
1423 "pand %%mm7, %%mm1 \n\t"
1424 "pand %%mm5, %%mm2 \n\t"
1425 "por %%mm0, %%mm1 \n\t"
1426 "por %%mm2, %%mm1 \n\t"
1427 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1428 "add $24, %%"REG_a" \n\t"
1431 : "r" (src-mmx_size), "r"(dst-mmx_size)
1434 __asm __volatile(SFENCE:::"memory");
1435 __asm __volatile(EMMS:::"memory");
1437 if(mmx_size==23) return; //finihsed, was multiple of 8
1441 src_size= 23-mmx_size;
1445 for(i=0; i<src_size; i+=3)
1449 dst[i + 1] = src[i + 1];
1450 dst[i + 2] = src[i + 0];
1455 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1456 unsigned int width, unsigned int height,
1457 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1460 const unsigned chromWidth= width>>1;
1461 for(y=0; y<height; y++)
1464 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1466 "xor %%"REG_a", %%"REG_a" \n\t"
1469 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1470 PREFETCH" 32(%2, %%"REG_a") \n\t"
1471 PREFETCH" 32(%3, %%"REG_a") \n\t"
1472 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1473 "movq %%mm0, %%mm2 \n\t" // U(0)
1474 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1475 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1476 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1478 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1479 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1480 "movq %%mm3, %%mm4 \n\t" // Y(0)
1481 "movq %%mm5, %%mm6 \n\t" // Y(8)
1482 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1483 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1484 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1485 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1487 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1488 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1489 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1490 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1492 "add $8, %%"REG_a" \n\t"
1493 "cmp %4, %%"REG_a" \n\t"
1495 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
1500 #if defined ARCH_ALPHA && defined HAVE_MVI
1501 #define pl2yuy2(n) \
1506 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1507 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1508 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1509 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1510 yuv1 = (u << 8) + (v << 24); \
1517 uint64_t *qdst = (uint64_t *) dst;
1518 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1519 const uint32_t *yc = (uint32_t *) ysrc;
1520 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1521 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1522 for(i = 0; i < chromWidth; i += 8){
1523 uint64_t y1, y2, yuv1, yuv2;
1526 asm("ldq $31,64(%0)" :: "r"(yc));
1527 asm("ldq $31,64(%0)" :: "r"(yc2));
1528 asm("ldq $31,64(%0)" :: "r"(uc));
1529 asm("ldq $31,64(%0)" :: "r"(vc));
1547 #elif __WORDSIZE >= 64
1549 uint64_t *ldst = (uint64_t *) dst;
1550 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1551 for(i = 0; i < chromWidth; i += 2){
1553 k = yc[0] + (uc[0] << 8) +
1554 (yc[1] << 16) + (vc[0] << 24);
1555 l = yc[2] + (uc[1] << 8) +
1556 (yc[3] << 16) + (vc[1] << 24);
1557 *ldst++ = k + (l << 32);
1564 int i, *idst = (int32_t *) dst;
1565 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1566 for(i = 0; i < chromWidth; i++){
1567 #ifdef WORDS_BIGENDIAN
1568 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1569 (yc[1] << 8) + (vc[0] << 0);
1571 *idst++ = yc[0] + (uc[0] << 8) +
1572 (yc[1] << 16) + (vc[0] << 24);
1580 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1582 usrc += chromStride;
1583 vsrc += chromStride;
1597 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1598 * problem for anyone then tell me, and ill fix it)
/* Convert planar YV12 (4:2:0) to packed YUY2.
 * Thin wrapper: each chroma line is reused for 2 luma lines (vertLumPerChroma=2),
 * i.e. chroma is replicated vertically, not interpolated (see FIXME). */
1600 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1601 unsigned int width, unsigned int height,
1602 int lumStride, int chromStride, int dstStride)
1604 //FIXME interpolate chroma
1605 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/* Convert planar YUV to packed UYVY, one output line per input luma line.
 * vertLumPerChroma selects the vertical subsampling of the input: 2 for 4:2:0
 * (each chroma line reused for two luma lines), 1 for 4:2:2.
 * Per line, chromWidth = width/2 UV pairs are interleaved with the luma. */
1608 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1609 unsigned int width, unsigned int height,
1610 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1613 const unsigned chromWidth= width>>1;
1614 for(y=0; y<height; y++)
/* MMX path: build UVUV words from 8 U and 8 V bytes, then merge with 16 Y
 * bytes via punpck{l,h}bw into UYVY quads; MOVNTQ stores 32 bytes/iteration. */
1617 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1619 "xor %%"REG_a", %%"REG_a" \n\t"
1622 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1623 PREFETCH" 32(%2, %%"REG_a") \n\t"
1624 PREFETCH" 32(%3, %%"REG_a") \n\t"
1625 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1626 "movq %%mm0, %%mm2 \n\t" // U(0)
1627 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1628 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1629 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1631 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1632 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1633 "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
1634 "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
1635 "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1636 "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1637 "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1638 "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1640 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1641 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1642 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1643 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1645 "add $8, %%"REG_a" \n\t"
1646 "cmp %4, %%"REG_a" \n\t"
1648 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" ((long)chromWidth)
1652 //FIXME adapt the alpha asm code from yv12->yuy2
/* Portable C fallbacks: assemble one UYVY pair per iteration, either as a
 * 64-bit word (two pairs) or a 32-bit word (one pair, byte order per endian). */
1654 #if __WORDSIZE >= 64
1656 uint64_t *ldst = (uint64_t *) dst;
1657 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1658 for(i = 0; i < chromWidth; i += 2){
1660 k = uc[0] + (yc[0] << 8) +
1661 (vc[0] << 16) + (yc[1] << 24);
1662 l = uc[1] + (yc[2] << 8) +
1663 (vc[1] << 16) + (yc[3] << 24);
1664 *ldst++ = k + (l << 32);
1671 int i, *idst = (int32_t *) dst;
1672 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1673 for(i = 0; i < chromWidth; i++){
1674 #ifdef WORDS_BIGENDIAN
1675 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1676 (vc[0] << 8) + (yc[1] << 0);
1678 *idst++ = uc[0] + (yc[0] << 8) +
1679 (vc[0] << 16) + (yc[1] << 24);
/* Advance chroma planes only once every vertLumPerChroma luma lines. */
1687 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1689 usrc += chromStride;
1690 vsrc += chromStride;
1704 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1705 * problem for anyone then tell me, and I'll fix it)
/* Convert planar YV12 (4:2:0) to packed UYVY.
 * Thin wrapper: vertLumPerChroma=2 replicates each chroma line over two luma
 * lines; chroma is not interpolated (see FIXME). */
1707 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1708 unsigned int width, unsigned int height,
1709 int lumStride, int chromStride, int dstStride)
1711 //FIXME interpolate chroma
1712 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1717 * width should be a multiple of 16
/* Convert planar 4:2:2 YUV to packed YUY2.
 * Thin wrapper: vertLumPerChroma=1, i.e. one chroma line per luma line. */
1719 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1720 unsigned int width, unsigned int height,
1721 int lumStride, int chromStride, int dstStride)
1723 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1728 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1729 * problem for anyone then tell me, and I'll fix it)
/* Convert packed YUY2 to planar YV12 (4:2:0).
 * Processes two source lines per iteration: the first line contributes luma
 * AND chroma, the second line (second asm block / second C loop) contributes
 * luma only — chroma from odd lines is simply dropped. */
1731 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1732 unsigned int width, unsigned int height,
1733 int lumStride, int chromStride, int srcStride)
1736 const unsigned chromWidth= width>>1;
1737 for(y=0; y<height; y+=2)
/* MMX: mm7 = 0x00FF mask; AND extracts Y bytes, shift-right-8 extracts UV. */
1741 "xor %%"REG_a", %%"REG_a" \n\t"
1742 "pcmpeqw %%mm7, %%mm7 \n\t"
1743 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1746 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1747 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1748 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1749 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1750 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1751 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1752 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1753 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1754 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1755 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1756 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1758 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1760 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1761 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1762 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1763 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1764 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1765 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1766 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1767 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1768 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1769 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1771 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
/* Split the interleaved UV words into separate U and V planes. */
1773 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1774 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1775 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1776 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1777 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1778 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1779 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1780 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1782 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1783 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1785 "add $8, %%"REG_a" \n\t"
1786 "cmp %4, %%"REG_a" \n\t"
1788 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
1789 : "memory", "%"REG_a
/* Second (odd) source line: extract luma only, chroma is discarded. */
1796 "xor %%"REG_a", %%"REG_a" \n\t"
1799 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1800 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1801 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1802 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1803 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1804 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1805 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1806 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1807 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1808 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1809 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1811 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1812 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1814 "add $8, %%"REG_a" \n\t"
1815 "cmp %4, %%"REG_a" \n\t"
1818 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" ((long)chromWidth)
1819 : "memory", "%"REG_a
/* C fallback: first line (Y+U+V), then second line (Y only). */
1823 for(i=0; i<chromWidth; i++)
1825 ydst[2*i+0] = src[4*i+0];
1826 udst[i] = src[4*i+1];
1827 ydst[2*i+1] = src[4*i+2];
1828 vdst[i] = src[4*i+3];
1833 for(i=0; i<chromWidth; i++)
1835 ydst[2*i+0] = src[4*i+0];
1836 ydst[2*i+1] = src[4*i+2];
1839 udst += chromStride;
1840 vdst += chromStride;
1845 asm volatile( EMMS" \n\t"
/* Convert YVU9 (4:1:0) to YV12 (4:2:0).
 * Only the luma plane is copied; chroma upscaling is not implemented (XXX).
 * Note: assumes lumStride equals width for the flat memcpy — TODO confirm
 * against callers. */
1851 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1852 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1853 unsigned int width, unsigned int height, int lumStride, int chromStride)
1856 memcpy(ydst, ysrc, width*height);
1858 /* XXX: implement upscaling for U,V */
/* Upscale one plane by 2x in both directions using bilinear 3:1 weighting:
 * each output pixel is (3*near + far)>>2, matching the C fallback formulas
 * below. Edge rows/columns are handled separately. */
1861 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
/* First output row: horizontal-only interpolation of the first source row. */
1868 for(x=0; x<srcWidth-1; x++){
1869 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1870 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1872 dst[2*srcWidth-1]= src[srcWidth-1];
1876 for(y=1; y<srcHeight; y++){
1877 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1878 const long mmxSize= srcWidth&~15;
/* MMX2/3DNow path: mm0/mm1 = current rows, mm2..mm5 = 1-byte-shifted
 * neighbours; two chained PAVGB against the same register approximate the
 * (3*a + b)>>2 weighting used by the C code. */
1880 "mov %4, %%"REG_a" \n\t"
1882 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1883 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1884 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1885 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1886 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1887 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
1888 PAVGB" %%mm0, %%mm5 \n\t"
1889 PAVGB" %%mm0, %%mm3 \n\t"
1890 PAVGB" %%mm0, %%mm5 \n\t"
1891 PAVGB" %%mm0, %%mm3 \n\t"
1892 PAVGB" %%mm1, %%mm4 \n\t"
1893 PAVGB" %%mm1, %%mm2 \n\t"
1894 PAVGB" %%mm1, %%mm4 \n\t"
1895 PAVGB" %%mm1, %%mm2 \n\t"
1896 "movq %%mm5, %%mm7 \n\t"
1897 "movq %%mm4, %%mm6 \n\t"
1898 "punpcklbw %%mm3, %%mm5 \n\t"
1899 "punpckhbw %%mm3, %%mm7 \n\t"
1900 "punpcklbw %%mm2, %%mm4 \n\t"
1901 "punpckhbw %%mm2, %%mm6 \n\t"
1903 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1904 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1905 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1906 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1908 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1909 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1910 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1911 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1913 "add $8, %%"REG_a" \n\t"
1915 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1916 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1922 const int mmxSize=1;
/* Left edge: vertical-only interpolation of column 0. */
1924 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1925 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
/* C remainder: diagonal bilinear taps for the 4 output pixels per source
 * pixel pair (the reference for what the asm above computes). */
1927 for(x=mmxSize-1; x<srcWidth-1; x++){
1928 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1929 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1930 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1931 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
/* Right edge: vertical-only interpolation of the last column. */
1933 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1934 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* Last output row: horizontal-only interpolation of the last source row. */
1944 for(x=0; x<srcWidth-1; x++){
1945 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1946 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1948 dst[2*srcWidth-1]= src[srcWidth-1];
1950 for(x=0; x<srcWidth; x++){
1957 asm volatile( EMMS" \n\t"
1965 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1966 * problem for anyone then tell me, and I'll fix it)
1967 * chrominance data is only taken from every second line; the others are ignored. FIXME write HQ version
/* Convert packed UYVY to planar YV12 (4:2:0).
 * Mirror of yuy2toyv12 with the byte roles swapped: AND(0x00FF) extracts UV,
 * shift-right-8 extracts Y. Chroma comes from even lines only.
 * NOTE(review): this loop uses %%eax / addl / cmpl and passes chromWidth
 * without the (long) cast, unlike the sibling loops above that use REG_a and
 * (long)chromWidth — looks not 64-bit clean; verify/port to REG_a. */
1969 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1970 unsigned int width, unsigned int height,
1971 int lumStride, int chromStride, int srcStride)
1974 const unsigned chromWidth= width>>1;
1975 for(y=0; y<height; y+=2)
1979 "xorl %%eax, %%eax \n\t"
1980 "pcmpeqw %%mm7, %%mm7 \n\t"
1981 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1984 PREFETCH" 64(%0, %%eax, 4) \n\t"
1985 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1986 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1987 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1988 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1989 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1990 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1991 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1992 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1993 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1994 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1996 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1998 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
1999 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2000 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2001 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2002 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2003 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2004 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2005 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2006 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2007 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2009 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
/* Split interleaved UV words into separate U and V planes. */
2011 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2012 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2013 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2014 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2015 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2016 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2017 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2018 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2020 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2021 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2023 "addl $8, %%eax \n\t"
2024 "cmpl %4, %%eax \n\t"
2026 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Second (odd) source line: luma only; chroma bytes are dropped. */
2034 "xorl %%eax, %%eax \n\t"
2037 PREFETCH" 64(%0, %%eax, 4) \n\t"
2038 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2039 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2040 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
2041 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
2042 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2043 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2044 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2045 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2046 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2047 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2049 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2050 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2052 "addl $8, %%eax \n\t"
2053 "cmpl %4, %%eax \n\t"
2056 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* C fallback: first line (U,Y,V,Y per pair), then second line (Y only). */
2061 for(i=0; i<chromWidth; i++)
2063 udst[i] = src[4*i+0];
2064 ydst[2*i+0] = src[4*i+1];
2065 vdst[i] = src[4*i+2];
2066 ydst[2*i+1] = src[4*i+3];
2071 for(i=0; i<chromWidth; i++)
2073 ydst[2*i+0] = src[4*i+1];
2074 ydst[2*i+1] = src[4*i+3];
2077 udst += chromStride;
2078 vdst += chromStride;
2083 asm volatile( EMMS" \n\t"
2091 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2092 * problem for anyone then tell me, and I'll fix it)
2093 * chrominance data is only taken from every second line; the others are ignored in the C version. FIXME write HQ version
/* Convert packed BGR24 to planar YV12 (4:2:0).
 * MMX path: a luma pass per line using pmaddwd with bgr2YCoeff, and one
 * chroma pass per line pair that averages 2x2 pixel blocks before applying
 * bgr2UCoeff/bgr2VCoeff. Leftover lines fall through to the C code at the end.
 * NOTE(review): the asm clobbers REG_b (ebx on 32-bit), which conflicts with
 * PIC builds — confirm the build disables PIC for this file. */
2095 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2096 unsigned int width, unsigned int height,
2097 int lumStride, int chromStride, int srcStride)
2100 const unsigned chromWidth= width>>1;
2102 for(y=0; y<height-2; y+=2)
/* Luma pass: REG_a counts pixels from -width up to 0; REG_b = 3*REG_a is the
 * matching byte offset into the 24bpp source. 8 Y values per iteration. */
2108 "mov %2, %%"REG_a" \n\t"
2109 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2110 "movq "MANGLE(w1111)", %%mm5 \n\t"
2111 "pxor %%mm7, %%mm7 \n\t"
2112 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2115 PREFETCH" 64(%0, %%"REG_b") \n\t"
2116 "movd (%0, %%"REG_b"), %%mm0 \n\t"
2117 "movd 3(%0, %%"REG_b"), %%mm1 \n\t"
2118 "punpcklbw %%mm7, %%mm0 \n\t"
2119 "punpcklbw %%mm7, %%mm1 \n\t"
2120 "movd 6(%0, %%"REG_b"), %%mm2 \n\t"
2121 "movd 9(%0, %%"REG_b"), %%mm3 \n\t"
2122 "punpcklbw %%mm7, %%mm2 \n\t"
2123 "punpcklbw %%mm7, %%mm3 \n\t"
2124 "pmaddwd %%mm6, %%mm0 \n\t"
2125 "pmaddwd %%mm6, %%mm1 \n\t"
2126 "pmaddwd %%mm6, %%mm2 \n\t"
2127 "pmaddwd %%mm6, %%mm3 \n\t"
2128 #ifndef FAST_BGR2YV12
2129 "psrad $8, %%mm0 \n\t"
2130 "psrad $8, %%mm1 \n\t"
2131 "psrad $8, %%mm2 \n\t"
2132 "psrad $8, %%mm3 \n\t"
2134 "packssdw %%mm1, %%mm0 \n\t"
2135 "packssdw %%mm3, %%mm2 \n\t"
2136 "pmaddwd %%mm5, %%mm0 \n\t"
2137 "pmaddwd %%mm5, %%mm2 \n\t"
2138 "packssdw %%mm2, %%mm0 \n\t"
2139 "psraw $7, %%mm0 \n\t"
2141 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2142 "movd 15(%0, %%"REG_b"), %%mm1 \n\t"
2143 "punpcklbw %%mm7, %%mm4 \n\t"
2144 "punpcklbw %%mm7, %%mm1 \n\t"
2145 "movd 18(%0, %%"REG_b"), %%mm2 \n\t"
2146 "movd 21(%0, %%"REG_b"), %%mm3 \n\t"
2147 "punpcklbw %%mm7, %%mm2 \n\t"
2148 "punpcklbw %%mm7, %%mm3 \n\t"
2149 "pmaddwd %%mm6, %%mm4 \n\t"
2150 "pmaddwd %%mm6, %%mm1 \n\t"
2151 "pmaddwd %%mm6, %%mm2 \n\t"
2152 "pmaddwd %%mm6, %%mm3 \n\t"
2153 #ifndef FAST_BGR2YV12
2154 "psrad $8, %%mm4 \n\t"
2155 "psrad $8, %%mm1 \n\t"
2156 "psrad $8, %%mm2 \n\t"
2157 "psrad $8, %%mm3 \n\t"
2159 "packssdw %%mm1, %%mm4 \n\t"
2160 "packssdw %%mm3, %%mm2 \n\t"
2161 "pmaddwd %%mm5, %%mm4 \n\t"
2162 "pmaddwd %%mm5, %%mm2 \n\t"
2163 "add $24, %%"REG_b" \n\t"
2164 "packssdw %%mm2, %%mm4 \n\t"
2165 "psraw $7, %%mm4 \n\t"
2167 "packuswb %%mm4, %%mm0 \n\t"
2168 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2170 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2171 "add $8, %%"REG_a" \n\t"
2173 : : "r" (src+width*3), "r" (ydst+width), "g" ((long)-width)
2174 : "%"REG_a, "%"REG_b
/* Chroma pass: averages 2x2 source blocks (two lines via %0/%1), then
 * computes 4 U and 4 V samples per iteration. */
2181 "mov %4, %%"REG_a" \n\t"
2182 "movq "MANGLE(w1111)", %%mm5 \n\t"
2183 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2184 "pxor %%mm7, %%mm7 \n\t"
2185 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
2186 "add %%"REG_b", %%"REG_b" \n\t"
2189 PREFETCH" 64(%0, %%"REG_b") \n\t"
2190 PREFETCH" 64(%1, %%"REG_b") \n\t"
2191 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2192 "movq (%0, %%"REG_b"), %%mm0 \n\t"
2193 "movq (%1, %%"REG_b"), %%mm1 \n\t"
2194 "movq 6(%0, %%"REG_b"), %%mm2 \n\t"
2195 "movq 6(%1, %%"REG_b"), %%mm3 \n\t"
2196 PAVGB" %%mm1, %%mm0 \n\t"
2197 PAVGB" %%mm3, %%mm2 \n\t"
2198 "movq %%mm0, %%mm1 \n\t"
2199 "movq %%mm2, %%mm3 \n\t"
2200 "psrlq $24, %%mm0 \n\t"
2201 "psrlq $24, %%mm2 \n\t"
2202 PAVGB" %%mm1, %%mm0 \n\t"
2203 PAVGB" %%mm3, %%mm2 \n\t"
2204 "punpcklbw %%mm7, %%mm0 \n\t"
2205 "punpcklbw %%mm7, %%mm2 \n\t"
2207 "movd (%0, %%"REG_b"), %%mm0 \n\t"
2208 "movd (%1, %%"REG_b"), %%mm1 \n\t"
2209 "movd 3(%0, %%"REG_b"), %%mm2 \n\t"
2210 "movd 3(%1, %%"REG_b"), %%mm3 \n\t"
2211 "punpcklbw %%mm7, %%mm0 \n\t"
2212 "punpcklbw %%mm7, %%mm1 \n\t"
2213 "punpcklbw %%mm7, %%mm2 \n\t"
2214 "punpcklbw %%mm7, %%mm3 \n\t"
2215 "paddw %%mm1, %%mm0 \n\t"
2216 "paddw %%mm3, %%mm2 \n\t"
2217 "paddw %%mm2, %%mm0 \n\t"
2218 "movd 6(%0, %%"REG_b"), %%mm4 \n\t"
2219 "movd 6(%1, %%"REG_b"), %%mm1 \n\t"
2220 "movd 9(%0, %%"REG_b"), %%mm2 \n\t"
2221 "movd 9(%1, %%"REG_b"), %%mm3 \n\t"
2222 "punpcklbw %%mm7, %%mm4 \n\t"
2223 "punpcklbw %%mm7, %%mm1 \n\t"
2224 "punpcklbw %%mm7, %%mm2 \n\t"
2225 "punpcklbw %%mm7, %%mm3 \n\t"
2226 "paddw %%mm1, %%mm4 \n\t"
2227 "paddw %%mm3, %%mm2 \n\t"
2228 "paddw %%mm4, %%mm2 \n\t"
2229 "psrlw $2, %%mm0 \n\t"
2230 "psrlw $2, %%mm2 \n\t"
2232 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2233 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2235 "pmaddwd %%mm0, %%mm1 \n\t"
2236 "pmaddwd %%mm2, %%mm3 \n\t"
2237 "pmaddwd %%mm6, %%mm0 \n\t"
2238 "pmaddwd %%mm6, %%mm2 \n\t"
2239 #ifndef FAST_BGR2YV12
2240 "psrad $8, %%mm0 \n\t"
2241 "psrad $8, %%mm1 \n\t"
2242 "psrad $8, %%mm2 \n\t"
2243 "psrad $8, %%mm3 \n\t"
2245 "packssdw %%mm2, %%mm0 \n\t"
2246 "packssdw %%mm3, %%mm1 \n\t"
2247 "pmaddwd %%mm5, %%mm0 \n\t"
2248 "pmaddwd %%mm5, %%mm1 \n\t"
2249 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2250 "psraw $7, %%mm0 \n\t"
2252 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2253 "movq 12(%0, %%"REG_b"), %%mm4 \n\t"
2254 "movq 12(%1, %%"REG_b"), %%mm1 \n\t"
2255 "movq 18(%0, %%"REG_b"), %%mm2 \n\t"
2256 "movq 18(%1, %%"REG_b"), %%mm3 \n\t"
2257 PAVGB" %%mm1, %%mm4 \n\t"
2258 PAVGB" %%mm3, %%mm2 \n\t"
2259 "movq %%mm4, %%mm1 \n\t"
2260 "movq %%mm2, %%mm3 \n\t"
2261 "psrlq $24, %%mm4 \n\t"
2262 "psrlq $24, %%mm2 \n\t"
2263 PAVGB" %%mm1, %%mm4 \n\t"
2264 PAVGB" %%mm3, %%mm2 \n\t"
2265 "punpcklbw %%mm7, %%mm4 \n\t"
2266 "punpcklbw %%mm7, %%mm2 \n\t"
2268 "movd 12(%0, %%"REG_b"), %%mm4 \n\t"
2269 "movd 12(%1, %%"REG_b"), %%mm1 \n\t"
2270 "movd 15(%0, %%"REG_b"), %%mm2 \n\t"
2271 "movd 15(%1, %%"REG_b"), %%mm3 \n\t"
2272 "punpcklbw %%mm7, %%mm4 \n\t"
2273 "punpcklbw %%mm7, %%mm1 \n\t"
2274 "punpcklbw %%mm7, %%mm2 \n\t"
2275 "punpcklbw %%mm7, %%mm3 \n\t"
2276 "paddw %%mm1, %%mm4 \n\t"
2277 "paddw %%mm3, %%mm2 \n\t"
2278 "paddw %%mm2, %%mm4 \n\t"
2279 "movd 18(%0, %%"REG_b"), %%mm5 \n\t"
2280 "movd 18(%1, %%"REG_b"), %%mm1 \n\t"
2281 "movd 21(%0, %%"REG_b"), %%mm2 \n\t"
2282 "movd 21(%1, %%"REG_b"), %%mm3 \n\t"
2283 "punpcklbw %%mm7, %%mm5 \n\t"
2284 "punpcklbw %%mm7, %%mm1 \n\t"
2285 "punpcklbw %%mm7, %%mm2 \n\t"
2286 "punpcklbw %%mm7, %%mm3 \n\t"
2287 "paddw %%mm1, %%mm5 \n\t"
2288 "paddw %%mm3, %%mm2 \n\t"
2289 "paddw %%mm5, %%mm2 \n\t"
2290 "movq "MANGLE(w1111)", %%mm5 \n\t"
2291 "psrlw $2, %%mm4 \n\t"
2292 "psrlw $2, %%mm2 \n\t"
2294 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2295 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2297 "pmaddwd %%mm4, %%mm1 \n\t"
2298 "pmaddwd %%mm2, %%mm3 \n\t"
2299 "pmaddwd %%mm6, %%mm4 \n\t"
2300 "pmaddwd %%mm6, %%mm2 \n\t"
2301 #ifndef FAST_BGR2YV12
2302 "psrad $8, %%mm4 \n\t"
2303 "psrad $8, %%mm1 \n\t"
2304 "psrad $8, %%mm2 \n\t"
2305 "psrad $8, %%mm3 \n\t"
2307 "packssdw %%mm2, %%mm4 \n\t"
2308 "packssdw %%mm3, %%mm1 \n\t"
2309 "pmaddwd %%mm5, %%mm4 \n\t"
2310 "pmaddwd %%mm5, %%mm1 \n\t"
2311 "add $24, %%"REG_b" \n\t"
2312 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2313 "psraw $7, %%mm4 \n\t"
/* Reorder U0..U3 / V0..V3, bias by bgr2UVOffset, and store 4+4 bytes. */
2315 "movq %%mm0, %%mm1 \n\t"
2316 "punpckldq %%mm4, %%mm0 \n\t"
2317 "punpckhdq %%mm4, %%mm1 \n\t"
2318 "packsswb %%mm1, %%mm0 \n\t"
2319 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2320 "movd %%mm0, (%2, %%"REG_a") \n\t"
2321 "punpckhdq %%mm0, %%mm0 \n\t"
2322 "movd %%mm0, (%3, %%"REG_a") \n\t"
2323 "add $4, %%"REG_a" \n\t"
2325 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" ((long)-chromWidth)
2326 : "%"REG_a, "%"REG_b
2329 udst += chromStride;
2330 vdst += chromStride;
2334 asm volatile( EMMS" \n\t"
/* C fallback / remaining line pairs: integer RGB->YUV using the RY/GY/BY
 * etc. coefficient macros; chroma from the first line of each pair only. */
2340 for(; y<height; y+=2)
2343 for(i=0; i<chromWidth; i++)
2345 unsigned int b= src[6*i+0];
2346 unsigned int g= src[6*i+1];
2347 unsigned int r= src[6*i+2];
2349 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2350 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2351 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2361 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2367 for(i=0; i<chromWidth; i++)
2369 unsigned int b= src[6*i+0];
2370 unsigned int g= src[6*i+1];
2371 unsigned int r= src[6*i+2];
2373 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2381 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2384 udst += chromStride;
2385 vdst += chromStride;
/* Interleave two byte planes: dest[2i]=src1[i], dest[2i+1]=src2[i], per line.
 * SSE2 path handles 16 input bytes/iteration, MMX path likewise via two 8-byte
 * quads; the scalar loop covers the width%16 tail (or everything without MMX).
 * NOTE(review): movdqa requires 16-byte alignment of src1 and dest rows —
 * confirm callers/strides guarantee this, else movdqu would be needed. */
2391 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2392 unsigned width, unsigned height, int src1Stride,
2393 int src2Stride, int dstStride){
2396 for(h=0; h < height; h++)
/* SSE2: xmm0/xmm1 both load the same 16 src1 bytes; punpcklbw/punpckhbw then
 * interleave the low and high halves with the 16 src2 bytes in xmm2. */
2403 "xor %%"REG_a", %%"REG_a" \n\t"
2405 PREFETCH" 64(%1, %%"REG_a") \n\t"
2406 PREFETCH" 64(%2, %%"REG_a") \n\t"
2407 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2408 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2409 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2410 "punpcklbw %%xmm2, %%xmm0 \n\t"
2411 "punpckhbw %%xmm2, %%xmm1 \n\t"
2412 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2413 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2414 "add $16, %%"REG_a" \n\t"
2415 "cmp %3, %%"REG_a" \n\t"
2417 ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
2418 : "memory", "%"REG_a""
/* MMX: same interleave using two 8-byte quads per iteration. */
2422 "xor %%"REG_a", %%"REG_a" \n\t"
2424 PREFETCH" 64(%1, %%"REG_a") \n\t"
2425 PREFETCH" 64(%2, %%"REG_a") \n\t"
2426 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2427 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2428 "movq %%mm0, %%mm1 \n\t"
2429 "movq %%mm2, %%mm3 \n\t"
2430 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2431 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2432 "punpcklbw %%mm4, %%mm0 \n\t"
2433 "punpckhbw %%mm4, %%mm1 \n\t"
2434 "punpcklbw %%mm5, %%mm2 \n\t"
2435 "punpckhbw %%mm5, %%mm3 \n\t"
2436 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2437 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2438 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2439 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2440 "add $16, %%"REG_a" \n\t"
2441 "cmp %3, %%"REG_a" \n\t"
2443 ::"r"(dest), "r"(src1), "r"(src2), "r" ((long)width-15)
2444 : "memory", "%"REG_a
/* Scalar tail for the last width%16 columns (or full width without MMX). */
2447 for(w= (width&(~15)); w < width; w++)
2449 dest[2*w+0] = src1[w];
2450 dest[2*w+1] = src2[w];
2453 for(w=0; w < width; w++)
2455 dest[2*w+0] = src1[w];
2456 dest[2*w+1] = src2[w];
/* Upsample two 4:1:0 chroma planes to 4:2:0: each source byte is duplicated
 * horizontally (punpcklbw/punpckhbw with itself) and each source line is used
 * for two output lines (srcStride*(y>>1)). Nearest-neighbour 2x, no filtering. */
2472 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2473 uint8_t *dst1, uint8_t *dst2,
2474 unsigned width, unsigned height,
2475 int srcStride1, int srcStride2,
2476 int dstStride1, int dstStride2)
2480 w=width/2; h=height/2;
2485 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* First plane: 32 input bytes -> 64 output bytes per MMX iteration. */
2488 const uint8_t* s1=src1+srcStride1*(y>>1);
2489 uint8_t* d=dst1+dstStride1*y;
2496 "movq %1, %%mm0\n\t"
2497 "movq 8%1, %%mm2\n\t"
2498 "movq 16%1, %%mm4\n\t"
2499 "movq 24%1, %%mm6\n\t"
2500 "movq %%mm0, %%mm1\n\t"
2501 "movq %%mm2, %%mm3\n\t"
2502 "movq %%mm4, %%mm5\n\t"
2503 "movq %%mm6, %%mm7\n\t"
2504 "punpcklbw %%mm0, %%mm0\n\t"
2505 "punpckhbw %%mm1, %%mm1\n\t"
2506 "punpcklbw %%mm2, %%mm2\n\t"
2507 "punpckhbw %%mm3, %%mm3\n\t"
2508 "punpcklbw %%mm4, %%mm4\n\t"
2509 "punpckhbw %%mm5, %%mm5\n\t"
2510 "punpcklbw %%mm6, %%mm6\n\t"
2511 "punpckhbw %%mm7, %%mm7\n\t"
2512 MOVNTQ" %%mm0, %0\n\t"
2513 MOVNTQ" %%mm1, 8%0\n\t"
2514 MOVNTQ" %%mm2, 16%0\n\t"
2515 MOVNTQ" %%mm3, 24%0\n\t"
2516 MOVNTQ" %%mm4, 32%0\n\t"
2517 MOVNTQ" %%mm5, 40%0\n\t"
2518 MOVNTQ" %%mm6, 48%0\n\t"
2519 MOVNTQ" %%mm7, 56%0"
2525 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* Second plane: identical duplication for src2/dst2. */
2528 const uint8_t* s2=src2+srcStride2*(y>>1);
2529 uint8_t* d=dst2+dstStride2*y;
2536 "movq %1, %%mm0\n\t"
2537 "movq 8%1, %%mm2\n\t"
2538 "movq 16%1, %%mm4\n\t"
2539 "movq 24%1, %%mm6\n\t"
2540 "movq %%mm0, %%mm1\n\t"
2541 "movq %%mm2, %%mm3\n\t"
2542 "movq %%mm4, %%mm5\n\t"
2543 "movq %%mm6, %%mm7\n\t"
2544 "punpcklbw %%mm0, %%mm0\n\t"
2545 "punpckhbw %%mm1, %%mm1\n\t"
2546 "punpcklbw %%mm2, %%mm2\n\t"
2547 "punpckhbw %%mm3, %%mm3\n\t"
2548 "punpcklbw %%mm4, %%mm4\n\t"
2549 "punpckhbw %%mm5, %%mm5\n\t"
2550 "punpcklbw %%mm6, %%mm6\n\t"
2551 "punpckhbw %%mm7, %%mm7\n\t"
2552 MOVNTQ" %%mm0, %0\n\t"
2553 MOVNTQ" %%mm1, 8%0\n\t"
2554 MOVNTQ" %%mm2, 16%0\n\t"
2555 MOVNTQ" %%mm3, 24%0\n\t"
2556 MOVNTQ" %%mm4, 32%0\n\t"
2557 MOVNTQ" %%mm5, 40%0\n\t"
2558 MOVNTQ" %%mm6, 48%0\n\t"
2559 MOVNTQ" %%mm7, 56%0"
2565 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2576 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2578 unsigned width, unsigned height,
2579 int srcStride1, int srcStride2,
2580 int srcStride3, int dstStride)
2582 unsigned long y,x,w,h;
2583 w=width/2; h=height;
2585 const uint8_t* yp=src1+srcStride1*y;
2586 const uint8_t* up=src2+srcStride2*(y>>2);
2587 const uint8_t* vp=src3+srcStride3*(y>>2);
2588 uint8_t* d=dst+dstStride*y;
2594 PREFETCH" 32(%1, %0)\n\t"
2595 PREFETCH" 32(%2, %0)\n\t"
2596 PREFETCH" 32(%3, %0)\n\t"
2597 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2598 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2599 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2600 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2601 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2602 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2603 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2604 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2605 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2606 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2608 "movq %%mm1, %%mm6\n\t"
2609 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2610 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2611 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2612 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2613 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2615 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2616 "movq 8(%1, %0, 4), %%mm0\n\t"
2617 "movq %%mm0, %%mm3\n\t"
2618 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2619 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2620 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2621 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2623 "movq %%mm4, %%mm6\n\t"
2624 "movq 16(%1, %0, 4), %%mm0\n\t"
2625 "movq %%mm0, %%mm3\n\t"
2626 "punpcklbw %%mm5, %%mm4\n\t"
2627 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2628 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2629 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2630 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2632 "punpckhbw %%mm5, %%mm6\n\t"
2633 "movq 24(%1, %0, 4), %%mm0\n\t"
2634 "movq %%mm0, %%mm3\n\t"
2635 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2636 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2637 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2638 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2641 : "r"(yp), "r" (up), "r"(vp), "r"(d)