3 * rgb2rgb.c, Software RGB to RGB converter
4 * pluralize by Software PAL8 to RGB converter
5 * Software YUV to YUV converter
6 * Software YUV to RGB converter
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
12 #include <inttypes.h> /* for __WORDSIZE */
15 // #warning You have misconfigured system and probably will lose performance!
16 #define __WORDSIZE MP_WORDSIZE
34 #define PREFETCH "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB "pavgusb"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
47 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
61 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
64 const uint8_t *s = src;
67 const uint8_t *mm_end;
71 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
73 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
79 "punpckldq 3%1, %%mm0\n\t"
81 "punpckldq 9%1, %%mm1\n\t"
82 "movd 12%1, %%mm2\n\t"
83 "punpckldq 15%1, %%mm2\n\t"
84 "movd 18%1, %%mm3\n\t"
85 "punpckldq 21%1, %%mm3\n\t"
86 "pand %%mm7, %%mm0\n\t"
87 "pand %%mm7, %%mm1\n\t"
88 "pand %%mm7, %%mm2\n\t"
89 "pand %%mm7, %%mm3\n\t"
90 MOVNTQ" %%mm0, %0\n\t"
91 MOVNTQ" %%mm1, 8%0\n\t"
92 MOVNTQ" %%mm2, 16%0\n\t"
100 __asm __volatile(SFENCE:::"memory");
101 __asm __volatile(EMMS:::"memory");
112 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
115 const uint8_t *s = src;
118 const uint8_t *mm_end;
122 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
129 "movq 8%1, %%mm1\n\t"
130 "movq 16%1, %%mm4\n\t"
131 "movq 24%1, %%mm5\n\t"
132 "movq %%mm0, %%mm2\n\t"
133 "movq %%mm1, %%mm3\n\t"
134 "movq %%mm4, %%mm6\n\t"
135 "movq %%mm5, %%mm7\n\t"
136 "psrlq $8, %%mm2\n\t"
137 "psrlq $8, %%mm3\n\t"
138 "psrlq $8, %%mm6\n\t"
139 "psrlq $8, %%mm7\n\t"
148 "por %%mm2, %%mm0\n\t"
149 "por %%mm3, %%mm1\n\t"
150 "por %%mm6, %%mm4\n\t"
151 "por %%mm7, %%mm5\n\t"
153 "movq %%mm1, %%mm2\n\t"
154 "movq %%mm4, %%mm3\n\t"
155 "psllq $48, %%mm2\n\t"
156 "psllq $32, %%mm3\n\t"
159 "por %%mm2, %%mm0\n\t"
160 "psrlq $16, %%mm1\n\t"
161 "psrlq $32, %%mm4\n\t"
162 "psllq $16, %%mm5\n\t"
163 "por %%mm3, %%mm1\n\t"
165 "por %%mm5, %%mm4\n\t"
167 MOVNTQ" %%mm0, %0\n\t"
168 MOVNTQ" %%mm1, 8%0\n\t"
171 :"m"(*s),"m"(mask24l),
172 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
177 __asm __volatile(SFENCE:::"memory");
178 __asm __volatile(EMMS:::"memory");
190 Original by Strepto/Astral
191 ported to gcc & bugfixed : A'rpi
192 MMX2, 3DNOW optimization by Nick Kurshev
193 32bit c version, and and&add trick by Michael Niedermayer
195 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
197 register const uint8_t* s=src;
198 register uint8_t* d=dst;
199 register const uint8_t *end;
200 const uint8_t *mm_end;
203 __asm __volatile(PREFETCH" %0"::"m"(*s));
204 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
211 "movq 8%1, %%mm2\n\t"
212 "movq %%mm0, %%mm1\n\t"
213 "movq %%mm2, %%mm3\n\t"
214 "pand %%mm4, %%mm0\n\t"
215 "pand %%mm4, %%mm2\n\t"
216 "paddw %%mm1, %%mm0\n\t"
217 "paddw %%mm3, %%mm2\n\t"
218 MOVNTQ" %%mm0, %0\n\t"
226 __asm __volatile(SFENCE:::"memory");
227 __asm __volatile(EMMS:::"memory");
232 register unsigned x= *((uint32_t *)s);
233 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
239 register unsigned short x= *((uint16_t *)s);
240 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
244 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
246 register const uint8_t* s=src;
247 register uint8_t* d=dst;
248 register const uint8_t *end;
249 const uint8_t *mm_end;
252 __asm __volatile(PREFETCH" %0"::"m"(*s));
253 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
254 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
261 "movq 8%1, %%mm2\n\t"
262 "movq %%mm0, %%mm1\n\t"
263 "movq %%mm2, %%mm3\n\t"
264 "psrlq $1, %%mm0\n\t"
265 "psrlq $1, %%mm2\n\t"
266 "pand %%mm7, %%mm0\n\t"
267 "pand %%mm7, %%mm2\n\t"
268 "pand %%mm6, %%mm1\n\t"
269 "pand %%mm6, %%mm3\n\t"
270 "por %%mm1, %%mm0\n\t"
271 "por %%mm3, %%mm2\n\t"
272 MOVNTQ" %%mm0, %0\n\t"
280 __asm __volatile(SFENCE:::"memory");
281 __asm __volatile(EMMS:::"memory");
286 register uint32_t x= *((uint32_t *)s);
287 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
293 register uint16_t x= *((uint16_t *)s);
294 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
300 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
302 const uint8_t *s = src;
305 const uint8_t *mm_end;
307 uint16_t *d = (uint16_t *)dst;
311 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
313 "movq %3, %%mm5 \n\t"
314 "movq %4, %%mm6 \n\t"
315 "movq %5, %%mm7 \n\t"
318 PREFETCH" 32(%1) \n\t"
319 "movd (%1), %%mm0 \n\t"
320 "movd 4(%1), %%mm3 \n\t"
321 "punpckldq 8(%1), %%mm0 \n\t"
322 "punpckldq 12(%1), %%mm3 \n\t"
323 "movq %%mm0, %%mm1 \n\t"
324 "movq %%mm3, %%mm4 \n\t"
325 "pand %%mm6, %%mm0 \n\t"
326 "pand %%mm6, %%mm3 \n\t"
327 "pmaddwd %%mm7, %%mm0 \n\t"
328 "pmaddwd %%mm7, %%mm3 \n\t"
329 "pand %%mm5, %%mm1 \n\t"
330 "pand %%mm5, %%mm4 \n\t"
331 "por %%mm1, %%mm0 \n\t"
332 "por %%mm4, %%mm3 \n\t"
333 "psrld $5, %%mm0 \n\t"
334 "pslld $11, %%mm3 \n\t"
335 "por %%mm3, %%mm0 \n\t"
336 MOVNTQ" %%mm0, (%0) \n\t"
342 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
345 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
349 ::"m"(red_16mask),"m"(green_16mask));
355 "movd 4%1, %%mm3\n\t"
356 "punpckldq 8%1, %%mm0\n\t"
357 "punpckldq 12%1, %%mm3\n\t"
358 "movq %%mm0, %%mm1\n\t"
359 "movq %%mm0, %%mm2\n\t"
360 "movq %%mm3, %%mm4\n\t"
361 "movq %%mm3, %%mm5\n\t"
362 "psrlq $3, %%mm0\n\t"
363 "psrlq $3, %%mm3\n\t"
366 "psrlq $5, %%mm1\n\t"
367 "psrlq $5, %%mm4\n\t"
368 "pand %%mm6, %%mm1\n\t"
369 "pand %%mm6, %%mm4\n\t"
370 "psrlq $8, %%mm2\n\t"
371 "psrlq $8, %%mm5\n\t"
372 "pand %%mm7, %%mm2\n\t"
373 "pand %%mm7, %%mm5\n\t"
374 "por %%mm1, %%mm0\n\t"
375 "por %%mm4, %%mm3\n\t"
376 "por %%mm2, %%mm0\n\t"
377 "por %%mm5, %%mm3\n\t"
378 "psllq $16, %%mm3\n\t"
379 "por %%mm3, %%mm0\n\t"
380 MOVNTQ" %%mm0, %0\n\t"
381 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
386 __asm __volatile(SFENCE:::"memory");
387 __asm __volatile(EMMS:::"memory");
391 const int src= *s; s += 4;
392 *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
393 // *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
397 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
399 const uint8_t *s = src;
402 const uint8_t *mm_end;
404 uint16_t *d = (uint16_t *)dst;
407 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
411 ::"m"(red_16mask),"m"(green_16mask));
418 "movd 4%1, %%mm3\n\t"
419 "punpckldq 8%1, %%mm0\n\t"
420 "punpckldq 12%1, %%mm3\n\t"
421 "movq %%mm0, %%mm1\n\t"
422 "movq %%mm0, %%mm2\n\t"
423 "movq %%mm3, %%mm4\n\t"
424 "movq %%mm3, %%mm5\n\t"
425 "psllq $8, %%mm0\n\t"
426 "psllq $8, %%mm3\n\t"
427 "pand %%mm7, %%mm0\n\t"
428 "pand %%mm7, %%mm3\n\t"
429 "psrlq $5, %%mm1\n\t"
430 "psrlq $5, %%mm4\n\t"
431 "pand %%mm6, %%mm1\n\t"
432 "pand %%mm6, %%mm4\n\t"
433 "psrlq $19, %%mm2\n\t"
434 "psrlq $19, %%mm5\n\t"
437 "por %%mm1, %%mm0\n\t"
438 "por %%mm4, %%mm3\n\t"
439 "por %%mm2, %%mm0\n\t"
440 "por %%mm5, %%mm3\n\t"
441 "psllq $16, %%mm3\n\t"
442 "por %%mm3, %%mm0\n\t"
443 MOVNTQ" %%mm0, %0\n\t"
444 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
448 __asm __volatile(SFENCE:::"memory");
449 __asm __volatile(EMMS:::"memory");
453 const int src= *s; s += 4;
454 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
458 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
460 const uint8_t *s = src;
463 const uint8_t *mm_end;
465 uint16_t *d = (uint16_t *)dst;
469 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
471 "movq %3, %%mm5 \n\t"
472 "movq %4, %%mm6 \n\t"
473 "movq %5, %%mm7 \n\t"
476 PREFETCH" 32(%1) \n\t"
477 "movd (%1), %%mm0 \n\t"
478 "movd 4(%1), %%mm3 \n\t"
479 "punpckldq 8(%1), %%mm0 \n\t"
480 "punpckldq 12(%1), %%mm3 \n\t"
481 "movq %%mm0, %%mm1 \n\t"
482 "movq %%mm3, %%mm4 \n\t"
483 "pand %%mm6, %%mm0 \n\t"
484 "pand %%mm6, %%mm3 \n\t"
485 "pmaddwd %%mm7, %%mm0 \n\t"
486 "pmaddwd %%mm7, %%mm3 \n\t"
487 "pand %%mm5, %%mm1 \n\t"
488 "pand %%mm5, %%mm4 \n\t"
489 "por %%mm1, %%mm0 \n\t"
490 "por %%mm4, %%mm3 \n\t"
491 "psrld $6, %%mm0 \n\t"
492 "pslld $10, %%mm3 \n\t"
493 "por %%mm3, %%mm0 \n\t"
494 MOVNTQ" %%mm0, (%0) \n\t"
500 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
503 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
507 ::"m"(red_15mask),"m"(green_15mask));
513 "movd 4%1, %%mm3\n\t"
514 "punpckldq 8%1, %%mm0\n\t"
515 "punpckldq 12%1, %%mm3\n\t"
516 "movq %%mm0, %%mm1\n\t"
517 "movq %%mm0, %%mm2\n\t"
518 "movq %%mm3, %%mm4\n\t"
519 "movq %%mm3, %%mm5\n\t"
520 "psrlq $3, %%mm0\n\t"
521 "psrlq $3, %%mm3\n\t"
524 "psrlq $6, %%mm1\n\t"
525 "psrlq $6, %%mm4\n\t"
526 "pand %%mm6, %%mm1\n\t"
527 "pand %%mm6, %%mm4\n\t"
528 "psrlq $9, %%mm2\n\t"
529 "psrlq $9, %%mm5\n\t"
530 "pand %%mm7, %%mm2\n\t"
531 "pand %%mm7, %%mm5\n\t"
532 "por %%mm1, %%mm0\n\t"
533 "por %%mm4, %%mm3\n\t"
534 "por %%mm2, %%mm0\n\t"
535 "por %%mm5, %%mm3\n\t"
536 "psllq $16, %%mm3\n\t"
537 "por %%mm3, %%mm0\n\t"
538 MOVNTQ" %%mm0, %0\n\t"
539 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
544 __asm __volatile(SFENCE:::"memory");
545 __asm __volatile(EMMS:::"memory");
549 const int src= *s; s += 4;
550 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
554 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
556 const uint8_t *s = src;
559 const uint8_t *mm_end;
561 uint16_t *d = (uint16_t *)dst;
564 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
568 ::"m"(red_15mask),"m"(green_15mask));
575 "movd 4%1, %%mm3\n\t"
576 "punpckldq 8%1, %%mm0\n\t"
577 "punpckldq 12%1, %%mm3\n\t"
578 "movq %%mm0, %%mm1\n\t"
579 "movq %%mm0, %%mm2\n\t"
580 "movq %%mm3, %%mm4\n\t"
581 "movq %%mm3, %%mm5\n\t"
582 "psllq $7, %%mm0\n\t"
583 "psllq $7, %%mm3\n\t"
584 "pand %%mm7, %%mm0\n\t"
585 "pand %%mm7, %%mm3\n\t"
586 "psrlq $6, %%mm1\n\t"
587 "psrlq $6, %%mm4\n\t"
588 "pand %%mm6, %%mm1\n\t"
589 "pand %%mm6, %%mm4\n\t"
590 "psrlq $19, %%mm2\n\t"
591 "psrlq $19, %%mm5\n\t"
594 "por %%mm1, %%mm0\n\t"
595 "por %%mm4, %%mm3\n\t"
596 "por %%mm2, %%mm0\n\t"
597 "por %%mm5, %%mm3\n\t"
598 "psllq $16, %%mm3\n\t"
599 "por %%mm3, %%mm0\n\t"
600 MOVNTQ" %%mm0, %0\n\t"
601 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
605 __asm __volatile(SFENCE:::"memory");
606 __asm __volatile(EMMS:::"memory");
610 const int src= *s; s += 4;
611 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
615 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
617 const uint8_t *s = src;
620 const uint8_t *mm_end;
622 uint16_t *d = (uint16_t *)dst;
625 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
629 ::"m"(red_16mask),"m"(green_16mask));
636 "movd 3%1, %%mm3\n\t"
637 "punpckldq 6%1, %%mm0\n\t"
638 "punpckldq 9%1, %%mm3\n\t"
639 "movq %%mm0, %%mm1\n\t"
640 "movq %%mm0, %%mm2\n\t"
641 "movq %%mm3, %%mm4\n\t"
642 "movq %%mm3, %%mm5\n\t"
643 "psrlq $3, %%mm0\n\t"
644 "psrlq $3, %%mm3\n\t"
647 "psrlq $5, %%mm1\n\t"
648 "psrlq $5, %%mm4\n\t"
649 "pand %%mm6, %%mm1\n\t"
650 "pand %%mm6, %%mm4\n\t"
651 "psrlq $8, %%mm2\n\t"
652 "psrlq $8, %%mm5\n\t"
653 "pand %%mm7, %%mm2\n\t"
654 "pand %%mm7, %%mm5\n\t"
655 "por %%mm1, %%mm0\n\t"
656 "por %%mm4, %%mm3\n\t"
657 "por %%mm2, %%mm0\n\t"
658 "por %%mm5, %%mm3\n\t"
659 "psllq $16, %%mm3\n\t"
660 "por %%mm3, %%mm0\n\t"
661 MOVNTQ" %%mm0, %0\n\t"
662 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
666 __asm __volatile(SFENCE:::"memory");
667 __asm __volatile(EMMS:::"memory");
674 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
678 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
680 const uint8_t *s = src;
683 const uint8_t *mm_end;
685 uint16_t *d = (uint16_t *)dst;
688 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
692 ::"m"(red_16mask),"m"(green_16mask));
699 "movd 3%1, %%mm3\n\t"
700 "punpckldq 6%1, %%mm0\n\t"
701 "punpckldq 9%1, %%mm3\n\t"
702 "movq %%mm0, %%mm1\n\t"
703 "movq %%mm0, %%mm2\n\t"
704 "movq %%mm3, %%mm4\n\t"
705 "movq %%mm3, %%mm5\n\t"
706 "psllq $8, %%mm0\n\t"
707 "psllq $8, %%mm3\n\t"
708 "pand %%mm7, %%mm0\n\t"
709 "pand %%mm7, %%mm3\n\t"
710 "psrlq $5, %%mm1\n\t"
711 "psrlq $5, %%mm4\n\t"
712 "pand %%mm6, %%mm1\n\t"
713 "pand %%mm6, %%mm4\n\t"
714 "psrlq $19, %%mm2\n\t"
715 "psrlq $19, %%mm5\n\t"
718 "por %%mm1, %%mm0\n\t"
719 "por %%mm4, %%mm3\n\t"
720 "por %%mm2, %%mm0\n\t"
721 "por %%mm5, %%mm3\n\t"
722 "psllq $16, %%mm3\n\t"
723 "por %%mm3, %%mm0\n\t"
724 MOVNTQ" %%mm0, %0\n\t"
725 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
729 __asm __volatile(SFENCE:::"memory");
730 __asm __volatile(EMMS:::"memory");
737 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
741 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
743 const uint8_t *s = src;
746 const uint8_t *mm_end;
748 uint16_t *d = (uint16_t *)dst;
751 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
755 ::"m"(red_15mask),"m"(green_15mask));
762 "movd 3%1, %%mm3\n\t"
763 "punpckldq 6%1, %%mm0\n\t"
764 "punpckldq 9%1, %%mm3\n\t"
765 "movq %%mm0, %%mm1\n\t"
766 "movq %%mm0, %%mm2\n\t"
767 "movq %%mm3, %%mm4\n\t"
768 "movq %%mm3, %%mm5\n\t"
769 "psrlq $3, %%mm0\n\t"
770 "psrlq $3, %%mm3\n\t"
773 "psrlq $6, %%mm1\n\t"
774 "psrlq $6, %%mm4\n\t"
775 "pand %%mm6, %%mm1\n\t"
776 "pand %%mm6, %%mm4\n\t"
777 "psrlq $9, %%mm2\n\t"
778 "psrlq $9, %%mm5\n\t"
779 "pand %%mm7, %%mm2\n\t"
780 "pand %%mm7, %%mm5\n\t"
781 "por %%mm1, %%mm0\n\t"
782 "por %%mm4, %%mm3\n\t"
783 "por %%mm2, %%mm0\n\t"
784 "por %%mm5, %%mm3\n\t"
785 "psllq $16, %%mm3\n\t"
786 "por %%mm3, %%mm0\n\t"
787 MOVNTQ" %%mm0, %0\n\t"
788 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
792 __asm __volatile(SFENCE:::"memory");
793 __asm __volatile(EMMS:::"memory");
800 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
804 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
806 const uint8_t *s = src;
809 const uint8_t *mm_end;
811 uint16_t *d = (uint16_t *)dst;
814 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
818 ::"m"(red_15mask),"m"(green_15mask));
825 "movd 3%1, %%mm3\n\t"
826 "punpckldq 6%1, %%mm0\n\t"
827 "punpckldq 9%1, %%mm3\n\t"
828 "movq %%mm0, %%mm1\n\t"
829 "movq %%mm0, %%mm2\n\t"
830 "movq %%mm3, %%mm4\n\t"
831 "movq %%mm3, %%mm5\n\t"
832 "psllq $7, %%mm0\n\t"
833 "psllq $7, %%mm3\n\t"
834 "pand %%mm7, %%mm0\n\t"
835 "pand %%mm7, %%mm3\n\t"
836 "psrlq $6, %%mm1\n\t"
837 "psrlq $6, %%mm4\n\t"
838 "pand %%mm6, %%mm1\n\t"
839 "pand %%mm6, %%mm4\n\t"
840 "psrlq $19, %%mm2\n\t"
841 "psrlq $19, %%mm5\n\t"
844 "por %%mm1, %%mm0\n\t"
845 "por %%mm4, %%mm3\n\t"
846 "por %%mm2, %%mm0\n\t"
847 "por %%mm5, %%mm3\n\t"
848 "psllq $16, %%mm3\n\t"
849 "por %%mm3, %%mm0\n\t"
850 MOVNTQ" %%mm0, %0\n\t"
851 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
855 __asm __volatile(SFENCE:::"memory");
856 __asm __volatile(EMMS:::"memory");
863 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
868 I use here a less accurate approximation by simply
869 left-shifting the input
870 value and filling the low order bits with
871 zeroes. This method improves png's
872 compression but this scheme cannot reproduce white exactly, since it does not
873 generate an all-ones maximum value; the net effect is to darken the
876 The better method should be "left bit replication":
886 | Leftmost Bits Repeated to Fill Open Bits
890 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
894 const uint16_t *mm_end;
896 uint8_t *d = (uint8_t *)dst;
897 const uint16_t *s = (uint16_t *)src;
898 end = s + src_size/2;
900 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
912 "psllq $3, %%mm0\n\t"
913 "psrlq $2, %%mm1\n\t"
914 "psrlq $7, %%mm2\n\t"
915 "movq %%mm0, %%mm3\n\t"
916 "movq %%mm1, %%mm4\n\t"
917 "movq %%mm2, %%mm5\n\t"
918 "punpcklwd %5, %%mm0\n\t"
919 "punpcklwd %5, %%mm1\n\t"
920 "punpcklwd %5, %%mm2\n\t"
921 "punpckhwd %5, %%mm3\n\t"
922 "punpckhwd %5, %%mm4\n\t"
923 "punpckhwd %5, %%mm5\n\t"
924 "psllq $8, %%mm1\n\t"
925 "psllq $16, %%mm2\n\t"
926 "por %%mm1, %%mm0\n\t"
927 "por %%mm2, %%mm0\n\t"
928 "psllq $8, %%mm4\n\t"
929 "psllq $16, %%mm5\n\t"
930 "por %%mm4, %%mm3\n\t"
931 "por %%mm5, %%mm3\n\t"
933 "movq %%mm0, %%mm6\n\t"
934 "movq %%mm3, %%mm7\n\t"
936 "movq 8%1, %%mm0\n\t"
937 "movq 8%1, %%mm1\n\t"
938 "movq 8%1, %%mm2\n\t"
942 "psllq $3, %%mm0\n\t"
943 "psrlq $2, %%mm1\n\t"
944 "psrlq $7, %%mm2\n\t"
945 "movq %%mm0, %%mm3\n\t"
946 "movq %%mm1, %%mm4\n\t"
947 "movq %%mm2, %%mm5\n\t"
948 "punpcklwd %5, %%mm0\n\t"
949 "punpcklwd %5, %%mm1\n\t"
950 "punpcklwd %5, %%mm2\n\t"
951 "punpckhwd %5, %%mm3\n\t"
952 "punpckhwd %5, %%mm4\n\t"
953 "punpckhwd %5, %%mm5\n\t"
954 "psllq $8, %%mm1\n\t"
955 "psllq $16, %%mm2\n\t"
956 "por %%mm1, %%mm0\n\t"
957 "por %%mm2, %%mm0\n\t"
958 "psllq $8, %%mm4\n\t"
959 "psllq $16, %%mm5\n\t"
960 "por %%mm4, %%mm3\n\t"
961 "por %%mm5, %%mm3\n\t"
964 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
966 /* Borrowed 32 to 24 */
968 "movq %%mm0, %%mm4\n\t"
969 "movq %%mm3, %%mm5\n\t"
970 "movq %%mm6, %%mm0\n\t"
971 "movq %%mm7, %%mm1\n\t"
973 "movq %%mm4, %%mm6\n\t"
974 "movq %%mm5, %%mm7\n\t"
975 "movq %%mm0, %%mm2\n\t"
976 "movq %%mm1, %%mm3\n\t"
978 "psrlq $8, %%mm2\n\t"
979 "psrlq $8, %%mm3\n\t"
980 "psrlq $8, %%mm6\n\t"
981 "psrlq $8, %%mm7\n\t"
990 "por %%mm2, %%mm0\n\t"
991 "por %%mm3, %%mm1\n\t"
992 "por %%mm6, %%mm4\n\t"
993 "por %%mm7, %%mm5\n\t"
995 "movq %%mm1, %%mm2\n\t"
996 "movq %%mm4, %%mm3\n\t"
997 "psllq $48, %%mm2\n\t"
998 "psllq $32, %%mm3\n\t"
1000 "pand %5, %%mm3\n\t"
1001 "por %%mm2, %%mm0\n\t"
1002 "psrlq $16, %%mm1\n\t"
1003 "psrlq $32, %%mm4\n\t"
1004 "psllq $16, %%mm5\n\t"
1005 "por %%mm3, %%mm1\n\t"
1006 "pand %6, %%mm5\n\t"
1007 "por %%mm5, %%mm4\n\t"
1009 MOVNTQ" %%mm0, %0\n\t"
1010 MOVNTQ" %%mm1, 8%0\n\t"
1011 MOVNTQ" %%mm4, 16%0"
1014 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1019 __asm __volatile(SFENCE:::"memory");
1020 __asm __volatile(EMMS:::"memory");
1024 register uint16_t bgr;
1026 *d++ = (bgr&0x1F)<<3;
1027 *d++ = (bgr&0x3E0)>>2;
1028 *d++ = (bgr&0x7C00)>>7;
1032 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1034 const uint16_t *end;
1036 const uint16_t *mm_end;
1038 uint8_t *d = (uint8_t *)dst;
1039 const uint16_t *s = (const uint16_t *)src;
1040 end = s + src_size/2;
1042 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1048 "movq %1, %%mm0\n\t"
1049 "movq %1, %%mm1\n\t"
1050 "movq %1, %%mm2\n\t"
1051 "pand %2, %%mm0\n\t"
1052 "pand %3, %%mm1\n\t"
1053 "pand %4, %%mm2\n\t"
1054 "psllq $3, %%mm0\n\t"
1055 "psrlq $3, %%mm1\n\t"
1056 "psrlq $8, %%mm2\n\t"
1057 "movq %%mm0, %%mm3\n\t"
1058 "movq %%mm1, %%mm4\n\t"
1059 "movq %%mm2, %%mm5\n\t"
1060 "punpcklwd %5, %%mm0\n\t"
1061 "punpcklwd %5, %%mm1\n\t"
1062 "punpcklwd %5, %%mm2\n\t"
1063 "punpckhwd %5, %%mm3\n\t"
1064 "punpckhwd %5, %%mm4\n\t"
1065 "punpckhwd %5, %%mm5\n\t"
1066 "psllq $8, %%mm1\n\t"
1067 "psllq $16, %%mm2\n\t"
1068 "por %%mm1, %%mm0\n\t"
1069 "por %%mm2, %%mm0\n\t"
1070 "psllq $8, %%mm4\n\t"
1071 "psllq $16, %%mm5\n\t"
1072 "por %%mm4, %%mm3\n\t"
1073 "por %%mm5, %%mm3\n\t"
1075 "movq %%mm0, %%mm6\n\t"
1076 "movq %%mm3, %%mm7\n\t"
1078 "movq 8%1, %%mm0\n\t"
1079 "movq 8%1, %%mm1\n\t"
1080 "movq 8%1, %%mm2\n\t"
1081 "pand %2, %%mm0\n\t"
1082 "pand %3, %%mm1\n\t"
1083 "pand %4, %%mm2\n\t"
1084 "psllq $3, %%mm0\n\t"
1085 "psrlq $3, %%mm1\n\t"
1086 "psrlq $8, %%mm2\n\t"
1087 "movq %%mm0, %%mm3\n\t"
1088 "movq %%mm1, %%mm4\n\t"
1089 "movq %%mm2, %%mm5\n\t"
1090 "punpcklwd %5, %%mm0\n\t"
1091 "punpcklwd %5, %%mm1\n\t"
1092 "punpcklwd %5, %%mm2\n\t"
1093 "punpckhwd %5, %%mm3\n\t"
1094 "punpckhwd %5, %%mm4\n\t"
1095 "punpckhwd %5, %%mm5\n\t"
1096 "psllq $8, %%mm1\n\t"
1097 "psllq $16, %%mm2\n\t"
1098 "por %%mm1, %%mm0\n\t"
1099 "por %%mm2, %%mm0\n\t"
1100 "psllq $8, %%mm4\n\t"
1101 "psllq $16, %%mm5\n\t"
1102 "por %%mm4, %%mm3\n\t"
1103 "por %%mm5, %%mm3\n\t"
1105 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1107 /* Borrowed 32 to 24 */
1109 "movq %%mm0, %%mm4\n\t"
1110 "movq %%mm3, %%mm5\n\t"
1111 "movq %%mm6, %%mm0\n\t"
1112 "movq %%mm7, %%mm1\n\t"
1114 "movq %%mm4, %%mm6\n\t"
1115 "movq %%mm5, %%mm7\n\t"
1116 "movq %%mm0, %%mm2\n\t"
1117 "movq %%mm1, %%mm3\n\t"
1119 "psrlq $8, %%mm2\n\t"
1120 "psrlq $8, %%mm3\n\t"
1121 "psrlq $8, %%mm6\n\t"
1122 "psrlq $8, %%mm7\n\t"
1123 "pand %2, %%mm0\n\t"
1124 "pand %2, %%mm1\n\t"
1125 "pand %2, %%mm4\n\t"
1126 "pand %2, %%mm5\n\t"
1127 "pand %3, %%mm2\n\t"
1128 "pand %3, %%mm3\n\t"
1129 "pand %3, %%mm6\n\t"
1130 "pand %3, %%mm7\n\t"
1131 "por %%mm2, %%mm0\n\t"
1132 "por %%mm3, %%mm1\n\t"
1133 "por %%mm6, %%mm4\n\t"
1134 "por %%mm7, %%mm5\n\t"
1136 "movq %%mm1, %%mm2\n\t"
1137 "movq %%mm4, %%mm3\n\t"
1138 "psllq $48, %%mm2\n\t"
1139 "psllq $32, %%mm3\n\t"
1140 "pand %4, %%mm2\n\t"
1141 "pand %5, %%mm3\n\t"
1142 "por %%mm2, %%mm0\n\t"
1143 "psrlq $16, %%mm1\n\t"
1144 "psrlq $32, %%mm4\n\t"
1145 "psllq $16, %%mm5\n\t"
1146 "por %%mm3, %%mm1\n\t"
1147 "pand %6, %%mm5\n\t"
1148 "por %%mm5, %%mm4\n\t"
1150 MOVNTQ" %%mm0, %0\n\t"
1151 MOVNTQ" %%mm1, 8%0\n\t"
1152 MOVNTQ" %%mm4, 16%0"
1155 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1160 __asm __volatile(SFENCE:::"memory");
1161 __asm __volatile(EMMS:::"memory");
1165 register uint16_t bgr;
1167 *d++ = (bgr&0x1F)<<3;
1168 *d++ = (bgr&0x7E0)>>3;
1169 *d++ = (bgr&0xF800)>>8;
1173 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1175 const uint16_t *end;
1177 const uint16_t *mm_end;
1179 uint8_t *d = (uint8_t *)dst;
1180 const uint16_t *s = (const uint16_t *)src;
1181 end = s + src_size/2;
1183 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1184 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1190 "movq %1, %%mm0\n\t"
1191 "movq %1, %%mm1\n\t"
1192 "movq %1, %%mm2\n\t"
1193 "pand %2, %%mm0\n\t"
1194 "pand %3, %%mm1\n\t"
1195 "pand %4, %%mm2\n\t"
1196 "psllq $3, %%mm0\n\t"
1197 "psrlq $2, %%mm1\n\t"
1198 "psrlq $7, %%mm2\n\t"
1199 "movq %%mm0, %%mm3\n\t"
1200 "movq %%mm1, %%mm4\n\t"
1201 "movq %%mm2, %%mm5\n\t"
1202 "punpcklwd %%mm7, %%mm0\n\t"
1203 "punpcklwd %%mm7, %%mm1\n\t"
1204 "punpcklwd %%mm7, %%mm2\n\t"
1205 "punpckhwd %%mm7, %%mm3\n\t"
1206 "punpckhwd %%mm7, %%mm4\n\t"
1207 "punpckhwd %%mm7, %%mm5\n\t"
1208 "psllq $8, %%mm1\n\t"
1209 "psllq $16, %%mm2\n\t"
1210 "por %%mm1, %%mm0\n\t"
1211 "por %%mm2, %%mm0\n\t"
1212 "psllq $8, %%mm4\n\t"
1213 "psllq $16, %%mm5\n\t"
1214 "por %%mm4, %%mm3\n\t"
1215 "por %%mm5, %%mm3\n\t"
1216 MOVNTQ" %%mm0, %0\n\t"
1217 MOVNTQ" %%mm3, 8%0\n\t"
1219 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1224 __asm __volatile(SFENCE:::"memory");
1225 __asm __volatile(EMMS:::"memory");
1229 #if 0 //slightly slower on athlon
1231 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1233 //FIXME this is very likely wrong for bigendian (and the following converters too)
1234 register uint16_t bgr;
1236 *d++ = (bgr&0x1F)<<3;
1237 *d++ = (bgr&0x3E0)>>2;
1238 *d++ = (bgr&0x7C00)>>7;
1244 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1246 const uint16_t *end;
1248 const uint16_t *mm_end;
1250 uint8_t *d = (uint8_t *)dst;
1251 const uint16_t *s = (uint16_t *)src;
1252 end = s + src_size/2;
1254 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1255 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1261 "movq %1, %%mm0\n\t"
1262 "movq %1, %%mm1\n\t"
1263 "movq %1, %%mm2\n\t"
1264 "pand %2, %%mm0\n\t"
1265 "pand %3, %%mm1\n\t"
1266 "pand %4, %%mm2\n\t"
1267 "psllq $3, %%mm0\n\t"
1268 "psrlq $3, %%mm1\n\t"
1269 "psrlq $8, %%mm2\n\t"
1270 "movq %%mm0, %%mm3\n\t"
1271 "movq %%mm1, %%mm4\n\t"
1272 "movq %%mm2, %%mm5\n\t"
1273 "punpcklwd %%mm7, %%mm0\n\t"
1274 "punpcklwd %%mm7, %%mm1\n\t"
1275 "punpcklwd %%mm7, %%mm2\n\t"
1276 "punpckhwd %%mm7, %%mm3\n\t"
1277 "punpckhwd %%mm7, %%mm4\n\t"
1278 "punpckhwd %%mm7, %%mm5\n\t"
1279 "psllq $8, %%mm1\n\t"
1280 "psllq $16, %%mm2\n\t"
1281 "por %%mm1, %%mm0\n\t"
1282 "por %%mm2, %%mm0\n\t"
1283 "psllq $8, %%mm4\n\t"
1284 "psllq $16, %%mm5\n\t"
1285 "por %%mm4, %%mm3\n\t"
1286 "por %%mm5, %%mm3\n\t"
1287 MOVNTQ" %%mm0, %0\n\t"
1288 MOVNTQ" %%mm3, 8%0\n\t"
1290 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1295 __asm __volatile(SFENCE:::"memory");
1296 __asm __volatile(EMMS:::"memory");
1300 register uint16_t bgr;
1302 *d++ = (bgr&0x1F)<<3;
1303 *d++ = (bgr&0x7E0)>>3;
1304 *d++ = (bgr&0xF800)>>8;
1309 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1312 /* TODO: unroll this loop */
1314 "xorl %%eax, %%eax \n\t"
1317 PREFETCH" 32(%0, %%eax) \n\t"
1318 "movq (%0, %%eax), %%mm0 \n\t"
1319 "movq %%mm0, %%mm1 \n\t"
1320 "movq %%mm0, %%mm2 \n\t"
1321 "pslld $16, %%mm0 \n\t"
1322 "psrld $16, %%mm1 \n\t"
1323 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1324 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1325 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1326 "por %%mm0, %%mm2 \n\t"
1327 "por %%mm1, %%mm2 \n\t"
1328 MOVNTQ" %%mm2, (%1, %%eax) \n\t"
1329 "addl $8, %%eax \n\t"
1330 "cmpl %2, %%eax \n\t"
1332 :: "r" (src), "r"(dst), "r" (src_size-7)
1336 __asm __volatile(SFENCE:::"memory");
1337 __asm __volatile(EMMS:::"memory");
1340 unsigned num_pixels = src_size >> 2;
1341 for(i=0; i<num_pixels; i++)
1343 #ifdef WORDS_BIGENDIAN
1344 dst[4*i + 1] = src[4*i + 3];
1345 dst[4*i + 2] = src[4*i + 2];
1346 dst[4*i + 3] = src[4*i + 1];
1348 dst[4*i + 0] = src[4*i + 2];
1349 dst[4*i + 1] = src[4*i + 1];
1350 dst[4*i + 2] = src[4*i + 0];
1356 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1360 int mmx_size= 23 - src_size;
1362 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1363 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1364 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1367 PREFETCH" 32(%1, %%eax) \n\t"
1368 "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1369 "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
1370 "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
1371 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1372 "pand %%mm5, %%mm0 \n\t"
1373 "pand %%mm6, %%mm1 \n\t"
1374 "pand %%mm7, %%mm2 \n\t"
1375 "por %%mm0, %%mm1 \n\t"
1376 "por %%mm2, %%mm1 \n\t"
1377 "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1378 MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
1379 "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
1380 "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
1381 "pand %%mm7, %%mm0 \n\t"
1382 "pand %%mm5, %%mm1 \n\t"
1383 "pand %%mm6, %%mm2 \n\t"
1384 "por %%mm0, %%mm1 \n\t"
1385 "por %%mm2, %%mm1 \n\t"
1386 "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
1387 MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
1388 "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
1389 "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
1390 "pand %%mm6, %%mm0 \n\t"
1391 "pand %%mm7, %%mm1 \n\t"
1392 "pand %%mm5, %%mm2 \n\t"
1393 "por %%mm0, %%mm1 \n\t"
1394 "por %%mm2, %%mm1 \n\t"
1395 MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
1396 "addl $24, %%eax \n\t"
1399 : "r" (src-mmx_size), "r"(dst-mmx_size)
1402 __asm __volatile(SFENCE:::"memory");
1403 __asm __volatile(EMMS:::"memory");
1405 if(mmx_size==23) return; //finihsed, was multiple of 8
1409 src_size= 23-mmx_size;
1413 for(i=0; i<src_size; i+=3)
1417 dst[i + 1] = src[i + 1];
1418 dst[i + 2] = src[i + 0];
1423 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1424 unsigned int width, unsigned int height,
1425 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1428 const unsigned chromWidth= width>>1;
1429 for(y=0; y<height; y++)
1432 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1434 "xorl %%eax, %%eax \n\t"
1437 PREFETCH" 32(%1, %%eax, 2) \n\t"
1438 PREFETCH" 32(%2, %%eax) \n\t"
1439 PREFETCH" 32(%3, %%eax) \n\t"
1440 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
1441 "movq %%mm0, %%mm2 \n\t" // U(0)
1442 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
1443 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1444 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1446 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
1447 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
1448 "movq %%mm3, %%mm4 \n\t" // Y(0)
1449 "movq %%mm5, %%mm6 \n\t" // Y(8)
1450 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1451 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1452 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1453 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1455 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
1456 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
1457 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
1458 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1460 "addl $8, %%eax \n\t"
1461 "cmpl %4, %%eax \n\t"
1463 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1468 #if defined ARCH_ALPHA && defined HAVE_MVI
1469 #define pl2yuy2(n) \
1474 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1475 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1476 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1477 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1478 yuv1 = (u << 8) + (v << 24); \
1485 uint64_t *qdst = (uint64_t *) dst;
1486 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1487 const uint32_t *yc = (uint32_t *) ysrc;
1488 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1489 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1490 for(i = 0; i < chromWidth; i += 8){
1491 uint64_t y1, y2, yuv1, yuv2;
1494 asm("ldq $31,64(%0)" :: "r"(yc));
1495 asm("ldq $31,64(%0)" :: "r"(yc2));
1496 asm("ldq $31,64(%0)" :: "r"(uc));
1497 asm("ldq $31,64(%0)" :: "r"(vc));
1515 #elif __WORDSIZE >= 64
1517 uint64_t *ldst = (uint64_t *) dst;
1518 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1519 for(i = 0; i < chromWidth; i += 2){
1521 k = yc[0] + (uc[0] << 8) +
1522 (yc[1] << 16) + (vc[0] << 24);
1523 l = yc[2] + (uc[1] << 8) +
1524 (yc[3] << 16) + (vc[1] << 24);
1525 *ldst++ = k + (l << 32);
1532 int i, *idst = (int32_t *) dst;
1533 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1534 for(i = 0; i < chromWidth; i++){
1535 #ifdef WORDS_BIGENDIAN
1536 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1537 (yc[1] << 8) + (vc[0] << 0);
1539 *idst++ = yc[0] + (uc[0] << 8) +
1540 (yc[1] << 16) + (vc[0] << 24);
1548 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1550 usrc += chromStride;
1551 vsrc += chromStride;
1565 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1566 * problem for anyone then tell me, and ill fix it)
1568 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1569 unsigned int width, unsigned int height,
1570 int lumStride, int chromStride, int dstStride)
1572 //FIXME interpolate chroma
1573 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Pack planar YUV into interleaved UYVY.
 * Each chroma line is reused for vertLumPerChroma consecutive luma lines
 * (2 => 4:2:0 input, 1 => 4:2:2 input); see the stride advance at the
 * bottom of the per-line loop.  The MMX path produces 16 output pixels
 * (32 bytes) per iteration; the C fallbacks pack via 64-bit or 32-bit
 * word stores depending on __WORDSIZE and WORDS_BIGENDIAN.
 */
1576 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1577 unsigned int width, unsigned int height,
1578 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1581 const unsigned chromWidth= width>>1;
1582 for(y=0; y<height; y++)
1585 //FIXME handle 2 lines at once (fewer prefetches, reuse of some chroma, but very likely limited by memory bandwidth anyway)
1587 "xorl %%eax, %%eax \n\t"
1590 PREFETCH" 32(%1, %%eax, 2) \n\t"
1591 PREFETCH" 32(%2, %%eax) \n\t"
1592 PREFETCH" 32(%3, %%eax) \n\t"
1593 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
1594 "movq %%mm0, %%mm2 \n\t" // U(0)
1595 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
1596 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1597 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1599 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
1600 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
1601 "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
1602 "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
1603 "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1604 "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1605 "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1606 "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1608 MOVNTQ" %%mm0, (%0, %%eax, 4) \n\t"
1609 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
1610 MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t"
1611 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1613 "addl $8, %%eax \n\t"
1614 "cmpl %4, %%eax \n\t"
1616 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1620 //FIXME adapt the alpha asm code from yv12->yuy2
/* C fallback: 64-bit word stores, two output pixels (one UYVY pair) per 32 bits. */
1622 #if __WORDSIZE >= 64
1624 uint64_t *ldst = (uint64_t *) dst;
1625 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1626 for(i = 0; i < chromWidth; i += 2){
1628 k = uc[0] + (yc[0] << 8) +
1629 (vc[0] << 16) + (yc[1] << 24);
1630 l = uc[1] + (yc[2] << 8) +
1631 (vc[1] << 16) + (yc[3] << 24);
1632 *ldst++ = k + (l << 32);
/* 32-bit word store fallback; byte order depends on host endianness. */
1639 int i, *idst = (int32_t *) dst;
1640 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1641 for(i = 0; i < chromWidth; i++){
1642 #ifdef WORDS_BIGENDIAN
1643 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1644 (vc[0] << 8) + (yc[1] << 0);
1646 *idst++ = uc[0] + (yc[0] << 8) +
1647 (vc[0] << 16) + (yc[1] << 24);
/* Advance chroma pointers only once every vertLumPerChroma luma lines. */
1655 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1657 usrc += chromStride;
1658 vsrc += chromStride;
1672 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1673 * problem for anyone then tell me, and I'll fix it)
/*
 * Pack planar YV12 (4:2:0) into interleaved UYVY.
 * Thin wrapper: vertLumPerChroma=2 reuses each chroma line for two luma lines.
 */
1675 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1676 unsigned int width, unsigned int height,
1677 int lumStride, int chromStride, int dstStride)
1679 //FIXME interpolate chroma
1680 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1685 * width should be a multiple of 16
/*
 * Pack planar 4:2:2 YUV into interleaved YUY2.
 * Thin wrapper: vertLumPerChroma=1 means every luma line has its own chroma line.
 */
1687 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1688 unsigned int width, unsigned int height,
1689 int lumStride, int chromStride, int dstStride)
1691 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1696 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1697 * problem for anyone then tell me, and I'll fix it)
/*
 * Unpack interleaved YUY2 into planar YV12 (4:2:0).
 * Processes two source lines per outer iteration: the first line supplies
 * luma AND the (horizontally packed) chroma for the pair, the second line
 * supplies luma only — its chroma bytes are discarded.
 * mm7 is the 0x00FF... byte mask used to separate Y (even bytes) from
 * U/V (odd bytes).  C fallback at the bottom mirrors the same split.
 */
1699 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1700 unsigned int width, unsigned int height,
1701 int lumStride, int chromStride, int srcStride)
1704 const unsigned chromWidth= width>>1;
1705 for(y=0; y<height; y+=2)
/* First line: extract 16 Y and 8 interleaved UV pairs per iteration. */
1709 "xorl %%eax, %%eax \n\t"
1710 "pcmpeqw %%mm7, %%mm7 \n\t"
1711 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1714 PREFETCH" 64(%0, %%eax, 4) \n\t"
1715 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1716 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1717 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1718 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1719 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1720 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1721 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1722 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1723 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1724 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1726 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1728 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
1729 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
1730 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1731 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1732 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1733 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1734 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1735 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1736 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1737 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1739 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
/* Split the packed UV words into separate U and V planes. */
1741 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1742 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1743 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1744 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1745 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1746 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1747 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1748 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1750 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1751 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1753 "addl $8, %%eax \n\t"
1754 "cmpl %4, %%eax \n\t"
1756 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Second line: luma only; chroma bytes of this line are dropped. */
1764 "xorl %%eax, %%eax \n\t"
1767 PREFETCH" 64(%0, %%eax, 4) \n\t"
1768 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1769 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1770 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
1771 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
1772 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1773 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1774 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1775 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1776 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1777 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1779 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1780 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1782 "addl $8, %%eax \n\t"
1783 "cmpl %4, %%eax \n\t"
1786 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Plain C fallback: first line takes Y + chroma, ... */
1791 for(i=0; i<chromWidth; i++)
1793 ydst[2*i+0] = src[4*i+0];
1794 udst[i] = src[4*i+1];
1795 ydst[2*i+1] = src[4*i+2];
1796 vdst[i] = src[4*i+3];
/* ... second line takes Y only. */
1801 for(i=0; i<chromWidth; i++)
1803 ydst[2*i+0] = src[4*i+0];
1804 ydst[2*i+1] = src[4*i+2];
1807 udst += chromStride;
1808 vdst += chromStride;
1813 asm volatile( EMMS" \n\t"
/*
 * YVU9 (4x4-subsampled chroma) -> YV12.  Only the luma plane is handled
 * in the visible code; chroma upscaling is not implemented (see XXX).
 * NOTE(review): the memcpy copies the Y plane as one contiguous block and
 * ignores lumStride — presumably callers pass lumStride == width; confirm
 * against the full file.
 */
1819 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1820 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1821 unsigned int width, unsigned int height, int lumStride, int chromStride)
1824 memcpy(ydst, ysrc, width*height);
1826 /* XXX: implement upscaling for U,V */
/*
 * Upscale a single 8-bit plane to 2x width and 2x height using a
 * [3 1; 1 3]/4 bilinear kernel (see the (3*a+b)>>2 expressions in the
 * C path).  The MMX2/3DNow path approximates the same weights with
 * chained PAVGB averages.  Edge rows/columns are replicated or filtered
 * horizontally only.
 */
1829 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
/* First output row: horizontal 3:1 filtering only (no vertical neighbor). */
1836 for(x=0; x<srcWidth-1; x++){
1837 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1838 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1840 dst[2*srcWidth-1]= src[srcWidth-1];
1844 for(y=1; y<srcHeight; y++){
1845 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* MMX width rounded down to a multiple of 16; C loop below finishes the row. */
1846 const int mmxSize= srcWidth&~15;
1848 "movl %4, %%eax \n\t"
1850 "movq (%0, %%eax), %%mm0 \n\t"
1851 "movq (%1, %%eax), %%mm1 \n\t"
1852 "movq 1(%0, %%eax), %%mm2 \n\t"
1853 "movq 1(%1, %%eax), %%mm3 \n\t"
1854 "movq -1(%0, %%eax), %%mm4 \n\t"
1855 "movq -1(%1, %%eax), %%mm5 \n\t"
/* Repeated PAVGB approximates the 3:1 weighting of the bilinear kernel. */
1856 PAVGB" %%mm0, %%mm5 \n\t"
1857 PAVGB" %%mm0, %%mm3 \n\t"
1858 PAVGB" %%mm0, %%mm5 \n\t"
1859 PAVGB" %%mm0, %%mm3 \n\t"
1860 PAVGB" %%mm1, %%mm4 \n\t"
1861 PAVGB" %%mm1, %%mm2 \n\t"
1862 PAVGB" %%mm1, %%mm4 \n\t"
1863 PAVGB" %%mm1, %%mm2 \n\t"
1864 "movq %%mm5, %%mm7 \n\t"
1865 "movq %%mm4, %%mm6 \n\t"
1866 "punpcklbw %%mm3, %%mm5 \n\t"
1867 "punpckhbw %%mm3, %%mm7 \n\t"
1868 "punpcklbw %%mm2, %%mm4 \n\t"
1869 "punpckhbw %%mm2, %%mm6 \n\t"
1871 MOVNTQ" %%mm5, (%2, %%eax, 2) \n\t"
1872 MOVNTQ" %%mm7, 8(%2, %%eax, 2) \n\t"
1873 MOVNTQ" %%mm4, (%3, %%eax, 2) \n\t"
1874 MOVNTQ" %%mm6, 8(%3, %%eax, 2) \n\t"
1876 "movq %%mm5, (%2, %%eax, 2) \n\t"
1877 "movq %%mm7, 8(%2, %%eax, 2) \n\t"
1878 "movq %%mm4, (%3, %%eax, 2) \n\t"
1879 "movq %%mm6, 8(%3, %%eax, 2) \n\t"
1881 "addl $8, %%eax \n\t"
1883 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1884 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
/* Non-MMX build: start the C loop at column 1 and do column 0 by hand. */
1890 const int mmxSize=1;
1892 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1893 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
/* C tail: full 2x2 bilinear for the remaining columns of this line pair. */
1895 for(x=mmxSize-1; x<srcWidth-1; x++){
1896 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1897 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1898 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1899 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1901 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1902 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* Last output row: horizontal filtering only, like the first row. */
1912 for(x=0; x<srcWidth-1; x++){
1913 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1914 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1916 dst[2*srcWidth-1]= src[srcWidth-1];
1918 for(x=0; x<srcWidth; x++){
1925 asm volatile( EMMS" \n\t"
1933 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1934 * problem for anyone then tell me, and I'll fix it)
1935 * chrominance data is only taken from every second line; other lines are ignored. FIXME write HQ version
/*
 * Unpack interleaved UYVY into planar YV12 (4:2:0).
 * Mirror image of yuy2toyv12: in UYVY the chroma occupies the even bytes
 * and luma the odd bytes, so the pand/psrlw roles are swapped relative to
 * the YUY2 variant.  Two source lines per outer iteration; chroma is taken
 * from the first line only.
 */
1937 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1938 unsigned int width, unsigned int height,
1939 int lumStride, int chromStride, int srcStride)
1942 const unsigned chromWidth= width>>1;
1943 for(y=0; y<height; y+=2)
/* First line: extract 16 Y and 8 UV pairs per iteration. */
1947 "xorl %%eax, %%eax \n\t"
1948 "pcmpeqw %%mm7, %%mm7 \n\t"
1949 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1952 PREFETCH" 64(%0, %%eax, 4) \n\t"
1953 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1954 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1955 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1956 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1957 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1958 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1959 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1960 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1961 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1962 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1964 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1966 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
1967 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
1968 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1969 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1970 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1971 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1972 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1973 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1974 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1975 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1977 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
/* Split the packed UV words into separate U and V planes. */
1979 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1980 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1981 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1982 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1983 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1984 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1985 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1986 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1988 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1989 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1991 "addl $8, %%eax \n\t"
1992 "cmpl %4, %%eax \n\t"
1994 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Second line: luma only (odd bytes); this line's chroma is dropped. */
2002 "xorl %%eax, %%eax \n\t"
2005 PREFETCH" 64(%0, %%eax, 4) \n\t"
2006 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2007 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2008 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
2009 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
2010 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2011 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2012 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2013 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2014 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2015 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2017 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2018 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2020 "addl $8, %%eax \n\t"
2021 "cmpl %4, %%eax \n\t"
2024 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Plain C fallback: first line takes Y + chroma, ... */
2029 for(i=0; i<chromWidth; i++)
2031 udst[i] = src[4*i+0];
2032 ydst[2*i+0] = src[4*i+1];
2033 vdst[i] = src[4*i+2];
2034 ydst[2*i+1] = src[4*i+3];
/* ... second line takes Y only. */
2039 for(i=0; i<chromWidth; i++)
2041 ydst[2*i+0] = src[4*i+1];
2042 ydst[2*i+1] = src[4*i+3];
2045 udst += chromStride;
2046 vdst += chromStride;
2051 asm volatile( EMMS" \n\t"
2059 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2060 * problem for anyone then tell me, and I'll fix it)
2061 * chrominance data is only taken from every second line; other lines are ignored in the C version. FIXME write HQ version
/*
 * Convert packed 24-bit BGR to planar YV12.
 * MMX path: processes two input rows per outer iteration; luma is computed
 * for every pixel with pmaddwd against the bgr2YCoeff table, chroma is
 * averaged over each 2x2 pixel block (PAVGB on MMX2/3DNow, paddw+psrlw $2
 * otherwise) before pmaddwd against bgr2UCoeff/bgr2VCoeff.
 * FAST_BGR2YV12 skips the intermediate psrad $8 rounding stage.
 * The C path below handles the last rows (and all rows without MMX),
 * taking chroma from a single pixel of each 2x2 block.
 */
2063 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2064 unsigned int width, unsigned int height,
2065 int lumStride, int chromStride, int srcStride)
2068 const unsigned chromWidth= width>>1;
2070 for(y=0; y<height-2; y+=2)
/* Luma pass: 8 Y values per iteration from 24 input bytes.
   eax counts up from -width to 0; ebx = 3*eax indexes the BGR bytes. */
2076 "movl %2, %%eax \n\t"
2077 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2078 "movq "MANGLE(w1111)", %%mm5 \n\t"
2079 "pxor %%mm7, %%mm7 \n\t"
2080 "leal (%%eax, %%eax, 2), %%ebx \n\t"
2083 PREFETCH" 64(%0, %%ebx) \n\t"
2084 "movd (%0, %%ebx), %%mm0 \n\t"
2085 "movd 3(%0, %%ebx), %%mm1 \n\t"
2086 "punpcklbw %%mm7, %%mm0 \n\t"
2087 "punpcklbw %%mm7, %%mm1 \n\t"
2088 "movd 6(%0, %%ebx), %%mm2 \n\t"
2089 "movd 9(%0, %%ebx), %%mm3 \n\t"
2090 "punpcklbw %%mm7, %%mm2 \n\t"
2091 "punpcklbw %%mm7, %%mm3 \n\t"
2092 "pmaddwd %%mm6, %%mm0 \n\t"
2093 "pmaddwd %%mm6, %%mm1 \n\t"
2094 "pmaddwd %%mm6, %%mm2 \n\t"
2095 "pmaddwd %%mm6, %%mm3 \n\t"
2096 #ifndef FAST_BGR2YV12
2097 "psrad $8, %%mm0 \n\t"
2098 "psrad $8, %%mm1 \n\t"
2099 "psrad $8, %%mm2 \n\t"
2100 "psrad $8, %%mm3 \n\t"
2102 "packssdw %%mm1, %%mm0 \n\t"
2103 "packssdw %%mm3, %%mm2 \n\t"
2104 "pmaddwd %%mm5, %%mm0 \n\t"
2105 "pmaddwd %%mm5, %%mm2 \n\t"
2106 "packssdw %%mm2, %%mm0 \n\t"
2107 "psraw $7, %%mm0 \n\t"
2109 "movd 12(%0, %%ebx), %%mm4 \n\t"
2110 "movd 15(%0, %%ebx), %%mm1 \n\t"
2111 "punpcklbw %%mm7, %%mm4 \n\t"
2112 "punpcklbw %%mm7, %%mm1 \n\t"
2113 "movd 18(%0, %%ebx), %%mm2 \n\t"
2114 "movd 21(%0, %%ebx), %%mm3 \n\t"
2115 "punpcklbw %%mm7, %%mm2 \n\t"
2116 "punpcklbw %%mm7, %%mm3 \n\t"
2117 "pmaddwd %%mm6, %%mm4 \n\t"
2118 "pmaddwd %%mm6, %%mm1 \n\t"
2119 "pmaddwd %%mm6, %%mm2 \n\t"
2120 "pmaddwd %%mm6, %%mm3 \n\t"
2121 #ifndef FAST_BGR2YV12
2122 "psrad $8, %%mm4 \n\t"
2123 "psrad $8, %%mm1 \n\t"
2124 "psrad $8, %%mm2 \n\t"
2125 "psrad $8, %%mm3 \n\t"
2127 "packssdw %%mm1, %%mm4 \n\t"
2128 "packssdw %%mm3, %%mm2 \n\t"
2129 "pmaddwd %%mm5, %%mm4 \n\t"
2130 "pmaddwd %%mm5, %%mm2 \n\t"
2131 "addl $24, %%ebx \n\t"
2132 "packssdw %%mm2, %%mm4 \n\t"
2133 "psraw $7, %%mm4 \n\t"
2135 "packuswb %%mm4, %%mm0 \n\t"
2136 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2138 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
2139 "addl $8, %%eax \n\t"
2141 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
/* Chroma pass: 4 U and 4 V per iteration, each from a 2x2 block averaged
   across the two input rows (%0 and %1 = src and src+srcStride). */
2149 "movl %4, %%eax \n\t"
2150 "movq "MANGLE(w1111)", %%mm5 \n\t"
2151 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2152 "pxor %%mm7, %%mm7 \n\t"
2153 "leal (%%eax, %%eax, 2), %%ebx \n\t"
2154 "addl %%ebx, %%ebx \n\t"
2157 PREFETCH" 64(%0, %%ebx) \n\t"
2158 PREFETCH" 64(%1, %%ebx) \n\t"
2159 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* PAVGB variant of the 2x2 average. */
2160 "movq (%0, %%ebx), %%mm0 \n\t"
2161 "movq (%1, %%ebx), %%mm1 \n\t"
2162 "movq 6(%0, %%ebx), %%mm2 \n\t"
2163 "movq 6(%1, %%ebx), %%mm3 \n\t"
2164 PAVGB" %%mm1, %%mm0 \n\t"
2165 PAVGB" %%mm3, %%mm2 \n\t"
2166 "movq %%mm0, %%mm1 \n\t"
2167 "movq %%mm2, %%mm3 \n\t"
2168 "psrlq $24, %%mm0 \n\t"
2169 "psrlq $24, %%mm2 \n\t"
2170 PAVGB" %%mm1, %%mm0 \n\t"
2171 PAVGB" %%mm3, %%mm2 \n\t"
2172 "punpcklbw %%mm7, %%mm0 \n\t"
2173 "punpcklbw %%mm7, %%mm2 \n\t"
/* Plain-MMX variant: sum four pixels, then psrlw $2. */
2175 "movd (%0, %%ebx), %%mm0 \n\t"
2176 "movd (%1, %%ebx), %%mm1 \n\t"
2177 "movd 3(%0, %%ebx), %%mm2 \n\t"
2178 "movd 3(%1, %%ebx), %%mm3 \n\t"
2179 "punpcklbw %%mm7, %%mm0 \n\t"
2180 "punpcklbw %%mm7, %%mm1 \n\t"
2181 "punpcklbw %%mm7, %%mm2 \n\t"
2182 "punpcklbw %%mm7, %%mm3 \n\t"
2183 "paddw %%mm1, %%mm0 \n\t"
2184 "paddw %%mm3, %%mm2 \n\t"
2185 "paddw %%mm2, %%mm0 \n\t"
2186 "movd 6(%0, %%ebx), %%mm4 \n\t"
2187 "movd 6(%1, %%ebx), %%mm1 \n\t"
2188 "movd 9(%0, %%ebx), %%mm2 \n\t"
2189 "movd 9(%1, %%ebx), %%mm3 \n\t"
2190 "punpcklbw %%mm7, %%mm4 \n\t"
2191 "punpcklbw %%mm7, %%mm1 \n\t"
2192 "punpcklbw %%mm7, %%mm2 \n\t"
2193 "punpcklbw %%mm7, %%mm3 \n\t"
2194 "paddw %%mm1, %%mm4 \n\t"
2195 "paddw %%mm3, %%mm2 \n\t"
2196 "paddw %%mm4, %%mm2 \n\t"
2197 "psrlw $2, %%mm0 \n\t"
2198 "psrlw $2, %%mm2 \n\t"
2200 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2201 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2203 "pmaddwd %%mm0, %%mm1 \n\t"
2204 "pmaddwd %%mm2, %%mm3 \n\t"
2205 "pmaddwd %%mm6, %%mm0 \n\t"
2206 "pmaddwd %%mm6, %%mm2 \n\t"
2207 #ifndef FAST_BGR2YV12
2208 "psrad $8, %%mm0 \n\t"
2209 "psrad $8, %%mm1 \n\t"
2210 "psrad $8, %%mm2 \n\t"
2211 "psrad $8, %%mm3 \n\t"
2213 "packssdw %%mm2, %%mm0 \n\t"
2214 "packssdw %%mm3, %%mm1 \n\t"
2215 "pmaddwd %%mm5, %%mm0 \n\t"
2216 "pmaddwd %%mm5, %%mm1 \n\t"
2217 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2218 "psraw $7, %%mm0 \n\t"
/* Second pair of 2x2 blocks (pixels 4..7 of the row pair). */
2220 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2221 "movq 12(%0, %%ebx), %%mm4 \n\t"
2222 "movq 12(%1, %%ebx), %%mm1 \n\t"
2223 "movq 18(%0, %%ebx), %%mm2 \n\t"
2224 "movq 18(%1, %%ebx), %%mm3 \n\t"
2225 PAVGB" %%mm1, %%mm4 \n\t"
2226 PAVGB" %%mm3, %%mm2 \n\t"
2227 "movq %%mm4, %%mm1 \n\t"
2228 "movq %%mm2, %%mm3 \n\t"
2229 "psrlq $24, %%mm4 \n\t"
2230 "psrlq $24, %%mm2 \n\t"
2231 PAVGB" %%mm1, %%mm4 \n\t"
2232 PAVGB" %%mm3, %%mm2 \n\t"
2233 "punpcklbw %%mm7, %%mm4 \n\t"
2234 "punpcklbw %%mm7, %%mm2 \n\t"
2236 "movd 12(%0, %%ebx), %%mm4 \n\t"
2237 "movd 12(%1, %%ebx), %%mm1 \n\t"
2238 "movd 15(%0, %%ebx), %%mm2 \n\t"
2239 "movd 15(%1, %%ebx), %%mm3 \n\t"
2240 "punpcklbw %%mm7, %%mm4 \n\t"
2241 "punpcklbw %%mm7, %%mm1 \n\t"
2242 "punpcklbw %%mm7, %%mm2 \n\t"
2243 "punpcklbw %%mm7, %%mm3 \n\t"
2244 "paddw %%mm1, %%mm4 \n\t"
2245 "paddw %%mm3, %%mm2 \n\t"
2246 "paddw %%mm2, %%mm4 \n\t"
2247 "movd 18(%0, %%ebx), %%mm5 \n\t"
2248 "movd 18(%1, %%ebx), %%mm1 \n\t"
2249 "movd 21(%0, %%ebx), %%mm2 \n\t"
2250 "movd 21(%1, %%ebx), %%mm3 \n\t"
2251 "punpcklbw %%mm7, %%mm5 \n\t"
2252 "punpcklbw %%mm7, %%mm1 \n\t"
2253 "punpcklbw %%mm7, %%mm2 \n\t"
2254 "punpcklbw %%mm7, %%mm3 \n\t"
2255 "paddw %%mm1, %%mm5 \n\t"
2256 "paddw %%mm3, %%mm2 \n\t"
2257 "paddw %%mm5, %%mm2 \n\t"
2258 "movq "MANGLE(w1111)", %%mm5 \n\t"
2259 "psrlw $2, %%mm4 \n\t"
2260 "psrlw $2, %%mm2 \n\t"
2262 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2263 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2265 "pmaddwd %%mm4, %%mm1 \n\t"
2266 "pmaddwd %%mm2, %%mm3 \n\t"
2267 "pmaddwd %%mm6, %%mm4 \n\t"
2268 "pmaddwd %%mm6, %%mm2 \n\t"
2269 #ifndef FAST_BGR2YV12
2270 "psrad $8, %%mm4 \n\t"
2271 "psrad $8, %%mm1 \n\t"
2272 "psrad $8, %%mm2 \n\t"
2273 "psrad $8, %%mm3 \n\t"
2275 "packssdw %%mm2, %%mm4 \n\t"
2276 "packssdw %%mm3, %%mm1 \n\t"
2277 "pmaddwd %%mm5, %%mm4 \n\t"
2278 "pmaddwd %%mm5, %%mm1 \n\t"
2279 "addl $24, %%ebx \n\t"
2280 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2281 "psraw $7, %%mm4 \n\t"
/* Merge both halves, add the UV offset, store 4 U bytes and 4 V bytes. */
2283 "movq %%mm0, %%mm1 \n\t"
2284 "punpckldq %%mm4, %%mm0 \n\t"
2285 "punpckhdq %%mm4, %%mm1 \n\t"
2286 "packsswb %%mm1, %%mm0 \n\t"
2287 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2289 "movd %%mm0, (%2, %%eax) \n\t"
2290 "punpckhdq %%mm0, %%mm0 \n\t"
2291 "movd %%mm0, (%3, %%eax) \n\t"
2292 "addl $4, %%eax \n\t"
2294 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2298 udst += chromStride;
2299 vdst += chromStride;
2303 asm volatile( EMMS" \n\t"
/* C remainder: last rows after the MMX loop, or all rows without MMX.
   Chroma here is point-sampled from one pixel per 2x2 block. */
2309 for(; y<height; y+=2)
2312 for(i=0; i<chromWidth; i++)
2314 unsigned int b= src[6*i+0];
2315 unsigned int g= src[6*i+1];
2316 unsigned int r= src[6*i+2];
2318 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2319 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2320 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2330 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/* Second line of the pair: luma only. */
2336 for(i=0; i<chromWidth; i++)
2338 unsigned int b= src[6*i+0];
2339 unsigned int g= src[6*i+1];
2340 unsigned int r= src[6*i+2];
2342 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2350 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2353 udst += chromStride;
2354 vdst += chromStride;
/*
 * Byte-interleave two planes: dest[2w] = src1[w], dest[2w+1] = src2[w],
 * line by line.  SSE2 path handles 16 input bytes per iteration, MMX path
 * the same via two 8-byte loads; the scalar tail finishes widths that are
 * not a multiple of 16.
 * NOTE(review): the SSE2 path uses movdqa (aligned loads) on src1/src2 —
 * presumably callers guarantee 16-byte alignment; confirm at call sites.
 */
2360 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2361 unsigned width, unsigned height, int src1Stride,
2362 int src2Stride, int dstStride){
2365 for(h=0; h < height; h++)
2372 "xorl %%eax, %%eax \n\t"
2374 PREFETCH" 64(%1, %%eax) \n\t"
2375 PREFETCH" 64(%2, %%eax) \n\t"
2376 "movdqa (%1, %%eax), %%xmm0 \n\t"
2377 "movdqa (%1, %%eax), %%xmm1 \n\t"
2378 "movdqa (%2, %%eax), %%xmm2 \n\t"
2379 "punpcklbw %%xmm2, %%xmm0 \n\t"
2380 "punpckhbw %%xmm2, %%xmm1 \n\t"
2381 "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2382 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2383 "addl $16, %%eax \n\t"
2384 "cmpl %3, %%eax \n\t"
2386 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2391 "xorl %%eax, %%eax \n\t"
2393 PREFETCH" 64(%1, %%eax) \n\t"
2394 PREFETCH" 64(%2, %%eax) \n\t"
2395 "movq (%1, %%eax), %%mm0 \n\t"
2396 "movq 8(%1, %%eax), %%mm2 \n\t"
2397 "movq %%mm0, %%mm1 \n\t"
2398 "movq %%mm2, %%mm3 \n\t"
2399 "movq (%2, %%eax), %%mm4 \n\t"
2400 "movq 8(%2, %%eax), %%mm5 \n\t"
2401 "punpcklbw %%mm4, %%mm0 \n\t"
2402 "punpckhbw %%mm4, %%mm1 \n\t"
2403 "punpcklbw %%mm5, %%mm2 \n\t"
2404 "punpckhbw %%mm5, %%mm3 \n\t"
2405 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
2406 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
2407 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2408 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2409 "addl $16, %%eax \n\t"
2410 "cmpl %3, %%eax \n\t"
2412 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
/* Scalar tail for the last width&15 bytes of the line. */
2416 for(w= (width&(~15)); w < width; w++)
2418 dest[2*w+0] = src1[w];
2419 dest[2*w+1] = src2[w];
/* Pure C version: whole line, one byte pair at a time. */
2422 for(w=0; w < width; w++)
2424 dest[2*w+0] = src1[w];
2425 dest[2*w+1] = src2[w];
/*
 * Upscale two chroma planes (src1, src2) by 2x in both directions into
 * dst1/dst2 using plain pixel doubling: each source byte is duplicated
 * horizontally (punpcklbw/punpckhbw with itself), and each source line is
 * used for two destination lines (note the srcStride*(y>>1) addressing).
 * The MMX loop expands 32 source bytes to 64 destination bytes per
 * iteration; a scalar tail finishes each line.
 */
2441 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2442 uint8_t *dst1, uint8_t *dst2,
2443 unsigned width, unsigned height,
2444 int srcStride1, int srcStride2,
2445 int dstStride1, int dstStride2)
2449 w=width/2; h=height/2;
2454 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* First plane: vertical doubling via y>>1 source addressing. */
2457 const uint8_t* s1=src1+srcStride1*(y>>1);
2458 uint8_t* d=dst1+dstStride1*y;
2465 "movq %1, %%mm0\n\t"
2466 "movq 8%1, %%mm2\n\t"
2467 "movq 16%1, %%mm4\n\t"
2468 "movq 24%1, %%mm6\n\t"
2469 "movq %%mm0, %%mm1\n\t"
2470 "movq %%mm2, %%mm3\n\t"
2471 "movq %%mm4, %%mm5\n\t"
2472 "movq %%mm6, %%mm7\n\t"
/* Unpacking a register with itself duplicates every byte. */
2473 "punpcklbw %%mm0, %%mm0\n\t"
2474 "punpckhbw %%mm1, %%mm1\n\t"
2475 "punpcklbw %%mm2, %%mm2\n\t"
2476 "punpckhbw %%mm3, %%mm3\n\t"
2477 "punpcklbw %%mm4, %%mm4\n\t"
2478 "punpckhbw %%mm5, %%mm5\n\t"
2479 "punpcklbw %%mm6, %%mm6\n\t"
2480 "punpckhbw %%mm7, %%mm7\n\t"
2481 MOVNTQ" %%mm0, %0\n\t"
2482 MOVNTQ" %%mm1, 8%0\n\t"
2483 MOVNTQ" %%mm2, 16%0\n\t"
2484 MOVNTQ" %%mm3, 24%0\n\t"
2485 MOVNTQ" %%mm4, 32%0\n\t"
2486 MOVNTQ" %%mm5, 40%0\n\t"
2487 MOVNTQ" %%mm6, 48%0\n\t"
2488 MOVNTQ" %%mm7, 56%0"
/* Scalar tail: horizontal doubling one byte at a time. */
2494 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* Second plane: identical expansion for src2/dst2. */
2497 const uint8_t* s2=src2+srcStride2*(y>>1);
2498 uint8_t* d=dst2+dstStride2*y;
2505 "movq %1, %%mm0\n\t"
2506 "movq 8%1, %%mm2\n\t"
2507 "movq 16%1, %%mm4\n\t"
2508 "movq 24%1, %%mm6\n\t"
2509 "movq %%mm0, %%mm1\n\t"
2510 "movq %%mm2, %%mm3\n\t"
2511 "movq %%mm4, %%mm5\n\t"
2512 "movq %%mm6, %%mm7\n\t"
2513 "punpcklbw %%mm0, %%mm0\n\t"
2514 "punpckhbw %%mm1, %%mm1\n\t"
2515 "punpcklbw %%mm2, %%mm2\n\t"
2516 "punpckhbw %%mm3, %%mm3\n\t"
2517 "punpcklbw %%mm4, %%mm4\n\t"
2518 "punpckhbw %%mm5, %%mm5\n\t"
2519 "punpcklbw %%mm6, %%mm6\n\t"
2520 "punpckhbw %%mm7, %%mm7\n\t"
2521 MOVNTQ" %%mm0, %0\n\t"
2522 MOVNTQ" %%mm1, 8%0\n\t"
2523 MOVNTQ" %%mm2, 16%0\n\t"
2524 MOVNTQ" %%mm3, 24%0\n\t"
2525 MOVNTQ" %%mm4, 32%0\n\t"
2526 MOVNTQ" %%mm5, 40%0\n\t"
2527 MOVNTQ" %%mm6, 48%0\n\t"
2528 MOVNTQ" %%mm7, 56%0"
2534 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2545 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2547 unsigned width, unsigned height,
2548 int srcStride1, int srcStride2,
2549 int srcStride3, int dstStride)
2552 w=width/2; h=height;
2554 const uint8_t* yp=src1+srcStride1*y;
2555 const uint8_t* up=src2+srcStride2*(y>>2);
2556 const uint8_t* vp=src3+srcStride3*(y>>2);
2557 uint8_t* d=dst+dstStride*y;
2563 PREFETCH" 32(%1, %0)\n\t"
2564 PREFETCH" 32(%2, %0)\n\t"
2565 PREFETCH" 32(%3, %0)\n\t"
2566 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2567 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2568 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2569 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2570 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2571 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2572 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2573 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2574 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2575 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2577 "movq %%mm1, %%mm6\n\t"
2578 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2579 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2580 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2581 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2582 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2584 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2585 "movq 8(%1, %0, 4), %%mm0\n\t"
2586 "movq %%mm0, %%mm3\n\t"
2587 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2588 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2589 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2590 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2592 "movq %%mm4, %%mm6\n\t"
2593 "movq 16(%1, %0, 4), %%mm0\n\t"
2594 "movq %%mm0, %%mm3\n\t"
2595 "punpcklbw %%mm5, %%mm4\n\t"
2596 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2597 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2598 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2599 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2601 "punpckhbw %%mm5, %%mm6\n\t"
2602 "movq 24(%1, %0, 4), %%mm0\n\t"
2603 "movq %%mm0, %%mm3\n\t"
2604 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2605 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2606 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2607 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2610 : "r"(yp), "r" (up), "r"(vp), "r"(d)