3 * rgb2rgb.c, Software RGB to RGB convertor
4 * pluralize by Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
9 * lot of big-endian byteorder fixes by Alex Beregszaszi
13 #include <inttypes.h> /* for __WORDSIZE */
16 // #warning You have misconfigured system and probably will lose performance!
17 #define __WORDSIZE MP_WORDSIZE
35 #define PREFETCH "prefetch"
36 #define PREFETCHW "prefetchw"
37 #define PAVGB "pavgusb"
38 #elif defined ( HAVE_MMX2 )
39 #define PREFETCH "prefetchnta"
40 #define PREFETCHW "prefetcht0"
47 #define PREFETCH "/nop"
48 #define PREFETCHW "/nop"
53 /* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
60 #define MOVNTQ "movntq"
61 #define SFENCE "sfence"
/* RGB24 -> RGB32: widen packed 3-byte pixels to 4-byte pixels (MMX path).
 * mm7 holds mask32; each movd/punpckldq pair gathers two 24-bit pixels at
 * 3-byte offsets, pand clears the byte that spilled in from the next pixel,
 * and MOVNTQ streams the result to dst (hence the SFENCE at the end).
 * NOTE(review): this listing is decimated — lines are missing between the
 * visible ones; do not infer complete control flow from this fragment. */
71 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
74   const uint8_t *s = src;
77   const uint8_t *mm_end;
81 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
83 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
/* gather 8 source pixels (24 bytes) into mm0..mm3, two pixels each */
89  "punpckldq 3%1, %%mm0\n\t"
91  "punpckldq 9%1, %%mm1\n\t"
92  "movd 12%1, %%mm2\n\t"
93  "punpckldq 15%1, %%mm2\n\t"
94  "movd 18%1, %%mm3\n\t"
95  "punpckldq 21%1, %%mm3\n\t"
96  "pand %%mm7, %%mm0\n\t"
97  "pand %%mm7, %%mm1\n\t"
98  "pand %%mm7, %%mm2\n\t"
99  "pand %%mm7, %%mm3\n\t"
100   MOVNTQ" %%mm0, %0\n\t"
101   MOVNTQ" %%mm1, 8%0\n\t"
102   MOVNTQ" %%mm2, 16%0\n\t"
/* non-temporal stores must be fenced before the MMX state is cleared */
110 __asm __volatile(SFENCE:::"memory");
111 __asm __volatile(EMMS:::"memory");
115 #ifdef WORDS_BIGENDIAN
116 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
/* RGB32 -> RGB24: drop the unused 4th byte of each pixel (MMX path).
 * Four quadwords (8 pixels) are loaded, each pixel is compacted by merging
 * a copy shifted right by 8, then the 6-byte pixels pairs are re-packed
 * across register boundaries with the psllq/psrlq sequence below before
 * being streamed out as 24 contiguous bytes.
 * NOTE(review): decimated listing — interior lines are missing. */
131 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
134   const uint8_t *s = src;
137   const uint8_t *mm_end;
141 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
148  "movq 8%1, %%mm1\n\t"
149  "movq 16%1, %%mm4\n\t"
150  "movq 24%1, %%mm5\n\t"
151  "movq %%mm0, %%mm2\n\t"
152  "movq %%mm1, %%mm3\n\t"
153  "movq %%mm4, %%mm6\n\t"
154  "movq %%mm5, %%mm7\n\t"
/* shift copies right 8 so the high RGB bytes line up for the merge */
155  "psrlq $8, %%mm2\n\t"
156  "psrlq $8, %%mm3\n\t"
157  "psrlq $8, %%mm6\n\t"
158  "psrlq $8, %%mm7\n\t"
167  "por %%mm2, %%mm0\n\t"
168  "por %%mm3, %%mm1\n\t"
169  "por %%mm6, %%mm4\n\t"
170  "por %%mm7, %%mm5\n\t"
/* redistribute the packed 24-bit pixels across the three output quadwords */
172  "movq %%mm1, %%mm2\n\t"
173  "movq %%mm4, %%mm3\n\t"
174  "psllq $48, %%mm2\n\t"
175  "psllq $32, %%mm3\n\t"
178  "por %%mm2, %%mm0\n\t"
179  "psrlq $16, %%mm1\n\t"
180  "psrlq $32, %%mm4\n\t"
181  "psllq $16, %%mm5\n\t"
182  "por %%mm3, %%mm1\n\t"
184  "por %%mm5, %%mm4\n\t"
186  MOVNTQ" %%mm0, %0\n\t"
187  MOVNTQ" %%mm1, 8%0\n\t"
190  :"m"(*s),"m"(mask24l),
191  "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
196 __asm __volatile(SFENCE:::"memory");
197 __asm __volatile(EMMS:::"memory");
201 #ifdef WORDS_BIGENDIAN
202 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
218 Original by Strepto/Astral
219 ported to gcc & bugfixed : A'rpi
220 MMX2, 3DNOW optimization by Nick Kurshev
221 32bit c version, and and&add trick by Michael Niedermayer
/* RGB15 (1:5:5:5) -> RGB16 (5:6:5) using the and&add trick:
 * (x & 0x7FFF) + (x & 0x7FE0) shifts the R and G fields up one bit in a
 * single add, duplicating green's MSB into its new low bit implicitly as 0.
 * MMX path processes two quadwords (8 pixels) per iteration; the scalar
 * tails below apply the same identity 2 pixels and then 1 pixel at a time.
 * NOTE(review): decimated listing — interior lines are missing. */
223 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
225  register const uint8_t* s=src;
226  register uint8_t* d=dst;
227  register const uint8_t *end;
228  const uint8_t *mm_end;
231 __asm __volatile(PREFETCH" %0"::"m"(*s));
232 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
239  "movq 8%1, %%mm2\n\t"
240  "movq %%mm0, %%mm1\n\t"
241  "movq %%mm2, %%mm3\n\t"
242  "pand %%mm4, %%mm0\n\t"
243  "pand %%mm4, %%mm2\n\t"
244  "paddw %%mm1, %%mm0\n\t"
245  "paddw %%mm3, %%mm2\n\t"
246  MOVNTQ" %%mm0, %0\n\t"
254 __asm __volatile(SFENCE:::"memory");
255 __asm __volatile(EMMS:::"memory");
/* scalar tail: two pixels at once via a 32-bit load */
260   register unsigned x= *((uint32_t *)s);
261   *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
/* final odd pixel, if any */
267   register unsigned short x= *((uint16_t *)s);
268   *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/* RGB16 (5:6:5) -> RGB15 (1:5:5:5): shift the R and G fields down one bit
 * (dropping green's LSB) while keeping blue in place:
 * ((x>>1) & 0x7FE07FE0) | (x & 0x001F001F).
 * mm7 = mask15rg (shifted R+G), mm6 = mask15b (blue). Scalar tails below
 * handle the trailing 2-pixel word and final odd pixel.
 * NOTE(review): decimated listing — interior lines are missing. */
272 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
274  register const uint8_t* s=src;
275  register uint8_t* d=dst;
276  register const uint8_t *end;
277  const uint8_t *mm_end;
280 __asm __volatile(PREFETCH" %0"::"m"(*s));
281 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
282 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
289  "movq 8%1, %%mm2\n\t"
290  "movq %%mm0, %%mm1\n\t"
291  "movq %%mm2, %%mm3\n\t"
292  "psrlq $1, %%mm0\n\t"
293  "psrlq $1, %%mm2\n\t"
294  "pand %%mm7, %%mm0\n\t"
295  "pand %%mm7, %%mm2\n\t"
296  "pand %%mm6, %%mm1\n\t"
297  "pand %%mm6, %%mm3\n\t"
298  "por %%mm1, %%mm0\n\t"
299  "por %%mm3, %%mm2\n\t"
300  MOVNTQ" %%mm0, %0\n\t"
308 __asm __volatile(SFENCE:::"memory");
309 __asm __volatile(EMMS:::"memory");
/* scalar tail: two pixels per 32-bit word */
314   register uint32_t x= *((uint32_t *)s);
315   *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
/* final odd pixel */
321   register uint16_t x= *((uint16_t *)s);
322   *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/* RGB32 -> RGB16 (5:6:5). Two MMX strategies are present:
 * 1) the pmaddwd multiply trick (mask3216g/mask3216br/mul3216 constants)
 *    which merges B and R into place with one multiply-add per register;
 * 2) the classic shift-and-mask sequence (blue/green/red_16mask).
 * Which one compiles is chosen by the `#if 1` guard. The scalar tail packs
 * one pixel at a time. NOTE(review): decimated listing — lines missing. */
328 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
330   const uint8_t *s = src;
333   const uint8_t *mm_end;
335   uint16_t *d = (uint16_t *)dst;
339 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
341   "movq %3, %%mm5   \n\t"
342   "movq %4, %%mm6   \n\t"
343   "movq %5, %%mm7   \n\t"
346   PREFETCH" 32(%1)   \n\t"
347   "movd (%1), %%mm0   \n\t"
348   "movd 4(%1), %%mm3   \n\t"
349   "punpckldq 8(%1), %%mm0  \n\t"
350   "punpckldq 12(%1), %%mm3  \n\t"
351   "movq %%mm0, %%mm1   \n\t"
352   "movq %%mm3, %%mm4   \n\t"
353   "pand %%mm6, %%mm0   \n\t"
354   "pand %%mm6, %%mm3   \n\t"
/* pmaddwd folds the masked B and R words into a single dword per pixel */
355   "pmaddwd %%mm7, %%mm0   \n\t"
356   "pmaddwd %%mm7, %%mm3   \n\t"
357   "pand %%mm5, %%mm1   \n\t"
358   "pand %%mm5, %%mm4   \n\t"
359   "por %%mm1, %%mm0   \n\t"
360   "por %%mm4, %%mm3   \n\t"
361   "psrld $5, %%mm0   \n\t"
362   "pslld $11, %%mm3   \n\t"
363   "por %%mm3, %%mm0   \n\t"
364   MOVNTQ" %%mm0, (%0)   \n\t"
370   : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
/* alternative shift-and-mask implementation */
373 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
377  ::"m"(red_16mask),"m"(green_16mask));
383  "movd 4%1, %%mm3\n\t"
384  "punpckldq 8%1, %%mm0\n\t"
385  "punpckldq 12%1, %%mm3\n\t"
386  "movq %%mm0, %%mm1\n\t"
387  "movq %%mm0, %%mm2\n\t"
388  "movq %%mm3, %%mm4\n\t"
389  "movq %%mm3, %%mm5\n\t"
390  "psrlq $3, %%mm0\n\t"
391  "psrlq $3, %%mm3\n\t"
394  "psrlq $5, %%mm1\n\t"
395  "psrlq $5, %%mm4\n\t"
396  "pand %%mm6, %%mm1\n\t"
397  "pand %%mm6, %%mm4\n\t"
398  "psrlq $8, %%mm2\n\t"
399  "psrlq $8, %%mm5\n\t"
400  "pand %%mm7, %%mm2\n\t"
401  "pand %%mm7, %%mm5\n\t"
402  "por %%mm1, %%mm0\n\t"
403  "por %%mm4, %%mm3\n\t"
404  "por %%mm2, %%mm0\n\t"
405  "por %%mm5, %%mm3\n\t"
406  "psllq $16, %%mm3\n\t"
407  "por %%mm3, %%mm0\n\t"
408  MOVNTQ" %%mm0, %0\n\t"
409  :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
414 __asm __volatile(SFENCE:::"memory");
415 __asm __volatile(EMMS:::"memory");
/* scalar tail: pack B(5) G(6) R(5) from one 32-bit pixel */
419   register int rgb = *(uint32_t*)s; s += 4;
420   *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/* RGB32 -> BGR16 (5:6:5 with R and B swapped relative to rgb32to16):
 * blue is shifted UP into the red field (psllq $8 + red mask) and red is
 * shifted DOWN into the blue field (psrlq $19), green stays in the middle.
 * Scalar tail mirrors the same packing.
 * NOTE(review): decimated listing — interior lines are missing. */
424 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
426   const uint8_t *s = src;
429   const uint8_t *mm_end;
431   uint16_t *d = (uint16_t *)dst;
434 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
438  ::"m"(red_16mask),"m"(green_16mask));
445  "movd 4%1, %%mm3\n\t"
446  "punpckldq 8%1, %%mm0\n\t"
447  "punpckldq 12%1, %%mm3\n\t"
448  "movq %%mm0, %%mm1\n\t"
449  "movq %%mm0, %%mm2\n\t"
450  "movq %%mm3, %%mm4\n\t"
451  "movq %%mm3, %%mm5\n\t"
452  "psllq $8, %%mm0\n\t"
453  "psllq $8, %%mm3\n\t"
454  "pand %%mm7, %%mm0\n\t"
455  "pand %%mm7, %%mm3\n\t"
456  "psrlq $5, %%mm1\n\t"
457  "psrlq $5, %%mm4\n\t"
458  "pand %%mm6, %%mm1\n\t"
459  "pand %%mm6, %%mm4\n\t"
460  "psrlq $19, %%mm2\n\t"
461  "psrlq $19, %%mm5\n\t"
464  "por %%mm1, %%mm0\n\t"
465  "por %%mm4, %%mm3\n\t"
466  "por %%mm2, %%mm0\n\t"
467  "por %%mm5, %%mm3\n\t"
468  "psllq $16, %%mm3\n\t"
469  "por %%mm3, %%mm0\n\t"
470  MOVNTQ" %%mm0, %0\n\t"
471  :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
475 __asm __volatile(SFENCE:::"memory");
476 __asm __volatile(EMMS:::"memory");
/* scalar tail: B goes to the high field, R to the low field */
480   register int rgb = *(uint32_t*)s; s += 4;
481   *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
/* RGB32 -> RGB15 (1:5:5:5). Structure mirrors rgb32to16, only with the
 * 5-bit-green shift amounts (psrld $6 / pslld $10, and $6/$9 in the
 * shift-and-mask variant) and the *15* mask/multiplier constants.
 * NOTE(review): decimated listing — interior lines are missing. */
485 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
487   const uint8_t *s = src;
490   const uint8_t *mm_end;
492   uint16_t *d = (uint16_t *)dst;
496 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
498   "movq %3, %%mm5   \n\t"
499   "movq %4, %%mm6   \n\t"
500   "movq %5, %%mm7   \n\t"
503   PREFETCH" 32(%1)   \n\t"
504   "movd (%1), %%mm0   \n\t"
505   "movd 4(%1), %%mm3   \n\t"
506   "punpckldq 8(%1), %%mm0  \n\t"
507   "punpckldq 12(%1), %%mm3  \n\t"
508   "movq %%mm0, %%mm1   \n\t"
509   "movq %%mm3, %%mm4   \n\t"
510   "pand %%mm6, %%mm0   \n\t"
511   "pand %%mm6, %%mm3   \n\t"
512   "pmaddwd %%mm7, %%mm0   \n\t"
513   "pmaddwd %%mm7, %%mm3   \n\t"
514   "pand %%mm5, %%mm1   \n\t"
515   "pand %%mm5, %%mm4   \n\t"
516   "por %%mm1, %%mm0   \n\t"
517   "por %%mm4, %%mm3   \n\t"
518   "psrld $6, %%mm0   \n\t"
519   "pslld $10, %%mm3   \n\t"
520   "por %%mm3, %%mm0   \n\t"
521   MOVNTQ" %%mm0, (%0)   \n\t"
527   : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
/* alternative shift-and-mask implementation */
530 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
534  ::"m"(red_15mask),"m"(green_15mask));
540  "movd 4%1, %%mm3\n\t"
541  "punpckldq 8%1, %%mm0\n\t"
542  "punpckldq 12%1, %%mm3\n\t"
543  "movq %%mm0, %%mm1\n\t"
544  "movq %%mm0, %%mm2\n\t"
545  "movq %%mm3, %%mm4\n\t"
546  "movq %%mm3, %%mm5\n\t"
547  "psrlq $3, %%mm0\n\t"
548  "psrlq $3, %%mm3\n\t"
551  "psrlq $6, %%mm1\n\t"
552  "psrlq $6, %%mm4\n\t"
553  "pand %%mm6, %%mm1\n\t"
554  "pand %%mm6, %%mm4\n\t"
555  "psrlq $9, %%mm2\n\t"
556  "psrlq $9, %%mm5\n\t"
557  "pand %%mm7, %%mm2\n\t"
558  "pand %%mm7, %%mm5\n\t"
559  "por %%mm1, %%mm0\n\t"
560  "por %%mm4, %%mm3\n\t"
561  "por %%mm2, %%mm0\n\t"
562  "por %%mm5, %%mm3\n\t"
563  "psllq $16, %%mm3\n\t"
564  "por %%mm3, %%mm0\n\t"
565  MOVNTQ" %%mm0, %0\n\t"
566  :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
571 __asm __volatile(SFENCE:::"memory");
572 __asm __volatile(EMMS:::"memory");
/* scalar tail: pack B(5) G(5) R(5) */
576   register int rgb = *(uint32_t*)s; s += 4;
577   *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
/* RGB32 -> BGR15: like rgb32tobgr16 but with 5-bit green (psllq $7 for
 * blue into the red field, psrlq $6 for green, psrlq $19 for red into the
 * blue field) and the *15* masks.
 * NOTE(review): decimated listing — interior lines are missing. */
581 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
583   const uint8_t *s = src;
586   const uint8_t *mm_end;
588   uint16_t *d = (uint16_t *)dst;
591 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
595  ::"m"(red_15mask),"m"(green_15mask));
602  "movd 4%1, %%mm3\n\t"
603  "punpckldq 8%1, %%mm0\n\t"
604  "punpckldq 12%1, %%mm3\n\t"
605  "movq %%mm0, %%mm1\n\t"
606  "movq %%mm0, %%mm2\n\t"
607  "movq %%mm3, %%mm4\n\t"
608  "movq %%mm3, %%mm5\n\t"
609  "psllq $7, %%mm0\n\t"
610  "psllq $7, %%mm3\n\t"
611  "pand %%mm7, %%mm0\n\t"
612  "pand %%mm7, %%mm3\n\t"
613  "psrlq $6, %%mm1\n\t"
614  "psrlq $6, %%mm4\n\t"
615  "pand %%mm6, %%mm1\n\t"
616  "pand %%mm6, %%mm4\n\t"
617  "psrlq $19, %%mm2\n\t"
618  "psrlq $19, %%mm5\n\t"
621  "por %%mm1, %%mm0\n\t"
622  "por %%mm4, %%mm3\n\t"
623  "por %%mm2, %%mm0\n\t"
624  "por %%mm5, %%mm3\n\t"
625  "psllq $16, %%mm3\n\t"
626  "por %%mm3, %%mm0\n\t"
627  MOVNTQ" %%mm0, %0\n\t"
628  :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
632 __asm __volatile(SFENCE:::"memory");
633 __asm __volatile(EMMS:::"memory");
/* scalar tail */
637   register int rgb = *(uint32_t*)s; s += 4;
638   *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
/* RGB24 -> RGB16 (5:6:5): same shift-and-mask core as rgb32to16's second
 * variant, but source pixels are gathered at 3-byte offsets (movd/
 * punpckldq 3%1, 6%1, 9%1) since input is packed 24-bit.
 * NOTE(review): decimated listing — interior lines are missing. */
642 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
644   const uint8_t *s = src;
647   const uint8_t *mm_end;
649   uint16_t *d = (uint16_t *)dst;
652 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
656  ::"m"(red_16mask),"m"(green_16mask));
663  "movd 3%1, %%mm3\n\t"
664  "punpckldq 6%1, %%mm0\n\t"
665  "punpckldq 9%1, %%mm3\n\t"
666  "movq %%mm0, %%mm1\n\t"
667  "movq %%mm0, %%mm2\n\t"
668  "movq %%mm3, %%mm4\n\t"
669  "movq %%mm3, %%mm5\n\t"
670  "psrlq $3, %%mm0\n\t"
671  "psrlq $3, %%mm3\n\t"
674  "psrlq $5, %%mm1\n\t"
675  "psrlq $5, %%mm4\n\t"
676  "pand %%mm6, %%mm1\n\t"
677  "pand %%mm6, %%mm4\n\t"
678  "psrlq $8, %%mm2\n\t"
679  "psrlq $8, %%mm5\n\t"
680  "pand %%mm7, %%mm2\n\t"
681  "pand %%mm7, %%mm5\n\t"
682  "por %%mm1, %%mm0\n\t"
683  "por %%mm4, %%mm3\n\t"
684  "por %%mm2, %%mm0\n\t"
685  "por %%mm5, %%mm3\n\t"
686  "psllq $16, %%mm3\n\t"
687  "por %%mm3, %%mm0\n\t"
688  MOVNTQ" %%mm0, %0\n\t"
689  :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
693 __asm __volatile(SFENCE:::"memory");
694 __asm __volatile(EMMS:::"memory");
/* scalar tail: b,g,r are read as individual bytes (not shown here) */
701   *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/* RGB24 -> BGR16: 24-bit gather (3-byte offsets) combined with the
 * channel-swapped packing of rgb32tobgr16 (psllq $8 blue->red field,
 * psrlq $19 red->blue field).
 * NOTE(review): decimated listing — interior lines are missing. */
705 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
707   const uint8_t *s = src;
710   const uint8_t *mm_end;
712   uint16_t *d = (uint16_t *)dst;
715 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
719  ::"m"(red_16mask),"m"(green_16mask));
726  "movd 3%1, %%mm3\n\t"
727  "punpckldq 6%1, %%mm0\n\t"
728  "punpckldq 9%1, %%mm3\n\t"
729  "movq %%mm0, %%mm1\n\t"
730  "movq %%mm0, %%mm2\n\t"
731  "movq %%mm3, %%mm4\n\t"
732  "movq %%mm3, %%mm5\n\t"
733  "psllq $8, %%mm0\n\t"
734  "psllq $8, %%mm3\n\t"
735  "pand %%mm7, %%mm0\n\t"
736  "pand %%mm7, %%mm3\n\t"
737  "psrlq $5, %%mm1\n\t"
738  "psrlq $5, %%mm4\n\t"
739  "pand %%mm6, %%mm1\n\t"
740  "pand %%mm6, %%mm4\n\t"
741  "psrlq $19, %%mm2\n\t"
742  "psrlq $19, %%mm5\n\t"
745  "por %%mm1, %%mm0\n\t"
746  "por %%mm4, %%mm3\n\t"
747  "por %%mm2, %%mm0\n\t"
748  "por %%mm5, %%mm3\n\t"
749  "psllq $16, %%mm3\n\t"
750  "por %%mm3, %%mm0\n\t"
751  MOVNTQ" %%mm0, %0\n\t"
752  :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
756 __asm __volatile(SFENCE:::"memory");
757 __asm __volatile(EMMS:::"memory");
/* scalar tail */
764   *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/* RGB24 -> RGB15 (1:5:5:5): 24-bit gather plus the 5-bit-green shift
 * amounts ($6 green, $9 red) and *15* masks.
 * NOTE(review): decimated listing — interior lines are missing. */
768 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
770   const uint8_t *s = src;
773   const uint8_t *mm_end;
775   uint16_t *d = (uint16_t *)dst;
778 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
782  ::"m"(red_15mask),"m"(green_15mask));
789  "movd 3%1, %%mm3\n\t"
790  "punpckldq 6%1, %%mm0\n\t"
791  "punpckldq 9%1, %%mm3\n\t"
792  "movq %%mm0, %%mm1\n\t"
793  "movq %%mm0, %%mm2\n\t"
794  "movq %%mm3, %%mm4\n\t"
795  "movq %%mm3, %%mm5\n\t"
796  "psrlq $3, %%mm0\n\t"
797  "psrlq $3, %%mm3\n\t"
800  "psrlq $6, %%mm1\n\t"
801  "psrlq $6, %%mm4\n\t"
802  "pand %%mm6, %%mm1\n\t"
803  "pand %%mm6, %%mm4\n\t"
804  "psrlq $9, %%mm2\n\t"
805  "psrlq $9, %%mm5\n\t"
806  "pand %%mm7, %%mm2\n\t"
807  "pand %%mm7, %%mm5\n\t"
808  "por %%mm1, %%mm0\n\t"
809  "por %%mm4, %%mm3\n\t"
810  "por %%mm2, %%mm0\n\t"
811  "por %%mm5, %%mm3\n\t"
812  "psllq $16, %%mm3\n\t"
813  "por %%mm3, %%mm0\n\t"
814  MOVNTQ" %%mm0, %0\n\t"
815  :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
819 __asm __volatile(SFENCE:::"memory");
820 __asm __volatile(EMMS:::"memory");
/* scalar tail */
827   *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/* RGB24 -> BGR15: 24-bit gather plus channel-swapped 1:5:5:5 packing
 * (psllq $7 blue->red field, psrlq $6 green, psrlq $19 red->blue field).
 * NOTE(review): decimated listing — interior lines are missing. */
831 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
833   const uint8_t *s = src;
836   const uint8_t *mm_end;
838   uint16_t *d = (uint16_t *)dst;
841 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
845  ::"m"(red_15mask),"m"(green_15mask));
852  "movd 3%1, %%mm3\n\t"
853  "punpckldq 6%1, %%mm0\n\t"
854  "punpckldq 9%1, %%mm3\n\t"
855  "movq %%mm0, %%mm1\n\t"
856  "movq %%mm0, %%mm2\n\t"
857  "movq %%mm3, %%mm4\n\t"
858  "movq %%mm3, %%mm5\n\t"
859  "psllq $7, %%mm0\n\t"
860  "psllq $7, %%mm3\n\t"
861  "pand %%mm7, %%mm0\n\t"
862  "pand %%mm7, %%mm3\n\t"
863  "psrlq $6, %%mm1\n\t"
864  "psrlq $6, %%mm4\n\t"
865  "pand %%mm6, %%mm1\n\t"
866  "pand %%mm6, %%mm4\n\t"
867  "psrlq $19, %%mm2\n\t"
868  "psrlq $19, %%mm5\n\t"
871  "por %%mm1, %%mm0\n\t"
872  "por %%mm4, %%mm3\n\t"
873  "por %%mm2, %%mm0\n\t"
874  "por %%mm5, %%mm3\n\t"
875  "psllq $16, %%mm3\n\t"
876  "por %%mm3, %%mm0\n\t"
877  MOVNTQ" %%mm0, %0\n\t"
878  :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
882 __asm __volatile(SFENCE:::"memory");
883 __asm __volatile(EMMS:::"memory");
/* scalar tail */
890   *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
895 I use here less accurate approximation by simply
896 left-shifting the input
897 value and filling the low order bits with
898 zeroes. This method improves png's
899 compression but this scheme cannot reproduce white exactly, since it does not
900 generate an all-ones maximum value; the net effect is to darken the
903 The better method should be "left bit replication":
913 | Leftmost Bits Repeated to Fill Open Bits
/* RGB15 -> RGB24: expand 1:5:5:5 pixels to 3 bytes each by simple left
 * shifts (low bits zero-filled — see the accuracy note above). The MMX
 * path splits B/G/R into separate registers, widens words to dwords with
 * punpck(l/h)wd against mmx_null, re-merges the channels, then reuses the
 * 32->24 byte-compaction sequence ("Borrowed 32 to 24") before streaming
 * out 24 bytes per iteration.
 * NOTE(review): decimated listing — interior lines are missing. */
917 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
921   const uint16_t *mm_end;
923   uint8_t *d = (uint8_t *)dst;
924   const uint16_t *s = (uint16_t *)src;
/* src_size is in bytes; source pixels are 16-bit */
925   end = s + src_size/2;
927 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* first 4 pixels: align each 5-bit field to its byte position */
939  "psllq $3, %%mm0\n\t"
940  "psrlq $2, %%mm1\n\t"
941  "psrlq $7, %%mm2\n\t"
942  "movq %%mm0, %%mm3\n\t"
943  "movq %%mm1, %%mm4\n\t"
944  "movq %%mm2, %%mm5\n\t"
945  "punpcklwd %5, %%mm0\n\t"
946  "punpcklwd %5, %%mm1\n\t"
947  "punpcklwd %5, %%mm2\n\t"
948  "punpckhwd %5, %%mm3\n\t"
949  "punpckhwd %5, %%mm4\n\t"
950  "punpckhwd %5, %%mm5\n\t"
951  "psllq $8, %%mm1\n\t"
952  "psllq $16, %%mm2\n\t"
953  "por %%mm1, %%mm0\n\t"
954  "por %%mm2, %%mm0\n\t"
955  "psllq $8, %%mm4\n\t"
956  "psllq $16, %%mm5\n\t"
957  "por %%mm4, %%mm3\n\t"
958  "por %%mm5, %%mm3\n\t"
/* stash the first 4 expanded pixels while the next 4 are processed */
960  "movq %%mm0, %%mm6\n\t"
961  "movq %%mm3, %%mm7\n\t"
963  "movq 8%1, %%mm0\n\t"
964  "movq 8%1, %%mm1\n\t"
965  "movq 8%1, %%mm2\n\t"
969  "psllq $3, %%mm0\n\t"
970  "psrlq $2, %%mm1\n\t"
971  "psrlq $7, %%mm2\n\t"
972  "movq %%mm0, %%mm3\n\t"
973  "movq %%mm1, %%mm4\n\t"
974  "movq %%mm2, %%mm5\n\t"
975  "punpcklwd %5, %%mm0\n\t"
976  "punpcklwd %5, %%mm1\n\t"
977  "punpcklwd %5, %%mm2\n\t"
978  "punpckhwd %5, %%mm3\n\t"
979  "punpckhwd %5, %%mm4\n\t"
980  "punpckhwd %5, %%mm5\n\t"
981  "psllq $8, %%mm1\n\t"
982  "psllq $16, %%mm2\n\t"
983  "por %%mm1, %%mm0\n\t"
984  "por %%mm2, %%mm0\n\t"
985  "psllq $8, %%mm4\n\t"
986  "psllq $16, %%mm5\n\t"
987  "por %%mm4, %%mm3\n\t"
988  "por %%mm5, %%mm3\n\t"
991  :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
993 /* Borrowed 32 to 24 */
995  "movq %%mm0, %%mm4\n\t"
996  "movq %%mm3, %%mm5\n\t"
997  "movq %%mm6, %%mm0\n\t"
998  "movq %%mm7, %%mm1\n\t"
1000  "movq %%mm4, %%mm6\n\t"
1001  "movq %%mm5, %%mm7\n\t"
1002  "movq %%mm0, %%mm2\n\t"
1003  "movq %%mm1, %%mm3\n\t"
1005  "psrlq $8, %%mm2\n\t"
1006  "psrlq $8, %%mm3\n\t"
1007  "psrlq $8, %%mm6\n\t"
1008  "psrlq $8, %%mm7\n\t"
1009  "pand %2, %%mm0\n\t"
1010  "pand %2, %%mm1\n\t"
1011  "pand %2, %%mm4\n\t"
1012  "pand %2, %%mm5\n\t"
1013  "pand %3, %%mm2\n\t"
1014  "pand %3, %%mm3\n\t"
1015  "pand %3, %%mm6\n\t"
1016  "pand %3, %%mm7\n\t"
1017  "por %%mm2, %%mm0\n\t"
1018  "por %%mm3, %%mm1\n\t"
1019  "por %%mm6, %%mm4\n\t"
1020  "por %%mm7, %%mm5\n\t"
1022  "movq %%mm1, %%mm2\n\t"
1023  "movq %%mm4, %%mm3\n\t"
1024  "psllq $48, %%mm2\n\t"
1025  "psllq $32, %%mm3\n\t"
1026  "pand %4, %%mm2\n\t"
1027  "pand %5, %%mm3\n\t"
1028  "por %%mm2, %%mm0\n\t"
1029  "psrlq $16, %%mm1\n\t"
1030  "psrlq $32, %%mm4\n\t"
1031  "psllq $16, %%mm5\n\t"
1032  "por %%mm3, %%mm1\n\t"
1033  "pand %6, %%mm5\n\t"
1034  "por %%mm5, %%mm4\n\t"
1036  MOVNTQ" %%mm0, %0\n\t"
1037  MOVNTQ" %%mm1, 8%0\n\t"
1038  MOVNTQ" %%mm4, 16%0"
1041  :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1046 __asm __volatile(SFENCE:::"memory");
1047 __asm __volatile(EMMS:::"memory");
/* scalar tail: one pixel at a time, B G R byte order */
1051   register uint16_t bgr;
1053   *d++ = (bgr&0x1F)<<3;
1054   *d++ = (bgr&0x3E0)>>2;
1055   *d++ = (bgr&0x7C00)>>7;
/* RGB16 -> RGB24: identical structure to rgb15to24 but with 5:6:5 field
 * extraction (mask16b/g/r, shifts $3/$3/$8 instead of $3/$2/$7) followed
 * by the shared 32->24 compaction sequence.
 * NOTE(review): decimated listing — interior lines are missing. */
1059 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1061   const uint16_t *end;
1063   const uint16_t *mm_end;
1065   uint8_t *d = (uint8_t *)dst;
1066   const uint16_t *s = (const uint16_t *)src;
1067   end = s + src_size/2;
1069 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* first 4 pixels */
1075  "movq %1, %%mm0\n\t"
1076  "movq %1, %%mm1\n\t"
1077  "movq %1, %%mm2\n\t"
1078  "pand %2, %%mm0\n\t"
1079  "pand %3, %%mm1\n\t"
1080  "pand %4, %%mm2\n\t"
1081  "psllq $3, %%mm0\n\t"
1082  "psrlq $3, %%mm1\n\t"
1083  "psrlq $8, %%mm2\n\t"
1084  "movq %%mm0, %%mm3\n\t"
1085  "movq %%mm1, %%mm4\n\t"
1086  "movq %%mm2, %%mm5\n\t"
1087  "punpcklwd %5, %%mm0\n\t"
1088  "punpcklwd %5, %%mm1\n\t"
1089  "punpcklwd %5, %%mm2\n\t"
1090  "punpckhwd %5, %%mm3\n\t"
1091  "punpckhwd %5, %%mm4\n\t"
1092  "punpckhwd %5, %%mm5\n\t"
1093  "psllq $8, %%mm1\n\t"
1094  "psllq $16, %%mm2\n\t"
1095  "por %%mm1, %%mm0\n\t"
1096  "por %%mm2, %%mm0\n\t"
1097  "psllq $8, %%mm4\n\t"
1098  "psllq $16, %%mm5\n\t"
1099  "por %%mm4, %%mm3\n\t"
1100  "por %%mm5, %%mm3\n\t"
/* stash first 4 pixels in mm6/mm7 while the next 4 are expanded */
1102  "movq %%mm0, %%mm6\n\t"
1103  "movq %%mm3, %%mm7\n\t"
1105  "movq 8%1, %%mm0\n\t"
1106  "movq 8%1, %%mm1\n\t"
1107  "movq 8%1, %%mm2\n\t"
1108  "pand %2, %%mm0\n\t"
1109  "pand %3, %%mm1\n\t"
1110  "pand %4, %%mm2\n\t"
1111  "psllq $3, %%mm0\n\t"
1112  "psrlq $3, %%mm1\n\t"
1113  "psrlq $8, %%mm2\n\t"
1114  "movq %%mm0, %%mm3\n\t"
1115  "movq %%mm1, %%mm4\n\t"
1116  "movq %%mm2, %%mm5\n\t"
1117  "punpcklwd %5, %%mm0\n\t"
1118  "punpcklwd %5, %%mm1\n\t"
1119  "punpcklwd %5, %%mm2\n\t"
1120  "punpckhwd %5, %%mm3\n\t"
1121  "punpckhwd %5, %%mm4\n\t"
1122  "punpckhwd %5, %%mm5\n\t"
1123  "psllq $8, %%mm1\n\t"
1124  "psllq $16, %%mm2\n\t"
1125  "por %%mm1, %%mm0\n\t"
1126  "por %%mm2, %%mm0\n\t"
1127  "psllq $8, %%mm4\n\t"
1128  "psllq $16, %%mm5\n\t"
1129  "por %%mm4, %%mm3\n\t"
1130  "por %%mm5, %%mm3\n\t"
1132  :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1134 /* Borrowed 32 to 24 */
1136  "movq %%mm0, %%mm4\n\t"
1137  "movq %%mm3, %%mm5\n\t"
1138  "movq %%mm6, %%mm0\n\t"
1139  "movq %%mm7, %%mm1\n\t"
1141  "movq %%mm4, %%mm6\n\t"
1142  "movq %%mm5, %%mm7\n\t"
1143  "movq %%mm0, %%mm2\n\t"
1144  "movq %%mm1, %%mm3\n\t"
1146  "psrlq $8, %%mm2\n\t"
1147  "psrlq $8, %%mm3\n\t"
1148  "psrlq $8, %%mm6\n\t"
1149  "psrlq $8, %%mm7\n\t"
1150  "pand %2, %%mm0\n\t"
1151  "pand %2, %%mm1\n\t"
1152  "pand %2, %%mm4\n\t"
1153  "pand %2, %%mm5\n\t"
1154  "pand %3, %%mm2\n\t"
1155  "pand %3, %%mm3\n\t"
1156  "pand %3, %%mm6\n\t"
1157  "pand %3, %%mm7\n\t"
1158  "por %%mm2, %%mm0\n\t"
1159  "por %%mm3, %%mm1\n\t"
1160  "por %%mm6, %%mm4\n\t"
1161  "por %%mm7, %%mm5\n\t"
1163  "movq %%mm1, %%mm2\n\t"
1164  "movq %%mm4, %%mm3\n\t"
1165  "psllq $48, %%mm2\n\t"
1166  "psllq $32, %%mm3\n\t"
1167  "pand %4, %%mm2\n\t"
1168  "pand %5, %%mm3\n\t"
1169  "por %%mm2, %%mm0\n\t"
1170  "psrlq $16, %%mm1\n\t"
1171  "psrlq $32, %%mm4\n\t"
1172  "psllq $16, %%mm5\n\t"
1173  "por %%mm3, %%mm1\n\t"
1174  "pand %6, %%mm5\n\t"
1175  "por %%mm5, %%mm4\n\t"
1177  MOVNTQ" %%mm0, %0\n\t"
1178  MOVNTQ" %%mm1, 8%0\n\t"
1179  MOVNTQ" %%mm4, 16%0"
1182  :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1187 __asm __volatile(SFENCE:::"memory");
1188 __asm __volatile(EMMS:::"memory");
/* scalar tail: note 6-bit green (0x7E0) unlike the 15-bit variant */
1192   register uint16_t bgr;
1194   *d++ = (bgr&0x1F)<<3;
1195   *d++ = (bgr&0x7E0)>>3;
1196   *d++ = (bgr&0xF800)>>8;
/* RGB15 -> RGB32: expand 1:5:5:5 pixels to 4 bytes. mm7 is zeroed with
 * pxor and used as the punpck(l/h)wd partner, so each 16-bit lane widens
 * to a 32-bit lane with a zero high byte; channel registers are then
 * shifted into byte position and OR-merged. Endian-specific scalar tails
 * follow for the remainder.
 * NOTE(review): decimated listing — interior lines are missing. */
1200 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1202   const uint16_t *end;
1204   const uint16_t *mm_end;
1206   uint8_t *d = (uint8_t *)dst;
1207   const uint16_t *s = (const uint16_t *)src;
1208   end = s + src_size/2;
1210 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1211 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1217  "movq %1, %%mm0\n\t"
1218  "movq %1, %%mm1\n\t"
1219  "movq %1, %%mm2\n\t"
1220  "pand %2, %%mm0\n\t"
1221  "pand %3, %%mm1\n\t"
1222  "pand %4, %%mm2\n\t"
1223  "psllq $3, %%mm0\n\t"
1224  "psrlq $2, %%mm1\n\t"
1225  "psrlq $7, %%mm2\n\t"
1226  "movq %%mm0, %%mm3\n\t"
1227  "movq %%mm1, %%mm4\n\t"
1228  "movq %%mm2, %%mm5\n\t"
1229  "punpcklwd %%mm7, %%mm0\n\t"
1230  "punpcklwd %%mm7, %%mm1\n\t"
1231  "punpcklwd %%mm7, %%mm2\n\t"
1232  "punpckhwd %%mm7, %%mm3\n\t"
1233  "punpckhwd %%mm7, %%mm4\n\t"
1234  "punpckhwd %%mm7, %%mm5\n\t"
1235  "psllq $8, %%mm1\n\t"
1236  "psllq $16, %%mm2\n\t"
1237  "por %%mm1, %%mm0\n\t"
1238  "por %%mm2, %%mm0\n\t"
1239  "psllq $8, %%mm4\n\t"
1240  "psllq $16, %%mm5\n\t"
1241  "por %%mm4, %%mm3\n\t"
1242  "por %%mm5, %%mm3\n\t"
1243  MOVNTQ" %%mm0, %0\n\t"
1244  MOVNTQ" %%mm3, 8%0\n\t"
1246  :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1251 __asm __volatile(SFENCE:::"memory");
1252 __asm __volatile(EMMS:::"memory");
1256 #if 0 //slightly slower on athlon
1258   *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1260   register uint16_t bgr;
1262 #ifdef WORDS_BIGENDIAN
/* big-endian byte order: pad byte first is implied, then R G B */
1264   *d++ = (bgr&0x7C00)>>7;
1265   *d++ = (bgr&0x3E0)>>2;
1266   *d++ = (bgr&0x1F)<<3;
1268   *d++ = (bgr&0x1F)<<3;
1269   *d++ = (bgr&0x3E0)>>2;
1270   *d++ = (bgr&0x7C00)>>7;
/* RGB16 -> RGB32: same zero-extend-and-merge scheme as rgb15to32, with
 * 5:6:5 masks and shifts ($3/$3/$8) instead of the 1:5:5:5 ones.
 * NOTE(review): decimated listing — interior lines are missing. */
1278 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1280   const uint16_t *end;
1282   const uint16_t *mm_end;
1284   uint8_t *d = (uint8_t *)dst;
1285   const uint16_t *s = (uint16_t *)src;
1286   end = s + src_size/2;
1288 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1289 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1295  "movq %1, %%mm0\n\t"
1296  "movq %1, %%mm1\n\t"
1297  "movq %1, %%mm2\n\t"
1298  "pand %2, %%mm0\n\t"
1299  "pand %3, %%mm1\n\t"
1300  "pand %4, %%mm2\n\t"
1301  "psllq $3, %%mm0\n\t"
1302  "psrlq $3, %%mm1\n\t"
1303  "psrlq $8, %%mm2\n\t"
1304  "movq %%mm0, %%mm3\n\t"
1305  "movq %%mm1, %%mm4\n\t"
1306  "movq %%mm2, %%mm5\n\t"
1307  "punpcklwd %%mm7, %%mm0\n\t"
1308  "punpcklwd %%mm7, %%mm1\n\t"
1309  "punpcklwd %%mm7, %%mm2\n\t"
1310  "punpckhwd %%mm7, %%mm3\n\t"
1311  "punpckhwd %%mm7, %%mm4\n\t"
1312  "punpckhwd %%mm7, %%mm5\n\t"
1313  "psllq $8, %%mm1\n\t"
1314  "psllq $16, %%mm2\n\t"
1315  "por %%mm1, %%mm0\n\t"
1316  "por %%mm2, %%mm0\n\t"
1317  "psllq $8, %%mm4\n\t"
1318  "psllq $16, %%mm5\n\t"
1319  "por %%mm4, %%mm3\n\t"
1320  "por %%mm5, %%mm3\n\t"
1321  MOVNTQ" %%mm0, %0\n\t"
1322  MOVNTQ" %%mm3, 8%0\n\t"
1324  :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1329 __asm __volatile(SFENCE:::"memory");
1330 __asm __volatile(EMMS:::"memory");
/* scalar tail, endian-dependent byte order */
1334   register uint16_t bgr;
1336 #ifdef WORDS_BIGENDIAN
1338   *d++ = (bgr&0xF800)>>8;
1339   *d++ = (bgr&0x7E0)>>3;
1340   *d++ = (bgr&0x1F)<<3;
1342   *d++ = (bgr&0x1F)<<3;
1343   *d++ = (bgr&0x7E0)>>3;
1344   *d++ = (bgr&0xF800)>>8;
/* RGB32 -> BGR32: swap the R and B bytes of each 32-bit pixel. The MMX
 * loop processes two pixels per quadword: R is moved with pslld $16,
 * B with psrld $16, G kept in place, all masked with the MANGLE()d
 * constants and recombined with por. The C fallback swaps bytes per
 * pixel with endian-specific indexing.
 * NOTE(review): decimated listing — interior lines are missing. */
1350 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1353 /* TODO: unroll this loop */
1355   "xor %%"REG_a", %%"REG_a" \n\t"
1358   PREFETCH" 32(%0, %%"REG_a") \n\t"
1359   "movq (%0, %%"REG_a"), %%mm0 \n\t"
1360   "movq %%mm0, %%mm1   \n\t"
1361   "movq %%mm0, %%mm2   \n\t"
1362   "pslld $16, %%mm0   \n\t"
1363   "psrld $16, %%mm1   \n\t"
1364   "pand "MANGLE(mask32r)", %%mm0 \n\t"
1365   "pand "MANGLE(mask32g)", %%mm2 \n\t"
1366   "pand "MANGLE(mask32b)", %%mm1 \n\t"
1367   "por %%mm0, %%mm2   \n\t"
1368   "por %%mm1, %%mm2   \n\t"
1369   MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
1370   "add $8, %%"REG_a"   \n\t"
1371   "cmp %2, %%"REG_a"   \n\t"
/* src_size-7 keeps the 8-byte loop from overrunning the buffer */
1373   :: "r" (src), "r"(dst), "r" (src_size-7)
1377 __asm __volatile(SFENCE:::"memory");
1378 __asm __volatile(EMMS:::"memory");
1381  unsigned num_pixels = src_size >> 2;
1382  for(i=0; i<num_pixels; i++)
1384 #ifdef WORDS_BIGENDIAN
1385   dst[4*i + 1] = src[4*i + 3];
1386   dst[4*i + 2] = src[4*i + 2];
1387   dst[4*i + 3] = src[4*i + 1];
1389   dst[4*i + 0] = src[4*i + 2];
1390   dst[4*i + 1] = src[4*i + 1];
1391   dst[4*i + 2] = src[4*i + 0];
/* RGB24 <-> BGR24: reverse the byte order of each 3-byte pixel. The MMX
 * loop handles 8 pixels (24 bytes) per iteration, building each output
 * quadword from three overlapping loads masked with mask24r/g/b; the
 * per-register channel comments on the right track which bytes are live.
 * mmx_size = 23 - src_size is negative index bookkeeping so the loop
 * counter can run up toward zero; the scalar loop finishes the remainder.
 * NOTE(review): decimated listing — interior lines are missing. */
1397 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1401  long mmx_size= 23 - src_size;
1403   "movq "MANGLE(mask24r)", %%mm5 \n\t"
1404   "movq "MANGLE(mask24g)", %%mm6 \n\t"
1405   "movq "MANGLE(mask24b)", %%mm7 \n\t"
1408   PREFETCH" 32(%1, %%"REG_a") \n\t"
1409   "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1410   "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1411   "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1412   "psllq $16, %%mm0   \n\t" // 00 BGR BGR
1413   "pand %%mm5, %%mm0   \n\t"
1414   "pand %%mm6, %%mm1   \n\t"
1415   "pand %%mm7, %%mm2   \n\t"
1416   "por %%mm0, %%mm1   \n\t"
1417   "por %%mm2, %%mm1   \n\t"
1418   "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1419   MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
1420   "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1421   "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1422   "pand %%mm7, %%mm0   \n\t"
1423   "pand %%mm5, %%mm1   \n\t"
1424   "pand %%mm6, %%mm2   \n\t"
1425   "por %%mm0, %%mm1   \n\t"
1426   "por %%mm2, %%mm1   \n\t"
1427   "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1428   MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
1429   "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1430   "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1431   "pand %%mm6, %%mm0   \n\t"
1432   "pand %%mm7, %%mm1   \n\t"
1433   "pand %%mm5, %%mm2   \n\t"
1434   "por %%mm0, %%mm1   \n\t"
1435   "por %%mm2, %%mm1   \n\t"
1436   MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1437   "add $24, %%"REG_a"  \n\t"
1440   : "r" (src-mmx_size), "r"(dst-mmx_size)
1443 __asm __volatile(SFENCE:::"memory");
1444 __asm __volatile(EMMS:::"memory");
1446  if(mmx_size==23) return; //finished, was multiple of 8
/* scalar remainder: swap bytes 0 and 2 of each remaining pixel */
1450  src_size= 23-mmx_size;
1454  for(i=0; i<src_size; i+=3)
1458   dst[i + 1] = src[i + 1];
1459   dst[i + 2] = src[i + 0];
/* Interleave planar YUV (separate Y, U, V planes) into packed YUY2
 * (Y0 U0 Y1 V0 ...). Per output row: MMX path unpacks U against V to get
 * UVUV pairs, then punpck(l/h)bw with the Y bytes yields 16 YUYV bytes
 * per 8 luma samples, streamed with MOVNTQ. Alternative paths: Alpha MVI
 * (unpkbw/unpkbl), a 64-bit C path packing two pixels per store, and a
 * 32-bit C path (endian-specific). Chroma pointers advance only every
 * vertLumPerChroma'th row (e.g. 2 for YV12).
 * NOTE(review): decimated listing — interior lines are missing. */
1464 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1465  long width, long height,
1466  long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1469  const long chromWidth= width>>1;
1470  for(y=0; y<height; y++)
1473 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1475   "xor %%"REG_a", %%"REG_a" \n\t"
1478   PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1479   PREFETCH" 32(%2, %%"REG_a") \n\t"
1480   PREFETCH" 32(%3, %%"REG_a") \n\t"
1481   "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1482   "movq %%mm0, %%mm2   \n\t" // U(0)
1483   "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1484   "punpcklbw %%mm1, %%mm0  \n\t" // UVUV UVUV(0)
1485   "punpckhbw %%mm1, %%mm2  \n\t" // UVUV UVUV(8)
1487   "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1488   "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1489   "movq %%mm3, %%mm4   \n\t" // Y(0)
1490   "movq %%mm5, %%mm6   \n\t" // Y(8)
1491   "punpcklbw %%mm0, %%mm3  \n\t" // YUYV YUYV(0)
1492   "punpckhbw %%mm0, %%mm4  \n\t" // YUYV YUYV(4)
1493   "punpcklbw %%mm2, %%mm5  \n\t" // YUYV YUYV(8)
1494   "punpckhbw %%mm2, %%mm6  \n\t" // YUYV YUYV(12)
1496   MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1497   MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1498   MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1499   MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1501   "add $8, %%"REG_a"   \n\t"
1502   "cmp %4, %%"REG_a"   \n\t"
1504   ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
/* Alpha with MVI extension: unpkbw/unpkbl do the byte interleave */
1509 #if defined ARCH_ALPHA && defined HAVE_MVI
1510 #define pl2yuy2(n)     \
1515  asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1516  asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1517  asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1518  asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1519  yuv1 = (u << 8) + (v << 24);   \
1526   uint64_t *qdst = (uint64_t *) dst;
1527   uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1528   const uint32_t *yc = (uint32_t *) ysrc;
1529   const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1530   const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1531   for(i = 0; i < chromWidth; i += 8){
1532    uint64_t y1, y2, yuv1, yuv2;
/* ldq $31 = prefetch to the zero register on Alpha */
1535    asm("ldq $31,64(%0)" :: "r"(yc));
1536    asm("ldq $31,64(%0)" :: "r"(yc2));
1537    asm("ldq $31,64(%0)" :: "r"(uc));
1538    asm("ldq $31,64(%0)" :: "r"(vc));
/* generic 64-bit C path: build one 8-byte YUYV pair per store */
1556 #elif __WORDSIZE >= 64
1558   uint64_t *ldst = (uint64_t *) dst;
1559   const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1560   for(i = 0; i < chromWidth; i += 2){
1562    k = yc[0] + (uc[0] << 8) +
1563        (yc[1] << 16) + (vc[0] << 24);
1564    l = yc[2] + (uc[1] << 8) +
1565        (yc[3] << 16) + (vc[1] << 24);
1566    *ldst++ = k + (l << 32);
/* 32-bit C path, endian-specific packing */
1573   int i, *idst = (int32_t *) dst;
1574   const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1575   for(i = 0; i < chromWidth; i++){
1576 #ifdef WORDS_BIGENDIAN
1577    *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1578        (yc[1] << 8) + (vc[0] << 0);
1580    *idst++ = yc[0] + (uc[0] << 8) +
1581        (yc[1] << 16) + (vc[0] << 24);
/* advance chroma rows only once per vertLumPerChroma luma rows */
1589   if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1591    usrc += chromStride;
1592    vsrc += chromStride;
1606  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1607  * problem for anyone then tell me, and I'll fix it)
/* YV12 -> YUY2: thin wrapper, chroma is vertically subsampled 2:1 so
 * vertLumPerChroma is 2. Chroma is replicated, not interpolated. */
1609 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1610  long width, long height,
1611  long lumStride, long chromStride, long dstStride)
1613 //FIXME interpolate chroma
1614  RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Converts planar YUV (separate Y, U, V planes) to packed UYVY.
 * vertLumPerChroma is the number of luma lines per chroma line
 * (2 for 4:2:0 input, 1 for 4:2:2); the chroma pointers only advance
 * every vertLumPerChroma'th line (see the test at the bottom).
 * Output byte order per pixel pair: U0 Y0 V0 Y1 (little-endian packing
 * in the C fallback).
 */
1617 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1618 long width, long height,
1619 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1622 const long chromWidth= width>>1;
1623 for(y=0; y<height; y++)
1626 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
/* MMX path: per iteration loads 8 U and 8 V samples, interleaves them to
   UVUV pairs, merges with 16 luma samples via punpck, and stores 32 bytes
   of UYVY with non-temporal MOVNTQ writes. */
1628 "xor %%"REG_a", %%"REG_a" \n\t"
1631 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1632 PREFETCH" 32(%2, %%"REG_a") \n\t"
1633 PREFETCH" 32(%3, %%"REG_a") \n\t"
1634 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1635 "movq %%mm0, %%mm2 \n\t" // U(0)
1636 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1637 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1638 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1640 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1641 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1642 "movq %%mm0, %%mm4 \n\t" // Y(0)
1643 "movq %%mm2, %%mm6 \n\t" // Y(8)
1644 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1645 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1646 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1647 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
1649 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1650 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1651 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1652 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1654 "add $8, %%"REG_a" \n\t"
1655 "cmp %4, %%"REG_a" \n\t"
1657 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1661 //FIXME adapt the alpha asm code from yv12->yuy2
/* C fallback: pack U, Y0, V, Y1 little-endian; on 64-bit word size two
   output pixels are combined into a single 64-bit store. */
1663 #if __WORDSIZE >= 64
1665 uint64_t *ldst = (uint64_t *) dst;
1666 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1667 for(i = 0; i < chromWidth; i += 2){
1669 k = uc[0] + (yc[0] << 8) +
1670 (vc[0] << 16) + (yc[1] << 24);
1671 l = uc[1] + (yc[2] << 8) +
1672 (vc[1] << 16) + (yc[3] << 24);
1673 *ldst++ = k + (l << 32);
/* 32-bit fallback: one UYVY pixel pair per int store, with an explicit
   big-endian byte-order variant. */
1680 int i, *idst = (int32_t *) dst;
1681 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1682 for(i = 0; i < chromWidth; i++){
1683 #ifdef WORDS_BIGENDIAN
1684 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1685 (vc[0] << 8) + (yc[1] << 0);
1687 *idst++ = uc[0] + (yc[0] << 8) +
1688 (vc[0] << 16) + (yc[1] << 24);
/* advance the chroma planes only once per vertLumPerChroma luma lines
   (vertLumPerChroma must be a power of two for this mask test) */
1696 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1698 usrc += chromStride;
1699 vsrc += chromStride;
1713 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1714 * problem for anyone then tell me, and I'll fix it)
/*
 * Converts planar YV12 (4:2:0) to packed UYVY.
 * Thin wrapper: delegates to yuvPlanartouyvy() with vertLumPerChroma = 2,
 * i.e. two luma lines share each chroma line.
 */
1716 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1717 long width, long height,
1718 long lumStride, long chromStride, long dstStride)
1720 //FIXME interpolate chroma
1721 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1726 * width should be a multiple of 16
/*
 * Converts planar 4:2:2 YUV to packed YUY2.
 * Thin wrapper: delegates to yuvPlanartoyuy2() with vertLumPerChroma = 1,
 * i.e. each luma line has its own chroma line.
 */
1728 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1729 long width, long height,
1730 long lumStride, long chromStride, long dstStride)
1732 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1737 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1738 * problem for anyone then tell me, and I'll fix it)
/*
 * Converts packed YUY2 (Y0 U Y1 V) to planar YV12.
 * Processes two source lines per outer iteration: the first (even) line
 * contributes both luma and chroma, the second (odd) line contributes
 * luma only — its chroma samples are discarded.
 */
1740 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1741 long width, long height,
1742 long lumStride, long chromStride, long srcStride)
1745 const long chromWidth= width>>1;
1746 for(y=0; y<height; y+=2)
/* even line: split YUYV into a Y plane row plus packed UV, then
   deinterleave UV into separate U and V rows.
   mm7 = 0x00FF00FF... mask selects the low (Y) bytes of each word. */
1750 "xor %%"REG_a", %%"REG_a" \n\t"
1751 "pcmpeqw %%mm7, %%mm7 \n\t"
1752 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1755 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1756 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1757 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1758 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1759 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1760 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1761 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1762 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1763 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1764 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1765 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1767 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1769 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1770 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1771 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1772 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1773 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1774 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1775 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1776 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1777 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1778 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1780 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
/* second stage: separate the packed UVUV words into V and U rows */
1782 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1783 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1784 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1785 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1786 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1787 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1788 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1789 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1791 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1792 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1794 "add $8, %%"REG_a" \n\t"
1795 "cmp %4, %%"REG_a" \n\t"
1797 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1798 : "memory", "%"REG_a
/* odd line: chroma is ignored, only the luma bytes are extracted */
1805 "xor %%"REG_a", %%"REG_a" \n\t"
1808 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1809 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1810 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1811 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1812 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1813 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1814 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1815 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1816 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1817 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1818 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1820 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1821 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1823 "add $8, %%"REG_a" \n\t"
1824 "cmp %4, %%"REG_a" \n\t"
1827 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1828 : "memory", "%"REG_a
/* C fallback, even line: extract Y, U and V */
1832 for(i=0; i<chromWidth; i++)
1834 ydst[2*i+0] = src[4*i+0];
1835 udst[i] = src[4*i+1];
1836 ydst[2*i+1] = src[4*i+2];
1837 vdst[i] = src[4*i+3];
/* C fallback, odd line: luma only */
1842 for(i=0; i<chromWidth; i++)
1844 ydst[2*i+0] = src[4*i+0];
1845 ydst[2*i+1] = src[4*i+2];
1848 udst += chromStride;
1849 vdst += chromStride;
/* leave MMX state clean for subsequent FPU code */
1854 asm volatile( EMMS" \n\t"
/*
 * YVU9 (quarter-resolution chroma) -> YV12.
 * The visible portion only copies the luma plane verbatim; the required
 * 2x upscaling of the U and V planes is not implemented (see XXX below).
 * NOTE(review): memcpy assumes lumStride == width (contiguous plane) —
 * confirm against callers.
 */
1860 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1861 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1862 long width, long height, long lumStride, long chromStride)
1865 memcpy(ydst, ysrc, width*height);
1867 /* XXX: implement upscaling for U,V */
/*
 * Upscales one 8-bit plane to twice its width and height using a
 * (3,1)/4 two-tap filter between neighbouring samples. The first and
 * last rows/columns are handled specially; the interior is processed by
 * an MMX2/3DNow PAVGB path with a C tail for the non-multiple-of-8 rest.
 */
1870 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
/* first destination row: horizontal-only interpolation of the first
   source row; edge pixels are copied (last one below) */
1877 for(x=0; x<srcWidth-1; x++){
1878 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1879 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1881 dst[2*srcWidth-1]= src[srcWidth-1];
/* interior rows: each pair of output rows is interpolated from two
   adjacent source rows (%0 = upper, %1 = lower) */
1885 for(y=1; y<srcHeight; y++){
1886 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1887 const long mmxSize= srcWidth&~15;
1889 "mov %4, %%"REG_a" \n\t"
1891 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1892 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1893 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1894 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1895 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1896 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
/* repeated PAVGB approximates the 3:1 weighting with rounding averages */
1897 PAVGB" %%mm0, %%mm5 \n\t"
1898 PAVGB" %%mm0, %%mm3 \n\t"
1899 PAVGB" %%mm0, %%mm5 \n\t"
1900 PAVGB" %%mm0, %%mm3 \n\t"
1901 PAVGB" %%mm1, %%mm4 \n\t"
1902 PAVGB" %%mm1, %%mm2 \n\t"
1903 PAVGB" %%mm1, %%mm4 \n\t"
1904 PAVGB" %%mm1, %%mm2 \n\t"
1905 "movq %%mm5, %%mm7 \n\t"
1906 "movq %%mm4, %%mm6 \n\t"
1907 "punpcklbw %%mm3, %%mm5 \n\t"
1908 "punpckhbw %%mm3, %%mm7 \n\t"
1909 "punpcklbw %%mm2, %%mm4 \n\t"
1910 "punpckhbw %%mm2, %%mm6 \n\t"
1912 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1913 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1914 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1915 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
/* non-MMX2 store variant of the four writes above */
1917 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1918 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1919 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1920 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1922 "add $8, %%"REG_a" \n\t"
1924 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1925 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
/* pure-C build: no MMX prefix was processed, start the tail at column 1 */
1931 const long mmxSize=1;
/* left edge: vertical-only interpolation for the first output column */
1933 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1934 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
/* C tail: diagonal (3,1)/4 interpolation for the remaining columns */
1936 for(x=mmxSize-1; x<srcWidth-1; x++){
1937 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1938 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1939 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1940 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
/* right edge: vertical-only interpolation for the last output column */
1942 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1943 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* last destination row: horizontal-only interpolation of the last source row */
1953 for(x=0; x<srcWidth-1; x++){
1954 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1955 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1957 dst[2*srcWidth-1]= src[srcWidth-1];
/* alternate (non-interpolating) last-row variant: plain pixel doubling */
1959 for(x=0; x<srcWidth; x++){
1966 asm volatile( EMMS" \n\t"
1974 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1975 * problem for anyone then tell me, and I'll fix it)
1976 * chrominance data is only taken from every second line, others are ignored FIXME write HQ version
/*
 * Converts packed UYVY (U Y0 V Y1) to planar YV12.
 * Processes two source lines per outer iteration: the first (even) line
 * contributes both luma and chroma, the second (odd) line contributes
 * luma only — its chroma samples are discarded.
 *
 * Fix: the loop index previously used raw 32-bit %%eax with addl/cmpl,
 * which cannot be combined with 64-bit base registers on x86_64
 * ("(%0, %%eax, 4)" does not assemble there). It now uses the REG_a
 * macro and untyped add/cmp, matching yuy2toyv12() and the other
 * functions in this file.
 */
1978 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1979 long width, long height,
1980 long lumStride, long chromStride, long srcStride)
1983 const long chromWidth= width>>1;
1984 for(y=0; y<height; y+=2)
/* even line: split UYVY into a Y plane row plus packed UV, then
   deinterleave UV into separate U and V rows.
   mm7 = 0x00FF00FF... mask selects the low (U/V) bytes of each word. */
1988 "xor %%"REG_a", %%"REG_a" \n\t"
1989 "pcmpeqw %%mm7, %%mm7 \n\t"
1990 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1993 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1994 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
1995 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
1996 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1997 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1998 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1999 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2000 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2001 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2002 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2003 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2005 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
2007 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
2008 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
2009 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2010 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2011 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2012 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2013 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2014 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2015 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2016 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2018 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
/* second stage: separate the packed UVUV words into V and U rows */
2020 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2021 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2022 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2023 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2024 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2025 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2026 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2027 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2029 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
2030 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
2032 "add $8, %%"REG_a" \n\t"
2033 "cmp %4, %%"REG_a" \n\t"
2035 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* odd line: chroma is ignored, only the luma bytes are extracted */
2043 "xor %%"REG_a", %%"REG_a" \n\t"
2046 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
2047 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
2048 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
2049 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
2050 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
2051 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2052 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2053 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2054 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2055 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2056 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2058 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
2059 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
2061 "add $8, %%"REG_a" \n\t"
2062 "cmp %4, %%"REG_a" \n\t"
2065 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* C fallback, even line: extract U, Y and V */
2070 for(i=0; i<chromWidth; i++)
2072 udst[i] = src[4*i+0];
2073 ydst[2*i+0] = src[4*i+1];
2074 vdst[i] = src[4*i+2];
2075 ydst[2*i+1] = src[4*i+3];
/* C fallback, odd line: luma only */
2080 for(i=0; i<chromWidth; i++)
2082 ydst[2*i+0] = src[4*i+1];
2083 ydst[2*i+1] = src[4*i+3];
2086 udst += chromStride;
2087 vdst += chromStride;
/* leave MMX state clean for subsequent FPU code */
2092 asm volatile( EMMS" \n\t"
2100 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2101 * problem for anyone then tell me, and I'll fix it)
2102 * chrominance data is only taken from every second line, others are ignored in the C version FIXME write HQ version
/*
 * Converts packed 24-bit BGR (byte order B,G,R — see the C fallback) to
 * planar YV12. Luma is computed per line with pmaddwd against the
 * bgr2YCoeff table; chroma is computed from 2x2 pixel averages of two
 * adjacent source lines using bgr2UCoeff/bgr2VCoeff, so chroma has half
 * resolution in both directions. The last rows are handled by the C
 * fallback loop at the bottom.
 */
2104 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2105 long width, long height,
2106 long lumStride, long chromStride, long srcStride)
2109 const long chromWidth= width>>1;
2111 for(y=0; y<height-2; y+=2)
/* luma pass: 8 output Y bytes per iteration; each pixel's B,G,R words
   are dotted with bgr2YCoeff via pmaddwd, summed with w1111, scaled,
   then offset by bgr2YOffset (the +16 of the fallback formula) */
2117 "mov %2, %%"REG_a" \n\t"
2118 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2119 "movq "MANGLE(w1111)", %%mm5 \n\t"
2120 "pxor %%mm7, %%mm7 \n\t"
2121 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2124 PREFETCH" 64(%0, %%"REG_d") \n\t"
2125 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2126 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2127 "punpcklbw %%mm7, %%mm0 \n\t"
2128 "punpcklbw %%mm7, %%mm1 \n\t"
2129 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2130 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2131 "punpcklbw %%mm7, %%mm2 \n\t"
2132 "punpcklbw %%mm7, %%mm3 \n\t"
2133 "pmaddwd %%mm6, %%mm0 \n\t"
2134 "pmaddwd %%mm6, %%mm1 \n\t"
2135 "pmaddwd %%mm6, %%mm2 \n\t"
2136 "pmaddwd %%mm6, %%mm3 \n\t"
2137 #ifndef FAST_BGR2YV12
2138 "psrad $8, %%mm0 \n\t"
2139 "psrad $8, %%mm1 \n\t"
2140 "psrad $8, %%mm2 \n\t"
2141 "psrad $8, %%mm3 \n\t"
2143 "packssdw %%mm1, %%mm0 \n\t"
2144 "packssdw %%mm3, %%mm2 \n\t"
2145 "pmaddwd %%mm5, %%mm0 \n\t"
2146 "pmaddwd %%mm5, %%mm2 \n\t"
2147 "packssdw %%mm2, %%mm0 \n\t"
2148 "psraw $7, %%mm0 \n\t"
2150 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2151 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2152 "punpcklbw %%mm7, %%mm4 \n\t"
2153 "punpcklbw %%mm7, %%mm1 \n\t"
2154 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2155 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2156 "punpcklbw %%mm7, %%mm2 \n\t"
2157 "punpcklbw %%mm7, %%mm3 \n\t"
2158 "pmaddwd %%mm6, %%mm4 \n\t"
2159 "pmaddwd %%mm6, %%mm1 \n\t"
2160 "pmaddwd %%mm6, %%mm2 \n\t"
2161 "pmaddwd %%mm6, %%mm3 \n\t"
2162 #ifndef FAST_BGR2YV12
2163 "psrad $8, %%mm4 \n\t"
2164 "psrad $8, %%mm1 \n\t"
2165 "psrad $8, %%mm2 \n\t"
2166 "psrad $8, %%mm3 \n\t"
2168 "packssdw %%mm1, %%mm4 \n\t"
2169 "packssdw %%mm3, %%mm2 \n\t"
2170 "pmaddwd %%mm5, %%mm4 \n\t"
2171 "pmaddwd %%mm5, %%mm2 \n\t"
2172 "add $24, %%"REG_d" \n\t"
2173 "packssdw %%mm2, %%mm4 \n\t"
2174 "psraw $7, %%mm4 \n\t"
2176 "packuswb %%mm4, %%mm0 \n\t"
2177 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2179 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2180 "add $8, %%"REG_a" \n\t"
/* counts from -width up to 0; pointers are pre-biased by width */
2182 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2183 : "%"REG_a, "%"REG_d
/* chroma pass: averages 2x2 pixel blocks across this line (%0) and the
   next (%1), then applies bgr2UCoeff/bgr2VCoeff and bgr2UVOffset
   (the +128 of the fallback formula); writes 4 U and 4 V bytes per
   iteration */
2190 "mov %4, %%"REG_a" \n\t"
2191 "movq "MANGLE(w1111)", %%mm5 \n\t"
2192 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2193 "pxor %%mm7, %%mm7 \n\t"
2194 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2195 "add %%"REG_d", %%"REG_d" \n\t"
2198 PREFETCH" 64(%0, %%"REG_d") \n\t"
2199 PREFETCH" 64(%1, %%"REG_d") \n\t"
/* MMX2/3DNow variant: use PAVGB to average the two lines and the two
   neighbouring pixels */
2200 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2201 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2202 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2203 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2204 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2205 PAVGB" %%mm1, %%mm0 \n\t"
2206 PAVGB" %%mm3, %%mm2 \n\t"
2207 "movq %%mm0, %%mm1 \n\t"
2208 "movq %%mm2, %%mm3 \n\t"
2209 "psrlq $24, %%mm0 \n\t"
2210 "psrlq $24, %%mm2 \n\t"
2211 PAVGB" %%mm1, %%mm0 \n\t"
2212 PAVGB" %%mm3, %%mm2 \n\t"
2213 "punpcklbw %%mm7, %%mm0 \n\t"
2214 "punpcklbw %%mm7, %%mm2 \n\t"
/* plain-MMX variant: widen to 16 bit, add the four samples, divide by 4 */
2216 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2217 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2218 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2219 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2220 "punpcklbw %%mm7, %%mm0 \n\t"
2221 "punpcklbw %%mm7, %%mm1 \n\t"
2222 "punpcklbw %%mm7, %%mm2 \n\t"
2223 "punpcklbw %%mm7, %%mm3 \n\t"
2224 "paddw %%mm1, %%mm0 \n\t"
2225 "paddw %%mm3, %%mm2 \n\t"
2226 "paddw %%mm2, %%mm0 \n\t"
2227 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2228 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2229 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2230 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2231 "punpcklbw %%mm7, %%mm4 \n\t"
2232 "punpcklbw %%mm7, %%mm1 \n\t"
2233 "punpcklbw %%mm7, %%mm2 \n\t"
2234 "punpcklbw %%mm7, %%mm3 \n\t"
2235 "paddw %%mm1, %%mm4 \n\t"
2236 "paddw %%mm3, %%mm2 \n\t"
2237 "paddw %%mm4, %%mm2 \n\t"
2238 "psrlw $2, %%mm0 \n\t"
2239 "psrlw $2, %%mm2 \n\t"
2241 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2242 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2244 "pmaddwd %%mm0, %%mm1 \n\t"
2245 "pmaddwd %%mm2, %%mm3 \n\t"
2246 "pmaddwd %%mm6, %%mm0 \n\t"
2247 "pmaddwd %%mm6, %%mm2 \n\t"
2248 #ifndef FAST_BGR2YV12
2249 "psrad $8, %%mm0 \n\t"
2250 "psrad $8, %%mm1 \n\t"
2251 "psrad $8, %%mm2 \n\t"
2252 "psrad $8, %%mm3 \n\t"
2254 "packssdw %%mm2, %%mm0 \n\t"
2255 "packssdw %%mm3, %%mm1 \n\t"
2256 "pmaddwd %%mm5, %%mm0 \n\t"
2257 "pmaddwd %%mm5, %%mm1 \n\t"
2258 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2259 "psraw $7, %%mm0 \n\t"
/* second half of the chroma line: same computation for pixels 2 and 3 */
2261 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2262 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2263 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2264 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2265 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2266 PAVGB" %%mm1, %%mm4 \n\t"
2267 PAVGB" %%mm3, %%mm2 \n\t"
2268 "movq %%mm4, %%mm1 \n\t"
2269 "movq %%mm2, %%mm3 \n\t"
2270 "psrlq $24, %%mm4 \n\t"
2271 "psrlq $24, %%mm2 \n\t"
2272 PAVGB" %%mm1, %%mm4 \n\t"
2273 PAVGB" %%mm3, %%mm2 \n\t"
2274 "punpcklbw %%mm7, %%mm4 \n\t"
2275 "punpcklbw %%mm7, %%mm2 \n\t"
2277 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2278 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2279 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2280 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2281 "punpcklbw %%mm7, %%mm4 \n\t"
2282 "punpcklbw %%mm7, %%mm1 \n\t"
2283 "punpcklbw %%mm7, %%mm2 \n\t"
2284 "punpcklbw %%mm7, %%mm3 \n\t"
2285 "paddw %%mm1, %%mm4 \n\t"
2286 "paddw %%mm3, %%mm2 \n\t"
2287 "paddw %%mm2, %%mm4 \n\t"
2288 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2289 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2290 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2291 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2292 "punpcklbw %%mm7, %%mm5 \n\t"
2293 "punpcklbw %%mm7, %%mm1 \n\t"
2294 "punpcklbw %%mm7, %%mm2 \n\t"
2295 "punpcklbw %%mm7, %%mm3 \n\t"
2296 "paddw %%mm1, %%mm5 \n\t"
2297 "paddw %%mm3, %%mm2 \n\t"
2298 "paddw %%mm5, %%mm2 \n\t"
2299 "movq "MANGLE(w1111)", %%mm5 \n\t"
2300 "psrlw $2, %%mm4 \n\t"
2301 "psrlw $2, %%mm2 \n\t"
2303 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2304 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2306 "pmaddwd %%mm4, %%mm1 \n\t"
2307 "pmaddwd %%mm2, %%mm3 \n\t"
2308 "pmaddwd %%mm6, %%mm4 \n\t"
2309 "pmaddwd %%mm6, %%mm2 \n\t"
2310 #ifndef FAST_BGR2YV12
2311 "psrad $8, %%mm4 \n\t"
2312 "psrad $8, %%mm1 \n\t"
2313 "psrad $8, %%mm2 \n\t"
2314 "psrad $8, %%mm3 \n\t"
2316 "packssdw %%mm2, %%mm4 \n\t"
2317 "packssdw %%mm3, %%mm1 \n\t"
2318 "pmaddwd %%mm5, %%mm4 \n\t"
2319 "pmaddwd %%mm5, %%mm1 \n\t"
2320 "add $24, %%"REG_d" \n\t"
2321 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2322 "psraw $7, %%mm4 \n\t"
/* merge the two halves, apply the +128 offset and split into the U row
   (low dword) and V row (high dword) */
2324 "movq %%mm0, %%mm1 \n\t"
2325 "punpckldq %%mm4, %%mm0 \n\t"
2326 "punpckhdq %%mm4, %%mm1 \n\t"
2327 "packsswb %%mm1, %%mm0 \n\t"
2328 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2329 "movd %%mm0, (%2, %%"REG_a") \n\t"
2330 "punpckhdq %%mm0, %%mm0 \n\t"
2331 "movd %%mm0, (%3, %%"REG_a") \n\t"
2332 "add $4, %%"REG_a" \n\t"
2334 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2335 : "%"REG_a, "%"REG_d
2338 udst += chromStride;
2339 vdst += chromStride;
2343 asm volatile( EMMS" \n\t"
/* C fallback for the remaining (and, without MMX, all) line pairs:
   integer RGB->YUV with RY/GY/BY etc. coefficient macros */
2349 for(; y<height; y+=2)
2352 for(i=0; i<chromWidth; i++)
2354 unsigned int b= src[6*i+0];
2355 unsigned int g= src[6*i+1];
2356 unsigned int r= src[6*i+2];
2358 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2359 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2360 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2370 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/* second line of the pair: luma only, chroma comes from the first line */
2376 for(i=0; i<chromWidth; i++)
2378 unsigned int b= src[6*i+0];
2379 unsigned int g= src[6*i+1];
2380 unsigned int r= src[6*i+2];
2382 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2390 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2393 udst += chromStride;
2394 vdst += chromStride;
/*
 * Interleaves two byte planes row by row:
 * dest[2*w+0] = src1[w], dest[2*w+1] = src2[w].
 * SSE2 path handles 16 source bytes per iteration, MMX path 16 bytes via
 * two 8-byte registers; a C tail covers width not a multiple of 16, and
 * a pure-C loop covers builds without MMX.
 * NOTE(review): the SSE2 path uses movdqa, which requires 16-byte-aligned
 * src1/src2 — confirm callers guarantee this.
 */
2400 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2401 long width, long height, long src1Stride,
2402 long src2Stride, long dstStride){
2405 for(h=0; h < height; h++)
/* SSE2 path: one 16-byte load of src1 (twice, for low/high unpack) and
   one of src2, interleaved with punpcklbw/punpckhbw, stored with
   non-temporal movntdq */
2412 "xor %%"REG_a", %%"REG_a" \n\t"
2414 PREFETCH" 64(%1, %%"REG_a") \n\t"
2415 PREFETCH" 64(%2, %%"REG_a") \n\t"
2416 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2417 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2418 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2419 "punpcklbw %%xmm2, %%xmm0 \n\t"
2420 "punpckhbw %%xmm2, %%xmm1 \n\t"
2421 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2422 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2423 "add $16, %%"REG_a" \n\t"
2424 "cmp %3, %%"REG_a" \n\t"
2426 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2427 : "memory", "%"REG_a""
/* MMX path: same interleave using four 8-byte MMX registers */
2431 "xor %%"REG_a", %%"REG_a" \n\t"
2433 PREFETCH" 64(%1, %%"REG_a") \n\t"
2434 PREFETCH" 64(%2, %%"REG_a") \n\t"
2435 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2436 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2437 "movq %%mm0, %%mm1 \n\t"
2438 "movq %%mm2, %%mm3 \n\t"
2439 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2440 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2441 "punpcklbw %%mm4, %%mm0 \n\t"
2442 "punpckhbw %%mm4, %%mm1 \n\t"
2443 "punpcklbw %%mm5, %%mm2 \n\t"
2444 "punpckhbw %%mm5, %%mm3 \n\t"
2445 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2446 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2447 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2448 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2449 "add $16, %%"REG_a" \n\t"
2450 "cmp %3, %%"REG_a" \n\t"
2452 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2453 : "memory", "%"REG_a
/* C tail: finish the last (width % 16) bytes of the row */
2456 for(w= (width&(~15)); w < width; w++)
2458 dest[2*w+0] = src1[w];
2459 dest[2*w+1] = src2[w];
/* pure-C path: whole row */
2462 for(w=0; w < width; w++)
2464 dest[2*w+0] = src1[w];
2465 dest[2*w+1] = src2[w];
/*
 * Upsamples two quarter-resolution planes (e.g. the chroma planes of
 * YVU9) to half resolution: each source byte is doubled horizontally
 * (punpcklbw/punpckhbw of a register with itself), and each source row
 * is emitted twice vertically via the srcStride*(y>>1) row addressing.
 * MMX path expands 32 source bytes to 64 output bytes per iteration;
 * a C tail handles the remainder.
 */
2481 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2482 uint8_t *dst1, uint8_t *dst2,
2483 long width, long height,
2484 long srcStride1, long srcStride2,
2485 long dstStride1, long dstStride2)
2488 w=width/2; h=height/2;
2493 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* first plane: y>>1 repeats each source row on two output rows */
2496 const uint8_t* s1=src1+srcStride1*(y>>1);
2497 uint8_t* d=dst1+dstStride1*y;
2504 "movq %1, %%mm0\n\t"
2505 "movq 8%1, %%mm2\n\t"
2506 "movq 16%1, %%mm4\n\t"
2507 "movq 24%1, %%mm6\n\t"
2508 "movq %%mm0, %%mm1\n\t"
2509 "movq %%mm2, %%mm3\n\t"
2510 "movq %%mm4, %%mm5\n\t"
2511 "movq %%mm6, %%mm7\n\t"
/* unpacking a register with itself duplicates every byte: AB.. -> AABB.. */
2512 "punpcklbw %%mm0, %%mm0\n\t"
2513 "punpckhbw %%mm1, %%mm1\n\t"
2514 "punpcklbw %%mm2, %%mm2\n\t"
2515 "punpckhbw %%mm3, %%mm3\n\t"
2516 "punpcklbw %%mm4, %%mm4\n\t"
2517 "punpckhbw %%mm5, %%mm5\n\t"
2518 "punpcklbw %%mm6, %%mm6\n\t"
2519 "punpckhbw %%mm7, %%mm7\n\t"
2520 MOVNTQ" %%mm0, %0\n\t"
2521 MOVNTQ" %%mm1, 8%0\n\t"
2522 MOVNTQ" %%mm2, 16%0\n\t"
2523 MOVNTQ" %%mm3, 24%0\n\t"
2524 MOVNTQ" %%mm4, 32%0\n\t"
2525 MOVNTQ" %%mm5, 40%0\n\t"
2526 MOVNTQ" %%mm6, 48%0\n\t"
2527 MOVNTQ" %%mm7, 56%0"
/* C tail: byte-double the remaining columns */
2533 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* second plane: identical processing */
2536 const uint8_t* s2=src2+srcStride2*(y>>1);
2537 uint8_t* d=dst2+dstStride2*y;
2544 "movq %1, %%mm0\n\t"
2545 "movq 8%1, %%mm2\n\t"
2546 "movq 16%1, %%mm4\n\t"
2547 "movq 24%1, %%mm6\n\t"
2548 "movq %%mm0, %%mm1\n\t"
2549 "movq %%mm2, %%mm3\n\t"
2550 "movq %%mm4, %%mm5\n\t"
2551 "movq %%mm6, %%mm7\n\t"
2552 "punpcklbw %%mm0, %%mm0\n\t"
2553 "punpckhbw %%mm1, %%mm1\n\t"
2554 "punpcklbw %%mm2, %%mm2\n\t"
2555 "punpckhbw %%mm3, %%mm3\n\t"
2556 "punpcklbw %%mm4, %%mm4\n\t"
2557 "punpckhbw %%mm5, %%mm5\n\t"
2558 "punpcklbw %%mm6, %%mm6\n\t"
2559 "punpckhbw %%mm7, %%mm7\n\t"
2560 MOVNTQ" %%mm0, %0\n\t"
2561 MOVNTQ" %%mm1, 8%0\n\t"
2562 MOVNTQ" %%mm2, 16%0\n\t"
2563 MOVNTQ" %%mm3, 24%0\n\t"
2564 MOVNTQ" %%mm4, 32%0\n\t"
2565 MOVNTQ" %%mm5, 40%0\n\t"
2566 MOVNTQ" %%mm6, 48%0\n\t"
2567 MOVNTQ" %%mm7, 56%0"
2573 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2584 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2586 long width, long height,
2587 long srcStride1, long srcStride2,
2588 long srcStride3, long dstStride)
2591 w=width/2; h=height;
2593 const uint8_t* yp=src1+srcStride1*y;
2594 const uint8_t* up=src2+srcStride2*(y>>2);
2595 const uint8_t* vp=src3+srcStride3*(y>>2);
2596 uint8_t* d=dst+dstStride*y;
2602 PREFETCH" 32(%1, %0)\n\t"
2603 PREFETCH" 32(%2, %0)\n\t"
2604 PREFETCH" 32(%3, %0)\n\t"
2605 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2606 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2607 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2608 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2609 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2610 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2611 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2612 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2613 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2614 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2616 "movq %%mm1, %%mm6\n\t"
2617 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2618 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2619 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2620 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2621 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2623 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2624 "movq 8(%1, %0, 4), %%mm0\n\t"
2625 "movq %%mm0, %%mm3\n\t"
2626 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2627 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2628 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2629 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2631 "movq %%mm4, %%mm6\n\t"
2632 "movq 16(%1, %0, 4), %%mm0\n\t"
2633 "movq %%mm0, %%mm3\n\t"
2634 "punpcklbw %%mm5, %%mm4\n\t"
2635 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2636 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2637 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2638 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2640 "punpckhbw %%mm5, %%mm6\n\t"
2641 "movq 24(%1, %0, 4), %%mm0\n\t"
2642 "movq %%mm0, %%mm3\n\t"
2643 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2644 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2645 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2646 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2649 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2655 const long x2= x<<2;