3 * rgb2rgb.c, Software RGB to RGB convertor
4 * pluralize by Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor
7 * Written by Nick Kurshev.
8 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
9 * lot of big-endian byteorder fixes by Alex Beregszaszi
11 * This file is part of FFmpeg.
13 * FFmpeg is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * FFmpeg is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with FFmpeg; if not, write to the Free Software
25 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
/* <inttypes.h> is included here only to obtain __WORDSIZE. */
29 #include <inttypes.h> /* for __WORDSIZE */
32 // #warning You have misconfigured system and probably will lose performance!
/* Fallback: take the word size from MP_WORDSIZE (the guarding conditional is outside the visible lines). */
33 #define __WORDSIZE MP_WORDSIZE
/*
 * Instruction mnemonics that are spliced into the inline-asm strings below.
 * Each group is one arm of a CPU-feature conditional; only the HAVE_MMX2
 * test is visible in this excerpt.
 */
/* First arm: 3DNow! mnemonics (condition not visible - presumably HAVE_3DNOW). */
51 #define PREFETCH "prefetch"
52 #define PREFETCHW "prefetchw"
53 #define PAVGB "pavgusb"
54 #elif defined ( HAVE_MMX2 )
55 #define PREFETCH "prefetchnta"
56 #define PREFETCHW "prefetcht0"
/* Remaining arm: the prefetch macros expand to an assembler comment, so presumably no instruction is emitted. */
63 #define PREFETCH " # nop"
64 #define PREFETCHW " # nop"
69 /* On K6 femms is faster than emms. On K7 femms is directly mapped onto emms. */
/* Store and fence mnemonics used for the output writes; the " # nop" SFENCE is the variant without a fence. */
76 #define MOVNTQ "movntq"
77 #define SFENCE "sfence"
80 #define SFENCE " # nop"
/*
 * rgb24to32: expand packed 3-byte pixels from src into 4-byte pixels in dst.
 * src_size is presumably the source length in bytes (TODO confirm; its use
 * lies outside the lines visible here).
 */
83 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
86 const uint8_t *s = src;
89 const uint8_t *mm_end;
/* Prefetch the beginning of the source buffer. */
93 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = mask32; it is ANDed into every expanded register below. */
95 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
/* Each movd/punpckldq pair puts two source pixels (taken 3 bytes apart) into the two dword lanes of one register. */
101 "punpckldq 3%1, %%mm0\n\t"
102 "movd 6%1, %%mm1\n\t"
103 "punpckldq 9%1, %%mm1\n\t"
104 "movd 12%1, %%mm2\n\t"
105 "punpckldq 15%1, %%mm2\n\t"
106 "movd 18%1, %%mm3\n\t"
107 "punpckldq 21%1, %%mm3\n\t"
/* Apply mask32 to every pixel pair (presumably clears the byte that has no source data; mask32 is defined elsewhere). */
108 "pand %%mm7, %%mm0\n\t"
109 "pand %%mm7, %%mm1\n\t"
110 "pand %%mm7, %%mm2\n\t"
111 "pand %%mm7, %%mm3\n\t"
/* Write the expanded pixels out through MOVNTQ. */
112 MOVNTQ" %%mm0, %0\n\t"
113 MOVNTQ" %%mm1, 8%0\n\t"
114 MOVNTQ" %%mm2, 16%0\n\t"
/* Fence the MOVNTQ stores, then issue EMMS (presumably emms/femms) to leave MMX state. */
122 __asm __volatile(SFENCE:::"memory");
123 __asm __volatile(EMMS:::"memory");
127 #ifdef WORDS_BIGENDIAN
128 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
/*
 * rgb32to24: pack 4-byte pixels from src into 3-byte pixels in dst.
 * src_size is presumably the source length in bytes (TODO confirm; its use
 * lies outside the lines visible here).
 */
143 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
146 const uint8_t *s = src;
149 const uint8_t *mm_end;
/* Prefetch the beginning of the source buffer. */
153 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* Load the following source quadwords; mm0 is loaded in a line not visible here. */
160 "movq 8%1, %%mm1\n\t"
161 "movq 16%1, %%mm4\n\t"
162 "movq 24%1, %%mm5\n\t"
/* Keep a second copy of every quadword and shift the copies right by one byte. */
163 "movq %%mm0, %%mm2\n\t"
164 "movq %%mm1, %%mm3\n\t"
165 "movq %%mm4, %%mm6\n\t"
166 "movq %%mm5, %%mm7\n\t"
167 "psrlq $8, %%mm2\n\t"
168 "psrlq $8, %%mm3\n\t"
169 "psrlq $8, %%mm6\n\t"
170 "psrlq $8, %%mm7\n\t"
/* Merge original and shifted copy (the masking between the shift and this OR is not visible here). */
179 "por %%mm2, %%mm0\n\t"
180 "por %%mm3, %%mm1\n\t"
181 "por %%mm6, %%mm4\n\t"
182 "por %%mm7, %%mm5\n\t"
/* Slide the packed registers together so the output quadwords are contiguous. */
184 "movq %%mm1, %%mm2\n\t"
185 "movq %%mm4, %%mm3\n\t"
186 "psllq $48, %%mm2\n\t"
187 "psllq $32, %%mm3\n\t"
190 "por %%mm2, %%mm0\n\t"
191 "psrlq $16, %%mm1\n\t"
192 "psrlq $32, %%mm4\n\t"
193 "psllq $16, %%mm5\n\t"
194 "por %%mm3, %%mm1\n\t"
196 "por %%mm5, %%mm4\n\t"
/* Store the packed result through MOVNTQ. */
198 MOVNTQ" %%mm0, %0\n\t"
199 MOVNTQ" %%mm1, 8%0\n\t"
/* Inputs: the source plus the mask24* constants (defined elsewhere). */
202 :"m"(*s),"m"(mask24l),
203 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
208 __asm __volatile(SFENCE:::"memory");
209 __asm __volatile(EMMS:::"memory");
213 #ifdef WORDS_BIGENDIAN
214 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
230 Original by Strepto/Astral
231 ported to gcc & bugfixed : A'rpi
232 MMX2, 3DNOW optimization by Nick Kurshev
233 32bit c version, and and&add trick by Michael Niedermayer
/*
 * rgb15to16: convert 15-bit pixels to 16-bit pixels.
 * The scalar code adds the upper ten bits of each pixel (mask 0x7FE0) to the
 * pixel itself, which moves those bits up by one position while the low five
 * bits stay where they are. src_size is presumably the source length in bytes
 * (its use is outside the visible lines).
 */
235 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
237 register const uint8_t* s=src;
238 register uint8_t* d=dst;
239 register const uint8_t *end;
240 const uint8_t *mm_end;
243 __asm __volatile(PREFETCH" %0"::"m"(*s));
/* mm4 = mask15s (defined elsewhere; presumably selects the bits that must move up). */
244 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
/* MMX form of the and&add trick: masked copy + original, two quadwords per step. */
251 "movq 8%1, %%mm2\n\t"
252 "movq %%mm0, %%mm1\n\t"
253 "movq %%mm2, %%mm3\n\t"
254 "pand %%mm4, %%mm0\n\t"
255 "pand %%mm4, %%mm2\n\t"
256 "paddw %%mm1, %%mm0\n\t"
257 "paddw %%mm3, %%mm2\n\t"
258 MOVNTQ" %%mm0, %0\n\t"
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
266 __asm __volatile(SFENCE:::"memory");
267 __asm __volatile(EMMS:::"memory");
/* Scalar path, two pixels at a time: x + (x & 0x7FE0) per 16-bit half. */
272 register unsigned x= *((uint32_t *)s);
273 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
/* Scalar path for a single pixel. */
279 register unsigned short x= *((uint16_t *)s);
280 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
/*
 * rgb16to15: convert 16-bit pixels to 15-bit pixels.
 * The scalar code shifts the pixel right by one and keeps bits 5-14 of the
 * result (mask 0x7FE0), then ORs in the unshifted low five bits (mask 0x001F).
 * src_size is presumably the source length in bytes (its use is outside the
 * visible lines).
 */
284 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
286 register const uint8_t* s=src;
287 register uint8_t* d=dst;
288 register const uint8_t *end;
289 const uint8_t *mm_end;
292 __asm __volatile(PREFETCH" %0"::"m"(*s));
/* mm7 = mask15rg (applied to the shifted copy), mm6 = mask15b (applied to the unshifted copy). */
293 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
294 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
/* MMX form: (x >> 1) & mask15rg | x & mask15b, two quadwords per step. */
301 "movq 8%1, %%mm2\n\t"
302 "movq %%mm0, %%mm1\n\t"
303 "movq %%mm2, %%mm3\n\t"
304 "psrlq $1, %%mm0\n\t"
305 "psrlq $1, %%mm2\n\t"
306 "pand %%mm7, %%mm0\n\t"
307 "pand %%mm7, %%mm2\n\t"
308 "pand %%mm6, %%mm1\n\t"
309 "pand %%mm6, %%mm3\n\t"
310 "por %%mm1, %%mm0\n\t"
311 "por %%mm3, %%mm2\n\t"
312 MOVNTQ" %%mm0, %0\n\t"
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
320 __asm __volatile(SFENCE:::"memory");
321 __asm __volatile(EMMS:::"memory");
/* Scalar path, two pixels at a time. */
326 register uint32_t x= *((uint32_t *)s);
327 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
/* Scalar path for a single pixel. */
333 register uint16_t x= *((uint16_t *)s);
334 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
/*
 * rgb32to16: convert 4-byte pixels at src to 16-bit pixels at dst.
 * Per the scalar tail the result is
 *   bits 0-4   = top 5 bits of the low byte of the 32-bit word,
 *   bits 5-10  = top 6 bits of the second byte,
 *   bits 11-15 = top 5 bits of the third byte;
 * the fourth byte is ignored. src_size is presumably the source length in
 * bytes (its use is outside the visible lines).
 */
340 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
342 const uint8_t *s = src;
345 const uint8_t *mm_end;
347 uint16_t *d = (uint16_t *)dst;
/* Variant 1: multiply-based packing (pmaddwd). */
351 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
/* Load the three constants listed in the operand line below (presumably %3..%5 = mask3216g, mask3216br, mul3216). */
353 "movq %3, %%mm5 \n\t"
354 "movq %4, %%mm6 \n\t"
355 "movq %5, %%mm7 \n\t"
358 PREFETCH" 32(%1) \n\t"
/* mm0 = pixels 0 and 2, mm3 = pixels 1 and 3. */
359 "movd (%1), %%mm0 \n\t"
360 "movd 4(%1), %%mm3 \n\t"
361 "punpckldq 8(%1), %%mm0 \n\t"
362 "punpckldq 12(%1), %%mm3 \n\t"
363 "movq %%mm0, %%mm1 \n\t"
364 "movq %%mm3, %%mm4 \n\t"
/* Mask with mm6 and multiply-add with mm7; the copies are masked with mm5 and ORed back in. */
365 "pand %%mm6, %%mm0 \n\t"
366 "pand %%mm6, %%mm3 \n\t"
367 "pmaddwd %%mm7, %%mm0 \n\t"
368 "pmaddwd %%mm7, %%mm3 \n\t"
369 "pand %%mm5, %%mm1 \n\t"
370 "pand %%mm5, %%mm4 \n\t"
371 "por %%mm1, %%mm0 \n\t"
372 "por %%mm4, %%mm3 \n\t"
/* Even pixels end up in the low half of each dword, odd pixels in the high half. */
373 "psrld $5, %%mm0 \n\t"
374 "pslld $11, %%mm3 \n\t"
375 "por %%mm3, %%mm0 \n\t"
376 MOVNTQ" %%mm0, (%0) \n\t"
382 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
/* Variant 2: shift-and-mask packing. */
385 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* Operands of the statement that loads mm6/mm7 (its instruction lines are not visible here). */
389 ::"m"(red_16mask),"m"(green_16mask));
395 "movd 4%1, %%mm3\n\t"
396 "punpckldq 8%1, %%mm0\n\t"
397 "punpckldq 12%1, %%mm3\n\t"
398 "movq %%mm0, %%mm1\n\t"
399 "movq %%mm0, %%mm2\n\t"
400 "movq %%mm3, %%mm4\n\t"
401 "movq %%mm3, %%mm5\n\t"
/* Three copies per register, shifted by 3, 5 and 8 bits - the same shifts as the scalar tail. */
402 "psrlq $3, %%mm0\n\t"
403 "psrlq $3, %%mm3\n\t"
406 "psrlq $5, %%mm1\n\t"
407 "psrlq $5, %%mm4\n\t"
408 "pand %%mm6, %%mm1\n\t"
409 "pand %%mm6, %%mm4\n\t"
410 "psrlq $8, %%mm2\n\t"
411 "psrlq $8, %%mm5\n\t"
412 "pand %%mm7, %%mm2\n\t"
413 "pand %%mm7, %%mm5\n\t"
414 "por %%mm1, %%mm0\n\t"
415 "por %%mm4, %%mm3\n\t"
416 "por %%mm2, %%mm0\n\t"
417 "por %%mm5, %%mm3\n\t"
/* Move the odd pixels into the upper 16 bits of each dword and combine. */
418 "psllq $16, %%mm3\n\t"
419 "por %%mm3, %%mm0\n\t"
420 MOVNTQ" %%mm0, %0\n\t"
421 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
426 __asm __volatile(SFENCE:::"memory");
427 __asm __volatile(EMMS:::"memory");
/* Scalar tail: one 32-bit pixel in, one 16-bit pixel out. */
431 register int rgb = *(uint32_t*)s; s += 4;
432 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
/*
 * rgb32tobgr16: convert 4-byte pixels at src to 16-bit pixels at dst with the
 * outer components swapped relative to rgb32to16.
 * Per the scalar tail the result is
 *   bits 11-15 = top 5 bits of the low byte of the 32-bit word,
 *   bits 5-10  = top 6 bits of the second byte,
 *   bits 0-4   = top 5 bits of the third byte.
 * src_size is presumably the source length in bytes (its use is outside the
 * visible lines).
 */
436 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
438 const uint8_t *s = src;
441 const uint8_t *mm_end;
443 uint16_t *d = (uint16_t *)dst;
446 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* Operands of the statement that loads mm6/mm7 (its instruction lines are not visible here). */
450 ::"m"(red_16mask),"m"(green_16mask));
457 "movd 4%1, %%mm3\n\t"
458 "punpckldq 8%1, %%mm0\n\t"
459 "punpckldq 12%1, %%mm3\n\t"
460 "movq %%mm0, %%mm1\n\t"
461 "movq %%mm0, %%mm2\n\t"
462 "movq %%mm3, %%mm4\n\t"
463 "movq %%mm3, %%mm5\n\t"
/* Three copies per register: shifted left 8, right 5 and right 19 - the same shifts as the scalar tail. */
464 "psllq $8, %%mm0\n\t"
465 "psllq $8, %%mm3\n\t"
466 "pand %%mm7, %%mm0\n\t"
467 "pand %%mm7, %%mm3\n\t"
468 "psrlq $5, %%mm1\n\t"
469 "psrlq $5, %%mm4\n\t"
470 "pand %%mm6, %%mm1\n\t"
471 "pand %%mm6, %%mm4\n\t"
472 "psrlq $19, %%mm2\n\t"
473 "psrlq $19, %%mm5\n\t"
476 "por %%mm1, %%mm0\n\t"
477 "por %%mm4, %%mm3\n\t"
478 "por %%mm2, %%mm0\n\t"
479 "por %%mm5, %%mm3\n\t"
/* Move the odd pixels into the upper 16 bits of each dword and combine. */
480 "psllq $16, %%mm3\n\t"
481 "por %%mm3, %%mm0\n\t"
482 MOVNTQ" %%mm0, %0\n\t"
483 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
487 __asm __volatile(SFENCE:::"memory");
488 __asm __volatile(EMMS:::"memory");
/* Scalar tail: one 32-bit pixel in, one 16-bit pixel out. */
492 register int rgb = *(uint32_t*)s; s += 4;
493 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
/*
 * rgb32to15: convert 4-byte pixels at src to 15-bit pixels at dst.
 * Per the scalar tail the result is
 *   bits 0-4   = top 5 bits of the low byte of the 32-bit word,
 *   bits 5-9   = top 5 bits of the second byte,
 *   bits 10-14 = top 5 bits of the third byte.
 * src_size is presumably the source length in bytes (its use is outside the
 * visible lines).
 */
497 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
499 const uint8_t *s = src;
502 const uint8_t *mm_end;
504 uint16_t *d = (uint16_t *)dst;
/* Variant 1: multiply-based packing (pmaddwd). */
508 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
/* Load the three constants listed in the operand line below (presumably %3..%5 = mask3215g, mask3216br, mul3215). */
510 "movq %3, %%mm5 \n\t"
511 "movq %4, %%mm6 \n\t"
512 "movq %5, %%mm7 \n\t"
515 PREFETCH" 32(%1) \n\t"
/* mm0 = pixels 0 and 2, mm3 = pixels 1 and 3. */
516 "movd (%1), %%mm0 \n\t"
517 "movd 4(%1), %%mm3 \n\t"
518 "punpckldq 8(%1), %%mm0 \n\t"
519 "punpckldq 12(%1), %%mm3 \n\t"
520 "movq %%mm0, %%mm1 \n\t"
521 "movq %%mm3, %%mm4 \n\t"
/* Mask with mm6 and multiply-add with mm7; the copies are masked with mm5 and ORed back in. */
522 "pand %%mm6, %%mm0 \n\t"
523 "pand %%mm6, %%mm3 \n\t"
524 "pmaddwd %%mm7, %%mm0 \n\t"
525 "pmaddwd %%mm7, %%mm3 \n\t"
526 "pand %%mm5, %%mm1 \n\t"
527 "pand %%mm5, %%mm4 \n\t"
528 "por %%mm1, %%mm0 \n\t"
529 "por %%mm4, %%mm3 \n\t"
/* Even pixels end up in the low half of each dword, odd pixels in the high half. */
530 "psrld $6, %%mm0 \n\t"
531 "pslld $10, %%mm3 \n\t"
532 "por %%mm3, %%mm0 \n\t"
533 MOVNTQ" %%mm0, (%0) \n\t"
/* Note: the blue/red mask operand is mask3216br here as well (shared with the 16-bit routine). */
539 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
/* Variant 2: shift-and-mask packing. */
542 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* Operands of the statement that loads mm6/mm7 (its instruction lines are not visible here). */
546 ::"m"(red_15mask),"m"(green_15mask));
552 "movd 4%1, %%mm3\n\t"
553 "punpckldq 8%1, %%mm0\n\t"
554 "punpckldq 12%1, %%mm3\n\t"
555 "movq %%mm0, %%mm1\n\t"
556 "movq %%mm0, %%mm2\n\t"
557 "movq %%mm3, %%mm4\n\t"
558 "movq %%mm3, %%mm5\n\t"
/* Three copies per register, shifted by 3, 6 and 9 bits - the same shifts as the scalar tail. */
559 "psrlq $3, %%mm0\n\t"
560 "psrlq $3, %%mm3\n\t"
563 "psrlq $6, %%mm1\n\t"
564 "psrlq $6, %%mm4\n\t"
565 "pand %%mm6, %%mm1\n\t"
566 "pand %%mm6, %%mm4\n\t"
567 "psrlq $9, %%mm2\n\t"
568 "psrlq $9, %%mm5\n\t"
569 "pand %%mm7, %%mm2\n\t"
570 "pand %%mm7, %%mm5\n\t"
571 "por %%mm1, %%mm0\n\t"
572 "por %%mm4, %%mm3\n\t"
573 "por %%mm2, %%mm0\n\t"
574 "por %%mm5, %%mm3\n\t"
/* Move the odd pixels into the upper 16 bits of each dword and combine. */
575 "psllq $16, %%mm3\n\t"
576 "por %%mm3, %%mm0\n\t"
577 MOVNTQ" %%mm0, %0\n\t"
578 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
583 __asm __volatile(SFENCE:::"memory");
584 __asm __volatile(EMMS:::"memory");
/* Scalar tail: one 32-bit pixel in, one 15-bit pixel out. */
588 register int rgb = *(uint32_t*)s; s += 4;
589 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
/*
 * rgb32tobgr15: convert 4-byte pixels at src to 15-bit pixels at dst with the
 * outer components swapped relative to rgb32to15.
 * Per the scalar tail the result is
 *   bits 10-14 = top 5 bits of the low byte of the 32-bit word,
 *   bits 5-9   = top 5 bits of the second byte,
 *   bits 0-4   = top 5 bits of the third byte.
 * src_size is presumably the source length in bytes (its use is outside the
 * visible lines).
 */
593 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
595 const uint8_t *s = src;
598 const uint8_t *mm_end;
600 uint16_t *d = (uint16_t *)dst;
603 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* Operands of the statement that loads mm6/mm7 (its instruction lines are not visible here). */
607 ::"m"(red_15mask),"m"(green_15mask));
614 "movd 4%1, %%mm3\n\t"
615 "punpckldq 8%1, %%mm0\n\t"
616 "punpckldq 12%1, %%mm3\n\t"
617 "movq %%mm0, %%mm1\n\t"
618 "movq %%mm0, %%mm2\n\t"
619 "movq %%mm3, %%mm4\n\t"
620 "movq %%mm3, %%mm5\n\t"
/* Three copies per register: shifted left 7, right 6 and right 19 - the same shifts as the scalar tail. */
621 "psllq $7, %%mm0\n\t"
622 "psllq $7, %%mm3\n\t"
623 "pand %%mm7, %%mm0\n\t"
624 "pand %%mm7, %%mm3\n\t"
625 "psrlq $6, %%mm1\n\t"
626 "psrlq $6, %%mm4\n\t"
627 "pand %%mm6, %%mm1\n\t"
628 "pand %%mm6, %%mm4\n\t"
629 "psrlq $19, %%mm2\n\t"
630 "psrlq $19, %%mm5\n\t"
633 "por %%mm1, %%mm0\n\t"
634 "por %%mm4, %%mm3\n\t"
635 "por %%mm2, %%mm0\n\t"
636 "por %%mm5, %%mm3\n\t"
/* Move the odd pixels into the upper 16 bits of each dword and combine. */
637 "psllq $16, %%mm3\n\t"
638 "por %%mm3, %%mm0\n\t"
639 MOVNTQ" %%mm0, %0\n\t"
640 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
644 __asm __volatile(SFENCE:::"memory");
645 __asm __volatile(EMMS:::"memory");
/* Scalar tail: one 32-bit pixel in, one 15-bit pixel out. */
649 register int rgb = *(uint32_t*)s; s += 4;
650 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
/*
 * rgb24to16: convert packed 3-byte pixels at src to 16-bit pixels at dst.
 * The scalar tail builds each output as b>>3 in bits 0-4, the top 6 bits of g
 * in bits 5-10 and the top 5 bits of r in bits 11-15. b, g and r are declared
 * in lines not visible here - presumably the three source bytes of a pixel.
 * src_size is presumably the source length in bytes.
 */
654 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
656 const uint8_t *s = src;
659 const uint8_t *mm_end;
661 uint16_t *d = (uint16_t *)dst;
664 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* Operands of the statement that loads mm6/mm7 (its instruction lines are not visible here). */
668 ::"m"(red_16mask),"m"(green_16mask));
/* Pixels are gathered 3 bytes apart: mm0 gets offsets 0 (load not visible) and 6, mm3 gets offsets 3 and 9. */
675 "movd 3%1, %%mm3\n\t"
676 "punpckldq 6%1, %%mm0\n\t"
677 "punpckldq 9%1, %%mm3\n\t"
678 "movq %%mm0, %%mm1\n\t"
679 "movq %%mm0, %%mm2\n\t"
680 "movq %%mm3, %%mm4\n\t"
681 "movq %%mm3, %%mm5\n\t"
/* Three copies per register, shifted right by 3, 5 and 8 bits, masked and ORed together. */
682 "psrlq $3, %%mm0\n\t"
683 "psrlq $3, %%mm3\n\t"
686 "psrlq $5, %%mm1\n\t"
687 "psrlq $5, %%mm4\n\t"
688 "pand %%mm6, %%mm1\n\t"
689 "pand %%mm6, %%mm4\n\t"
690 "psrlq $8, %%mm2\n\t"
691 "psrlq $8, %%mm5\n\t"
692 "pand %%mm7, %%mm2\n\t"
693 "pand %%mm7, %%mm5\n\t"
694 "por %%mm1, %%mm0\n\t"
695 "por %%mm4, %%mm3\n\t"
696 "por %%mm2, %%mm0\n\t"
697 "por %%mm5, %%mm3\n\t"
/* Move the odd pixels into the upper 16 bits of each dword and combine. */
698 "psllq $16, %%mm3\n\t"
699 "por %%mm3, %%mm0\n\t"
700 MOVNTQ" %%mm0, %0\n\t"
701 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
705 __asm __volatile(SFENCE:::"memory");
706 __asm __volatile(EMMS:::"memory");
/* Scalar tail: pack one pixel. */
713 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb24tobgr16: convert packed 3-byte pixels at src to 16-bit pixels at dst.
 * The scalar tail uses the same packing expression as rgb24to16 (b in bits
 * 0-4, g in bits 5-10, r in bits 11-15); b, g and r are assigned in lines not
 * visible here, so any difference in component order - presumably the order
 * in which the source bytes are read - cannot be confirmed from this excerpt.
 * src_size is presumably the source length in bytes.
 */
717 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
719 const uint8_t *s = src;
722 const uint8_t *mm_end;
724 uint16_t *d = (uint16_t *)dst;
727 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* Operands of the statement that loads mm6/mm7 (its instruction lines are not visible here). */
731 ::"m"(red_16mask),"m"(green_16mask));
/* Pixels are gathered 3 bytes apart: mm0 gets offsets 0 (load not visible) and 6, mm3 gets offsets 3 and 9. */
738 "movd 3%1, %%mm3\n\t"
739 "punpckldq 6%1, %%mm0\n\t"
740 "punpckldq 9%1, %%mm3\n\t"
741 "movq %%mm0, %%mm1\n\t"
742 "movq %%mm0, %%mm2\n\t"
743 "movq %%mm3, %%mm4\n\t"
744 "movq %%mm3, %%mm5\n\t"
/* Three copies per register: shifted left 8, right 5 and right 19, masked and ORed together. */
745 "psllq $8, %%mm0\n\t"
746 "psllq $8, %%mm3\n\t"
747 "pand %%mm7, %%mm0\n\t"
748 "pand %%mm7, %%mm3\n\t"
749 "psrlq $5, %%mm1\n\t"
750 "psrlq $5, %%mm4\n\t"
751 "pand %%mm6, %%mm1\n\t"
752 "pand %%mm6, %%mm4\n\t"
753 "psrlq $19, %%mm2\n\t"
754 "psrlq $19, %%mm5\n\t"
757 "por %%mm1, %%mm0\n\t"
758 "por %%mm4, %%mm3\n\t"
759 "por %%mm2, %%mm0\n\t"
760 "por %%mm5, %%mm3\n\t"
/* Move the odd pixels into the upper 16 bits of each dword and combine. */
761 "psllq $16, %%mm3\n\t"
762 "por %%mm3, %%mm0\n\t"
763 MOVNTQ" %%mm0, %0\n\t"
764 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
768 __asm __volatile(SFENCE:::"memory");
769 __asm __volatile(EMMS:::"memory");
/* Scalar tail: pack one pixel. */
776 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb24to15: convert packed 3-byte pixels at src to 15-bit pixels at dst.
 * The scalar tail builds each output as b>>3 in bits 0-4, the top 5 bits of g
 * in bits 5-9 and the top 5 bits of r in bits 10-14. b, g and r are declared
 * in lines not visible here - presumably the three source bytes of a pixel.
 * src_size is presumably the source length in bytes.
 */
780 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
782 const uint8_t *s = src;
785 const uint8_t *mm_end;
787 uint16_t *d = (uint16_t *)dst;
790 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* Operands of the statement that loads mm6/mm7 (its instruction lines are not visible here). */
794 ::"m"(red_15mask),"m"(green_15mask));
/* Pixels are gathered 3 bytes apart: mm0 gets offsets 0 (load not visible) and 6, mm3 gets offsets 3 and 9. */
801 "movd 3%1, %%mm3\n\t"
802 "punpckldq 6%1, %%mm0\n\t"
803 "punpckldq 9%1, %%mm3\n\t"
804 "movq %%mm0, %%mm1\n\t"
805 "movq %%mm0, %%mm2\n\t"
806 "movq %%mm3, %%mm4\n\t"
807 "movq %%mm3, %%mm5\n\t"
/* Three copies per register, shifted right by 3, 6 and 9 bits, masked and ORed together. */
808 "psrlq $3, %%mm0\n\t"
809 "psrlq $3, %%mm3\n\t"
812 "psrlq $6, %%mm1\n\t"
813 "psrlq $6, %%mm4\n\t"
814 "pand %%mm6, %%mm1\n\t"
815 "pand %%mm6, %%mm4\n\t"
816 "psrlq $9, %%mm2\n\t"
817 "psrlq $9, %%mm5\n\t"
818 "pand %%mm7, %%mm2\n\t"
819 "pand %%mm7, %%mm5\n\t"
820 "por %%mm1, %%mm0\n\t"
821 "por %%mm4, %%mm3\n\t"
822 "por %%mm2, %%mm0\n\t"
823 "por %%mm5, %%mm3\n\t"
/* Move the odd pixels into the upper 16 bits of each dword and combine. */
824 "psllq $16, %%mm3\n\t"
825 "por %%mm3, %%mm0\n\t"
826 MOVNTQ" %%mm0, %0\n\t"
827 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
831 __asm __volatile(SFENCE:::"memory");
832 __asm __volatile(EMMS:::"memory");
/* Scalar tail: pack one pixel. */
839 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * rgb24tobgr15: convert packed 3-byte pixels at src to 15-bit pixels at dst.
 * The scalar tail uses the same packing expression as rgb24to15 (b in bits
 * 0-4, g in bits 5-9, r in bits 10-14); b, g and r are assigned in lines not
 * visible here, so any difference in component order - presumably the order
 * in which the source bytes are read - cannot be confirmed from this excerpt.
 * src_size is presumably the source length in bytes.
 */
843 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
845 const uint8_t *s = src;
848 const uint8_t *mm_end;
850 uint16_t *d = (uint16_t *)dst;
853 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
/* Operands of the statement that loads mm6/mm7 (its instruction lines are not visible here). */
857 ::"m"(red_15mask),"m"(green_15mask));
/* Pixels are gathered 3 bytes apart: mm0 gets offsets 0 (load not visible) and 6, mm3 gets offsets 3 and 9. */
864 "movd 3%1, %%mm3\n\t"
865 "punpckldq 6%1, %%mm0\n\t"
866 "punpckldq 9%1, %%mm3\n\t"
867 "movq %%mm0, %%mm1\n\t"
868 "movq %%mm0, %%mm2\n\t"
869 "movq %%mm3, %%mm4\n\t"
870 "movq %%mm3, %%mm5\n\t"
/* Three copies per register: shifted left 7, right 6 and right 19, masked and ORed together. */
871 "psllq $7, %%mm0\n\t"
872 "psllq $7, %%mm3\n\t"
873 "pand %%mm7, %%mm0\n\t"
874 "pand %%mm7, %%mm3\n\t"
875 "psrlq $6, %%mm1\n\t"
876 "psrlq $6, %%mm4\n\t"
877 "pand %%mm6, %%mm1\n\t"
878 "pand %%mm6, %%mm4\n\t"
879 "psrlq $19, %%mm2\n\t"
880 "psrlq $19, %%mm5\n\t"
883 "por %%mm1, %%mm0\n\t"
884 "por %%mm4, %%mm3\n\t"
885 "por %%mm2, %%mm0\n\t"
886 "por %%mm5, %%mm3\n\t"
/* Move the odd pixels into the upper 16 bits of each dword and combine. */
887 "psllq $16, %%mm3\n\t"
888 "por %%mm3, %%mm0\n\t"
889 MOVNTQ" %%mm0, %0\n\t"
890 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
894 __asm __volatile(SFENCE:::"memory");
895 __asm __volatile(EMMS:::"memory");
/* Scalar tail: pack one pixel. */
902 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
907 Here I use a less accurate approximation by simply
908 left-shifting the input
909 value and filling the low-order bits with
910 zeroes. This method improves PNG
911 compression, but this scheme cannot reproduce white exactly, since it does not
912 generate an all-ones maximum value; the net effect is to darken the
915 A better method would be "left bit replication":
925 | Leftmost Bits Repeated to Fill Open Bits
/*
 * rgb15to24: expand 15-bit pixels at src into 3-byte pixels at dst.
 * Each 5-bit field becomes the top five bits of one output byte and the low
 * three bits are left zero (see the scalar tail). src_size is the source
 * length in bytes: the end pointer is s + src_size/2 with s counting 16-bit
 * pixels.
 */
929 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
933 const uint16_t *mm_end;
935 uint8_t *d = (uint8_t *)dst;
/* Keep the const qualifier of src when reinterpreting it as 16-bit pixels (matches rgb16to24 and rgb15to32). */
936 const uint16_t *s = (const uint16_t *)src;
937 end = s + src_size/2;
939 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* First group of pixels: move each field to bits 3-7 of its word (the loads and masking are in lines not visible here). */
951 "psllq $3, %%mm0\n\t"
952 "psrlq $2, %%mm1\n\t"
953 "psrlq $7, %%mm2\n\t"
954 "movq %%mm0, %%mm3\n\t"
955 "movq %%mm1, %%mm4\n\t"
956 "movq %%mm2, %%mm5\n\t"
/* Widen words to dwords by interleaving with %5 (mmx_null per the operand list below). */
957 "punpcklwd %5, %%mm0\n\t"
958 "punpcklwd %5, %%mm1\n\t"
959 "punpcklwd %5, %%mm2\n\t"
960 "punpckhwd %5, %%mm3\n\t"
961 "punpckhwd %5, %%mm4\n\t"
962 "punpckhwd %5, %%mm5\n\t"
/* Place the three components in consecutive bytes of each dword. */
963 "psllq $8, %%mm1\n\t"
964 "psllq $16, %%mm2\n\t"
965 "por %%mm1, %%mm0\n\t"
966 "por %%mm2, %%mm0\n\t"
967 "psllq $8, %%mm4\n\t"
968 "psllq $16, %%mm5\n\t"
969 "por %%mm4, %%mm3\n\t"
970 "por %%mm5, %%mm3\n\t"
/* Park the first group in mm6/mm7 while the second group is processed. */
972 "movq %%mm0, %%mm6\n\t"
973 "movq %%mm3, %%mm7\n\t"
/* Second group of pixels, same steps, from source offset 8. */
975 "movq 8%1, %%mm0\n\t"
976 "movq 8%1, %%mm1\n\t"
977 "movq 8%1, %%mm2\n\t"
981 "psllq $3, %%mm0\n\t"
982 "psrlq $2, %%mm1\n\t"
983 "psrlq $7, %%mm2\n\t"
984 "movq %%mm0, %%mm3\n\t"
985 "movq %%mm1, %%mm4\n\t"
986 "movq %%mm2, %%mm5\n\t"
987 "punpcklwd %5, %%mm0\n\t"
988 "punpcklwd %5, %%mm1\n\t"
989 "punpcklwd %5, %%mm2\n\t"
990 "punpckhwd %5, %%mm3\n\t"
991 "punpckhwd %5, %%mm4\n\t"
992 "punpckhwd %5, %%mm5\n\t"
993 "psllq $8, %%mm1\n\t"
994 "psllq $16, %%mm2\n\t"
995 "por %%mm1, %%mm0\n\t"
996 "por %%mm2, %%mm0\n\t"
997 "psllq $8, %%mm4\n\t"
998 "psllq $16, %%mm5\n\t"
999 "por %%mm4, %%mm3\n\t"
1000 "por %%mm5, %%mm3\n\t"
1003 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
/* Second asm statement: pack the four dword-per-pixel registers down to 3 bytes per pixel. */
1005 /* Borrowed 32 to 24 */
1007 "movq %%mm0, %%mm4\n\t"
1008 "movq %%mm3, %%mm5\n\t"
1009 "movq %%mm6, %%mm0\n\t"
1010 "movq %%mm7, %%mm1\n\t"
1012 "movq %%mm4, %%mm6\n\t"
1013 "movq %%mm5, %%mm7\n\t"
1014 "movq %%mm0, %%mm2\n\t"
1015 "movq %%mm1, %%mm3\n\t"
1017 "psrlq $8, %%mm2\n\t"
1018 "psrlq $8, %%mm3\n\t"
1019 "psrlq $8, %%mm6\n\t"
1020 "psrlq $8, %%mm7\n\t"
/* %2 = mask24l and %3 = mask24h per the operand list of this statement. */
1021 "pand %2, %%mm0\n\t"
1022 "pand %2, %%mm1\n\t"
1023 "pand %2, %%mm4\n\t"
1024 "pand %2, %%mm5\n\t"
1025 "pand %3, %%mm2\n\t"
1026 "pand %3, %%mm3\n\t"
1027 "pand %3, %%mm6\n\t"
1028 "pand %3, %%mm7\n\t"
1029 "por %%mm2, %%mm0\n\t"
1030 "por %%mm3, %%mm1\n\t"
1031 "por %%mm6, %%mm4\n\t"
1032 "por %%mm7, %%mm5\n\t"
/* Slide the packed registers together into three contiguous output quadwords. */
1034 "movq %%mm1, %%mm2\n\t"
1035 "movq %%mm4, %%mm3\n\t"
1036 "psllq $48, %%mm2\n\t"
1037 "psllq $32, %%mm3\n\t"
1038 "pand %4, %%mm2\n\t"
1039 "pand %5, %%mm3\n\t"
1040 "por %%mm2, %%mm0\n\t"
1041 "psrlq $16, %%mm1\n\t"
1042 "psrlq $32, %%mm4\n\t"
1043 "psllq $16, %%mm5\n\t"
1044 "por %%mm3, %%mm1\n\t"
1045 "pand %6, %%mm5\n\t"
1046 "por %%mm5, %%mm4\n\t"
1048 MOVNTQ" %%mm0, %0\n\t"
1049 MOVNTQ" %%mm1, 8%0\n\t"
1050 MOVNTQ" %%mm4, 16%0"
1053 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
1058 __asm __volatile(SFENCE:::"memory");
1059 __asm __volatile(EMMS:::"memory");
/* Scalar tail: bits 0-4, 5-9 and 10-14 of the pixel each become the top five bits of one output byte. */
1063 register uint16_t bgr;
1065 *d++ = (bgr&0x1F)<<3;
1066 *d++ = (bgr&0x3E0)>>2;
1067 *d++ = (bgr&0x7C00)>>7;
/*
 * rgb16to24: expand 16-bit pixels at src into 3-byte pixels at dst.
 * Bits 0-4, 5-10 and 11-15 of each pixel become the top bits of three
 * consecutive output bytes; the remaining low bits are zero (see the scalar
 * tail). src_size is the source length in bytes: the end pointer is
 * s + src_size/2 with s counting 16-bit pixels.
 */
1071 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1073 const uint16_t *end;
1075 const uint16_t *mm_end;
1077 uint8_t *d = (uint8_t *)dst;
1078 const uint16_t *s = (const uint16_t *)src;
1079 end = s + src_size/2;
1081 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* First group of pixels: three copies, each masked to one field (%2..%4 = mask16b, mask16g, mask16r). */
1087 "movq %1, %%mm0\n\t"
1088 "movq %1, %%mm1\n\t"
1089 "movq %1, %%mm2\n\t"
1090 "pand %2, %%mm0\n\t"
1091 "pand %3, %%mm1\n\t"
1092 "pand %4, %%mm2\n\t"
/* Align every field to the top of a byte. */
1093 "psllq $3, %%mm0\n\t"
1094 "psrlq $3, %%mm1\n\t"
1095 "psrlq $8, %%mm2\n\t"
1096 "movq %%mm0, %%mm3\n\t"
1097 "movq %%mm1, %%mm4\n\t"
1098 "movq %%mm2, %%mm5\n\t"
/* Widen words to dwords by interleaving with %5 (mmx_null per the operand list below). */
1099 "punpcklwd %5, %%mm0\n\t"
1100 "punpcklwd %5, %%mm1\n\t"
1101 "punpcklwd %5, %%mm2\n\t"
1102 "punpckhwd %5, %%mm3\n\t"
1103 "punpckhwd %5, %%mm4\n\t"
1104 "punpckhwd %5, %%mm5\n\t"
/* Place the three components in consecutive bytes of each dword. */
1105 "psllq $8, %%mm1\n\t"
1106 "psllq $16, %%mm2\n\t"
1107 "por %%mm1, %%mm0\n\t"
1108 "por %%mm2, %%mm0\n\t"
1109 "psllq $8, %%mm4\n\t"
1110 "psllq $16, %%mm5\n\t"
1111 "por %%mm4, %%mm3\n\t"
1112 "por %%mm5, %%mm3\n\t"
/* Park the first group in mm6/mm7 while the second group is processed. */
1114 "movq %%mm0, %%mm6\n\t"
1115 "movq %%mm3, %%mm7\n\t"
/* Second group of pixels, same steps, from source offset 8. */
1117 "movq 8%1, %%mm0\n\t"
1118 "movq 8%1, %%mm1\n\t"
1119 "movq 8%1, %%mm2\n\t"
1120 "pand %2, %%mm0\n\t"
1121 "pand %3, %%mm1\n\t"
1122 "pand %4, %%mm2\n\t"
1123 "psllq $3, %%mm0\n\t"
1124 "psrlq $3, %%mm1\n\t"
1125 "psrlq $8, %%mm2\n\t"
1126 "movq %%mm0, %%mm3\n\t"
1127 "movq %%mm1, %%mm4\n\t"
1128 "movq %%mm2, %%mm5\n\t"
1129 "punpcklwd %5, %%mm0\n\t"
1130 "punpcklwd %5, %%mm1\n\t"
1131 "punpcklwd %5, %%mm2\n\t"
1132 "punpckhwd %5, %%mm3\n\t"
1133 "punpckhwd %5, %%mm4\n\t"
1134 "punpckhwd %5, %%mm5\n\t"
1135 "psllq $8, %%mm1\n\t"
1136 "psllq $16, %%mm2\n\t"
1137 "por %%mm1, %%mm0\n\t"
1138 "por %%mm2, %%mm0\n\t"
1139 "psllq $8, %%mm4\n\t"
1140 "psllq $16, %%mm5\n\t"
1141 "por %%mm4, %%mm3\n\t"
1142 "por %%mm5, %%mm3\n\t"
1144 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
/* Second asm statement: pack the four dword-per-pixel registers down to 3 bytes per pixel. */
1146 /* Borrowed 32 to 24 */
1148 "movq %%mm0, %%mm4\n\t"
1149 "movq %%mm3, %%mm5\n\t"
1150 "movq %%mm6, %%mm0\n\t"
1151 "movq %%mm7, %%mm1\n\t"
1153 "movq %%mm4, %%mm6\n\t"
1154 "movq %%mm5, %%mm7\n\t"
1155 "movq %%mm0, %%mm2\n\t"
1156 "movq %%mm1, %%mm3\n\t"
1158 "psrlq $8, %%mm2\n\t"
1159 "psrlq $8, %%mm3\n\t"
1160 "psrlq $8, %%mm6\n\t"
1161 "psrlq $8, %%mm7\n\t"
/* %2 = mask24l and %3 = mask24h per the operand list of this statement. */
1162 "pand %2, %%mm0\n\t"
1163 "pand %2, %%mm1\n\t"
1164 "pand %2, %%mm4\n\t"
1165 "pand %2, %%mm5\n\t"
1166 "pand %3, %%mm2\n\t"
1167 "pand %3, %%mm3\n\t"
1168 "pand %3, %%mm6\n\t"
1169 "pand %3, %%mm7\n\t"
1170 "por %%mm2, %%mm0\n\t"
1171 "por %%mm3, %%mm1\n\t"
1172 "por %%mm6, %%mm4\n\t"
1173 "por %%mm7, %%mm5\n\t"
/* Slide the packed registers together into three contiguous output quadwords. */
1175 "movq %%mm1, %%mm2\n\t"
1176 "movq %%mm4, %%mm3\n\t"
1177 "psllq $48, %%mm2\n\t"
1178 "psllq $32, %%mm3\n\t"
1179 "pand %4, %%mm2\n\t"
1180 "pand %5, %%mm3\n\t"
1181 "por %%mm2, %%mm0\n\t"
1182 "psrlq $16, %%mm1\n\t"
1183 "psrlq $32, %%mm4\n\t"
1184 "psllq $16, %%mm5\n\t"
1185 "por %%mm3, %%mm1\n\t"
1186 "pand %6, %%mm5\n\t"
1187 "por %%mm5, %%mm4\n\t"
1189 MOVNTQ" %%mm0, %0\n\t"
1190 MOVNTQ" %%mm1, 8%0\n\t"
1191 MOVNTQ" %%mm4, 16%0"
1194 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
1199 __asm __volatile(SFENCE:::"memory");
1200 __asm __volatile(EMMS:::"memory");
/* Scalar tail: 5-, 6- and 5-bit fields each become the top bits of one output byte. */
1204 register uint16_t bgr;
1206 *d++ = (bgr&0x1F)<<3;
1207 *d++ = (bgr&0x7E0)>>3;
1208 *d++ = (bgr&0xF800)>>8;
/*
 * rgb15to32: expand 15-bit pixels at src into 4-byte pixels at dst.
 * Each 5-bit field becomes the top five bits of one output byte. The scalar
 * tail writes the three component bytes in opposite order on big-endian
 * builds; the write of the fourth byte is in lines not visible here.
 * src_size is the source length in bytes: the end pointer is s + src_size/2
 * with s counting 16-bit pixels.
 */
1212 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1214 const uint16_t *end;
1216 const uint16_t *mm_end;
1218 uint8_t *d = (uint8_t *)dst;
1219 const uint16_t *s = (const uint16_t *)src;
1220 end = s + src_size/2;
1222 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = 0, used as the zero half when words are widened to dwords. */
1223 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
/* Three copies of four pixels, each masked to one field (%2..%4 = mask15b, mask15g, mask15r). */
1229 "movq %1, %%mm0\n\t"
1230 "movq %1, %%mm1\n\t"
1231 "movq %1, %%mm2\n\t"
1232 "pand %2, %%mm0\n\t"
1233 "pand %3, %%mm1\n\t"
1234 "pand %4, %%mm2\n\t"
/* Align every field to bits 3-7 of its word. */
1235 "psllq $3, %%mm0\n\t"
1236 "psrlq $2, %%mm1\n\t"
1237 "psrlq $7, %%mm2\n\t"
1238 "movq %%mm0, %%mm3\n\t"
1239 "movq %%mm1, %%mm4\n\t"
1240 "movq %%mm2, %%mm5\n\t"
/* Zero-extend words to dwords: low two pixels in mm0-mm2, high two in mm3-mm5. */
1241 "punpcklwd %%mm7, %%mm0\n\t"
1242 "punpcklwd %%mm7, %%mm1\n\t"
1243 "punpcklwd %%mm7, %%mm2\n\t"
1244 "punpckhwd %%mm7, %%mm3\n\t"
1245 "punpckhwd %%mm7, %%mm4\n\t"
1246 "punpckhwd %%mm7, %%mm5\n\t"
/* Place the three components in bytes 0, 1 and 2 of each dword. */
1247 "psllq $8, %%mm1\n\t"
1248 "psllq $16, %%mm2\n\t"
1249 "por %%mm1, %%mm0\n\t"
1250 "por %%mm2, %%mm0\n\t"
1251 "psllq $8, %%mm4\n\t"
1252 "psllq $16, %%mm5\n\t"
1253 "por %%mm4, %%mm3\n\t"
1254 "por %%mm5, %%mm3\n\t"
1255 MOVNTQ" %%mm0, %0\n\t"
1256 MOVNTQ" %%mm3, 8%0\n\t"
1258 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
1263 __asm __volatile(SFENCE:::"memory");
1264 __asm __volatile(EMMS:::"memory");
/* Disabled single-store variant of the scalar tail. */
1268 #if 0 //slightly slower on athlon
1270 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1272 register uint16_t bgr;
/* Big-endian builds emit the bits 10-14 field first; otherwise the bits 0-4 field comes first. */
1274 #ifdef WORDS_BIGENDIAN
1276 *d++ = (bgr&0x7C00)>>7;
1277 *d++ = (bgr&0x3E0)>>2;
1278 *d++ = (bgr&0x1F)<<3;
1280 *d++ = (bgr&0x1F)<<3;
1281 *d++ = (bgr&0x3E0)>>2;
1282 *d++ = (bgr&0x7C00)>>7;
/*
 * rgb16to32: expand 16-bit pixels at src into 4-byte pixels at dst.
 * Bits 0-4, 5-10 and 11-15 of each pixel become the top bits of three output
 * bytes. The scalar tail writes those bytes in opposite order on big-endian
 * builds; the write of the fourth byte is in lines not visible here.
 * src_size is the source length in bytes: the end pointer is s + src_size/2
 * with s counting 16-bit pixels.
 */
1290 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1292 const uint16_t *end;
1294 const uint16_t *mm_end;
1296 uint8_t *d = (uint8_t *)dst;
/* Keep the const qualifier of src when reinterpreting it as 16-bit pixels (matches rgb16to24 and rgb15to32). */
1297 const uint16_t *s = (const uint16_t *)src;
1298 end = s + src_size/2;
1300 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 = 0, used as the zero half when words are widened to dwords. */
1301 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
/* Three copies of four pixels, each masked to one field (%2..%4 = mask16b, mask16g, mask16r). */
1307 "movq %1, %%mm0\n\t"
1308 "movq %1, %%mm1\n\t"
1309 "movq %1, %%mm2\n\t"
1310 "pand %2, %%mm0\n\t"
1311 "pand %3, %%mm1\n\t"
1312 "pand %4, %%mm2\n\t"
/* Align every field to the top of a byte. */
1313 "psllq $3, %%mm0\n\t"
1314 "psrlq $3, %%mm1\n\t"
1315 "psrlq $8, %%mm2\n\t"
1316 "movq %%mm0, %%mm3\n\t"
1317 "movq %%mm1, %%mm4\n\t"
1318 "movq %%mm2, %%mm5\n\t"
/* Zero-extend words to dwords: low two pixels in mm0-mm2, high two in mm3-mm5. */
1319 "punpcklwd %%mm7, %%mm0\n\t"
1320 "punpcklwd %%mm7, %%mm1\n\t"
1321 "punpcklwd %%mm7, %%mm2\n\t"
1322 "punpckhwd %%mm7, %%mm3\n\t"
1323 "punpckhwd %%mm7, %%mm4\n\t"
1324 "punpckhwd %%mm7, %%mm5\n\t"
/* Place the three components in bytes 0, 1 and 2 of each dword. */
1325 "psllq $8, %%mm1\n\t"
1326 "psllq $16, %%mm2\n\t"
1327 "por %%mm1, %%mm0\n\t"
1328 "por %%mm2, %%mm0\n\t"
1329 "psllq $8, %%mm4\n\t"
1330 "psllq $16, %%mm5\n\t"
1331 "por %%mm4, %%mm3\n\t"
1332 "por %%mm5, %%mm3\n\t"
1333 MOVNTQ" %%mm0, %0\n\t"
1334 MOVNTQ" %%mm3, 8%0\n\t"
1336 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
1341 __asm __volatile(SFENCE:::"memory");
1342 __asm __volatile(EMMS:::"memory");
1346 register uint16_t bgr;
/* Big-endian builds emit the bits 11-15 field first; otherwise the bits 0-4 field comes first. */
1348 #ifdef WORDS_BIGENDIAN
1350 *d++ = (bgr&0xF800)>>8;
1351 *d++ = (bgr&0x7E0)>>3;
1352 *d++ = (bgr&0x1F)<<3;
1354 *d++ = (bgr&0x1F)<<3;
1355 *d++ = (bgr&0x7E0)>>3;
1356 *d++ = (bgr&0xF800)>>8;
/*
 * rgb32tobgr32: swap two of the four bytes of every 4-byte pixel while
 * copying from src to dst. In the scalar code the little-endian build
 * exchanges bytes 0 and 2 and copies byte 1; the big-endian build exchanges
 * bytes 1 and 3 and copies byte 2. The handling of the remaining byte is in
 * lines not visible here. src_size is the source length in bytes
 * (num_pixels = src_size >> 2).
 */
1362 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1365 /* TODO: unroll this loop */
/* REG_a is the running byte offset, starting at 0. */
1367 "xor %%"REG_a", %%"REG_a" \n\t"
1370 PREFETCH" 32(%0, %%"REG_a") \n\t"
/* Two pixels per step: one copy shifted left 16, one shifted right 16, one unshifted. */
1371 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1372 "movq %%mm0, %%mm1 \n\t"
1373 "movq %%mm0, %%mm2 \n\t"
1374 "pslld $16, %%mm0 \n\t"
1375 "psrld $16, %%mm1 \n\t"
/* Keep one component from each copy and merge them. */
1376 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1377 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1378 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1379 "por %%mm0, %%mm2 \n\t"
1380 "por %%mm1, %%mm2 \n\t"
1381 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
/* Advance by 8 bytes and compare with the limit operand (src_size-7); the branch is in a line not visible here. */
1382 "add $8, %%"REG_a" \n\t"
1383 "cmp %2, %%"REG_a" \n\t"
1385 :: "r" (src), "r"(dst), "r" (src_size-7)
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
1389 __asm __volatile(SFENCE:::"memory");
1390 __asm __volatile(EMMS:::"memory");
/* Scalar path: one 4-byte pixel per iteration. */
1393 unsigned num_pixels = src_size >> 2;
1394 for(i=0; i<num_pixels; i++)
1396 #ifdef WORDS_BIGENDIAN
1397 dst[4*i + 1] = src[4*i + 3];
1398 dst[4*i + 2] = src[4*i + 2];
1399 dst[4*i + 3] = src[4*i + 1];
1401 dst[4*i + 0] = src[4*i + 2];
1402 dst[4*i + 1] = src[4*i + 1];
1403 dst[4*i + 2] = src[4*i + 0];
/*
 * rgb24tobgr24: exchange the first and third byte of every 3-byte pixel while
 * copying from src to dst. The MMX loop handles 24 bytes per iteration; a
 * scalar loop then runs over the bytes described by the recomputed src_size
 * (presumably the leftover tail - the pointer adjustments between the two
 * loops are in lines not visible here).
 */
1409 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
/* Biased counter: starts at 23 - src_size and grows by 24 per MMX iteration. */
1413 long mmx_size= 23 - src_size;
/* mm5/mm6/mm7 = mask24r/mask24g/mask24b, one mask per byte position within a pixel. */
1415 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1416 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1417 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1420 PREFETCH" 32(%1, %%"REG_a") \n\t"
/* First output quadword: three differently aligned views of the source, one mask each, ORed together. */
1421 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1422 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1423 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1424 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1425 "pand %%mm5, %%mm0 \n\t"
1426 "pand %%mm6, %%mm1 \n\t"
1427 "pand %%mm7, %%mm2 \n\t"
1428 "por %%mm0, %%mm1 \n\t"
1429 "por %%mm2, %%mm1 \n\t"
/* Second output quadword (the masks rotate because 8 is not a multiple of 3). */
1430 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1431 MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
1432 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1433 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1434 "pand %%mm7, %%mm0 \n\t"
1435 "pand %%mm5, %%mm1 \n\t"
1436 "pand %%mm6, %%mm2 \n\t"
1437 "por %%mm0, %%mm1 \n\t"
1438 "por %%mm2, %%mm1 \n\t"
/* Third output quadword. */
1439 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1440 MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
1441 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1442 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1443 "pand %%mm6, %%mm0 \n\t"
1444 "pand %%mm7, %%mm1 \n\t"
1445 "pand %%mm5, %%mm2 \n\t"
1446 "por %%mm0, %%mm1 \n\t"
1447 "por %%mm2, %%mm1 \n\t"
1448 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
/* Next 24-byte group; the loop branch is in a line not visible here. */
1449 "add $24, %%"REG_a" \n\t"
/* The base pointers are pre-offset by -mmx_size so that base + REG_a addresses the current group. */
1452 : "r" (src-mmx_size), "r"(dst-mmx_size)
/* Fence the MOVNTQ stores, then issue EMMS to leave MMX state. */
1455 __asm __volatile(SFENCE:::"memory");
1456 __asm __volatile(EMMS:::"memory");
1458 if(mmx_size==23) return; //finished, was multiple of 8 (presumably 8 pixels, i.e. 24 bytes)
/* Undo the bias to get the byte count for the scalar loop below. */
1462 src_size= 23-mmx_size;
/* Scalar loop: one 3-byte pixel per iteration (the store of dst[i + 0] is in a line not visible here). */
1466 for(i=0; i<src_size; i+=3)
1470 dst[i + 1] = src[i + 1];
1471 dst[i + 2] = src[i + 0];
/*
 * yuvPlanartoyuy2: interleave separate Y, U and V planes into packed
 * Y0 U Y1 V groups in dst (byte order as produced by the little-endian
 * scalar paths; the MMX comments call it YUYV).
 *
 * ysrc, usrc, vsrc  source planes
 * dst               packed output
 * width, height     picture size; chromWidth is width/2
 * lumStride, chromStride, dstStride
 *                   per-line strides; only the use of chromStride is visible
 *                   in this excerpt (presumably the others advance ysrc/dst)
 * vertLumPerChroma  number of luma lines that share one chroma line; the
 *                   chroma pointers advance when (y & (vertLumPerChroma-1))
 *                   equals vertLumPerChroma-1, which assumes a power of two
 */
1476 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1477 long width, long height,
1478 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1481 const long chromWidth= width>>1;
1482 for(y=0; y<height; y++)
1485 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
/* MMX path: REG_a counts chroma samples; 8 chroma + 16 luma bytes in, 32 bytes out per step. */
1487 "xor %%"REG_a", %%"REG_a" \n\t"
1490 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1491 PREFETCH" 32(%2, %%"REG_a") \n\t"
1492 PREFETCH" 32(%3, %%"REG_a") \n\t"
1493 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1494 "movq %%mm0, %%mm2 \n\t" // U(0)
1495 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1496 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1497 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1499 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1500 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1501 "movq %%mm3, %%mm4 \n\t" // Y(0)
1502 "movq %%mm5, %%mm6 \n\t" // Y(8)
1503 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1504 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1505 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1506 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1508 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1509 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1510 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1511 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1513 "add $8, %%"REG_a" \n\t"
1514 "cmp %4, %%"REG_a" \n\t"
1516 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
/* Alpha (MVI) path; it works on this line and the next one (qdst2/yc2) together. */
1521 #if defined ARCH_ALPHA && defined HAVE_MVI
1522 #define pl2yuy2(n) \
1527 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1528 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1529 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1530 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1531 yuv1 = (u << 8) + (v << 24); \
1538 uint64_t *qdst = (uint64_t *) dst;
1539 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1540 const uint32_t *yc = (uint32_t *) ysrc;
1541 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1542 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1543 for(i = 0; i < chromWidth; i += 8){
1544 uint64_t y1, y2, yuv1, yuv2;
/* Loads into $31 act as prefetches of the four input streams. */
1547 asm("ldq $31,64(%0)" :: "r"(yc));
1548 asm("ldq $31,64(%0)" :: "r"(yc2));
1549 asm("ldq $31,64(%0)" :: "r"(uc));
1550 asm("ldq $31,64(%0)" :: "r"(vc));
/* Generic 64-bit path: two chroma pairs (four luma samples) per 64-bit store. */
1568 #elif __WORDSIZE >= 64
1570 uint64_t *ldst = (uint64_t *) dst;
1571 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1572 for(i = 0; i < chromWidth; i += 2){
/*
 * The V byte is shifted as unsigned. As a promoted int, vc[n] << 24 overflows
 * for vc[n] >= 128; the (in practice negative) sum would then sign-extend
 * when widened, setting the upper 32 bits of k and corrupting the l half
 * that is added in below.
 * NOTE(review): k and l are declared in a line not visible here - presumably uint64_t.
 */
1574 k = yc[0] + (uc[0] << 8) +
1575 (yc[1] << 16) + ((unsigned)vc[0] << 24);
1576 l = yc[2] + (uc[1] << 8) +
1577 (yc[3] << 16) + ((unsigned)vc[1] << 24);
1578 *ldst++ = k + (l << 32);
/* Generic 32-bit path: one chroma pair (two luma samples) per 32-bit store. */
1585 int i, *idst = (int32_t *) dst;
1586 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1587 for(i = 0; i < chromWidth; i++){
1588 #ifdef WORDS_BIGENDIAN
1589 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1590 (yc[1] << 8) + (vc[0] << 0);
1592 *idst++ = yc[0] + (uc[0] << 8) +
1593 (yc[1] << 16) + (vc[0] << 24);
/* Step to the next chroma line only after the last luma line that shares it. */
1601 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1603 usrc += chromStride;
1604 vsrc += chromStride;
1618 * Height should be a multiple of 2 and width should be a multiple of 16. (If this is a
1619 * problem for anyone, then tell me and I'll fix it.)
// Thin wrapper around yuvPlanartoyuy2: every argument is forwarded unchanged and
// the trailing argument is fixed to 2.
// NOTE(review): presumably that trailing argument is vertLumPerChroma (one chroma
// line shared by two luma lines) - confirm against yuvPlanartoyuy2's signature.
1621 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1622 long width, long height,
1623 long lumStride, long chromStride, long dstStride)
1625 //FIXME interpolate chroma
1626 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
// Interleaves planar Y, U and V into packed U Y V Y output (byte order as in the
// little-endian C fallback below: uc[0], yc[0], vc[0], yc[1]), one output line
// per iteration of y.
// chromWidth is width>>1. usrc/vsrc advance by chromStride only on lines where
// (y & (vertLumPerChroma-1)) == vertLumPerChroma-1, so vertLumPerChroma luma
// lines share one chroma line (the mask test assumes a power of two).
1629 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1630 long width, long height,
1631 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1634 const long chromWidth= width>>1;
1635 for(y=0; y<height; y++)
1638 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
// MMX path. Operands: %0=dst, %1=ysrc, %2=usrc, %3=vsrc, %4=chromWidth.
// REG_a indexes chroma samples; each pass reads 8 U, 8 V and 16 Y bytes and
// stores 32 output bytes, then REG_a advances by 8 and is compared to chromWidth.
1640 "xor %%"REG_a", %%"REG_a" \n\t"
1643 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1644 PREFETCH" 32(%2, %%"REG_a") \n\t"
1645 PREFETCH" 32(%3, %%"REG_a") \n\t"
1646 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1647 "movq %%mm0, %%mm2 \n\t" // U(0)
1648 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1649 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1650 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1652 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1653 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1654 "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
1655 "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
1656 "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1657 "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1658 "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1659 "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1661 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1662 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1663 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1664 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1666 "add $8, %%"REG_a" \n\t"
1667 "cmp %4, %%"REG_a" \n\t"
1669 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1673 //FIXME adapt the alpha asm code from yv12->yuy2
// C fallback, 64-bit words: two chroma samples (four luma samples) are packed
// into one 64-bit store per iteration (i += 2).
// NOTE(review): (yc[1] << 24) and (yc[3] << 24) are evaluated as int, so luma
// values >= 128 reach the sign bit. If k and l are 64-bit (their declaration is
// not visible here) the sign extension would corrupt the upper word of
// k + (l << 32); casting the byte to unsigned before the shift avoids this - confirm.
// NOTE(review): no WORDS_BIGENDIAN variant is visible for this 64-bit path,
// unlike the 32-bit path below - confirm.
1675 #if __WORDSIZE >= 64
1677 uint64_t *ldst = (uint64_t *) dst;
1678 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1679 for(i = 0; i < chromWidth; i += 2){
1681 k = uc[0] + (yc[0] << 8) +
1682 (vc[0] << 16) + (yc[1] << 24);
1683 l = uc[1] + (yc[2] << 8) +
1684 (vc[1] << 16) + (yc[3] << 24);
1685 *ldst++ = k + (l << 32);
// C fallback, 32-bit words: one U Y V Y group per iteration, written with a
// single 32-bit store whose shift layout depends on WORDS_BIGENDIAN.
1692 int i, *idst = (int32_t *) dst;
1693 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1694 for(i = 0; i < chromWidth; i++){
1695 #ifdef WORDS_BIGENDIAN
1696 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1697 (vc[0] << 8) + (yc[1] << 0);
1699 *idst++ = uc[0] + (yc[0] << 8) +
1700 (vc[0] << 16) + (yc[1] << 24);
// Advance the chroma rows only after the last luma line of each group.
1708 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1710 usrc += chromStride;
1711 vsrc += chromStride;
1725 * Height should be a multiple of 2 and width should be a multiple of 16. (If this is a
1726 * problem for anyone, then tell me and I'll fix it.)
// Thin wrapper around yuvPlanartouyvy: forwards every argument unchanged and
// passes 2 as vertLumPerChroma, so two luma lines share each chroma line.
1728 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1729 long width, long height,
1730 long lumStride, long chromStride, long dstStride)
1732 //FIXME interpolate chroma
1733 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1738 * width should be a multiple of 16
// Thin wrapper around yuvPlanartoyuy2: forwards every argument unchanged and
// fixes the trailing argument to 1.
// NOTE(review): presumably that argument is vertLumPerChroma, i.e. every luma
// line has its own chroma line - confirm against yuvPlanartoyuy2's signature.
1740 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1741 long width, long height,
1742 long lumStride, long chromStride, long dstStride)
1744 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1749 * Height should be a multiple of 2 and width should be a multiple of 16. (If this is a
1750 * problem for anyone, then tell me and I'll fix it.)
// Splits packed Y U Y V input into separate Y, U and V planes.
// The outer loop steps two source lines at a time (y+=2): the first line of each
// pair yields Y, U and V, the second line yields Y only, so chroma comes from
// every other line. chromWidth is width>>1.
1752 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1753 long width, long height,
1754 long lumStride, long chromStride, long srcStride)
1757 const long chromWidth= width>>1;
1758 for(y=0; y<height; y+=2)
// MMX, first line of the pair. Operands: %0=src, %1=ydst, %2=udst, %3=vdst,
// %4=chromWidth. REG_a indexes chroma samples; each pass reads 32 source bytes
// and stores 16 Y, 8 U and 8 V bytes. mm7 is built as the 0x00FF word mask.
1762 "xor %%"REG_a", %%"REG_a" \n\t"
1763 "pcmpeqw %%mm7, %%mm7 \n\t"
1764 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1767 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1768 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1769 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1770 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1771 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1772 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1773 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1774 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1775 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1776 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1777 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1779 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1781 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1782 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1783 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1784 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1785 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1786 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1787 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1788 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1789 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1790 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1792 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
// Separate the interleaved UVUV bytes into a V quadword and a U quadword.
1794 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1795 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1796 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1797 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1798 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1799 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1800 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1801 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1803 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1804 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1806 "add $8, %%"REG_a" \n\t"
1807 "cmp %4, %%"REG_a" \n\t"
1809 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1810 : "memory", "%"REG_a
// MMX, second line of the pair: only luma is extracted and stored through %1.
// NOTE(review): this statement uses mm7 without initialising it, so it relies
// on the mask left in mm7 by the previous asm statement; nothing in the operand
// lists expresses that dependency to the compiler.
1817 "xor %%"REG_a", %%"REG_a" \n\t"
1820 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1821 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1822 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1823 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1824 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1825 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1826 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1827 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1828 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1829 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1830 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1832 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1833 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1835 "add $8, %%"REG_a" \n\t"
1836 "cmp %4, %%"REG_a" \n\t"
1839 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1840 : "memory", "%"REG_a
// C fallback, first line of the pair: bytes 0 and 2 of each 4-byte group are
// luma, byte 1 is U and byte 3 is V.
1844 for(i=0; i<chromWidth; i++)
1846 ydst[2*i+0] = src[4*i+0];
1847 udst[i] = src[4*i+1];
1848 ydst[2*i+1] = src[4*i+2];
1849 vdst[i] = src[4*i+3];
// C fallback, second line of the pair: luma only, chroma bytes are skipped.
1854 for(i=0; i<chromWidth; i++)
1856 ydst[2*i+0] = src[4*i+0];
1857 ydst[2*i+1] = src[4*i+2];
// One chroma row is produced per pair of source lines.
1860 udst += chromStride;
1861 vdst += chromStride;
1866 asm volatile( EMMS" \n\t"
// Copies the luma plane from ysrc to ydst; the visible code does nothing with
// the U and V planes (see the XXX note below).
// NOTE(review): the copy is a single memcpy of width*height bytes and does not
// use lumStride, so it is only correct when both luma planes are stored
// contiguously with a stride equal to width - confirm with callers.
1872 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1873 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1874 long width, long height, long lumStride, long chromStride)
1877 memcpy(ydst, ysrc, width*height);
1879 /* XXX: implement upscaling for U,V */
// Upscales one plane by a factor of two in each direction. Interior output
// samples are weighted 3:1 blends of two source samples (the (3*a + b)>>2
// expressions below); the first and last output lines are interpolated
// horizontally only, from a single source line.
1882 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
// First output line: horizontal 3:1 blend of neighbouring samples; the last
// output sample is a plain copy of the last source sample.
1889 for(x=0; x<srcWidth-1; x++){
1890 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1891 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1893 dst[2*srcWidth-1]= src[srcWidth-1];
// Remaining lines: each pass uses source lines src and src+srcStride and writes
// output lines dst and dst+dstStride.
1897 for(y=1; y<srcHeight; y++){
1898 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// SIMD part covers the first srcWidth&~15 samples. Operands: %0/%1 are the two
// source lines and %2/%3 the two output lines, each offset by mmxSize.
// NOTE(review): REG_a is loaded from %4, whose definition is not visible here;
// presumably a negative offset counting up to zero - confirm.
1899 const long mmxSize= srcWidth&~15;
1901 "mov %4, %%"REG_a" \n\t"
1903 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1904 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1905 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1906 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1907 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1908 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
// Averaging twice against the same register approximates the 3:1 weighting
// used by the C code, up to the rounding of the byte-average instruction.
1909 PAVGB" %%mm0, %%mm5 \n\t"
1910 PAVGB" %%mm0, %%mm3 \n\t"
1911 PAVGB" %%mm0, %%mm5 \n\t"
1912 PAVGB" %%mm0, %%mm3 \n\t"
1913 PAVGB" %%mm1, %%mm4 \n\t"
1914 PAVGB" %%mm1, %%mm2 \n\t"
1915 PAVGB" %%mm1, %%mm4 \n\t"
1916 PAVGB" %%mm1, %%mm2 \n\t"
// Interleave the two blended vectors of each line to form the doubled-width output.
1917 "movq %%mm5, %%mm7 \n\t"
1918 "movq %%mm4, %%mm6 \n\t"
1919 "punpcklbw %%mm3, %%mm5 \n\t"
1920 "punpckhbw %%mm3, %%mm7 \n\t"
1921 "punpcklbw %%mm2, %%mm4 \n\t"
1922 "punpckhbw %%mm2, %%mm6 \n\t"
// Two store variants follow (MOVNTQ and plain movq); the preprocessor lines
// that select between them are not visible here.
1924 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1925 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1926 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1927 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1929 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1930 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1931 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1932 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1934 "add $8, %%"REG_a" \n\t"
1936 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1937 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
// Without the SIMD path mmxSize is 1, so the scalar loop below starts at x=0.
1943 const long mmxSize=1;
// Left edge: vertical blend only.
1945 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1946 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
// Scalar loop: each output sample blends a sample with its diagonal neighbour
// on the other source line, weighted 3:1.
1948 for(x=mmxSize-1; x<srcWidth-1; x++){
1949 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1950 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1951 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1952 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
// Right edge: vertical blend only.
1954 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1955 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
// Last output line: same horizontal-only interpolation as the first line.
1965 for(x=0; x<srcWidth-1; x++){
1966 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1967 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1969 dst[2*srcWidth-1]= src[srcWidth-1];
1971 for(x=0; x<srcWidth; x++){
1978 asm volatile( EMMS" \n\t"
1986 * Height should be a multiple of 2 and width should be a multiple of 16. (If this is a
1987 * problem for anyone, then tell me and I'll fix it.)
1988 * Chrominance data is only taken from every second line; the others are ignored. FIXME: write HQ version
// Splits packed U Y V Y input into separate Y, U and V planes.
// The outer loop steps two source lines at a time (y+=2): the first line of each
// pair yields Y, U and V, the second line yields Y only. chromWidth is width>>1.
// NOTE(review): both asm statements hard-code the 32-bit register %%eax with
// xorl/addl/cmpl, whereas the otherwise parallel yuy2toyv12 uses REG_a with
// size-neutral mnemonics. The pointer operands and the long chromWidth operand
// are wider than 32 bits on x86-64, so this form looks unsuitable there;
// switching to REG_a and xor/add/cmp as in yuy2toyv12 would align the two -
// confirm target support.
1990 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1991 long width, long height,
1992 long lumStride, long chromStride, long srcStride)
1995 const long chromWidth= width>>1;
1996 for(y=0; y<height; y+=2)
// MMX, first line of the pair. Operands: %0=src, %1=ydst, %2=udst, %3=vdst,
// %4=chromWidth. eax indexes chroma samples; each pass reads 32 source bytes and
// stores 16 Y, 8 U and 8 V bytes. mm7 is built as the 0x00FF word mask.
2000 "xorl %%eax, %%eax \n\t"
2001 "pcmpeqw %%mm7, %%mm7 \n\t"
2002 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2005 PREFETCH" 64(%0, %%eax, 4) \n\t"
2006 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2007 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2008 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2009 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2010 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2011 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2012 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2013 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2014 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2015 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2017 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
2019 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
2020 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2021 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2022 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2023 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2024 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2025 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2026 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2027 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2028 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2030 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
// Separate the interleaved UVUV bytes into a V quadword and a U quadword.
2032 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2033 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2034 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2035 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2036 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2037 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2038 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2039 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2041 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2042 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2044 "addl $8, %%eax \n\t"
2045 "cmpl %4, %%eax \n\t"
2047 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
// MMX, second line of the pair: only luma is extracted (high byte of each
// word, via psrlw) and stored through %1; no mask register is needed here.
2055 "xorl %%eax, %%eax \n\t"
2058 PREFETCH" 64(%0, %%eax, 4) \n\t"
2059 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2060 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2061 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
2062 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
2063 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2064 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2065 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2066 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2067 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2068 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2070 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2071 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2073 "addl $8, %%eax \n\t"
2074 "cmpl %4, %%eax \n\t"
2077 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
// C fallback, first line of the pair: byte 0 of each 4-byte group is U, byte 2
// is V, bytes 1 and 3 are luma.
2082 for(i=0; i<chromWidth; i++)
2084 udst[i] = src[4*i+0];
2085 ydst[2*i+0] = src[4*i+1];
2086 vdst[i] = src[4*i+2];
2087 ydst[2*i+1] = src[4*i+3];
// C fallback, second line of the pair: luma only, chroma bytes are skipped.
2092 for(i=0; i<chromWidth; i++)
2094 ydst[2*i+0] = src[4*i+1];
2095 ydst[2*i+1] = src[4*i+3];
// One chroma row is produced per pair of source lines.
2098 udst += chromStride;
2099 vdst += chromStride;
2104 asm volatile( EMMS" \n\t"
2112 * Height should be a multiple of 2 and width should be a multiple of 2. (If this is a
2113 * problem for anyone, then tell me and I'll fix it.)
2114 * Chrominance data is only taken from every second line; the others are ignored in the C version. FIXME: write HQ version
// Converts packed 24-bit pixels (byte order b, g, r, as read by the C code
// below) into separate Y, U and V planes. chromWidth is width>>1.
// The MMX loop handles line pairs while y < height-2; the C loop that follows
// continues from the same y and converts whatever lines remain.
// NOTE(review): presumably the last line pair is left to the C code because the
// MMX loads read a little past the pixels they convert - confirm.
2116 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2117 long width, long height,
2118 long lumStride, long chromStride, long srcStride)
2121 const long chromWidth= width>>1;
2123 for(y=0; y<height-2; y+=2)
// Luma pass. Operands: %0=src+width*3, %1=ydst+width, %2=-width.
// REG_a starts at -width and REG_d at 3*REG_a, so both index backwards from the
// end of the line; each pass converts 8 pixels (24 source bytes) into 8 Y bytes.
// mm6 holds bgr2YCoeff, mm5 holds w1111 and mm7 is zero for byte->word unpacking
// (the constants themselves are defined elsewhere).
2129 "mov %2, %%"REG_a" \n\t"
2130 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2131 "movq "MANGLE(w1111)", %%mm5 \n\t"
2132 "pxor %%mm7, %%mm7 \n\t"
2133 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2136 PREFETCH" 64(%0, %%"REG_d") \n\t"
// Pixels 0-3: load 4 bytes at each 3-byte pixel offset and widen to words.
2137 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2138 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2139 "punpcklbw %%mm7, %%mm0 \n\t"
2140 "punpcklbw %%mm7, %%mm1 \n\t"
2141 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2142 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2143 "punpcklbw %%mm7, %%mm2 \n\t"
2144 "punpcklbw %%mm7, %%mm3 \n\t"
2145 "pmaddwd %%mm6, %%mm0 \n\t"
2146 "pmaddwd %%mm6, %%mm1 \n\t"
2147 "pmaddwd %%mm6, %%mm2 \n\t"
2148 "pmaddwd %%mm6, %%mm3 \n\t"
2149 #ifndef FAST_BGR2YV12
2150 "psrad $8, %%mm0 \n\t"
2151 "psrad $8, %%mm1 \n\t"
2152 "psrad $8, %%mm2 \n\t"
2153 "psrad $8, %%mm3 \n\t"
// Multiplying by w1111 with pmaddwd adds each pair of partial products.
2155 "packssdw %%mm1, %%mm0 \n\t"
2156 "packssdw %%mm3, %%mm2 \n\t"
2157 "pmaddwd %%mm5, %%mm0 \n\t"
2158 "pmaddwd %%mm5, %%mm2 \n\t"
2159 "packssdw %%mm2, %%mm0 \n\t"
2160 "psraw $7, %%mm0 \n\t"
// Pixels 4-7: same sequence, result in mm4.
2162 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2163 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2164 "punpcklbw %%mm7, %%mm4 \n\t"
2165 "punpcklbw %%mm7, %%mm1 \n\t"
2166 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2167 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2168 "punpcklbw %%mm7, %%mm2 \n\t"
2169 "punpcklbw %%mm7, %%mm3 \n\t"
2170 "pmaddwd %%mm6, %%mm4 \n\t"
2171 "pmaddwd %%mm6, %%mm1 \n\t"
2172 "pmaddwd %%mm6, %%mm2 \n\t"
2173 "pmaddwd %%mm6, %%mm3 \n\t"
2174 #ifndef FAST_BGR2YV12
2175 "psrad $8, %%mm4 \n\t"
2176 "psrad $8, %%mm1 \n\t"
2177 "psrad $8, %%mm2 \n\t"
2178 "psrad $8, %%mm3 \n\t"
2180 "packssdw %%mm1, %%mm4 \n\t"
2181 "packssdw %%mm3, %%mm2 \n\t"
2182 "pmaddwd %%mm5, %%mm4 \n\t"
2183 "pmaddwd %%mm5, %%mm2 \n\t"
2184 "add $24, %%"REG_d" \n\t"
2185 "packssdw %%mm2, %%mm4 \n\t"
2186 "psraw $7, %%mm4 \n\t"
// Pack the 8 results to bytes, add the bgr2YOffset constant and store them.
2188 "packuswb %%mm4, %%mm0 \n\t"
2189 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2191 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2192 "add $8, %%"REG_a" \n\t"
// NOTE(review): this statement stores to memory through %1 but the visible
// clobber list names only REG_a and REG_d, with no "memory" clobber or output
// operand - confirm whether one should be added.
2194 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2195 : "%"REG_a, "%"REG_d
// Chroma pass. Operands: %0=src+chromWidth*6 (first line), %1=the same position
// one srcStride further (second line), %2=udst+chromWidth, %3=vdst+chromWidth,
// %4=-chromWidth. REG_a starts at -chromWidth; REG_d becomes 6*REG_a (lea gives
// 3x, the add doubles it). Each pass produces 4 U and 4 V bytes from pixels
// combined across both lines.
2202 "mov %4, %%"REG_a" \n\t"
2203 "movq "MANGLE(w1111)", %%mm5 \n\t"
2204 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2205 "pxor %%mm7, %%mm7 \n\t"
2206 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2207 "add %%"REG_d", %%"REG_d" \n\t"
2210 PREFETCH" 64(%0, %%"REG_d") \n\t"
2211 PREFETCH" 64(%1, %%"REG_d") \n\t"
2212 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// PAVGB variant: average the two lines, then average each pixel with its
// right-hand neighbour (the copy shifted right by 24 bits = one pixel).
2213 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2214 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2215 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2216 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2217 PAVGB" %%mm1, %%mm0 \n\t"
2218 PAVGB" %%mm3, %%mm2 \n\t"
2219 "movq %%mm0, %%mm1 \n\t"
2220 "movq %%mm2, %%mm3 \n\t"
2221 "psrlq $24, %%mm0 \n\t"
2222 "psrlq $24, %%mm2 \n\t"
2223 PAVGB" %%mm1, %%mm0 \n\t"
2224 PAVGB" %%mm3, %%mm2 \n\t"
2225 "punpcklbw %%mm7, %%mm0 \n\t"
2226 "punpcklbw %%mm7, %%mm2 \n\t"
// Plain MMX variant: widen the four pixels of each 2x2 group to words, sum
// them and divide by 4 with a truncating shift.
2228 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2229 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2230 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2231 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2232 "punpcklbw %%mm7, %%mm0 \n\t"
2233 "punpcklbw %%mm7, %%mm1 \n\t"
2234 "punpcklbw %%mm7, %%mm2 \n\t"
2235 "punpcklbw %%mm7, %%mm3 \n\t"
2236 "paddw %%mm1, %%mm0 \n\t"
2237 "paddw %%mm3, %%mm2 \n\t"
2238 "paddw %%mm2, %%mm0 \n\t"
2239 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2240 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2241 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2242 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2243 "punpcklbw %%mm7, %%mm4 \n\t"
2244 "punpcklbw %%mm7, %%mm1 \n\t"
2245 "punpcklbw %%mm7, %%mm2 \n\t"
2246 "punpcklbw %%mm7, %%mm3 \n\t"
2247 "paddw %%mm1, %%mm4 \n\t"
2248 "paddw %%mm3, %%mm2 \n\t"
2249 "paddw %%mm4, %%mm2 \n\t"
2250 "psrlw $2, %%mm0 \n\t"
2251 "psrlw $2, %%mm2 \n\t"
// Apply the V coefficients (mm1/mm3) and the U coefficients (mm6) to the two
// averaged pixels in mm0 and mm2.
2253 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2254 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2256 "pmaddwd %%mm0, %%mm1 \n\t"
2257 "pmaddwd %%mm2, %%mm3 \n\t"
2258 "pmaddwd %%mm6, %%mm0 \n\t"
2259 "pmaddwd %%mm6, %%mm2 \n\t"
2260 #ifndef FAST_BGR2YV12
2261 "psrad $8, %%mm0 \n\t"
2262 "psrad $8, %%mm1 \n\t"
2263 "psrad $8, %%mm2 \n\t"
2264 "psrad $8, %%mm3 \n\t"
2266 "packssdw %%mm2, %%mm0 \n\t"
2267 "packssdw %%mm3, %%mm1 \n\t"
2268 "pmaddwd %%mm5, %%mm0 \n\t"
2269 "pmaddwd %%mm5, %%mm1 \n\t"
2270 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2271 "psraw $7, %%mm0 \n\t"
// Second half of the pass: the same computation for the next two chroma
// samples, accumulating in mm4.
2273 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2274 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2275 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2276 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2277 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2278 PAVGB" %%mm1, %%mm4 \n\t"
2279 PAVGB" %%mm3, %%mm2 \n\t"
2280 "movq %%mm4, %%mm1 \n\t"
2281 "movq %%mm2, %%mm3 \n\t"
2282 "psrlq $24, %%mm4 \n\t"
2283 "psrlq $24, %%mm2 \n\t"
2284 PAVGB" %%mm1, %%mm4 \n\t"
2285 PAVGB" %%mm3, %%mm2 \n\t"
2286 "punpcklbw %%mm7, %%mm4 \n\t"
2287 "punpcklbw %%mm7, %%mm2 \n\t"
2289 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2290 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2291 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2292 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2293 "punpcklbw %%mm7, %%mm4 \n\t"
2294 "punpcklbw %%mm7, %%mm1 \n\t"
2295 "punpcklbw %%mm7, %%mm2 \n\t"
2296 "punpcklbw %%mm7, %%mm3 \n\t"
2297 "paddw %%mm1, %%mm4 \n\t"
2298 "paddw %%mm3, %%mm2 \n\t"
2299 "paddw %%mm2, %%mm4 \n\t"
// mm5 is used as scratch here, so w1111 is reloaded into it a few lines below.
2300 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2301 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2302 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2303 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2304 "punpcklbw %%mm7, %%mm5 \n\t"
2305 "punpcklbw %%mm7, %%mm1 \n\t"
2306 "punpcklbw %%mm7, %%mm2 \n\t"
2307 "punpcklbw %%mm7, %%mm3 \n\t"
2308 "paddw %%mm1, %%mm5 \n\t"
2309 "paddw %%mm3, %%mm2 \n\t"
2310 "paddw %%mm5, %%mm2 \n\t"
2311 "movq "MANGLE(w1111)", %%mm5 \n\t"
2312 "psrlw $2, %%mm4 \n\t"
2313 "psrlw $2, %%mm2 \n\t"
2315 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2316 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2318 "pmaddwd %%mm4, %%mm1 \n\t"
2319 "pmaddwd %%mm2, %%mm3 \n\t"
2320 "pmaddwd %%mm6, %%mm4 \n\t"
2321 "pmaddwd %%mm6, %%mm2 \n\t"
2322 #ifndef FAST_BGR2YV12
2323 "psrad $8, %%mm4 \n\t"
2324 "psrad $8, %%mm1 \n\t"
2325 "psrad $8, %%mm2 \n\t"
2326 "psrad $8, %%mm3 \n\t"
2328 "packssdw %%mm2, %%mm4 \n\t"
2329 "packssdw %%mm3, %%mm1 \n\t"
2330 "pmaddwd %%mm5, %%mm4 \n\t"
2331 "pmaddwd %%mm5, %%mm1 \n\t"
2332 "add $24, %%"REG_d" \n\t"
2333 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2334 "psraw $7, %%mm4 \n\t"
// Regroup so the low dword holds the four U values and the high dword the four
// V values, add the bgr2UVOffset constant, then store 4 bytes to each plane.
2336 "movq %%mm0, %%mm1 \n\t"
2337 "punpckldq %%mm4, %%mm0 \n\t"
2338 "punpckhdq %%mm4, %%mm1 \n\t"
2339 "packsswb %%mm1, %%mm0 \n\t"
2340 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2341 "movd %%mm0, (%2, %%"REG_a") \n\t"
2342 "punpckhdq %%mm0, %%mm0 \n\t"
2343 "movd %%mm0, (%3, %%"REG_a") \n\t"
2344 "add $4, %%"REG_a" \n\t"
// NOTE(review): as in the luma pass, memory is written through %2 and %3 but
// the visible clobber list has no "memory" entry - confirm.
2346 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2347 : "%"REG_a, "%"REG_d
2350 udst += chromStride;
2351 vdst += chromStride;
2355 asm volatile( EMMS" \n\t"
// C path for the lines the MMX loop did not handle (all lines when there is no
// MMX loop). On the first line of each pair Y, U and V are all computed from
// the first pixel of each horizontal pair (b, g, r at src[6*i+0..2]).
2361 for(; y<height; y+=2)
2364 for(i=0; i<chromWidth; i++)
2366 unsigned int b= src[6*i+0];
2367 unsigned int g= src[6*i+1];
2368 unsigned int r= src[6*i+2];
2370 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2371 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2372 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2382 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
// Second line of the pair: only luma is computed.
2388 for(i=0; i<chromWidth; i++)
2390 unsigned int b= src[6*i+0];
2391 unsigned int g= src[6*i+1];
2392 unsigned int r= src[6*i+2];
2394 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2402 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2405 udst += chromStride;
2406 vdst += chromStride;
// Interleaves two byte planes line by line: dest[2*w] = src1[w] and
// dest[2*w+1] = src2[w] for every w in [0, width), repeated for height lines.
// The SIMD loops handle 16 source bytes from each plane per pass and the scalar
// loop that starts at width&~15 finishes the remainder.
// NOTE(review): src1 and src2 are only read in the visible code but are not
// const-qualified, and unlike the neighbouring functions this one is not
// declared static inline - confirm whether that is intentional.
2412 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2413 long width, long height, long src1Stride,
2414 long src2Stride, long dstStride){
2417 for(h=0; h < height; h++)
// SSE2 variant. Operands: %0=dest, %1=src1, %2=src2, %3=width-15.
// movdqa and movntdq require 16-byte aligned addresses.
// NOTE(review): the loop body runs before the first compare against width-15,
// so unless a guard exists on lines not visible here, a width below 16 would
// still process a full 16-byte group - confirm.
2424 "xor %%"REG_a", %%"REG_a" \n\t"
2426 PREFETCH" 64(%1, %%"REG_a") \n\t"
2427 PREFETCH" 64(%2, %%"REG_a") \n\t"
2428 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2429 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2430 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2431 "punpcklbw %%xmm2, %%xmm0 \n\t"
2432 "punpckhbw %%xmm2, %%xmm1 \n\t"
2433 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2434 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2435 "add $16, %%"REG_a" \n\t"
2436 "cmp %3, %%"REG_a" \n\t"
2438 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2439 : "memory", "%"REG_a""
// MMX variant: same operands; two 8-byte loads per plane are interleaved into
// four 8-byte stores per pass.
2443 "xor %%"REG_a", %%"REG_a" \n\t"
2445 PREFETCH" 64(%1, %%"REG_a") \n\t"
2446 PREFETCH" 64(%2, %%"REG_a") \n\t"
2447 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2448 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2449 "movq %%mm0, %%mm1 \n\t"
2450 "movq %%mm2, %%mm3 \n\t"
2451 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2452 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2453 "punpcklbw %%mm4, %%mm0 \n\t"
2454 "punpckhbw %%mm4, %%mm1 \n\t"
2455 "punpcklbw %%mm5, %%mm2 \n\t"
2456 "punpckhbw %%mm5, %%mm3 \n\t"
2457 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2458 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2459 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2460 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2461 "add $16, %%"REG_a" \n\t"
2462 "cmp %3, %%"REG_a" \n\t"
2464 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2465 : "memory", "%"REG_a
// Scalar tail after the SIMD loop: the bytes beyond the last multiple of 16.
2468 for(w= (width&(~15)); w < width; w++)
2470 dest[2*w+0] = src1[w];
2471 dest[2*w+1] = src2[w];
// Plain C variant: interleave the whole line byte by byte.
2474 for(w=0; w < width; w++)
2476 dest[2*w+0] = src1[w];
2477 dest[2*w+1] = src2[w];
// Upscales two planes (src1 -> dst1, src2 -> dst2) by duplicating samples:
// every source byte is written twice horizontally (d[2*x] = d[2*x+1] = s[x])
// and output row y is taken from source row y>>1. The working sizes are
// w = width/2 and h = height/2.
2493 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2494 uint8_t *dst1, uint8_t *dst2,
2495 long width, long height,
2496 long srcStride1, long srcStride2,
2497 long dstStride1, long dstStride2)
2500 w=width/2; h=height/2;
2505 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
// First plane: row pointers for this output row.
2508 const uint8_t* s1=src1+srcStride1*(y>>1);
2509 uint8_t* d=dst1+dstStride1*y;
// MMX: load 32 source bytes, unpack each register with itself so every byte is
// doubled, and store the resulting 64 bytes (%0 is the destination, %1 the source).
2516 "movq %1, %%mm0\n\t"
2517 "movq 8%1, %%mm2\n\t"
2518 "movq 16%1, %%mm4\n\t"
2519 "movq 24%1, %%mm6\n\t"
2520 "movq %%mm0, %%mm1\n\t"
2521 "movq %%mm2, %%mm3\n\t"
2522 "movq %%mm4, %%mm5\n\t"
2523 "movq %%mm6, %%mm7\n\t"
2524 "punpcklbw %%mm0, %%mm0\n\t"
2525 "punpckhbw %%mm1, %%mm1\n\t"
2526 "punpcklbw %%mm2, %%mm2\n\t"
2527 "punpckhbw %%mm3, %%mm3\n\t"
2528 "punpcklbw %%mm4, %%mm4\n\t"
2529 "punpckhbw %%mm5, %%mm5\n\t"
2530 "punpcklbw %%mm6, %%mm6\n\t"
2531 "punpckhbw %%mm7, %%mm7\n\t"
2532 MOVNTQ" %%mm0, %0\n\t"
2533 MOVNTQ" %%mm1, 8%0\n\t"
2534 MOVNTQ" %%mm2, 16%0\n\t"
2535 MOVNTQ" %%mm3, 24%0\n\t"
2536 MOVNTQ" %%mm4, 32%0\n\t"
2537 MOVNTQ" %%mm5, 40%0\n\t"
2538 MOVNTQ" %%mm6, 48%0\n\t"
2539 MOVNTQ" %%mm7, 56%0"
// Scalar remainder (and the whole row when the MMX loop is not compiled in).
2545 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
// Second plane: identical processing with src2/dst2 and their strides.
2548 const uint8_t* s2=src2+srcStride2*(y>>1);
2549 uint8_t* d=dst2+dstStride2*y;
2556 "movq %1, %%mm0\n\t"
2557 "movq 8%1, %%mm2\n\t"
2558 "movq 16%1, %%mm4\n\t"
2559 "movq 24%1, %%mm6\n\t"
2560 "movq %%mm0, %%mm1\n\t"
2561 "movq %%mm2, %%mm3\n\t"
2562 "movq %%mm4, %%mm5\n\t"
2563 "movq %%mm6, %%mm7\n\t"
2564 "punpcklbw %%mm0, %%mm0\n\t"
2565 "punpckhbw %%mm1, %%mm1\n\t"
2566 "punpcklbw %%mm2, %%mm2\n\t"
2567 "punpckhbw %%mm3, %%mm3\n\t"
2568 "punpcklbw %%mm4, %%mm4\n\t"
2569 "punpckhbw %%mm5, %%mm5\n\t"
2570 "punpcklbw %%mm6, %%mm6\n\t"
2571 "punpckhbw %%mm7, %%mm7\n\t"
2572 MOVNTQ" %%mm0, %0\n\t"
2573 MOVNTQ" %%mm1, 8%0\n\t"
2574 MOVNTQ" %%mm2, 16%0\n\t"
2575 MOVNTQ" %%mm3, 24%0\n\t"
2576 MOVNTQ" %%mm4, 32%0\n\t"
2577 MOVNTQ" %%mm5, 40%0\n\t"
2578 MOVNTQ" %%mm6, 48%0\n\t"
2579 MOVNTQ" %%mm7, 56%0"
2585 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2596 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2598 long width, long height,
2599 long srcStride1, long srcStride2,
2600 long srcStride3, long dstStride)
2603 w=width/2; h=height;
2605 const uint8_t* yp=src1+srcStride1*y;
2606 const uint8_t* up=src2+srcStride2*(y>>2);
2607 const uint8_t* vp=src3+srcStride3*(y>>2);
2608 uint8_t* d=dst+dstStride*y;
2614 PREFETCH" 32(%1, %0)\n\t"
2615 PREFETCH" 32(%2, %0)\n\t"
2616 PREFETCH" 32(%3, %0)\n\t"
2617 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2618 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2619 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2620 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2621 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2622 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2623 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2624 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2625 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2626 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2628 "movq %%mm1, %%mm6\n\t"
2629 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2630 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2631 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2632 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2633 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2635 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2636 "movq 8(%1, %0, 4), %%mm0\n\t"
2637 "movq %%mm0, %%mm3\n\t"
2638 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2639 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2640 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2641 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2643 "movq %%mm4, %%mm6\n\t"
2644 "movq 16(%1, %0, 4), %%mm0\n\t"
2645 "movq %%mm0, %%mm3\n\t"
2646 "punpcklbw %%mm5, %%mm4\n\t"
2647 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2648 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2649 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2650 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2652 "punpckhbw %%mm5, %%mm6\n\t"
2653 "movq 24(%1, %0, 4), %%mm0\n\t"
2654 "movq %%mm0, %%mm3\n\t"
2655 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2656 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2657 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2658 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2661 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2667 const long x2= x<<2;