3 * rgb2rgb.c, Software RGB to RGB converter
4 * pluralize by Software PAL8 to RGB converter
5 * Software YUV to YUV converter
6 * Software YUV to RGB converter
7 * Written by Nick Kurshev.
8 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
9 * lot of big-endian byteorder fixes by Alex Beregszaszi
11 * This file is part of FFmpeg.
13 * FFmpeg is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * FFmpeg is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with FFmpeg; if not, write to the Free Software
25 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
27 * the C code (not assembly, mmx, ...) of this file can be used
28 * under the LGPL license too
32 #include <inttypes.h> /* for __WORDSIZE */
35 // #warning You have misconfigured system and probably will lose performance!
36 #define __WORDSIZE MP_WORDSIZE
54 #define PREFETCH "prefetch"
55 #define PREFETCHW "prefetchw"
56 #define PAVGB "pavgusb"
57 #elif defined ( HAVE_MMX2 )
58 #define PREFETCH "prefetchnta"
59 #define PREFETCHW "prefetcht0"
66 #define PREFETCH " # nop"
67 #define PREFETCHW " # nop"
72 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
79 #define MOVNTQ "movntq"
80 #define SFENCE "sfence"
83 #define SFENCE " # nop"
86 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
89 const uint8_t *s = src;
92 const uint8_t *mm_end;
96 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
98 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
104 "punpckldq 3%1, %%mm0\n\t"
105 "movd 6%1, %%mm1\n\t"
106 "punpckldq 9%1, %%mm1\n\t"
107 "movd 12%1, %%mm2\n\t"
108 "punpckldq 15%1, %%mm2\n\t"
109 "movd 18%1, %%mm3\n\t"
110 "punpckldq 21%1, %%mm3\n\t"
111 "pand %%mm7, %%mm0\n\t"
112 "pand %%mm7, %%mm1\n\t"
113 "pand %%mm7, %%mm2\n\t"
114 "pand %%mm7, %%mm3\n\t"
115 MOVNTQ" %%mm0, %0\n\t"
116 MOVNTQ" %%mm1, 8%0\n\t"
117 MOVNTQ" %%mm2, 16%0\n\t"
125 __asm __volatile(SFENCE:::"memory");
126 __asm __volatile(EMMS:::"memory");
130 #ifdef WORDS_BIGENDIAN
131 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
146 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
149 const uint8_t *s = src;
152 const uint8_t *mm_end;
156 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
163 "movq 8%1, %%mm1\n\t"
164 "movq 16%1, %%mm4\n\t"
165 "movq 24%1, %%mm5\n\t"
166 "movq %%mm0, %%mm2\n\t"
167 "movq %%mm1, %%mm3\n\t"
168 "movq %%mm4, %%mm6\n\t"
169 "movq %%mm5, %%mm7\n\t"
170 "psrlq $8, %%mm2\n\t"
171 "psrlq $8, %%mm3\n\t"
172 "psrlq $8, %%mm6\n\t"
173 "psrlq $8, %%mm7\n\t"
182 "por %%mm2, %%mm0\n\t"
183 "por %%mm3, %%mm1\n\t"
184 "por %%mm6, %%mm4\n\t"
185 "por %%mm7, %%mm5\n\t"
187 "movq %%mm1, %%mm2\n\t"
188 "movq %%mm4, %%mm3\n\t"
189 "psllq $48, %%mm2\n\t"
190 "psllq $32, %%mm3\n\t"
193 "por %%mm2, %%mm0\n\t"
194 "psrlq $16, %%mm1\n\t"
195 "psrlq $32, %%mm4\n\t"
196 "psllq $16, %%mm5\n\t"
197 "por %%mm3, %%mm1\n\t"
199 "por %%mm5, %%mm4\n\t"
201 MOVNTQ" %%mm0, %0\n\t"
202 MOVNTQ" %%mm1, 8%0\n\t"
205 :"m"(*s),"m"(mask24l),
206 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
211 __asm __volatile(SFENCE:::"memory");
212 __asm __volatile(EMMS:::"memory");
216 #ifdef WORDS_BIGENDIAN
217 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
233 Original by Strepto/Astral
234 ported to gcc & bugfixed : A'rpi
235 MMX2, 3DNOW optimization by Nick Kurshev
236 32bit c version, and and&add trick by Michael Niedermayer
238 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
240 register const uint8_t* s=src;
241 register uint8_t* d=dst;
242 register const uint8_t *end;
243 const uint8_t *mm_end;
246 __asm __volatile(PREFETCH" %0"::"m"(*s));
247 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
254 "movq 8%1, %%mm2\n\t"
255 "movq %%mm0, %%mm1\n\t"
256 "movq %%mm2, %%mm3\n\t"
257 "pand %%mm4, %%mm0\n\t"
258 "pand %%mm4, %%mm2\n\t"
259 "paddw %%mm1, %%mm0\n\t"
260 "paddw %%mm3, %%mm2\n\t"
261 MOVNTQ" %%mm0, %0\n\t"
269 __asm __volatile(SFENCE:::"memory");
270 __asm __volatile(EMMS:::"memory");
275 register unsigned x= *((uint32_t *)s);
276 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
282 register unsigned short x= *((uint16_t *)s);
283 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
287 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
289 register const uint8_t* s=src;
290 register uint8_t* d=dst;
291 register const uint8_t *end;
292 const uint8_t *mm_end;
295 __asm __volatile(PREFETCH" %0"::"m"(*s));
296 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
297 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
304 "movq 8%1, %%mm2\n\t"
305 "movq %%mm0, %%mm1\n\t"
306 "movq %%mm2, %%mm3\n\t"
307 "psrlq $1, %%mm0\n\t"
308 "psrlq $1, %%mm2\n\t"
309 "pand %%mm7, %%mm0\n\t"
310 "pand %%mm7, %%mm2\n\t"
311 "pand %%mm6, %%mm1\n\t"
312 "pand %%mm6, %%mm3\n\t"
313 "por %%mm1, %%mm0\n\t"
314 "por %%mm3, %%mm2\n\t"
315 MOVNTQ" %%mm0, %0\n\t"
323 __asm __volatile(SFENCE:::"memory");
324 __asm __volatile(EMMS:::"memory");
329 register uint32_t x= *((uint32_t *)s);
330 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
336 register uint16_t x= *((uint16_t *)s);
337 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
343 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
345 const uint8_t *s = src;
348 const uint8_t *mm_end;
350 uint16_t *d = (uint16_t *)dst;
354 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
356 "movq %3, %%mm5 \n\t"
357 "movq %4, %%mm6 \n\t"
358 "movq %5, %%mm7 \n\t"
361 PREFETCH" 32(%1) \n\t"
362 "movd (%1), %%mm0 \n\t"
363 "movd 4(%1), %%mm3 \n\t"
364 "punpckldq 8(%1), %%mm0 \n\t"
365 "punpckldq 12(%1), %%mm3 \n\t"
366 "movq %%mm0, %%mm1 \n\t"
367 "movq %%mm3, %%mm4 \n\t"
368 "pand %%mm6, %%mm0 \n\t"
369 "pand %%mm6, %%mm3 \n\t"
370 "pmaddwd %%mm7, %%mm0 \n\t"
371 "pmaddwd %%mm7, %%mm3 \n\t"
372 "pand %%mm5, %%mm1 \n\t"
373 "pand %%mm5, %%mm4 \n\t"
374 "por %%mm1, %%mm0 \n\t"
375 "por %%mm4, %%mm3 \n\t"
376 "psrld $5, %%mm0 \n\t"
377 "pslld $11, %%mm3 \n\t"
378 "por %%mm3, %%mm0 \n\t"
379 MOVNTQ" %%mm0, (%0) \n\t"
385 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
388 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
392 ::"m"(red_16mask),"m"(green_16mask));
398 "movd 4%1, %%mm3\n\t"
399 "punpckldq 8%1, %%mm0\n\t"
400 "punpckldq 12%1, %%mm3\n\t"
401 "movq %%mm0, %%mm1\n\t"
402 "movq %%mm0, %%mm2\n\t"
403 "movq %%mm3, %%mm4\n\t"
404 "movq %%mm3, %%mm5\n\t"
405 "psrlq $3, %%mm0\n\t"
406 "psrlq $3, %%mm3\n\t"
409 "psrlq $5, %%mm1\n\t"
410 "psrlq $5, %%mm4\n\t"
411 "pand %%mm6, %%mm1\n\t"
412 "pand %%mm6, %%mm4\n\t"
413 "psrlq $8, %%mm2\n\t"
414 "psrlq $8, %%mm5\n\t"
415 "pand %%mm7, %%mm2\n\t"
416 "pand %%mm7, %%mm5\n\t"
417 "por %%mm1, %%mm0\n\t"
418 "por %%mm4, %%mm3\n\t"
419 "por %%mm2, %%mm0\n\t"
420 "por %%mm5, %%mm3\n\t"
421 "psllq $16, %%mm3\n\t"
422 "por %%mm3, %%mm0\n\t"
423 MOVNTQ" %%mm0, %0\n\t"
424 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
429 __asm __volatile(SFENCE:::"memory");
430 __asm __volatile(EMMS:::"memory");
434 register int rgb = *(uint32_t*)s; s += 4;
435 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
439 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
441 const uint8_t *s = src;
444 const uint8_t *mm_end;
446 uint16_t *d = (uint16_t *)dst;
449 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
453 ::"m"(red_16mask),"m"(green_16mask));
460 "movd 4%1, %%mm3\n\t"
461 "punpckldq 8%1, %%mm0\n\t"
462 "punpckldq 12%1, %%mm3\n\t"
463 "movq %%mm0, %%mm1\n\t"
464 "movq %%mm0, %%mm2\n\t"
465 "movq %%mm3, %%mm4\n\t"
466 "movq %%mm3, %%mm5\n\t"
467 "psllq $8, %%mm0\n\t"
468 "psllq $8, %%mm3\n\t"
469 "pand %%mm7, %%mm0\n\t"
470 "pand %%mm7, %%mm3\n\t"
471 "psrlq $5, %%mm1\n\t"
472 "psrlq $5, %%mm4\n\t"
473 "pand %%mm6, %%mm1\n\t"
474 "pand %%mm6, %%mm4\n\t"
475 "psrlq $19, %%mm2\n\t"
476 "psrlq $19, %%mm5\n\t"
479 "por %%mm1, %%mm0\n\t"
480 "por %%mm4, %%mm3\n\t"
481 "por %%mm2, %%mm0\n\t"
482 "por %%mm5, %%mm3\n\t"
483 "psllq $16, %%mm3\n\t"
484 "por %%mm3, %%mm0\n\t"
485 MOVNTQ" %%mm0, %0\n\t"
486 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
490 __asm __volatile(SFENCE:::"memory");
491 __asm __volatile(EMMS:::"memory");
495 register int rgb = *(uint32_t*)s; s += 4;
496 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
500 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
502 const uint8_t *s = src;
505 const uint8_t *mm_end;
507 uint16_t *d = (uint16_t *)dst;
511 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
513 "movq %3, %%mm5 \n\t"
514 "movq %4, %%mm6 \n\t"
515 "movq %5, %%mm7 \n\t"
518 PREFETCH" 32(%1) \n\t"
519 "movd (%1), %%mm0 \n\t"
520 "movd 4(%1), %%mm3 \n\t"
521 "punpckldq 8(%1), %%mm0 \n\t"
522 "punpckldq 12(%1), %%mm3 \n\t"
523 "movq %%mm0, %%mm1 \n\t"
524 "movq %%mm3, %%mm4 \n\t"
525 "pand %%mm6, %%mm0 \n\t"
526 "pand %%mm6, %%mm3 \n\t"
527 "pmaddwd %%mm7, %%mm0 \n\t"
528 "pmaddwd %%mm7, %%mm3 \n\t"
529 "pand %%mm5, %%mm1 \n\t"
530 "pand %%mm5, %%mm4 \n\t"
531 "por %%mm1, %%mm0 \n\t"
532 "por %%mm4, %%mm3 \n\t"
533 "psrld $6, %%mm0 \n\t"
534 "pslld $10, %%mm3 \n\t"
535 "por %%mm3, %%mm0 \n\t"
536 MOVNTQ" %%mm0, (%0) \n\t"
542 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
545 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
549 ::"m"(red_15mask),"m"(green_15mask));
555 "movd 4%1, %%mm3\n\t"
556 "punpckldq 8%1, %%mm0\n\t"
557 "punpckldq 12%1, %%mm3\n\t"
558 "movq %%mm0, %%mm1\n\t"
559 "movq %%mm0, %%mm2\n\t"
560 "movq %%mm3, %%mm4\n\t"
561 "movq %%mm3, %%mm5\n\t"
562 "psrlq $3, %%mm0\n\t"
563 "psrlq $3, %%mm3\n\t"
566 "psrlq $6, %%mm1\n\t"
567 "psrlq $6, %%mm4\n\t"
568 "pand %%mm6, %%mm1\n\t"
569 "pand %%mm6, %%mm4\n\t"
570 "psrlq $9, %%mm2\n\t"
571 "psrlq $9, %%mm5\n\t"
572 "pand %%mm7, %%mm2\n\t"
573 "pand %%mm7, %%mm5\n\t"
574 "por %%mm1, %%mm0\n\t"
575 "por %%mm4, %%mm3\n\t"
576 "por %%mm2, %%mm0\n\t"
577 "por %%mm5, %%mm3\n\t"
578 "psllq $16, %%mm3\n\t"
579 "por %%mm3, %%mm0\n\t"
580 MOVNTQ" %%mm0, %0\n\t"
581 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
586 __asm __volatile(SFENCE:::"memory");
587 __asm __volatile(EMMS:::"memory");
591 register int rgb = *(uint32_t*)s; s += 4;
592 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
596 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
598 const uint8_t *s = src;
601 const uint8_t *mm_end;
603 uint16_t *d = (uint16_t *)dst;
606 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
610 ::"m"(red_15mask),"m"(green_15mask));
617 "movd 4%1, %%mm3\n\t"
618 "punpckldq 8%1, %%mm0\n\t"
619 "punpckldq 12%1, %%mm3\n\t"
620 "movq %%mm0, %%mm1\n\t"
621 "movq %%mm0, %%mm2\n\t"
622 "movq %%mm3, %%mm4\n\t"
623 "movq %%mm3, %%mm5\n\t"
624 "psllq $7, %%mm0\n\t"
625 "psllq $7, %%mm3\n\t"
626 "pand %%mm7, %%mm0\n\t"
627 "pand %%mm7, %%mm3\n\t"
628 "psrlq $6, %%mm1\n\t"
629 "psrlq $6, %%mm4\n\t"
630 "pand %%mm6, %%mm1\n\t"
631 "pand %%mm6, %%mm4\n\t"
632 "psrlq $19, %%mm2\n\t"
633 "psrlq $19, %%mm5\n\t"
636 "por %%mm1, %%mm0\n\t"
637 "por %%mm4, %%mm3\n\t"
638 "por %%mm2, %%mm0\n\t"
639 "por %%mm5, %%mm3\n\t"
640 "psllq $16, %%mm3\n\t"
641 "por %%mm3, %%mm0\n\t"
642 MOVNTQ" %%mm0, %0\n\t"
643 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
647 __asm __volatile(SFENCE:::"memory");
648 __asm __volatile(EMMS:::"memory");
652 register int rgb = *(uint32_t*)s; s += 4;
653 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
657 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
659 const uint8_t *s = src;
662 const uint8_t *mm_end;
664 uint16_t *d = (uint16_t *)dst;
667 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
671 ::"m"(red_16mask),"m"(green_16mask));
678 "movd 3%1, %%mm3\n\t"
679 "punpckldq 6%1, %%mm0\n\t"
680 "punpckldq 9%1, %%mm3\n\t"
681 "movq %%mm0, %%mm1\n\t"
682 "movq %%mm0, %%mm2\n\t"
683 "movq %%mm3, %%mm4\n\t"
684 "movq %%mm3, %%mm5\n\t"
685 "psrlq $3, %%mm0\n\t"
686 "psrlq $3, %%mm3\n\t"
689 "psrlq $5, %%mm1\n\t"
690 "psrlq $5, %%mm4\n\t"
691 "pand %%mm6, %%mm1\n\t"
692 "pand %%mm6, %%mm4\n\t"
693 "psrlq $8, %%mm2\n\t"
694 "psrlq $8, %%mm5\n\t"
695 "pand %%mm7, %%mm2\n\t"
696 "pand %%mm7, %%mm5\n\t"
697 "por %%mm1, %%mm0\n\t"
698 "por %%mm4, %%mm3\n\t"
699 "por %%mm2, %%mm0\n\t"
700 "por %%mm5, %%mm3\n\t"
701 "psllq $16, %%mm3\n\t"
702 "por %%mm3, %%mm0\n\t"
703 MOVNTQ" %%mm0, %0\n\t"
704 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
708 __asm __volatile(SFENCE:::"memory");
709 __asm __volatile(EMMS:::"memory");
716 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
720 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
722 const uint8_t *s = src;
725 const uint8_t *mm_end;
727 uint16_t *d = (uint16_t *)dst;
730 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
734 ::"m"(red_16mask),"m"(green_16mask));
741 "movd 3%1, %%mm3\n\t"
742 "punpckldq 6%1, %%mm0\n\t"
743 "punpckldq 9%1, %%mm3\n\t"
744 "movq %%mm0, %%mm1\n\t"
745 "movq %%mm0, %%mm2\n\t"
746 "movq %%mm3, %%mm4\n\t"
747 "movq %%mm3, %%mm5\n\t"
748 "psllq $8, %%mm0\n\t"
749 "psllq $8, %%mm3\n\t"
750 "pand %%mm7, %%mm0\n\t"
751 "pand %%mm7, %%mm3\n\t"
752 "psrlq $5, %%mm1\n\t"
753 "psrlq $5, %%mm4\n\t"
754 "pand %%mm6, %%mm1\n\t"
755 "pand %%mm6, %%mm4\n\t"
756 "psrlq $19, %%mm2\n\t"
757 "psrlq $19, %%mm5\n\t"
760 "por %%mm1, %%mm0\n\t"
761 "por %%mm4, %%mm3\n\t"
762 "por %%mm2, %%mm0\n\t"
763 "por %%mm5, %%mm3\n\t"
764 "psllq $16, %%mm3\n\t"
765 "por %%mm3, %%mm0\n\t"
766 MOVNTQ" %%mm0, %0\n\t"
767 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
771 __asm __volatile(SFENCE:::"memory");
772 __asm __volatile(EMMS:::"memory");
779 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
783 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
785 const uint8_t *s = src;
788 const uint8_t *mm_end;
790 uint16_t *d = (uint16_t *)dst;
793 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
797 ::"m"(red_15mask),"m"(green_15mask));
804 "movd 3%1, %%mm3\n\t"
805 "punpckldq 6%1, %%mm0\n\t"
806 "punpckldq 9%1, %%mm3\n\t"
807 "movq %%mm0, %%mm1\n\t"
808 "movq %%mm0, %%mm2\n\t"
809 "movq %%mm3, %%mm4\n\t"
810 "movq %%mm3, %%mm5\n\t"
811 "psrlq $3, %%mm0\n\t"
812 "psrlq $3, %%mm3\n\t"
815 "psrlq $6, %%mm1\n\t"
816 "psrlq $6, %%mm4\n\t"
817 "pand %%mm6, %%mm1\n\t"
818 "pand %%mm6, %%mm4\n\t"
819 "psrlq $9, %%mm2\n\t"
820 "psrlq $9, %%mm5\n\t"
821 "pand %%mm7, %%mm2\n\t"
822 "pand %%mm7, %%mm5\n\t"
823 "por %%mm1, %%mm0\n\t"
824 "por %%mm4, %%mm3\n\t"
825 "por %%mm2, %%mm0\n\t"
826 "por %%mm5, %%mm3\n\t"
827 "psllq $16, %%mm3\n\t"
828 "por %%mm3, %%mm0\n\t"
829 MOVNTQ" %%mm0, %0\n\t"
830 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
834 __asm __volatile(SFENCE:::"memory");
835 __asm __volatile(EMMS:::"memory");
842 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
846 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
848 const uint8_t *s = src;
851 const uint8_t *mm_end;
853 uint16_t *d = (uint16_t *)dst;
856 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
860 ::"m"(red_15mask),"m"(green_15mask));
867 "movd 3%1, %%mm3\n\t"
868 "punpckldq 6%1, %%mm0\n\t"
869 "punpckldq 9%1, %%mm3\n\t"
870 "movq %%mm0, %%mm1\n\t"
871 "movq %%mm0, %%mm2\n\t"
872 "movq %%mm3, %%mm4\n\t"
873 "movq %%mm3, %%mm5\n\t"
874 "psllq $7, %%mm0\n\t"
875 "psllq $7, %%mm3\n\t"
876 "pand %%mm7, %%mm0\n\t"
877 "pand %%mm7, %%mm3\n\t"
878 "psrlq $6, %%mm1\n\t"
879 "psrlq $6, %%mm4\n\t"
880 "pand %%mm6, %%mm1\n\t"
881 "pand %%mm6, %%mm4\n\t"
882 "psrlq $19, %%mm2\n\t"
883 "psrlq $19, %%mm5\n\t"
886 "por %%mm1, %%mm0\n\t"
887 "por %%mm4, %%mm3\n\t"
888 "por %%mm2, %%mm0\n\t"
889 "por %%mm5, %%mm3\n\t"
890 "psllq $16, %%mm3\n\t"
891 "por %%mm3, %%mm0\n\t"
892 MOVNTQ" %%mm0, %0\n\t"
893 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
897 __asm __volatile(SFENCE:::"memory");
898 __asm __volatile(EMMS:::"memory");
905 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
910 I use here less accurate approximation by simply
911 left-shifting the input
912 value and filling the low order bits with
913 zeroes. This method improves png's
914 compression but this scheme cannot reproduce white exactly, since it does not
915 generate an all-ones maximum value; the net effect is to darken the
918 The better method should be "left bit replication":
928 | Leftmost Bits Repeated to Fill Open Bits
932 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
936 const uint16_t *mm_end;
938 uint8_t *d = (uint8_t *)dst;
939 const uint16_t *s = (uint16_t *)src;
940 end = s + src_size/2;
942 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
954 "psllq $3, %%mm0\n\t"
955 "psrlq $2, %%mm1\n\t"
956 "psrlq $7, %%mm2\n\t"
957 "movq %%mm0, %%mm3\n\t"
958 "movq %%mm1, %%mm4\n\t"
959 "movq %%mm2, %%mm5\n\t"
960 "punpcklwd %5, %%mm0\n\t"
961 "punpcklwd %5, %%mm1\n\t"
962 "punpcklwd %5, %%mm2\n\t"
963 "punpckhwd %5, %%mm3\n\t"
964 "punpckhwd %5, %%mm4\n\t"
965 "punpckhwd %5, %%mm5\n\t"
966 "psllq $8, %%mm1\n\t"
967 "psllq $16, %%mm2\n\t"
968 "por %%mm1, %%mm0\n\t"
969 "por %%mm2, %%mm0\n\t"
970 "psllq $8, %%mm4\n\t"
971 "psllq $16, %%mm5\n\t"
972 "por %%mm4, %%mm3\n\t"
973 "por %%mm5, %%mm3\n\t"
975 "movq %%mm0, %%mm6\n\t"
976 "movq %%mm3, %%mm7\n\t"
978 "movq 8%1, %%mm0\n\t"
979 "movq 8%1, %%mm1\n\t"
980 "movq 8%1, %%mm2\n\t"
984 "psllq $3, %%mm0\n\t"
985 "psrlq $2, %%mm1\n\t"
986 "psrlq $7, %%mm2\n\t"
987 "movq %%mm0, %%mm3\n\t"
988 "movq %%mm1, %%mm4\n\t"
989 "movq %%mm2, %%mm5\n\t"
990 "punpcklwd %5, %%mm0\n\t"
991 "punpcklwd %5, %%mm1\n\t"
992 "punpcklwd %5, %%mm2\n\t"
993 "punpckhwd %5, %%mm3\n\t"
994 "punpckhwd %5, %%mm4\n\t"
995 "punpckhwd %5, %%mm5\n\t"
996 "psllq $8, %%mm1\n\t"
997 "psllq $16, %%mm2\n\t"
998 "por %%mm1, %%mm0\n\t"
999 "por %%mm2, %%mm0\n\t"
1000 "psllq $8, %%mm4\n\t"
1001 "psllq $16, %%mm5\n\t"
1002 "por %%mm4, %%mm3\n\t"
1003 "por %%mm5, %%mm3\n\t"
1006 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1008 /* Borrowed 32 to 24 */
1010 "movq %%mm0, %%mm4\n\t"
1011 "movq %%mm3, %%mm5\n\t"
1012 "movq %%mm6, %%mm0\n\t"
1013 "movq %%mm7, %%mm1\n\t"
1015 "movq %%mm4, %%mm6\n\t"
1016 "movq %%mm5, %%mm7\n\t"
1017 "movq %%mm0, %%mm2\n\t"
1018 "movq %%mm1, %%mm3\n\t"
1020 "psrlq $8, %%mm2\n\t"
1021 "psrlq $8, %%mm3\n\t"
1022 "psrlq $8, %%mm6\n\t"
1023 "psrlq $8, %%mm7\n\t"
1024 "pand %2, %%mm0\n\t"
1025 "pand %2, %%mm1\n\t"
1026 "pand %2, %%mm4\n\t"
1027 "pand %2, %%mm5\n\t"
1028 "pand %3, %%mm2\n\t"
1029 "pand %3, %%mm3\n\t"
1030 "pand %3, %%mm6\n\t"
1031 "pand %3, %%mm7\n\t"
1032 "por %%mm2, %%mm0\n\t"
1033 "por %%mm3, %%mm1\n\t"
1034 "por %%mm6, %%mm4\n\t"
1035 "por %%mm7, %%mm5\n\t"
1037 "movq %%mm1, %%mm2\n\t"
1038 "movq %%mm4, %%mm3\n\t"
1039 "psllq $48, %%mm2\n\t"
1040 "psllq $32, %%mm3\n\t"
1041 "pand %4, %%mm2\n\t"
1042 "pand %5, %%mm3\n\t"
1043 "por %%mm2, %%mm0\n\t"
1044 "psrlq $16, %%mm1\n\t"
1045 "psrlq $32, %%mm4\n\t"
1046 "psllq $16, %%mm5\n\t"
1047 "por %%mm3, %%mm1\n\t"
1048 "pand %6, %%mm5\n\t"
1049 "por %%mm5, %%mm4\n\t"
1051 MOVNTQ" %%mm0, %0\n\t"
1052 MOVNTQ" %%mm1, 8%0\n\t"
1053 MOVNTQ" %%mm4, 16%0"
1056 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1061 __asm __volatile(SFENCE:::"memory");
1062 __asm __volatile(EMMS:::"memory");
1066 register uint16_t bgr;
1068 *d++ = (bgr&0x1F)<<3;
1069 *d++ = (bgr&0x3E0)>>2;
1070 *d++ = (bgr&0x7C00)>>7;
1074 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1076 const uint16_t *end;
1078 const uint16_t *mm_end;
1080 uint8_t *d = (uint8_t *)dst;
1081 const uint16_t *s = (const uint16_t *)src;
1082 end = s + src_size/2;
1084 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1090 "movq %1, %%mm0\n\t"
1091 "movq %1, %%mm1\n\t"
1092 "movq %1, %%mm2\n\t"
1093 "pand %2, %%mm0\n\t"
1094 "pand %3, %%mm1\n\t"
1095 "pand %4, %%mm2\n\t"
1096 "psllq $3, %%mm0\n\t"
1097 "psrlq $3, %%mm1\n\t"
1098 "psrlq $8, %%mm2\n\t"
1099 "movq %%mm0, %%mm3\n\t"
1100 "movq %%mm1, %%mm4\n\t"
1101 "movq %%mm2, %%mm5\n\t"
1102 "punpcklwd %5, %%mm0\n\t"
1103 "punpcklwd %5, %%mm1\n\t"
1104 "punpcklwd %5, %%mm2\n\t"
1105 "punpckhwd %5, %%mm3\n\t"
1106 "punpckhwd %5, %%mm4\n\t"
1107 "punpckhwd %5, %%mm5\n\t"
1108 "psllq $8, %%mm1\n\t"
1109 "psllq $16, %%mm2\n\t"
1110 "por %%mm1, %%mm0\n\t"
1111 "por %%mm2, %%mm0\n\t"
1112 "psllq $8, %%mm4\n\t"
1113 "psllq $16, %%mm5\n\t"
1114 "por %%mm4, %%mm3\n\t"
1115 "por %%mm5, %%mm3\n\t"
1117 "movq %%mm0, %%mm6\n\t"
1118 "movq %%mm3, %%mm7\n\t"
1120 "movq 8%1, %%mm0\n\t"
1121 "movq 8%1, %%mm1\n\t"
1122 "movq 8%1, %%mm2\n\t"
1123 "pand %2, %%mm0\n\t"
1124 "pand %3, %%mm1\n\t"
1125 "pand %4, %%mm2\n\t"
1126 "psllq $3, %%mm0\n\t"
1127 "psrlq $3, %%mm1\n\t"
1128 "psrlq $8, %%mm2\n\t"
1129 "movq %%mm0, %%mm3\n\t"
1130 "movq %%mm1, %%mm4\n\t"
1131 "movq %%mm2, %%mm5\n\t"
1132 "punpcklwd %5, %%mm0\n\t"
1133 "punpcklwd %5, %%mm1\n\t"
1134 "punpcklwd %5, %%mm2\n\t"
1135 "punpckhwd %5, %%mm3\n\t"
1136 "punpckhwd %5, %%mm4\n\t"
1137 "punpckhwd %5, %%mm5\n\t"
1138 "psllq $8, %%mm1\n\t"
1139 "psllq $16, %%mm2\n\t"
1140 "por %%mm1, %%mm0\n\t"
1141 "por %%mm2, %%mm0\n\t"
1142 "psllq $8, %%mm4\n\t"
1143 "psllq $16, %%mm5\n\t"
1144 "por %%mm4, %%mm3\n\t"
1145 "por %%mm5, %%mm3\n\t"
1147 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1149 /* Borrowed 32 to 24 */
1151 "movq %%mm0, %%mm4\n\t"
1152 "movq %%mm3, %%mm5\n\t"
1153 "movq %%mm6, %%mm0\n\t"
1154 "movq %%mm7, %%mm1\n\t"
1156 "movq %%mm4, %%mm6\n\t"
1157 "movq %%mm5, %%mm7\n\t"
1158 "movq %%mm0, %%mm2\n\t"
1159 "movq %%mm1, %%mm3\n\t"
1161 "psrlq $8, %%mm2\n\t"
1162 "psrlq $8, %%mm3\n\t"
1163 "psrlq $8, %%mm6\n\t"
1164 "psrlq $8, %%mm7\n\t"
1165 "pand %2, %%mm0\n\t"
1166 "pand %2, %%mm1\n\t"
1167 "pand %2, %%mm4\n\t"
1168 "pand %2, %%mm5\n\t"
1169 "pand %3, %%mm2\n\t"
1170 "pand %3, %%mm3\n\t"
1171 "pand %3, %%mm6\n\t"
1172 "pand %3, %%mm7\n\t"
1173 "por %%mm2, %%mm0\n\t"
1174 "por %%mm3, %%mm1\n\t"
1175 "por %%mm6, %%mm4\n\t"
1176 "por %%mm7, %%mm5\n\t"
1178 "movq %%mm1, %%mm2\n\t"
1179 "movq %%mm4, %%mm3\n\t"
1180 "psllq $48, %%mm2\n\t"
1181 "psllq $32, %%mm3\n\t"
1182 "pand %4, %%mm2\n\t"
1183 "pand %5, %%mm3\n\t"
1184 "por %%mm2, %%mm0\n\t"
1185 "psrlq $16, %%mm1\n\t"
1186 "psrlq $32, %%mm4\n\t"
1187 "psllq $16, %%mm5\n\t"
1188 "por %%mm3, %%mm1\n\t"
1189 "pand %6, %%mm5\n\t"
1190 "por %%mm5, %%mm4\n\t"
1192 MOVNTQ" %%mm0, %0\n\t"
1193 MOVNTQ" %%mm1, 8%0\n\t"
1194 MOVNTQ" %%mm4, 16%0"
1197 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1202 __asm __volatile(SFENCE:::"memory");
1203 __asm __volatile(EMMS:::"memory");
1207 register uint16_t bgr;
1209 *d++ = (bgr&0x1F)<<3;
1210 *d++ = (bgr&0x7E0)>>3;
1211 *d++ = (bgr&0xF800)>>8;
1215 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1217 const uint16_t *end;
1219 const uint16_t *mm_end;
1221 uint8_t *d = (uint8_t *)dst;
1222 const uint16_t *s = (const uint16_t *)src;
1223 end = s + src_size/2;
1225 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1226 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1232 "movq %1, %%mm0\n\t"
1233 "movq %1, %%mm1\n\t"
1234 "movq %1, %%mm2\n\t"
1235 "pand %2, %%mm0\n\t"
1236 "pand %3, %%mm1\n\t"
1237 "pand %4, %%mm2\n\t"
1238 "psllq $3, %%mm0\n\t"
1239 "psrlq $2, %%mm1\n\t"
1240 "psrlq $7, %%mm2\n\t"
1241 "movq %%mm0, %%mm3\n\t"
1242 "movq %%mm1, %%mm4\n\t"
1243 "movq %%mm2, %%mm5\n\t"
1244 "punpcklwd %%mm7, %%mm0\n\t"
1245 "punpcklwd %%mm7, %%mm1\n\t"
1246 "punpcklwd %%mm7, %%mm2\n\t"
1247 "punpckhwd %%mm7, %%mm3\n\t"
1248 "punpckhwd %%mm7, %%mm4\n\t"
1249 "punpckhwd %%mm7, %%mm5\n\t"
1250 "psllq $8, %%mm1\n\t"
1251 "psllq $16, %%mm2\n\t"
1252 "por %%mm1, %%mm0\n\t"
1253 "por %%mm2, %%mm0\n\t"
1254 "psllq $8, %%mm4\n\t"
1255 "psllq $16, %%mm5\n\t"
1256 "por %%mm4, %%mm3\n\t"
1257 "por %%mm5, %%mm3\n\t"
1258 MOVNTQ" %%mm0, %0\n\t"
1259 MOVNTQ" %%mm3, 8%0\n\t"
1261 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1266 __asm __volatile(SFENCE:::"memory");
1267 __asm __volatile(EMMS:::"memory");
1271 #if 0 //slightly slower on athlon
1273 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1275 register uint16_t bgr;
1277 #ifdef WORDS_BIGENDIAN
1279 *d++ = (bgr&0x7C00)>>7;
1280 *d++ = (bgr&0x3E0)>>2;
1281 *d++ = (bgr&0x1F)<<3;
1283 *d++ = (bgr&0x1F)<<3;
1284 *d++ = (bgr&0x3E0)>>2;
1285 *d++ = (bgr&0x7C00)>>7;
1293 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1295 const uint16_t *end;
1297 const uint16_t *mm_end;
1299 uint8_t *d = (uint8_t *)dst;
1300 const uint16_t *s = (uint16_t *)src;
1301 end = s + src_size/2;
1303 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1304 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1310 "movq %1, %%mm0\n\t"
1311 "movq %1, %%mm1\n\t"
1312 "movq %1, %%mm2\n\t"
1313 "pand %2, %%mm0\n\t"
1314 "pand %3, %%mm1\n\t"
1315 "pand %4, %%mm2\n\t"
1316 "psllq $3, %%mm0\n\t"
1317 "psrlq $3, %%mm1\n\t"
1318 "psrlq $8, %%mm2\n\t"
1319 "movq %%mm0, %%mm3\n\t"
1320 "movq %%mm1, %%mm4\n\t"
1321 "movq %%mm2, %%mm5\n\t"
1322 "punpcklwd %%mm7, %%mm0\n\t"
1323 "punpcklwd %%mm7, %%mm1\n\t"
1324 "punpcklwd %%mm7, %%mm2\n\t"
1325 "punpckhwd %%mm7, %%mm3\n\t"
1326 "punpckhwd %%mm7, %%mm4\n\t"
1327 "punpckhwd %%mm7, %%mm5\n\t"
1328 "psllq $8, %%mm1\n\t"
1329 "psllq $16, %%mm2\n\t"
1330 "por %%mm1, %%mm0\n\t"
1331 "por %%mm2, %%mm0\n\t"
1332 "psllq $8, %%mm4\n\t"
1333 "psllq $16, %%mm5\n\t"
1334 "por %%mm4, %%mm3\n\t"
1335 "por %%mm5, %%mm3\n\t"
1336 MOVNTQ" %%mm0, %0\n\t"
1337 MOVNTQ" %%mm3, 8%0\n\t"
1339 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1344 __asm __volatile(SFENCE:::"memory");
1345 __asm __volatile(EMMS:::"memory");
1349 register uint16_t bgr;
1351 #ifdef WORDS_BIGENDIAN
1353 *d++ = (bgr&0xF800)>>8;
1354 *d++ = (bgr&0x7E0)>>3;
1355 *d++ = (bgr&0x1F)<<3;
1357 *d++ = (bgr&0x1F)<<3;
1358 *d++ = (bgr&0x7E0)>>3;
1359 *d++ = (bgr&0xF800)>>8;
1365 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1368 /* TODO: unroll this loop */
1370 "xor %%"REG_a", %%"REG_a" \n\t"
1373 PREFETCH" 32(%0, %%"REG_a") \n\t"
1374 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1375 "movq %%mm0, %%mm1 \n\t"
1376 "movq %%mm0, %%mm2 \n\t"
1377 "pslld $16, %%mm0 \n\t"
1378 "psrld $16, %%mm1 \n\t"
1379 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1380 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1381 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1382 "por %%mm0, %%mm2 \n\t"
1383 "por %%mm1, %%mm2 \n\t"
1384 MOVNTQ" %%mm2, (%1, %%"REG_a") \n\t"
1385 "add $8, %%"REG_a" \n\t"
1386 "cmp %2, %%"REG_a" \n\t"
1388 :: "r" (src), "r"(dst), "r" (src_size-7)
1392 __asm __volatile(SFENCE:::"memory");
1393 __asm __volatile(EMMS:::"memory");
1396 unsigned num_pixels = src_size >> 2;
1397 for(i=0; i<num_pixels; i++)
1399 #ifdef WORDS_BIGENDIAN
1400 dst[4*i + 1] = src[4*i + 3];
1401 dst[4*i + 2] = src[4*i + 2];
1402 dst[4*i + 3] = src[4*i + 1];
1404 dst[4*i + 0] = src[4*i + 2];
1405 dst[4*i + 1] = src[4*i + 1];
1406 dst[4*i + 2] = src[4*i + 0];
1412 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1416 long mmx_size= 23 - src_size;
1418 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1419 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1420 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1423 PREFETCH" 32(%1, %%"REG_a") \n\t"
1424 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1425 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1426 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1427 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1428 "pand %%mm5, %%mm0 \n\t"
1429 "pand %%mm6, %%mm1 \n\t"
1430 "pand %%mm7, %%mm2 \n\t"
1431 "por %%mm0, %%mm1 \n\t"
1432 "por %%mm2, %%mm1 \n\t"
1433 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1434 MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
1435 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1436 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1437 "pand %%mm7, %%mm0 \n\t"
1438 "pand %%mm5, %%mm1 \n\t"
1439 "pand %%mm6, %%mm2 \n\t"
1440 "por %%mm0, %%mm1 \n\t"
1441 "por %%mm2, %%mm1 \n\t"
1442 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1443 MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
1444 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1445 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1446 "pand %%mm6, %%mm0 \n\t"
1447 "pand %%mm7, %%mm1 \n\t"
1448 "pand %%mm5, %%mm2 \n\t"
1449 "por %%mm0, %%mm1 \n\t"
1450 "por %%mm2, %%mm1 \n\t"
1451 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1452 "add $24, %%"REG_a" \n\t"
1455 : "r" (src-mmx_size), "r"(dst-mmx_size)
1458 __asm __volatile(SFENCE:::"memory");
1459 __asm __volatile(EMMS:::"memory");
1461 if(mmx_size==23) return; //finihsed, was multiple of 8
1465 src_size= 23-mmx_size;
1469 for(i=0; i<src_size; i+=3)
1473 dst[i + 1] = src[i + 1];
1474 dst[i + 2] = src[i + 0];
1479 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1480 long width, long height,
1481 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1484 const long chromWidth= width>>1;
1485 for(y=0; y<height; y++)
1488 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1490 "xor %%"REG_a", %%"REG_a" \n\t"
1493 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1494 PREFETCH" 32(%2, %%"REG_a") \n\t"
1495 PREFETCH" 32(%3, %%"REG_a") \n\t"
1496 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1497 "movq %%mm0, %%mm2 \n\t" // U(0)
1498 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1499 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1500 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1502 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1503 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1504 "movq %%mm3, %%mm4 \n\t" // Y(0)
1505 "movq %%mm5, %%mm6 \n\t" // Y(8)
1506 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1507 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1508 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1509 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1511 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1512 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1513 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1514 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1516 "add $8, %%"REG_a" \n\t"
1517 "cmp %4, %%"REG_a" \n\t"
1519 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1524 #if defined ARCH_ALPHA && defined HAVE_MVI
1525 #define pl2yuy2(n) \
1530 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1531 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1532 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1533 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1534 yuv1 = (u << 8) + (v << 24); \
1541 uint64_t *qdst = (uint64_t *) dst;
1542 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1543 const uint32_t *yc = (uint32_t *) ysrc;
1544 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1545 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1546 for(i = 0; i < chromWidth; i += 8){
1547 uint64_t y1, y2, yuv1, yuv2;
1550 asm("ldq $31,64(%0)" :: "r"(yc));
1551 asm("ldq $31,64(%0)" :: "r"(yc2));
1552 asm("ldq $31,64(%0)" :: "r"(uc));
1553 asm("ldq $31,64(%0)" :: "r"(vc));
1571 #elif __WORDSIZE >= 64
1573 uint64_t *ldst = (uint64_t *) dst;
1574 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1575 for(i = 0; i < chromWidth; i += 2){
1577 k = yc[0] + (uc[0] << 8) +
1578 (yc[1] << 16) + (vc[0] << 24);
1579 l = yc[2] + (uc[1] << 8) +
1580 (yc[3] << 16) + (vc[1] << 24);
1581 *ldst++ = k + (l << 32);
1588 int i, *idst = (int32_t *) dst;
1589 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1590 for(i = 0; i < chromWidth; i++){
1591 #ifdef WORDS_BIGENDIAN
1592 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1593 (yc[1] << 8) + (vc[0] << 0);
1595 *idst++ = yc[0] + (uc[0] << 8) +
1596 (yc[1] << 16) + (vc[0] << 24);
1604 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1606 usrc += chromStride;
1607 vsrc += chromStride;
1621 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1622 * problem for anyone then tell me, and ill fix it)
* Converts planar YV12 (separate Y, U, V planes) to packed YUY2.
* Each chroma line is reused for 2 luma lines (vertLumPerChroma == 2);
* chroma is NOT interpolated (see FIXME below).
1624 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1625 long width, long height,
1626 long lumStride, long chromStride, long dstStride)
1628 //FIXME interpolate chroma
1629 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Planar YUV -> packed UYVY.
 * The chroma pointers advance only every vertLumPerChroma-th line, so one
 * U/V line serves that many luma lines (2 for YV12, 1 for YUV422P input).
 * vertLumPerChroma must be a power of two (see the mask test at the bottom).
 * chromWidth = width/2 chroma samples per line.
 */
1632 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1633 long width, long height,
1634 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1637 const long chromWidth= width>>1;
1638 for(y=0; y<height; y++)
1641 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1643 "xor %%"REG_a", %%"REG_a" \n\t"
1646 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1647 PREFETCH" 32(%2, %%"REG_a") \n\t"
1648 PREFETCH" 32(%3, %%"REG_a") \n\t"
1649 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1650 "movq %%mm0, %%mm2 \n\t" // U(0)
1651 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1652 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1653 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1655 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1656 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1657 "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0) copy (comment was stale "Y(0)")
1658 "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8) copy (comment was stale "Y(8)")
1659 "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1660 "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1661 "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1662 "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1664 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1665 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1666 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1667 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1669 "add $8, %%"REG_a" \n\t"
1670 "cmp %4, %%"REG_a" \n\t"
1672 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1676 //FIXME adapt the alpha asm code from yv12->yuy2
/* C fallback, wide-word variant: pack two UYVY pixel pairs per 64-bit store */
1678 #if __WORDSIZE >= 64
1680 uint64_t *ldst = (uint64_t *) dst;
1681 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1682 for(i = 0; i < chromWidth; i += 2){
1684 k = uc[0] + (yc[0] << 8) +
1685 (vc[0] << 16) + (yc[1] << 24);
1686 l = uc[1] + (yc[2] << 8) +
1687 (vc[1] << 16) + (yc[3] << 24);
1688 *ldst++ = k + (l << 32);
/* C fallback, 32-bit variant: one UYVY pixel pair per store, byte order
 * swapped on big-endian so the in-memory layout stays U Y V Y */
1695 int i, *idst = (int32_t *) dst;
1696 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1697 for(i = 0; i < chromWidth; i++){
1698 #ifdef WORDS_BIGENDIAN
1699 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1700 (vc[0] << 8) + (yc[1] << 0);
1702 *idst++ = uc[0] + (yc[0] << 8) +
1703 (vc[0] << 16) + (yc[1] << 24);
/* advance chroma only once per vertLumPerChroma luma lines
 * (mask test assumes vertLumPerChroma is a power of two) */
1711 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1713 usrc += chromStride;
1714 vsrc += chromStride;
1728 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1729 * problem for anyone then tell me, and ill fix it)
* Converts planar YV12 to packed UYVY; each chroma line serves 2 luma lines
* (vertLumPerChroma == 2), chroma is NOT interpolated (see FIXME below).
1731 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1732 long width, long height,
1733 long lumStride, long chromStride, long dstStride)
1735 //FIXME interpolate chroma
1736 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1741 * width should be a multiple of 16
* Converts planar YUV422P to packed YUY2; 4:2:2 input already has one chroma
* line per luma line, hence vertLumPerChroma == 1.
1743 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1744 long width, long height,
1745 long lumStride, long chromStride, long dstStride)
1747 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1752 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1753 * problem for anyone then tell me, and ill fix it)
* Packed YUY2 -> planar YV12. Processes 2 source lines per iteration:
* the even line yields Y + subsampled U/V, the odd line yields Y only
* (its chroma is dropped — no vertical chroma averaging).
1755 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1756 long width, long height,
1757 long lumStride, long chromStride, long srcStride)
1760 const long chromWidth= width>>1;
1761 for(y=0; y<height; y+=2)
/* first (even) line: split YUYV into Y plane and packed UV, then split UV */
1765 "xor %%"REG_a", %%"REG_a" \n\t"
1766 "pcmpeqw %%mm7, %%mm7 \n\t"
1767 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1770 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1771 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1772 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1773 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1774 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1775 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1776 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1777 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1778 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1779 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1780 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1782 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1784 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1785 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1786 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1787 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1788 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1789 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1790 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1791 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1792 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1793 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1795 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
1797 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1798 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1799 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1800 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1801 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1802 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1803 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1804 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1806 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1807 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1809 "add $8, %%"REG_a" \n\t"
1810 "cmp %4, %%"REG_a" \n\t"
1812 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1813 : "memory", "%"REG_a
/* second (odd) line: extract Y only, chroma is discarded.
 * NOTE(review): this asm statement reuses the FF00 mask left in %%mm7 by the
 * asm block above without re-initializing it — relies on MMX register state
 * surviving between asm statements; verify no intervening code clobbers it. */
1820 "xor %%"REG_a", %%"REG_a" \n\t"
1823 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1824 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1825 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1826 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1827 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1828 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1829 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1830 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1831 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1832 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1833 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1835 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1836 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1838 "add $8, %%"REG_a" \n\t"
1839 "cmp %4, %%"REG_a" \n\t"
1842 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1843 : "memory", "%"REG_a
/* C fallback: even line — demux Y/U/V byte by byte */
1847 for(i=0; i<chromWidth; i++)
1849 ydst[2*i+0] = src[4*i+0];
1850 udst[i] = src[4*i+1];
1851 ydst[2*i+1] = src[4*i+2];
1852 vdst[i] = src[4*i+3];
/* C fallback: odd line — luma only */
1857 for(i=0; i<chromWidth; i++)
1859 ydst[2*i+0] = src[4*i+0];
1860 ydst[2*i+1] = src[4*i+2];
1863 udst += chromStride;
1864 vdst += chromStride;
/* leave MMX state clean for surrounding FPU code */
1869 asm volatile( EMMS" \n\t"
/*
 * YVU9 -> YV12: only the luma plane is copied here; chroma upscaling
 * (4x subsampled -> 2x subsampled) is not implemented in the visible code
 * (see the XXX below).
 */
1875 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1876 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1877 long width, long height, long lumStride, long chromStride)
/* NOTE(review): assumes lumStride == width for both src and dst, otherwise a
 * flat memcpy of width*height bytes is wrong — confirm against callers */
1880 memcpy(ydst, ysrc, width*height);
1882 /* XXX: implement upscaling for U,V */
/*
 * Upscales one 8-bit plane by 2x in both directions with bilinear-style
 * 3:1 weighting: each output pixel is (3*near + far)/4 of the two nearest
 * source pixels. First/last rows and columns are handled specially.
 */
1885 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
/* first output row: horizontal-only interpolation */
1892 for(x=0; x<srcWidth-1; x++){
1893 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1894 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1896 dst[2*srcWidth-1]= src[srcWidth-1];
1900 for(y=1; y<srcHeight; y++){
1901 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1902 const long mmxSize= srcWidth&~15;
1904 "mov %4, %%"REG_a" \n\t"
1906 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1907 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1908 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1909 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1910 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1911 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
/* applying PAVGB twice with the same operand approximates (3*a + b)/4:
 * avg(a, avg(a, b)) */
1912 PAVGB" %%mm0, %%mm5 \n\t"
1913 PAVGB" %%mm0, %%mm3 \n\t"
1914 PAVGB" %%mm0, %%mm5 \n\t"
1915 PAVGB" %%mm0, %%mm3 \n\t"
1916 PAVGB" %%mm1, %%mm4 \n\t"
1917 PAVGB" %%mm1, %%mm2 \n\t"
1918 PAVGB" %%mm1, %%mm4 \n\t"
1919 PAVGB" %%mm1, %%mm2 \n\t"
1920 "movq %%mm5, %%mm7 \n\t"
1921 "movq %%mm4, %%mm6 \n\t"
1922 "punpcklbw %%mm3, %%mm5 \n\t"
1923 "punpckhbw %%mm3, %%mm7 \n\t"
1924 "punpcklbw %%mm2, %%mm4 \n\t"
1925 "punpckhbw %%mm2, %%mm6 \n\t"
1927 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1928 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1929 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1930 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
/* non-temporal-store-less variant of the four stores above (alternate
 * build path; the surrounding #if/#else lines are outside this excerpt) */
1932 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1933 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1934 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1935 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1937 "add $8, %%"REG_a" \n\t"
1939 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1940 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
1946 const long mmxSize=1;
/* first column of both output rows: vertical-only interpolation */
1948 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1949 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
/* interior: diagonal 3:1 blends of the 2x2 source neighborhood */
1951 for(x=mmxSize-1; x<srcWidth-1; x++){
1952 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1953 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1954 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1955 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
1957 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1958 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* last output row: horizontal-only interpolation again */
1968 for(x=0; x<srcWidth-1; x++){
1969 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1970 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1972 dst[2*srcWidth-1]= src[srcWidth-1];
1974 for(x=0; x<srcWidth; x++){
/* leave MMX state clean for surrounding FPU code */
1981 asm volatile( EMMS" \n\t"
1989 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1990 * problem for anyone then tell me, and ill fix it)
1991 * chrominance data is only taken from every secound line others are ignored FIXME write HQ version
* Packed UYVY -> planar YV12. Same structure as yuy2toyv12 but the byte
* positions of luma and chroma within each 16-bit word are swapped.
* NOTE(review): this function hard-codes %%eax/addl/cmpl while sibling
* functions in this file use REG_a/add/cmp — inconsistent, and worth
* confirming it is correct on 64-bit builds.
1993 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1994 long width, long height,
1995 long lumStride, long chromStride, long srcStride)
1998 const long chromWidth= width>>1;
1999 for(y=0; y<height; y+=2)
/* even line: split UYVY into Y plane and packed UV, then split UV */
2003 "xorl %%eax, %%eax \n\t"
2004 "pcmpeqw %%mm7, %%mm7 \n\t"
2005 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2008 PREFETCH" 64(%0, %%eax, 4) \n\t"
2009 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2010 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2011 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2012 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2013 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2014 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2015 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2016 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2017 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2018 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2020 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
2022 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
2023 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2024 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2025 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2026 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2027 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2028 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2029 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2030 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2031 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2033 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
2035 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2036 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2037 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2038 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2039 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2040 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2041 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2042 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2044 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2045 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2047 "addl $8, %%eax \n\t"
2048 "cmpl %4, %%eax \n\t"
2050 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* odd line: extract Y only, chroma discarded (mm7 mask reused from above) */
2058 "xorl %%eax, %%eax \n\t"
2061 PREFETCH" 64(%0, %%eax, 4) \n\t"
2062 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)  (comment was stale "YUYV")
2063 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2064 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
2065 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
2066 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2067 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2068 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2069 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2070 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2071 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2073 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2074 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2076 "addl $8, %%eax \n\t"
2077 "cmpl %4, %%eax \n\t"
2080 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* C fallback: even line — demux U/Y/V byte by byte */
2085 for(i=0; i<chromWidth; i++)
2087 udst[i] = src[4*i+0];
2088 ydst[2*i+0] = src[4*i+1];
2089 vdst[i] = src[4*i+2];
2090 ydst[2*i+1] = src[4*i+3];
/* C fallback: odd line — luma only */
2095 for(i=0; i<chromWidth; i++)
2097 ydst[2*i+0] = src[4*i+1];
2098 ydst[2*i+1] = src[4*i+3];
2101 udst += chromStride;
2102 vdst += chromStride;
/* leave MMX state clean for surrounding FPU code */
2107 asm volatile( EMMS" \n\t"
2115 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2116 * problem for anyone then tell me, and ill fix it)
2117 * chrominance data is only taken from every secound line others are ignored in the C version FIXME write HQ version
* Packed BGR24 -> planar YV12.
* MMX path: one pass computes the Y plane for a line (4 pixels per group,
* pmaddwd against bgr2YCoeff), a second pass computes subsampled U/V by
* averaging 2x2 pixel blocks across two source lines and applying
* bgr2UCoeff/bgr2VCoeff. FAST_BGR2YV12 skips the intermediate psrad $8
* precision steps. C fallback uses the RY/GY/BY etc. integer coefficients.
2119 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2120 long width, long height,
2121 long lumStride, long chromStride, long srcStride)
2124 const long chromWidth= width>>1;
2126 for(y=0; y<height-2; y+=2)
/* ---- luma pass: REG_a counts pixels from -width up to 0, REG_d = 3*REG_a
 * indexes the 3-byte-per-pixel source ---- */
2132 "mov %2, %%"REG_a" \n\t"
2133 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2134 "movq "MANGLE(w1111)", %%mm5 \n\t"
2135 "pxor %%mm7, %%mm7 \n\t"
2136 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2139 PREFETCH" 64(%0, %%"REG_d") \n\t"
/* pixels 0..3: widen each BGR triple to words and dot with Y coefficients */
2140 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2141 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2142 "punpcklbw %%mm7, %%mm0 \n\t"
2143 "punpcklbw %%mm7, %%mm1 \n\t"
2144 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2145 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2146 "punpcklbw %%mm7, %%mm2 \n\t"
2147 "punpcklbw %%mm7, %%mm3 \n\t"
2148 "pmaddwd %%mm6, %%mm0 \n\t"
2149 "pmaddwd %%mm6, %%mm1 \n\t"
2150 "pmaddwd %%mm6, %%mm2 \n\t"
2151 "pmaddwd %%mm6, %%mm3 \n\t"
2152 #ifndef FAST_BGR2YV12
2153 "psrad $8, %%mm0 \n\t"
2154 "psrad $8, %%mm1 \n\t"
2155 "psrad $8, %%mm2 \n\t"
2156 "psrad $8, %%mm3 \n\t"
2158 "packssdw %%mm1, %%mm0 \n\t"
2159 "packssdw %%mm3, %%mm2 \n\t"
2160 "pmaddwd %%mm5, %%mm0 \n\t"
2161 "pmaddwd %%mm5, %%mm2 \n\t"
2162 "packssdw %%mm2, %%mm0 \n\t"
2163 "psraw $7, %%mm0 \n\t"
/* pixels 4..7: same as above */
2165 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2166 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2167 "punpcklbw %%mm7, %%mm4 \n\t"
2168 "punpcklbw %%mm7, %%mm1 \n\t"
2169 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2170 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2171 "punpcklbw %%mm7, %%mm2 \n\t"
2172 "punpcklbw %%mm7, %%mm3 \n\t"
2173 "pmaddwd %%mm6, %%mm4 \n\t"
2174 "pmaddwd %%mm6, %%mm1 \n\t"
2175 "pmaddwd %%mm6, %%mm2 \n\t"
2176 "pmaddwd %%mm6, %%mm3 \n\t"
2177 #ifndef FAST_BGR2YV12
2178 "psrad $8, %%mm4 \n\t"
2179 "psrad $8, %%mm1 \n\t"
2180 "psrad $8, %%mm2 \n\t"
2181 "psrad $8, %%mm3 \n\t"
2183 "packssdw %%mm1, %%mm4 \n\t"
2184 "packssdw %%mm3, %%mm2 \n\t"
2185 "pmaddwd %%mm5, %%mm4 \n\t"
2186 "pmaddwd %%mm5, %%mm2 \n\t"
2187 "add $24, %%"REG_d" \n\t"
2188 "packssdw %%mm2, %%mm4 \n\t"
2189 "psraw $7, %%mm4 \n\t"
/* pack the 8 Y values, add the +16 luma offset, store */
2191 "packuswb %%mm4, %%mm0 \n\t"
2192 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2194 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2195 "add $8, %%"REG_a" \n\t"
2197 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2198 : "%"REG_a, "%"REG_d
/* ---- chroma pass: operands %0/%1 are this line and the next line;
 * each U/V sample averages a 2x2 pixel block ---- */
2205 "mov %4, %%"REG_a" \n\t"
2206 "movq "MANGLE(w1111)", %%mm5 \n\t"
2207 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2208 "pxor %%mm7, %%mm7 \n\t"
2209 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2210 "add %%"REG_d", %%"REG_d" \n\t"
2213 PREFETCH" 64(%0, %%"REG_d") \n\t"
2214 PREFETCH" 64(%1, %%"REG_d") \n\t"
/* fast 2x2 average via PAVGB when available */
2215 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2216 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2217 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2218 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2219 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2220 PAVGB" %%mm1, %%mm0 \n\t"
2221 PAVGB" %%mm3, %%mm2 \n\t"
2222 "movq %%mm0, %%mm1 \n\t"
2223 "movq %%mm2, %%mm3 \n\t"
2224 "psrlq $24, %%mm0 \n\t"
2225 "psrlq $24, %%mm2 \n\t"
2226 PAVGB" %%mm1, %%mm0 \n\t"
2227 PAVGB" %%mm3, %%mm2 \n\t"
2228 "punpcklbw %%mm7, %%mm0 \n\t"
2229 "punpcklbw %%mm7, %%mm2 \n\t"
/* plain-MMX 2x2 average: widen and sum four pixels, then >>2 */
2231 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2232 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2233 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2234 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2235 "punpcklbw %%mm7, %%mm0 \n\t"
2236 "punpcklbw %%mm7, %%mm1 \n\t"
2237 "punpcklbw %%mm7, %%mm2 \n\t"
2238 "punpcklbw %%mm7, %%mm3 \n\t"
2239 "paddw %%mm1, %%mm0 \n\t"
2240 "paddw %%mm3, %%mm2 \n\t"
2241 "paddw %%mm2, %%mm0 \n\t"
2242 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2243 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2244 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2245 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2246 "punpcklbw %%mm7, %%mm4 \n\t"
2247 "punpcklbw %%mm7, %%mm1 \n\t"
2248 "punpcklbw %%mm7, %%mm2 \n\t"
2249 "punpcklbw %%mm7, %%mm3 \n\t"
2250 "paddw %%mm1, %%mm4 \n\t"
2251 "paddw %%mm3, %%mm2 \n\t"
2252 "paddw %%mm4, %%mm2 \n\t"
2253 "psrlw $2, %%mm0 \n\t"
2254 "psrlw $2, %%mm2 \n\t"
/* dot the averaged BGR with U and V coefficient vectors */
2256 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2257 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2259 "pmaddwd %%mm0, %%mm1 \n\t"
2260 "pmaddwd %%mm2, %%mm3 \n\t"
2261 "pmaddwd %%mm6, %%mm0 \n\t"
2262 "pmaddwd %%mm6, %%mm2 \n\t"
2263 #ifndef FAST_BGR2YV12
2264 "psrad $8, %%mm0 \n\t"
2265 "psrad $8, %%mm1 \n\t"
2266 "psrad $8, %%mm2 \n\t"
2267 "psrad $8, %%mm3 \n\t"
2269 "packssdw %%mm2, %%mm0 \n\t"
2270 "packssdw %%mm3, %%mm1 \n\t"
2271 "pmaddwd %%mm5, %%mm0 \n\t"
2272 "pmaddwd %%mm5, %%mm1 \n\t"
2273 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2274 "psraw $7, %%mm0 \n\t"
/* second pair of 2x2 blocks (pixels 4..7), same two variants */
2276 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2277 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2278 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2279 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2280 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2281 PAVGB" %%mm1, %%mm4 \n\t"
2282 PAVGB" %%mm3, %%mm2 \n\t"
2283 "movq %%mm4, %%mm1 \n\t"
2284 "movq %%mm2, %%mm3 \n\t"
2285 "psrlq $24, %%mm4 \n\t"
2286 "psrlq $24, %%mm2 \n\t"
2287 PAVGB" %%mm1, %%mm4 \n\t"
2288 PAVGB" %%mm3, %%mm2 \n\t"
2289 "punpcklbw %%mm7, %%mm4 \n\t"
2290 "punpcklbw %%mm7, %%mm2 \n\t"
2292 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2293 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2294 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2295 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2296 "punpcklbw %%mm7, %%mm4 \n\t"
2297 "punpcklbw %%mm7, %%mm1 \n\t"
2298 "punpcklbw %%mm7, %%mm2 \n\t"
2299 "punpcklbw %%mm7, %%mm3 \n\t"
2300 "paddw %%mm1, %%mm4 \n\t"
2301 "paddw %%mm3, %%mm2 \n\t"
2302 "paddw %%mm2, %%mm4 \n\t"
2303 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2304 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2305 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2306 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2307 "punpcklbw %%mm7, %%mm5 \n\t"
2308 "punpcklbw %%mm7, %%mm1 \n\t"
2309 "punpcklbw %%mm7, %%mm2 \n\t"
2310 "punpcklbw %%mm7, %%mm3 \n\t"
2311 "paddw %%mm1, %%mm5 \n\t"
2312 "paddw %%mm3, %%mm2 \n\t"
2313 "paddw %%mm5, %%mm2 \n\t"
2314 "movq "MANGLE(w1111)", %%mm5 \n\t"
2315 "psrlw $2, %%mm4 \n\t"
2316 "psrlw $2, %%mm2 \n\t"
2318 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2319 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2321 "pmaddwd %%mm4, %%mm1 \n\t"
2322 "pmaddwd %%mm2, %%mm3 \n\t"
2323 "pmaddwd %%mm6, %%mm4 \n\t"
2324 "pmaddwd %%mm6, %%mm2 \n\t"
2325 #ifndef FAST_BGR2YV12
2326 "psrad $8, %%mm4 \n\t"
2327 "psrad $8, %%mm1 \n\t"
2328 "psrad $8, %%mm2 \n\t"
2329 "psrad $8, %%mm3 \n\t"
2331 "packssdw %%mm2, %%mm4 \n\t"
2332 "packssdw %%mm3, %%mm1 \n\t"
2333 "pmaddwd %%mm5, %%mm4 \n\t"
2334 "pmaddwd %%mm5, %%mm1 \n\t"
2335 "add $24, %%"REG_d" \n\t"
2336 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2337 "psraw $7, %%mm4 \n\t"
/* interleave U0..U3 / V0..V3, add the +128 chroma offset, store 4 bytes
 * to the U plane (%2) and 4 bytes to the V plane (%3) */
2339 "movq %%mm0, %%mm1 \n\t"
2340 "punpckldq %%mm4, %%mm0 \n\t"
2341 "punpckhdq %%mm4, %%mm1 \n\t"
2342 "packsswb %%mm1, %%mm0 \n\t"
2343 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2344 "movd %%mm0, (%2, %%"REG_a") \n\t"
2345 "punpckhdq %%mm0, %%mm0 \n\t"
2346 "movd %%mm0, (%3, %%"REG_a") \n\t"
2347 "add $4, %%"REG_a" \n\t"
2349 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2350 : "%"REG_a, "%"REG_d
2353 udst += chromStride;
2354 vdst += chromStride;
/* leave MMX state clean for surrounding FPU code */
2358 asm volatile( EMMS" \n\t"
/* C fallback / remaining lines: BT-style integer coefficients, chroma
 * taken from the even line only (see FIXME in the header comment) */
2364 for(; y<height; y+=2)
2367 for(i=0; i<chromWidth; i++)
2369 unsigned int b= src[6*i+0];
2370 unsigned int g= src[6*i+1];
2371 unsigned int r= src[6*i+2];
2373 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2374 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2375 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2385 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/* odd line: luma only */
2391 for(i=0; i<chromWidth; i++)
2393 unsigned int b= src[6*i+0];
2394 unsigned int g= src[6*i+1];
2395 unsigned int r= src[6*i+2];
2397 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2405 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2408 udst += chromStride;
2409 vdst += chromStride;
/*
 * Interleaves two byte planes: dest[2i] = src1[i], dest[2i+1] = src2[i],
 * line by line. SSE2 path does 16 bytes per iteration, MMX path 16 bytes
 * via two 8-byte halves; leftover width%16 bytes are handled in C.
 */
2415 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2416 long width, long height, long src1Stride,
2417 long src2Stride, long dstStride){
2420 for(h=0; h < height; h++)
/* SSE2 variant. NOTE(review): movdqa requires 16-byte-aligned loads —
 * src1/src1Stride alignment is assumed here, confirm against callers. */
2427 "xor %%"REG_a", %%"REG_a" \n\t"
2429 PREFETCH" 64(%1, %%"REG_a") \n\t"
2430 PREFETCH" 64(%2, %%"REG_a") \n\t"
/* load src1 twice: one copy for the low-interleave, one for the high */
2431 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2432 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2433 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2434 "punpcklbw %%xmm2, %%xmm0 \n\t"
2435 "punpckhbw %%xmm2, %%xmm1 \n\t"
2436 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2437 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2438 "add $16, %%"REG_a" \n\t"
2439 "cmp %3, %%"REG_a" \n\t"
2441 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2442 : "memory", "%"REG_a""
/* MMX variant: same interleave, 8 bytes per register pair */
2446 "xor %%"REG_a", %%"REG_a" \n\t"
2448 PREFETCH" 64(%1, %%"REG_a") \n\t"
2449 PREFETCH" 64(%2, %%"REG_a") \n\t"
2450 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2451 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2452 "movq %%mm0, %%mm1 \n\t"
2453 "movq %%mm2, %%mm3 \n\t"
2454 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2455 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2456 "punpcklbw %%mm4, %%mm0 \n\t"
2457 "punpckhbw %%mm4, %%mm1 \n\t"
2458 "punpcklbw %%mm5, %%mm2 \n\t"
2459 "punpckhbw %%mm5, %%mm3 \n\t"
2460 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2461 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2462 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2463 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2464 "add $16, %%"REG_a" \n\t"
2465 "cmp %3, %%"REG_a" \n\t"
2467 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2468 : "memory", "%"REG_a
/* tail of the SIMD path: remaining width%16 bytes */
2471 for(w= (width&(~15)); w < width; w++)
2473 dest[2*w+0] = src1[w];
2474 dest[2*w+1] = src2[w];
/* pure C path: whole row */
2477 for(w=0; w < width; w++)
2479 dest[2*w+0] = src1[w];
2480 dest[2*w+1] = src2[w];
/*
 * Upscales the two 4x-subsampled chroma planes of YVU9 to the 2x-subsampled
 * planes of YV12 by simple pixel duplication: every source byte becomes a
 * 2x2 block in the destination (each source line is read for two output
 * lines via y>>1, each byte is doubled horizontally via punpckl/hbw with
 * itself). w = width/2, h = height/2 chroma dimensions.
 */
2496 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2497 uint8_t *dst1, uint8_t *dst2,
2498 long width, long height,
2499 long srcStride1, long srcStride2,
2500 long dstStride1, long dstStride2)
2503 w=width/2; h=height/2;
2508 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* first chroma plane: duplicate each of 32 source bytes into 64 output bytes */
2511 const uint8_t* s1=src1+srcStride1*(y>>1);
2512 uint8_t* d=dst1+dstStride1*y;
2519 "movq %1, %%mm0\n\t"
2520 "movq 8%1, %%mm2\n\t"
2521 "movq 16%1, %%mm4\n\t"
2522 "movq 24%1, %%mm6\n\t"
2523 "movq %%mm0, %%mm1\n\t"
2524 "movq %%mm2, %%mm3\n\t"
2525 "movq %%mm4, %%mm5\n\t"
2526 "movq %%mm6, %%mm7\n\t"
/* unpacking a register with itself doubles each byte: abcd -> aabb ccdd */
2527 "punpcklbw %%mm0, %%mm0\n\t"
2528 "punpckhbw %%mm1, %%mm1\n\t"
2529 "punpcklbw %%mm2, %%mm2\n\t"
2530 "punpckhbw %%mm3, %%mm3\n\t"
2531 "punpcklbw %%mm4, %%mm4\n\t"
2532 "punpckhbw %%mm5, %%mm5\n\t"
2533 "punpcklbw %%mm6, %%mm6\n\t"
2534 "punpckhbw %%mm7, %%mm7\n\t"
2535 MOVNTQ" %%mm0, %0\n\t"
2536 MOVNTQ" %%mm1, 8%0\n\t"
2537 MOVNTQ" %%mm2, 16%0\n\t"
2538 MOVNTQ" %%mm3, 24%0\n\t"
2539 MOVNTQ" %%mm4, 32%0\n\t"
2540 MOVNTQ" %%mm5, 40%0\n\t"
2541 MOVNTQ" %%mm6, 48%0\n\t"
2542 MOVNTQ" %%mm7, 56%0"
/* C tail for the last width%32 bytes */
2548 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* second chroma plane: identical processing */
2551 const uint8_t* s2=src2+srcStride2*(y>>1);
2552 uint8_t* d=dst2+dstStride2*y;
2559 "movq %1, %%mm0\n\t"
2560 "movq 8%1, %%mm2\n\t"
2561 "movq 16%1, %%mm4\n\t"
2562 "movq 24%1, %%mm6\n\t"
2563 "movq %%mm0, %%mm1\n\t"
2564 "movq %%mm2, %%mm3\n\t"
2565 "movq %%mm4, %%mm5\n\t"
2566 "movq %%mm6, %%mm7\n\t"
2567 "punpcklbw %%mm0, %%mm0\n\t"
2568 "punpckhbw %%mm1, %%mm1\n\t"
2569 "punpcklbw %%mm2, %%mm2\n\t"
2570 "punpckhbw %%mm3, %%mm3\n\t"
2571 "punpcklbw %%mm4, %%mm4\n\t"
2572 "punpckhbw %%mm5, %%mm5\n\t"
2573 "punpcklbw %%mm6, %%mm6\n\t"
2574 "punpckhbw %%mm7, %%mm7\n\t"
2575 MOVNTQ" %%mm0, %0\n\t"
2576 MOVNTQ" %%mm1, 8%0\n\t"
2577 MOVNTQ" %%mm2, 16%0\n\t"
2578 MOVNTQ" %%mm3, 24%0\n\t"
2579 MOVNTQ" %%mm4, 32%0\n\t"
2580 MOVNTQ" %%mm5, 40%0\n\t"
2581 MOVNTQ" %%mm6, 48%0\n\t"
2582 MOVNTQ" %%mm7, 56%0"
2588 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2599 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2601 long width, long height,
2602 long srcStride1, long srcStride2,
2603 long srcStride3, long dstStride)
2606 w=width/2; h=height;
2608 const uint8_t* yp=src1+srcStride1*y;
2609 const uint8_t* up=src2+srcStride2*(y>>2);
2610 const uint8_t* vp=src3+srcStride3*(y>>2);
2611 uint8_t* d=dst+dstStride*y;
2617 PREFETCH" 32(%1, %0)\n\t"
2618 PREFETCH" 32(%2, %0)\n\t"
2619 PREFETCH" 32(%3, %0)\n\t"
2620 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2621 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2622 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2623 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2624 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2625 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2626 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2627 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2628 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2629 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2631 "movq %%mm1, %%mm6\n\t"
2632 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2633 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2634 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2635 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2636 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2638 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2639 "movq 8(%1, %0, 4), %%mm0\n\t"
2640 "movq %%mm0, %%mm3\n\t"
2641 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2642 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2643 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2644 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2646 "movq %%mm4, %%mm6\n\t"
2647 "movq 16(%1, %0, 4), %%mm0\n\t"
2648 "movq %%mm0, %%mm3\n\t"
2649 "punpcklbw %%mm5, %%mm4\n\t"
2650 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2651 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2652 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2653 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2655 "punpckhbw %%mm5, %%mm6\n\t"
2656 "movq 24(%1, %0, 4), %%mm0\n\t"
2657 "movq %%mm0, %%mm3\n\t"
2658 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2659 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2660 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2661 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2664 : "r"(yp), "r" (up), "r"(vp), "r"(d)
2670 const long x2= x<<2;