3 * rgb2rgb.c, Software RGB to RGB convertor
4 * pluralize by Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor
7 * Written by Nick Kurshev.
8 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
9 * lot of big-endian byteorder fixes by Alex Beregszaszi
11 * This file is part of FFmpeg.
13 * FFmpeg is free software; you can redistribute it and/or modify
14 * it under the terms of the GNU General Public License as published by
15 * the Free Software Foundation; either version 2 of the License, or
16 * (at your option) any later version.
18 * FFmpeg is distributed in the hope that it will be useful,
19 * but WITHOUT ANY WARRANTY; without even the implied warranty of
20 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
21 * GNU General Public License for more details.
23 * You should have received a copy of the GNU General Public License
24 * along with FFmpeg; if not, write to the Free Software
25 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
27 * the C code (not assembly, mmx, ...) of this file can be used
28 * under the LGPL license too
32 #include <inttypes.h> /* for __WORDSIZE */
35 // #warning You have a misconfigured system and will probably lose performance!
36 #define __WORDSIZE MP_WORDSIZE
54 #define PREFETCH "prefetch"
55 #define PREFETCHW "prefetchw"
56 #define PAVGB "pavgusb"
57 #elif defined ( HAVE_MMX2 )
58 #define PREFETCH "prefetchnta"
59 #define PREFETCHW "prefetcht0"
66 #define PREFETCH " # nop"
67 #define PREFETCHW " # nop"
72 /* On K6, femms is faster than emms. On K7, femms is directly mapped to emms. */
79 #define MOVNTQ "movntq"
80 #define SFENCE "sfence"
83 #define SFENCE " # nop"
86 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
89 const uint8_t *s = src;
92 const uint8_t *mm_end;
96 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
98 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
104 "punpckldq 3%1, %%mm0\n\t"
105 "movd 6%1, %%mm1\n\t"
106 "punpckldq 9%1, %%mm1\n\t"
107 "movd 12%1, %%mm2\n\t"
108 "punpckldq 15%1, %%mm2\n\t"
109 "movd 18%1, %%mm3\n\t"
110 "punpckldq 21%1, %%mm3\n\t"
111 "pand %%mm7, %%mm0\n\t"
112 "pand %%mm7, %%mm1\n\t"
113 "pand %%mm7, %%mm2\n\t"
114 "pand %%mm7, %%mm3\n\t"
115 MOVNTQ" %%mm0, %0\n\t"
116 MOVNTQ" %%mm1, 8%0\n\t"
117 MOVNTQ" %%mm2, 16%0\n\t"
125 __asm __volatile(SFENCE:::"memory");
126 __asm __volatile(EMMS:::"memory");
130 #ifdef WORDS_BIGENDIAN
131 /* RGB24 (= R,G,B) -> RGB32 (= A,B,G,R) */
146 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,long src_size)
149 const uint8_t *s = src;
152 const uint8_t *mm_end;
156 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
163 "movq 8%1, %%mm1\n\t"
164 "movq 16%1, %%mm4\n\t"
165 "movq 24%1, %%mm5\n\t"
166 "movq %%mm0, %%mm2\n\t"
167 "movq %%mm1, %%mm3\n\t"
168 "movq %%mm4, %%mm6\n\t"
169 "movq %%mm5, %%mm7\n\t"
170 "psrlq $8, %%mm2\n\t"
171 "psrlq $8, %%mm3\n\t"
172 "psrlq $8, %%mm6\n\t"
173 "psrlq $8, %%mm7\n\t"
182 "por %%mm2, %%mm0\n\t"
183 "por %%mm3, %%mm1\n\t"
184 "por %%mm6, %%mm4\n\t"
185 "por %%mm7, %%mm5\n\t"
187 "movq %%mm1, %%mm2\n\t"
188 "movq %%mm4, %%mm3\n\t"
189 "psllq $48, %%mm2\n\t"
190 "psllq $32, %%mm3\n\t"
193 "por %%mm2, %%mm0\n\t"
194 "psrlq $16, %%mm1\n\t"
195 "psrlq $32, %%mm4\n\t"
196 "psllq $16, %%mm5\n\t"
197 "por %%mm3, %%mm1\n\t"
199 "por %%mm5, %%mm4\n\t"
201 MOVNTQ" %%mm0, %0\n\t"
202 MOVNTQ" %%mm1, 8%0\n\t"
205 :"m"(*s),"m"(mask24l),
206 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
211 __asm __volatile(SFENCE:::"memory");
212 __asm __volatile(EMMS:::"memory");
216 #ifdef WORDS_BIGENDIAN
217 /* RGB32 (= A,B,G,R) -> RGB24 (= R,G,B) */
233 Original by Strepto/Astral
234 ported to gcc & bugfixed : A'rpi
235 MMX2, 3DNOW optimization by Nick Kurshev
236 32-bit C version and the and&add trick by Michael Niedermayer
238 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,long src_size)
240 register const uint8_t* s=src;
241 register uint8_t* d=dst;
242 register const uint8_t *end;
243 const uint8_t *mm_end;
246 __asm __volatile(PREFETCH" %0"::"m"(*s));
247 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
254 "movq 8%1, %%mm2\n\t"
255 "movq %%mm0, %%mm1\n\t"
256 "movq %%mm2, %%mm3\n\t"
257 "pand %%mm4, %%mm0\n\t"
258 "pand %%mm4, %%mm2\n\t"
259 "paddw %%mm1, %%mm0\n\t"
260 "paddw %%mm3, %%mm2\n\t"
261 MOVNTQ" %%mm0, %0\n\t"
269 __asm __volatile(SFENCE:::"memory");
270 __asm __volatile(EMMS:::"memory");
275 register unsigned x= *((uint32_t *)s);
276 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
282 register unsigned short x= *((uint16_t *)s);
283 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
287 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,long src_size)
289 register const uint8_t* s=src;
290 register uint8_t* d=dst;
291 register const uint8_t *end;
292 const uint8_t *mm_end;
295 __asm __volatile(PREFETCH" %0"::"m"(*s));
296 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
297 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
304 "movq 8%1, %%mm2\n\t"
305 "movq %%mm0, %%mm1\n\t"
306 "movq %%mm2, %%mm3\n\t"
307 "psrlq $1, %%mm0\n\t"
308 "psrlq $1, %%mm2\n\t"
309 "pand %%mm7, %%mm0\n\t"
310 "pand %%mm7, %%mm2\n\t"
311 "pand %%mm6, %%mm1\n\t"
312 "pand %%mm6, %%mm3\n\t"
313 "por %%mm1, %%mm0\n\t"
314 "por %%mm3, %%mm2\n\t"
315 MOVNTQ" %%mm0, %0\n\t"
323 __asm __volatile(SFENCE:::"memory");
324 __asm __volatile(EMMS:::"memory");
329 register uint32_t x= *((uint32_t *)s);
330 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
336 register uint16_t x= *((uint16_t *)s);
337 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
343 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, long src_size)
345 const uint8_t *s = src;
348 const uint8_t *mm_end;
350 uint16_t *d = (uint16_t *)dst;
354 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it is slightly faster)
356 "movq %3, %%mm5 \n\t"
357 "movq %4, %%mm6 \n\t"
358 "movq %5, %%mm7 \n\t"
362 PREFETCH" 32(%1) \n\t"
363 "movd (%1), %%mm0 \n\t"
364 "movd 4(%1), %%mm3 \n\t"
365 "punpckldq 8(%1), %%mm0 \n\t"
366 "punpckldq 12(%1), %%mm3 \n\t"
367 "movq %%mm0, %%mm1 \n\t"
368 "movq %%mm3, %%mm4 \n\t"
369 "pand %%mm6, %%mm0 \n\t"
370 "pand %%mm6, %%mm3 \n\t"
371 "pmaddwd %%mm7, %%mm0 \n\t"
372 "pmaddwd %%mm7, %%mm3 \n\t"
373 "pand %%mm5, %%mm1 \n\t"
374 "pand %%mm5, %%mm4 \n\t"
375 "por %%mm1, %%mm0 \n\t"
376 "por %%mm4, %%mm3 \n\t"
377 "psrld $5, %%mm0 \n\t"
378 "pslld $11, %%mm3 \n\t"
379 "por %%mm3, %%mm0 \n\t"
380 MOVNTQ" %%mm0, (%0) \n\t"
387 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
390 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
394 ::"m"(red_16mask),"m"(green_16mask));
400 "movd 4%1, %%mm3\n\t"
401 "punpckldq 8%1, %%mm0\n\t"
402 "punpckldq 12%1, %%mm3\n\t"
403 "movq %%mm0, %%mm1\n\t"
404 "movq %%mm0, %%mm2\n\t"
405 "movq %%mm3, %%mm4\n\t"
406 "movq %%mm3, %%mm5\n\t"
407 "psrlq $3, %%mm0\n\t"
408 "psrlq $3, %%mm3\n\t"
411 "psrlq $5, %%mm1\n\t"
412 "psrlq $5, %%mm4\n\t"
413 "pand %%mm6, %%mm1\n\t"
414 "pand %%mm6, %%mm4\n\t"
415 "psrlq $8, %%mm2\n\t"
416 "psrlq $8, %%mm5\n\t"
417 "pand %%mm7, %%mm2\n\t"
418 "pand %%mm7, %%mm5\n\t"
419 "por %%mm1, %%mm0\n\t"
420 "por %%mm4, %%mm3\n\t"
421 "por %%mm2, %%mm0\n\t"
422 "por %%mm5, %%mm3\n\t"
423 "psllq $16, %%mm3\n\t"
424 "por %%mm3, %%mm0\n\t"
425 MOVNTQ" %%mm0, %0\n\t"
426 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
431 __asm __volatile(SFENCE:::"memory");
432 __asm __volatile(EMMS:::"memory");
436 register int rgb = *(uint32_t*)s; s += 4;
437 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
441 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
443 const uint8_t *s = src;
446 const uint8_t *mm_end;
448 uint16_t *d = (uint16_t *)dst;
451 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
455 ::"m"(red_16mask),"m"(green_16mask));
462 "movd 4%1, %%mm3\n\t"
463 "punpckldq 8%1, %%mm0\n\t"
464 "punpckldq 12%1, %%mm3\n\t"
465 "movq %%mm0, %%mm1\n\t"
466 "movq %%mm0, %%mm2\n\t"
467 "movq %%mm3, %%mm4\n\t"
468 "movq %%mm3, %%mm5\n\t"
469 "psllq $8, %%mm0\n\t"
470 "psllq $8, %%mm3\n\t"
471 "pand %%mm7, %%mm0\n\t"
472 "pand %%mm7, %%mm3\n\t"
473 "psrlq $5, %%mm1\n\t"
474 "psrlq $5, %%mm4\n\t"
475 "pand %%mm6, %%mm1\n\t"
476 "pand %%mm6, %%mm4\n\t"
477 "psrlq $19, %%mm2\n\t"
478 "psrlq $19, %%mm5\n\t"
481 "por %%mm1, %%mm0\n\t"
482 "por %%mm4, %%mm3\n\t"
483 "por %%mm2, %%mm0\n\t"
484 "por %%mm5, %%mm3\n\t"
485 "psllq $16, %%mm3\n\t"
486 "por %%mm3, %%mm0\n\t"
487 MOVNTQ" %%mm0, %0\n\t"
488 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
492 __asm __volatile(SFENCE:::"memory");
493 __asm __volatile(EMMS:::"memory");
497 register int rgb = *(uint32_t*)s; s += 4;
498 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
502 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, long src_size)
504 const uint8_t *s = src;
507 const uint8_t *mm_end;
509 uint16_t *d = (uint16_t *)dst;
513 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster; on Athlon it is slightly faster)
515 "movq %3, %%mm5 \n\t"
516 "movq %4, %%mm6 \n\t"
517 "movq %5, %%mm7 \n\t"
521 PREFETCH" 32(%1) \n\t"
522 "movd (%1), %%mm0 \n\t"
523 "movd 4(%1), %%mm3 \n\t"
524 "punpckldq 8(%1), %%mm0 \n\t"
525 "punpckldq 12(%1), %%mm3 \n\t"
526 "movq %%mm0, %%mm1 \n\t"
527 "movq %%mm3, %%mm4 \n\t"
528 "pand %%mm6, %%mm0 \n\t"
529 "pand %%mm6, %%mm3 \n\t"
530 "pmaddwd %%mm7, %%mm0 \n\t"
531 "pmaddwd %%mm7, %%mm3 \n\t"
532 "pand %%mm5, %%mm1 \n\t"
533 "pand %%mm5, %%mm4 \n\t"
534 "por %%mm1, %%mm0 \n\t"
535 "por %%mm4, %%mm3 \n\t"
536 "psrld $6, %%mm0 \n\t"
537 "pslld $10, %%mm3 \n\t"
538 "por %%mm3, %%mm0 \n\t"
539 MOVNTQ" %%mm0, (%0) \n\t"
546 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
549 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
553 ::"m"(red_15mask),"m"(green_15mask));
559 "movd 4%1, %%mm3\n\t"
560 "punpckldq 8%1, %%mm0\n\t"
561 "punpckldq 12%1, %%mm3\n\t"
562 "movq %%mm0, %%mm1\n\t"
563 "movq %%mm0, %%mm2\n\t"
564 "movq %%mm3, %%mm4\n\t"
565 "movq %%mm3, %%mm5\n\t"
566 "psrlq $3, %%mm0\n\t"
567 "psrlq $3, %%mm3\n\t"
570 "psrlq $6, %%mm1\n\t"
571 "psrlq $6, %%mm4\n\t"
572 "pand %%mm6, %%mm1\n\t"
573 "pand %%mm6, %%mm4\n\t"
574 "psrlq $9, %%mm2\n\t"
575 "psrlq $9, %%mm5\n\t"
576 "pand %%mm7, %%mm2\n\t"
577 "pand %%mm7, %%mm5\n\t"
578 "por %%mm1, %%mm0\n\t"
579 "por %%mm4, %%mm3\n\t"
580 "por %%mm2, %%mm0\n\t"
581 "por %%mm5, %%mm3\n\t"
582 "psllq $16, %%mm3\n\t"
583 "por %%mm3, %%mm0\n\t"
584 MOVNTQ" %%mm0, %0\n\t"
585 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
590 __asm __volatile(SFENCE:::"memory");
591 __asm __volatile(EMMS:::"memory");
595 register int rgb = *(uint32_t*)s; s += 4;
596 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
600 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
602 const uint8_t *s = src;
605 const uint8_t *mm_end;
607 uint16_t *d = (uint16_t *)dst;
610 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
614 ::"m"(red_15mask),"m"(green_15mask));
621 "movd 4%1, %%mm3\n\t"
622 "punpckldq 8%1, %%mm0\n\t"
623 "punpckldq 12%1, %%mm3\n\t"
624 "movq %%mm0, %%mm1\n\t"
625 "movq %%mm0, %%mm2\n\t"
626 "movq %%mm3, %%mm4\n\t"
627 "movq %%mm3, %%mm5\n\t"
628 "psllq $7, %%mm0\n\t"
629 "psllq $7, %%mm3\n\t"
630 "pand %%mm7, %%mm0\n\t"
631 "pand %%mm7, %%mm3\n\t"
632 "psrlq $6, %%mm1\n\t"
633 "psrlq $6, %%mm4\n\t"
634 "pand %%mm6, %%mm1\n\t"
635 "pand %%mm6, %%mm4\n\t"
636 "psrlq $19, %%mm2\n\t"
637 "psrlq $19, %%mm5\n\t"
640 "por %%mm1, %%mm0\n\t"
641 "por %%mm4, %%mm3\n\t"
642 "por %%mm2, %%mm0\n\t"
643 "por %%mm5, %%mm3\n\t"
644 "psllq $16, %%mm3\n\t"
645 "por %%mm3, %%mm0\n\t"
646 MOVNTQ" %%mm0, %0\n\t"
647 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
651 __asm __volatile(SFENCE:::"memory");
652 __asm __volatile(EMMS:::"memory");
656 register int rgb = *(uint32_t*)s; s += 4;
657 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
661 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, long src_size)
663 const uint8_t *s = src;
666 const uint8_t *mm_end;
668 uint16_t *d = (uint16_t *)dst;
671 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
675 ::"m"(red_16mask),"m"(green_16mask));
682 "movd 3%1, %%mm3\n\t"
683 "punpckldq 6%1, %%mm0\n\t"
684 "punpckldq 9%1, %%mm3\n\t"
685 "movq %%mm0, %%mm1\n\t"
686 "movq %%mm0, %%mm2\n\t"
687 "movq %%mm3, %%mm4\n\t"
688 "movq %%mm3, %%mm5\n\t"
689 "psrlq $3, %%mm0\n\t"
690 "psrlq $3, %%mm3\n\t"
693 "psrlq $5, %%mm1\n\t"
694 "psrlq $5, %%mm4\n\t"
695 "pand %%mm6, %%mm1\n\t"
696 "pand %%mm6, %%mm4\n\t"
697 "psrlq $8, %%mm2\n\t"
698 "psrlq $8, %%mm5\n\t"
699 "pand %%mm7, %%mm2\n\t"
700 "pand %%mm7, %%mm5\n\t"
701 "por %%mm1, %%mm0\n\t"
702 "por %%mm4, %%mm3\n\t"
703 "por %%mm2, %%mm0\n\t"
704 "por %%mm5, %%mm3\n\t"
705 "psllq $16, %%mm3\n\t"
706 "por %%mm3, %%mm0\n\t"
707 MOVNTQ" %%mm0, %0\n\t"
708 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
712 __asm __volatile(SFENCE:::"memory");
713 __asm __volatile(EMMS:::"memory");
720 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
724 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, long src_size)
726 const uint8_t *s = src;
729 const uint8_t *mm_end;
731 uint16_t *d = (uint16_t *)dst;
734 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
738 ::"m"(red_16mask),"m"(green_16mask));
745 "movd 3%1, %%mm3\n\t"
746 "punpckldq 6%1, %%mm0\n\t"
747 "punpckldq 9%1, %%mm3\n\t"
748 "movq %%mm0, %%mm1\n\t"
749 "movq %%mm0, %%mm2\n\t"
750 "movq %%mm3, %%mm4\n\t"
751 "movq %%mm3, %%mm5\n\t"
752 "psllq $8, %%mm0\n\t"
753 "psllq $8, %%mm3\n\t"
754 "pand %%mm7, %%mm0\n\t"
755 "pand %%mm7, %%mm3\n\t"
756 "psrlq $5, %%mm1\n\t"
757 "psrlq $5, %%mm4\n\t"
758 "pand %%mm6, %%mm1\n\t"
759 "pand %%mm6, %%mm4\n\t"
760 "psrlq $19, %%mm2\n\t"
761 "psrlq $19, %%mm5\n\t"
764 "por %%mm1, %%mm0\n\t"
765 "por %%mm4, %%mm3\n\t"
766 "por %%mm2, %%mm0\n\t"
767 "por %%mm5, %%mm3\n\t"
768 "psllq $16, %%mm3\n\t"
769 "por %%mm3, %%mm0\n\t"
770 MOVNTQ" %%mm0, %0\n\t"
771 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
775 __asm __volatile(SFENCE:::"memory");
776 __asm __volatile(EMMS:::"memory");
783 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
787 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, long src_size)
789 const uint8_t *s = src;
792 const uint8_t *mm_end;
794 uint16_t *d = (uint16_t *)dst;
797 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
801 ::"m"(red_15mask),"m"(green_15mask));
808 "movd 3%1, %%mm3\n\t"
809 "punpckldq 6%1, %%mm0\n\t"
810 "punpckldq 9%1, %%mm3\n\t"
811 "movq %%mm0, %%mm1\n\t"
812 "movq %%mm0, %%mm2\n\t"
813 "movq %%mm3, %%mm4\n\t"
814 "movq %%mm3, %%mm5\n\t"
815 "psrlq $3, %%mm0\n\t"
816 "psrlq $3, %%mm3\n\t"
819 "psrlq $6, %%mm1\n\t"
820 "psrlq $6, %%mm4\n\t"
821 "pand %%mm6, %%mm1\n\t"
822 "pand %%mm6, %%mm4\n\t"
823 "psrlq $9, %%mm2\n\t"
824 "psrlq $9, %%mm5\n\t"
825 "pand %%mm7, %%mm2\n\t"
826 "pand %%mm7, %%mm5\n\t"
827 "por %%mm1, %%mm0\n\t"
828 "por %%mm4, %%mm3\n\t"
829 "por %%mm2, %%mm0\n\t"
830 "por %%mm5, %%mm3\n\t"
831 "psllq $16, %%mm3\n\t"
832 "por %%mm3, %%mm0\n\t"
833 MOVNTQ" %%mm0, %0\n\t"
834 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
838 __asm __volatile(SFENCE:::"memory");
839 __asm __volatile(EMMS:::"memory");
846 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
850 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, long src_size)
852 const uint8_t *s = src;
855 const uint8_t *mm_end;
857 uint16_t *d = (uint16_t *)dst;
860 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
864 ::"m"(red_15mask),"m"(green_15mask));
871 "movd 3%1, %%mm3\n\t"
872 "punpckldq 6%1, %%mm0\n\t"
873 "punpckldq 9%1, %%mm3\n\t"
874 "movq %%mm0, %%mm1\n\t"
875 "movq %%mm0, %%mm2\n\t"
876 "movq %%mm3, %%mm4\n\t"
877 "movq %%mm3, %%mm5\n\t"
878 "psllq $7, %%mm0\n\t"
879 "psllq $7, %%mm3\n\t"
880 "pand %%mm7, %%mm0\n\t"
881 "pand %%mm7, %%mm3\n\t"
882 "psrlq $6, %%mm1\n\t"
883 "psrlq $6, %%mm4\n\t"
884 "pand %%mm6, %%mm1\n\t"
885 "pand %%mm6, %%mm4\n\t"
886 "psrlq $19, %%mm2\n\t"
887 "psrlq $19, %%mm5\n\t"
890 "por %%mm1, %%mm0\n\t"
891 "por %%mm4, %%mm3\n\t"
892 "por %%mm2, %%mm0\n\t"
893 "por %%mm5, %%mm3\n\t"
894 "psllq $16, %%mm3\n\t"
895 "por %%mm3, %%mm0\n\t"
896 MOVNTQ" %%mm0, %0\n\t"
897 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
901 __asm __volatile(SFENCE:::"memory");
902 __asm __volatile(EMMS:::"memory");
909 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
914 A less accurate approximation is used here: simply
915 left-shifting the input
916 value and filling the low-order bits with
917 zeroes. This method improves PNG
918 compression, but it cannot reproduce white exactly, since it does not
919 generate an all-ones maximum value; the net effect is to darken the
922 A better method would be "left bit replication":
932 | Leftmost Bits Repeated to Fill Open Bits
936 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, long src_size)
940 const uint16_t *mm_end;
942 uint8_t *d = (uint8_t *)dst;
943 const uint16_t *s = (uint16_t *)src;
944 end = s + src_size/2;
946 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
958 "psllq $3, %%mm0\n\t"
959 "psrlq $2, %%mm1\n\t"
960 "psrlq $7, %%mm2\n\t"
961 "movq %%mm0, %%mm3\n\t"
962 "movq %%mm1, %%mm4\n\t"
963 "movq %%mm2, %%mm5\n\t"
964 "punpcklwd %5, %%mm0\n\t"
965 "punpcklwd %5, %%mm1\n\t"
966 "punpcklwd %5, %%mm2\n\t"
967 "punpckhwd %5, %%mm3\n\t"
968 "punpckhwd %5, %%mm4\n\t"
969 "punpckhwd %5, %%mm5\n\t"
970 "psllq $8, %%mm1\n\t"
971 "psllq $16, %%mm2\n\t"
972 "por %%mm1, %%mm0\n\t"
973 "por %%mm2, %%mm0\n\t"
974 "psllq $8, %%mm4\n\t"
975 "psllq $16, %%mm5\n\t"
976 "por %%mm4, %%mm3\n\t"
977 "por %%mm5, %%mm3\n\t"
979 "movq %%mm0, %%mm6\n\t"
980 "movq %%mm3, %%mm7\n\t"
982 "movq 8%1, %%mm0\n\t"
983 "movq 8%1, %%mm1\n\t"
984 "movq 8%1, %%mm2\n\t"
988 "psllq $3, %%mm0\n\t"
989 "psrlq $2, %%mm1\n\t"
990 "psrlq $7, %%mm2\n\t"
991 "movq %%mm0, %%mm3\n\t"
992 "movq %%mm1, %%mm4\n\t"
993 "movq %%mm2, %%mm5\n\t"
994 "punpcklwd %5, %%mm0\n\t"
995 "punpcklwd %5, %%mm1\n\t"
996 "punpcklwd %5, %%mm2\n\t"
997 "punpckhwd %5, %%mm3\n\t"
998 "punpckhwd %5, %%mm4\n\t"
999 "punpckhwd %5, %%mm5\n\t"
1000 "psllq $8, %%mm1\n\t"
1001 "psllq $16, %%mm2\n\t"
1002 "por %%mm1, %%mm0\n\t"
1003 "por %%mm2, %%mm0\n\t"
1004 "psllq $8, %%mm4\n\t"
1005 "psllq $16, %%mm5\n\t"
1006 "por %%mm4, %%mm3\n\t"
1007 "por %%mm5, %%mm3\n\t"
1010 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
1012 /* Borrowed 32 to 24 */
1014 "movq %%mm0, %%mm4\n\t"
1015 "movq %%mm3, %%mm5\n\t"
1016 "movq %%mm6, %%mm0\n\t"
1017 "movq %%mm7, %%mm1\n\t"
1019 "movq %%mm4, %%mm6\n\t"
1020 "movq %%mm5, %%mm7\n\t"
1021 "movq %%mm0, %%mm2\n\t"
1022 "movq %%mm1, %%mm3\n\t"
1024 "psrlq $8, %%mm2\n\t"
1025 "psrlq $8, %%mm3\n\t"
1026 "psrlq $8, %%mm6\n\t"
1027 "psrlq $8, %%mm7\n\t"
1028 "pand %2, %%mm0\n\t"
1029 "pand %2, %%mm1\n\t"
1030 "pand %2, %%mm4\n\t"
1031 "pand %2, %%mm5\n\t"
1032 "pand %3, %%mm2\n\t"
1033 "pand %3, %%mm3\n\t"
1034 "pand %3, %%mm6\n\t"
1035 "pand %3, %%mm7\n\t"
1036 "por %%mm2, %%mm0\n\t"
1037 "por %%mm3, %%mm1\n\t"
1038 "por %%mm6, %%mm4\n\t"
1039 "por %%mm7, %%mm5\n\t"
1041 "movq %%mm1, %%mm2\n\t"
1042 "movq %%mm4, %%mm3\n\t"
1043 "psllq $48, %%mm2\n\t"
1044 "psllq $32, %%mm3\n\t"
1045 "pand %4, %%mm2\n\t"
1046 "pand %5, %%mm3\n\t"
1047 "por %%mm2, %%mm0\n\t"
1048 "psrlq $16, %%mm1\n\t"
1049 "psrlq $32, %%mm4\n\t"
1050 "psllq $16, %%mm5\n\t"
1051 "por %%mm3, %%mm1\n\t"
1052 "pand %6, %%mm5\n\t"
1053 "por %%mm5, %%mm4\n\t"
1055 MOVNTQ" %%mm0, %0\n\t"
1056 MOVNTQ" %%mm1, 8%0\n\t"
1057 MOVNTQ" %%mm4, 16%0"
1060 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1065 __asm __volatile(SFENCE:::"memory");
1066 __asm __volatile(EMMS:::"memory");
1070 register uint16_t bgr;
1072 *d++ = (bgr&0x1F)<<3;
1073 *d++ = (bgr&0x3E0)>>2;
1074 *d++ = (bgr&0x7C00)>>7;
1078 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, long src_size)
1080 const uint16_t *end;
1082 const uint16_t *mm_end;
1084 uint8_t *d = (uint8_t *)dst;
1085 const uint16_t *s = (const uint16_t *)src;
1086 end = s + src_size/2;
1088 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1094 "movq %1, %%mm0\n\t"
1095 "movq %1, %%mm1\n\t"
1096 "movq %1, %%mm2\n\t"
1097 "pand %2, %%mm0\n\t"
1098 "pand %3, %%mm1\n\t"
1099 "pand %4, %%mm2\n\t"
1100 "psllq $3, %%mm0\n\t"
1101 "psrlq $3, %%mm1\n\t"
1102 "psrlq $8, %%mm2\n\t"
1103 "movq %%mm0, %%mm3\n\t"
1104 "movq %%mm1, %%mm4\n\t"
1105 "movq %%mm2, %%mm5\n\t"
1106 "punpcklwd %5, %%mm0\n\t"
1107 "punpcklwd %5, %%mm1\n\t"
1108 "punpcklwd %5, %%mm2\n\t"
1109 "punpckhwd %5, %%mm3\n\t"
1110 "punpckhwd %5, %%mm4\n\t"
1111 "punpckhwd %5, %%mm5\n\t"
1112 "psllq $8, %%mm1\n\t"
1113 "psllq $16, %%mm2\n\t"
1114 "por %%mm1, %%mm0\n\t"
1115 "por %%mm2, %%mm0\n\t"
1116 "psllq $8, %%mm4\n\t"
1117 "psllq $16, %%mm5\n\t"
1118 "por %%mm4, %%mm3\n\t"
1119 "por %%mm5, %%mm3\n\t"
1121 "movq %%mm0, %%mm6\n\t"
1122 "movq %%mm3, %%mm7\n\t"
1124 "movq 8%1, %%mm0\n\t"
1125 "movq 8%1, %%mm1\n\t"
1126 "movq 8%1, %%mm2\n\t"
1127 "pand %2, %%mm0\n\t"
1128 "pand %3, %%mm1\n\t"
1129 "pand %4, %%mm2\n\t"
1130 "psllq $3, %%mm0\n\t"
1131 "psrlq $3, %%mm1\n\t"
1132 "psrlq $8, %%mm2\n\t"
1133 "movq %%mm0, %%mm3\n\t"
1134 "movq %%mm1, %%mm4\n\t"
1135 "movq %%mm2, %%mm5\n\t"
1136 "punpcklwd %5, %%mm0\n\t"
1137 "punpcklwd %5, %%mm1\n\t"
1138 "punpcklwd %5, %%mm2\n\t"
1139 "punpckhwd %5, %%mm3\n\t"
1140 "punpckhwd %5, %%mm4\n\t"
1141 "punpckhwd %5, %%mm5\n\t"
1142 "psllq $8, %%mm1\n\t"
1143 "psllq $16, %%mm2\n\t"
1144 "por %%mm1, %%mm0\n\t"
1145 "por %%mm2, %%mm0\n\t"
1146 "psllq $8, %%mm4\n\t"
1147 "psllq $16, %%mm5\n\t"
1148 "por %%mm4, %%mm3\n\t"
1149 "por %%mm5, %%mm3\n\t"
1151 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1153 /* Borrowed 32 to 24 */
1155 "movq %%mm0, %%mm4\n\t"
1156 "movq %%mm3, %%mm5\n\t"
1157 "movq %%mm6, %%mm0\n\t"
1158 "movq %%mm7, %%mm1\n\t"
1160 "movq %%mm4, %%mm6\n\t"
1161 "movq %%mm5, %%mm7\n\t"
1162 "movq %%mm0, %%mm2\n\t"
1163 "movq %%mm1, %%mm3\n\t"
1165 "psrlq $8, %%mm2\n\t"
1166 "psrlq $8, %%mm3\n\t"
1167 "psrlq $8, %%mm6\n\t"
1168 "psrlq $8, %%mm7\n\t"
1169 "pand %2, %%mm0\n\t"
1170 "pand %2, %%mm1\n\t"
1171 "pand %2, %%mm4\n\t"
1172 "pand %2, %%mm5\n\t"
1173 "pand %3, %%mm2\n\t"
1174 "pand %3, %%mm3\n\t"
1175 "pand %3, %%mm6\n\t"
1176 "pand %3, %%mm7\n\t"
1177 "por %%mm2, %%mm0\n\t"
1178 "por %%mm3, %%mm1\n\t"
1179 "por %%mm6, %%mm4\n\t"
1180 "por %%mm7, %%mm5\n\t"
1182 "movq %%mm1, %%mm2\n\t"
1183 "movq %%mm4, %%mm3\n\t"
1184 "psllq $48, %%mm2\n\t"
1185 "psllq $32, %%mm3\n\t"
1186 "pand %4, %%mm2\n\t"
1187 "pand %5, %%mm3\n\t"
1188 "por %%mm2, %%mm0\n\t"
1189 "psrlq $16, %%mm1\n\t"
1190 "psrlq $32, %%mm4\n\t"
1191 "psllq $16, %%mm5\n\t"
1192 "por %%mm3, %%mm1\n\t"
1193 "pand %6, %%mm5\n\t"
1194 "por %%mm5, %%mm4\n\t"
1196 MOVNTQ" %%mm0, %0\n\t"
1197 MOVNTQ" %%mm1, 8%0\n\t"
1198 MOVNTQ" %%mm4, 16%0"
1201 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1206 __asm __volatile(SFENCE:::"memory");
1207 __asm __volatile(EMMS:::"memory");
1211 register uint16_t bgr;
1213 *d++ = (bgr&0x1F)<<3;
1214 *d++ = (bgr&0x7E0)>>3;
1215 *d++ = (bgr&0xF800)>>8;
1219 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, long src_size)
1221 const uint16_t *end;
1223 const uint16_t *mm_end;
1225 uint8_t *d = (uint8_t *)dst;
1226 const uint16_t *s = (const uint16_t *)src;
1227 end = s + src_size/2;
1229 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1230 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1236 "movq %1, %%mm0\n\t"
1237 "movq %1, %%mm1\n\t"
1238 "movq %1, %%mm2\n\t"
1239 "pand %2, %%mm0\n\t"
1240 "pand %3, %%mm1\n\t"
1241 "pand %4, %%mm2\n\t"
1242 "psllq $3, %%mm0\n\t"
1243 "psrlq $2, %%mm1\n\t"
1244 "psrlq $7, %%mm2\n\t"
1245 "movq %%mm0, %%mm3\n\t"
1246 "movq %%mm1, %%mm4\n\t"
1247 "movq %%mm2, %%mm5\n\t"
1248 "punpcklwd %%mm7, %%mm0\n\t"
1249 "punpcklwd %%mm7, %%mm1\n\t"
1250 "punpcklwd %%mm7, %%mm2\n\t"
1251 "punpckhwd %%mm7, %%mm3\n\t"
1252 "punpckhwd %%mm7, %%mm4\n\t"
1253 "punpckhwd %%mm7, %%mm5\n\t"
1254 "psllq $8, %%mm1\n\t"
1255 "psllq $16, %%mm2\n\t"
1256 "por %%mm1, %%mm0\n\t"
1257 "por %%mm2, %%mm0\n\t"
1258 "psllq $8, %%mm4\n\t"
1259 "psllq $16, %%mm5\n\t"
1260 "por %%mm4, %%mm3\n\t"
1261 "por %%mm5, %%mm3\n\t"
1262 MOVNTQ" %%mm0, %0\n\t"
1263 MOVNTQ" %%mm3, 8%0\n\t"
1265 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1270 __asm __volatile(SFENCE:::"memory");
1271 __asm __volatile(EMMS:::"memory");
1275 #if 0 //slightly slower on athlon
1277 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1279 register uint16_t bgr;
1281 #ifdef WORDS_BIGENDIAN
1283 *d++ = (bgr&0x7C00)>>7;
1284 *d++ = (bgr&0x3E0)>>2;
1285 *d++ = (bgr&0x1F)<<3;
1287 *d++ = (bgr&0x1F)<<3;
1288 *d++ = (bgr&0x3E0)>>2;
1289 *d++ = (bgr&0x7C00)>>7;
1297 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, long src_size)
1299 const uint16_t *end;
1301 const uint16_t *mm_end;
1303 uint8_t *d = (uint8_t *)dst;
1304 const uint16_t *s = (uint16_t *)src;
1305 end = s + src_size/2;
1307 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1308 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1314 "movq %1, %%mm0\n\t"
1315 "movq %1, %%mm1\n\t"
1316 "movq %1, %%mm2\n\t"
1317 "pand %2, %%mm0\n\t"
1318 "pand %3, %%mm1\n\t"
1319 "pand %4, %%mm2\n\t"
1320 "psllq $3, %%mm0\n\t"
1321 "psrlq $3, %%mm1\n\t"
1322 "psrlq $8, %%mm2\n\t"
1323 "movq %%mm0, %%mm3\n\t"
1324 "movq %%mm1, %%mm4\n\t"
1325 "movq %%mm2, %%mm5\n\t"
1326 "punpcklwd %%mm7, %%mm0\n\t"
1327 "punpcklwd %%mm7, %%mm1\n\t"
1328 "punpcklwd %%mm7, %%mm2\n\t"
1329 "punpckhwd %%mm7, %%mm3\n\t"
1330 "punpckhwd %%mm7, %%mm4\n\t"
1331 "punpckhwd %%mm7, %%mm5\n\t"
1332 "psllq $8, %%mm1\n\t"
1333 "psllq $16, %%mm2\n\t"
1334 "por %%mm1, %%mm0\n\t"
1335 "por %%mm2, %%mm0\n\t"
1336 "psllq $8, %%mm4\n\t"
1337 "psllq $16, %%mm5\n\t"
1338 "por %%mm4, %%mm3\n\t"
1339 "por %%mm5, %%mm3\n\t"
1340 MOVNTQ" %%mm0, %0\n\t"
1341 MOVNTQ" %%mm3, 8%0\n\t"
1343 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1348 __asm __volatile(SFENCE:::"memory");
1349 __asm __volatile(EMMS:::"memory");
1353 register uint16_t bgr;
1355 #ifdef WORDS_BIGENDIAN
1357 *d++ = (bgr&0xF800)>>8;
1358 *d++ = (bgr&0x7E0)>>3;
1359 *d++ = (bgr&0x1F)<<3;
1361 *d++ = (bgr&0x1F)<<3;
1362 *d++ = (bgr&0x7E0)>>3;
1363 *d++ = (bgr&0xF800)>>8;
1369 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, long src_size)
1371 long idx = 15 - src_size;
1372 uint8_t *s = (uint8_t *) src-idx, *d = dst-idx;
1377 " "PREFETCH" (%1, %0) \n"
1378 " movq %3, %%mm7 \n"
1379 " pxor %4, %%mm7 \n"
1380 " movq %%mm7, %%mm6 \n"
1381 " pxor %5, %%mm7 \n"
1384 " "PREFETCH" 32(%1, %0) \n"
1385 " movq (%1, %0), %%mm0 \n"
1386 " movq 8(%1, %0), %%mm1 \n"
1388 " pshufw $177, %%mm0, %%mm3 \n"
1389 " pshufw $177, %%mm1, %%mm5 \n"
1390 " pand %%mm7, %%mm0 \n"
1391 " pand %%mm6, %%mm3 \n"
1392 " pand %%mm7, %%mm1 \n"
1393 " pand %%mm6, %%mm5 \n"
1394 " por %%mm3, %%mm0 \n"
1395 " por %%mm5, %%mm1 \n"
1397 " movq %%mm0, %%mm2 \n"
1398 " movq %%mm1, %%mm4 \n"
1399 " pand %%mm7, %%mm0 \n"
1400 " pand %%mm6, %%mm2 \n"
1401 " pand %%mm7, %%mm1 \n"
1402 " pand %%mm6, %%mm4 \n"
1403 " movq %%mm2, %%mm3 \n"
1404 " movq %%mm4, %%mm5 \n"
1405 " pslld $16, %%mm2 \n"
1406 " psrld $16, %%mm3 \n"
1407 " pslld $16, %%mm4 \n"
1408 " psrld $16, %%mm5 \n"
1409 " por %%mm2, %%mm0 \n"
1410 " por %%mm4, %%mm1 \n"
1411 " por %%mm3, %%mm0 \n"
1412 " por %%mm5, %%mm1 \n"
1414 " "MOVNTQ" %%mm0, (%2, %0) \n"
1415 " "MOVNTQ" %%mm1, 8(%2, %0) \n"
1422 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
1425 for (; idx<15; idx+=4) {
1426 register int v = *(uint32_t *)&s[idx], g = v & 0xff00;
1428 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
1432 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, long src_size)
1436 long mmx_size= 23 - src_size;
1438 "test %%"REG_a", %%"REG_a" \n\t"
1440 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1441 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1442 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1445 PREFETCH" 32(%1, %%"REG_a") \n\t"
1446 "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1447 "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
1448 "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
1449 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1450 "pand %%mm5, %%mm0 \n\t"
1451 "pand %%mm6, %%mm1 \n\t"
1452 "pand %%mm7, %%mm2 \n\t"
1453 "por %%mm0, %%mm1 \n\t"
1454 "por %%mm2, %%mm1 \n\t"
1455 "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
1456 MOVNTQ" %%mm1, (%2, %%"REG_a")\n\t" // RGB RGB RG
1457 "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
1458 "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
1459 "pand %%mm7, %%mm0 \n\t"
1460 "pand %%mm5, %%mm1 \n\t"
1461 "pand %%mm6, %%mm2 \n\t"
1462 "por %%mm0, %%mm1 \n\t"
1463 "por %%mm2, %%mm1 \n\t"
1464 "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
1465 MOVNTQ" %%mm1, 8(%2, %%"REG_a")\n\t" // B RGB RGB R
1466 "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
1467 "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
1468 "pand %%mm6, %%mm0 \n\t"
1469 "pand %%mm7, %%mm1 \n\t"
1470 "pand %%mm5, %%mm2 \n\t"
1471 "por %%mm0, %%mm1 \n\t"
1472 "por %%mm2, %%mm1 \n\t"
1473 MOVNTQ" %%mm1, 16(%2, %%"REG_a")\n\t"
1474 "add $24, %%"REG_a" \n\t"
1478 : "r" (src-mmx_size), "r"(dst-mmx_size)
1481 __asm __volatile(SFENCE:::"memory");
1482 __asm __volatile(EMMS:::"memory");
1484 if(mmx_size==23) return; //finished; NOTE(review): the MMX loop steps by 24 bytes, so this is presumably "multiple of 24", not 8 - confirm
1488 src_size= 23-mmx_size;
1492 for(i=0; i<src_size; i+=3)
1496 dst[i + 1] = src[i + 1];
1497 dst[i + 2] = src[i + 0];
1502 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1503 long width, long height,
1504 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1507 const long chromWidth= width>>1;
1508 for(y=0; y<height; y++)
1511 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
1513 "xor %%"REG_a", %%"REG_a" \n\t"
1516 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1517 PREFETCH" 32(%2, %%"REG_a") \n\t"
1518 PREFETCH" 32(%3, %%"REG_a") \n\t"
1519 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1520 "movq %%mm0, %%mm2 \n\t" // U(0)
1521 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1522 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1523 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1525 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1526 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1527 "movq %%mm3, %%mm4 \n\t" // Y(0)
1528 "movq %%mm5, %%mm6 \n\t" // Y(8)
1529 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1530 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1531 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1532 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1534 MOVNTQ" %%mm3, (%0, %%"REG_a", 4)\n\t"
1535 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1536 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)\n\t"
1537 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1539 "add $8, %%"REG_a" \n\t"
1540 "cmp %4, %%"REG_a" \n\t"
1542 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1547 #if defined ARCH_ALPHA && defined HAVE_MVI
1548 #define pl2yuy2(n) \
1553 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1554 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1555 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1556 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1557 yuv1 = (u << 8) + (v << 24); \
1564 uint64_t *qdst = (uint64_t *) dst;
1565 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1566 const uint32_t *yc = (uint32_t *) ysrc;
1567 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1568 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1569 for(i = 0; i < chromWidth; i += 8){
1570 uint64_t y1, y2, yuv1, yuv2;
1573 asm("ldq $31,64(%0)" :: "r"(yc));
1574 asm("ldq $31,64(%0)" :: "r"(yc2));
1575 asm("ldq $31,64(%0)" :: "r"(uc));
1576 asm("ldq $31,64(%0)" :: "r"(vc));
1594 #elif __WORDSIZE >= 64
1596 uint64_t *ldst = (uint64_t *) dst;
1597 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1598 for(i = 0; i < chromWidth; i += 2){
1600 k = yc[0] + (uc[0] << 8) +
1601 (yc[1] << 16) + (vc[0] << 24);
1602 l = yc[2] + (uc[1] << 8) +
1603 (yc[3] << 16) + (vc[1] << 24);
1604 *ldst++ = k + (l << 32);
1611 int i, *idst = (int32_t *) dst;
1612 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1613 for(i = 0; i < chromWidth; i++){
1614 #ifdef WORDS_BIGENDIAN
1615 *idst++ = (yc[0] << 24)+ (uc[0] << 16) +
1616 (yc[1] << 8) + (vc[0] << 0);
1618 *idst++ = yc[0] + (uc[0] << 8) +
1619 (yc[1] << 16) + (vc[0] << 24);
1627 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1629 usrc += chromStride;
1630 vsrc += chromStride;
1644 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1645 * problem for anyone then tell me, and I'll fix it)
/*
 * Thin wrapper: forwards every argument unchanged to yuvPlanartoyuy2 and
 * appends the constant 2 as the trailing argument.
 * NOTE(review): the trailing argument is presumably vertLumPerChroma (one
 * chroma line shared by two luma lines) -- the callee's signature is not
 * visible in this listing, confirm.
 */
1647 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1648 long width, long height,
1649 long lumStride, long chromStride, long dstStride)
1651 //FIXME interpolate chroma
1652 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Packs three separate planes (ysrc, usrc, vsrc) into interleaved
 * U Y V Y byte order in dst, one output line per luma line.
 *
 * chromWidth is width/2. The chroma source pointers are advanced by
 * chromStride only on lines where the low bits of y equal
 * vertLumPerChroma-1, so one chroma line is reused for vertLumPerChroma
 * consecutive luma lines.
 * NOTE(review): the mask test (y & (vertLumPerChroma-1)) only behaves as a
 * modulo when vertLumPerChroma is a power of two -- confirm callers.
 *
 * Three implementations are visible: an MMX loop, a scalar loop writing
 * 64 bits at a time, and a scalar loop writing 32 bits at a time (with a
 * WORDS_BIGENDIAN variant).
 */
1655 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1656 long width, long height,
1657 long lumStride, long chromStride, long dstStride, long vertLumPerChroma)
1660 const long chromWidth= width>>1;
1661 for(y=0; y<height; y++)
1664 //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by memory anyway)
// MMX path: REG_a counts chroma samples; each pass consumes 8 U, 8 V and
// 16 Y bytes and stores 32 output bytes at dst + 4*REG_a.
1666 "xor %%"REG_a", %%"REG_a" \n\t"
1669 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
1670 PREFETCH" 32(%2, %%"REG_a") \n\t"
1671 PREFETCH" 32(%3, %%"REG_a") \n\t"
1672 "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
1673 "movq %%mm0, %%mm2 \n\t" // U(0)
1674 "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
1675 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1676 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1678 "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
1679 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
1680 "movq %%mm0, %%mm4 \n\t" // UVUV UVUV(0)
1681 "movq %%mm2, %%mm6 \n\t" // UVUV UVUV(8)
// The chroma register is the destination of the unpack, so chroma lands in
// the even bytes and luma in the odd bytes (U Y V Y).
1682 "punpcklbw %%mm3, %%mm0 \n\t" // UYVY UYVY(0)
1683 "punpckhbw %%mm3, %%mm4 \n\t" // UYVY UYVY(4)
1684 "punpcklbw %%mm5, %%mm2 \n\t" // UYVY UYVY(8)
1685 "punpckhbw %%mm5, %%mm6 \n\t" // UYVY UYVY(12)
1687 MOVNTQ" %%mm0, (%0, %%"REG_a", 4)\n\t"
1688 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)\n\t"
1689 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)\n\t"
1690 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)\n\t"
1692 "add $8, %%"REG_a" \n\t"
1693 "cmp %4, %%"REG_a" \n\t"
// Operands: %0 dst, %1 ysrc, %2 usrc, %3 vsrc, %4 chromWidth.
1695 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1699 //FIXME adapt the alpha asm code from yv12->yuy2
1701 #if __WORDSIZE >= 64
// Scalar 64-bit path: two chroma pairs (four pixels) are combined into one
// 64-bit store; byte order in the word is U, Y, V, Y from the low byte up.
1703 uint64_t *ldst = (uint64_t *) dst;
1704 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1705 for(i = 0; i < chromWidth; i += 2){
// NOTE(review): yc[1] and yc[3] are promoted to int before the shift, so a
// byte >= 0x80 shifted left by 24 yields a negative int. If k and l are
// 64-bit (their declaration is not visible here) the value sign-extends and
// the addition of (l << 32) below borrows from the upper half. A cast to
// unsigned before the shift would avoid this -- confirm.
1707 k = uc[0] + (yc[0] << 8) +
1708 (vc[0] << 16) + (yc[1] << 24);
1709 l = uc[1] + (yc[2] << 8) +
1710 (vc[1] << 16) + (yc[3] << 24);
1711 *ldst++ = k + (l << 32);
// Scalar 32-bit path: one chroma pair (two pixels) per 32-bit store.
1718 int i, *idst = (int32_t *) dst;
1719 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1720 for(i = 0; i < chromWidth; i++){
1721 #ifdef WORDS_BIGENDIAN
// Big-endian: U goes in the most significant byte so it is still first in memory.
1722 *idst++ = (uc[0] << 24)+ (yc[0] << 16) +
1723 (vc[0] << 8) + (yc[1] << 0);
1725 *idst++ = uc[0] + (yc[0] << 8) +
1726 (vc[0] << 16) + (yc[1] << 24);
// Step to the next chroma line only after the last luma line that shares it.
1734 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1736 usrc += chromStride;
1737 vsrc += chromStride;
1751 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1752 * problem for anyone then tell me, and I'll fix it)
/*
 * Thin wrapper: forwards every argument unchanged to yuvPlanartouyvy with
 * vertLumPerChroma = 2, i.e. each chroma line is reused for two luma lines.
 */
1754 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1755 long width, long height,
1756 long lumStride, long chromStride, long dstStride)
1758 //FIXME interpolate chroma
1759 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1764 * width should be a multiple of 16
/*
 * Thin wrapper: forwards every argument unchanged to yuvPlanartoyuy2 and
 * appends the constant 1 as the trailing argument.
 * NOTE(review): the trailing argument is presumably vertLumPerChroma (one
 * chroma line per luma line) -- the callee's signature is not visible in
 * this listing, confirm.
 */
1766 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1767 long width, long height,
1768 long lumStride, long chromStride, long dstStride)
1770 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1775 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1776 * problem for anyone then tell me, and I'll fix it)
/*
 * Splits packed Y U Y V data (src) into separate ydst, udst and vdst planes.
 *
 * Lines are handled in pairs (y advances by 2). For the first line of a
 * pair luma and chroma are extracted; for the second line only luma is
 * extracted, so chroma comes from every other source line. udst/vdst are
 * advanced by chromStride once per pair.
 * chromWidth is width/2.
 */
1778 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1779 long width, long height,
1780 long lumStride, long chromStride, long srcStride)
1783 const long chromWidth= width>>1;
1784 for(y=0; y<height; y+=2)
// MMX, first line of the pair: REG_a counts chroma samples, 32 source bytes
// (16 pixels) per pass. mm7 is a per-word 0x00FF mask selecting even bytes.
1788 "xor %%"REG_a", %%"REG_a" \n\t"
1789 "pcmpeqw %%mm7, %%mm7 \n\t"
1790 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1793 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1794 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1795 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1796 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1797 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1798 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1799 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1800 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1801 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1802 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1803 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1805 MOVNTQ" %%mm2, (%1, %%"REG_a", 2)\n\t"
1807 "movq 16(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(8)
1808 "movq 24(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(12)
1809 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1810 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1811 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1812 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1813 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1814 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1815 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1816 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1818 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)\n\t"
// Separate the interleaved UV bytes: the shift keeps V, the mask keeps U.
1820 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1821 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1822 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1823 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1824 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1825 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1826 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1827 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1829 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
1830 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
1832 "add $8, %%"REG_a" \n\t"
1833 "cmp %4, %%"REG_a" \n\t"
// Operands: %0 src, %1 ydst, %2 udst, %3 vdst, %4 chromWidth.
1835 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1836 : "memory", "%"REG_a
// MMX, second line of the pair: only luma is extracted (mm7 mask is reused
// from the block above); %2 and %3 are passed but not referenced by the
// visible instructions.
1843 "xor %%"REG_a", %%"REG_a" \n\t"
1846 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
1847 "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
1848 "movq 8(%0, %%"REG_a", 4), %%mm1\n\t" // YUYV YUYV(4)
1849 "movq 16(%0, %%"REG_a", 4), %%mm2\n\t" // YUYV YUYV(8)
1850 "movq 24(%0, %%"REG_a", 4), %%mm3\n\t" // YUYV YUYV(12)
1851 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1852 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1853 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1854 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1855 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1856 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1858 MOVNTQ" %%mm0, (%1, %%"REG_a", 2)\n\t"
1859 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)\n\t"
1861 "add $8, %%"REG_a" \n\t"
1862 "cmp %4, %%"REG_a" \n\t"
1865 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
1866 : "memory", "%"REG_a
// Scalar path, first line of the pair: bytes 0/2 are luma, 1 is U, 3 is V.
1870 for(i=0; i<chromWidth; i++)
1872 ydst[2*i+0] = src[4*i+0];
1873 udst[i] = src[4*i+1];
1874 ydst[2*i+1] = src[4*i+2];
1875 vdst[i] = src[4*i+3];
// Scalar path, second line of the pair: luma only.
1880 for(i=0; i<chromWidth; i++)
1882 ydst[2*i+0] = src[4*i+0];
1883 ydst[2*i+1] = src[4*i+2];
1886 udst += chromStride;
1887 vdst += chromStride;
// Issue the EMMS macro after the MMX work (remaining asm lines not visible here).
1892 asm volatile( EMMS" \n\t"
/*
 * Copies the luma plane from ysrc to ydst with a single memcpy of
 * width*height bytes.
 * NOTE(review): the visible code does not use lumStride, so the copy is only
 * correct when both planes are stored without row padding -- confirm.
 * NOTE(review): no write to udst/vdst is visible in this listing; per the
 * XXX below chroma upscaling is not implemented.
 */
1898 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1899 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1900 long width, long height, long lumStride, long chromStride)
1903 memcpy(ydst, ysrc, width*height);
1905 /* XXX: implement upscaling for U,V */
/*
 * Doubles the resolution of a single plane: each output row holds
 * 2*srcWidth samples, and the per-row loop writes two output rows
 * (dst and dst+dstStride) from two adjacent source rows (src and
 * src+srcStride). Interior samples are 3:1 weighted blends of neighbouring
 * source samples; the edge samples of the first and last rows are copied or
 * blended from the nearest source samples.
 */
1908 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, long srcWidth, long srcHeight, long srcStride, long dstStride)
// First output line: horizontal interpolation only.
1915 for(x=0; x<srcWidth-1; x++){
1916 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1917 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1919 dst[2*srcWidth-1]= src[srcWidth-1];
1923 for(y=1; y<srcHeight; y++){
1924 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// SIMD covers the columns below srcWidth rounded down to a multiple of 16.
1925 const long mmxSize= srcWidth&~15;
1927 "mov %4, %%"REG_a" \n\t"
1929 "movq (%0, %%"REG_a"), %%mm0 \n\t"
1930 "movq (%1, %%"REG_a"), %%mm1 \n\t"
1931 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
1932 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
1933 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
1934 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
// Averaging twice against the same register weights that register about
// 3:1 over the other input (up to PAVGB rounding).
1935 PAVGB" %%mm0, %%mm5 \n\t"
1936 PAVGB" %%mm0, %%mm3 \n\t"
1937 PAVGB" %%mm0, %%mm5 \n\t"
1938 PAVGB" %%mm0, %%mm3 \n\t"
1939 PAVGB" %%mm1, %%mm4 \n\t"
1940 PAVGB" %%mm1, %%mm2 \n\t"
1941 PAVGB" %%mm1, %%mm4 \n\t"
1942 PAVGB" %%mm1, %%mm2 \n\t"
1943 "movq %%mm5, %%mm7 \n\t"
1944 "movq %%mm4, %%mm6 \n\t"
1945 "punpcklbw %%mm3, %%mm5 \n\t"
1946 "punpckhbw %%mm3, %%mm7 \n\t"
1947 "punpcklbw %%mm2, %%mm4 \n\t"
1948 "punpckhbw %%mm2, %%mm6 \n\t"
// NOTE(review): a MOVNTQ store group and a plain movq store group both
// appear below; the preprocessor lines selecting between them are not
// visible in this listing.
1950 MOVNTQ" %%mm5, (%2, %%"REG_a", 2)\n\t"
1951 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1952 MOVNTQ" %%mm4, (%3, %%"REG_a", 2)\n\t"
1953 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1955 "movq %%mm5, (%2, %%"REG_a", 2) \n\t"
1956 "movq %%mm7, 8(%2, %%"REG_a", 2)\n\t"
1957 "movq %%mm4, (%3, %%"REG_a", 2) \n\t"
1958 "movq %%mm6, 8(%3, %%"REG_a", 2)\n\t"
1960 "add $8, %%"REG_a" \n\t"
// Operands %0..%3 are the two source rows and the two destination rows,
// each offset to the end of the SIMD-covered span.
1962 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1963 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
// Non-SIMD build: the scalar loop below starts at column 0.
1969 const long mmxSize=1;
// Left edge of both output rows: vertical blend only.
1971 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1972 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
// Remaining columns: each output sample blends a source sample 3:1 with its
// diagonal neighbour in the other source row.
1974 for(x=mmxSize-1; x<srcWidth-1; x++){
1975 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1976 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1977 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1978 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
// Right edge of both output rows: vertical blend only.
1980 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1981 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
// Last output line: same horizontal-only interpolation as the first line.
1991 for(x=0; x<srcWidth-1; x++){
1992 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1993 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1995 dst[2*srcWidth-1]= src[srcWidth-1];
// NOTE(review): the body of this loop and the condition under which it is
// compiled are not visible in this listing.
1997 for(x=0; x<srcWidth; x++){
2004 asm volatile( EMMS" \n\t"
2012 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
2013 * problem for anyone then tell me, and I'll fix it)
2014 * chrominance data is only taken from every second line; the others are ignored. FIXME write HQ version
/*
 * Splits packed U Y V Y data (src) into separate ydst, udst and vdst planes.
 *
 * Same structure as yuy2toyv12 with the byte roles swapped: chroma is in
 * the even bytes (mask) and luma in the odd bytes (shift). Lines are handled
 * in pairs; chroma is taken from the first line of each pair only, and
 * udst/vdst advance by chromStride once per pair. chromWidth is width/2.
 *
 * NOTE(review): unlike yuy2toyv12, the asm here hard-codes %%eax with
 * l-suffixed instructions instead of REG_a, while the base operands are
 * pointers and %4 is a long. On a target with 64-bit pointers this
 * presumably does not assemble or compare correctly -- confirm. The
 * assignment of this function in rgb2rgb_init is commented out in this file.
 */
2016 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2017 long width, long height,
2018 long lumStride, long chromStride, long srcStride)
2021 const long chromWidth= width>>1;
2022 for(y=0; y<height; y+=2)
// MMX, first line of the pair: eax counts chroma samples, 32 source bytes
// per pass. mm7 is a per-word 0x00FF mask selecting even bytes.
2026 "xorl %%eax, %%eax \n\t"
2027 "pcmpeqw %%mm7, %%mm7 \n\t"
2028 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
2031 PREFETCH" 64(%0, %%eax, 4) \n\t"
2032 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2033 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2034 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
2035 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
2036 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
2037 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
2038 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
2039 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
2040 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
2041 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
2043 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
2045 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
2046 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
2047 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
2048 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
2049 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
2050 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
2051 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
2052 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
2053 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
2054 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
2056 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
// Separate the interleaved UV bytes: the shift keeps V, the mask keeps U.
2058 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
2059 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
2060 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
2061 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
2062 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
2063 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
2064 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
2065 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
2067 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
2068 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
2070 "addl $8, %%eax \n\t"
2071 "cmpl %4, %%eax \n\t"
// Operands: %0 src, %1 ydst, %2 udst, %3 vdst, %4 chromWidth.
2073 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
// MMX, second line of the pair: only luma is extracted (odd bytes, via shift).
2081 "xorl %%eax, %%eax \n\t"
2084 PREFETCH" 64(%0, %%eax, 4) \n\t"
2085 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
2086 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
2087 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
2088 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
2089 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2090 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2091 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2092 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2093 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2094 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2096 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2097 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2099 "addl $8, %%eax \n\t"
2100 "cmpl %4, %%eax \n\t"
2103 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
// Scalar path, first line of the pair: byte 0 is U, 2 is V, 1/3 are luma.
2108 for(i=0; i<chromWidth; i++)
2110 udst[i] = src[4*i+0];
2111 ydst[2*i+0] = src[4*i+1];
2112 vdst[i] = src[4*i+2];
2113 ydst[2*i+1] = src[4*i+3];
// Scalar path, second line of the pair: luma only.
2118 for(i=0; i<chromWidth; i++)
2120 ydst[2*i+0] = src[4*i+1];
2121 ydst[2*i+1] = src[4*i+3];
2124 udst += chromStride;
2125 vdst += chromStride;
// Issue the EMMS macro after the MMX work (remaining asm lines not visible here).
2130 asm volatile( EMMS" \n\t"
2138 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2139 * problem for anyone then tell me, and I'll fix it)
2140 * chrominance data is only taken from every second line; the others are ignored in the C version. FIXME write HQ version
/*
 * Converts packed 3-byte-per-pixel data (read as b, g, r in that byte order
 * by the scalar code) into separate ydst, udst and vdst planes.
 *
 * The MMX loop handles line pairs while y < height-2: a luma pass using the
 * bgr2YCoeff / bgr2YOffset constants, then a chroma pass that averages each
 * 2x2 pixel block taken from two source rows before applying bgr2UCoeff /
 * bgr2VCoeff and bgr2UVOffset. The scalar loop that follows continues from
 * the current y up to height, so it processes whatever the MMX loop left
 * (or everything when the MMX loop is not compiled in).
 * chromWidth is width/2.
 */
2142 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2143 long width, long height,
2144 long lumStride, long chromStride, long srcStride)
2147 const long chromWidth= width>>1;
2149 for(y=0; y<height-2; y+=2)
// Luma pass. REG_a starts at -width (operand %2) and indexes ydst+width;
// REG_d = 3*REG_a indexes src+width*3. Eight pixels (24 source bytes) are
// converted per pass.
2155 "mov %2, %%"REG_a" \n\t"
2156 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2157 "movq "MANGLE(w1111)", %%mm5 \n\t"
2158 "pxor %%mm7, %%mm7 \n\t"
2159 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2162 PREFETCH" 64(%0, %%"REG_d") \n\t"
// Four pixels: widen bytes to words, multiply-accumulate with the Y
// coefficients, then sum the partial products via pmaddwd with w1111.
2163 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2164 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
2165 "punpcklbw %%mm7, %%mm0 \n\t"
2166 "punpcklbw %%mm7, %%mm1 \n\t"
2167 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
2168 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
2169 "punpcklbw %%mm7, %%mm2 \n\t"
2170 "punpcklbw %%mm7, %%mm3 \n\t"
2171 "pmaddwd %%mm6, %%mm0 \n\t"
2172 "pmaddwd %%mm6, %%mm1 \n\t"
2173 "pmaddwd %%mm6, %%mm2 \n\t"
2174 "pmaddwd %%mm6, %%mm3 \n\t"
2175 #ifndef FAST_BGR2YV12
2176 "psrad $8, %%mm0 \n\t"
2177 "psrad $8, %%mm1 \n\t"
2178 "psrad $8, %%mm2 \n\t"
2179 "psrad $8, %%mm3 \n\t"
2181 "packssdw %%mm1, %%mm0 \n\t"
2182 "packssdw %%mm3, %%mm2 \n\t"
2183 "pmaddwd %%mm5, %%mm0 \n\t"
2184 "pmaddwd %%mm5, %%mm2 \n\t"
2185 "packssdw %%mm2, %%mm0 \n\t"
2186 "psraw $7, %%mm0 \n\t"
// Next four pixels, same sequence, result in mm4.
2188 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2189 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
2190 "punpcklbw %%mm7, %%mm4 \n\t"
2191 "punpcklbw %%mm7, %%mm1 \n\t"
2192 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
2193 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
2194 "punpcklbw %%mm7, %%mm2 \n\t"
2195 "punpcklbw %%mm7, %%mm3 \n\t"
2196 "pmaddwd %%mm6, %%mm4 \n\t"
2197 "pmaddwd %%mm6, %%mm1 \n\t"
2198 "pmaddwd %%mm6, %%mm2 \n\t"
2199 "pmaddwd %%mm6, %%mm3 \n\t"
2200 #ifndef FAST_BGR2YV12
2201 "psrad $8, %%mm4 \n\t"
2202 "psrad $8, %%mm1 \n\t"
2203 "psrad $8, %%mm2 \n\t"
2204 "psrad $8, %%mm3 \n\t"
2206 "packssdw %%mm1, %%mm4 \n\t"
2207 "packssdw %%mm3, %%mm2 \n\t"
2208 "pmaddwd %%mm5, %%mm4 \n\t"
2209 "pmaddwd %%mm5, %%mm2 \n\t"
2210 "add $24, %%"REG_d" \n\t"
2211 "packssdw %%mm2, %%mm4 \n\t"
2212 "psraw $7, %%mm4 \n\t"
// Pack the eight results to bytes, add the luma offset, store 8 Y samples.
2214 "packuswb %%mm4, %%mm0 \n\t"
2215 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2217 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
2218 "add $8, %%"REG_a" \n\t"
2220 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
2221 : "%"REG_a, "%"REG_d
// Chroma pass. REG_a starts at -chromWidth (operand %4) and indexes
// udst/vdst+chromWidth; REG_d = 6*REG_a indexes the two source rows
// (%0 = current row, %1 = row at src+srcStride). Four U and four V samples
// are produced per pass.
2228 "mov %4, %%"REG_a" \n\t"
2229 "movq "MANGLE(w1111)", %%mm5 \n\t"
2230 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2231 "pxor %%mm7, %%mm7 \n\t"
2232 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d"\n\t"
2233 "add %%"REG_d", %%"REG_d" \n\t"
2236 PREFETCH" 64(%0, %%"REG_d") \n\t"
2237 PREFETCH" 64(%1, %%"REG_d") \n\t"
2238 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
// PAVGB variant: average the two rows, then average with a copy shifted by
// 24 bits (one 3-byte pixel) to blend horizontally adjacent pixels.
2239 "movq (%0, %%"REG_d"), %%mm0 \n\t"
2240 "movq (%1, %%"REG_d"), %%mm1 \n\t"
2241 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
2242 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
2243 PAVGB" %%mm1, %%mm0 \n\t"
2244 PAVGB" %%mm3, %%mm2 \n\t"
2245 "movq %%mm0, %%mm1 \n\t"
2246 "movq %%mm2, %%mm3 \n\t"
2247 "psrlq $24, %%mm0 \n\t"
2248 "psrlq $24, %%mm2 \n\t"
2249 PAVGB" %%mm1, %%mm0 \n\t"
2250 PAVGB" %%mm3, %%mm2 \n\t"
2251 "punpcklbw %%mm7, %%mm0 \n\t"
2252 "punpcklbw %%mm7, %%mm2 \n\t"
// Plain MMX variant: widen the four pixels of each 2x2 block to words, add
// them, and divide by 4 with the psrlw $2 below.
2254 "movd (%0, %%"REG_d"), %%mm0 \n\t"
2255 "movd (%1, %%"REG_d"), %%mm1 \n\t"
2256 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
2257 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
2258 "punpcklbw %%mm7, %%mm0 \n\t"
2259 "punpcklbw %%mm7, %%mm1 \n\t"
2260 "punpcklbw %%mm7, %%mm2 \n\t"
2261 "punpcklbw %%mm7, %%mm3 \n\t"
2262 "paddw %%mm1, %%mm0 \n\t"
2263 "paddw %%mm3, %%mm2 \n\t"
2264 "paddw %%mm2, %%mm0 \n\t"
2265 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
2266 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
2267 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
2268 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
2269 "punpcklbw %%mm7, %%mm4 \n\t"
2270 "punpcklbw %%mm7, %%mm1 \n\t"
2271 "punpcklbw %%mm7, %%mm2 \n\t"
2272 "punpcklbw %%mm7, %%mm3 \n\t"
2273 "paddw %%mm1, %%mm4 \n\t"
2274 "paddw %%mm3, %%mm2 \n\t"
2275 "paddw %%mm4, %%mm2 \n\t"
2276 "psrlw $2, %%mm0 \n\t"
2277 "psrlw $2, %%mm2 \n\t"
// Apply the V coefficients (into mm1/mm3) and the U coefficients held in
// mm6 (into mm0/mm2) to the two averaged pixels.
2279 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2280 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2282 "pmaddwd %%mm0, %%mm1 \n\t"
2283 "pmaddwd %%mm2, %%mm3 \n\t"
2284 "pmaddwd %%mm6, %%mm0 \n\t"
2285 "pmaddwd %%mm6, %%mm2 \n\t"
2286 #ifndef FAST_BGR2YV12
2287 "psrad $8, %%mm0 \n\t"
2288 "psrad $8, %%mm1 \n\t"
2289 "psrad $8, %%mm2 \n\t"
2290 "psrad $8, %%mm3 \n\t"
2292 "packssdw %%mm2, %%mm0 \n\t"
2293 "packssdw %%mm3, %%mm1 \n\t"
2294 "pmaddwd %%mm5, %%mm0 \n\t"
2295 "pmaddwd %%mm5, %%mm1 \n\t"
2296 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2297 "psraw $7, %%mm0 \n\t"
// Second half of the pass: the next two 2x2 blocks, same two variants.
2299 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2300 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
2301 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
2302 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
2303 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
2304 PAVGB" %%mm1, %%mm4 \n\t"
2305 PAVGB" %%mm3, %%mm2 \n\t"
2306 "movq %%mm4, %%mm1 \n\t"
2307 "movq %%mm2, %%mm3 \n\t"
2308 "psrlq $24, %%mm4 \n\t"
2309 "psrlq $24, %%mm2 \n\t"
2310 PAVGB" %%mm1, %%mm4 \n\t"
2311 PAVGB" %%mm3, %%mm2 \n\t"
2312 "punpcklbw %%mm7, %%mm4 \n\t"
2313 "punpcklbw %%mm7, %%mm2 \n\t"
2315 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
2316 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
2317 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
2318 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
2319 "punpcklbw %%mm7, %%mm4 \n\t"
2320 "punpcklbw %%mm7, %%mm1 \n\t"
2321 "punpcklbw %%mm7, %%mm2 \n\t"
2322 "punpcklbw %%mm7, %%mm3 \n\t"
2323 "paddw %%mm1, %%mm4 \n\t"
2324 "paddw %%mm3, %%mm2 \n\t"
2325 "paddw %%mm2, %%mm4 \n\t"
// mm5 is borrowed as scratch here; w1111 is reloaded into it below.
2326 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
2327 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
2328 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
2329 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
2330 "punpcklbw %%mm7, %%mm5 \n\t"
2331 "punpcklbw %%mm7, %%mm1 \n\t"
2332 "punpcklbw %%mm7, %%mm2 \n\t"
2333 "punpcklbw %%mm7, %%mm3 \n\t"
2334 "paddw %%mm1, %%mm5 \n\t"
2335 "paddw %%mm3, %%mm2 \n\t"
2336 "paddw %%mm5, %%mm2 \n\t"
2337 "movq "MANGLE(w1111)", %%mm5 \n\t"
2338 "psrlw $2, %%mm4 \n\t"
2339 "psrlw $2, %%mm2 \n\t"
2341 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2342 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2344 "pmaddwd %%mm4, %%mm1 \n\t"
2345 "pmaddwd %%mm2, %%mm3 \n\t"
2346 "pmaddwd %%mm6, %%mm4 \n\t"
2347 "pmaddwd %%mm6, %%mm2 \n\t"
2348 #ifndef FAST_BGR2YV12
2349 "psrad $8, %%mm4 \n\t"
2350 "psrad $8, %%mm1 \n\t"
2351 "psrad $8, %%mm2 \n\t"
2352 "psrad $8, %%mm3 \n\t"
2354 "packssdw %%mm2, %%mm4 \n\t"
2355 "packssdw %%mm3, %%mm1 \n\t"
2356 "pmaddwd %%mm5, %%mm4 \n\t"
2357 "pmaddwd %%mm5, %%mm1 \n\t"
2358 "add $24, %%"REG_d" \n\t"
2359 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2360 "psraw $7, %%mm4 \n\t"
// Regroup so the four U values sit in the low dword and the four V values
// in the high dword, add the chroma offset, then store each dword.
2362 "movq %%mm0, %%mm1 \n\t"
2363 "punpckldq %%mm4, %%mm0 \n\t"
2364 "punpckhdq %%mm4, %%mm1 \n\t"
2365 "packsswb %%mm1, %%mm0 \n\t"
2366 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2367 "movd %%mm0, (%2, %%"REG_a") \n\t"
2368 "punpckhdq %%mm0, %%mm0 \n\t"
2369 "movd %%mm0, (%3, %%"REG_a") \n\t"
2370 "add $4, %%"REG_a" \n\t"
2372 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
2373 : "%"REG_a, "%"REG_d
2376 udst += chromStride;
2377 vdst += chromStride;
2381 asm volatile( EMMS" \n\t"
// Scalar path: continues from the current y, one line pair per iteration.
2387 for(; y<height; y+=2)
// First line of the pair: luma for both pixels of each pair plus U and V
// computed from the pixel at 6*i.
2390 for(i=0; i<chromWidth; i++)
2392 unsigned int b= src[6*i+0];
2393 unsigned int g= src[6*i+1];
2394 unsigned int r= src[6*i+2];
2396 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2397 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2398 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2408 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
// Second line of the pair: only luma is computed.
2414 for(i=0; i<chromWidth; i++)
2416 unsigned int b= src[6*i+0];
2417 unsigned int g= src[6*i+1];
2418 unsigned int r= src[6*i+2];
2420 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2428 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2431 udst += chromStride;
2432 vdst += chromStride;
/*
 * Interleaves two byte planes: for every row, dest[2*w] = src1[w] and
 * dest[2*w+1] = src2[w] for w in [0, width).
 *
 * Two SIMD variants are visible (one using xmm registers, one using mm
 * registers), each handling 16 source bytes per pass and followed by a
 * scalar loop for the columns from width&~15 up to width. A plain scalar
 * loop over the full width is also present. The preprocessor conditions
 * selecting among them are not visible in this listing.
 */
2438 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2439 long width, long height, long src1Stride,
2440 long src2Stride, long dstStride){
2443 for(h=0; h < height; h++)
// xmm variant. The src1 block is loaded twice so that the low and high
// halves can be unpacked separately against src2.
// NOTE(review): movdqa and movntdq require 16-byte-aligned addresses, so
// this path presumably assumes aligned rows -- confirm against callers.
2450 "xor %%"REG_a", %%"REG_a" \n\t"
2452 PREFETCH" 64(%1, %%"REG_a") \n\t"
2453 PREFETCH" 64(%2, %%"REG_a") \n\t"
2454 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
2455 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
2456 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
2457 "punpcklbw %%xmm2, %%xmm0 \n\t"
2458 "punpckhbw %%xmm2, %%xmm1 \n\t"
2459 "movntdq %%xmm0, (%0, %%"REG_a", 2)\n\t"
2460 "movntdq %%xmm1, 16(%0, %%"REG_a", 2)\n\t"
2461 "add $16, %%"REG_a" \n\t"
2462 "cmp %3, %%"REG_a" \n\t"
// Operands: %0 dest, %1 src1, %2 src2, %3 width-15 (loop bound).
2464 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2465 : "memory", "%"REG_a""
// mm variant: two 8-byte loads per source, four 8-byte stores.
2469 "xor %%"REG_a", %%"REG_a" \n\t"
2471 PREFETCH" 64(%1, %%"REG_a") \n\t"
2472 PREFETCH" 64(%2, %%"REG_a") \n\t"
2473 "movq (%1, %%"REG_a"), %%mm0 \n\t"
2474 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
2475 "movq %%mm0, %%mm1 \n\t"
2476 "movq %%mm2, %%mm3 \n\t"
2477 "movq (%2, %%"REG_a"), %%mm4 \n\t"
2478 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
2479 "punpcklbw %%mm4, %%mm0 \n\t"
2480 "punpckhbw %%mm4, %%mm1 \n\t"
2481 "punpcklbw %%mm5, %%mm2 \n\t"
2482 "punpckhbw %%mm5, %%mm3 \n\t"
2483 MOVNTQ" %%mm0, (%0, %%"REG_a", 2)\n\t"
2484 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)\n\t"
2485 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)\n\t"
2486 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)\n\t"
2487 "add $16, %%"REG_a" \n\t"
2488 "cmp %3, %%"REG_a" \n\t"
2490 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
2491 : "memory", "%"REG_a
// Scalar tail after the SIMD loop: columns past the last multiple of 16.
2494 for(w= (width&(~15)); w < width; w++)
2496 dest[2*w+0] = src1[w];
2497 dest[2*w+1] = src2[w];
// Scalar-only variant: every column.
2500 for(w=0; w < width; w++)
2502 dest[2*w+0] = src1[w];
2503 dest[2*w+1] = src2[w];
/*
 * Upscales two independent planes (src1 -> dst1, src2 -> dst2) by 2 in both
 * directions using sample repetition: every source byte is written twice
 * horizontally, and destination row y reads source row y>>1.
 * w and h are set to width/2 and height/2.
 * The MMX loop expands 32 source bytes into 64 destination bytes per pass;
 * a scalar loop finishes the remaining columns up to w.
 */
2519 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2520 uint8_t *dst1, uint8_t *dst2,
2521 long width, long height,
2522 long srcStride1, long srcStride2,
2523 long dstStride1, long dstStride2)
2526 w=width/2; h=height/2;
2531 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
// First plane.
2534 const uint8_t* s1=src1+srcStride1*(y>>1);
2535 uint8_t* d=dst1+dstStride1*y;
// Unpacking a register with itself duplicates each byte.
2542 "movq %1, %%mm0\n\t"
2543 "movq 8%1, %%mm2\n\t"
2544 "movq 16%1, %%mm4\n\t"
2545 "movq 24%1, %%mm6\n\t"
2546 "movq %%mm0, %%mm1\n\t"
2547 "movq %%mm2, %%mm3\n\t"
2548 "movq %%mm4, %%mm5\n\t"
2549 "movq %%mm6, %%mm7\n\t"
2550 "punpcklbw %%mm0, %%mm0\n\t"
2551 "punpckhbw %%mm1, %%mm1\n\t"
2552 "punpcklbw %%mm2, %%mm2\n\t"
2553 "punpckhbw %%mm3, %%mm3\n\t"
2554 "punpcklbw %%mm4, %%mm4\n\t"
2555 "punpckhbw %%mm5, %%mm5\n\t"
2556 "punpcklbw %%mm6, %%mm6\n\t"
2557 "punpckhbw %%mm7, %%mm7\n\t"
2558 MOVNTQ" %%mm0, %0\n\t"
2559 MOVNTQ" %%mm1, 8%0\n\t"
2560 MOVNTQ" %%mm2, 16%0\n\t"
2561 MOVNTQ" %%mm3, 24%0\n\t"
2562 MOVNTQ" %%mm4, 32%0\n\t"
2563 MOVNTQ" %%mm5, 40%0\n\t"
2564 MOVNTQ" %%mm6, 48%0\n\t"
2565 MOVNTQ" %%mm7, 56%0"
// Scalar remainder for the first plane.
2571 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
// Second plane, identical processing.
2574 const uint8_t* s2=src2+srcStride2*(y>>1);
2575 uint8_t* d=dst2+dstStride2*y;
2582 "movq %1, %%mm0\n\t"
2583 "movq 8%1, %%mm2\n\t"
2584 "movq 16%1, %%mm4\n\t"
2585 "movq 24%1, %%mm6\n\t"
2586 "movq %%mm0, %%mm1\n\t"
2587 "movq %%mm2, %%mm3\n\t"
2588 "movq %%mm4, %%mm5\n\t"
2589 "movq %%mm6, %%mm7\n\t"
2590 "punpcklbw %%mm0, %%mm0\n\t"
2591 "punpckhbw %%mm1, %%mm1\n\t"
2592 "punpcklbw %%mm2, %%mm2\n\t"
2593 "punpckhbw %%mm3, %%mm3\n\t"
2594 "punpcklbw %%mm4, %%mm4\n\t"
2595 "punpckhbw %%mm5, %%mm5\n\t"
2596 "punpcklbw %%mm6, %%mm6\n\t"
2597 "punpckhbw %%mm7, %%mm7\n\t"
2598 MOVNTQ" %%mm0, %0\n\t"
2599 MOVNTQ" %%mm1, 8%0\n\t"
2600 MOVNTQ" %%mm2, 16%0\n\t"
2601 MOVNTQ" %%mm3, 24%0\n\t"
2602 MOVNTQ" %%mm4, 32%0\n\t"
2603 MOVNTQ" %%mm5, 40%0\n\t"
2604 MOVNTQ" %%mm6, 48%0\n\t"
2605 MOVNTQ" %%mm7, 56%0"
// Scalar remainder for the second plane.
2611 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
/*
 * Packs a luma plane (src1) and two chroma planes (src2 = U, src3 = V per
 * the register comments below) into interleaved Y U Y V output.
 * Output row y reads luma row y and chroma row y>>2, so each chroma row is
 * shared by four luma rows. In the MMX loop 8 U and 8 V bytes are combined
 * with 32 luma bytes per pass, each chroma pair being written for two
 * consecutive Y/U/Y/V groups.
 * w is set to width/2 and h to height.
 */
2622 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2624 long width, long height,
2625 long srcStride1, long srcStride2,
2626 long srcStride3, long dstStride)
2629 w=width/2; h=height;
2631 const uint8_t* yp=src1+srcStride1*y;
2632 const uint8_t* up=src2+srcStride2*(y>>2);
2633 const uint8_t* vp=src3+srcStride3*(y>>2);
2634 uint8_t* d=dst+dstStride*y;
// %0 is the running index (its declaration and update are not visible in
// this listing); luma is addressed at 4*%0, output at 8*%0.
2640 PREFETCH" 32(%1, %0)\n\t"
2641 PREFETCH" 32(%2, %0)\n\t"
2642 PREFETCH" 32(%3, %0)\n\t"
2643 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2644 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2645 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2646 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2647 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2648 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2649 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2650 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2651 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2652 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2654 "movq %%mm1, %%mm6\n\t"
2655 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2656 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2657 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2658 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2659 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2661 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2662 "movq 8(%1, %0, 4), %%mm0\n\t"
2663 "movq %%mm0, %%mm3\n\t"
2664 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2665 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2666 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2667 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2669 "movq %%mm4, %%mm6\n\t"
2670 "movq 16(%1, %0, 4), %%mm0\n\t"
2671 "movq %%mm0, %%mm3\n\t"
2672 "punpcklbw %%mm5, %%mm4\n\t"
2673 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2674 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2675 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2676 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2678 "punpckhbw %%mm5, %%mm6\n\t"
2679 "movq 24(%1, %0, 4), %%mm0\n\t"
2680 "movq %%mm0, %%mm3\n\t"
2681 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2682 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2683 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2684 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
// Input operands: %1 yp, %2 up, %3 vp, %4 d.
2687 : "r"(yp), "r" (up), "r"(vp), "r"(d)
// Scalar remainder: x2 is the luma index matching chroma index x (4 luma
// samples per chroma sample).
2693 const long x2= x<<2;
/*
 * Points each converter function pointer at the RENAME()-decorated
 * implementation from this template instantiation.
 * uyvytoyv12 and yvu9toyv12 are deliberately left unassigned here (their
 * assignments are commented out below).
 */
2713 static inline void RENAME(rgb2rgb_init)(void){
2714 rgb15to16= RENAME(rgb15to16);
2715 rgb15to24= RENAME(rgb15to24);
2716 rgb15to32= RENAME(rgb15to32);
2717 rgb16to24= RENAME(rgb16to24);
2718 rgb16to32= RENAME(rgb16to32);
2719 rgb16to15= RENAME(rgb16to15);
2720 rgb24to16= RENAME(rgb24to16);
2721 rgb24to15= RENAME(rgb24to15);
2722 rgb24to32= RENAME(rgb24to32);
2723 rgb32to16= RENAME(rgb32to16);
2724 rgb32to15= RENAME(rgb32to15);
2725 rgb32to24= RENAME(rgb32to24);
2726 rgb24tobgr15= RENAME(rgb24tobgr15);
2727 rgb24tobgr16= RENAME(rgb24tobgr16);
2728 rgb24tobgr24= RENAME(rgb24tobgr24);
2729 rgb32tobgr32= RENAME(rgb32tobgr32);
2730 rgb32tobgr16= RENAME(rgb32tobgr16);
2731 rgb32tobgr15= RENAME(rgb32tobgr15);
2732 yv12toyuy2= RENAME(yv12toyuy2);
2733 yv12touyvy= RENAME(yv12touyvy);
2734 yuv422ptoyuy2= RENAME(yuv422ptoyuy2);
2735 yuy2toyv12= RENAME(yuy2toyv12);
2736 // uyvytoyv12= RENAME(uyvytoyv12);
2737 // yvu9toyv12= RENAME(yvu9toyv12);
2738 planar2x= RENAME(planar2x);
2739 rgb24toyv12= RENAME(rgb24toyv12);
2740 interleaveBytes= RENAME(interleaveBytes);
2741 vu9_to_vu12= RENAME(vu9_to_vu12);
2742 yvu9_to_yuy2= RENAME(yvu9_to_yuy2);