3 * rgb2rgb.c, Software RGB to RGB converter
4 * plus Software PAL8 to RGB converter
5 * Software YUV to YUV converter
6 * Software YUV to RGB converter
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
12 #include <inttypes.h> /* for __WORDSIZE */
15 // #warning You have misconfigured system and probably will lose performance!
16 #define __WORDSIZE MP_WORDSIZE
34 #define PREFETCH "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB "pavgusb"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
47 /* On K6 femms is faster than emms. On K7 femms is directly mapped onto emms. */
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
61 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
64 const uint8_t *s = src;
67 const uint8_t *mm_end;
71 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
73 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
79 "punpckldq 3%1, %%mm0\n\t"
81 "punpckldq 9%1, %%mm1\n\t"
82 "movd 12%1, %%mm2\n\t"
83 "punpckldq 15%1, %%mm2\n\t"
84 "movd 18%1, %%mm3\n\t"
85 "punpckldq 21%1, %%mm3\n\t"
86 "pand %%mm7, %%mm0\n\t"
87 "pand %%mm7, %%mm1\n\t"
88 "pand %%mm7, %%mm2\n\t"
89 "pand %%mm7, %%mm3\n\t"
90 MOVNTQ" %%mm0, %0\n\t"
91 MOVNTQ" %%mm1, 8%0\n\t"
92 MOVNTQ" %%mm2, 16%0\n\t"
100 __asm __volatile(SFENCE:::"memory");
101 __asm __volatile(EMMS:::"memory");
112 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
115 const uint8_t *s = src;
118 const uint8_t *mm_end;
122 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
129 "movq 8%1, %%mm1\n\t"
130 "movq 16%1, %%mm4\n\t"
131 "movq 24%1, %%mm5\n\t"
132 "movq %%mm0, %%mm2\n\t"
133 "movq %%mm1, %%mm3\n\t"
134 "movq %%mm4, %%mm6\n\t"
135 "movq %%mm5, %%mm7\n\t"
136 "psrlq $8, %%mm2\n\t"
137 "psrlq $8, %%mm3\n\t"
138 "psrlq $8, %%mm6\n\t"
139 "psrlq $8, %%mm7\n\t"
148 "por %%mm2, %%mm0\n\t"
149 "por %%mm3, %%mm1\n\t"
150 "por %%mm6, %%mm4\n\t"
151 "por %%mm7, %%mm5\n\t"
153 "movq %%mm1, %%mm2\n\t"
154 "movq %%mm4, %%mm3\n\t"
155 "psllq $48, %%mm2\n\t"
156 "psllq $32, %%mm3\n\t"
159 "por %%mm2, %%mm0\n\t"
160 "psrlq $16, %%mm1\n\t"
161 "psrlq $32, %%mm4\n\t"
162 "psllq $16, %%mm5\n\t"
163 "por %%mm3, %%mm1\n\t"
165 "por %%mm5, %%mm4\n\t"
167 MOVNTQ" %%mm0, %0\n\t"
168 MOVNTQ" %%mm1, 8%0\n\t"
171 :"m"(*s),"m"(mask24l),
172 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
177 __asm __volatile(SFENCE:::"memory");
178 __asm __volatile(EMMS:::"memory");
190 Original by Strepto/Astral
191 ported to gcc & bugfixed : A'rpi
192 MMX2, 3DNOW optimization by Nick Kurshev
193 32bit c version, and and&add trick by Michael Niedermayer
195 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
197 register const uint8_t* s=src;
198 register uint8_t* d=dst;
199 register const uint8_t *end;
200 const uint8_t *mm_end;
203 __asm __volatile(PREFETCH" %0"::"m"(*s));
204 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
211 "movq 8%1, %%mm2\n\t"
212 "movq %%mm0, %%mm1\n\t"
213 "movq %%mm2, %%mm3\n\t"
214 "pand %%mm4, %%mm0\n\t"
215 "pand %%mm4, %%mm2\n\t"
216 "paddw %%mm1, %%mm0\n\t"
217 "paddw %%mm3, %%mm2\n\t"
218 MOVNTQ" %%mm0, %0\n\t"
226 __asm __volatile(SFENCE:::"memory");
227 __asm __volatile(EMMS:::"memory");
232 register unsigned x= *((uint32_t *)s);
233 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
239 register unsigned short x= *((uint16_t *)s);
240 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
244 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
246 register const uint8_t* s=src;
247 register uint8_t* d=dst;
248 register const uint8_t *end;
249 const uint8_t *mm_end;
252 __asm __volatile(PREFETCH" %0"::"m"(*s));
253 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
254 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
261 "movq 8%1, %%mm2\n\t"
262 "movq %%mm0, %%mm1\n\t"
263 "movq %%mm2, %%mm3\n\t"
264 "psrlq $1, %%mm0\n\t"
265 "psrlq $1, %%mm2\n\t"
266 "pand %%mm7, %%mm0\n\t"
267 "pand %%mm7, %%mm2\n\t"
268 "pand %%mm6, %%mm1\n\t"
269 "pand %%mm6, %%mm3\n\t"
270 "por %%mm1, %%mm0\n\t"
271 "por %%mm3, %%mm2\n\t"
272 MOVNTQ" %%mm0, %0\n\t"
280 __asm __volatile(SFENCE:::"memory");
281 __asm __volatile(EMMS:::"memory");
286 register uint32_t x= *((uint32_t *)s);
287 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
293 register uint16_t x= *((uint16_t *)s);
294 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
300 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
302 const uint8_t *s = src;
305 const uint8_t *mm_end;
307 uint16_t *d = (uint16_t *)dst;
311 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
313 "movq %3, %%mm5 \n\t"
314 "movq %4, %%mm6 \n\t"
315 "movq %5, %%mm7 \n\t"
318 PREFETCH" 32(%1) \n\t"
319 "movd (%1), %%mm0 \n\t"
320 "movd 4(%1), %%mm3 \n\t"
321 "punpckldq 8(%1), %%mm0 \n\t"
322 "punpckldq 12(%1), %%mm3 \n\t"
323 "movq %%mm0, %%mm1 \n\t"
324 "movq %%mm3, %%mm4 \n\t"
325 "pand %%mm6, %%mm0 \n\t"
326 "pand %%mm6, %%mm3 \n\t"
327 "pmaddwd %%mm7, %%mm0 \n\t"
328 "pmaddwd %%mm7, %%mm3 \n\t"
329 "pand %%mm5, %%mm1 \n\t"
330 "pand %%mm5, %%mm4 \n\t"
331 "por %%mm1, %%mm0 \n\t"
332 "por %%mm4, %%mm3 \n\t"
333 "psrld $5, %%mm0 \n\t"
334 "pslld $11, %%mm3 \n\t"
335 "por %%mm3, %%mm0 \n\t"
336 MOVNTQ" %%mm0, (%0) \n\t"
342 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
345 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
349 ::"m"(red_16mask),"m"(green_16mask));
355 "movd 4%1, %%mm3\n\t"
356 "punpckldq 8%1, %%mm0\n\t"
357 "punpckldq 12%1, %%mm3\n\t"
358 "movq %%mm0, %%mm1\n\t"
359 "movq %%mm0, %%mm2\n\t"
360 "movq %%mm3, %%mm4\n\t"
361 "movq %%mm3, %%mm5\n\t"
362 "psrlq $3, %%mm0\n\t"
363 "psrlq $3, %%mm3\n\t"
366 "psrlq $5, %%mm1\n\t"
367 "psrlq $5, %%mm4\n\t"
368 "pand %%mm6, %%mm1\n\t"
369 "pand %%mm6, %%mm4\n\t"
370 "psrlq $8, %%mm2\n\t"
371 "psrlq $8, %%mm5\n\t"
372 "pand %%mm7, %%mm2\n\t"
373 "pand %%mm7, %%mm5\n\t"
374 "por %%mm1, %%mm0\n\t"
375 "por %%mm4, %%mm3\n\t"
376 "por %%mm2, %%mm0\n\t"
377 "por %%mm5, %%mm3\n\t"
378 "psllq $16, %%mm3\n\t"
379 "por %%mm3, %%mm0\n\t"
380 MOVNTQ" %%mm0, %0\n\t"
381 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
386 __asm __volatile(SFENCE:::"memory");
387 __asm __volatile(EMMS:::"memory");
391 const int src= *((uint32_t*)s)++;
392 *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
393 // *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
397 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
399 const uint8_t *s = src;
402 const uint8_t *mm_end;
404 uint16_t *d = (uint16_t *)dst;
407 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
411 ::"m"(red_16mask),"m"(green_16mask));
418 "movd 4%1, %%mm3\n\t"
419 "punpckldq 8%1, %%mm0\n\t"
420 "punpckldq 12%1, %%mm3\n\t"
421 "movq %%mm0, %%mm1\n\t"
422 "movq %%mm0, %%mm2\n\t"
423 "movq %%mm3, %%mm4\n\t"
424 "movq %%mm3, %%mm5\n\t"
425 "psllq $8, %%mm0\n\t"
426 "psllq $8, %%mm3\n\t"
427 "pand %%mm7, %%mm0\n\t"
428 "pand %%mm7, %%mm3\n\t"
429 "psrlq $5, %%mm1\n\t"
430 "psrlq $5, %%mm4\n\t"
431 "pand %%mm6, %%mm1\n\t"
432 "pand %%mm6, %%mm4\n\t"
433 "psrlq $19, %%mm2\n\t"
434 "psrlq $19, %%mm5\n\t"
437 "por %%mm1, %%mm0\n\t"
438 "por %%mm4, %%mm3\n\t"
439 "por %%mm2, %%mm0\n\t"
440 "por %%mm5, %%mm3\n\t"
441 "psllq $16, %%mm3\n\t"
442 "por %%mm3, %%mm0\n\t"
443 MOVNTQ" %%mm0, %0\n\t"
444 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
448 __asm __volatile(SFENCE:::"memory");
449 __asm __volatile(EMMS:::"memory");
453 const int src= *((uint32_t*)s)++;
454 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
458 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
460 const uint8_t *s = src;
463 const uint8_t *mm_end;
465 uint16_t *d = (uint16_t *)dst;
469 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
471 "movq %3, %%mm5 \n\t"
472 "movq %4, %%mm6 \n\t"
473 "movq %5, %%mm7 \n\t"
476 PREFETCH" 32(%1) \n\t"
477 "movd (%1), %%mm0 \n\t"
478 "movd 4(%1), %%mm3 \n\t"
479 "punpckldq 8(%1), %%mm0 \n\t"
480 "punpckldq 12(%1), %%mm3 \n\t"
481 "movq %%mm0, %%mm1 \n\t"
482 "movq %%mm3, %%mm4 \n\t"
483 "pand %%mm6, %%mm0 \n\t"
484 "pand %%mm6, %%mm3 \n\t"
485 "pmaddwd %%mm7, %%mm0 \n\t"
486 "pmaddwd %%mm7, %%mm3 \n\t"
487 "pand %%mm5, %%mm1 \n\t"
488 "pand %%mm5, %%mm4 \n\t"
489 "por %%mm1, %%mm0 \n\t"
490 "por %%mm4, %%mm3 \n\t"
491 "psrld $6, %%mm0 \n\t"
492 "pslld $10, %%mm3 \n\t"
493 "por %%mm3, %%mm0 \n\t"
494 MOVNTQ" %%mm0, (%0) \n\t"
500 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
503 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
507 ::"m"(red_15mask),"m"(green_15mask));
513 "movd 4%1, %%mm3\n\t"
514 "punpckldq 8%1, %%mm0\n\t"
515 "punpckldq 12%1, %%mm3\n\t"
516 "movq %%mm0, %%mm1\n\t"
517 "movq %%mm0, %%mm2\n\t"
518 "movq %%mm3, %%mm4\n\t"
519 "movq %%mm3, %%mm5\n\t"
520 "psrlq $3, %%mm0\n\t"
521 "psrlq $3, %%mm3\n\t"
524 "psrlq $6, %%mm1\n\t"
525 "psrlq $6, %%mm4\n\t"
526 "pand %%mm6, %%mm1\n\t"
527 "pand %%mm6, %%mm4\n\t"
528 "psrlq $9, %%mm2\n\t"
529 "psrlq $9, %%mm5\n\t"
530 "pand %%mm7, %%mm2\n\t"
531 "pand %%mm7, %%mm5\n\t"
532 "por %%mm1, %%mm0\n\t"
533 "por %%mm4, %%mm3\n\t"
534 "por %%mm2, %%mm0\n\t"
535 "por %%mm5, %%mm3\n\t"
536 "psllq $16, %%mm3\n\t"
537 "por %%mm3, %%mm0\n\t"
538 MOVNTQ" %%mm0, %0\n\t"
539 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
544 __asm __volatile(SFENCE:::"memory");
545 __asm __volatile(EMMS:::"memory");
549 const int src= *((uint32_t*)s)++;
550 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
554 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
556 const uint8_t *s = src;
559 const uint8_t *mm_end;
561 uint16_t *d = (uint16_t *)dst;
564 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
568 ::"m"(red_15mask),"m"(green_15mask));
575 "movd 4%1, %%mm3\n\t"
576 "punpckldq 8%1, %%mm0\n\t"
577 "punpckldq 12%1, %%mm3\n\t"
578 "movq %%mm0, %%mm1\n\t"
579 "movq %%mm0, %%mm2\n\t"
580 "movq %%mm3, %%mm4\n\t"
581 "movq %%mm3, %%mm5\n\t"
582 "psllq $7, %%mm0\n\t"
583 "psllq $7, %%mm3\n\t"
584 "pand %%mm7, %%mm0\n\t"
585 "pand %%mm7, %%mm3\n\t"
586 "psrlq $6, %%mm1\n\t"
587 "psrlq $6, %%mm4\n\t"
588 "pand %%mm6, %%mm1\n\t"
589 "pand %%mm6, %%mm4\n\t"
590 "psrlq $19, %%mm2\n\t"
591 "psrlq $19, %%mm5\n\t"
594 "por %%mm1, %%mm0\n\t"
595 "por %%mm4, %%mm3\n\t"
596 "por %%mm2, %%mm0\n\t"
597 "por %%mm5, %%mm3\n\t"
598 "psllq $16, %%mm3\n\t"
599 "por %%mm3, %%mm0\n\t"
600 MOVNTQ" %%mm0, %0\n\t"
601 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
605 __asm __volatile(SFENCE:::"memory");
606 __asm __volatile(EMMS:::"memory");
610 const int src= *((uint32_t*)s)++;
611 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
615 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
617 const uint8_t *s = src;
620 const uint8_t *mm_end;
622 uint16_t *d = (uint16_t *)dst;
625 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
629 ::"m"(red_16mask),"m"(green_16mask));
636 "movd 3%1, %%mm3\n\t"
637 "punpckldq 6%1, %%mm0\n\t"
638 "punpckldq 9%1, %%mm3\n\t"
639 "movq %%mm0, %%mm1\n\t"
640 "movq %%mm0, %%mm2\n\t"
641 "movq %%mm3, %%mm4\n\t"
642 "movq %%mm3, %%mm5\n\t"
643 "psrlq $3, %%mm0\n\t"
644 "psrlq $3, %%mm3\n\t"
647 "psrlq $5, %%mm1\n\t"
648 "psrlq $5, %%mm4\n\t"
649 "pand %%mm6, %%mm1\n\t"
650 "pand %%mm6, %%mm4\n\t"
651 "psrlq $8, %%mm2\n\t"
652 "psrlq $8, %%mm5\n\t"
653 "pand %%mm7, %%mm2\n\t"
654 "pand %%mm7, %%mm5\n\t"
655 "por %%mm1, %%mm0\n\t"
656 "por %%mm4, %%mm3\n\t"
657 "por %%mm2, %%mm0\n\t"
658 "por %%mm5, %%mm3\n\t"
659 "psllq $16, %%mm3\n\t"
660 "por %%mm3, %%mm0\n\t"
661 MOVNTQ" %%mm0, %0\n\t"
662 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
666 __asm __volatile(SFENCE:::"memory");
667 __asm __volatile(EMMS:::"memory");
674 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
678 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
680 const uint8_t *s = src;
683 const uint8_t *mm_end;
685 uint16_t *d = (uint16_t *)dst;
688 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
692 ::"m"(red_16mask),"m"(green_16mask));
699 "movd 3%1, %%mm3\n\t"
700 "punpckldq 6%1, %%mm0\n\t"
701 "punpckldq 9%1, %%mm3\n\t"
702 "movq %%mm0, %%mm1\n\t"
703 "movq %%mm0, %%mm2\n\t"
704 "movq %%mm3, %%mm4\n\t"
705 "movq %%mm3, %%mm5\n\t"
706 "psllq $8, %%mm0\n\t"
707 "psllq $8, %%mm3\n\t"
708 "pand %%mm7, %%mm0\n\t"
709 "pand %%mm7, %%mm3\n\t"
710 "psrlq $5, %%mm1\n\t"
711 "psrlq $5, %%mm4\n\t"
712 "pand %%mm6, %%mm1\n\t"
713 "pand %%mm6, %%mm4\n\t"
714 "psrlq $19, %%mm2\n\t"
715 "psrlq $19, %%mm5\n\t"
718 "por %%mm1, %%mm0\n\t"
719 "por %%mm4, %%mm3\n\t"
720 "por %%mm2, %%mm0\n\t"
721 "por %%mm5, %%mm3\n\t"
722 "psllq $16, %%mm3\n\t"
723 "por %%mm3, %%mm0\n\t"
724 MOVNTQ" %%mm0, %0\n\t"
725 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
729 __asm __volatile(SFENCE:::"memory");
730 __asm __volatile(EMMS:::"memory");
737 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
741 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
743 const uint8_t *s = src;
746 const uint8_t *mm_end;
748 uint16_t *d = (uint16_t *)dst;
751 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
755 ::"m"(red_15mask),"m"(green_15mask));
762 "movd 3%1, %%mm3\n\t"
763 "punpckldq 6%1, %%mm0\n\t"
764 "punpckldq 9%1, %%mm3\n\t"
765 "movq %%mm0, %%mm1\n\t"
766 "movq %%mm0, %%mm2\n\t"
767 "movq %%mm3, %%mm4\n\t"
768 "movq %%mm3, %%mm5\n\t"
769 "psrlq $3, %%mm0\n\t"
770 "psrlq $3, %%mm3\n\t"
773 "psrlq $6, %%mm1\n\t"
774 "psrlq $6, %%mm4\n\t"
775 "pand %%mm6, %%mm1\n\t"
776 "pand %%mm6, %%mm4\n\t"
777 "psrlq $9, %%mm2\n\t"
778 "psrlq $9, %%mm5\n\t"
779 "pand %%mm7, %%mm2\n\t"
780 "pand %%mm7, %%mm5\n\t"
781 "por %%mm1, %%mm0\n\t"
782 "por %%mm4, %%mm3\n\t"
783 "por %%mm2, %%mm0\n\t"
784 "por %%mm5, %%mm3\n\t"
785 "psllq $16, %%mm3\n\t"
786 "por %%mm3, %%mm0\n\t"
787 MOVNTQ" %%mm0, %0\n\t"
788 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
792 __asm __volatile(SFENCE:::"memory");
793 __asm __volatile(EMMS:::"memory");
800 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
804 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
806 const uint8_t *s = src;
809 const uint8_t *mm_end;
811 uint16_t *d = (uint16_t *)dst;
814 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
818 ::"m"(red_15mask),"m"(green_15mask));
825 "movd 3%1, %%mm3\n\t"
826 "punpckldq 6%1, %%mm0\n\t"
827 "punpckldq 9%1, %%mm3\n\t"
828 "movq %%mm0, %%mm1\n\t"
829 "movq %%mm0, %%mm2\n\t"
830 "movq %%mm3, %%mm4\n\t"
831 "movq %%mm3, %%mm5\n\t"
832 "psllq $7, %%mm0\n\t"
833 "psllq $7, %%mm3\n\t"
834 "pand %%mm7, %%mm0\n\t"
835 "pand %%mm7, %%mm3\n\t"
836 "psrlq $6, %%mm1\n\t"
837 "psrlq $6, %%mm4\n\t"
838 "pand %%mm6, %%mm1\n\t"
839 "pand %%mm6, %%mm4\n\t"
840 "psrlq $19, %%mm2\n\t"
841 "psrlq $19, %%mm5\n\t"
844 "por %%mm1, %%mm0\n\t"
845 "por %%mm4, %%mm3\n\t"
846 "por %%mm2, %%mm0\n\t"
847 "por %%mm5, %%mm3\n\t"
848 "psllq $16, %%mm3\n\t"
849 "por %%mm3, %%mm0\n\t"
850 MOVNTQ" %%mm0, %0\n\t"
851 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
855 __asm __volatile(SFENCE:::"memory");
856 __asm __volatile(EMMS:::"memory");
863 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
868 I use here less accurate approximation by simply
869 left-shifting the input
870 value and filling the low order bits with
871 zeroes. This method improves png's
872 compression but this scheme cannot reproduce white exactly, since it does not
873 generate an all-ones maximum value; the net effect is to darken the
876 The better method should be "left bit replication":
886 | Leftmost Bits Repeated to Fill Open Bits
890 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
894 const uint16_t *mm_end;
896 uint8_t *d = (uint8_t *)dst;
897 const uint16_t *s = (uint16_t *)src;
898 end = s + src_size/2;
900 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
912 "psllq $3, %%mm0\n\t"
913 "psrlq $2, %%mm1\n\t"
914 "psrlq $7, %%mm2\n\t"
915 "movq %%mm0, %%mm3\n\t"
916 "movq %%mm1, %%mm4\n\t"
917 "movq %%mm2, %%mm5\n\t"
918 "punpcklwd %5, %%mm0\n\t"
919 "punpcklwd %5, %%mm1\n\t"
920 "punpcklwd %5, %%mm2\n\t"
921 "punpckhwd %5, %%mm3\n\t"
922 "punpckhwd %5, %%mm4\n\t"
923 "punpckhwd %5, %%mm5\n\t"
924 "psllq $8, %%mm1\n\t"
925 "psllq $16, %%mm2\n\t"
926 "por %%mm1, %%mm0\n\t"
927 "por %%mm2, %%mm0\n\t"
928 "psllq $8, %%mm4\n\t"
929 "psllq $16, %%mm5\n\t"
930 "por %%mm4, %%mm3\n\t"
931 "por %%mm5, %%mm3\n\t"
933 "movq %%mm0, %%mm6\n\t"
934 "movq %%mm3, %%mm7\n\t"
936 "movq 8%1, %%mm0\n\t"
937 "movq 8%1, %%mm1\n\t"
938 "movq 8%1, %%mm2\n\t"
942 "psllq $3, %%mm0\n\t"
943 "psrlq $2, %%mm1\n\t"
944 "psrlq $7, %%mm2\n\t"
945 "movq %%mm0, %%mm3\n\t"
946 "movq %%mm1, %%mm4\n\t"
947 "movq %%mm2, %%mm5\n\t"
948 "punpcklwd %5, %%mm0\n\t"
949 "punpcklwd %5, %%mm1\n\t"
950 "punpcklwd %5, %%mm2\n\t"
951 "punpckhwd %5, %%mm3\n\t"
952 "punpckhwd %5, %%mm4\n\t"
953 "punpckhwd %5, %%mm5\n\t"
954 "psllq $8, %%mm1\n\t"
955 "psllq $16, %%mm2\n\t"
956 "por %%mm1, %%mm0\n\t"
957 "por %%mm2, %%mm0\n\t"
958 "psllq $8, %%mm4\n\t"
959 "psllq $16, %%mm5\n\t"
960 "por %%mm4, %%mm3\n\t"
961 "por %%mm5, %%mm3\n\t"
964 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
966 /* Borrowed 32 to 24 */
968 "movq %%mm0, %%mm4\n\t"
969 "movq %%mm3, %%mm5\n\t"
970 "movq %%mm6, %%mm0\n\t"
971 "movq %%mm7, %%mm1\n\t"
973 "movq %%mm4, %%mm6\n\t"
974 "movq %%mm5, %%mm7\n\t"
975 "movq %%mm0, %%mm2\n\t"
976 "movq %%mm1, %%mm3\n\t"
978 "psrlq $8, %%mm2\n\t"
979 "psrlq $8, %%mm3\n\t"
980 "psrlq $8, %%mm6\n\t"
981 "psrlq $8, %%mm7\n\t"
990 "por %%mm2, %%mm0\n\t"
991 "por %%mm3, %%mm1\n\t"
992 "por %%mm6, %%mm4\n\t"
993 "por %%mm7, %%mm5\n\t"
995 "movq %%mm1, %%mm2\n\t"
996 "movq %%mm4, %%mm3\n\t"
997 "psllq $48, %%mm2\n\t"
998 "psllq $32, %%mm3\n\t"
1000 "pand %5, %%mm3\n\t"
1001 "por %%mm2, %%mm0\n\t"
1002 "psrlq $16, %%mm1\n\t"
1003 "psrlq $32, %%mm4\n\t"
1004 "psllq $16, %%mm5\n\t"
1005 "por %%mm3, %%mm1\n\t"
1006 "pand %6, %%mm5\n\t"
1007 "por %%mm5, %%mm4\n\t"
1009 MOVNTQ" %%mm0, %0\n\t"
1010 MOVNTQ" %%mm1, 8%0\n\t"
1011 MOVNTQ" %%mm4, 16%0"
1014 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1019 __asm __volatile(SFENCE:::"memory");
1020 __asm __volatile(EMMS:::"memory");
1024 register uint16_t bgr;
1026 *d++ = (bgr&0x1F)<<3;
1027 *d++ = (bgr&0x3E0)>>2;
1028 *d++ = (bgr&0x7C00)>>7;
1032 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1034 const uint16_t *end;
1036 const uint16_t *mm_end;
1038 uint8_t *d = (uint8_t *)dst;
1039 const uint16_t *s = (const uint16_t *)src;
1040 end = s + src_size/2;
1042 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1048 "movq %1, %%mm0\n\t"
1049 "movq %1, %%mm1\n\t"
1050 "movq %1, %%mm2\n\t"
1051 "pand %2, %%mm0\n\t"
1052 "pand %3, %%mm1\n\t"
1053 "pand %4, %%mm2\n\t"
1054 "psllq $3, %%mm0\n\t"
1055 "psrlq $3, %%mm1\n\t"
1056 "psrlq $8, %%mm2\n\t"
1057 "movq %%mm0, %%mm3\n\t"
1058 "movq %%mm1, %%mm4\n\t"
1059 "movq %%mm2, %%mm5\n\t"
1060 "punpcklwd %5, %%mm0\n\t"
1061 "punpcklwd %5, %%mm1\n\t"
1062 "punpcklwd %5, %%mm2\n\t"
1063 "punpckhwd %5, %%mm3\n\t"
1064 "punpckhwd %5, %%mm4\n\t"
1065 "punpckhwd %5, %%mm5\n\t"
1066 "psllq $8, %%mm1\n\t"
1067 "psllq $16, %%mm2\n\t"
1068 "por %%mm1, %%mm0\n\t"
1069 "por %%mm2, %%mm0\n\t"
1070 "psllq $8, %%mm4\n\t"
1071 "psllq $16, %%mm5\n\t"
1072 "por %%mm4, %%mm3\n\t"
1073 "por %%mm5, %%mm3\n\t"
1075 "movq %%mm0, %%mm6\n\t"
1076 "movq %%mm3, %%mm7\n\t"
1078 "movq 8%1, %%mm0\n\t"
1079 "movq 8%1, %%mm1\n\t"
1080 "movq 8%1, %%mm2\n\t"
1081 "pand %2, %%mm0\n\t"
1082 "pand %3, %%mm1\n\t"
1083 "pand %4, %%mm2\n\t"
1084 "psllq $3, %%mm0\n\t"
1085 "psrlq $3, %%mm1\n\t"
1086 "psrlq $8, %%mm2\n\t"
1087 "movq %%mm0, %%mm3\n\t"
1088 "movq %%mm1, %%mm4\n\t"
1089 "movq %%mm2, %%mm5\n\t"
1090 "punpcklwd %5, %%mm0\n\t"
1091 "punpcklwd %5, %%mm1\n\t"
1092 "punpcklwd %5, %%mm2\n\t"
1093 "punpckhwd %5, %%mm3\n\t"
1094 "punpckhwd %5, %%mm4\n\t"
1095 "punpckhwd %5, %%mm5\n\t"
1096 "psllq $8, %%mm1\n\t"
1097 "psllq $16, %%mm2\n\t"
1098 "por %%mm1, %%mm0\n\t"
1099 "por %%mm2, %%mm0\n\t"
1100 "psllq $8, %%mm4\n\t"
1101 "psllq $16, %%mm5\n\t"
1102 "por %%mm4, %%mm3\n\t"
1103 "por %%mm5, %%mm3\n\t"
1105 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1107 /* Borrowed 32 to 24 */
1109 "movq %%mm0, %%mm4\n\t"
1110 "movq %%mm3, %%mm5\n\t"
1111 "movq %%mm6, %%mm0\n\t"
1112 "movq %%mm7, %%mm1\n\t"
1114 "movq %%mm4, %%mm6\n\t"
1115 "movq %%mm5, %%mm7\n\t"
1116 "movq %%mm0, %%mm2\n\t"
1117 "movq %%mm1, %%mm3\n\t"
1119 "psrlq $8, %%mm2\n\t"
1120 "psrlq $8, %%mm3\n\t"
1121 "psrlq $8, %%mm6\n\t"
1122 "psrlq $8, %%mm7\n\t"
1123 "pand %2, %%mm0\n\t"
1124 "pand %2, %%mm1\n\t"
1125 "pand %2, %%mm4\n\t"
1126 "pand %2, %%mm5\n\t"
1127 "pand %3, %%mm2\n\t"
1128 "pand %3, %%mm3\n\t"
1129 "pand %3, %%mm6\n\t"
1130 "pand %3, %%mm7\n\t"
1131 "por %%mm2, %%mm0\n\t"
1132 "por %%mm3, %%mm1\n\t"
1133 "por %%mm6, %%mm4\n\t"
1134 "por %%mm7, %%mm5\n\t"
1136 "movq %%mm1, %%mm2\n\t"
1137 "movq %%mm4, %%mm3\n\t"
1138 "psllq $48, %%mm2\n\t"
1139 "psllq $32, %%mm3\n\t"
1140 "pand %4, %%mm2\n\t"
1141 "pand %5, %%mm3\n\t"
1142 "por %%mm2, %%mm0\n\t"
1143 "psrlq $16, %%mm1\n\t"
1144 "psrlq $32, %%mm4\n\t"
1145 "psllq $16, %%mm5\n\t"
1146 "por %%mm3, %%mm1\n\t"
1147 "pand %6, %%mm5\n\t"
1148 "por %%mm5, %%mm4\n\t"
1150 MOVNTQ" %%mm0, %0\n\t"
1151 MOVNTQ" %%mm1, 8%0\n\t"
1152 MOVNTQ" %%mm4, 16%0"
1155 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1160 __asm __volatile(SFENCE:::"memory");
1161 __asm __volatile(EMMS:::"memory");
1165 register uint16_t bgr;
1167 *d++ = (bgr&0x1F)<<3;
1168 *d++ = (bgr&0x7E0)>>3;
1169 *d++ = (bgr&0xF800)>>8;
1173 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1175 const uint16_t *end;
1177 const uint16_t *mm_end;
1179 uint8_t *d = (uint8_t *)dst;
1180 const uint16_t *s = (const uint16_t *)src;
1181 end = s + src_size/2;
1183 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1184 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1190 "movq %1, %%mm0\n\t"
1191 "movq %1, %%mm1\n\t"
1192 "movq %1, %%mm2\n\t"
1193 "pand %2, %%mm0\n\t"
1194 "pand %3, %%mm1\n\t"
1195 "pand %4, %%mm2\n\t"
1196 "psllq $3, %%mm0\n\t"
1197 "psrlq $2, %%mm1\n\t"
1198 "psrlq $7, %%mm2\n\t"
1199 "movq %%mm0, %%mm3\n\t"
1200 "movq %%mm1, %%mm4\n\t"
1201 "movq %%mm2, %%mm5\n\t"
1202 "punpcklwd %%mm7, %%mm0\n\t"
1203 "punpcklwd %%mm7, %%mm1\n\t"
1204 "punpcklwd %%mm7, %%mm2\n\t"
1205 "punpckhwd %%mm7, %%mm3\n\t"
1206 "punpckhwd %%mm7, %%mm4\n\t"
1207 "punpckhwd %%mm7, %%mm5\n\t"
1208 "psllq $8, %%mm1\n\t"
1209 "psllq $16, %%mm2\n\t"
1210 "por %%mm1, %%mm0\n\t"
1211 "por %%mm2, %%mm0\n\t"
1212 "psllq $8, %%mm4\n\t"
1213 "psllq $16, %%mm5\n\t"
1214 "por %%mm4, %%mm3\n\t"
1215 "por %%mm5, %%mm3\n\t"
1216 MOVNTQ" %%mm0, %0\n\t"
1217 MOVNTQ" %%mm3, 8%0\n\t"
1219 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1224 __asm __volatile(SFENCE:::"memory");
1225 __asm __volatile(EMMS:::"memory");
1229 #if 0 //slightly slower on athlon
1231 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1233 //FIXME this is very likely wrong for bigendian (and the following converters too)
1234 register uint16_t bgr;
1236 *d++ = (bgr&0x1F)<<3;
1237 *d++ = (bgr&0x3E0)>>2;
1238 *d++ = (bgr&0x7C00)>>7;
1244 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1246 const uint16_t *end;
1248 const uint16_t *mm_end;
1250 uint8_t *d = (uint8_t *)dst;
1251 const uint16_t *s = (uint16_t *)src;
1252 end = s + src_size/2;
1254 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1255 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1261 "movq %1, %%mm0\n\t"
1262 "movq %1, %%mm1\n\t"
1263 "movq %1, %%mm2\n\t"
1264 "pand %2, %%mm0\n\t"
1265 "pand %3, %%mm1\n\t"
1266 "pand %4, %%mm2\n\t"
1267 "psllq $3, %%mm0\n\t"
1268 "psrlq $3, %%mm1\n\t"
1269 "psrlq $8, %%mm2\n\t"
1270 "movq %%mm0, %%mm3\n\t"
1271 "movq %%mm1, %%mm4\n\t"
1272 "movq %%mm2, %%mm5\n\t"
1273 "punpcklwd %%mm7, %%mm0\n\t"
1274 "punpcklwd %%mm7, %%mm1\n\t"
1275 "punpcklwd %%mm7, %%mm2\n\t"
1276 "punpckhwd %%mm7, %%mm3\n\t"
1277 "punpckhwd %%mm7, %%mm4\n\t"
1278 "punpckhwd %%mm7, %%mm5\n\t"
1279 "psllq $8, %%mm1\n\t"
1280 "psllq $16, %%mm2\n\t"
1281 "por %%mm1, %%mm0\n\t"
1282 "por %%mm2, %%mm0\n\t"
1283 "psllq $8, %%mm4\n\t"
1284 "psllq $16, %%mm5\n\t"
1285 "por %%mm4, %%mm3\n\t"
1286 "por %%mm5, %%mm3\n\t"
1287 MOVNTQ" %%mm0, %0\n\t"
1288 MOVNTQ" %%mm3, 8%0\n\t"
1290 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1295 __asm __volatile(SFENCE:::"memory");
1296 __asm __volatile(EMMS:::"memory");
1300 register uint16_t bgr;
1302 *d++ = (bgr&0x1F)<<3;
1303 *d++ = (bgr&0x7E0)>>3;
1304 *d++ = (bgr&0xF800)>>8;
1309 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1312 /* TODO: unroll this loop */
1314 "xorl %%eax, %%eax \n\t"
1317 PREFETCH" 32(%0, %%eax) \n\t"
1318 "movq (%0, %%eax), %%mm0 \n\t"
1319 "movq %%mm0, %%mm1 \n\t"
1320 "movq %%mm0, %%mm2 \n\t"
1321 "pslld $16, %%mm0 \n\t"
1322 "psrld $16, %%mm1 \n\t"
1323 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1324 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1325 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1326 "por %%mm0, %%mm2 \n\t"
1327 "por %%mm1, %%mm2 \n\t"
1328 MOVNTQ" %%mm2, (%1, %%eax) \n\t"
1329 "addl $8, %%eax \n\t"
1330 "cmpl %2, %%eax \n\t"
1332 :: "r" (src), "r"(dst), "r" (src_size-7)
1336 __asm __volatile(SFENCE:::"memory");
1337 __asm __volatile(EMMS:::"memory");
1340 unsigned num_pixels = src_size >> 2;
1341 for(i=0; i<num_pixels; i++)
1343 #ifdef WORDS_BIGENDIAN
1344 dst[4*i + 1] = src[4*i + 3];
1345 dst[4*i + 2] = src[4*i + 2];
1346 dst[4*i + 3] = src[4*i + 1];
1348 dst[4*i + 0] = src[4*i + 2];
1349 dst[4*i + 1] = src[4*i + 1];
1350 dst[4*i + 2] = src[4*i + 0];
1356 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1360 int mmx_size= 23 - src_size;
1362 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1363 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1364 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1367 PREFETCH" 32(%1, %%eax) \n\t"
1368 "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1369 "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
1370 "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
1371 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1372 "pand %%mm5, %%mm0 \n\t"
1373 "pand %%mm6, %%mm1 \n\t"
1374 "pand %%mm7, %%mm2 \n\t"
1375 "por %%mm0, %%mm1 \n\t"
1376 "por %%mm2, %%mm1 \n\t"
1377 "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1378 MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
1379 "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
1380 "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
1381 "pand %%mm7, %%mm0 \n\t"
1382 "pand %%mm5, %%mm1 \n\t"
1383 "pand %%mm6, %%mm2 \n\t"
1384 "por %%mm0, %%mm1 \n\t"
1385 "por %%mm2, %%mm1 \n\t"
1386 "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
1387 MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
1388 "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
1389 "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
1390 "pand %%mm6, %%mm0 \n\t"
1391 "pand %%mm7, %%mm1 \n\t"
1392 "pand %%mm5, %%mm2 \n\t"
1393 "por %%mm0, %%mm1 \n\t"
1394 "por %%mm2, %%mm1 \n\t"
1395 MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
1396 "addl $24, %%eax \n\t"
1399 : "r" (src-mmx_size), "r"(dst-mmx_size)
1402 __asm __volatile(SFENCE:::"memory");
1403 __asm __volatile(EMMS:::"memory");
1405 if(mmx_size==23) return; //finihsed, was multiple of 8
1409 src_size= 23-mmx_size;
1413 for(i=0; i<src_size; i+=3)
1417 dst[i + 1] = src[i + 1];
1418 dst[i + 2] = src[i + 0];
1423 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1424 unsigned int width, unsigned int height,
1425 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1428 const unsigned chromWidth= width>>1;
1429 for(y=0; y<height; y++)
1432 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1434 "xorl %%eax, %%eax \n\t"
1437 PREFETCH" 32(%1, %%eax, 2) \n\t"
1438 PREFETCH" 32(%2, %%eax) \n\t"
1439 PREFETCH" 32(%3, %%eax) \n\t"
1440 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
1441 "movq %%mm0, %%mm2 \n\t" // U(0)
1442 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
1443 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1444 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1446 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
1447 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
1448 "movq %%mm3, %%mm4 \n\t" // Y(0)
1449 "movq %%mm5, %%mm6 \n\t" // Y(8)
1450 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1451 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1452 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1453 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1455 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
1456 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
1457 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
1458 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1460 "addl $8, %%eax \n\t"
1461 "cmpl %4, %%eax \n\t"
1463 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1468 #if defined ARCH_ALPHA && defined HAVE_MVI
1469 #define pl2yuy2(n) \
1474 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1475 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1476 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1477 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1478 yuv1 = (u << 8) + (v << 24); \
1485 uint64_t *qdst = (uint64_t *) dst;
1486 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1487 const uint32_t *yc = (uint32_t *) ysrc;
1488 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1489 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1490 for(i = 0; i < chromWidth; i += 8){
1491 uint64_t y1, y2, yuv1, yuv2;
1494 asm("ldq $31,64(%0)" :: "r"(yc));
1495 asm("ldq $31,64(%0)" :: "r"(yc2));
1496 asm("ldq $31,64(%0)" :: "r"(uc));
1497 asm("ldq $31,64(%0)" :: "r"(vc));
1515 #elif __WORDSIZE >= 64
1517 uint64_t *ldst = (uint64_t *) dst;
1518 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1519 for(i = 0; i < chromWidth; i += 2){
1521 k = yc[0] + (uc[0] << 8) +
1522 (yc[1] << 16) + (vc[0] << 24);
1523 l = yc[2] + (uc[1] << 8) +
1524 (yc[3] << 16) + (vc[1] << 24);
1525 *ldst++ = k + (l << 32);
1532 int i, *idst = (int32_t *) dst;
1533 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1534 for(i = 0; i < chromWidth; i++){
1535 *idst++ = yc[0] + (uc[0] << 8) +
1536 (yc[1] << 16) + (vc[0] << 24);
1543 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1545 usrc += chromStride;
1546 vsrc += chromStride;
1560 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1561 * problem for anyone then tell me, and ill fix it)
1563 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1564 unsigned int width, unsigned int height,
1565 int lumStride, int chromStride, int dstStride)
1567 //FIXME interpolate chroma
1568 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
/*
 * Interleave planar Y/U/V into packed UYVY (U Y V Y byte order).
 * vertLumPerChroma selects how many luma lines share one chroma line
 * (2 for 4:2:0 input, 1 for 4:2:2 input); chroma pointers only advance
 * when y reaches the last luma line of a chroma group (see bottom).
 * NOTE(review): several lines (braces, #if/#else/#endif pairs, asm
 * open/close) appear to be missing from this paste — confirm against
 * the original template source.
 */
1571 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1572 unsigned int width, unsigned int height,
1573 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1576 const unsigned chromWidth= width>>1;
1577 for(y=0; y<height; y++)
1580 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
/* MMX path: %%eax indexes chroma bytes; 8 chroma pairs (16 luma pixels,
 * 32 output bytes) are produced per iteration via byte interleaving. */
1582 "xorl %%eax, %%eax \n\t"
1585 PREFETCH" 32(%1, %%eax, 2) \n\t"
1586 PREFETCH" 32(%2, %%eax) \n\t"
1587 PREFETCH" 32(%3, %%eax) \n\t"
1588 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
1589 "movq %%mm0, %%mm2 \n\t" // U(0)
1590 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
1591 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1592 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1594 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
1595 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
1596 "movq %%mm0, %%mm4 \n\t" // Y(0)
1597 "movq %%mm2, %%mm6 \n\t" // Y(8)
/* Interleave so chroma comes first in each byte pair: U Y V Y ... */
1598 "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
1599 "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
1600 "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
1601 "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
/* Non-temporal stores: output is write-once, bypass the cache. */
1603 MOVNTQ" %%mm0, (%0, %%eax, 4) \n\t"
1604 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
1605 MOVNTQ" %%mm2, 16(%0, %%eax, 4) \n\t"
1606 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1608 "addl $8, %%eax \n\t"
1609 "cmpl %4, %%eax \n\t"
1611 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1615 //FIXME adapt the alpha asm code from yv12->yuy2
/* Portable fallback: pack two pixels (one 32-bit UYVY group) at a time;
 * on 64-bit targets two groups are combined into one 64-bit store. */
1617 #if __WORDSIZE >= 64
1619 uint64_t *ldst = (uint64_t *) dst;
1620 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1621 for(i = 0; i < chromWidth; i += 2){
1623 k = uc[0] + (yc[0] << 8) +
1624 (vc[0] << 16) + (yc[1] << 24);
1625 l = uc[1] + (yc[2] << 8) +
1626 (vc[1] << 16) + (yc[3] << 24);
1627 *ldst++ = k + (l << 32);
/* 32-bit fallback: one UYVY group per store.
 * NOTE(review): assumes little-endian byte order — confirm. */
1634 int i, *idst = (int32_t *) dst;
1635 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1636 for(i = 0; i < chromWidth; i++){
1637 *idst++ = uc[0] + (yc[0] << 8) +
1638 (vc[0] << 16) + (yc[1] << 24);
/* Advance chroma only once per vertLumPerChroma luma lines
 * (vertLumPerChroma must be a power of two for this mask test). */
1645 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1647 usrc += chromStride;
1648 vsrc += chromStride;
1662 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1663 * problem for anyone then tell me, and I'll fix it)
/*
 * Convert planar YV12 (4:2:0) into packed UYVY.
 * Thin wrapper: delegates to yuvPlanartouyvy with vertLumPerChroma = 2
 * (each chroma line reused for two luma lines; no interpolation).
 */
1665 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1666 unsigned int width, unsigned int height,
1667 int lumStride, int chromStride, int dstStride)
1669 //FIXME interpolate chroma
1670 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1675 * width should be a multiple of 16
/*
 * Convert planar 4:2:2 YUV into packed YUY2.
 * Thin wrapper: delegates to yuvPlanartoyuy2 with vertLumPerChroma = 1
 * (one chroma line per luma line — full vertical chroma resolution).
 */
1677 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1678 unsigned int width, unsigned int height,
1679 int lumStride, int chromStride, int dstStride)
1681 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1686 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1687 * problem for anyone then tell me, and I'll fix it)
/*
 * Convert packed YUY2 (Y U Y V) into planar YV12 (4:2:0).
 * Lines are processed in pairs: the first line of each pair yields both
 * luma and subsampled chroma, the second line yields luma only (its
 * chroma samples are discarded, not averaged).
 * NOTE(review): loop braces, asm open/close and #else/#endif lines
 * appear to be missing from this paste — confirm against the original.
 */
1689 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1690 unsigned int width, unsigned int height,
1691 int lumStride, int chromStride, int srcStride)
1694 const unsigned chromWidth= width>>1;
1695 for(y=0; y<height; y+=2)
/* Even line: split 16 YUYV bytes into 8 Y bytes + 4 U + 4 V per 8 pixels.
 * mm7 = 0x00FF00FF... mask selecting the low (Y) byte of each word. */
1699 "xorl %%eax, %%eax \n\t"
1700 "pcmpeqw %%mm7, %%mm7 \n\t"
1701 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1704 PREFETCH" 64(%0, %%eax, 4) \n\t"
1705 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1706 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1707 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1708 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1709 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1710 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1711 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1712 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1713 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1714 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1716 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1718 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
1719 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
1720 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1721 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1722 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1723 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1724 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1725 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1726 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1727 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1729 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
/* Separate the interleaved UV stream into the U and V planes. */
1731 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1732 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1733 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1734 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1735 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1736 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1737 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1738 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1740 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1741 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1743 "addl $8, %%eax \n\t"
1744 "cmpl %4, %%eax \n\t"
1746 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Odd line: extract luma only; chroma of this line is ignored. */
1754 "xorl %%eax, %%eax \n\t"
1757 PREFETCH" 64(%0, %%eax, 4) \n\t"
1758 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1759 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1760 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
1761 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
1762 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1763 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1764 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1765 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1766 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1767 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1769 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1770 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1772 "addl $8, %%eax \n\t"
1773 "cmpl %4, %%eax \n\t"
1776 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* C fallback, even line: demultiplex Y/U/V byte-by-byte. */
1781 for(i=0; i<chromWidth; i++)
1783 ydst[2*i+0] = src[4*i+0];
1784 udst[i] = src[4*i+1];
1785 ydst[2*i+1] = src[4*i+2];
1786 vdst[i] = src[4*i+3];
/* C fallback, odd line: luma only. */
1791 for(i=0; i<chromWidth; i++)
1793 ydst[2*i+0] = src[4*i+0];
1794 ydst[2*i+1] = src[4*i+2];
1797 udst += chromStride;
1798 vdst += chromStride;
/* Leave MMX state clean for subsequent FPU code. */
1803 asm volatile( EMMS" \n\t"
/*
 * Convert YVU9 (4:1:0) toward YV12 (4:2:0).
 * Only the luma plane copy is visible here; chroma upscaling is not
 * implemented (see the XXX below). NOTE(review): the rest of the body
 * appears to be missing from this paste — confirm against the original.
 */
1809 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1810 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1811 unsigned int width, unsigned int height, int lumStride, int chromStride)
/* NOTE(review): memcpy of width*height ignores lumStride — valid only
 * when both Y planes are contiguous (stride == width); confirm callers. */
1814 memcpy(ydst, ysrc, width*height);
1816 /* XXX: implement upscaling for U,V */
/*
 * Upscale one 8-bit plane by 2x in both directions using bilinear
 * 3:1 / 1:3 weighting ((3*a + b) >> 2). The first row is handled
 * specially (horizontal-only), interior rows blend two source rows,
 * and edge pixels are handled after each MMX run.
 * NOTE(review): loop braces, asm open/close and several #if/#else/#endif
 * lines appear to be missing from this paste — confirm against original.
 */
1819 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
/* First output row: horizontal interpolation only. */
1826 for(x=0; x<srcWidth-1; x++){
1827 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1828 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1830 dst[2*srcWidth-1]= src[srcWidth-1];
1834 for(y=1; y<srcHeight; y++){
1835 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* Process a 16-byte-aligned prefix with PAVGB; %%eax counts up from
 * -mmxSize to 0 so the pointers below are pre-offset by mmxSize. */
1836 const int mmxSize= srcWidth&~15;
1838 "movl %4, %%eax \n\t"
1840 "movq (%0, %%eax), %%mm0 \n\t"
1841 "movq (%1, %%eax), %%mm1 \n\t"
1842 "movq 1(%0, %%eax), %%mm2 \n\t"
1843 "movq 1(%1, %%eax), %%mm3 \n\t"
1844 "movq -1(%0, %%eax), %%mm4 \n\t"
1845 "movq -1(%1, %%eax), %%mm5 \n\t"
/* Two chained PAVGBs approximate the (3*a + b) >> 2 weighting. */
1846 PAVGB" %%mm0, %%mm5 \n\t"
1847 PAVGB" %%mm0, %%mm3 \n\t"
1848 PAVGB" %%mm0, %%mm5 \n\t"
1849 PAVGB" %%mm0, %%mm3 \n\t"
1850 PAVGB" %%mm1, %%mm4 \n\t"
1851 PAVGB" %%mm1, %%mm2 \n\t"
1852 PAVGB" %%mm1, %%mm4 \n\t"
1853 PAVGB" %%mm1, %%mm2 \n\t"
1854 "movq %%mm5, %%mm7 \n\t"
1855 "movq %%mm4, %%mm6 \n\t"
1856 "punpcklbw %%mm3, %%mm5 \n\t"
1857 "punpckhbw %%mm3, %%mm7 \n\t"
1858 "punpcklbw %%mm2, %%mm4 \n\t"
1859 "punpckhbw %%mm2, %%mm6 \n\t"
1861 MOVNTQ" %%mm5, (%2, %%eax, 2) \n\t"
1862 MOVNTQ" %%mm7, 8(%2, %%eax, 2) \n\t"
1863 MOVNTQ" %%mm4, (%3, %%eax, 2) \n\t"
1864 MOVNTQ" %%mm6, 8(%3, %%eax, 2) \n\t"
/* Plain movq variant (used when non-temporal stores are unavailable). */
1866 "movq %%mm5, (%2, %%eax, 2) \n\t"
1867 "movq %%mm7, 8(%2, %%eax, 2) \n\t"
1868 "movq %%mm4, (%3, %%eax, 2) \n\t"
1869 "movq %%mm6, 8(%3, %%eax, 2) \n\t"
1871 "addl $8, %%eax \n\t"
1873 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1874 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
/* Non-MMX build: no SIMD prefix; start the scalar loop at x = 0. */
1880 const int mmxSize=1;
/* Left edge: vertical interpolation only. */
1882 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1883 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
/* Scalar tail: diagonal bilinear interpolation for remaining columns. */
1885 for(x=mmxSize-1; x<srcWidth-1; x++){
1886 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1887 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1888 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1889 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
/* Right edge: vertical interpolation only. */
1891 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1892 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* Last output row: horizontal interpolation of the final source row. */
1902 for(x=0; x<srcWidth-1; x++){
1903 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1904 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1906 dst[2*srcWidth-1]= src[srcWidth-1];
1908 for(x=0; x<srcWidth; x++){
/* Leave MMX state clean for subsequent FPU code. */
1915 asm volatile( EMMS" \n\t"
1923 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1924 * problem for anyone then tell me, and I'll fix it)
1925 * chrominance data is only taken from every second line; others are ignored. FIXME: write HQ version
/*
 * Convert packed UYVY (U Y V Y) into planar YV12 (4:2:0).
 * Mirror of yuy2toyv12 with the byte roles swapped: here chroma sits in
 * the even bytes (selected by the mask) and luma in the odd bytes
 * (extracted by the shift). Lines are processed in pairs; the second
 * line contributes luma only.
 * NOTE(review): braces, asm open/close and #else/#endif lines appear
 * missing from this paste — confirm against the original.
 */
1927 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1928 unsigned int width, unsigned int height,
1929 int lumStride, int chromStride, int srcStride)
1932 const unsigned chromWidth= width>>1;
1933 for(y=0; y<height; y+=2)
/* Even line: split chroma (masked) and luma (shifted) per 8 pixels. */
1937 "xorl %%eax, %%eax \n\t"
1938 "pcmpeqw %%mm7, %%mm7 \n\t"
1939 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1942 PREFETCH" 64(%0, %%eax, 4) \n\t"
1943 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1944 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1945 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1946 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1947 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1948 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1949 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1950 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1951 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1952 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1954 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1956 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
1957 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
1958 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1959 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1960 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1961 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1962 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1963 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1964 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1965 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1967 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
/* Separate the interleaved UV stream into the U and V planes. */
1969 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1970 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1971 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1972 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1973 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1974 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1975 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1976 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1978 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1979 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1981 "addl $8, %%eax \n\t"
1982 "cmpl %4, %%eax \n\t"
1984 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* Odd line: extract luma (odd bytes) only; chroma is discarded. */
1992 "xorl %%eax, %%eax \n\t"
1995 PREFETCH" 64(%0, %%eax, 4) \n\t"
1996 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1997 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1998 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
1999 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
2000 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
2001 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
2002 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
2003 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
2004 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
2005 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
2007 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
2008 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
2010 "addl $8, %%eax \n\t"
2011 "cmpl %4, %%eax \n\t"
2014 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* C fallback, even line: demultiplex U/Y/V/Y byte-by-byte. */
2019 for(i=0; i<chromWidth; i++)
2021 udst[i] = src[4*i+0];
2022 ydst[2*i+0] = src[4*i+1];
2023 vdst[i] = src[4*i+2];
2024 ydst[2*i+1] = src[4*i+3];
/* C fallback, odd line: luma only. */
2029 for(i=0; i<chromWidth; i++)
2031 ydst[2*i+0] = src[4*i+1];
2032 ydst[2*i+1] = src[4*i+3];
2035 udst += chromStride;
2036 vdst += chromStride;
/* Leave MMX state clean for subsequent FPU code. */
2041 asm volatile( EMMS" \n\t"
2049 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
2050 * problem for anyone then tell me, and I'll fix it)
2051 * chrominance data is only taken from every second line; others are ignored in the C version. FIXME: write HQ version
/*
 * Convert packed 24-bit BGR into planar YV12 (4:2:0).
 * MMX path: one pass computes the luma plane via pmaddwd against the
 * bgr2YCoeff table; a second pass averages two source lines (and
 * horizontally adjacent pixels) and computes the subsampled U/V planes
 * via bgr2UCoeff/bgr2VCoeff. C fallback uses the RY/GY/BY... integer
 * coefficients with RGB2YUV_SHIFT.
 * Note the loop bound is height-2: the last line pair is apparently
 * left to the C fallback below (NOTE(review): confirm — the paste is
 * missing braces, #else/#endif lines and asm open/close markers).
 */
2053 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2054 unsigned int width, unsigned int height,
2055 int lumStride, int chromStride, int srcStride)
2058 const unsigned chromWidth= width>>1;
2060 for(y=0; y<height-2; y+=2)
/* Luma pass. %%eax counts pixels from -width to 0; %%ebx = 3*%%eax
 * indexes the 3-byte BGR source (pointers below are pre-offset). */
2066 "movl %2, %%eax \n\t"
2067 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
2068 "movq "MANGLE(w1111)", %%mm5 \n\t"
2069 "pxor %%mm7, %%mm7 \n\t"
2070 "leal (%%eax, %%eax, 2), %%ebx \n\t"
2073 PREFETCH" 64(%0, %%ebx) \n\t"
/* Load 4 BGR pixels, widen bytes to words, dot with Y coefficients. */
2074 "movd (%0, %%ebx), %%mm0 \n\t"
2075 "movd 3(%0, %%ebx), %%mm1 \n\t"
2076 "punpcklbw %%mm7, %%mm0 \n\t"
2077 "punpcklbw %%mm7, %%mm1 \n\t"
2078 "movd 6(%0, %%ebx), %%mm2 \n\t"
2079 "movd 9(%0, %%ebx), %%mm3 \n\t"
2080 "punpcklbw %%mm7, %%mm2 \n\t"
2081 "punpcklbw %%mm7, %%mm3 \n\t"
2082 "pmaddwd %%mm6, %%mm0 \n\t"
2083 "pmaddwd %%mm6, %%mm1 \n\t"
2084 "pmaddwd %%mm6, %%mm2 \n\t"
2085 "pmaddwd %%mm6, %%mm3 \n\t"
2086 #ifndef FAST_BGR2YV12
/* Precision variant: pre-shift the 32-bit dot products by 8. */
2087 "psrad $8, %%mm0 \n\t"
2088 "psrad $8, %%mm1 \n\t"
2089 "psrad $8, %%mm2 \n\t"
2090 "psrad $8, %%mm3 \n\t"
/* Horizontal add via pmaddwd with w1111, then scale to 8-bit range. */
2092 "packssdw %%mm1, %%mm0 \n\t"
2093 "packssdw %%mm3, %%mm2 \n\t"
2094 "pmaddwd %%mm5, %%mm0 \n\t"
2095 "pmaddwd %%mm5, %%mm2 \n\t"
2096 "packssdw %%mm2, %%mm0 \n\t"
2097 "psraw $7, %%mm0 \n\t"
/* Second group of 4 pixels, same computation. */
2099 "movd 12(%0, %%ebx), %%mm4 \n\t"
2100 "movd 15(%0, %%ebx), %%mm1 \n\t"
2101 "punpcklbw %%mm7, %%mm4 \n\t"
2102 "punpcklbw %%mm7, %%mm1 \n\t"
2103 "movd 18(%0, %%ebx), %%mm2 \n\t"
2104 "movd 21(%0, %%ebx), %%mm3 \n\t"
2105 "punpcklbw %%mm7, %%mm2 \n\t"
2106 "punpcklbw %%mm7, %%mm3 \n\t"
2107 "pmaddwd %%mm6, %%mm4 \n\t"
2108 "pmaddwd %%mm6, %%mm1 \n\t"
2109 "pmaddwd %%mm6, %%mm2 \n\t"
2110 "pmaddwd %%mm6, %%mm3 \n\t"
2111 #ifndef FAST_BGR2YV12
2112 "psrad $8, %%mm4 \n\t"
2113 "psrad $8, %%mm1 \n\t"
2114 "psrad $8, %%mm2 \n\t"
2115 "psrad $8, %%mm3 \n\t"
2117 "packssdw %%mm1, %%mm4 \n\t"
2118 "packssdw %%mm3, %%mm2 \n\t"
2119 "pmaddwd %%mm5, %%mm4 \n\t"
2120 "pmaddwd %%mm5, %%mm2 \n\t"
2121 "addl $24, %%ebx \n\t"
2122 "packssdw %%mm2, %%mm4 \n\t"
2123 "psraw $7, %%mm4 \n\t"
/* Combine 8 luma results, add the +16 luma offset, store. */
2125 "packuswb %%mm4, %%mm0 \n\t"
2126 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2128 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
2129 "addl $8, %%eax \n\t"
2131 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
/* Chroma pass: average pixels from two adjacent source lines
 * (%0 and %1), then 2x1 horizontally, before the U/V dot products.
 * %%eax counts chroma samples from -width/2... (pre-offset pointers);
 * %%ebx = 6*count indexes pairs of BGR pixels. */
2139 "movl %4, %%eax \n\t"
2140 "movq "MANGLE(w1111)", %%mm5 \n\t"
2141 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2142 "pxor %%mm7, %%mm7 \n\t"
2143 "leal (%%eax, %%eax, 2), %%ebx \n\t"
2144 "addl %%ebx, %%ebx \n\t"
2147 PREFETCH" 64(%0, %%ebx) \n\t"
2148 PREFETCH" 64(%1, %%ebx) \n\t"
2149 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* Fast path: PAVGB-average the two lines, then average the pixel with
 * its neighbour 3 bytes (one BGR pixel) away via the 24-bit shift. */
2150 "movq (%0, %%ebx), %%mm0 \n\t"
2151 "movq (%1, %%ebx), %%mm1 \n\t"
2152 "movq 6(%0, %%ebx), %%mm2 \n\t"
2153 "movq 6(%1, %%ebx), %%mm3 \n\t"
2154 PAVGB" %%mm1, %%mm0 \n\t"
2155 PAVGB" %%mm3, %%mm2 \n\t"
2156 "movq %%mm0, %%mm1 \n\t"
2157 "movq %%mm2, %%mm3 \n\t"
2158 "psrlq $24, %%mm0 \n\t"
2159 "psrlq $24, %%mm2 \n\t"
2160 PAVGB" %%mm1, %%mm0 \n\t"
2161 PAVGB" %%mm3, %%mm2 \n\t"
2162 "punpcklbw %%mm7, %%mm0 \n\t"
2163 "punpcklbw %%mm7, %%mm2 \n\t"
/* Plain-MMX path: sum the 4 contributing pixels in word precision. */
2165 "movd (%0, %%ebx), %%mm0 \n\t"
2166 "movd (%1, %%ebx), %%mm1 \n\t"
2167 "movd 3(%0, %%ebx), %%mm2 \n\t"
2168 "movd 3(%1, %%ebx), %%mm3 \n\t"
2169 "punpcklbw %%mm7, %%mm0 \n\t"
2170 "punpcklbw %%mm7, %%mm1 \n\t"
2171 "punpcklbw %%mm7, %%mm2 \n\t"
2172 "punpcklbw %%mm7, %%mm3 \n\t"
2173 "paddw %%mm1, %%mm0 \n\t"
2174 "paddw %%mm3, %%mm2 \n\t"
2175 "paddw %%mm2, %%mm0 \n\t"
2176 "movd 6(%0, %%ebx), %%mm4 \n\t"
2177 "movd 6(%1, %%ebx), %%mm1 \n\t"
2178 "movd 9(%0, %%ebx), %%mm2 \n\t"
2179 "movd 9(%1, %%ebx), %%mm3 \n\t"
2180 "punpcklbw %%mm7, %%mm4 \n\t"
2181 "punpcklbw %%mm7, %%mm1 \n\t"
2182 "punpcklbw %%mm7, %%mm2 \n\t"
2183 "punpcklbw %%mm7, %%mm3 \n\t"
2184 "paddw %%mm1, %%mm4 \n\t"
2185 "paddw %%mm3, %%mm2 \n\t"
2186 "paddw %%mm4, %%mm2 \n\t"
2187 "psrlw $2, %%mm0 \n\t"
2188 "psrlw $2, %%mm2 \n\t"
/* Dot the averaged pixels with both U (mm6) and V coefficients. */
2190 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2191 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2193 "pmaddwd %%mm0, %%mm1 \n\t"
2194 "pmaddwd %%mm2, %%mm3 \n\t"
2195 "pmaddwd %%mm6, %%mm0 \n\t"
2196 "pmaddwd %%mm6, %%mm2 \n\t"
2197 #ifndef FAST_BGR2YV12
2198 "psrad $8, %%mm0 \n\t"
2199 "psrad $8, %%mm1 \n\t"
2200 "psrad $8, %%mm2 \n\t"
2201 "psrad $8, %%mm3 \n\t"
2203 "packssdw %%mm2, %%mm0 \n\t"
2204 "packssdw %%mm3, %%mm1 \n\t"
2205 "pmaddwd %%mm5, %%mm0 \n\t"
2206 "pmaddwd %%mm5, %%mm1 \n\t"
2207 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2208 "psraw $7, %%mm0 \n\t"
2210 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* Second chroma pair, fast path (same scheme, offsets +12/+18). */
2211 "movq 12(%0, %%ebx), %%mm4 \n\t"
2212 "movq 12(%1, %%ebx), %%mm1 \n\t"
2213 "movq 18(%0, %%ebx), %%mm2 \n\t"
2214 "movq 18(%1, %%ebx), %%mm3 \n\t"
2215 PAVGB" %%mm1, %%mm4 \n\t"
2216 PAVGB" %%mm3, %%mm2 \n\t"
2217 "movq %%mm4, %%mm1 \n\t"
2218 "movq %%mm2, %%mm3 \n\t"
2219 "psrlq $24, %%mm4 \n\t"
2220 "psrlq $24, %%mm2 \n\t"
2221 PAVGB" %%mm1, %%mm4 \n\t"
2222 PAVGB" %%mm3, %%mm2 \n\t"
2223 "punpcklbw %%mm7, %%mm4 \n\t"
2224 "punpcklbw %%mm7, %%mm2 \n\t"
/* Second chroma pair, plain-MMX path. */
2226 "movd 12(%0, %%ebx), %%mm4 \n\t"
2227 "movd 12(%1, %%ebx), %%mm1 \n\t"
2228 "movd 15(%0, %%ebx), %%mm2 \n\t"
2229 "movd 15(%1, %%ebx), %%mm3 \n\t"
2230 "punpcklbw %%mm7, %%mm4 \n\t"
2231 "punpcklbw %%mm7, %%mm1 \n\t"
2232 "punpcklbw %%mm7, %%mm2 \n\t"
2233 "punpcklbw %%mm7, %%mm3 \n\t"
2234 "paddw %%mm1, %%mm4 \n\t"
2235 "paddw %%mm3, %%mm2 \n\t"
2236 "paddw %%mm2, %%mm4 \n\t"
2237 "movd 18(%0, %%ebx), %%mm5 \n\t"
2238 "movd 18(%1, %%ebx), %%mm1 \n\t"
2239 "movd 21(%0, %%ebx), %%mm2 \n\t"
2240 "movd 21(%1, %%ebx), %%mm3 \n\t"
2241 "punpcklbw %%mm7, %%mm5 \n\t"
2242 "punpcklbw %%mm7, %%mm1 \n\t"
2243 "punpcklbw %%mm7, %%mm2 \n\t"
2244 "punpcklbw %%mm7, %%mm3 \n\t"
2245 "paddw %%mm1, %%mm5 \n\t"
2246 "paddw %%mm3, %%mm2 \n\t"
2247 "paddw %%mm5, %%mm2 \n\t"
/* mm5 was clobbered above; reload the w1111 constant. */
2248 "movq "MANGLE(w1111)", %%mm5 \n\t"
2249 "psrlw $2, %%mm4 \n\t"
2250 "psrlw $2, %%mm2 \n\t"
2252 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2253 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2255 "pmaddwd %%mm4, %%mm1 \n\t"
2256 "pmaddwd %%mm2, %%mm3 \n\t"
2257 "pmaddwd %%mm6, %%mm4 \n\t"
2258 "pmaddwd %%mm6, %%mm2 \n\t"
2259 #ifndef FAST_BGR2YV12
2260 "psrad $8, %%mm4 \n\t"
2261 "psrad $8, %%mm1 \n\t"
2262 "psrad $8, %%mm2 \n\t"
2263 "psrad $8, %%mm3 \n\t"
2265 "packssdw %%mm2, %%mm4 \n\t"
2266 "packssdw %%mm3, %%mm1 \n\t"
2267 "pmaddwd %%mm5, %%mm4 \n\t"
2268 "pmaddwd %%mm5, %%mm1 \n\t"
2269 "addl $24, %%ebx \n\t"
2270 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2271 "psraw $7, %%mm4 \n\t"
/* Re-sort interleaved U/V results, add the +128 chroma offset and
 * store 4 U bytes and 4 V bytes to their planes. */
2273 "movq %%mm0, %%mm1 \n\t"
2274 "punpckldq %%mm4, %%mm0 \n\t"
2275 "punpckhdq %%mm4, %%mm1 \n\t"
2276 "packsswb %%mm1, %%mm0 \n\t"
2277 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2279 "movd %%mm0, (%2, %%eax) \n\t"
2280 "punpckhdq %%mm0, %%mm0 \n\t"
2281 "movd %%mm0, (%3, %%eax) \n\t"
2282 "addl $4, %%eax \n\t"
2284 : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2288 udst += chromStride;
2289 vdst += chromStride;
/* Leave MMX state clean for subsequent FPU code. */
2293 asm volatile( EMMS" \n\t"
/* C fallback / tail lines: integer coefficients, 4:2:0 subsampling.
 * Even line: Y plus U/V from this line only (no vertical averaging). */
2299 for(; y<height; y+=2)
2302 for(i=0; i<chromWidth; i++)
2304 unsigned int b= src[6*i+0];
2305 unsigned int g= src[6*i+1];
2306 unsigned int r= src[6*i+2];
2308 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2309 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2310 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2320 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/* Odd line: luma only; its chrominance is ignored. */
2326 for(i=0; i<chromWidth; i++)
2328 unsigned int b= src[6*i+0];
2329 unsigned int g= src[6*i+1];
2330 unsigned int r= src[6*i+2];
2332 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2340 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2343 udst += chromStride;
2344 vdst += chromStride;
/*
 * Byte-interleave two planes: dest[2k] = src1[k], dest[2k+1] = src2[k],
 * row by row with independent strides. SSE2 path handles 16 source
 * bytes per iteration, MMX path likewise (two 8-byte halves); the
 * scalar loops finish the last width&15 bytes (or everything when no
 * SIMD is available).
 * NOTE(review): braces, #if/#elif/#endif lines and asm open/close
 * appear missing from this paste — confirm against the original.
 * NOTE(review): the movdqa/movntdq path requires 16-byte alignment of
 * src1, src2 and dest — confirm callers guarantee this.
 */
2350 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2351 unsigned width, unsigned height, int src1Stride,
2352 int src2Stride, int dstStride){
2355 for(h=0; h < height; h++)
/* SSE2 variant: both xmm0 and xmm1 load the same 16 src1 bytes, then
 * punpckl/punpckh with src2 produce the low/high interleaved halves. */
2362 "xorl %%eax, %%eax \n\t"
2364 PREFETCH" 64(%1, %%eax) \n\t"
2365 PREFETCH" 64(%2, %%eax) \n\t"
2366 "movdqa (%1, %%eax), %%xmm0 \n\t"
2367 "movdqa (%1, %%eax), %%xmm1 \n\t"
2368 "movdqa (%2, %%eax), %%xmm2 \n\t"
2369 "punpcklbw %%xmm2, %%xmm0 \n\t"
2370 "punpckhbw %%xmm2, %%xmm1 \n\t"
2371 "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2372 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2373 "addl $16, %%eax \n\t"
2374 "cmpl %3, %%eax \n\t"
2376 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
/* MMX variant: same interleave with 8-byte registers. */
2381 "xorl %%eax, %%eax \n\t"
2383 PREFETCH" 64(%1, %%eax) \n\t"
2384 PREFETCH" 64(%2, %%eax) \n\t"
2385 "movq (%1, %%eax), %%mm0 \n\t"
2386 "movq 8(%1, %%eax), %%mm2 \n\t"
2387 "movq %%mm0, %%mm1 \n\t"
2388 "movq %%mm2, %%mm3 \n\t"
2389 "movq (%2, %%eax), %%mm4 \n\t"
2390 "movq 8(%2, %%eax), %%mm5 \n\t"
2391 "punpcklbw %%mm4, %%mm0 \n\t"
2392 "punpckhbw %%mm4, %%mm1 \n\t"
2393 "punpcklbw %%mm5, %%mm2 \n\t"
2394 "punpckhbw %%mm5, %%mm3 \n\t"
2395 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
2396 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
2397 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2398 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2399 "addl $16, %%eax \n\t"
2400 "cmpl %3, %%eax \n\t"
2402 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
/* Scalar tail after the SIMD loop (last width & 15 bytes). */
2406 for(w= (width&(~15)); w < width; w++)
2408 dest[2*w+0] = src1[w];
2409 dest[2*w+1] = src2[w];
/* Pure C variant: whole row. */
2412 for(w=0; w < width; w++)
2414 dest[2*w+0] = src1[w];
2415 dest[2*w+1] = src2[w];
/*
 * Upsample two chroma planes from 4:1:0 (quarter resolution) toward
 * 4:2:0: each source line is doubled vertically (y>>1 source indexing)
 * and each byte is doubled horizontally (punpcklbw/punpckhbw of a
 * register with itself duplicates every byte). The same procedure runs
 * once for src1->dst1 and once for src2->dst2; a scalar loop finishes
 * each row. NOTE(review): loop headers, braces and asm open/close
 * lines appear missing from this paste — confirm against the original.
 */
2431 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2432 uint8_t *dst1, uint8_t *dst2,
2433 unsigned width, unsigned height,
2434 int srcStride1, int srcStride2,
2435 int dstStride1, int dstStride2)
2439 w=width/2; h=height/2;
2444 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* First plane: each output line y reads source line y>>1. */
2447 const uint8_t* s1=src1+srcStride1*(y>>1);
2448 uint8_t* d=dst1+dstStride1*y;
/* 32 source bytes -> 64 output bytes per iteration; the self-unpack
 * duplicates every byte (a b c d -> a a b b c c d d). */
2455 "movq %1, %%mm0\n\t"
2456 "movq 8%1, %%mm2\n\t"
2457 "movq 16%1, %%mm4\n\t"
2458 "movq 24%1, %%mm6\n\t"
2459 "movq %%mm0, %%mm1\n\t"
2460 "movq %%mm2, %%mm3\n\t"
2461 "movq %%mm4, %%mm5\n\t"
2462 "movq %%mm6, %%mm7\n\t"
2463 "punpcklbw %%mm0, %%mm0\n\t"
2464 "punpckhbw %%mm1, %%mm1\n\t"
2465 "punpcklbw %%mm2, %%mm2\n\t"
2466 "punpckhbw %%mm3, %%mm3\n\t"
2467 "punpcklbw %%mm4, %%mm4\n\t"
2468 "punpckhbw %%mm5, %%mm5\n\t"
2469 "punpcklbw %%mm6, %%mm6\n\t"
2470 "punpckhbw %%mm7, %%mm7\n\t"
2471 MOVNTQ" %%mm0, %0\n\t"
2472 MOVNTQ" %%mm1, 8%0\n\t"
2473 MOVNTQ" %%mm2, 16%0\n\t"
2474 MOVNTQ" %%mm3, 24%0\n\t"
2475 MOVNTQ" %%mm4, 32%0\n\t"
2476 MOVNTQ" %%mm5, 40%0\n\t"
2477 MOVNTQ" %%mm6, 48%0\n\t"
2478 MOVNTQ" %%mm7, 56%0"
/* Scalar tail for the remaining columns of this row. */
2484 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* Second plane: identical procedure for src2 -> dst2. */
2487 const uint8_t* s2=src2+srcStride2*(y>>1);
2488 uint8_t* d=dst2+dstStride2*y;
2495 "movq %1, %%mm0\n\t"
2496 "movq 8%1, %%mm2\n\t"
2497 "movq 16%1, %%mm4\n\t"
2498 "movq 24%1, %%mm6\n\t"
2499 "movq %%mm0, %%mm1\n\t"
2500 "movq %%mm2, %%mm3\n\t"
2501 "movq %%mm4, %%mm5\n\t"
2502 "movq %%mm6, %%mm7\n\t"
2503 "punpcklbw %%mm0, %%mm0\n\t"
2504 "punpckhbw %%mm1, %%mm1\n\t"
2505 "punpcklbw %%mm2, %%mm2\n\t"
2506 "punpckhbw %%mm3, %%mm3\n\t"
2507 "punpcklbw %%mm4, %%mm4\n\t"
2508 "punpckhbw %%mm5, %%mm5\n\t"
2509 "punpcklbw %%mm6, %%mm6\n\t"
2510 "punpckhbw %%mm7, %%mm7\n\t"
2511 MOVNTQ" %%mm0, %0\n\t"
2512 MOVNTQ" %%mm1, 8%0\n\t"
2513 MOVNTQ" %%mm2, 16%0\n\t"
2514 MOVNTQ" %%mm3, 24%0\n\t"
2515 MOVNTQ" %%mm4, 32%0\n\t"
2516 MOVNTQ" %%mm5, 40%0\n\t"
2517 MOVNTQ" %%mm6, 48%0\n\t"
2518 MOVNTQ" %%mm7, 56%0"
2524 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2535 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2537 unsigned width, unsigned height,
2538 int srcStride1, int srcStride2,
2539 int srcStride3, int dstStride)
2542 w=width/2; h=height;
2544 const uint8_t* yp=src1+srcStride1*y;
2545 const uint8_t* up=src2+srcStride2*(y>>2);
2546 const uint8_t* vp=src3+srcStride3*(y>>2);
2547 uint8_t* d=dst+dstStride*y;
2553 PREFETCH" 32(%1, %0)\n\t"
2554 PREFETCH" 32(%2, %0)\n\t"
2555 PREFETCH" 32(%3, %0)\n\t"
2556 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2557 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2558 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2559 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2560 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2561 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2562 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2563 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2564 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2565 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2567 "movq %%mm1, %%mm6\n\t"
2568 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2569 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2570 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2571 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2572 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2574 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2575 "movq 8(%1, %0, 4), %%mm0\n\t"
2576 "movq %%mm0, %%mm3\n\t"
2577 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2578 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2579 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2580 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2582 "movq %%mm4, %%mm6\n\t"
2583 "movq 16(%1, %0, 4), %%mm0\n\t"
2584 "movq %%mm0, %%mm3\n\t"
2585 "punpcklbw %%mm5, %%mm4\n\t"
2586 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2587 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2588 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2589 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2591 "punpckhbw %%mm5, %%mm6\n\t"
2592 "movq 24(%1, %0, 4), %%mm0\n\t"
2593 "movq %%mm0, %%mm3\n\t"
2594 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2595 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2596 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2597 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2600 : "r"(yp), "r" (up), "r"(vp), "r"(d)