3 * rgb2rgb.c, Software RGB to RGB converter
4 * pluralize by Software PAL8 to RGB converter
5 * Software YUV to YUV converter
6 * Software YUV to RGB converter
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
12 #include <inttypes.h> /* for __WORDSIZE */
15 // #warning You have misconfigured system and probably will lose performance!
16 #define __WORDSIZE MP_WORDSIZE
34 #define PREFETCH "prefetch"
35 #define PREFETCHW "prefetchw"
36 #define PAVGB "pavgusb"
37 #elif defined ( HAVE_MMX2 )
38 #define PREFETCH "prefetchnta"
39 #define PREFETCHW "prefetcht0"
42 #define PREFETCH "/nop"
43 #define PREFETCHW "/nop"
47 /* On K6, femms is faster than emms. On K7, femms is directly mapped to emms. */
54 #define MOVNTQ "movntq"
55 #define SFENCE "sfence"
61 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
64 const uint8_t *s = src;
67 const uint8_t *mm_end;
71 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
73 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
79 "punpckldq 3%1, %%mm0\n\t"
81 "punpckldq 9%1, %%mm1\n\t"
82 "movd 12%1, %%mm2\n\t"
83 "punpckldq 15%1, %%mm2\n\t"
84 "movd 18%1, %%mm3\n\t"
85 "punpckldq 21%1, %%mm3\n\t"
86 "pand %%mm7, %%mm0\n\t"
87 "pand %%mm7, %%mm1\n\t"
88 "pand %%mm7, %%mm2\n\t"
89 "pand %%mm7, %%mm3\n\t"
90 MOVNTQ" %%mm0, %0\n\t"
91 MOVNTQ" %%mm1, 8%0\n\t"
92 MOVNTQ" %%mm2, 16%0\n\t"
100 __asm __volatile(SFENCE:::"memory");
101 __asm __volatile(EMMS:::"memory");
112 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
115 const uint8_t *s = src;
118 const uint8_t *mm_end;
122 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
129 "movq 8%1, %%mm1\n\t"
130 "movq 16%1, %%mm4\n\t"
131 "movq 24%1, %%mm5\n\t"
132 "movq %%mm0, %%mm2\n\t"
133 "movq %%mm1, %%mm3\n\t"
134 "movq %%mm4, %%mm6\n\t"
135 "movq %%mm5, %%mm7\n\t"
136 "psrlq $8, %%mm2\n\t"
137 "psrlq $8, %%mm3\n\t"
138 "psrlq $8, %%mm6\n\t"
139 "psrlq $8, %%mm7\n\t"
148 "por %%mm2, %%mm0\n\t"
149 "por %%mm3, %%mm1\n\t"
150 "por %%mm6, %%mm4\n\t"
151 "por %%mm7, %%mm5\n\t"
153 "movq %%mm1, %%mm2\n\t"
154 "movq %%mm4, %%mm3\n\t"
155 "psllq $48, %%mm2\n\t"
156 "psllq $32, %%mm3\n\t"
159 "por %%mm2, %%mm0\n\t"
160 "psrlq $16, %%mm1\n\t"
161 "psrlq $32, %%mm4\n\t"
162 "psllq $16, %%mm5\n\t"
163 "por %%mm3, %%mm1\n\t"
165 "por %%mm5, %%mm4\n\t"
167 MOVNTQ" %%mm0, %0\n\t"
168 MOVNTQ" %%mm1, 8%0\n\t"
171 :"m"(*s),"m"(mask24l),
172 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
177 __asm __volatile(SFENCE:::"memory");
178 __asm __volatile(EMMS:::"memory");
190 Original by Strepto/Astral
191 ported to gcc & bugfixed : A'rpi
192 MMX2, 3DNOW optimization by Nick Kurshev
193 32bit c version, and and&add trick by Michael Niedermayer
195 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
197 register const uint8_t* s=src;
198 register uint8_t* d=dst;
199 register const uint8_t *end;
200 const uint8_t *mm_end;
203 __asm __volatile(PREFETCH" %0"::"m"(*s));
204 __asm __volatile("movq %0, %%mm4"::"m"(mask15s));
211 "movq 8%1, %%mm2\n\t"
212 "movq %%mm0, %%mm1\n\t"
213 "movq %%mm2, %%mm3\n\t"
214 "pand %%mm4, %%mm0\n\t"
215 "pand %%mm4, %%mm2\n\t"
216 "paddw %%mm1, %%mm0\n\t"
217 "paddw %%mm3, %%mm2\n\t"
218 MOVNTQ" %%mm0, %0\n\t"
226 __asm __volatile(SFENCE:::"memory");
227 __asm __volatile(EMMS:::"memory");
232 register unsigned x= *((uint32_t *)s);
233 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
239 register unsigned short x= *((uint16_t *)s);
240 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
244 static inline void RENAME(rgb16to15)(const uint8_t *src,uint8_t *dst,unsigned src_size)
246 register const uint8_t* s=src;
247 register uint8_t* d=dst;
248 register const uint8_t *end;
249 const uint8_t *mm_end;
252 __asm __volatile(PREFETCH" %0"::"m"(*s));
253 __asm __volatile("movq %0, %%mm7"::"m"(mask15rg));
254 __asm __volatile("movq %0, %%mm6"::"m"(mask15b));
261 "movq 8%1, %%mm2\n\t"
262 "movq %%mm0, %%mm1\n\t"
263 "movq %%mm2, %%mm3\n\t"
264 "psrlq $1, %%mm0\n\t"
265 "psrlq $1, %%mm2\n\t"
266 "pand %%mm7, %%mm0\n\t"
267 "pand %%mm7, %%mm2\n\t"
268 "pand %%mm6, %%mm1\n\t"
269 "pand %%mm6, %%mm3\n\t"
270 "por %%mm1, %%mm0\n\t"
271 "por %%mm3, %%mm2\n\t"
272 MOVNTQ" %%mm0, %0\n\t"
280 __asm __volatile(SFENCE:::"memory");
281 __asm __volatile(EMMS:::"memory");
286 register uint32_t x= *((uint32_t *)s);
287 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
293 register uint16_t x= *((uint16_t *)s);
294 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
300 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
302 const uint8_t *s = src;
305 const uint8_t *mm_end;
307 uint16_t *d = (uint16_t *)dst;
311 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
313 "movq %3, %%mm5 \n\t"
314 "movq %4, %%mm6 \n\t"
315 "movq %5, %%mm7 \n\t"
318 PREFETCH" 32(%1) \n\t"
319 "movd (%1), %%mm0 \n\t"
320 "movd 4(%1), %%mm3 \n\t"
321 "punpckldq 8(%1), %%mm0 \n\t"
322 "punpckldq 12(%1), %%mm3 \n\t"
323 "movq %%mm0, %%mm1 \n\t"
324 "movq %%mm3, %%mm4 \n\t"
325 "pand %%mm6, %%mm0 \n\t"
326 "pand %%mm6, %%mm3 \n\t"
327 "pmaddwd %%mm7, %%mm0 \n\t"
328 "pmaddwd %%mm7, %%mm3 \n\t"
329 "pand %%mm5, %%mm1 \n\t"
330 "pand %%mm5, %%mm4 \n\t"
331 "por %%mm1, %%mm0 \n\t"
332 "por %%mm4, %%mm3 \n\t"
333 "psrld $5, %%mm0 \n\t"
334 "pslld $11, %%mm3 \n\t"
335 "por %%mm3, %%mm0 \n\t"
336 MOVNTQ" %%mm0, (%0) \n\t"
342 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
345 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
349 ::"m"(red_16mask),"m"(green_16mask));
355 "movd 4%1, %%mm3\n\t"
356 "punpckldq 8%1, %%mm0\n\t"
357 "punpckldq 12%1, %%mm3\n\t"
358 "movq %%mm0, %%mm1\n\t"
359 "movq %%mm0, %%mm2\n\t"
360 "movq %%mm3, %%mm4\n\t"
361 "movq %%mm3, %%mm5\n\t"
362 "psrlq $3, %%mm0\n\t"
363 "psrlq $3, %%mm3\n\t"
366 "psrlq $5, %%mm1\n\t"
367 "psrlq $5, %%mm4\n\t"
368 "pand %%mm6, %%mm1\n\t"
369 "pand %%mm6, %%mm4\n\t"
370 "psrlq $8, %%mm2\n\t"
371 "psrlq $8, %%mm5\n\t"
372 "pand %%mm7, %%mm2\n\t"
373 "pand %%mm7, %%mm5\n\t"
374 "por %%mm1, %%mm0\n\t"
375 "por %%mm4, %%mm3\n\t"
376 "por %%mm2, %%mm0\n\t"
377 "por %%mm5, %%mm3\n\t"
378 "psllq $16, %%mm3\n\t"
379 "por %%mm3, %%mm0\n\t"
380 MOVNTQ" %%mm0, %0\n\t"
381 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
386 __asm __volatile(SFENCE:::"memory");
387 __asm __volatile(EMMS:::"memory");
391 const int src= *((uint32_t*)s)++;
392 *d++ = ((src&0xFF)>>3) + ((src&0xFC00)>>5) + ((src&0xF80000)>>8);
393 // *d++ = ((src>>3)&0x1F) + ((src>>5)&0x7E0) + ((src>>8)&0xF800);
397 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
399 const uint8_t *s = src;
402 const uint8_t *mm_end;
404 uint16_t *d = (uint16_t *)dst;
407 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
411 ::"m"(red_16mask),"m"(green_16mask));
418 "movd 4%1, %%mm3\n\t"
419 "punpckldq 8%1, %%mm0\n\t"
420 "punpckldq 12%1, %%mm3\n\t"
421 "movq %%mm0, %%mm1\n\t"
422 "movq %%mm0, %%mm2\n\t"
423 "movq %%mm3, %%mm4\n\t"
424 "movq %%mm3, %%mm5\n\t"
425 "psllq $8, %%mm0\n\t"
426 "psllq $8, %%mm3\n\t"
427 "pand %%mm7, %%mm0\n\t"
428 "pand %%mm7, %%mm3\n\t"
429 "psrlq $5, %%mm1\n\t"
430 "psrlq $5, %%mm4\n\t"
431 "pand %%mm6, %%mm1\n\t"
432 "pand %%mm6, %%mm4\n\t"
433 "psrlq $19, %%mm2\n\t"
434 "psrlq $19, %%mm5\n\t"
437 "por %%mm1, %%mm0\n\t"
438 "por %%mm4, %%mm3\n\t"
439 "por %%mm2, %%mm0\n\t"
440 "por %%mm5, %%mm3\n\t"
441 "psllq $16, %%mm3\n\t"
442 "por %%mm3, %%mm0\n\t"
443 MOVNTQ" %%mm0, %0\n\t"
444 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
448 __asm __volatile(SFENCE:::"memory");
449 __asm __volatile(EMMS:::"memory");
453 const int src= *((uint32_t*)s)++;
454 *d++ = ((src&0xF8)<<8) + ((src&0xFC00)>>5) + ((src&0xF80000)>>19);
458 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
460 const uint8_t *s = src;
463 const uint8_t *mm_end;
465 uint16_t *d = (uint16_t *)dst;
469 #if 1 //is faster only if multiplies are reasonable fast (FIXME figure out on which cpus this is faster, on Athlon its slightly faster)
471 "movq %3, %%mm5 \n\t"
472 "movq %4, %%mm6 \n\t"
473 "movq %5, %%mm7 \n\t"
476 PREFETCH" 32(%1) \n\t"
477 "movd (%1), %%mm0 \n\t"
478 "movd 4(%1), %%mm3 \n\t"
479 "punpckldq 8(%1), %%mm0 \n\t"
480 "punpckldq 12(%1), %%mm3 \n\t"
481 "movq %%mm0, %%mm1 \n\t"
482 "movq %%mm3, %%mm4 \n\t"
483 "pand %%mm6, %%mm0 \n\t"
484 "pand %%mm6, %%mm3 \n\t"
485 "pmaddwd %%mm7, %%mm0 \n\t"
486 "pmaddwd %%mm7, %%mm3 \n\t"
487 "pand %%mm5, %%mm1 \n\t"
488 "pand %%mm5, %%mm4 \n\t"
489 "por %%mm1, %%mm0 \n\t"
490 "por %%mm4, %%mm3 \n\t"
491 "psrld $6, %%mm0 \n\t"
492 "pslld $10, %%mm3 \n\t"
493 "por %%mm3, %%mm0 \n\t"
494 MOVNTQ" %%mm0, (%0) \n\t"
500 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
503 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
507 ::"m"(red_15mask),"m"(green_15mask));
513 "movd 4%1, %%mm3\n\t"
514 "punpckldq 8%1, %%mm0\n\t"
515 "punpckldq 12%1, %%mm3\n\t"
516 "movq %%mm0, %%mm1\n\t"
517 "movq %%mm0, %%mm2\n\t"
518 "movq %%mm3, %%mm4\n\t"
519 "movq %%mm3, %%mm5\n\t"
520 "psrlq $3, %%mm0\n\t"
521 "psrlq $3, %%mm3\n\t"
524 "psrlq $6, %%mm1\n\t"
525 "psrlq $6, %%mm4\n\t"
526 "pand %%mm6, %%mm1\n\t"
527 "pand %%mm6, %%mm4\n\t"
528 "psrlq $9, %%mm2\n\t"
529 "psrlq $9, %%mm5\n\t"
530 "pand %%mm7, %%mm2\n\t"
531 "pand %%mm7, %%mm5\n\t"
532 "por %%mm1, %%mm0\n\t"
533 "por %%mm4, %%mm3\n\t"
534 "por %%mm2, %%mm0\n\t"
535 "por %%mm5, %%mm3\n\t"
536 "psllq $16, %%mm3\n\t"
537 "por %%mm3, %%mm0\n\t"
538 MOVNTQ" %%mm0, %0\n\t"
539 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
544 __asm __volatile(SFENCE:::"memory");
545 __asm __volatile(EMMS:::"memory");
549 const int src= *((uint32_t*)s)++;
550 *d++ = ((src&0xFF)>>3) + ((src&0xF800)>>6) + ((src&0xF80000)>>9);
554 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
556 const uint8_t *s = src;
559 const uint8_t *mm_end;
561 uint16_t *d = (uint16_t *)dst;
564 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
568 ::"m"(red_15mask),"m"(green_15mask));
575 "movd 4%1, %%mm3\n\t"
576 "punpckldq 8%1, %%mm0\n\t"
577 "punpckldq 12%1, %%mm3\n\t"
578 "movq %%mm0, %%mm1\n\t"
579 "movq %%mm0, %%mm2\n\t"
580 "movq %%mm3, %%mm4\n\t"
581 "movq %%mm3, %%mm5\n\t"
582 "psllq $7, %%mm0\n\t"
583 "psllq $7, %%mm3\n\t"
584 "pand %%mm7, %%mm0\n\t"
585 "pand %%mm7, %%mm3\n\t"
586 "psrlq $6, %%mm1\n\t"
587 "psrlq $6, %%mm4\n\t"
588 "pand %%mm6, %%mm1\n\t"
589 "pand %%mm6, %%mm4\n\t"
590 "psrlq $19, %%mm2\n\t"
591 "psrlq $19, %%mm5\n\t"
594 "por %%mm1, %%mm0\n\t"
595 "por %%mm4, %%mm3\n\t"
596 "por %%mm2, %%mm0\n\t"
597 "por %%mm5, %%mm3\n\t"
598 "psllq $16, %%mm3\n\t"
599 "por %%mm3, %%mm0\n\t"
600 MOVNTQ" %%mm0, %0\n\t"
601 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
605 __asm __volatile(SFENCE:::"memory");
606 __asm __volatile(EMMS:::"memory");
610 const int src= *((uint32_t*)s)++;
611 *d++ = ((src&0xF8)<<7) + ((src&0xF800)>>6) + ((src&0xF80000)>>19);
615 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
617 const uint8_t *s = src;
620 const uint8_t *mm_end;
622 uint16_t *d = (uint16_t *)dst;
625 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
629 ::"m"(red_16mask),"m"(green_16mask));
636 "movd 3%1, %%mm3\n\t"
637 "punpckldq 6%1, %%mm0\n\t"
638 "punpckldq 9%1, %%mm3\n\t"
639 "movq %%mm0, %%mm1\n\t"
640 "movq %%mm0, %%mm2\n\t"
641 "movq %%mm3, %%mm4\n\t"
642 "movq %%mm3, %%mm5\n\t"
643 "psrlq $3, %%mm0\n\t"
644 "psrlq $3, %%mm3\n\t"
647 "psrlq $5, %%mm1\n\t"
648 "psrlq $5, %%mm4\n\t"
649 "pand %%mm6, %%mm1\n\t"
650 "pand %%mm6, %%mm4\n\t"
651 "psrlq $8, %%mm2\n\t"
652 "psrlq $8, %%mm5\n\t"
653 "pand %%mm7, %%mm2\n\t"
654 "pand %%mm7, %%mm5\n\t"
655 "por %%mm1, %%mm0\n\t"
656 "por %%mm4, %%mm3\n\t"
657 "por %%mm2, %%mm0\n\t"
658 "por %%mm5, %%mm3\n\t"
659 "psllq $16, %%mm3\n\t"
660 "por %%mm3, %%mm0\n\t"
661 MOVNTQ" %%mm0, %0\n\t"
662 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
666 __asm __volatile(SFENCE:::"memory");
667 __asm __volatile(EMMS:::"memory");
674 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
678 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
680 const uint8_t *s = src;
683 const uint8_t *mm_end;
685 uint16_t *d = (uint16_t *)dst;
688 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
692 ::"m"(red_16mask),"m"(green_16mask));
699 "movd 3%1, %%mm3\n\t"
700 "punpckldq 6%1, %%mm0\n\t"
701 "punpckldq 9%1, %%mm3\n\t"
702 "movq %%mm0, %%mm1\n\t"
703 "movq %%mm0, %%mm2\n\t"
704 "movq %%mm3, %%mm4\n\t"
705 "movq %%mm3, %%mm5\n\t"
706 "psllq $8, %%mm0\n\t"
707 "psllq $8, %%mm3\n\t"
708 "pand %%mm7, %%mm0\n\t"
709 "pand %%mm7, %%mm3\n\t"
710 "psrlq $5, %%mm1\n\t"
711 "psrlq $5, %%mm4\n\t"
712 "pand %%mm6, %%mm1\n\t"
713 "pand %%mm6, %%mm4\n\t"
714 "psrlq $19, %%mm2\n\t"
715 "psrlq $19, %%mm5\n\t"
718 "por %%mm1, %%mm0\n\t"
719 "por %%mm4, %%mm3\n\t"
720 "por %%mm2, %%mm0\n\t"
721 "por %%mm5, %%mm3\n\t"
722 "psllq $16, %%mm3\n\t"
723 "por %%mm3, %%mm0\n\t"
724 MOVNTQ" %%mm0, %0\n\t"
725 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
729 __asm __volatile(SFENCE:::"memory");
730 __asm __volatile(EMMS:::"memory");
737 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
741 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
743 const uint8_t *s = src;
746 const uint8_t *mm_end;
748 uint16_t *d = (uint16_t *)dst;
751 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
755 ::"m"(red_15mask),"m"(green_15mask));
762 "movd 3%1, %%mm3\n\t"
763 "punpckldq 6%1, %%mm0\n\t"
764 "punpckldq 9%1, %%mm3\n\t"
765 "movq %%mm0, %%mm1\n\t"
766 "movq %%mm0, %%mm2\n\t"
767 "movq %%mm3, %%mm4\n\t"
768 "movq %%mm3, %%mm5\n\t"
769 "psrlq $3, %%mm0\n\t"
770 "psrlq $3, %%mm3\n\t"
773 "psrlq $6, %%mm1\n\t"
774 "psrlq $6, %%mm4\n\t"
775 "pand %%mm6, %%mm1\n\t"
776 "pand %%mm6, %%mm4\n\t"
777 "psrlq $9, %%mm2\n\t"
778 "psrlq $9, %%mm5\n\t"
779 "pand %%mm7, %%mm2\n\t"
780 "pand %%mm7, %%mm5\n\t"
781 "por %%mm1, %%mm0\n\t"
782 "por %%mm4, %%mm3\n\t"
783 "por %%mm2, %%mm0\n\t"
784 "por %%mm5, %%mm3\n\t"
785 "psllq $16, %%mm3\n\t"
786 "por %%mm3, %%mm0\n\t"
787 MOVNTQ" %%mm0, %0\n\t"
788 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
792 __asm __volatile(SFENCE:::"memory");
793 __asm __volatile(EMMS:::"memory");
800 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
804 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
806 const uint8_t *s = src;
809 const uint8_t *mm_end;
811 uint16_t *d = (uint16_t *)dst;
814 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
818 ::"m"(red_15mask),"m"(green_15mask));
825 "movd 3%1, %%mm3\n\t"
826 "punpckldq 6%1, %%mm0\n\t"
827 "punpckldq 9%1, %%mm3\n\t"
828 "movq %%mm0, %%mm1\n\t"
829 "movq %%mm0, %%mm2\n\t"
830 "movq %%mm3, %%mm4\n\t"
831 "movq %%mm3, %%mm5\n\t"
832 "psllq $7, %%mm0\n\t"
833 "psllq $7, %%mm3\n\t"
834 "pand %%mm7, %%mm0\n\t"
835 "pand %%mm7, %%mm3\n\t"
836 "psrlq $6, %%mm1\n\t"
837 "psrlq $6, %%mm4\n\t"
838 "pand %%mm6, %%mm1\n\t"
839 "pand %%mm6, %%mm4\n\t"
840 "psrlq $19, %%mm2\n\t"
841 "psrlq $19, %%mm5\n\t"
844 "por %%mm1, %%mm0\n\t"
845 "por %%mm4, %%mm3\n\t"
846 "por %%mm2, %%mm0\n\t"
847 "por %%mm5, %%mm3\n\t"
848 "psllq $16, %%mm3\n\t"
849 "por %%mm3, %%mm0\n\t"
850 MOVNTQ" %%mm0, %0\n\t"
851 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
855 __asm __volatile(SFENCE:::"memory");
856 __asm __volatile(EMMS:::"memory");
863 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
868 I use a less accurate approximation here, simply
869 left-shifting the input
870 value and filling the low-order bits with
871 zeroes. This method improves PNG
872 compression, but this scheme cannot reproduce white exactly, since it does not
873 generate an all-ones maximum value; the net effect is to darken the
876 The better method should be "left bit replication":
886 | Leftmost Bits Repeated to Fill Open Bits
890 static inline void RENAME(rgb15to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
894 const uint16_t *mm_end;
896 uint8_t *d = (uint8_t *)dst;
897 const uint16_t *s = (uint16_t *)src;
898 end = s + src_size/2;
900 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
912 "psllq $3, %%mm0\n\t"
913 "psrlq $2, %%mm1\n\t"
914 "psrlq $7, %%mm2\n\t"
915 "movq %%mm0, %%mm3\n\t"
916 "movq %%mm1, %%mm4\n\t"
917 "movq %%mm2, %%mm5\n\t"
918 "punpcklwd %5, %%mm0\n\t"
919 "punpcklwd %5, %%mm1\n\t"
920 "punpcklwd %5, %%mm2\n\t"
921 "punpckhwd %5, %%mm3\n\t"
922 "punpckhwd %5, %%mm4\n\t"
923 "punpckhwd %5, %%mm5\n\t"
924 "psllq $8, %%mm1\n\t"
925 "psllq $16, %%mm2\n\t"
926 "por %%mm1, %%mm0\n\t"
927 "por %%mm2, %%mm0\n\t"
928 "psllq $8, %%mm4\n\t"
929 "psllq $16, %%mm5\n\t"
930 "por %%mm4, %%mm3\n\t"
931 "por %%mm5, %%mm3\n\t"
933 "movq %%mm0, %%mm6\n\t"
934 "movq %%mm3, %%mm7\n\t"
936 "movq 8%1, %%mm0\n\t"
937 "movq 8%1, %%mm1\n\t"
938 "movq 8%1, %%mm2\n\t"
942 "psllq $3, %%mm0\n\t"
943 "psrlq $2, %%mm1\n\t"
944 "psrlq $7, %%mm2\n\t"
945 "movq %%mm0, %%mm3\n\t"
946 "movq %%mm1, %%mm4\n\t"
947 "movq %%mm2, %%mm5\n\t"
948 "punpcklwd %5, %%mm0\n\t"
949 "punpcklwd %5, %%mm1\n\t"
950 "punpcklwd %5, %%mm2\n\t"
951 "punpckhwd %5, %%mm3\n\t"
952 "punpckhwd %5, %%mm4\n\t"
953 "punpckhwd %5, %%mm5\n\t"
954 "psllq $8, %%mm1\n\t"
955 "psllq $16, %%mm2\n\t"
956 "por %%mm1, %%mm0\n\t"
957 "por %%mm2, %%mm0\n\t"
958 "psllq $8, %%mm4\n\t"
959 "psllq $16, %%mm5\n\t"
960 "por %%mm4, %%mm3\n\t"
961 "por %%mm5, %%mm3\n\t"
964 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
966 /* Borrowed 32 to 24 */
968 "movq %%mm0, %%mm4\n\t"
969 "movq %%mm3, %%mm5\n\t"
970 "movq %%mm6, %%mm0\n\t"
971 "movq %%mm7, %%mm1\n\t"
973 "movq %%mm4, %%mm6\n\t"
974 "movq %%mm5, %%mm7\n\t"
975 "movq %%mm0, %%mm2\n\t"
976 "movq %%mm1, %%mm3\n\t"
978 "psrlq $8, %%mm2\n\t"
979 "psrlq $8, %%mm3\n\t"
980 "psrlq $8, %%mm6\n\t"
981 "psrlq $8, %%mm7\n\t"
990 "por %%mm2, %%mm0\n\t"
991 "por %%mm3, %%mm1\n\t"
992 "por %%mm6, %%mm4\n\t"
993 "por %%mm7, %%mm5\n\t"
995 "movq %%mm1, %%mm2\n\t"
996 "movq %%mm4, %%mm3\n\t"
997 "psllq $48, %%mm2\n\t"
998 "psllq $32, %%mm3\n\t"
1000 "pand %5, %%mm3\n\t"
1001 "por %%mm2, %%mm0\n\t"
1002 "psrlq $16, %%mm1\n\t"
1003 "psrlq $32, %%mm4\n\t"
1004 "psllq $16, %%mm5\n\t"
1005 "por %%mm3, %%mm1\n\t"
1006 "pand %6, %%mm5\n\t"
1007 "por %%mm5, %%mm4\n\t"
1009 MOVNTQ" %%mm0, %0\n\t"
1010 MOVNTQ" %%mm1, 8%0\n\t"
1011 MOVNTQ" %%mm4, 16%0"
1014 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1019 __asm __volatile(SFENCE:::"memory");
1020 __asm __volatile(EMMS:::"memory");
1024 register uint16_t bgr;
1026 *d++ = (bgr&0x1F)<<3;
1027 *d++ = (bgr&0x3E0)>>2;
1028 *d++ = (bgr&0x7C00)>>7;
1032 static inline void RENAME(rgb16to24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1034 const uint16_t *end;
1036 const uint16_t *mm_end;
1038 uint8_t *d = (uint8_t *)dst;
1039 const uint16_t *s = (const uint16_t *)src;
1040 end = s + src_size/2;
1042 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1048 "movq %1, %%mm0\n\t"
1049 "movq %1, %%mm1\n\t"
1050 "movq %1, %%mm2\n\t"
1051 "pand %2, %%mm0\n\t"
1052 "pand %3, %%mm1\n\t"
1053 "pand %4, %%mm2\n\t"
1054 "psllq $3, %%mm0\n\t"
1055 "psrlq $3, %%mm1\n\t"
1056 "psrlq $8, %%mm2\n\t"
1057 "movq %%mm0, %%mm3\n\t"
1058 "movq %%mm1, %%mm4\n\t"
1059 "movq %%mm2, %%mm5\n\t"
1060 "punpcklwd %5, %%mm0\n\t"
1061 "punpcklwd %5, %%mm1\n\t"
1062 "punpcklwd %5, %%mm2\n\t"
1063 "punpckhwd %5, %%mm3\n\t"
1064 "punpckhwd %5, %%mm4\n\t"
1065 "punpckhwd %5, %%mm5\n\t"
1066 "psllq $8, %%mm1\n\t"
1067 "psllq $16, %%mm2\n\t"
1068 "por %%mm1, %%mm0\n\t"
1069 "por %%mm2, %%mm0\n\t"
1070 "psllq $8, %%mm4\n\t"
1071 "psllq $16, %%mm5\n\t"
1072 "por %%mm4, %%mm3\n\t"
1073 "por %%mm5, %%mm3\n\t"
1075 "movq %%mm0, %%mm6\n\t"
1076 "movq %%mm3, %%mm7\n\t"
1078 "movq 8%1, %%mm0\n\t"
1079 "movq 8%1, %%mm1\n\t"
1080 "movq 8%1, %%mm2\n\t"
1081 "pand %2, %%mm0\n\t"
1082 "pand %3, %%mm1\n\t"
1083 "pand %4, %%mm2\n\t"
1084 "psllq $3, %%mm0\n\t"
1085 "psrlq $3, %%mm1\n\t"
1086 "psrlq $8, %%mm2\n\t"
1087 "movq %%mm0, %%mm3\n\t"
1088 "movq %%mm1, %%mm4\n\t"
1089 "movq %%mm2, %%mm5\n\t"
1090 "punpcklwd %5, %%mm0\n\t"
1091 "punpcklwd %5, %%mm1\n\t"
1092 "punpcklwd %5, %%mm2\n\t"
1093 "punpckhwd %5, %%mm3\n\t"
1094 "punpckhwd %5, %%mm4\n\t"
1095 "punpckhwd %5, %%mm5\n\t"
1096 "psllq $8, %%mm1\n\t"
1097 "psllq $16, %%mm2\n\t"
1098 "por %%mm1, %%mm0\n\t"
1099 "por %%mm2, %%mm0\n\t"
1100 "psllq $8, %%mm4\n\t"
1101 "psllq $16, %%mm5\n\t"
1102 "por %%mm4, %%mm3\n\t"
1103 "por %%mm5, %%mm3\n\t"
1105 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
1107 /* Borrowed 32 to 24 */
1109 "movq %%mm0, %%mm4\n\t"
1110 "movq %%mm3, %%mm5\n\t"
1111 "movq %%mm6, %%mm0\n\t"
1112 "movq %%mm7, %%mm1\n\t"
1114 "movq %%mm4, %%mm6\n\t"
1115 "movq %%mm5, %%mm7\n\t"
1116 "movq %%mm0, %%mm2\n\t"
1117 "movq %%mm1, %%mm3\n\t"
1119 "psrlq $8, %%mm2\n\t"
1120 "psrlq $8, %%mm3\n\t"
1121 "psrlq $8, %%mm6\n\t"
1122 "psrlq $8, %%mm7\n\t"
1123 "pand %2, %%mm0\n\t"
1124 "pand %2, %%mm1\n\t"
1125 "pand %2, %%mm4\n\t"
1126 "pand %2, %%mm5\n\t"
1127 "pand %3, %%mm2\n\t"
1128 "pand %3, %%mm3\n\t"
1129 "pand %3, %%mm6\n\t"
1130 "pand %3, %%mm7\n\t"
1131 "por %%mm2, %%mm0\n\t"
1132 "por %%mm3, %%mm1\n\t"
1133 "por %%mm6, %%mm4\n\t"
1134 "por %%mm7, %%mm5\n\t"
1136 "movq %%mm1, %%mm2\n\t"
1137 "movq %%mm4, %%mm3\n\t"
1138 "psllq $48, %%mm2\n\t"
1139 "psllq $32, %%mm3\n\t"
1140 "pand %4, %%mm2\n\t"
1141 "pand %5, %%mm3\n\t"
1142 "por %%mm2, %%mm0\n\t"
1143 "psrlq $16, %%mm1\n\t"
1144 "psrlq $32, %%mm4\n\t"
1145 "psllq $16, %%mm5\n\t"
1146 "por %%mm3, %%mm1\n\t"
1147 "pand %6, %%mm5\n\t"
1148 "por %%mm5, %%mm4\n\t"
1150 MOVNTQ" %%mm0, %0\n\t"
1151 MOVNTQ" %%mm1, 8%0\n\t"
1152 MOVNTQ" %%mm4, 16%0"
1155 :"m"(*s),"m"(mask24l),"m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
1160 __asm __volatile(SFENCE:::"memory");
1161 __asm __volatile(EMMS:::"memory");
1165 register uint16_t bgr;
1167 *d++ = (bgr&0x1F)<<3;
1168 *d++ = (bgr&0x7E0)>>3;
1169 *d++ = (bgr&0xF800)>>8;
1173 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1175 const uint16_t *end;
1177 const uint16_t *mm_end;
1179 uint8_t *d = (uint8_t *)dst;
1180 const uint16_t *s = (const uint16_t *)src;
1181 end = s + src_size/2;
1183 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1184 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1190 "movq %1, %%mm0\n\t"
1191 "movq %1, %%mm1\n\t"
1192 "movq %1, %%mm2\n\t"
1193 "pand %2, %%mm0\n\t"
1194 "pand %3, %%mm1\n\t"
1195 "pand %4, %%mm2\n\t"
1196 "psllq $3, %%mm0\n\t"
1197 "psrlq $2, %%mm1\n\t"
1198 "psrlq $7, %%mm2\n\t"
1199 "movq %%mm0, %%mm3\n\t"
1200 "movq %%mm1, %%mm4\n\t"
1201 "movq %%mm2, %%mm5\n\t"
1202 "punpcklwd %%mm7, %%mm0\n\t"
1203 "punpcklwd %%mm7, %%mm1\n\t"
1204 "punpcklwd %%mm7, %%mm2\n\t"
1205 "punpckhwd %%mm7, %%mm3\n\t"
1206 "punpckhwd %%mm7, %%mm4\n\t"
1207 "punpckhwd %%mm7, %%mm5\n\t"
1208 "psllq $8, %%mm1\n\t"
1209 "psllq $16, %%mm2\n\t"
1210 "por %%mm1, %%mm0\n\t"
1211 "por %%mm2, %%mm0\n\t"
1212 "psllq $8, %%mm4\n\t"
1213 "psllq $16, %%mm5\n\t"
1214 "por %%mm4, %%mm3\n\t"
1215 "por %%mm5, %%mm3\n\t"
1216 MOVNTQ" %%mm0, %0\n\t"
1217 MOVNTQ" %%mm3, 8%0\n\t"
1219 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r)
1224 __asm __volatile(SFENCE:::"memory");
1225 __asm __volatile(EMMS:::"memory");
1229 #if 0 //slightly slower on athlon
1231 *((uint32_t*)d)++ = ((bgr&0x1F)<<3) + ((bgr&0x3E0)<<6) + ((bgr&0x7C00)<<9);
1233 //FIXME this is very likely wrong for bigendian (and the following converters too)
1234 register uint16_t bgr;
1236 *d++ = (bgr&0x1F)<<3;
1237 *d++ = (bgr&0x3E0)>>2;
1238 *d++ = (bgr&0x7C00)>>7;
1244 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, unsigned src_size)
1246 const uint16_t *end;
1248 const uint16_t *mm_end;
1250 uint8_t *d = (uint8_t *)dst;
1251 const uint16_t *s = (uint16_t *)src;
1252 end = s + src_size/2;
1254 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
1255 __asm __volatile("pxor %%mm7,%%mm7\n\t":::"memory");
1261 "movq %1, %%mm0\n\t"
1262 "movq %1, %%mm1\n\t"
1263 "movq %1, %%mm2\n\t"
1264 "pand %2, %%mm0\n\t"
1265 "pand %3, %%mm1\n\t"
1266 "pand %4, %%mm2\n\t"
1267 "psllq $3, %%mm0\n\t"
1268 "psrlq $3, %%mm1\n\t"
1269 "psrlq $8, %%mm2\n\t"
1270 "movq %%mm0, %%mm3\n\t"
1271 "movq %%mm1, %%mm4\n\t"
1272 "movq %%mm2, %%mm5\n\t"
1273 "punpcklwd %%mm7, %%mm0\n\t"
1274 "punpcklwd %%mm7, %%mm1\n\t"
1275 "punpcklwd %%mm7, %%mm2\n\t"
1276 "punpckhwd %%mm7, %%mm3\n\t"
1277 "punpckhwd %%mm7, %%mm4\n\t"
1278 "punpckhwd %%mm7, %%mm5\n\t"
1279 "psllq $8, %%mm1\n\t"
1280 "psllq $16, %%mm2\n\t"
1281 "por %%mm1, %%mm0\n\t"
1282 "por %%mm2, %%mm0\n\t"
1283 "psllq $8, %%mm4\n\t"
1284 "psllq $16, %%mm5\n\t"
1285 "por %%mm4, %%mm3\n\t"
1286 "por %%mm5, %%mm3\n\t"
1287 MOVNTQ" %%mm0, %0\n\t"
1288 MOVNTQ" %%mm3, 8%0\n\t"
1290 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r)
1295 __asm __volatile(SFENCE:::"memory");
1296 __asm __volatile(EMMS:::"memory");
1300 register uint16_t bgr;
1302 *d++ = (bgr&0x1F)<<3;
1303 *d++ = (bgr&0x7E0)>>3;
1304 *d++ = (bgr&0xF800)>>8;
1309 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1312 /* TODO: unroll this loop */
1314 "xorl %%eax, %%eax \n\t"
1317 PREFETCH" 32(%0, %%eax) \n\t"
1318 "movq (%0, %%eax), %%mm0 \n\t"
1319 "movq %%mm0, %%mm1 \n\t"
1320 "movq %%mm0, %%mm2 \n\t"
1321 "pslld $16, %%mm0 \n\t"
1322 "psrld $16, %%mm1 \n\t"
1323 "pand "MANGLE(mask32r)", %%mm0 \n\t"
1324 "pand "MANGLE(mask32g)", %%mm2 \n\t"
1325 "pand "MANGLE(mask32b)", %%mm1 \n\t"
1326 "por %%mm0, %%mm2 \n\t"
1327 "por %%mm1, %%mm2 \n\t"
1328 MOVNTQ" %%mm2, (%1, %%eax) \n\t"
1329 "addl $8, %%eax \n\t"
1330 "cmpl %2, %%eax \n\t"
1332 :: "r" (src), "r"(dst), "r" (src_size-7)
1336 __asm __volatile(SFENCE:::"memory");
1337 __asm __volatile(EMMS:::"memory");
1340 unsigned num_pixels = src_size >> 2;
1341 for(i=0; i<num_pixels; i++)
1343 #ifdef WORDS_BIGENDIAN
1344 dst[4*i + 1] = src[4*i + 3];
1345 dst[4*i + 2] = src[4*i + 2];
1346 dst[4*i + 3] = src[4*i + 1];
1348 dst[4*i + 0] = src[4*i + 2];
1349 dst[4*i + 1] = src[4*i + 1];
1350 dst[4*i + 2] = src[4*i + 0];
1356 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
1360 int mmx_size= 23 - src_size;
1362 "movq "MANGLE(mask24r)", %%mm5 \n\t"
1363 "movq "MANGLE(mask24g)", %%mm6 \n\t"
1364 "movq "MANGLE(mask24b)", %%mm7 \n\t"
1367 PREFETCH" 32(%1, %%eax) \n\t"
1368 "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1369 "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
1370 "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
1371 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
1372 "pand %%mm5, %%mm0 \n\t"
1373 "pand %%mm6, %%mm1 \n\t"
1374 "pand %%mm7, %%mm2 \n\t"
1375 "por %%mm0, %%mm1 \n\t"
1376 "por %%mm2, %%mm1 \n\t"
1377 "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
1378 MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
1379 "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
1380 "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
1381 "pand %%mm7, %%mm0 \n\t"
1382 "pand %%mm5, %%mm1 \n\t"
1383 "pand %%mm6, %%mm2 \n\t"
1384 "por %%mm0, %%mm1 \n\t"
1385 "por %%mm2, %%mm1 \n\t"
1386 "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
1387 MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
1388 "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
1389 "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
1390 "pand %%mm6, %%mm0 \n\t"
1391 "pand %%mm7, %%mm1 \n\t"
1392 "pand %%mm5, %%mm2 \n\t"
1393 "por %%mm0, %%mm1 \n\t"
1394 "por %%mm2, %%mm1 \n\t"
1395 MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
1396 "addl $24, %%eax \n\t"
1399 : "r" (src-mmx_size), "r"(dst-mmx_size)
1402 __asm __volatile(SFENCE:::"memory");
1403 __asm __volatile(EMMS:::"memory");
1405 if(mmx_size==23) return; //finihsed, was multiple of 8
1409 src_size= 23-mmx_size;
1413 for(i=0; i<src_size; i+=3)
1417 dst[i + 1] = src[i + 1];
1418 dst[i + 2] = src[i + 0];
1423 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1424 unsigned int width, unsigned int height,
1425 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
1428 const unsigned chromWidth= width>>1;
1429 for(y=0; y<height; y++)
1432 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
1434 "xorl %%eax, %%eax \n\t"
1437 PREFETCH" 32(%1, %%eax, 2) \n\t"
1438 PREFETCH" 32(%2, %%eax) \n\t"
1439 PREFETCH" 32(%3, %%eax) \n\t"
1440 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
1441 "movq %%mm0, %%mm2 \n\t" // U(0)
1442 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
1443 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1444 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
1446 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
1447 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
1448 "movq %%mm3, %%mm4 \n\t" // Y(0)
1449 "movq %%mm5, %%mm6 \n\t" // Y(8)
1450 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
1451 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
1452 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
1453 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
1455 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
1456 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
1457 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
1458 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
1460 "addl $8, %%eax \n\t"
1461 "cmpl %4, %%eax \n\t"
1463 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
1468 #if defined ARCH_ALPHA && defined HAVE_MVI
1469 #define pl2yuy2(n) \
1474 asm("unpkbw %1, %0" : "=r"(y1) : "r"(y1)); \
1475 asm("unpkbw %1, %0" : "=r"(y2) : "r"(y2)); \
1476 asm("unpkbl %1, %0" : "=r"(u) : "r"(u)); \
1477 asm("unpkbl %1, %0" : "=r"(v) : "r"(v)); \
1478 yuv1 = (u << 8) + (v << 24); \
1485 uint64_t *qdst = (uint64_t *) dst;
1486 uint64_t *qdst2 = (uint64_t *) (dst + dstStride);
1487 const uint32_t *yc = (uint32_t *) ysrc;
1488 const uint32_t *yc2 = (uint32_t *) (ysrc + lumStride);
1489 const uint16_t *uc = (uint16_t*) usrc, *vc = (uint16_t*) vsrc;
1490 for(i = 0; i < chromWidth; i += 8){
1491 uint64_t y1, y2, yuv1, yuv2;
1494 asm("ldq $31,64(%0)" :: "r"(yc));
1495 asm("ldq $31,64(%0)" :: "r"(yc2));
1496 asm("ldq $31,64(%0)" :: "r"(uc));
1497 asm("ldq $31,64(%0)" :: "r"(vc));
1515 #elif __WORDSIZE >= 64
1517 uint64_t *ldst = (uint64_t *) dst;
1518 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1519 for(i = 0; i < chromWidth; i += 2){
1521 k = yc[0] + (uc[0] << 8) +
1522 (yc[1] << 16) + (vc[0] << 24);
1523 l = yc[2] + (uc[1] << 8) +
1524 (yc[3] << 16) + (vc[1] << 24);
1525 *ldst++ = k + (l << 32);
1532 int i, *idst = (int32_t *) dst;
1533 const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
1534 for(i = 0; i < chromWidth; i++){
1535 *idst++ = yc[0] + (uc[0] << 8) +
1536 (yc[1] << 16) + (vc[0] << 24);
1543 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
1545 usrc += chromStride;
1546 vsrc += chromStride;
1560 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1561 * problem for anyone then tell me, and ill fix it)
1563 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1564 unsigned int width, unsigned int height,
1565 int lumStride, int chromStride, int dstStride)
1567 //FIXME interpolate chroma
1568 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
1573 * width should be a multiple of 16
1575 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
1576 unsigned int width, unsigned int height,
1577 int lumStride, int chromStride, int dstStride)
1579 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
1584 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1585 * problem for anyone then tell me, and ill fix it)
/*
 * Unpacks interleaved YUY2 (byte order Y0 U0 Y1 V0 ...) into planar
 * buffers: ydst gets one byte per pixel, udst/vdst get chromWidth
 * (= width/2) bytes per line pair.  Chroma is taken from the even
 * source lines only; udst/vdst advance by chromStride once per two
 * input lines (see the end of the y loop).
 */
1587 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1588 unsigned int width, unsigned int height,
1589 int lumStride, int chromStride, int srcStride)
1592 const unsigned chromWidth= width>>1;
1593 for(y=0; y<height; y+=2)
/* MMX path, even source line: split Y and the packed U/V bytes.
 * mm7 = 0x00FF00FF... mask selects the low (Y) byte of each word. */
1597 "xorl %%eax, %%eax \n\t"
1598 "pcmpeqw %%mm7, %%mm7 \n\t"
1599 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1602 PREFETCH" 64(%0, %%eax, 4) \n\t"
1603 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1604 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1605 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
1606 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
1607 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
1608 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
1609 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1610 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1611 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1612 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1614 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1616 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
1617 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
1618 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
1619 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
1620 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
1621 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
1622 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1623 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1624 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1625 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1627 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
/* Separate the interleaved UV words into U and V planes. */
1629 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1630 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1631 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1632 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1633 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1634 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1635 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1636 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1638 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1639 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1641 "addl $8, %%eax \n\t"
1642 "cmpl %4, %%eax \n\t"
1644 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* MMX path, odd source line: keep only luma, chroma is dropped. */
1652 "xorl %%eax, %%eax \n\t"
1655 PREFETCH" 64(%0, %%eax, 4) \n\t"
1656 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1657 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1658 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
1659 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
1660 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1661 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1662 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1663 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1664 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1665 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1667 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1668 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1670 "addl $8, %%eax \n\t"
1671 "cmpl %4, %%eax \n\t"
1674 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* C fallback, even line: demultiplex Y/U/V byte by byte.
 * NOTE(review): the #else that selects this branch lies outside the
 * visible lines - confirm against the full file. */
1679 for(i=0; i<chromWidth; i++)
1681 ydst[2*i+0] = src[4*i+0];
1682 udst[i] = src[4*i+1];
1683 ydst[2*i+1] = src[4*i+2];
1684 vdst[i] = src[4*i+3];
/* C fallback, odd line: luma only. */
1689 for(i=0; i<chromWidth; i++)
1691 ydst[2*i+0] = src[4*i+0];
1692 ydst[2*i+1] = src[4*i+2];
/* Chroma planes advance once per pair of source lines. */
1695 udst += chromStride;
1696 vdst += chromStride;
/* Leave MMX state so the FPU is usable again. */
1701 asm volatile( EMMS" \n\t"
/*
 * YVU9 -> YV12: only the luma plane is copied; chroma upscaling is
 * not implemented (see the XXX below).
 * NOTE(review): the memcpy assumes the luma plane is packed - it
 * ignores lumStride entirely; verify callers always pass
 * lumStride == width.
 */
1707 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
1708 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1709 unsigned int width, unsigned int height, int lumStride, int chromStride)
1712 memcpy(ydst, ysrc, width*height);
1714 /* XXX: implement upscaling for U,V */
/*
 * Upscales one 8-bit plane by 2x in both directions with a smoothing
 * filter: output samples are 3:1 blends of the two nearest source
 * samples ((3*a + b) >> 2), horizontally and vertically.
 */
1717 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
/* First output row: horizontal 3:1 blends of the first source row. */
1724 for(x=0; x<srcWidth-1; x++){
1725 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1726 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1728 dst[2*srcWidth-1]= src[srcWidth-1];
/* Interior rows: blend row y-1 (%0) with row y (%1). */
1732 for(y=1; y<srcHeight; y++){
1733 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* Process srcWidth rounded down to a multiple of 16 with MMX;
 * the scalar loop below finishes the remainder. */
1734 const int mmxSize= srcWidth&~15;
1736 "movl %4, %%eax \n\t"
1738 "movq (%0, %%eax), %%mm0 \n\t"
1739 "movq (%1, %%eax), %%mm1 \n\t"
1740 "movq 1(%0, %%eax), %%mm2 \n\t"
1741 "movq 1(%1, %%eax), %%mm3 \n\t"
1742 "movq -1(%0, %%eax), %%mm4 \n\t"
1743 "movq -1(%1, %%eax), %%mm5 \n\t"
/* Two successive pavg ops toward the same register approximate the
 * (3*a + b + 2) >> 2 blend used in the scalar code. */
1744 PAVGB" %%mm0, %%mm5 \n\t"
1745 PAVGB" %%mm0, %%mm3 \n\t"
1746 PAVGB" %%mm0, %%mm5 \n\t"
1747 PAVGB" %%mm0, %%mm3 \n\t"
1748 PAVGB" %%mm1, %%mm4 \n\t"
1749 PAVGB" %%mm1, %%mm2 \n\t"
1750 PAVGB" %%mm1, %%mm4 \n\t"
1751 PAVGB" %%mm1, %%mm2 \n\t"
/* Interleave the two horizontal phases into 2x-wide output. */
1752 "movq %%mm5, %%mm7 \n\t"
1753 "movq %%mm4, %%mm6 \n\t"
1754 "punpcklbw %%mm3, %%mm5 \n\t"
1755 "punpckhbw %%mm3, %%mm7 \n\t"
1756 "punpcklbw %%mm2, %%mm4 \n\t"
1757 "punpckhbw %%mm2, %%mm6 \n\t"
1759 MOVNTQ" %%mm5, (%2, %%eax, 2) \n\t"
1760 MOVNTQ" %%mm7, 8(%2, %%eax, 2) \n\t"
1761 MOVNTQ" %%mm4, (%3, %%eax, 2) \n\t"
1762 MOVNTQ" %%mm6, 8(%3, %%eax, 2) \n\t"
/* Non-temporal vs plain stores are selected by an #if hidden in the
 * extraction gap between lines 1762 and 1764. */
1764 "movq %%mm5, (%2, %%eax, 2) \n\t"
1765 "movq %%mm7, 8(%2, %%eax, 2) \n\t"
1766 "movq %%mm4, (%3, %%eax, 2) \n\t"
1767 "movq %%mm6, 8(%3, %%eax, 2) \n\t"
1769 "addl $8, %%eax \n\t"
1771 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
1772 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
/* Non-MMX build: no vector prefix was processed (mmxSize==1). */
1778 const int mmxSize=1;
/* First output column of the two interpolated rows. */
1780 dst[0 ]= (3*src[0] + src[srcStride])>>2;
1781 dst[dstStride]= ( src[0] + 3*src[srcStride])>>2;
/* Scalar remainder: diagonal 3:1 blends of the 2x2 neighborhood. */
1783 for(x=mmxSize-1; x<srcWidth-1; x++){
1784 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
1785 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
1786 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
1787 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
/* Last output column. */
1789 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
1790 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
/* Last output row pair: horizontal blends of the last source row. */
1800 for(x=0; x<srcWidth-1; x++){
1801 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
1802 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
1804 dst[2*srcWidth-1]= src[srcWidth-1];
1806 for(x=0; x<srcWidth; x++){
/* Leave MMX state so the FPU is usable again. */
1813 asm volatile( EMMS" \n\t"
1821 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
1822 * problem for anyone then tell me, and I'll fix it)
1823 * chrominance data is only taken from every second line, other lines are ignored FIXME write HQ version
/*
 * Unpacks interleaved UYVY (byte order U0 Y0 V0 Y1 ...) into planar
 * buffers: ydst gets one byte per pixel, udst/vdst get chromWidth
 * (= width/2) bytes per line pair.  Same structure as yuy2toyv12(),
 * only with the luma/chroma byte positions swapped (chroma is in the
 * low byte of each word here, so pand/psrlw roles are exchanged).
 */
1825 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1826 unsigned int width, unsigned int height,
1827 int lumStride, int chromStride, int srcStride)
1830 const unsigned chromWidth= width>>1;
1831 for(y=0; y<height; y+=2)
/* MMX path, even source line: split Y and the packed U/V bytes. */
1835 "xorl %%eax, %%eax \n\t"
1836 "pcmpeqw %%mm7, %%mm7 \n\t"
1837 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
1840 PREFETCH" 64(%0, %%eax, 4) \n\t"
1841 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
1842 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
1843 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
1844 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
1845 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
1846 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
1847 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
1848 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
1849 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
1850 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
1852 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
1854 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
1855 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
1856 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
1857 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
1858 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
1859 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
1860 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
1861 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
1862 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
1863 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
1865 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
/* Separate the interleaved UV words into U and V planes. */
1867 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
1868 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
1869 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
1870 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
1871 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
1872 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
1873 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
1874 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
1876 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
1877 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
1879 "addl $8, %%eax \n\t"
1880 "cmpl %4, %%eax \n\t"
1882 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* MMX path, odd source line: keep only luma, chroma is dropped. */
1890 "xorl %%eax, %%eax \n\t"
1893 PREFETCH" 64(%0, %%eax, 4) \n\t"
1894 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
1895 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
1896 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
1897 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
1898 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
1899 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
1900 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
1901 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
1902 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
1903 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
1905 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
1906 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
1908 "addl $8, %%eax \n\t"
1909 "cmpl %4, %%eax \n\t"
1912 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
/* C fallback, even line: demultiplex U/Y/V byte by byte.
 * NOTE(review): the #else that selects this branch lies outside the
 * visible lines - confirm against the full file. */
1917 for(i=0; i<chromWidth; i++)
1919 udst[i] = src[4*i+0];
1920 ydst[2*i+0] = src[4*i+1];
1921 vdst[i] = src[4*i+2];
1922 ydst[2*i+1] = src[4*i+3];
/* C fallback, odd line: luma only. */
1927 for(i=0; i<chromWidth; i++)
1929 ydst[2*i+0] = src[4*i+1];
1930 ydst[2*i+1] = src[4*i+3];
/* Chroma planes advance once per pair of source lines. */
1933 udst += chromStride;
1934 vdst += chromStride;
/* Leave MMX state so the FPU is usable again. */
1939 asm volatile( EMMS" \n\t"
1947 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
1948 * problem for anyone then tell me, and I'll fix it)
1949 * chrominance data is only taken from every second line in the C version, other lines are ignored FIXME write HQ version
/*
 * Converts packed 24bpp RGB to planar YV12.  Despite the function
 * name, the input byte order is B,G,R (the C fallback reads
 * b = src[6*i+0], g = src[6*i+1], r = src[6*i+2], and the MMX path
 * uses the bgr2* coefficient tables).  Y gets full resolution with
 * a +16 offset; U/V are averaged over 2x2 pixel blocks with a +128
 * offset, one U/V byte per block.
 */
1951 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1952 unsigned int width, unsigned int height,
1953 int lumStride, int chromStride, int srcStride)
1956 const unsigned chromWidth= width>>1;
/* MMX main loop stops 2 rows early; the C loop below finishes. */
1958 for(y=0; y<height-2; y+=2)
/* Luma pass for one row: 8 pixels per iteration.  eax counts pixels
 * up from -width; ebx = 3*eax indexes the 3-byte RGB triplets. */
1964 "movl %2, %%eax \n\t"
1965 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1966 "movq "MANGLE(w1111)", %%mm5 \n\t"
1967 "pxor %%mm7, %%mm7 \n\t"
1968 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1971 PREFETCH" 64(%0, %%ebx) \n\t"
/* Pixels 0-3: widen bytes to words, dot with Y coefficients. */
1972 "movd (%0, %%ebx), %%mm0 \n\t"
1973 "movd 3(%0, %%ebx), %%mm1 \n\t"
1974 "punpcklbw %%mm7, %%mm0 \n\t"
1975 "punpcklbw %%mm7, %%mm1 \n\t"
1976 "movd 6(%0, %%ebx), %%mm2 \n\t"
1977 "movd 9(%0, %%ebx), %%mm3 \n\t"
1978 "punpcklbw %%mm7, %%mm2 \n\t"
1979 "punpcklbw %%mm7, %%mm3 \n\t"
1980 "pmaddwd %%mm6, %%mm0 \n\t"
1981 "pmaddwd %%mm6, %%mm1 \n\t"
1982 "pmaddwd %%mm6, %%mm2 \n\t"
1983 "pmaddwd %%mm6, %%mm3 \n\t"
1984 #ifndef FAST_BGR2YV12
/* Precision variant: pre-shift the partial sums by 8. */
1985 "psrad $8, %%mm0 \n\t"
1986 "psrad $8, %%mm1 \n\t"
1987 "psrad $8, %%mm2 \n\t"
1988 "psrad $8, %%mm3 \n\t"
/* Horizontal-add the partial products via pmaddwd with w1111. */
1990 "packssdw %%mm1, %%mm0 \n\t"
1991 "packssdw %%mm3, %%mm2 \n\t"
1992 "pmaddwd %%mm5, %%mm0 \n\t"
1993 "pmaddwd %%mm5, %%mm2 \n\t"
1994 "packssdw %%mm2, %%mm0 \n\t"
1995 "psraw $7, %%mm0 \n\t"
/* Pixels 4-7: same computation into mm4. */
1997 "movd 12(%0, %%ebx), %%mm4 \n\t"
1998 "movd 15(%0, %%ebx), %%mm1 \n\t"
1999 "punpcklbw %%mm7, %%mm4 \n\t"
2000 "punpcklbw %%mm7, %%mm1 \n\t"
2001 "movd 18(%0, %%ebx), %%mm2 \n\t"
2002 "movd 21(%0, %%ebx), %%mm3 \n\t"
2003 "punpcklbw %%mm7, %%mm2 \n\t"
2004 "punpcklbw %%mm7, %%mm3 \n\t"
2005 "pmaddwd %%mm6, %%mm4 \n\t"
2006 "pmaddwd %%mm6, %%mm1 \n\t"
2007 "pmaddwd %%mm6, %%mm2 \n\t"
2008 "pmaddwd %%mm6, %%mm3 \n\t"
2009 #ifndef FAST_BGR2YV12
2010 "psrad $8, %%mm4 \n\t"
2011 "psrad $8, %%mm1 \n\t"
2012 "psrad $8, %%mm2 \n\t"
2013 "psrad $8, %%mm3 \n\t"
2015 "packssdw %%mm1, %%mm4 \n\t"
2016 "packssdw %%mm3, %%mm2 \n\t"
2017 "pmaddwd %%mm5, %%mm4 \n\t"
2018 "pmaddwd %%mm5, %%mm2 \n\t"
2019 "addl $24, %%ebx \n\t"
2020 "packssdw %%mm2, %%mm4 \n\t"
2021 "psraw $7, %%mm4 \n\t"
/* Pack 8 Y values, add the +16 luma offset, store. */
2023 "packuswb %%mm4, %%mm0 \n\t"
2024 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
2026 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
2027 "addl $8, %%eax \n\t"
2029 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
/* Chroma pass: average pixel pairs of two rows (%0 = this row,
 * %1 = next row), 4 output U/V samples per iteration.
 * ebx = 6*eax indexes 2-pixel (6-byte) groups. */
2037 "movl %4, %%eax \n\t"
2038 "movq "MANGLE(w1111)", %%mm5 \n\t"
2039 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
2040 "pxor %%mm7, %%mm7 \n\t"
2041 "leal (%%eax, %%eax, 2), %%ebx \n\t"
2042 "addl %%ebx, %%ebx \n\t"
2045 PREFETCH" 64(%0, %%ebx) \n\t"
2046 PREFETCH" 64(%1, %%ebx) \n\t"
2047 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
/* Fast 2x2 average using pavg: vertical average, then average with
 * the copy shifted by one pixel (24 bits). */
2048 "movq (%0, %%ebx), %%mm0 \n\t"
2049 "movq (%1, %%ebx), %%mm1 \n\t"
2050 "movq 6(%0, %%ebx), %%mm2 \n\t"
2051 "movq 6(%1, %%ebx), %%mm3 \n\t"
2052 PAVGB" %%mm1, %%mm0 \n\t"
2053 PAVGB" %%mm3, %%mm2 \n\t"
2054 "movq %%mm0, %%mm1 \n\t"
2055 "movq %%mm2, %%mm3 \n\t"
2056 "psrlq $24, %%mm0 \n\t"
2057 "psrlq $24, %%mm2 \n\t"
2058 PAVGB" %%mm1, %%mm0 \n\t"
2059 PAVGB" %%mm3, %%mm2 \n\t"
2060 "punpcklbw %%mm7, %%mm0 \n\t"
2061 "punpcklbw %%mm7, %%mm2 \n\t"
/* Plain-MMX 2x2 average: sum four pixels as words, shift by 2. */
2063 "movd (%0, %%ebx), %%mm0 \n\t"
2064 "movd (%1, %%ebx), %%mm1 \n\t"
2065 "movd 3(%0, %%ebx), %%mm2 \n\t"
2066 "movd 3(%1, %%ebx), %%mm3 \n\t"
2067 "punpcklbw %%mm7, %%mm0 \n\t"
2068 "punpcklbw %%mm7, %%mm1 \n\t"
2069 "punpcklbw %%mm7, %%mm2 \n\t"
2070 "punpcklbw %%mm7, %%mm3 \n\t"
2071 "paddw %%mm1, %%mm0 \n\t"
2072 "paddw %%mm3, %%mm2 \n\t"
2073 "paddw %%mm2, %%mm0 \n\t"
2074 "movd 6(%0, %%ebx), %%mm4 \n\t"
2075 "movd 6(%1, %%ebx), %%mm1 \n\t"
2076 "movd 9(%0, %%ebx), %%mm2 \n\t"
2077 "movd 9(%1, %%ebx), %%mm3 \n\t"
2078 "punpcklbw %%mm7, %%mm4 \n\t"
2079 "punpcklbw %%mm7, %%mm1 \n\t"
2080 "punpcklbw %%mm7, %%mm2 \n\t"
2081 "punpcklbw %%mm7, %%mm3 \n\t"
2082 "paddw %%mm1, %%mm4 \n\t"
2083 "paddw %%mm3, %%mm2 \n\t"
2084 "paddw %%mm4, %%mm2 \n\t"
2085 "psrlw $2, %%mm0 \n\t"
2086 "psrlw $2, %%mm2 \n\t"
/* Dot the averaged pixels with U (mm6) and V coefficients. */
2088 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2089 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2091 "pmaddwd %%mm0, %%mm1 \n\t"
2092 "pmaddwd %%mm2, %%mm3 \n\t"
2093 "pmaddwd %%mm6, %%mm0 \n\t"
2094 "pmaddwd %%mm6, %%mm2 \n\t"
2095 #ifndef FAST_BGR2YV12
2096 "psrad $8, %%mm0 \n\t"
2097 "psrad $8, %%mm1 \n\t"
2098 "psrad $8, %%mm2 \n\t"
2099 "psrad $8, %%mm3 \n\t"
2101 "packssdw %%mm2, %%mm0 \n\t"
2102 "packssdw %%mm3, %%mm1 \n\t"
2103 "pmaddwd %%mm5, %%mm0 \n\t"
2104 "pmaddwd %%mm5, %%mm1 \n\t"
2105 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
2106 "psraw $7, %%mm0 \n\t"
/* Second pair of 2x2 blocks, same computation into mm4. */
2108 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
2109 "movq 12(%0, %%ebx), %%mm4 \n\t"
2110 "movq 12(%1, %%ebx), %%mm1 \n\t"
2111 "movq 18(%0, %%ebx), %%mm2 \n\t"
2112 "movq 18(%1, %%ebx), %%mm3 \n\t"
2113 PAVGB" %%mm1, %%mm4 \n\t"
2114 PAVGB" %%mm3, %%mm2 \n\t"
2115 "movq %%mm4, %%mm1 \n\t"
2116 "movq %%mm2, %%mm3 \n\t"
2117 "psrlq $24, %%mm4 \n\t"
2118 "psrlq $24, %%mm2 \n\t"
2119 PAVGB" %%mm1, %%mm4 \n\t"
2120 PAVGB" %%mm3, %%mm2 \n\t"
2121 "punpcklbw %%mm7, %%mm4 \n\t"
2122 "punpcklbw %%mm7, %%mm2 \n\t"
2124 "movd 12(%0, %%ebx), %%mm4 \n\t"
2125 "movd 12(%1, %%ebx), %%mm1 \n\t"
2126 "movd 15(%0, %%ebx), %%mm2 \n\t"
2127 "movd 15(%1, %%ebx), %%mm3 \n\t"
2128 "punpcklbw %%mm7, %%mm4 \n\t"
2129 "punpcklbw %%mm7, %%mm1 \n\t"
2130 "punpcklbw %%mm7, %%mm2 \n\t"
2131 "punpcklbw %%mm7, %%mm3 \n\t"
2132 "paddw %%mm1, %%mm4 \n\t"
2133 "paddw %%mm3, %%mm2 \n\t"
2134 "paddw %%mm2, %%mm4 \n\t"
2135 "movd 18(%0, %%ebx), %%mm5 \n\t"
2136 "movd 18(%1, %%ebx), %%mm1 \n\t"
2137 "movd 21(%0, %%ebx), %%mm2 \n\t"
2138 "movd 21(%1, %%ebx), %%mm3 \n\t"
2139 "punpcklbw %%mm7, %%mm5 \n\t"
2140 "punpcklbw %%mm7, %%mm1 \n\t"
2141 "punpcklbw %%mm7, %%mm2 \n\t"
2142 "punpcklbw %%mm7, %%mm3 \n\t"
2143 "paddw %%mm1, %%mm5 \n\t"
2144 "paddw %%mm3, %%mm2 \n\t"
2145 "paddw %%mm5, %%mm2 \n\t"
2146 "movq "MANGLE(w1111)", %%mm5 \n\t"
2147 "psrlw $2, %%mm4 \n\t"
2148 "psrlw $2, %%mm2 \n\t"
2150 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
2151 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
2153 "pmaddwd %%mm4, %%mm1 \n\t"
2154 "pmaddwd %%mm2, %%mm3 \n\t"
2155 "pmaddwd %%mm6, %%mm4 \n\t"
2156 "pmaddwd %%mm6, %%mm2 \n\t"
2157 #ifndef FAST_BGR2YV12
2158 "psrad $8, %%mm4 \n\t"
2159 "psrad $8, %%mm1 \n\t"
2160 "psrad $8, %%mm2 \n\t"
2161 "psrad $8, %%mm3 \n\t"
2163 "packssdw %%mm2, %%mm4 \n\t"
2164 "packssdw %%mm3, %%mm1 \n\t"
2165 "pmaddwd %%mm5, %%mm4 \n\t"
2166 "pmaddwd %%mm5, %%mm1 \n\t"
2167 "addl $24, %%ebx \n\t"
2168 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
2169 "psraw $7, %%mm4 \n\t"
/* Gather U and V bytes, add the +128 chroma offset, and write 4
 * bytes to each of udst (%2) and vdst (%3). */
2171 "movq %%mm0, %%mm1 \n\t"
2172 "punpckldq %%mm4, %%mm0 \n\t"
2173 "punpckhdq %%mm4, %%mm1 \n\t"
2174 "packsswb %%mm1, %%mm0 \n\t"
2175 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
2177 "movd %%mm0, (%2, %%eax) \n\t"
2178 "punpckhdq %%mm0, %%mm0 \n\t"
2179 "movd %%mm0, (%3, %%eax) \n\t"
2180 "addl $4, %%eax \n\t"
2182 : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
2186 udst += chromStride;
2187 vdst += chromStride;
/* Leave MMX state so the FPU is usable again. */
2191 asm volatile( EMMS" \n\t"
/* C path: finishes the rows the MMX loop left over (and is the whole
 * conversion in non-MMX builds).  Integer coefficients RY/GY/BY etc.
 * with RGB2YUV_SHIFT fixed-point scaling. */
2197 for(; y<height; y+=2)
2200 for(i=0; i<chromWidth; i++)
2202 unsigned int b= src[6*i+0];
2203 unsigned int g= src[6*i+1];
2204 unsigned int r= src[6*i+2];
2206 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2207 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
2208 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
2218 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
/* Second row of the pair: luma only (chroma taken from first row). */
2224 for(i=0; i<chromWidth; i++)
2226 unsigned int b= src[6*i+0];
2227 unsigned int g= src[6*i+1];
2228 unsigned int r= src[6*i+2];
2230 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2238 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
2241 udst += chromStride;
2242 vdst += chromStride;
/*
 * Interleaves two byte planes into one: for each row,
 * dest[2*w+0] = src1[w], dest[2*w+1] = src2[w] (see the scalar loop).
 * The vector paths handle width rounded down to a multiple of 16;
 * the scalar loop finishes the remainder.
 * NOTE(review): the SSE2 path uses movdqa, which requires 16-byte
 * aligned src1/src2 rows - confirm callers guarantee this.
 */
2248 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
2249 unsigned width, unsigned height, int src1Stride,
2250 int src2Stride, int dstStride){
2253 for(h=0; h < height; h++)
/* SSE2 path: 16 bytes of each source per iteration. */
2260 "xorl %%eax, %%eax \n\t"
2262 PREFETCH" 64(%1, %%eax) \n\t"
2263 PREFETCH" 64(%2, %%eax) \n\t"
2264 "movdqa (%1, %%eax), %%xmm0 \n\t"
2265 "movdqa (%1, %%eax), %%xmm1 \n\t"
2266 "movdqa (%2, %%eax), %%xmm2 \n\t"
2267 "punpcklbw %%xmm2, %%xmm0 \n\t"
2268 "punpckhbw %%xmm2, %%xmm1 \n\t"
2269 "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
2270 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
2271 "addl $16, %%eax \n\t"
2272 "cmpl %3, %%eax \n\t"
2274 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
/* MMX path: same interleave, 16 bytes via four 8-byte registers. */
2279 "xorl %%eax, %%eax \n\t"
2281 PREFETCH" 64(%1, %%eax) \n\t"
2282 PREFETCH" 64(%2, %%eax) \n\t"
2283 "movq (%1, %%eax), %%mm0 \n\t"
2284 "movq 8(%1, %%eax), %%mm2 \n\t"
2285 "movq %%mm0, %%mm1 \n\t"
2286 "movq %%mm2, %%mm3 \n\t"
2287 "movq (%2, %%eax), %%mm4 \n\t"
2288 "movq 8(%2, %%eax), %%mm5 \n\t"
2289 "punpcklbw %%mm4, %%mm0 \n\t"
2290 "punpckhbw %%mm4, %%mm1 \n\t"
2291 "punpcklbw %%mm5, %%mm2 \n\t"
2292 "punpckhbw %%mm5, %%mm3 \n\t"
2293 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
2294 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
2295 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
2296 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
2297 "addl $16, %%eax \n\t"
2298 "cmpl %3, %%eax \n\t"
2300 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
/* Scalar tail after a vector path: last width % 16 bytes. */
2304 for(w= (width&(~15)); w < width; w++)
2306 dest[2*w+0] = src1[w];
2307 dest[2*w+1] = src2[w];
/* Pure C fallback: whole row. */
2310 for(w=0; w < width; w++)
2312 dest[2*w+0] = src1[w];
2313 dest[2*w+1] = src2[w];
/*
 * Upscales two chroma planes by 2x in both directions using simple
 * pixel doubling: each source byte is written to two adjacent output
 * bytes (punpcklbw/punpckhbw of a register with itself), and each
 * source line feeds two output lines (s = src + stride*(y>>1)).
 * src1/dst1 and src2/dst2 are processed identically.
 */
2329 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
2330 uint8_t *dst1, uint8_t *dst2,
2331 unsigned width, unsigned height,
2332 int srcStride1, int srcStride2,
2333 int dstStride1, int dstStride2)
/* Source dimensions are half the destination's. */
2337 w=width/2; h=height/2;
2342 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* First plane: y>>1 reuses each source line for two output lines. */
2345 const uint8_t* s1=src1+srcStride1*(y>>1);
2346 uint8_t* d=dst1+dstStride1*y;
/* MMX: load 32 source bytes, duplicate each byte, store 64 bytes. */
2353 "movq %1, %%mm0\n\t"
2354 "movq 8%1, %%mm2\n\t"
2355 "movq 16%1, %%mm4\n\t"
2356 "movq 24%1, %%mm6\n\t"
2357 "movq %%mm0, %%mm1\n\t"
2358 "movq %%mm2, %%mm3\n\t"
2359 "movq %%mm4, %%mm5\n\t"
2360 "movq %%mm6, %%mm7\n\t"
/* Unpacking a register with itself doubles every byte. */
2361 "punpcklbw %%mm0, %%mm0\n\t"
2362 "punpckhbw %%mm1, %%mm1\n\t"
2363 "punpcklbw %%mm2, %%mm2\n\t"
2364 "punpckhbw %%mm3, %%mm3\n\t"
2365 "punpcklbw %%mm4, %%mm4\n\t"
2366 "punpckhbw %%mm5, %%mm5\n\t"
2367 "punpcklbw %%mm6, %%mm6\n\t"
2368 "punpckhbw %%mm7, %%mm7\n\t"
2369 MOVNTQ" %%mm0, %0\n\t"
2370 MOVNTQ" %%mm1, 8%0\n\t"
2371 MOVNTQ" %%mm2, 16%0\n\t"
2372 MOVNTQ" %%mm3, 24%0\n\t"
2373 MOVNTQ" %%mm4, 32%0\n\t"
2374 MOVNTQ" %%mm5, 40%0\n\t"
2375 MOVNTQ" %%mm6, 48%0\n\t"
2376 MOVNTQ" %%mm7, 56%0"
/* Scalar tail: double remaining bytes. */
2382 for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
/* Second plane: identical procedure. */
2385 const uint8_t* s2=src2+srcStride2*(y>>1);
2386 uint8_t* d=dst2+dstStride2*y;
2393 "movq %1, %%mm0\n\t"
2394 "movq 8%1, %%mm2\n\t"
2395 "movq 16%1, %%mm4\n\t"
2396 "movq 24%1, %%mm6\n\t"
2397 "movq %%mm0, %%mm1\n\t"
2398 "movq %%mm2, %%mm3\n\t"
2399 "movq %%mm4, %%mm5\n\t"
2400 "movq %%mm6, %%mm7\n\t"
2401 "punpcklbw %%mm0, %%mm0\n\t"
2402 "punpckhbw %%mm1, %%mm1\n\t"
2403 "punpcklbw %%mm2, %%mm2\n\t"
2404 "punpckhbw %%mm3, %%mm3\n\t"
2405 "punpcklbw %%mm4, %%mm4\n\t"
2406 "punpckhbw %%mm5, %%mm5\n\t"
2407 "punpcklbw %%mm6, %%mm6\n\t"
2408 "punpckhbw %%mm7, %%mm7\n\t"
2409 MOVNTQ" %%mm0, %0\n\t"
2410 MOVNTQ" %%mm1, 8%0\n\t"
2411 MOVNTQ" %%mm2, 16%0\n\t"
2412 MOVNTQ" %%mm3, 24%0\n\t"
2413 MOVNTQ" %%mm4, 32%0\n\t"
2414 MOVNTQ" %%mm5, 40%0\n\t"
2415 MOVNTQ" %%mm6, 48%0\n\t"
2416 MOVNTQ" %%mm7, 56%0"
2422 for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
2433 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
2435 unsigned width, unsigned height,
2436 int srcStride1, int srcStride2,
2437 int srcStride3, int dstStride)
2440 w=width/2; h=height;
2442 const uint8_t* yp=src1+srcStride1*y;
2443 const uint8_t* up=src2+srcStride2*(y>>2);
2444 const uint8_t* vp=src3+srcStride3*(y>>2);
2445 uint8_t* d=dst+dstStride*y;
2451 PREFETCH" 32(%1, %0)\n\t"
2452 PREFETCH" 32(%2, %0)\n\t"
2453 PREFETCH" 32(%3, %0)\n\t"
2454 "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2455 "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
2456 "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
2457 "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
2458 "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
2459 "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
2460 "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
2461 "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
2462 "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
2463 "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
2465 "movq %%mm1, %%mm6\n\t"
2466 "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
2467 "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
2468 "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
2469 MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
2470 MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
2472 "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
2473 "movq 8(%1, %0, 4), %%mm0\n\t"
2474 "movq %%mm0, %%mm3\n\t"
2475 "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
2476 "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
2477 MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
2478 MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
2480 "movq %%mm4, %%mm6\n\t"
2481 "movq 16(%1, %0, 4), %%mm0\n\t"
2482 "movq %%mm0, %%mm3\n\t"
2483 "punpcklbw %%mm5, %%mm4\n\t"
2484 "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
2485 "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
2486 MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
2487 MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
2489 "punpckhbw %%mm5, %%mm6\n\t"
2490 "movq 24(%1, %0, 4), %%mm0\n\t"
2491 "movq %%mm0, %%mm3\n\t"
2492 "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
2493 "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
2494 MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
2495 MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
2498 : "r"(yp), "r" (up), "r"(vp), "r"(d)