3 * rgb2rgb.c, Software RGB to RGB convertor
4 * Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
/* CPU-specific mnemonic strings that are pasted into the inline asm of the
 * converters below. This is the first visible branch of the selection chain;
 * the opening #if is outside this excerpt. */
26 #define PREFETCH "prefetch"
27 #define PREFETCHW "prefetchw"
28 #define PAVGB "pavgusb"
29 #elif defined ( HAVE_MMX2 )
/* HAVE_MMX2 branch: different prefetch mnemonics. */
30 #define PREFETCH "prefetchnta"
31 #define PREFETCHW "prefetcht0"
/* NOTE(review): presumably the fallback branch - both prefetch macros expand
 * to "/nop" here. */
34 #define PREFETCH "/nop"
35 #define PREFETCHW "/nop"
39 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
/* MOVNTQ is used for the destination stores and SFENCE is issued after the
 * conversion loops in the routines below. */
46 #define MOVNTQ "movntq"
47 #define SFENCE "sfence"
/*
 * rgb24to32: reads pixels that are 3 bytes apart in src and writes them as
 * 4-byte pixels to dst (the loads below step by 3, each loaded dword becomes
 * one half of an 8-byte store).
 *
 * src      - source pixel data
 * dst      - destination buffer
 * src_size - NOTE(review): presumably the source length in bytes, as in the
 *            sibling converters - confirm against the full function.
 */
53 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
56 const uint8_t *s = src;
// Prefetch the first source bytes.
63 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
// mm_end is the address `end` rounded down to a multiple of MMREG_SIZE*4.
64 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
// mm7 holds mask32 for the pand instructions in the loop body.
65 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
// If `end` was already a multiple, back mm_end off by one MMREG_SIZE*4 step.
66 if(mm_end == end) mm_end -= MMREG_SIZE*4;
// Each movd/punpckldq pair puts two dwords, read 3 bytes apart, into one
// MMX register (low dword from movd, high dword from punpckldq).
72 "punpckldq 3%1, %%mm0\n\t"
74 "punpckldq 9%1, %%mm1\n\t"
75 "movd 12%1, %%mm2\n\t"
76 "punpckldq 15%1, %%mm2\n\t"
77 "movd 18%1, %%mm3\n\t"
78 "punpckldq 21%1, %%mm3\n\t"
// Apply mask32 to every register.
// NOTE(review): presumably this clears the 4th byte of each pixel - confirm
// against the definition of mask32.
79 "pand %%mm7, %%mm0\n\t"
80 "pand %%mm7, %%mm1\n\t"
81 "pand %%mm7, %%mm2\n\t"
82 "pand %%mm7, %%mm3\n\t"
// Store the results to consecutive 8-byte slots of the destination.
83 MOVNTQ" %%mm0, %0\n\t"
84 MOVNTQ" %%mm1, 8%0\n\t"
85 MOVNTQ" %%mm2, 16%0\n\t"
// Fence the MOVNTQ stores, then leave MMX state.
93 __asm __volatile(SFENCE:::"memory");
94 __asm __volatile(EMMS:::"memory");
/*
 * rgb32to24: NOTE(review): per its name this packs 4-byte pixels from src
 * into 3-byte pixels at dst; the visible MMX body reads source quadwords at
 * offsets 8, 16 and 24 and splices the registers together before storing.
 *
 * src      - source pixel data
 * dst      - destination buffer
 * src_size - NOTE(review): presumably the source length in bytes - confirm.
 */
105 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
108 const uint8_t *s = src;
// Prefetch the first source bytes.
115 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
// mm_end is the address `end` rounded down to a multiple of MMREG_SIZE*4.
116 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
// Load source quadwords into mm1, mm4, mm5.
122 "movq 8%1, %%mm1\n\t"
123 "movq 16%1, %%mm4\n\t"
124 "movq 24%1, %%mm5\n\t"
// Duplicate mm0/mm1/mm4/mm5 into mm2/mm3/mm6/mm7 ...
125 "movq %%mm0, %%mm2\n\t"
126 "movq %%mm1, %%mm3\n\t"
127 "movq %%mm4, %%mm6\n\t"
128 "movq %%mm5, %%mm7\n\t"
// ... and shift the duplicates right by one byte.
129 "psrlq $8, %%mm2\n\t"
130 "psrlq $8, %%mm3\n\t"
131 "psrlq $8, %%mm6\n\t"
132 "psrlq $8, %%mm7\n\t"
// Merge each shifted duplicate back into its original register.
141 "por %%mm2, %%mm0\n\t"
142 "por %%mm3, %%mm1\n\t"
143 "por %%mm6, %%mm4\n\t"
144 "por %%mm7, %%mm5\n\t"
// Splice neighbouring registers: shifted copies of mm1 and mm4 are OR-ed
// into mm0 and mm1, and the shifted mm5 is OR-ed into mm4.
146 "movq %%mm1, %%mm2\n\t"
147 "movq %%mm4, %%mm3\n\t"
148 "psllq $48, %%mm2\n\t"
149 "psllq $32, %%mm3\n\t"
152 "por %%mm2, %%mm0\n\t"
153 "psrlq $16, %%mm1\n\t"
154 "psrlq $32, %%mm4\n\t"
155 "psllq $16, %%mm5\n\t"
156 "por %%mm3, %%mm1\n\t"
158 "por %%mm5, %%mm4\n\t"
// Store the spliced quadwords to the destination.
160 MOVNTQ" %%mm0, %0\n\t"
161 MOVNTQ" %%mm1, 8%0\n\t"
// Input operands: the source plus the mask24* constants.
164 :"m"(*s),"m"(mask24l),
165 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
// Fence the MOVNTQ stores, then leave MMX state.
170 __asm __volatile(SFENCE:::"memory");
171 __asm __volatile(EMMS:::"memory");
183 Original by Strepto/Astral
184 ported to gcc & bugfixed : A'rpi
185 MMX2, 3DNOW optimization by Nick Kurshev
186 32bit c version, and and&add trick by Michael Niedermayer
/*
 * rgb15to16: rewrites 16-bit pixels so that bits 5..14 move up by one bit
 * position while bits 0..4 stay in place (see the scalar forms below).
 *
 * src      - source pixel data
 * dst      - destination buffer
 * src_size - length of the source in bytes (the scalar paths derive their
 *            end pointer / word count from it)
 */
188 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
// s and d point src_size bytes past the start of each buffer; offs is the
// negative of src_size, so s+offs is the first source byte.
191 register const char* s=src+src_size;
192 register char* d=dst+src_size;
193 register int offs=-src_size;
194 __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)));
203 "movq 8%1, %%mm2\n\t"
// Keep copies in mm1/mm3, mask mm0/mm2 with mm4, then add the copies back:
// result = x + (x & mask) per 16-bit lane.
// NOTE(review): mm4 is loaded outside this excerpt - confirm its value.
204 "movq %%mm0, %%mm1\n\t"
205 "movq %%mm2, %%mm3\n\t"
206 "pand %%mm4, %%mm0\n\t"
207 "pand %%mm4, %%mm2\n\t"
208 "paddw %%mm1, %%mm0\n\t"
209 "paddw %%mm3, %%mm2\n\t"
210 MOVNTQ" %%mm0, %0\n\t"
// Fence the MOVNTQ stores, then leave MMX state.
217 __asm __volatile(SFENCE:::"memory");
218 __asm __volatile(EMMS:::"memory");
// Scalar variant working on one 16-bit pixel at a time; e is the address
// src_size bytes after the start of the source.
221 const uint16_t *s1=( uint16_t * )src;
222 uint16_t *d1=( uint16_t * )dst;
223 uint16_t *e=((uint8_t *)s1)+src_size;
225 register int x=*( s1++ );
228 0111 1111 1110 0000=0x7FE0
229 00000000000001 1111=0x001F */
// Bits 0..4 are kept as they are, bits 5..14 are shifted up by one.
230 *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
// Scalar variant working on 32-bit words, i.e. two pixels per iteration.
233 const unsigned *s1=( unsigned * )src;
234 unsigned *d1=( unsigned * )dst;
236 int size= src_size>>2;
237 for(i=0; i<size; i++)
239 register int x= s1[i];
240 // d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not always be true
// Adding the masked field to the value doubles that field (a shift left by
// one); masking with 0x7FFF7FFF first drops the top bit of each 16-bit half.
241 d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
/*
 * rgb32to16: converts 4-byte pixels (bytes b, g, r at offsets 0, 1, 2) into
 * 16-bit pixels laid out as b in bits 0..4, g in bits 5..10, r in bits 11..15
 * (see the scalar expressions below).
 *
 * src      - source pixel data
 * dst      - destination buffer, written as uint16_t
 * src_size - length of the source in bytes (the C path uses src_size/4 pixels)
 */
248 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
251 const uint8_t *s = src;
252 const uint8_t *end,*mm_end;
253 uint16_t *d = (uint16_t *)dst;
// mm_end is the address `end` rounded down to a multiple of MMREG_SIZE*2.
255 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
256 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
// Operands of the mask setup asm (its instructions are not part of this excerpt).
260 ::"m"(red_16mask),"m"(green_16mask));
// Gather pixels: mm0 receives the dword at offset 8 in its high half and
// mm3 the dwords at offsets 4 and 12.
266 "movd 4%1, %%mm3\n\t"
267 "punpckldq 8%1, %%mm0\n\t"
268 "punpckldq 12%1, %%mm3\n\t"
// Make two extra copies of each register, one per colour field.
269 "movq %%mm0, %%mm1\n\t"
270 "movq %%mm0, %%mm2\n\t"
271 "movq %%mm3, %%mm4\n\t"
272 "movq %%mm3, %%mm5\n\t"
// First field: shift right by 3.
273 "psrlq $3, %%mm0\n\t"
274 "psrlq $3, %%mm3\n\t"
// Second field: shift right by 5 and mask with mm6.
277 "psrlq $5, %%mm1\n\t"
278 "psrlq $5, %%mm4\n\t"
279 "pand %%mm6, %%mm1\n\t"
280 "pand %%mm6, %%mm4\n\t"
// Third field: shift right by 8 and mask with mm7.
281 "psrlq $8, %%mm2\n\t"
282 "psrlq $8, %%mm5\n\t"
283 "pand %%mm7, %%mm2\n\t"
284 "pand %%mm7, %%mm5\n\t"
// Combine the three fields of each register.
285 "por %%mm1, %%mm0\n\t"
286 "por %%mm4, %%mm3\n\t"
287 "por %%mm2, %%mm0\n\t"
288 "por %%mm5, %%mm3\n\t"
// Move the mm3 results up by 16 bits and merge them into mm0, then store.
289 "psllq $16, %%mm3\n\t"
290 "por %%mm3, %%mm0\n\t"
291 MOVNTQ" %%mm0, %0\n\t"
292 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
// Scalar packing of one pixel: top 5 bits of b, top 6 of g, top 5 of r.
// NOTE(review): presumably the tail loop after the MMX loop - confirm.
301 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
// Fence the MOVNTQ stores, then leave MMX state.
303 __asm __volatile(SFENCE:::"memory");
304 __asm __volatile(EMMS:::"memory");
// Plain C path: i walks the source in 4-byte steps, j counts output pixels.
306 unsigned j,i,num_pixels=src_size/4;
307 uint16_t *d = (uint16_t *)dst;
308 for(i=0,j=0; j<num_pixels; i+=4,j++)
310 const int b= src[i+0];
311 const int g= src[i+1];
312 const int r= src[i+2];
314 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb32to15: converts 4-byte pixels (bytes b, g, r at offsets 0, 1, 2) into
 * 16-bit pixels laid out as b in bits 0..4, g in bits 5..9, r in bits 10..14
 * (see the scalar expressions below).
 *
 * src      - source pixel data
 * dst      - destination buffer, written as uint16_t
 * src_size - length of the source in bytes (the C path uses src_size/4 pixels)
 */
319 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
322 const uint8_t *s = src;
323 const uint8_t *end,*mm_end;
324 uint16_t *d = (uint16_t *)dst;
// mm_end is the address `end` rounded down to a multiple of MMREG_SIZE*2.
326 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
327 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
// Operands of the mask setup asm (its instructions are not part of this excerpt).
331 ::"m"(red_15mask),"m"(green_15mask));
// Gather pixels: mm0 receives the dword at offset 8 in its high half and
// mm3 the dwords at offsets 4 and 12.
337 "movd 4%1, %%mm3\n\t"
338 "punpckldq 8%1, %%mm0\n\t"
339 "punpckldq 12%1, %%mm3\n\t"
// Make two extra copies of each register, one per colour field.
340 "movq %%mm0, %%mm1\n\t"
341 "movq %%mm0, %%mm2\n\t"
342 "movq %%mm3, %%mm4\n\t"
343 "movq %%mm3, %%mm5\n\t"
// First field: shift right by 3.
344 "psrlq $3, %%mm0\n\t"
345 "psrlq $3, %%mm3\n\t"
// Second field: shift right by 6 and mask with mm6.
348 "psrlq $6, %%mm1\n\t"
349 "psrlq $6, %%mm4\n\t"
350 "pand %%mm6, %%mm1\n\t"
351 "pand %%mm6, %%mm4\n\t"
// Third field: shift right by 9 and mask with mm7.
352 "psrlq $9, %%mm2\n\t"
353 "psrlq $9, %%mm5\n\t"
354 "pand %%mm7, %%mm2\n\t"
355 "pand %%mm7, %%mm5\n\t"
// Combine the three fields of each register.
356 "por %%mm1, %%mm0\n\t"
357 "por %%mm4, %%mm3\n\t"
358 "por %%mm2, %%mm0\n\t"
359 "por %%mm5, %%mm3\n\t"
// Move the mm3 results up by 16 bits and merge them into mm0, then store.
360 "psllq $16, %%mm3\n\t"
361 "por %%mm3, %%mm0\n\t"
362 MOVNTQ" %%mm0, %0\n\t"
363 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
// Scalar packing of one pixel: top 5 bits each of b, g and r.
// NOTE(review): presumably the tail loop after the MMX loop - confirm.
372 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
// Fence the MOVNTQ stores, then leave MMX state.
374 __asm __volatile(SFENCE:::"memory");
375 __asm __volatile(EMMS:::"memory");
// Plain C path: i walks the source in 4-byte steps, j counts output pixels.
377 unsigned j,i,num_pixels=src_size/4;
378 uint16_t *d = (uint16_t *)dst;
379 for(i=0,j=0; j<num_pixels; i+=4,j++)
381 const int b= src[i+0];
382 const int g= src[i+1];
383 const int r= src[i+2];
385 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * rgb24to16: converts 3-byte pixels (bytes b, g, r at offsets 0, 1, 2) into
 * 16-bit pixels laid out as b in bits 0..4, g in bits 5..10, r in bits 11..15
 * (see the scalar expressions below).
 *
 * src      - source pixel data
 * dst      - destination buffer, written as uint16_t
 * src_size - length of the source in bytes (the C path uses src_size/3 pixels)
 */
390 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
393 const uint8_t *s = src;
394 const uint8_t *end,*mm_end;
395 uint16_t *d = (uint16_t *)dst;
// mm_end is the address `end` rounded down to a multiple of MMREG_SIZE*2.
397 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
398 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
// Operands of the mask setup asm (its instructions are not part of this excerpt).
402 ::"m"(red_16mask),"m"(green_16mask));
// If `end` was already a multiple, back mm_end off by one MMREG_SIZE*2 step.
403 if(mm_end == end) mm_end -= MMREG_SIZE*2;
// Gather pixels 3 bytes apart: mm0 receives the dword at offset 6 in its
// high half and mm3 the dwords at offsets 3 and 9.
409 "movd 3%1, %%mm3\n\t"
410 "punpckldq 6%1, %%mm0\n\t"
411 "punpckldq 9%1, %%mm3\n\t"
// Make two extra copies of each register, one per colour field.
412 "movq %%mm0, %%mm1\n\t"
413 "movq %%mm0, %%mm2\n\t"
414 "movq %%mm3, %%mm4\n\t"
415 "movq %%mm3, %%mm5\n\t"
// First field: shift right by 3.
416 "psrlq $3, %%mm0\n\t"
417 "psrlq $3, %%mm3\n\t"
// Second field: shift right by 5 and mask with mm6.
420 "psrlq $5, %%mm1\n\t"
421 "psrlq $5, %%mm4\n\t"
422 "pand %%mm6, %%mm1\n\t"
423 "pand %%mm6, %%mm4\n\t"
// Third field: shift right by 8 and mask with mm7.
424 "psrlq $8, %%mm2\n\t"
425 "psrlq $8, %%mm5\n\t"
426 "pand %%mm7, %%mm2\n\t"
427 "pand %%mm7, %%mm5\n\t"
// Combine the three fields of each register.
428 "por %%mm1, %%mm0\n\t"
429 "por %%mm4, %%mm3\n\t"
430 "por %%mm2, %%mm0\n\t"
431 "por %%mm5, %%mm3\n\t"
// Move the mm3 results up by 16 bits and merge them into mm0, then store.
432 "psllq $16, %%mm3\n\t"
433 "por %%mm3, %%mm0\n\t"
434 MOVNTQ" %%mm0, %0\n\t"
435 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
// Scalar packing of one pixel: top 5 bits of b, top 6 of g, top 5 of r.
// NOTE(review): presumably the tail loop after the MMX loop - confirm.
444 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
// Fence the MOVNTQ stores, then leave MMX state.
446 __asm __volatile(SFENCE:::"memory");
447 __asm __volatile(EMMS:::"memory");
// Plain C path: i walks the source in 3-byte steps, j counts output pixels.
449 unsigned j,i,num_pixels=src_size/3;
450 uint16_t *d = (uint16_t *)dst;
451 for(i=0,j=0; j<num_pixels; i+=3,j++)
453 const int b= src[i+0];
454 const int g= src[i+1];
455 const int r= src[i+2];
457 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb24to15: converts 3-byte pixels (bytes b, g, r at offsets 0, 1, 2) into
 * 16-bit pixels laid out as b in bits 0..4, g in bits 5..9, r in bits 10..14
 * (see the scalar expressions below).
 *
 * src      - source pixel data
 * dst      - destination buffer, written as uint16_t
 * src_size - length of the source in bytes (the C path uses src_size/3 pixels)
 */
462 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
465 const uint8_t *s = src;
466 const uint8_t *end,*mm_end;
467 uint16_t *d = (uint16_t *)dst;
// mm_end is the address `end` rounded down to a multiple of MMREG_SIZE*2.
469 mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
470 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
// Operands of the mask setup asm (its instructions are not part of this excerpt).
474 ::"m"(red_15mask),"m"(green_15mask));
// If `end` was already a multiple, back mm_end off by one MMREG_SIZE*2 step.
475 if(mm_end == end) mm_end -= MMREG_SIZE*2;
// Gather pixels 3 bytes apart: mm0 receives the dword at offset 6 in its
// high half and mm3 the dwords at offsets 3 and 9.
481 "movd 3%1, %%mm3\n\t"
482 "punpckldq 6%1, %%mm0\n\t"
483 "punpckldq 9%1, %%mm3\n\t"
// Make two extra copies of each register, one per colour field.
484 "movq %%mm0, %%mm1\n\t"
485 "movq %%mm0, %%mm2\n\t"
486 "movq %%mm3, %%mm4\n\t"
487 "movq %%mm3, %%mm5\n\t"
// First field: shift right by 3.
488 "psrlq $3, %%mm0\n\t"
489 "psrlq $3, %%mm3\n\t"
// Second field: shift right by 6 and mask with mm6.
492 "psrlq $6, %%mm1\n\t"
493 "psrlq $6, %%mm4\n\t"
494 "pand %%mm6, %%mm1\n\t"
495 "pand %%mm6, %%mm4\n\t"
// Third field: shift right by 9 and mask with mm7.
496 "psrlq $9, %%mm2\n\t"
497 "psrlq $9, %%mm5\n\t"
498 "pand %%mm7, %%mm2\n\t"
499 "pand %%mm7, %%mm5\n\t"
// Combine the three fields of each register.
500 "por %%mm1, %%mm0\n\t"
501 "por %%mm4, %%mm3\n\t"
502 "por %%mm2, %%mm0\n\t"
503 "por %%mm5, %%mm3\n\t"
// Move the mm3 results up by 16 bits and merge them into mm0, then store.
504 "psllq $16, %%mm3\n\t"
505 "por %%mm3, %%mm0\n\t"
506 MOVNTQ" %%mm0, %0\n\t"
507 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
// Scalar packing of one pixel: top 5 bits each of b, g and r.
// NOTE(review): presumably the tail loop after the MMX loop - confirm.
516 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
// Fence the MOVNTQ stores, then leave MMX state.
518 __asm __volatile(SFENCE:::"memory");
519 __asm __volatile(EMMS:::"memory");
// Plain C path: i walks the source in 3-byte steps, j counts output pixels.
521 unsigned j,i,num_pixels=src_size/3;
522 uint16_t *d = (uint16_t *)dst;
523 for(i=0,j=0; j<num_pixels; i+=3,j++)
525 const int b= src[i+0];
526 const int g= src[i+1];
527 const int r= src[i+2];
529 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * rgb32tobgr32: for every 4-byte pixel, exchanges byte 0 and byte 2 and
 * copies byte 1 unchanged (scalar form below; byte 3 is not written by the
 * visible scalar lines).
 *
 * src      - source pixel data
 * dst      - destination buffer
 * src_size - length of the source in bytes (num_pixels = src_size / 4)
 */
534 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
536 int num_pixels= src_size >> 2;
// eax is the running offset into src/dst, starting at 0.
539 "xorl %%eax, %%eax \n\t"
542 PREFETCH" 32(%0, %%eax) \n\t"
// Load 8 source bytes and keep three copies.
543 "movq (%0, %%eax), %%mm0 \n\t"
544 "movq %%mm0, %%mm1 \n\t"
545 "movq %%mm0, %%mm2 \n\t"
// Within each dword: one copy shifted left by 16, one shifted right by 16,
// one unshifted; each is masked (mask32r / mask32b / mask32g) and the three
// are OR-ed together in mm2.
546 "pslld $16, %%mm0 \n\t"
547 "psrld $16, %%mm1 \n\t"
548 "pand mask32r, %%mm0 \n\t"
549 "pand mask32g, %%mm2 \n\t"
550 "pand mask32b, %%mm1 \n\t"
551 "por %%mm0, %%mm2 \n\t"
552 "por %%mm1, %%mm2 \n\t"
553 MOVNTQ" %%mm2, (%1, %%eax) \n\t"
// NOTE(review): eax is used as an unscaled byte offset in the addressing
// above, yet it advances by 2 and is compared against num_pixels while each
// pass handles 8 bytes - the step/limit look inconsistent; verify against
// the full loop (label and branch are not part of this excerpt).
554 "addl $2, %%eax \n\t"
555 "cmpl %2, %%eax \n\t"
557 :: "r" (src), "r"(dst), "r" (num_pixels)
// Fence the MOVNTQ stores, then leave MMX state.
561 __asm __volatile(SFENCE:::"memory");
562 __asm __volatile(EMMS:::"memory");
// Plain C path: swap bytes 0 and 2, copy byte 1.
565 for(i=0; i<num_pixels; i++)
567 dst[4*i + 0] = src[4*i + 2];
568 dst[4*i + 1] = src[4*i + 1];
569 dst[4*i + 2] = src[4*i + 0];
576 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
577 * problem for anyone then tell me, and I'll fix it)
/*
 * yv12toyuy2: interleaves separate Y, U and V sources into 4-byte groups
 * Y U Y V in dst (see the scalar loop at the end).
 *
 * ysrc, usrc, vsrc - source planes
 * dst              - interleaved output
 * width, height    - the outer loop runs once per y in [0, height); each row
 *                    produces width/2 groups (chromWidth)
 * lumStride, chromStride, dstStride - NOTE(review): presumably per-row
 *                    pointer increments; their use is outside this excerpt.
 */
579 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
580 unsigned int width, unsigned int height,
581 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
584 const int chromWidth= width>>1;
585 for(y=0; y<height; y++)
588 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
// eax counts chroma samples: it scales by 1 for U/V, by 2 for Y and by 4
// for the destination.
590 "xorl %%eax, %%eax \n\t"
593 PREFETCH" 32(%1, %%eax, 2) \n\t"
594 PREFETCH" 32(%2, %%eax) \n\t"
595 PREFETCH" 32(%3, %%eax) \n\t"
// Interleave 8 U bytes with 8 V bytes into two UV registers.
596 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
597 "movq %%mm0, %%mm2 \n\t" // U(0)
598 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
599 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
600 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
// Interleave 16 Y bytes with the UV bytes.
602 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
603 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
604 "movq %%mm3, %%mm4 \n\t" // Y(0)
605 "movq %%mm5, %%mm6 \n\t" // Y(8)
606 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
607 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
608 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
609 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
// Write 32 output bytes.
611 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
612 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
613 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
614 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
// 8 chroma samples handled per pass; compare against chromWidth.
616 "addl $8, %%eax \n\t"
617 "cmpl %4, %%eax \n\t"
619 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
// Plain C path: one Y U Y V group per chroma sample.
624 for(i=0; i<chromWidth; i++)
626 dst[4*i+0] = ysrc[2*i+0];
627 dst[4*i+1] = usrc[i];
628 dst[4*i+2] = ysrc[2*i+1];
629 dst[4*i+3] = vsrc[i];
649 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
650 * problem for anyone then tell me, and I'll fix it)
/*
 * yuy2toyv12: splits interleaved Y U Y V groups from src into separate
 * ydst / udst / vdst outputs. Rows are handled in pairs (y advances by 2):
 * the first loop of each pair writes Y, U and V, the second loop writes Y
 * only.
 *
 * src              - interleaved input
 * ydst, udst, vdst - output planes
 * width, height    - each row holds width/2 groups (chromWidth)
 * lumStride, chromStride, srcStride - NOTE(review): presumably per-row
 *                    pointer increments; their use is outside this excerpt.
 */
652 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
653 unsigned int width, unsigned int height,
654 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
657 const int chromWidth= width>>1;
658 for(y=0; y<height; y+=2)
// First loop: eax counts chroma samples; mm7 becomes a mask that keeps the
// low byte of every 16-bit word.
662 "xorl %%eax, %%eax \n\t"
663 "pcmpeqw %%mm7, %%mm7 \n\t"
664 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
667 PREFETCH" 64(%0, %%eax, 4) \n\t"
// Low bytes (pand mm7) are luma, high bytes (psrlw 8) are chroma; packuswb
// squeezes the words back into bytes.
668 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
669 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
670 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
671 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
672 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
673 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
674 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
675 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
676 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
677 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
// First 8 luma bytes out.
679 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
// Same split for the next 16 source bytes.
681 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
682 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
683 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
684 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
685 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
686 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
687 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
688 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
689 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
690 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
// Second 8 luma bytes out.
692 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
// Separate the interleaved UV bytes into a V register and a U register.
694 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
695 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
696 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
697 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
698 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
699 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
700 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
701 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
// 8 V bytes to vdst (%3), 8 U bytes to udst (%2).
703 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
704 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
// 8 chroma samples handled per pass; compare against chromWidth.
706 "addl $8, %%eax \n\t"
707 "cmpl %4, %%eax \n\t"
709 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// Second loop: luma only - mask the low bytes, pack, store to ydst.
717 "xorl %%eax, %%eax \n\t"
720 PREFETCH" 64(%0, %%eax, 4) \n\t"
721 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
722 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
723 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
724 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
725 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
726 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
727 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
728 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
729 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
730 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
732 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
733 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
735 "addl $8, %%eax \n\t"
736 "cmpl %4, %%eax \n\t"
739 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// Plain C path, first loop: bytes 0 and 2 are luma, byte 1 is U, byte 3 is V.
744 for(i=0; i<chromWidth; i++)
746 ydst[2*i+0] = src[4*i+0];
747 udst[i] = src[4*i+1];
748 ydst[2*i+1] = src[4*i+2];
749 vdst[i] = src[4*i+3];
// Plain C path, second loop: luma only.
754 for(i=0; i<chromWidth; i++)
756 ydst[2*i+0] = src[4*i+0];
757 ydst[2*i+1] = src[4*i+2];
// Trailing asm statement beginning with EMMS (its remainder is outside this excerpt).
766 asm volatile( EMMS" \n\t"
774 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
775 * problem for anyone then tell me, and I'll fix it)
776 * chrominance data is only taken from every second line; others are ignored. FIXME: write HQ version
/*
 * uyvytoyv12: splits interleaved U Y V Y groups from src into separate
 * ydst / udst / vdst outputs. Rows are handled in pairs (y advances by 2):
 * the first loop of each pair writes Y, U and V, the second loop writes Y
 * only.
 *
 * src              - interleaved input
 * ydst, udst, vdst - output planes
 * width, height    - each row holds width/2 groups (chromWidth)
 * lumStride, chromStride, srcStride - NOTE(review): presumably per-row
 *                    pointer increments; their use is outside this excerpt.
 */
778 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
779 unsigned int width, unsigned int height,
780 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
783 const int chromWidth= width>>1;
784 for(y=0; y<height; y+=2)
// First loop: eax counts chroma samples; mm7 becomes a mask that keeps the
// low byte of every 16-bit word.
788 "xorl %%eax, %%eax \n\t"
789 "pcmpeqw %%mm7, %%mm7 \n\t"
790 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
793 PREFETCH" 64(%0, %%eax, 4) \n\t"
// Low bytes (pand mm7) are chroma, high bytes (psrlw 8) are luma; packuswb
// squeezes the words back into bytes.
794 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
795 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
796 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
797 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
798 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
799 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
800 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
801 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
802 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
803 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
// First 8 luma bytes out.
805 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
// Same split for the next 16 source bytes.
807 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
808 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
809 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
810 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
811 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
812 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
813 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
814 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
815 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
816 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
// Second 8 luma bytes out.
818 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
// Separate the interleaved UV bytes into a V register and a U register.
820 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
821 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
822 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
823 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
824 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
825 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
826 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
827 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
// 8 V bytes to vdst (%3), 8 U bytes to udst (%2).
829 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
830 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
// 8 chroma samples handled per pass; compare against chromWidth.
832 "addl $8, %%eax \n\t"
833 "cmpl %4, %%eax \n\t"
835 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// Second loop: luma only - take the high byte of every word, pack, store
// to ydst.
843 "xorl %%eax, %%eax \n\t"
846 PREFETCH" 64(%0, %%eax, 4) \n\t"
847 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
848 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
849 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
850 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
851 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
852 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
853 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
854 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
855 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
856 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
858 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
859 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
861 "addl $8, %%eax \n\t"
862 "cmpl %4, %%eax \n\t"
865 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// Plain C path, first loop: byte 0 is U, byte 2 is V, bytes 1 and 3 are luma.
870 for(i=0; i<chromWidth; i++)
872 udst[i] = src[4*i+0];
873 ydst[2*i+0] = src[4*i+1];
874 vdst[i] = src[4*i+2];
875 ydst[2*i+1] = src[4*i+3];
// Plain C path, second loop: luma only.
880 for(i=0; i<chromWidth; i++)
882 ydst[2*i+0] = src[4*i+1];
883 ydst[2*i+1] = src[4*i+3];
// Trailing asm statement beginning with EMMS (its remainder is outside this excerpt).
892 asm volatile( EMMS" \n\t"
900 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
901 * problem for anyone then tell me, and I'll fix it)
902 * chrominance data is only taken from every second line; others are ignored. FIXME: write HQ version
904 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
905 unsigned int width, unsigned int height,
906 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
909 const int chromWidth= width>>1;
910 for(y=0; y<height; y+=2)
913 for(i=0; i<chromWidth; i++)
915 unsigned int b= src[6*i+0];
916 unsigned int g= src[6*i+1];
917 unsigned int r= src[6*i+2];
919 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
920 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
921 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
931 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
937 for(i=0; i<chromWidth; i++)
939 unsigned int b= src[6*i+0];
940 unsigned int g= src[6*i+1];
941 unsigned int r= src[6*i+2];
943 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
951 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;