3 * rgb2rgb.c, Software RGB to RGB convertor
4 * pluralize by Software PAL8 to RGB convertor
5 * Software YUV to YUV convertor
6 * Software YUV to RGB convertor
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
/* Instruction mnemonics kept as string macros so that they can be concatenated
 * into the inline-asm templates further down. The #if / #else / #endif lines
 * that choose between the groups are only partly visible in this excerpt. */
26 #define PREFETCH "prefetch"
27 #define PREFETCHW "prefetchw"
28 #define PAVGB "pavgusb"
29 #elif defined ( HAVE_MMX2 )
30 #define PREFETCH "prefetchnta"
31 #define PREFETCHW "prefetcht0"
/* Remaining case: both prefetch macros expand to /nop.
 * NOTE(review): presumably the leading slash turns the asm line into an
 * assembler comment - confirm for the assemblers in use. */
34 #define PREFETCH "/nop"
35 #define PREFETCHW "/nop"
39 /* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
/* Store mnemonic used by the MMX code below and the fence issued once those
 * stores are done. */
46 #define MOVNTQ "movntq"
47 #define SFENCE "sfence"
/*
 * rgb24to32: expand 3-byte pixels read from src into 4-byte pixels written to dst.
 * src_size is presumably the byte length of src (its use is not visible here).
 * Only part of the MMX path is visible in this excerpt: dwords are fetched at
 * 3-byte steps (offsets 3, 9, 12, 15, 18 and 21 appear below), each register is
 * ANDed with mask32 (kept in mm7) and the qwords are written with MOVNTQ.
 * NOTE(review): loop control and any scalar tail are not visible - confirm there.
 */
53 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
56 const uint8_t *s = src;
59 const uint8_t *mm_end;
// Prefetch the first source bytes.
63 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
// mm7 = mask32, applied to every expanded pixel pair below.
65 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
// Each punpckldq places a second source pixel in the upper dword of the register.
71 "punpckldq 3%1, %%mm0\n\t"
73 "punpckldq 9%1, %%mm1\n\t"
74 "movd 12%1, %%mm2\n\t"
75 "punpckldq 15%1, %%mm2\n\t"
76 "movd 18%1, %%mm3\n\t"
77 "punpckldq 21%1, %%mm3\n\t"
78 "pand %%mm7, %%mm0\n\t"
79 "pand %%mm7, %%mm1\n\t"
80 "pand %%mm7, %%mm2\n\t"
81 "pand %%mm7, %%mm3\n\t"
82 MOVNTQ" %%mm0, %0\n\t"
83 MOVNTQ" %%mm1, 8%0\n\t"
84 MOVNTQ" %%mm2, 16%0\n\t"
// Fence after the MOVNTQ stores, then leave MMX state through the EMMS macro.
92 __asm __volatile(SFENCE:::"memory");
93 __asm __volatile(EMMS:::"memory");
/*
 * rgb32to24: pack 4-byte pixels from src into 3-byte pixels in dst.
 * src_size is presumably the byte length of src (its use is not visible here).
 * Visible part of the MMX path: source qwords live in mm0, mm1, mm4 and mm5
 * (the loads at offsets 8, 16 and 24 are visible), mm2/mm3/mm6/mm7 are copies
 * shifted right by 8 bits, originals and copies are ORed together, and further
 * 48/32/16-bit shifts merge neighbouring registers before the MOVNTQ stores.
 * The mask24* constants are passed as asm operands; the instructions that use
 * them are not visible in this excerpt.
 */
104 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
107 const uint8_t *s = src;
110 const uint8_t *mm_end;
114 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
121 "movq 8%1, %%mm1\n\t"
122 "movq 16%1, %%mm4\n\t"
123 "movq 24%1, %%mm5\n\t"
// Keep a second copy of every source qword, shifted down by one byte.
124 "movq %%mm0, %%mm2\n\t"
125 "movq %%mm1, %%mm3\n\t"
126 "movq %%mm4, %%mm6\n\t"
127 "movq %%mm5, %%mm7\n\t"
128 "psrlq $8, %%mm2\n\t"
129 "psrlq $8, %%mm3\n\t"
130 "psrlq $8, %%mm6\n\t"
131 "psrlq $8, %%mm7\n\t"
// Recombine original and shifted copy.
140 "por %%mm2, %%mm0\n\t"
141 "por %%mm3, %%mm1\n\t"
142 "por %%mm6, %%mm4\n\t"
143 "por %%mm7, %%mm5\n\t"
// Move bits of the following register into the free upper part of the previous one.
145 "movq %%mm1, %%mm2\n\t"
146 "movq %%mm4, %%mm3\n\t"
147 "psllq $48, %%mm2\n\t"
148 "psllq $32, %%mm3\n\t"
151 "por %%mm2, %%mm0\n\t"
152 "psrlq $16, %%mm1\n\t"
153 "psrlq $32, %%mm4\n\t"
154 "psllq $16, %%mm5\n\t"
155 "por %%mm3, %%mm1\n\t"
157 "por %%mm5, %%mm4\n\t"
159 MOVNTQ" %%mm0, %0\n\t"
160 MOVNTQ" %%mm1, 8%0\n\t"
163 :"m"(*s),"m"(mask24l),
164 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
// Fence after the MOVNTQ stores, then leave MMX state through the EMMS macro.
169 __asm __volatile(SFENCE:::"memory");
170 __asm __volatile(EMMS:::"memory");
182 Original by Strepto/Astral
183 ported to gcc & bugfixed : A'rpi
184 MMX2, 3DNOW optimization by Nick Kurshev
185 32bit c version, and and&add trick by Michael Niedermayer
/*
 * rgb15to16: widen 16-bit words holding 15-bit pixels to the 16-bit layout by
 * moving bits 5..14 up by one position while bits 0..4 stay in place:
 *   out = (x & 0x001F) | ((x & 0x7FE0) << 1)          (16-bit C variant)
 *   out = (x & 0x7FFF7FFF) + (x & 0x7FE07FE0)         (32-bit C variant, two pixels)
 * Adding the masked bits to the value doubles them, which equals the shift.
 * The MMX variant computes x + (x & mm4) per word; the load of mm4 is not
 * visible here - presumably it holds the 0x7FE0 mask in every word (confirm).
 * Which variant is compiled is decided by preprocessor lines not visible here.
 */
187 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
// s and d are biased by -offs, so that s+offs equals src and d+offs equals dst.
190 register int offs=15-src_size;
191 register const char* s=src-offs;
192 register char* d=dst-offs;
193 __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)));
202 "movq 8%1, %%mm2\n\t"
203 "movq %%mm0, %%mm1\n\t"
204 "movq %%mm2, %%mm3\n\t"
205 "pand %%mm4, %%mm0\n\t"
206 "pand %%mm4, %%mm2\n\t"
// masked bits + original value: the masked field ends up shifted left by one.
207 "paddw %%mm1, %%mm0\n\t"
208 "paddw %%mm3, %%mm2\n\t"
209 MOVNTQ" %%mm0, %0\n\t"
216 __asm __volatile(SFENCE:::"memory");
217 __asm __volatile(EMMS:::"memory");
220 const uint16_t *s1=( uint16_t * )src;
221 uint16_t *d1=( uint16_t * )dst;
// NOTE(review): the initializer below has type uint8_t* but is stored in a
// uint16_t* (and drops const); compilers warn about this conversion.
222 uint16_t *e=((uint8_t *)s1)+src_size;
224 register int x=*( s1++ );
227 0111 1111 1110 0000=0x7FE0
228 00000000000001 1111=0x001F */
229 *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
232 const unsigned *s1=( unsigned * )src;
233 unsigned *d1=( unsigned * )dst;
// Number of whole 32-bit words; src_size bytes beyond a multiple of 4 are not
// touched by the lines visible here.
235 int size= src_size>>2;
236 for(i=0; i<size; i++)
238 register int x= s1[i];
239 // d1[i] = x + (x&0x7FE07FE0); //faster but needs msbit =0 which might not always be true
240 d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
/*
 * bgr24torgb24: plain C loop over 3-byte pixels; num_pixels = src_size/3.
 * NOTE(review): i and j both advance by 3 per iteration while the loop is
 * bounded by j<num_pixels. If j is used as a byte index into dst (the loop body
 * is not visible in this excerpt), only about a third of the buffer would be
 * converted - confirm whether the bound should be src_size or j should step by 1.
 */
247 static inline void RENAME(bgr24torgb24)(const uint8_t *src, uint8_t *dst, unsigned src_size)
249 unsigned j,i,num_pixels=src_size/3;
250 for(i=0,j=0; j<num_pixels; i+=3,j+=3)
/*
 * rgb32to16: convert 4-byte pixels to 16-bit pixels. As the C path reads them,
 * source byte 0 is b, byte 1 is g and byte 2 is r; the result is
 *   (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8)
 * i.e. 5 bits from b in bits 0..4, 6 bits from g in bits 5..10 and 5 bits from r
 * in bits 11..15. src_size/4 pixels are converted by the C path.
 * The MMX path turns 16 source bytes (4 pixels) into one 8-byte store per asm
 * block; loop control is not visible in this excerpt.
 */
258 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
261 const uint8_t *s = src;
262 const uint8_t *end,*mm_end;
263 uint16_t *d = (uint16_t *)dst;
266 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
// Operands of the mask setup: the values shifted by 5 are later ANDed with mm6
// and the values shifted by 8 with mm7 (the loads themselves are not visible).
270 ::"m"(red_16mask),"m"(green_16mask));
// mm0 collects the pixels at byte offsets 0 (load not visible) and 8, mm3 those at 4 and 12.
276 "movd 4%1, %%mm3\n\t"
277 "punpckldq 8%1, %%mm0\n\t"
278 "punpckldq 12%1, %%mm3\n\t"
279 "movq %%mm0, %%mm1\n\t"
280 "movq %%mm0, %%mm2\n\t"
281 "movq %%mm3, %%mm4\n\t"
282 "movq %%mm3, %%mm5\n\t"
// Three copies per register, shifted by 3, 5 and 8 bits - one per colour field.
283 "psrlq $3, %%mm0\n\t"
284 "psrlq $3, %%mm3\n\t"
287 "psrlq $5, %%mm1\n\t"
288 "psrlq $5, %%mm4\n\t"
289 "pand %%mm6, %%mm1\n\t"
290 "pand %%mm6, %%mm4\n\t"
291 "psrlq $8, %%mm2\n\t"
292 "psrlq $8, %%mm5\n\t"
293 "pand %%mm7, %%mm2\n\t"
294 "pand %%mm7, %%mm5\n\t"
295 "por %%mm1, %%mm0\n\t"
296 "por %%mm4, %%mm3\n\t"
297 "por %%mm2, %%mm0\n\t"
298 "por %%mm5, %%mm3\n\t"
// Interleave: results from mm3 go into the upper word of each dword of mm0.
299 "psllq $16, %%mm3\n\t"
300 "por %%mm3, %%mm0\n\t"
301 MOVNTQ" %%mm0, %0\n\t"
302 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
// Scalar form of the same conversion (surrounding loop not visible).
312 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
314 __asm __volatile(SFENCE:::"memory");
315 __asm __volatile(EMMS:::"memory");
// Plain C variant: i indexes source bytes, j indexes destination pixels.
317 unsigned j,i,num_pixels=src_size/4;
318 uint16_t *d = (uint16_t *)dst;
319 for(i=0,j=0; j<num_pixels; i+=4,j++)
321 const int b= src[i+0];
322 const int g= src[i+1];
323 const int r= src[i+2];
325 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb32to15: convert 4-byte pixels to 15-bit pixels stored in 16-bit words.
 * As the C path reads them, source byte 0 is b, byte 1 is g and byte 2 is r;
 * the result is
 *   (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7)
 * i.e. 5 bits each in bits 0..4, 5..9 and 10..14. src_size/4 pixels are
 * converted by the C path.
 * The MMX path turns 16 source bytes (4 pixels) into one 8-byte store per asm
 * block; loop control is not visible in this excerpt.
 */
330 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
333 const uint8_t *s = src;
334 const uint8_t *end,*mm_end;
335 uint16_t *d = (uint16_t *)dst;
338 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
// Operands of the mask setup: the values shifted by 6 are later ANDed with mm6
// and the values shifted by 9 with mm7 (the loads themselves are not visible).
342 ::"m"(red_15mask),"m"(green_15mask));
// mm0 collects the pixels at byte offsets 0 (load not visible) and 8, mm3 those at 4 and 12.
348 "movd 4%1, %%mm3\n\t"
349 "punpckldq 8%1, %%mm0\n\t"
350 "punpckldq 12%1, %%mm3\n\t"
351 "movq %%mm0, %%mm1\n\t"
352 "movq %%mm0, %%mm2\n\t"
353 "movq %%mm3, %%mm4\n\t"
354 "movq %%mm3, %%mm5\n\t"
// Three copies per register, shifted by 3, 6 and 9 bits - one per colour field.
355 "psrlq $3, %%mm0\n\t"
356 "psrlq $3, %%mm3\n\t"
359 "psrlq $6, %%mm1\n\t"
360 "psrlq $6, %%mm4\n\t"
361 "pand %%mm6, %%mm1\n\t"
362 "pand %%mm6, %%mm4\n\t"
363 "psrlq $9, %%mm2\n\t"
364 "psrlq $9, %%mm5\n\t"
365 "pand %%mm7, %%mm2\n\t"
366 "pand %%mm7, %%mm5\n\t"
367 "por %%mm1, %%mm0\n\t"
368 "por %%mm4, %%mm3\n\t"
369 "por %%mm2, %%mm0\n\t"
370 "por %%mm5, %%mm3\n\t"
// Interleave: results from mm3 go into the upper word of each dword of mm0.
371 "psllq $16, %%mm3\n\t"
372 "por %%mm3, %%mm0\n\t"
373 MOVNTQ" %%mm0, %0\n\t"
374 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
// Scalar form of the same conversion (surrounding loop not visible).
384 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
386 __asm __volatile(SFENCE:::"memory");
387 __asm __volatile(EMMS:::"memory");
// Plain C variant: i indexes source bytes, j indexes destination pixels.
389 unsigned j,i,num_pixels=src_size/4;
390 uint16_t *d = (uint16_t *)dst;
391 for(i=0,j=0; j<num_pixels; i+=4,j++)
393 const int b= src[i+0];
394 const int g= src[i+1];
395 const int r= src[i+2];
397 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * rgb24to16: convert 3-byte pixels to 16-bit pixels. As the C path reads them,
 * source byte 0 is b, byte 1 is g and byte 2 is r; the result is
 *   (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8)
 * src_size/3 pixels are converted by the C path.
 * The MMX path fetches dwords at 3-byte steps (offsets 3, 6 and 9 are visible),
 * so 4 pixels become one 8-byte store per asm block; loop control is not
 * visible in this excerpt.
 */
402 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
405 const uint8_t *s = src;
406 const uint8_t *end,*mm_end;
407 uint16_t *d = (uint16_t *)dst;
410 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
// Operands of the mask setup: the values shifted by 5 are later ANDed with mm6
// and the values shifted by 8 with mm7 (the loads themselves are not visible).
414 ::"m"(red_16mask),"m"(green_16mask));
// mm0 collects the pixels at byte offsets 0 (load not visible) and 6, mm3 those at 3 and 9.
420 "movd 3%1, %%mm3\n\t"
421 "punpckldq 6%1, %%mm0\n\t"
422 "punpckldq 9%1, %%mm3\n\t"
423 "movq %%mm0, %%mm1\n\t"
424 "movq %%mm0, %%mm2\n\t"
425 "movq %%mm3, %%mm4\n\t"
426 "movq %%mm3, %%mm5\n\t"
// Three copies per register, shifted by 3, 5 and 8 bits - one per colour field.
427 "psrlq $3, %%mm0\n\t"
428 "psrlq $3, %%mm3\n\t"
431 "psrlq $5, %%mm1\n\t"
432 "psrlq $5, %%mm4\n\t"
433 "pand %%mm6, %%mm1\n\t"
434 "pand %%mm6, %%mm4\n\t"
435 "psrlq $8, %%mm2\n\t"
436 "psrlq $8, %%mm5\n\t"
437 "pand %%mm7, %%mm2\n\t"
438 "pand %%mm7, %%mm5\n\t"
439 "por %%mm1, %%mm0\n\t"
440 "por %%mm4, %%mm3\n\t"
441 "por %%mm2, %%mm0\n\t"
442 "por %%mm5, %%mm3\n\t"
// Interleave: results from mm3 go into the upper word of each dword of mm0.
443 "psllq $16, %%mm3\n\t"
444 "por %%mm3, %%mm0\n\t"
445 MOVNTQ" %%mm0, %0\n\t"
446 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
// Scalar form of the same conversion (surrounding loop not visible).
455 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
457 __asm __volatile(SFENCE:::"memory");
458 __asm __volatile(EMMS:::"memory");
// Plain C variant: i indexes source bytes, j indexes destination pixels.
460 unsigned j,i,num_pixels=src_size/3;
461 uint16_t *d = (uint16_t *)dst;
462 for(i=0,j=0; j<num_pixels; i+=3,j++)
464 const int b= src[i+0];
465 const int g= src[i+1];
466 const int r= src[i+2];
468 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb24to15: convert 3-byte pixels to 15-bit pixels stored in 16-bit words.
 * As the C path reads them, source byte 0 is b, byte 1 is g and byte 2 is r;
 * the result is
 *   (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7)
 * src_size/3 pixels are converted by the C path.
 * The MMX path fetches dwords at 3-byte steps (offsets 3, 6 and 9 are visible),
 * so 4 pixels become one 8-byte store per asm block; loop control is not
 * visible in this excerpt.
 */
473 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
476 const uint8_t *s = src;
477 const uint8_t *end,*mm_end;
478 uint16_t *d = (uint16_t *)dst;
481 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
// Operands of the mask setup: the values shifted by 6 are later ANDed with mm6
// and the values shifted by 9 with mm7 (the loads themselves are not visible).
485 ::"m"(red_15mask),"m"(green_15mask));
// mm0 collects the pixels at byte offsets 0 (load not visible) and 6, mm3 those at 3 and 9.
491 "movd 3%1, %%mm3\n\t"
492 "punpckldq 6%1, %%mm0\n\t"
493 "punpckldq 9%1, %%mm3\n\t"
494 "movq %%mm0, %%mm1\n\t"
495 "movq %%mm0, %%mm2\n\t"
496 "movq %%mm3, %%mm4\n\t"
497 "movq %%mm3, %%mm5\n\t"
// Three copies per register, shifted by 3, 6 and 9 bits - one per colour field.
498 "psrlq $3, %%mm0\n\t"
499 "psrlq $3, %%mm3\n\t"
502 "psrlq $6, %%mm1\n\t"
503 "psrlq $6, %%mm4\n\t"
504 "pand %%mm6, %%mm1\n\t"
505 "pand %%mm6, %%mm4\n\t"
506 "psrlq $9, %%mm2\n\t"
507 "psrlq $9, %%mm5\n\t"
508 "pand %%mm7, %%mm2\n\t"
509 "pand %%mm7, %%mm5\n\t"
510 "por %%mm1, %%mm0\n\t"
511 "por %%mm4, %%mm3\n\t"
512 "por %%mm2, %%mm0\n\t"
513 "por %%mm5, %%mm3\n\t"
// Interleave: results from mm3 go into the upper word of each dword of mm0.
514 "psllq $16, %%mm3\n\t"
515 "por %%mm3, %%mm0\n\t"
516 MOVNTQ" %%mm0, %0\n\t"
517 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
// Scalar form of the same conversion (surrounding loop not visible).
526 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
528 __asm __volatile(SFENCE:::"memory");
529 __asm __volatile(EMMS:::"memory");
// Plain C variant: i indexes source bytes, j indexes destination pixels.
531 unsigned j,i,num_pixels=src_size/3;
532 uint16_t *d = (uint16_t *)dst;
533 for(i=0,j=0; j<num_pixels; i+=3,j++)
535 const int b= src[i+0];
536 const int g= src[i+1];
537 const int r= src[i+2];
539 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * rgb32tobgr32: for every 4-byte pixel, swap bytes 0 and 2 and copy byte 1
 * (see the C path at the end; handling of byte 3 is not visible in this excerpt).
 * The C path converts src_size>>2 pixels.
 * MMX path: 8 bytes (2 pixels) per iteration, with eax as the byte offset
 * compared against src_size. One copy of the qword is shifted left by 16 bits
 * per dword and one right by 16 bits, so that bytes 0 and 2 change places; the
 * three copies are masked with mask32r, mask32g and mask32b and ORed together.
 * NOTE(review): the asm uses eax; the clobber list and the loop jump are not
 * visible here.
 */
544 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
548 "xorl %%eax, %%eax \n\t"
551 PREFETCH" 32(%0, %%eax) \n\t"
552 "movq (%0, %%eax), %%mm0 \n\t"
553 "movq %%mm0, %%mm1 \n\t"
554 "movq %%mm0, %%mm2 \n\t"
555 "pslld $16, %%mm0 \n\t"
556 "psrld $16, %%mm1 \n\t"
557 "pand "MANGLE(mask32r)", %%mm0 \n\t"
558 "pand "MANGLE(mask32g)", %%mm2 \n\t"
559 "pand "MANGLE(mask32b)", %%mm1 \n\t"
560 "por %%mm0, %%mm2 \n\t"
561 "por %%mm1, %%mm2 \n\t"
562 MOVNTQ" %%mm2, (%1, %%eax) \n\t"
563 "addl $8, %%eax \n\t"
564 "cmpl %2, %%eax \n\t"
566 :: "r" (src), "r"(dst), "r" (src_size)
570 __asm __volatile(SFENCE:::"memory");
571 __asm __volatile(EMMS:::"memory");
// Plain C variant, one pixel per iteration.
574 int num_pixels= src_size >> 2;
575 for(i=0; i<num_pixels; i++)
577 dst[4*i + 0] = src[4*i + 2];
578 dst[4*i + 1] = src[4*i + 1];
579 dst[4*i + 2] = src[4*i + 0];
/*
 * rgb24tobgr24: reverse the byte order inside every 3-byte pixel.
 * MMX path: 24 bytes (8 pixels) per iteration. Each of the three output qwords
 * is assembled from three overlapping source loads (2 bytes before, at, and
 * 2 bytes after the target position), each masked with one of mask24r /
 * mask24g / mask24b (mm5 / mm6 / mm7) and ORed together.
 * mmx_size starts at 23 - src_size and the base pointers are biased by
 * -mmx_size; the instructions that load and update the index register are not
 * visible in this excerpt. After the asm, src_size is recomputed as
 * 23 - mmx_size and a C loop handling 3 bytes per step covers what is left.
 */
584 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
588 int mmx_size= 23 - src_size;
590 "movq "MANGLE(mask24r)", %%mm5 \n\t"
591 "movq "MANGLE(mask24g)", %%mm6 \n\t"
592 "movq "MANGLE(mask24b)", %%mm7 \n\t"
595 PREFETCH" 32(%1, %%eax) \n\t"
596 "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
597 "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
598 "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
599 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
600 "pand %%mm5, %%mm0 \n\t"
601 "pand %%mm6, %%mm1 \n\t"
602 "pand %%mm7, %%mm2 \n\t"
603 "por %%mm0, %%mm1 \n\t"
604 "por %%mm2, %%mm1 \n\t"
605 "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
606 MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
607 "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
608 "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
609 "pand %%mm7, %%mm0 \n\t"
610 "pand %%mm5, %%mm1 \n\t"
611 "pand %%mm6, %%mm2 \n\t"
612 "por %%mm0, %%mm1 \n\t"
613 "por %%mm2, %%mm1 \n\t"
614 "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
615 MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
616 "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
617 "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
618 "pand %%mm6, %%mm0 \n\t"
619 "pand %%mm7, %%mm1 \n\t"
620 "pand %%mm5, %%mm2 \n\t"
621 "por %%mm0, %%mm1 \n\t"
622 "por %%mm2, %%mm1 \n\t"
623 MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
624 "addl $24, %%eax \n\t"
627 : "r" (src-mmx_size), "r"(dst-mmx_size)
630 __asm __volatile(SFENCE:::"memory");
631 __asm __volatile(EMMS:::"memory");
633 if(mmx_size==23) return; //finished, was multiple of 8
// Bytes left over for the C loop below.
636 src_size= 23 - mmx_size;
640 for(i=0; i<src_size; i+=3)
644 dst[i + 1] = src[i + 1];
645 dst[i + 2] = src[i + 0];
/*
 * yuvPlanartoyuy2: interleave separate Y, U and V planes into packed 4-byte
 * groups Y0 U Y1 V, one row per iteration of the outer loop over height.
 * Per row, chromWidth (= width>>1) groups are written:
 *   dst[4*i+0] = ysrc[2*i+0]; dst[4*i+1] = usrc[i];
 *   dst[4*i+2] = ysrc[2*i+1]; dst[4*i+3] = vsrc[i];
 * The MMX path does the same with 8 U, 8 V and 16 Y bytes (32 output bytes) per
 * iteration, eax counting chroma samples up to chromWidth.
 * vertLumPerChroma is used as a bit mask (vertLumPerChroma-1) in the test at the
 * end of the row, so it is expected to be a power of two; the wrappers in this
 * file pass 2 and 1. What happens when the test is true (presumably advancing
 * the chroma pointers) and how the strides are applied are not visible here.
 */
650 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
651 unsigned int width, unsigned int height,
652 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
655 const int chromWidth= width>>1;
656 for(y=0; y<height; y++)
659 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
661 "xorl %%eax, %%eax \n\t"
664 PREFETCH" 32(%1, %%eax, 2) \n\t"
665 PREFETCH" 32(%2, %%eax) \n\t"
666 PREFETCH" 32(%3, %%eax) \n\t"
667 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
668 "movq %%mm0, %%mm2 \n\t" // U(0)
669 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
670 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
671 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
673 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
674 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
675 "movq %%mm3, %%mm4 \n\t" // Y(0)
676 "movq %%mm5, %%mm6 \n\t" // Y(8)
677 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
678 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
679 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
680 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
682 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
683 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
684 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
685 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
687 "addl $8, %%eax \n\t"
688 "cmpl %4, %%eax \n\t"
690 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
// Plain C variant of one row.
695 for(i=0; i<chromWidth; i++)
697 dst[4*i+0] = ysrc[2*i+0];
698 dst[4*i+1] = usrc[i];
699 dst[4*i+2] = ysrc[2*i+1];
700 dst[4*i+3] = vsrc[i];
// True on the last luma row of each group of vertLumPerChroma rows.
703 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
720 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
721 * problem for anyone then tell me, and I'll fix it)
/* yv12toyuy2: thin wrapper that forwards every argument unchanged to
 * yuvPlanartoyuy2 with vertLumPerChroma = 2. */
723 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
724 unsigned int width, unsigned int height,
725 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
727 //FIXME interpolate chroma
728 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
733 * width should be a multiple of 16
/* yuv422ptoyuy2: thin wrapper that forwards every argument unchanged to
 * yuvPlanartoyuy2 with vertLumPerChroma = 1. */
735 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
736 unsigned int width, unsigned int height,
737 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
739 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
744 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
745 * problem for anyone then tell me, and I'll fix it)
/*
 * yuy2toyv12: split packed 4-byte groups Y0 U Y1 V from src into separate
 * ydst / udst / vdst planes. Rows are handled in pairs (y += 2):
 *   - one pass writes luma and chroma:
 *       ydst[2*i+0] = src[4*i+0]; udst[i] = src[4*i+1];
 *       ydst[2*i+1] = src[4*i+2]; vdst[i] = src[4*i+3];
 *   - a second pass writes luma only.
 * chromWidth (= width>>1) groups are processed per pass. How the pointers are
 * advanced by the strides between the passes is not visible in this excerpt.
 * MMX path: 32 source bytes per iteration; mm7 holds 0x00FF in every word and
 * selects the low (Y) bytes, psrlw by 8 selects the high (chroma) bytes, and
 * packuswb compacts the selected bytes.
 */
747 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
748 unsigned int width, unsigned int height,
749 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
752 const int chromWidth= width>>1;
753 for(y=0; y<height; y+=2)
757 "xorl %%eax, %%eax \n\t"
758 "pcmpeqw %%mm7, %%mm7 \n\t"
759 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
762 PREFETCH" 64(%0, %%eax, 4) \n\t"
763 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
764 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
765 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
766 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
767 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
768 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
769 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
770 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
771 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
772 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
774 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
776 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
777 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
778 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
779 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
780 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
781 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
782 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
783 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
784 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
785 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
787 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
// Second split: separate the interleaved UV bytes into a V and a U qword.
789 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
790 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
791 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
792 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
793 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
794 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
795 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
796 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
798 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
799 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
801 "addl $8, %%eax \n\t"
802 "cmpl %4, %%eax \n\t"
804 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// Luma-only pass. NOTE(review): no re-initialisation of mm7 is visible here, so
// this statement appears to rely on mm7 still holding the 0x00FF mask set up by
// the previous asm statement - confirm against the full source.
812 "xorl %%eax, %%eax \n\t"
815 PREFETCH" 64(%0, %%eax, 4) \n\t"
816 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
817 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
818 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
819 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
820 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
821 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
822 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
823 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
824 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
825 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
827 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
828 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
830 "addl $8, %%eax \n\t"
831 "cmpl %4, %%eax \n\t"
834 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// Plain C variant: luma and chroma pass.
839 for(i=0; i<chromWidth; i++)
841 ydst[2*i+0] = src[4*i+0];
842 udst[i] = src[4*i+1];
843 ydst[2*i+1] = src[4*i+2];
844 vdst[i] = src[4*i+3];
// Plain C variant: luma-only pass.
849 for(i=0; i<chromWidth; i++)
851 ydst[2*i+0] = src[4*i+0];
852 ydst[2*i+1] = src[4*i+2];
861 asm volatile( EMMS" \n\t"
/*
 * yvu9toyv12: the only operation visible in this excerpt is a single memcpy of
 * width*height bytes from ysrc to ydst. U/V handling is not implemented
 * according to the XXX note below.
 * NOTE(review): the copy shown does not use lumStride, so it is only correct
 * when the luma rows are stored back to back - confirm the callers guarantee that.
 */
867 static inline void RENAME(yvu9toyv12)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc,
868 uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
869 unsigned int width, unsigned int height, unsigned int lumStride, unsigned int chromStride)
872 memcpy(ydst, ysrc, width*height);
874 /* XXX: implement upscaling for U,V */
879 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
880 * problem for anyone then tell me, and I'll fix it)
881 * chrominance data is only taken from every second line; others are ignored. FIXME: write HQ version
/*
 * uyvytoyv12: split packed 4-byte groups U Y0 V Y1 from src into separate
 * ydst / udst / vdst planes. Rows are handled in pairs (y += 2):
 *   - one pass writes luma and chroma:
 *       udst[i] = src[4*i+0]; ydst[2*i+0] = src[4*i+1];
 *       vdst[i] = src[4*i+2]; ydst[2*i+1] = src[4*i+3];
 *   - a second pass writes luma only.
 * chromWidth (= width>>1) groups are processed per pass. How the pointers are
 * advanced by the strides between the passes is not visible in this excerpt.
 * MMX path: 32 source bytes per iteration; mm7 holds 0x00FF in every word and
 * selects the low (chroma) bytes, psrlw by 8 selects the high (Y) bytes, and
 * packuswb compacts the selected bytes.
 */
883 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
884 unsigned int width, unsigned int height,
885 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
888 const int chromWidth= width>>1;
889 for(y=0; y<height; y+=2)
893 "xorl %%eax, %%eax \n\t"
894 "pcmpeqw %%mm7, %%mm7 \n\t"
895 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
898 PREFETCH" 64(%0, %%eax, 4) \n\t"
899 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
900 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
901 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
902 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
903 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
904 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
905 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
906 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
907 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
908 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
910 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
912 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
913 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
914 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
915 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
916 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
917 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
918 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
919 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
920 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
921 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
923 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
// Second split: separate the interleaved UV bytes into a V and a U qword.
925 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
926 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
927 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
928 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
929 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
930 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
931 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
932 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
934 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
935 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
937 "addl $8, %%eax \n\t"
938 "cmpl %4, %%eax \n\t"
940 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// Luma-only pass: the Y bytes are the high byte of every word, so a shift is
// enough and mm7 is not used in the lines visible here.
948 "xorl %%eax, %%eax \n\t"
951 PREFETCH" 64(%0, %%eax, 4) \n\t"
952 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
953 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
954 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
955 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
956 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
957 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
958 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
959 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
960 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
961 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
963 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
964 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
966 "addl $8, %%eax \n\t"
967 "cmpl %4, %%eax \n\t"
970 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// Plain C variant: luma and chroma pass.
975 for(i=0; i<chromWidth; i++)
977 udst[i] = src[4*i+0];
978 ydst[2*i+0] = src[4*i+1];
979 vdst[i] = src[4*i+2];
980 ydst[2*i+1] = src[4*i+3];
// Plain C variant: luma-only pass.
985 for(i=0; i<chromWidth; i++)
987 ydst[2*i+0] = src[4*i+1];
988 ydst[2*i+1] = src[4*i+3];
997 asm volatile( EMMS" \n\t"
1005 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
1006 * problem for anyone then tell me, and I'll fix it)
1007 * chrominance data is only taken from every second line; others are ignored in the C version. FIXME: write HQ version
/*
 * rgb24toyv12: convert 3-byte pixels (byte 0 = b, byte 1 = g, byte 2 = r as read
 * by the C path) into separate ydst / udst / vdst planes, two rows per iteration
 * of the outer loops.
 *
 * C path (rows from the current y up to height):
 *   Y = ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16
 *   U = ((RU*r + GU*g + BU*b) >> RGB2YUV_SHIFT) + 128
 *   V = ((RV*r + GV*g + BV*b) >> RGB2YUV_SHIFT) + 128
 * The first inner loop computes Y, U and V, the second computes Y only; where
 * the values are stored is not visible in this excerpt.
 *
 * MMX path (rows while y < height-2):
 *   - luma asm: 24 source bytes (8 pixels) give 8 Y bytes per iteration, using
 *     pmaddwd with bgr2YCoeff and adding bgr2YOffset at the end;
 *   - chroma asm: pixels from two rows (operands 0 and 1, the second one being
 *     srcStride further on) are averaged 2x2, multiplied with bgr2UCoeff /
 *     bgr2VCoeff, and 4 U plus 4 V bytes are stored per iteration after adding
 *     bgr2UVOffset.
 *
 * NOTE(review): height is unsigned, so height-2 wraps to a huge value when
 * height is below 2 and the MMX row loop would then not terminate as intended.
 * NOTE(review): both asm statements use eax and ebx as index registers; the
 * clobber lists are not visible in this excerpt.
 */
1009 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
1010 unsigned int width, unsigned int height,
1011 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
1014 const int chromWidth= width>>1;
1016 for(y=0; y<height-2; y+=2)
// Luma: eax starts at -width (operand 2) and ebx = 3*eax is the source byte
// offset; the base pointers are pre-advanced by the same amounts.
1022 "movl %2, %%eax \n\t"
1023 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1024 "movq "MANGLE(w1111)", %%mm5 \n\t"
1025 "pxor %%mm7, %%mm7 \n\t"
1026 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1029 PREFETCH" 64(%0, %%ebx) \n\t"
// First four pixels: widen bytes to words, multiply-accumulate with the Y coefficients.
1030 "movd (%0, %%ebx), %%mm0 \n\t"
1031 "movd 3(%0, %%ebx), %%mm1 \n\t"
1032 "punpcklbw %%mm7, %%mm0 \n\t"
1033 "punpcklbw %%mm7, %%mm1 \n\t"
1034 "movd 6(%0, %%ebx), %%mm2 \n\t"
1035 "movd 9(%0, %%ebx), %%mm3 \n\t"
1036 "punpcklbw %%mm7, %%mm2 \n\t"
1037 "punpcklbw %%mm7, %%mm3 \n\t"
1038 "pmaddwd %%mm6, %%mm0 \n\t"
1039 "pmaddwd %%mm6, %%mm1 \n\t"
1040 "pmaddwd %%mm6, %%mm2 \n\t"
1041 "pmaddwd %%mm6, %%mm3 \n\t"
1042 #ifndef FAST_BGR2YV12
1043 "psrad $8, %%mm0 \n\t"
1044 "psrad $8, %%mm1 \n\t"
1045 "psrad $8, %%mm2 \n\t"
1046 "psrad $8, %%mm3 \n\t"
// pmaddwd with w1111 (mm5) adds the two partial sums of each pixel.
1048 "packssdw %%mm1, %%mm0 \n\t"
1049 "packssdw %%mm3, %%mm2 \n\t"
1050 "pmaddwd %%mm5, %%mm0 \n\t"
1051 "pmaddwd %%mm5, %%mm2 \n\t"
1052 "packssdw %%mm2, %%mm0 \n\t"
1053 "psraw $7, %%mm0 \n\t"
// Next four pixels, same steps, result in mm4.
1055 "movd 12(%0, %%ebx), %%mm4 \n\t"
1056 "movd 15(%0, %%ebx), %%mm1 \n\t"
1057 "punpcklbw %%mm7, %%mm4 \n\t"
1058 "punpcklbw %%mm7, %%mm1 \n\t"
1059 "movd 18(%0, %%ebx), %%mm2 \n\t"
1060 "movd 21(%0, %%ebx), %%mm3 \n\t"
1061 "punpcklbw %%mm7, %%mm2 \n\t"
1062 "punpcklbw %%mm7, %%mm3 \n\t"
1063 "pmaddwd %%mm6, %%mm4 \n\t"
1064 "pmaddwd %%mm6, %%mm1 \n\t"
1065 "pmaddwd %%mm6, %%mm2 \n\t"
1066 "pmaddwd %%mm6, %%mm3 \n\t"
1067 #ifndef FAST_BGR2YV12
1068 "psrad $8, %%mm4 \n\t"
1069 "psrad $8, %%mm1 \n\t"
1070 "psrad $8, %%mm2 \n\t"
1071 "psrad $8, %%mm3 \n\t"
1073 "packssdw %%mm1, %%mm4 \n\t"
1074 "packssdw %%mm3, %%mm2 \n\t"
1075 "pmaddwd %%mm5, %%mm4 \n\t"
1076 "pmaddwd %%mm5, %%mm2 \n\t"
1077 "addl $24, %%ebx \n\t"
1078 "packssdw %%mm2, %%mm4 \n\t"
1079 "psraw $7, %%mm4 \n\t"
// Pack the eight results to bytes, add the luma offset and store them.
1081 "packuswb %%mm4, %%mm0 \n\t"
1082 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1084 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
1085 "addl $8, %%eax \n\t"
1087 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
// Chroma: eax starts at operand 4 and ebx = 6*eax is the source byte offset.
1095 "movl %4, %%eax \n\t"
1096 "movq "MANGLE(w1111)", %%mm5 \n\t"
1097 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1098 "pxor %%mm7, %%mm7 \n\t"
1099 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1100 "addl %%ebx, %%ebx \n\t"
1103 PREFETCH" 64(%0, %%ebx) \n\t"
1104 PREFETCH" 64(%1, %%ebx) \n\t"
// Variant with PAVGB: average the two rows, then average with the copy shifted
// by 24 bits (one pixel) to combine horizontal neighbours.
1105 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1106 "movq (%0, %%ebx), %%mm0 \n\t"
1107 "movq (%1, %%ebx), %%mm1 \n\t"
1108 "movq 6(%0, %%ebx), %%mm2 \n\t"
1109 "movq 6(%1, %%ebx), %%mm3 \n\t"
1110 PAVGB" %%mm1, %%mm0 \n\t"
1111 PAVGB" %%mm3, %%mm2 \n\t"
1112 "movq %%mm0, %%mm1 \n\t"
1113 "movq %%mm2, %%mm3 \n\t"
1114 "psrlq $24, %%mm0 \n\t"
1115 "psrlq $24, %%mm2 \n\t"
1116 PAVGB" %%mm1, %%mm0 \n\t"
1117 PAVGB" %%mm3, %%mm2 \n\t"
1118 "punpcklbw %%mm7, %%mm0 \n\t"
1119 "punpcklbw %%mm7, %%mm2 \n\t"
// Variant without PAVGB (the separating preprocessor line is not visible here):
// sum the four pixels of each 2x2 block as words and divide by 4.
1121 "movd (%0, %%ebx), %%mm0 \n\t"
1122 "movd (%1, %%ebx), %%mm1 \n\t"
1123 "movd 3(%0, %%ebx), %%mm2 \n\t"
1124 "movd 3(%1, %%ebx), %%mm3 \n\t"
1125 "punpcklbw %%mm7, %%mm0 \n\t"
1126 "punpcklbw %%mm7, %%mm1 \n\t"
1127 "punpcklbw %%mm7, %%mm2 \n\t"
1128 "punpcklbw %%mm7, %%mm3 \n\t"
1129 "paddw %%mm1, %%mm0 \n\t"
1130 "paddw %%mm3, %%mm2 \n\t"
1131 "paddw %%mm2, %%mm0 \n\t"
1132 "movd 6(%0, %%ebx), %%mm4 \n\t"
1133 "movd 6(%1, %%ebx), %%mm1 \n\t"
1134 "movd 9(%0, %%ebx), %%mm2 \n\t"
1135 "movd 9(%1, %%ebx), %%mm3 \n\t"
1136 "punpcklbw %%mm7, %%mm4 \n\t"
1137 "punpcklbw %%mm7, %%mm1 \n\t"
1138 "punpcklbw %%mm7, %%mm2 \n\t"
1139 "punpcklbw %%mm7, %%mm3 \n\t"
1140 "paddw %%mm1, %%mm4 \n\t"
1141 "paddw %%mm3, %%mm2 \n\t"
1142 "paddw %%mm4, %%mm2 \n\t"
1143 "psrlw $2, %%mm0 \n\t"
1144 "psrlw $2, %%mm2 \n\t"
// Averaged pixels times V coefficients (mm1, mm3) and U coefficients (mm0, mm2).
1146 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1147 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1149 "pmaddwd %%mm0, %%mm1 \n\t"
1150 "pmaddwd %%mm2, %%mm3 \n\t"
1151 "pmaddwd %%mm6, %%mm0 \n\t"
1152 "pmaddwd %%mm6, %%mm2 \n\t"
1153 #ifndef FAST_BGR2YV12
1154 "psrad $8, %%mm0 \n\t"
1155 "psrad $8, %%mm1 \n\t"
1156 "psrad $8, %%mm2 \n\t"
1157 "psrad $8, %%mm3 \n\t"
1159 "packssdw %%mm2, %%mm0 \n\t"
1160 "packssdw %%mm3, %%mm1 \n\t"
1161 "pmaddwd %%mm5, %%mm0 \n\t"
1162 "pmaddwd %%mm5, %%mm1 \n\t"
1163 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1164 "psraw $7, %%mm0 \n\t"
// Second half of the iteration: source bytes 12..23 of both rows.
1166 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1167 "movq 12(%0, %%ebx), %%mm4 \n\t"
1168 "movq 12(%1, %%ebx), %%mm1 \n\t"
1169 "movq 18(%0, %%ebx), %%mm2 \n\t"
1170 "movq 18(%1, %%ebx), %%mm3 \n\t"
1171 PAVGB" %%mm1, %%mm4 \n\t"
1172 PAVGB" %%mm3, %%mm2 \n\t"
1173 "movq %%mm4, %%mm1 \n\t"
1174 "movq %%mm2, %%mm3 \n\t"
1175 "psrlq $24, %%mm4 \n\t"
1176 "psrlq $24, %%mm2 \n\t"
1177 PAVGB" %%mm1, %%mm4 \n\t"
1178 PAVGB" %%mm3, %%mm2 \n\t"
1179 "punpcklbw %%mm7, %%mm4 \n\t"
1180 "punpcklbw %%mm7, %%mm2 \n\t"
1182 "movd 12(%0, %%ebx), %%mm4 \n\t"
1183 "movd 12(%1, %%ebx), %%mm1 \n\t"
1184 "movd 15(%0, %%ebx), %%mm2 \n\t"
1185 "movd 15(%1, %%ebx), %%mm3 \n\t"
1186 "punpcklbw %%mm7, %%mm4 \n\t"
1187 "punpcklbw %%mm7, %%mm1 \n\t"
1188 "punpcklbw %%mm7, %%mm2 \n\t"
1189 "punpcklbw %%mm7, %%mm3 \n\t"
1190 "paddw %%mm1, %%mm4 \n\t"
1191 "paddw %%mm3, %%mm2 \n\t"
1192 "paddw %%mm2, %%mm4 \n\t"
// mm5 is borrowed as a temporary here and reloaded with w1111 a few lines below.
1193 "movd 18(%0, %%ebx), %%mm5 \n\t"
1194 "movd 18(%1, %%ebx), %%mm1 \n\t"
1195 "movd 21(%0, %%ebx), %%mm2 \n\t"
1196 "movd 21(%1, %%ebx), %%mm3 \n\t"
1197 "punpcklbw %%mm7, %%mm5 \n\t"
1198 "punpcklbw %%mm7, %%mm1 \n\t"
1199 "punpcklbw %%mm7, %%mm2 \n\t"
1200 "punpcklbw %%mm7, %%mm3 \n\t"
1201 "paddw %%mm1, %%mm5 \n\t"
1202 "paddw %%mm3, %%mm2 \n\t"
1203 "paddw %%mm5, %%mm2 \n\t"
1204 "movq "MANGLE(w1111)", %%mm5 \n\t"
1205 "psrlw $2, %%mm4 \n\t"
1206 "psrlw $2, %%mm2 \n\t"
1208 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1209 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1211 "pmaddwd %%mm4, %%mm1 \n\t"
1212 "pmaddwd %%mm2, %%mm3 \n\t"
1213 "pmaddwd %%mm6, %%mm4 \n\t"
1214 "pmaddwd %%mm6, %%mm2 \n\t"
1215 #ifndef FAST_BGR2YV12
1216 "psrad $8, %%mm4 \n\t"
1217 "psrad $8, %%mm1 \n\t"
1218 "psrad $8, %%mm2 \n\t"
1219 "psrad $8, %%mm3 \n\t"
1221 "packssdw %%mm2, %%mm4 \n\t"
1222 "packssdw %%mm3, %%mm1 \n\t"
1223 "pmaddwd %%mm5, %%mm4 \n\t"
1224 "pmaddwd %%mm5, %%mm1 \n\t"
1225 "addl $24, %%ebx \n\t"
1226 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1227 "psraw $7, %%mm4 \n\t"
// Regroup into four U and four V values, pack to bytes, add the chroma offset.
1229 "movq %%mm0, %%mm1 \n\t"
1230 "punpckldq %%mm4, %%mm0 \n\t"
1231 "punpckhdq %%mm4, %%mm1 \n\t"
1232 "packsswb %%mm1, %%mm0 \n\t"
1233 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
// Low dword goes to operand 2 (udst based), high dword to operand 3 (vdst based).
1235 "movd %%mm0, (%2, %%eax) \n\t"
1236 "punpckhdq %%mm0, %%mm0 \n\t"
1237 "movd %%mm0, (%3, %%eax) \n\t"
1238 "addl $4, %%eax \n\t"
// NOTE(review): eax starts at -width and grows by 4 while 24 source bytes
// (8 pixels per row) are consumed per iteration. If the loop runs until eax
// reaches 0 (the exit test is not visible), it covers 2*width pixels per row and
// writes width chroma bytes instead of chromWidth - confirm whether these
// operands should be based on chromWidth.
1240 : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
1244 udst += chromStride;
1245 vdst += chromStride;
1249 asm volatile( EMMS" \n\t"
// C path for the rows not handled above.
1255 for(; y<height; y+=2)
1258 for(i=0; i<chromWidth; i++)
1260 unsigned int b= src[6*i+0];
1261 unsigned int g= src[6*i+1];
1262 unsigned int r= src[6*i+2];
1264 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1265 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
1266 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
1276 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
// Luma-only loop.
1282 for(i=0; i<chromWidth; i++)
1284 unsigned int b= src[6*i+0];
1285 unsigned int g= src[6*i+1];
1286 unsigned int r= src[6*i+2];
1288 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1296 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1299 udst += chromStride;
1300 vdst += chromStride;
/*
 * interleaveBytes: for every row h in [0, height) write
 *   dest[2*w+0] = src1[w]; dest[2*w+1] = src2[w];   for w in [0, width)
 * Unlike the other functions in this excerpt it is not declared static inline.
 * Three variants are visible; the preprocessor lines selecting between them and
 * the per-row stride handling are not.
 *   - xmm variant: 16 bytes of each source per iteration, eax compared with
 *     width-15. NOTE(review): movdqa and movntdq require 16-byte aligned
 *     addresses; no alignment check is visible here - confirm the callers.
 *   - MMX variant: also 16 bytes of each source per iteration (two qwords each).
 *   - after the SIMD loop a C loop handles the columns from width&~15 onwards;
 *     the plain C variant loops over the full width.
 */
1306 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
1307 int width, int height, int src1Stride, int src2Stride, int dstStride){
1310 for(h=0; h < height; h++)
1317 "xorl %%eax, %%eax \n\t"
1319 PREFETCH" 64(%1, %%eax) \n\t"
1320 PREFETCH" 64(%2, %%eax) \n\t"
1321 "movdqa (%1, %%eax), %%xmm0 \n\t"
1322 "movdqa (%1, %%eax), %%xmm1 \n\t"
1323 "movdqa (%2, %%eax), %%xmm2 \n\t"
// Low and high halves of src1 interleaved with the matching halves of src2.
1324 "punpcklbw %%xmm2, %%xmm0 \n\t"
1325 "punpckhbw %%xmm2, %%xmm1 \n\t"
1326 "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
1327 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
1328 "addl $16, %%eax \n\t"
1329 "cmpl %3, %%eax \n\t"
1331 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
1336 "xorl %%eax, %%eax \n\t"
1338 PREFETCH" 64(%1, %%eax) \n\t"
1339 PREFETCH" 64(%2, %%eax) \n\t"
1340 "movq (%1, %%eax), %%mm0 \n\t"
1341 "movq 8(%1, %%eax), %%mm2 \n\t"
1342 "movq %%mm0, %%mm1 \n\t"
1343 "movq %%mm2, %%mm3 \n\t"
1344 "movq (%2, %%eax), %%mm4 \n\t"
1345 "movq 8(%2, %%eax), %%mm5 \n\t"
1346 "punpcklbw %%mm4, %%mm0 \n\t"
1347 "punpckhbw %%mm4, %%mm1 \n\t"
1348 "punpcklbw %%mm5, %%mm2 \n\t"
1349 "punpckhbw %%mm5, %%mm3 \n\t"
1350 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
1351 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
1352 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
1353 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
1354 "addl $16, %%eax \n\t"
1355 "cmpl %3, %%eax \n\t"
1357 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
// Columns left over after the 16-byte SIMD iterations.
1361 for(w= (width&(~15)); w < width; w++)
1363 dest[2*w+0] = src1[w];
1364 dest[2*w+1] = src2[w];
// Plain C variant covering the whole row.
1367 for(w=0; w < width; w++)
1369 dest[2*w+0] = src1[w];
1370 dest[2*w+1] = src2[w];