3 * rgb2rgb.c, Software RGB to RGB converter
4 * pluralized by Software PAL8 to RGB converter
5 * Software YUV to YUV converter
6 * Software YUV to RGB converter
7 * Written by Nick Kurshev.
8 * palette & yuv & runtime cpu stuff by Michael (michaelni@gmx.at) (under GPL)
/* Instruction-name strings that are pasted into the inline asm of the
 * converters below. The first group defines the prefetch and byte-average
 * mnemonics ("prefetch", "prefetchw", "pavgusb").
 * NOTE(review): the #if that opens this group is not shown here -- presumably
 * the 3DNow! case; confirm. */
26 #define PREFETCH "prefetch"
27 #define PREFETCHW "prefetchw"
28 #define PAVGB "pavgusb"
29 #elif defined ( HAVE_MMX2 )
/* HAVE_MMX2 variant of the prefetch mnemonics. */
30 #define PREFETCH "prefetchnta"
31 #define PREFETCHW "prefetcht0"
/* NOTE(review): presumably the fallback branch, where both prefetch macros
 * expand to the string "/nop" -- confirm the guarding #else. */
34 #define PREFETCH "/nop"
35 #define PREFETCHW "/nop"
39 /* On K6 femms is faster than emms. On K7 femms is directly mapped on emms. */
/* Mnemonics for the non-temporal 64-bit store and the store fence. */
46 #define MOVNTQ "movntq"
47 #define SFENCE "sfence"
/*
 * rgb24to32: widen packed 3-byte pixels read from src into 4-byte pixels
 * written to dst.
 *
 * src      - source pixel data (read only)
 * dst      - destination buffer
 * src_size - NOTE(review): presumably the source length in bytes; confirm
 *
 * The MMX code below gathers source dwords at 3-byte steps, two per MMX
 * register, ANDs each register with mask32 and writes the result with
 * non-temporal stores.
 */
53 static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,unsigned src_size)
56 const uint8_t *s = src;
59 const uint8_t *mm_end;
63 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* mm7 is loaded with mask32 and ANDed into mm0..mm3 further down. */
65 __asm __volatile("movq %0, %%mm7"::"m"(mask32):"memory");
/* The high half of each register is filled from a source offset 3 bytes
 * after its low half (3, 9, 15, 21), i.e. from the next 3-byte pixel. */
71 "punpckldq 3%1, %%mm0\n\t"
73 "punpckldq 9%1, %%mm1\n\t"
74 "movd 12%1, %%mm2\n\t"
75 "punpckldq 15%1, %%mm2\n\t"
76 "movd 18%1, %%mm3\n\t"
77 "punpckldq 21%1, %%mm3\n\t"
78 "pand %%mm7, %%mm0\n\t"
79 "pand %%mm7, %%mm1\n\t"
80 "pand %%mm7, %%mm2\n\t"
81 "pand %%mm7, %%mm3\n\t"
/* Each 8-byte store carries two 4-byte output pixels. */
82 MOVNTQ" %%mm0, %0\n\t"
83 MOVNTQ" %%mm1, 8%0\n\t"
84 MOVNTQ" %%mm2, 16%0\n\t"
/* Fence the non-temporal stores, then leave MMX state. */
92 __asm __volatile(SFENCE:::"memory");
93 __asm __volatile(EMMS:::"memory");
/*
 * rgb32to24: pack 4-byte pixels read from src into 3-byte pixels written
 * to dst.
 *
 * src      - source pixel data (read only)
 * dst      - destination buffer
 * src_size - NOTE(review): presumably the source length in bytes; confirm
 *
 * The MMX code loads quadwords from the source, combines each register with
 * a copy of itself shifted right by 8 bits, and then shifts/ORs neighbouring
 * registers together so the output quadwords are densely packed.
 * The mask24l/mask24h/mask24hh/mask24hhh/mask24hhhh constants are passed as
 * memory operands.
 */
104 static inline void RENAME(rgb32to24)(const uint8_t *src,uint8_t *dst,unsigned src_size)
107 const uint8_t *s = src;
110 const uint8_t *mm_end;
114 __asm __volatile(PREFETCH" %0"::"m"(*s):"memory");
/* Load source quadwords and keep a second copy of each in mm2/mm3/mm6/mm7. */
121 "movq 8%1, %%mm1\n\t"
122 "movq 16%1, %%mm4\n\t"
123 "movq 24%1, %%mm5\n\t"
124 "movq %%mm0, %%mm2\n\t"
125 "movq %%mm1, %%mm3\n\t"
126 "movq %%mm4, %%mm6\n\t"
127 "movq %%mm5, %%mm7\n\t"
/* The copies are moved down by one byte before being merged back in. */
128 "psrlq $8, %%mm2\n\t"
129 "psrlq $8, %%mm3\n\t"
130 "psrlq $8, %%mm6\n\t"
131 "psrlq $8, %%mm7\n\t"
140 "por %%mm2, %%mm0\n\t"
141 "por %%mm3, %%mm1\n\t"
142 "por %%mm6, %%mm4\n\t"
143 "por %%mm7, %%mm5\n\t"
/* Splice adjacent registers: bits shifted out of one register's low end
 * are ORed into the free high bits of the previous one. */
145 "movq %%mm1, %%mm2\n\t"
146 "movq %%mm4, %%mm3\n\t"
147 "psllq $48, %%mm2\n\t"
148 "psllq $32, %%mm3\n\t"
151 "por %%mm2, %%mm0\n\t"
152 "psrlq $16, %%mm1\n\t"
153 "psrlq $32, %%mm4\n\t"
154 "psllq $16, %%mm5\n\t"
155 "por %%mm3, %%mm1\n\t"
157 "por %%mm5, %%mm4\n\t"
159 MOVNTQ" %%mm0, %0\n\t"
160 MOVNTQ" %%mm1, 8%0\n\t"
163 :"m"(*s),"m"(mask24l),
164 "m"(mask24h),"m"(mask24hh),"m"(mask24hhh),"m"(mask24hhhh)
/* Fence the non-temporal stores, then leave MMX state. */
169 __asm __volatile(SFENCE:::"memory");
170 __asm __volatile(EMMS:::"memory");
182 Original by Strepto/Astral
183 ported to gcc & bugfixed : A'rpi
184 MMX2, 3DNOW optimization by Nick Kurshev
185 32bit c version, and and&add trick by Michael Niedermayer
/*
 * rgb15to16: convert 15-bit pixels to 16-bit pixels by moving bits 5..14 of
 * every 16-bit word up by one position while leaving the low 5 bits in place.
 *
 * src      - source pixel data (read only)
 * dst      - destination buffer
 * src_size - source length in bytes (the 32-bit C variant derives its
 *            iteration count as src_size>>2)
 *
 * Three variants appear below: an MMX one, a 16-bit-at-a-time C loop and a
 * 32-bit-at-a-time C loop that handles two pixels per iteration.
 */
187 static inline void RENAME(rgb15to16)(const uint8_t *src,uint8_t *dst,unsigned src_size)
// offs is 15-src_size; both pointers are pre-biased by -offs so that s+offs / d+offs address the start of the buffers.
190 register int offs=15-src_size;
// NOTE(review): src/dst are uint8_t pointers but are stored in char pointers here without a cast.
191 register const char* s=src-offs;
192 register char* d=dst-offs;
193 __asm __volatile(PREFETCH" %0"::"m"(*(s+offs)));
// MMX: result = x + (x & mm4) for each 16-bit word.
// NOTE(review): mm4 presumably holds the 0x7FE0 mask in every word -- its load is not shown; confirm.
202 "movq 8%1, %%mm2\n\t"
203 "movq %%mm0, %%mm1\n\t"
204 "movq %%mm2, %%mm3\n\t"
205 "pand %%mm4, %%mm0\n\t"
206 "pand %%mm4, %%mm2\n\t"
207 "paddw %%mm1, %%mm0\n\t"
208 "paddw %%mm3, %%mm2\n\t"
209 MOVNTQ" %%mm0, %0\n\t"
216 __asm __volatile(SFENCE:::"memory");
217 __asm __volatile(EMMS:::"memory");
// 16-bit C variant: one pixel per step.
220 const uint16_t *s1=( uint16_t * )src;
221 uint16_t *d1=( uint16_t * )dst;
// NOTE(review): e is declared uint16_t * but is initialised from a uint8_t * expression without a cast.
222 uint16_t *e=((uint8_t *)s1)+src_size;
224 register int x=*( s1++ );
227 0111 1111 1110 0000=0x7FE0
228 00000000000001 1111=0x001F */
// Keep the low 5 bits, shift the field selected by 0x7FE0 left by one.
229 *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
// 32-bit C variant: two pixels per unsigned word.
232 const unsigned *s1=( unsigned * )src;
233 unsigned *d1=( unsigned * )dst;
235 int size= src_size>>2;
236 for(i=0; i<size; i++)
238 register int x= s1[i];
239 // d1[i] = x + (x&0x7FE07FE0); //faster but needs msbit = 0, which might not always be true
// Adding the masked field to the value (with each pixel's top bit cleared) doubles that field, i.e. shifts it left by one.
240 d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
/*
 * rgb32to16: convert 4-byte pixels at src to 16-bit pixels at dst.
 *
 * Per pixel, with b = byte 0, g = byte 1, r = byte 2 of the source pixel,
 * the output is (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); byte 3 is not used
 * by the C code shown.
 *
 * src      - source pixel data (read only)
 * dst      - destination buffer, written as uint16_t
 * src_size - source length in bytes (the C variant uses src_size/4 pixels)
 */
247 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
250 const uint8_t *s = src;
251 const uint8_t *end,*mm_end;
252 uint16_t *d = (uint16_t *)dst;
255 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
// red_16mask and green_16mask are handed to a set-up asm statement.
// NOTE(review): presumably they end up in mm7/mm6 respectively -- the instructions are not shown; confirm.
259 ::"m"(red_16mask),"m"(green_16mask));
// mm0 collects the pixels at byte offsets 0 and 8, mm3 those at 4 and 12.
265 "movd 4%1, %%mm3\n\t"
266 "punpckldq 8%1, %%mm0\n\t"
267 "punpckldq 12%1, %%mm3\n\t"
268 "movq %%mm0, %%mm1\n\t"
269 "movq %%mm0, %%mm2\n\t"
270 "movq %%mm3, %%mm4\n\t"
271 "movq %%mm3, %%mm5\n\t"
// Three shifted copies (by 3, 5 and 8 bits) line each channel up with its 16-bit field.
272 "psrlq $3, %%mm0\n\t"
273 "psrlq $3, %%mm3\n\t"
276 "psrlq $5, %%mm1\n\t"
277 "psrlq $5, %%mm4\n\t"
278 "pand %%mm6, %%mm1\n\t"
279 "pand %%mm6, %%mm4\n\t"
280 "psrlq $8, %%mm2\n\t"
281 "psrlq $8, %%mm5\n\t"
282 "pand %%mm7, %%mm2\n\t"
283 "pand %%mm7, %%mm5\n\t"
284 "por %%mm1, %%mm0\n\t"
285 "por %%mm4, %%mm3\n\t"
286 "por %%mm2, %%mm0\n\t"
287 "por %%mm5, %%mm3\n\t"
// Interleave: mm3's results move up 16 bits and merge with mm0, giving four 16-bit pixels.
288 "psllq $16, %%mm3\n\t"
289 "por %%mm3, %%mm0\n\t"
290 MOVNTQ" %%mm0, %0\n\t"
291 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
// Scalar form of the same packing (NOTE(review): presumably the tail after the MMX loop; confirm).
301 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
303 __asm __volatile(SFENCE:::"memory");
304 __asm __volatile(EMMS:::"memory");
// Plain C variant: i indexes source bytes, j indexes output pixels.
306 unsigned j,i,num_pixels=src_size/4;
307 uint16_t *d = (uint16_t *)dst;
308 for(i=0,j=0; j<num_pixels; i+=4,j++)
310 const int b= src[i+0];
311 const int g= src[i+1];
312 const int r= src[i+2];
314 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb32to15: convert 4-byte pixels at src to 15-bit pixels (stored in
 * uint16_t) at dst.
 *
 * Per pixel, with b = byte 0, g = byte 1, r = byte 2 of the source pixel,
 * the output is (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); byte 3 is not used
 * by the C code shown.
 *
 * src      - source pixel data (read only)
 * dst      - destination buffer, written as uint16_t
 * src_size - source length in bytes (the C variant uses src_size/4 pixels)
 */
319 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
322 const uint8_t *s = src;
323 const uint8_t *end,*mm_end;
324 uint16_t *d = (uint16_t *)dst;
327 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
// red_15mask and green_15mask are handed to a set-up asm statement.
// NOTE(review): presumably they end up in mm7/mm6 respectively -- the instructions are not shown; confirm.
331 ::"m"(red_15mask),"m"(green_15mask));
// mm0 collects the pixels at byte offsets 0 and 8, mm3 those at 4 and 12.
337 "movd 4%1, %%mm3\n\t"
338 "punpckldq 8%1, %%mm0\n\t"
339 "punpckldq 12%1, %%mm3\n\t"
340 "movq %%mm0, %%mm1\n\t"
341 "movq %%mm0, %%mm2\n\t"
342 "movq %%mm3, %%mm4\n\t"
343 "movq %%mm3, %%mm5\n\t"
// Three shifted copies (by 3, 6 and 9 bits) line each channel up with its 5-bit field.
344 "psrlq $3, %%mm0\n\t"
345 "psrlq $3, %%mm3\n\t"
348 "psrlq $6, %%mm1\n\t"
349 "psrlq $6, %%mm4\n\t"
350 "pand %%mm6, %%mm1\n\t"
351 "pand %%mm6, %%mm4\n\t"
352 "psrlq $9, %%mm2\n\t"
353 "psrlq $9, %%mm5\n\t"
354 "pand %%mm7, %%mm2\n\t"
355 "pand %%mm7, %%mm5\n\t"
356 "por %%mm1, %%mm0\n\t"
357 "por %%mm4, %%mm3\n\t"
358 "por %%mm2, %%mm0\n\t"
359 "por %%mm5, %%mm3\n\t"
// Interleave: mm3's results move up 16 bits and merge with mm0, giving four output pixels.
360 "psllq $16, %%mm3\n\t"
361 "por %%mm3, %%mm0\n\t"
362 MOVNTQ" %%mm0, %0\n\t"
363 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
// Scalar form of the same packing (NOTE(review): presumably the tail after the MMX loop; confirm).
373 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
375 __asm __volatile(SFENCE:::"memory");
376 __asm __volatile(EMMS:::"memory");
// Plain C variant: i indexes source bytes, j indexes output pixels.
378 unsigned j,i,num_pixels=src_size/4;
379 uint16_t *d = (uint16_t *)dst;
380 for(i=0,j=0; j<num_pixels; i+=4,j++)
382 const int b= src[i+0];
383 const int g= src[i+1];
384 const int r= src[i+2];
386 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * rgb24to16: convert packed 3-byte pixels at src to 16-bit pixels at dst.
 *
 * Per pixel, with b = byte 0, g = byte 1, r = byte 2 of the source pixel,
 * the output is (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8).
 *
 * src      - source pixel data (read only)
 * dst      - destination buffer, written as uint16_t
 * src_size - source length in bytes (the C variant uses src_size/3 pixels)
 */
391 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, unsigned src_size)
394 const uint8_t *s = src;
395 const uint8_t *end,*mm_end;
396 uint16_t *d = (uint16_t *)dst;
399 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
// red_16mask and green_16mask are handed to a set-up asm statement.
// NOTE(review): presumably they end up in mm7/mm6 respectively -- the instructions are not shown; confirm.
403 ::"m"(red_16mask),"m"(green_16mask));
// Dword loads at 3-byte steps: mm0 gets the pixels at offsets 0 and 6, mm3 those at 3 and 9.
409 "movd 3%1, %%mm3\n\t"
410 "punpckldq 6%1, %%mm0\n\t"
411 "punpckldq 9%1, %%mm3\n\t"
412 "movq %%mm0, %%mm1\n\t"
413 "movq %%mm0, %%mm2\n\t"
414 "movq %%mm3, %%mm4\n\t"
415 "movq %%mm3, %%mm5\n\t"
// Three shifted copies (by 3, 5 and 8 bits) line each channel up with its 16-bit field.
416 "psrlq $3, %%mm0\n\t"
417 "psrlq $3, %%mm3\n\t"
420 "psrlq $5, %%mm1\n\t"
421 "psrlq $5, %%mm4\n\t"
422 "pand %%mm6, %%mm1\n\t"
423 "pand %%mm6, %%mm4\n\t"
424 "psrlq $8, %%mm2\n\t"
425 "psrlq $8, %%mm5\n\t"
426 "pand %%mm7, %%mm2\n\t"
427 "pand %%mm7, %%mm5\n\t"
428 "por %%mm1, %%mm0\n\t"
429 "por %%mm4, %%mm3\n\t"
430 "por %%mm2, %%mm0\n\t"
431 "por %%mm5, %%mm3\n\t"
// Interleave: mm3's results move up 16 bits and merge with mm0, giving four 16-bit pixels.
432 "psllq $16, %%mm3\n\t"
433 "por %%mm3, %%mm0\n\t"
434 MOVNTQ" %%mm0, %0\n\t"
435 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
// Scalar form of the same packing (NOTE(review): presumably the tail after the MMX loop; confirm).
444 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
446 __asm __volatile(SFENCE:::"memory");
447 __asm __volatile(EMMS:::"memory");
// Plain C variant: i indexes source bytes (3 per pixel), j indexes output pixels.
449 unsigned j,i,num_pixels=src_size/3;
450 uint16_t *d = (uint16_t *)dst;
451 for(i=0,j=0; j<num_pixels; i+=3,j++)
453 const int b= src[i+0];
454 const int g= src[i+1];
455 const int r= src[i+2];
457 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
/*
 * rgb24to15: convert packed 3-byte pixels at src to 15-bit pixels (stored
 * in uint16_t) at dst.
 *
 * Per pixel, with b = byte 0, g = byte 1, r = byte 2 of the source pixel,
 * the output is (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7).
 *
 * src      - source pixel data (read only)
 * dst      - destination buffer, written as uint16_t
 * src_size - source length in bytes (the C variant uses src_size/3 pixels)
 */
462 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, unsigned src_size)
465 const uint8_t *s = src;
466 const uint8_t *end,*mm_end;
467 uint16_t *d = (uint16_t *)dst;
470 __asm __volatile(PREFETCH" %0"::"m"(*src):"memory");
// red_15mask and green_15mask are handed to a set-up asm statement.
// NOTE(review): presumably they end up in mm7/mm6 respectively -- the instructions are not shown; confirm.
474 ::"m"(red_15mask),"m"(green_15mask));
// Dword loads at 3-byte steps: mm0 gets the pixels at offsets 0 and 6, mm3 those at 3 and 9.
480 "movd 3%1, %%mm3\n\t"
481 "punpckldq 6%1, %%mm0\n\t"
482 "punpckldq 9%1, %%mm3\n\t"
483 "movq %%mm0, %%mm1\n\t"
484 "movq %%mm0, %%mm2\n\t"
485 "movq %%mm3, %%mm4\n\t"
486 "movq %%mm3, %%mm5\n\t"
// Three shifted copies (by 3, 6 and 9 bits) line each channel up with its 5-bit field.
487 "psrlq $3, %%mm0\n\t"
488 "psrlq $3, %%mm3\n\t"
491 "psrlq $6, %%mm1\n\t"
492 "psrlq $6, %%mm4\n\t"
493 "pand %%mm6, %%mm1\n\t"
494 "pand %%mm6, %%mm4\n\t"
495 "psrlq $9, %%mm2\n\t"
496 "psrlq $9, %%mm5\n\t"
497 "pand %%mm7, %%mm2\n\t"
498 "pand %%mm7, %%mm5\n\t"
499 "por %%mm1, %%mm0\n\t"
500 "por %%mm4, %%mm3\n\t"
501 "por %%mm2, %%mm0\n\t"
502 "por %%mm5, %%mm3\n\t"
// Interleave: mm3's results move up 16 bits and merge with mm0, giving four output pixels.
503 "psllq $16, %%mm3\n\t"
504 "por %%mm3, %%mm0\n\t"
505 MOVNTQ" %%mm0, %0\n\t"
506 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
// Scalar form of the same packing (NOTE(review): presumably the tail after the MMX loop; confirm).
515 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
517 __asm __volatile(SFENCE:::"memory");
518 __asm __volatile(EMMS:::"memory");
// Plain C variant: i indexes source bytes (3 per pixel), j indexes output pixels.
520 unsigned j,i,num_pixels=src_size/3;
521 uint16_t *d = (uint16_t *)dst;
522 for(i=0,j=0; j<num_pixels; i+=3,j++)
524 const int b= src[i+0];
525 const int g= src[i+1];
526 const int r= src[i+2];
528 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
/*
 * rgb32tobgr32: swap bytes 0 and 2 of every 4-byte pixel while copying
 * from src to dst; byte 1 is copied unchanged.
 *
 * src      - source pixel data (read only)
 * dst      - destination buffer
 * src_size - source length in bytes (the C variant uses src_size>>2 pixels)
 *
 * NOTE(review): the C lines shown never write dst[4*i + 3]; confirm whether
 * that is intentional.
 */
533 static inline void RENAME(rgb32tobgr32)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
// MMX variant: eax is the running byte offset into both buffers, starting at 0.
537 "xorl %%eax, %%eax \n\t"
540 PREFETCH" 32(%0, %%eax) \n\t"
541 "movq (%0, %%eax), %%mm0 \n\t"
542 "movq %%mm0, %%mm1 \n\t"
543 "movq %%mm0, %%mm2 \n\t"
// One copy is shifted up 16 bits per dword, one down 16 bits, one left in place;
// each is masked (mask32r / mask32b / mask32g) and the three are ORed together.
544 "pslld $16, %%mm0 \n\t"
545 "psrld $16, %%mm1 \n\t"
546 "pand "MANGLE(mask32r)", %%mm0 \n\t"
547 "pand "MANGLE(mask32g)", %%mm2 \n\t"
548 "pand "MANGLE(mask32b)", %%mm1 \n\t"
549 "por %%mm0, %%mm2 \n\t"
550 "por %%mm1, %%mm2 \n\t"
551 MOVNTQ" %%mm2, (%1, %%eax) \n\t"
// 8 bytes (two pixels) per iteration; the offset is compared against src_size.
552 "addl $8, %%eax \n\t"
553 "cmpl %2, %%eax \n\t"
555 :: "r" (src), "r"(dst), "r" (src_size)
559 __asm __volatile(SFENCE:::"memory");
560 __asm __volatile(EMMS:::"memory");
// Plain C variant.
563 int num_pixels= src_size >> 2;
564 for(i=0; i<num_pixels; i++)
566 dst[4*i + 0] = src[4*i + 2];
567 dst[4*i + 1] = src[4*i + 1];
568 dst[4*i + 2] = src[4*i + 0];
/*
 * rgb24tobgr24: reverse the byte order of every packed 3-byte pixel while
 * copying from src to dst (see the per-line layout comments in the asm).
 *
 * src      - source pixel data (read only)
 * dst      - destination buffer
 * src_size - source length in bytes
 *
 * The MMX loop handles 24 bytes (eight pixels) per iteration as three
 * 8-byte stores; each output quadword is assembled from three overlapping
 * source loads masked with mask24r/mask24g/mask24b.
 */
573 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, unsigned int src_size)
// The asm is given src-mmx_size and dst-mmx_size as base pointers and indexes them with eax.
577 int mmx_size= 23 - src_size;
579 "movq "MANGLE(mask24r)", %%mm5 \n\t"
580 "movq "MANGLE(mask24g)", %%mm6 \n\t"
581 "movq "MANGLE(mask24b)", %%mm7 \n\t"
584 PREFETCH" 32(%1, %%eax) \n\t"
585 "movq (%1, %%eax), %%mm0 \n\t" // BGR BGR BG
586 "movq (%1, %%eax), %%mm1 \n\t" // BGR BGR BG
587 "movq 2(%1, %%eax), %%mm2 \n\t" // R BGR BGR B
588 "psllq $16, %%mm0 \n\t" // 00 BGR BGR
589 "pand %%mm5, %%mm0 \n\t"
590 "pand %%mm6, %%mm1 \n\t"
591 "pand %%mm7, %%mm2 \n\t"
592 "por %%mm0, %%mm1 \n\t"
593 "por %%mm2, %%mm1 \n\t"
594 "movq 6(%1, %%eax), %%mm0 \n\t" // BGR BGR BG
595 MOVNTQ" %%mm1, (%2, %%eax) \n\t" // RGB RGB RG
596 "movq 8(%1, %%eax), %%mm1 \n\t" // R BGR BGR B
597 "movq 10(%1, %%eax), %%mm2 \n\t" // GR BGR BGR
598 "pand %%mm7, %%mm0 \n\t"
599 "pand %%mm5, %%mm1 \n\t"
600 "pand %%mm6, %%mm2 \n\t"
601 "por %%mm0, %%mm1 \n\t"
602 "por %%mm2, %%mm1 \n\t"
603 "movq 14(%1, %%eax), %%mm0 \n\t" // R BGR BGR B
604 MOVNTQ" %%mm1, 8(%2, %%eax) \n\t" // B RGB RGB R
605 "movq 16(%1, %%eax), %%mm1 \n\t" // GR BGR BGR
606 "movq 18(%1, %%eax), %%mm2 \n\t" // BGR BGR BG
607 "pand %%mm6, %%mm0 \n\t"
608 "pand %%mm7, %%mm1 \n\t"
609 "pand %%mm5, %%mm2 \n\t"
610 "por %%mm0, %%mm1 \n\t"
611 "por %%mm2, %%mm1 \n\t"
612 MOVNTQ" %%mm1, 16(%2, %%eax) \n\t"
613 "addl $24, %%eax \n\t"
616 : "r" (src-mmx_size), "r"(dst-mmx_size)
619 __asm __volatile(SFENCE:::"memory");
620 __asm __volatile(EMMS:::"memory");
// NOTE(review): the asm's output operand for mmx_size is not shown; presumably eax is written back to it -- confirm.
622 if(mmx_size==23) return; //finished, was multiple of 8
// Recompute the byte count left over for the scalar loop from the updated mmx_size.
625 src_size= 23 - mmx_size;
// Scalar loop, one 3-byte pixel per step.
629 for(i=0; i<src_size; i+=3)
633 dst[i + 1] = src[i + 1];
634 dst[i + 2] = src[i + 0];
/*
 * yuvPlanartoyuy2: interleave separate Y, U and V planes into packed
 * Y U Y V output, one line at a time.
 *
 * ysrc, usrc, vsrc - source planes (read only)
 * dst              - packed destination
 * width, height    - picture size; chromWidth is width>>1 and each line
 *                    produces 4*chromWidth output bytes
 * lumStride, chromStride, dstStride - NOTE(review): presumably the per-line
 *                    pointer increments; their use is not shown here
 * vertLumPerChroma - tested as (y & (vertLumPerChroma-1)) ==
 *                    (vertLumPerChroma-1) at the end of each line; the
 *                    callers in this file pass 2 and 1.
 *                    NOTE(review): presumably the number of luma lines that
 *                    share one chroma line, expected to be a power of two.
 */
639 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
640 unsigned int width, unsigned int height,
641 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride, int vertLumPerChroma)
644 const int chromWidth= width>>1;
645 for(y=0; y<height; y++)
648 //FIXME handle 2 lines at once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
// MMX variant: eax counts chroma samples; 8 U + 8 V + 16 Y bytes in, 32 bytes out per iteration.
650 "xorl %%eax, %%eax \n\t"
653 PREFETCH" 32(%1, %%eax, 2) \n\t"
654 PREFETCH" 32(%2, %%eax) \n\t"
655 PREFETCH" 32(%3, %%eax) \n\t"
656 "movq (%2, %%eax), %%mm0 \n\t" // U(0)
657 "movq %%mm0, %%mm2 \n\t" // U(0)
658 "movq (%3, %%eax), %%mm1 \n\t" // V(0)
659 "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
660 "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
662 "movq (%1, %%eax,2), %%mm3 \n\t" // Y(0)
663 "movq 8(%1, %%eax,2), %%mm5 \n\t" // Y(8)
664 "movq %%mm3, %%mm4 \n\t" // Y(0)
665 "movq %%mm5, %%mm6 \n\t" // Y(8)
666 "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
667 "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
668 "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
669 "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
671 MOVNTQ" %%mm3, (%0, %%eax, 4) \n\t"
672 MOVNTQ" %%mm4, 8(%0, %%eax, 4) \n\t"
673 MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
674 MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
676 "addl $8, %%eax \n\t"
677 "cmpl %4, %%eax \n\t"
679 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
// Plain C variant: two luma samples share one U and one V sample.
684 for(i=0; i<chromWidth; i++)
686 dst[4*i+0] = ysrc[2*i+0];
687 dst[4*i+1] = usrc[i];
688 dst[4*i+2] = ysrc[2*i+1];
689 dst[4*i+3] = vsrc[i];
// True on the last luma line of each group of vertLumPerChroma lines (for power-of-two values).
692 if((y&(vertLumPerChroma-1))==(vertLumPerChroma-1) )
709 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
710 * problem for anyone then tell me, and I'll fix it)
/*
 * yv12toyuy2: thin wrapper that forwards all arguments to yuvPlanartoyuy2
 * with vertLumPerChroma = 2.
 */
712 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
713 unsigned int width, unsigned int height,
714 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
716 //FIXME interpolate chroma
717 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
722 * width should be a multiple of 16
/*
 * yuv422ptoyuy2: thin wrapper that forwards all arguments to yuvPlanartoyuy2
 * with vertLumPerChroma = 1.
 */
724 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
725 unsigned int width, unsigned int height,
726 unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
728 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
733 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
734 * problem for anyone then tell me, and I'll fix it)
/*
 * yuy2toyv12: split packed Y U Y V input into separate Y, U and V planes.
 *
 * src              - packed source (read only)
 * ydst, udst, vdst - destination planes
 * width, height    - picture size; chromWidth is width>>1
 * lumStride, chromStride, srcStride - NOTE(review): presumably the per-line
 *                    pointer increments; their use is not shown here
 *
 * Lines are handled in pairs (y advances by 2). The first pass of each pair
 * extracts Y, U and V; the second pass extracts Y only.
 */
736 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
737 unsigned int width, unsigned int height,
738 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
741 const int chromWidth= width>>1;
742 for(y=0; y<height; y+=2)
// MMX pass 1 (Y, U and V). eax counts chroma samples; mm7 becomes the 0x00FF-per-word mask.
746 "xorl %%eax, %%eax \n\t"
747 "pcmpeqw %%mm7, %%mm7 \n\t"
748 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
751 PREFETCH" 64(%0, %%eax, 4) \n\t"
752 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
753 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
754 "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
755 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
756 "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
757 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
758 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
759 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
760 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
761 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
763 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
765 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(8)
766 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(12)
767 "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
768 "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
769 "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
770 "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
771 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
772 "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
773 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
774 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
776 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
778 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
779 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
780 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
781 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
782 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
783 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
784 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
785 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
787 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
788 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
790 "addl $8, %%eax \n\t"
791 "cmpl %4, %%eax \n\t"
793 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// MMX pass 2 (luma only): mm7 is reused as the low-byte mask; udst/vdst are passed but not referenced by the visible instructions.
801 "xorl %%eax, %%eax \n\t"
804 PREFETCH" 64(%0, %%eax, 4) \n\t"
805 "movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
806 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // YUYV YUYV(4)
807 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // YUYV YUYV(8)
808 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // YUYV YUYV(12)
809 "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
810 "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
811 "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
812 "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
813 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
814 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
816 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
817 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
819 "addl $8, %%eax \n\t"
820 "cmpl %4, %%eax \n\t"
823 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// Plain C pass 1: bytes 0 and 2 of each 4-byte group are luma, byte 1 is U, byte 3 is V.
828 for(i=0; i<chromWidth; i++)
830 ydst[2*i+0] = src[4*i+0];
831 udst[i] = src[4*i+1];
832 ydst[2*i+1] = src[4*i+2];
833 vdst[i] = src[4*i+3];
// Plain C pass 2: luma only.
838 for(i=0; i<chromWidth; i++)
840 ydst[2*i+0] = src[4*i+0];
841 ydst[2*i+1] = src[4*i+2];
850 asm volatile( EMMS" \n\t"
858 * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
859 * problem for anyone then tell me, and I'll fix it)
860 * chrominance data is only taken from every second line; the others are ignored. FIXME: write HQ version
/*
 * uyvytoyv12: split packed U Y V Y input into separate Y, U and V planes.
 *
 * src              - packed source (read only)
 * ydst, udst, vdst - destination planes
 * width, height    - picture size; chromWidth is width>>1
 * lumStride, chromStride, srcStride - NOTE(review): presumably the per-line
 *                    pointer increments; their use is not shown here
 *
 * Lines are handled in pairs (y advances by 2). The first pass of each pair
 * extracts Y, U and V; the second pass extracts Y only.
 */
862 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
863 unsigned int width, unsigned int height,
864 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
867 const int chromWidth= width>>1;
868 for(y=0; y<height; y+=2)
// MMX pass 1 (Y, U and V). eax counts chroma samples; mm7 becomes the 0x00FF-per-word mask.
// Here the low byte of each word is chroma and the high byte is luma.
872 "xorl %%eax, %%eax \n\t"
873 "pcmpeqw %%mm7, %%mm7 \n\t"
874 "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
877 PREFETCH" 64(%0, %%eax, 4) \n\t"
878 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
879 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
880 "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
881 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
882 "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
883 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
884 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
885 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
886 "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
887 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
889 MOVNTQ" %%mm2, (%1, %%eax, 2) \n\t"
891 "movq 16(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(8)
892 "movq 24(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(12)
893 "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
894 "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
895 "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
896 "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
897 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
898 "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
899 "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
900 "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
902 MOVNTQ" %%mm3, 8(%1, %%eax, 2) \n\t"
904 "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
905 "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
906 "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
907 "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
908 "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
909 "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
910 "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
911 "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
913 MOVNTQ" %%mm0, (%3, %%eax) \n\t"
914 MOVNTQ" %%mm2, (%2, %%eax) \n\t"
916 "addl $8, %%eax \n\t"
917 "cmpl %4, %%eax \n\t"
919 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// MMX pass 2 (luma only): the high byte of every word is kept; udst/vdst are passed but not referenced by the visible instructions.
927 "xorl %%eax, %%eax \n\t"
930 PREFETCH" 64(%0, %%eax, 4) \n\t"
931 "movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
932 "movq 8(%0, %%eax, 4), %%mm1 \n\t" // UYVY UYVY(4)
933 "movq 16(%0, %%eax, 4), %%mm2 \n\t" // UYVY UYVY(8)
934 "movq 24(%0, %%eax, 4), %%mm3 \n\t" // UYVY UYVY(12)
935 "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
936 "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
937 "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
938 "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
939 "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
940 "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
942 MOVNTQ" %%mm0, (%1, %%eax, 2) \n\t"
943 MOVNTQ" %%mm2, 8(%1, %%eax, 2) \n\t"
945 "addl $8, %%eax \n\t"
946 "cmpl %4, %%eax \n\t"
949 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
// Plain C pass 1: byte 0 is U, byte 2 is V, bytes 1 and 3 are luma.
954 for(i=0; i<chromWidth; i++)
956 udst[i] = src[4*i+0];
957 ydst[2*i+0] = src[4*i+1];
958 vdst[i] = src[4*i+2];
959 ydst[2*i+1] = src[4*i+3];
// Plain C pass 2: luma only.
964 for(i=0; i<chromWidth; i++)
966 ydst[2*i+0] = src[4*i+1];
967 ydst[2*i+1] = src[4*i+3];
976 asm volatile( EMMS" \n\t"
984 * height should be a multiple of 2 and width should be a multiple of 2 (if this is a
985 * problem for anyone then tell me, and I'll fix it)
986 * chrominance data is only taken from every second line; the others are ignored in the C version. FIXME: write HQ version
/*
 * rgb24toyv12: convert packed 3-byte pixels (byte 0 = b, byte 1 = g,
 * byte 2 = r in the C code) into separate Y, U and V planes.
 *
 * src              - packed source (read only)
 * ydst, udst, vdst - destination planes
 * width, height    - picture size; chromWidth is width>>1
 * lumStride, chromStride, srcStride - line strides; udst and vdst are
 *                    advanced by chromStride after each pair of lines
 *
 * An MMX loop handles line pairs while y < height-2; a C loop starting at
 * the current y handles the remaining lines. The C formulas are
 *   Y = ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16
 *   U = ((RU*r + GU*g + BU*b) >> RGB2YUV_SHIFT) + 128
 *   V = ((RV*r + GV*g + BV*b) >> RGB2YUV_SHIFT) + 128
 */
988 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
989 unsigned int width, unsigned int height,
990 unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
993 const int chromWidth= width>>1;
// NOTE(review): height is unsigned, so height-2 wraps to a huge value when height < 2 -- confirm callers never pass that.
995 for(y=0; y<height-2; y+=2)
// Luma asm: eax starts at operand %2 (-width) and indexes ydst+width; ebx = 3*eax indexes src+width*3.
// mm6 = bgr2YCoeff, mm5 = w1111, mm7 = 0 (used to zero-extend bytes to words).
1001 "movl %2, %%eax \n\t"
1002 "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
1003 "movq "MANGLE(w1111)", %%mm5 \n\t"
1004 "pxor %%mm7, %%mm7 \n\t"
1005 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1008 PREFETCH" 64(%0, %%ebx) \n\t"
1009 "movd (%0, %%ebx), %%mm0 \n\t"
1010 "movd 3(%0, %%ebx), %%mm1 \n\t"
1011 "punpcklbw %%mm7, %%mm0 \n\t"
1012 "punpcklbw %%mm7, %%mm1 \n\t"
1013 "movd 6(%0, %%ebx), %%mm2 \n\t"
1014 "movd 9(%0, %%ebx), %%mm3 \n\t"
1015 "punpcklbw %%mm7, %%mm2 \n\t"
1016 "punpcklbw %%mm7, %%mm3 \n\t"
1017 "pmaddwd %%mm6, %%mm0 \n\t"
1018 "pmaddwd %%mm6, %%mm1 \n\t"
1019 "pmaddwd %%mm6, %%mm2 \n\t"
1020 "pmaddwd %%mm6, %%mm3 \n\t"
1021 #ifndef FAST_BGR2YV12
1022 "psrad $8, %%mm0 \n\t"
1023 "psrad $8, %%mm1 \n\t"
1024 "psrad $8, %%mm2 \n\t"
1025 "psrad $8, %%mm3 \n\t"
1027 "packssdw %%mm1, %%mm0 \n\t"
1028 "packssdw %%mm3, %%mm2 \n\t"
1029 "pmaddwd %%mm5, %%mm0 \n\t"
1030 "pmaddwd %%mm5, %%mm2 \n\t"
1031 "packssdw %%mm2, %%mm0 \n\t"
1032 "psraw $7, %%mm0 \n\t"
1034 "movd 12(%0, %%ebx), %%mm4 \n\t"
1035 "movd 15(%0, %%ebx), %%mm1 \n\t"
1036 "punpcklbw %%mm7, %%mm4 \n\t"
1037 "punpcklbw %%mm7, %%mm1 \n\t"
1038 "movd 18(%0, %%ebx), %%mm2 \n\t"
1039 "movd 21(%0, %%ebx), %%mm3 \n\t"
1040 "punpcklbw %%mm7, %%mm2 \n\t"
1041 "punpcklbw %%mm7, %%mm3 \n\t"
1042 "pmaddwd %%mm6, %%mm4 \n\t"
1043 "pmaddwd %%mm6, %%mm1 \n\t"
1044 "pmaddwd %%mm6, %%mm2 \n\t"
1045 "pmaddwd %%mm6, %%mm3 \n\t"
1046 #ifndef FAST_BGR2YV12
1047 "psrad $8, %%mm4 \n\t"
1048 "psrad $8, %%mm1 \n\t"
1049 "psrad $8, %%mm2 \n\t"
1050 "psrad $8, %%mm3 \n\t"
1052 "packssdw %%mm1, %%mm4 \n\t"
1053 "packssdw %%mm3, %%mm2 \n\t"
1054 "pmaddwd %%mm5, %%mm4 \n\t"
1055 "pmaddwd %%mm5, %%mm2 \n\t"
1056 "addl $24, %%ebx \n\t"
1057 "packssdw %%mm2, %%mm4 \n\t"
1058 "psraw $7, %%mm4 \n\t"
1060 "packuswb %%mm4, %%mm0 \n\t"
1061 "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
1063 MOVNTQ" %%mm0, (%1, %%eax) \n\t"
1064 "addl $8, %%eax \n\t"
1066 : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
// Chroma asm: operands %0 and %1 are two source lines srcStride apart; eax starts at operand %4 and
// ebx = 6*eax is the source byte offset. mm6 = bgr2UCoeff; bgr2VCoeff is reloaded into mm1/mm3 per group.
// NOTE(review): the counter and plane bases here use width (udst+width, -width), whereas the C loop
// below iterates chromWidth = width>>1 chroma samples per line -- verify this is intended.
1074 "movl %4, %%eax \n\t"
1075 "movq "MANGLE(w1111)", %%mm5 \n\t"
1076 "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
1077 "pxor %%mm7, %%mm7 \n\t"
1078 "leal (%%eax, %%eax, 2), %%ebx \n\t"
1079 "addl %%ebx, %%ebx \n\t"
1082 PREFETCH" 64(%0, %%ebx) \n\t"
1083 PREFETCH" 64(%1, %%ebx) \n\t"
1084 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1085 "movq (%0, %%ebx), %%mm0 \n\t"
1086 "movq (%1, %%ebx), %%mm1 \n\t"
1087 "movq 6(%0, %%ebx), %%mm2 \n\t"
1088 "movq 6(%1, %%ebx), %%mm3 \n\t"
1089 PAVGB" %%mm1, %%mm0 \n\t"
1090 PAVGB" %%mm3, %%mm2 \n\t"
1091 "movq %%mm0, %%mm1 \n\t"
1092 "movq %%mm2, %%mm3 \n\t"
1093 "psrlq $24, %%mm0 \n\t"
1094 "psrlq $24, %%mm2 \n\t"
1095 PAVGB" %%mm1, %%mm0 \n\t"
1096 PAVGB" %%mm3, %%mm2 \n\t"
1097 "punpcklbw %%mm7, %%mm0 \n\t"
1098 "punpcklbw %%mm7, %%mm2 \n\t"
1100 "movd (%0, %%ebx), %%mm0 \n\t"
1101 "movd (%1, %%ebx), %%mm1 \n\t"
1102 "movd 3(%0, %%ebx), %%mm2 \n\t"
1103 "movd 3(%1, %%ebx), %%mm3 \n\t"
1104 "punpcklbw %%mm7, %%mm0 \n\t"
1105 "punpcklbw %%mm7, %%mm1 \n\t"
1106 "punpcklbw %%mm7, %%mm2 \n\t"
1107 "punpcklbw %%mm7, %%mm3 \n\t"
1108 "paddw %%mm1, %%mm0 \n\t"
1109 "paddw %%mm3, %%mm2 \n\t"
1110 "paddw %%mm2, %%mm0 \n\t"
1111 "movd 6(%0, %%ebx), %%mm4 \n\t"
1112 "movd 6(%1, %%ebx), %%mm1 \n\t"
1113 "movd 9(%0, %%ebx), %%mm2 \n\t"
1114 "movd 9(%1, %%ebx), %%mm3 \n\t"
1115 "punpcklbw %%mm7, %%mm4 \n\t"
1116 "punpcklbw %%mm7, %%mm1 \n\t"
1117 "punpcklbw %%mm7, %%mm2 \n\t"
1118 "punpcklbw %%mm7, %%mm3 \n\t"
1119 "paddw %%mm1, %%mm4 \n\t"
1120 "paddw %%mm3, %%mm2 \n\t"
1121 "paddw %%mm4, %%mm2 \n\t"
1122 "psrlw $2, %%mm0 \n\t"
1123 "psrlw $2, %%mm2 \n\t"
1125 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1126 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1128 "pmaddwd %%mm0, %%mm1 \n\t"
1129 "pmaddwd %%mm2, %%mm3 \n\t"
1130 "pmaddwd %%mm6, %%mm0 \n\t"
1131 "pmaddwd %%mm6, %%mm2 \n\t"
1132 #ifndef FAST_BGR2YV12
1133 "psrad $8, %%mm0 \n\t"
1134 "psrad $8, %%mm1 \n\t"
1135 "psrad $8, %%mm2 \n\t"
1136 "psrad $8, %%mm3 \n\t"
1138 "packssdw %%mm2, %%mm0 \n\t"
1139 "packssdw %%mm3, %%mm1 \n\t"
1140 "pmaddwd %%mm5, %%mm0 \n\t"
1141 "pmaddwd %%mm5, %%mm1 \n\t"
1142 "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
1143 "psraw $7, %%mm0 \n\t"
1145 #if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
1146 "movq 12(%0, %%ebx), %%mm4 \n\t"
1147 "movq 12(%1, %%ebx), %%mm1 \n\t"
1148 "movq 18(%0, %%ebx), %%mm2 \n\t"
1149 "movq 18(%1, %%ebx), %%mm3 \n\t"
1150 PAVGB" %%mm1, %%mm4 \n\t"
1151 PAVGB" %%mm3, %%mm2 \n\t"
1152 "movq %%mm4, %%mm1 \n\t"
1153 "movq %%mm2, %%mm3 \n\t"
1154 "psrlq $24, %%mm4 \n\t"
1155 "psrlq $24, %%mm2 \n\t"
1156 PAVGB" %%mm1, %%mm4 \n\t"
1157 PAVGB" %%mm3, %%mm2 \n\t"
1158 "punpcklbw %%mm7, %%mm4 \n\t"
1159 "punpcklbw %%mm7, %%mm2 \n\t"
1161 "movd 12(%0, %%ebx), %%mm4 \n\t"
1162 "movd 12(%1, %%ebx), %%mm1 \n\t"
1163 "movd 15(%0, %%ebx), %%mm2 \n\t"
1164 "movd 15(%1, %%ebx), %%mm3 \n\t"
1165 "punpcklbw %%mm7, %%mm4 \n\t"
1166 "punpcklbw %%mm7, %%mm1 \n\t"
1167 "punpcklbw %%mm7, %%mm2 \n\t"
1168 "punpcklbw %%mm7, %%mm3 \n\t"
1169 "paddw %%mm1, %%mm4 \n\t"
1170 "paddw %%mm3, %%mm2 \n\t"
1171 "paddw %%mm2, %%mm4 \n\t"
1172 "movd 18(%0, %%ebx), %%mm5 \n\t"
1173 "movd 18(%1, %%ebx), %%mm1 \n\t"
1174 "movd 21(%0, %%ebx), %%mm2 \n\t"
1175 "movd 21(%1, %%ebx), %%mm3 \n\t"
1176 "punpcklbw %%mm7, %%mm5 \n\t"
1177 "punpcklbw %%mm7, %%mm1 \n\t"
1178 "punpcklbw %%mm7, %%mm2 \n\t"
1179 "punpcklbw %%mm7, %%mm3 \n\t"
1180 "paddw %%mm1, %%mm5 \n\t"
1181 "paddw %%mm3, %%mm2 \n\t"
1182 "paddw %%mm5, %%mm2 \n\t"
1183 "movq "MANGLE(w1111)", %%mm5 \n\t"
1184 "psrlw $2, %%mm4 \n\t"
1185 "psrlw $2, %%mm2 \n\t"
1187 "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
1188 "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
1190 "pmaddwd %%mm4, %%mm1 \n\t"
1191 "pmaddwd %%mm2, %%mm3 \n\t"
1192 "pmaddwd %%mm6, %%mm4 \n\t"
1193 "pmaddwd %%mm6, %%mm2 \n\t"
1194 #ifndef FAST_BGR2YV12
1195 "psrad $8, %%mm4 \n\t"
1196 "psrad $8, %%mm1 \n\t"
1197 "psrad $8, %%mm2 \n\t"
1198 "psrad $8, %%mm3 \n\t"
1200 "packssdw %%mm2, %%mm4 \n\t"
1201 "packssdw %%mm3, %%mm1 \n\t"
1202 "pmaddwd %%mm5, %%mm4 \n\t"
1203 "pmaddwd %%mm5, %%mm1 \n\t"
1204 "addl $24, %%ebx \n\t"
1205 "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
1206 "psraw $7, %%mm4 \n\t"
1208 "movq %%mm0, %%mm1 \n\t"
1209 "punpckldq %%mm4, %%mm0 \n\t"
1210 "punpckhdq %%mm4, %%mm1 \n\t"
1211 "packsswb %%mm1, %%mm0 \n\t"
1212 "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
1214 "movd %%mm0, (%2, %%eax) \n\t"
1215 "punpckhdq %%mm0, %%mm0 \n\t"
1216 "movd %%mm0, (%3, %%eax) \n\t"
1217 "addl $4, %%eax \n\t"
1219 : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
1223 udst += chromStride;
1224 vdst += chromStride;
1228 asm volatile( EMMS" \n\t"
// C loop: continues from the current y without re-initialising it.
1234 for(; y<height; y+=2)
// First line of the pair: Y, U and V are computed from the first pixel of each 6-byte pair, then Y again.
1237 for(i=0; i<chromWidth; i++)
1239 unsigned int b= src[6*i+0];
1240 unsigned int g= src[6*i+1];
1241 unsigned int r= src[6*i+2];
1243 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1244 unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
1245 unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
1255 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
// Second line of the pair: luma only.
1261 for(i=0; i<chromWidth; i++)
1263 unsigned int b= src[6*i+0];
1264 unsigned int g= src[6*i+1];
1265 unsigned int r= src[6*i+2];
1267 unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1275 Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
1278 udst += chromStride;
1279 vdst += chromStride;
/*
 * interleaveBytes: for every line, write dest[2*w] = src1[w] and
 * dest[2*w+1] = src2[w].
 *
 * src1, src2 - the two source planes
 * dest       - interleaved output; 2*width bytes are written per line
 * width      - bytes per source line
 * height     - number of lines
 * src1Stride, src2Stride, dstStride - NOTE(review): presumably the per-line
 *              pointer increments; their use is not shown here
 *
 * The SIMD variants process 16 source bytes per iteration and a scalar loop
 * finishes the columns from (width & ~15) upward; the last loop below is the
 * plain C version covering the full width.
 */
1285 void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
1286 int width, int height, int src1Stride, int src2Stride, int dstStride){
1289 for(h=0; h < height; h++)
// 128-bit variant. movdqa and movntdq are the aligned forms, so this path
// relies on the line pointers being 16-byte aligned.
1296 "xorl %%eax, %%eax \n\t"
1298 PREFETCH" 64(%1, %%eax) \n\t"
1299 PREFETCH" 64(%2, %%eax) \n\t"
1300 "movdqa (%1, %%eax), %%xmm0 \n\t"
1301 "movdqa (%1, %%eax), %%xmm1 \n\t"
1302 "movdqa (%2, %%eax), %%xmm2 \n\t"
1303 "punpcklbw %%xmm2, %%xmm0 \n\t"
1304 "punpckhbw %%xmm2, %%xmm1 \n\t"
1305 "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
1306 "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
1307 "addl $16, %%eax \n\t"
1308 "cmpl %3, %%eax \n\t"
1310 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
// 64-bit MMX variant: two 8-byte loads from each source, four 8-byte stores.
1315 "xorl %%eax, %%eax \n\t"
1317 PREFETCH" 64(%1, %%eax) \n\t"
1318 PREFETCH" 64(%2, %%eax) \n\t"
1319 "movq (%1, %%eax), %%mm0 \n\t"
1320 "movq 8(%1, %%eax), %%mm2 \n\t"
1321 "movq %%mm0, %%mm1 \n\t"
1322 "movq %%mm2, %%mm3 \n\t"
1323 "movq (%2, %%eax), %%mm4 \n\t"
1324 "movq 8(%2, %%eax), %%mm5 \n\t"
1325 "punpcklbw %%mm4, %%mm0 \n\t"
1326 "punpckhbw %%mm4, %%mm1 \n\t"
1327 "punpcklbw %%mm5, %%mm2 \n\t"
1328 "punpckhbw %%mm5, %%mm3 \n\t"
1329 MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
1330 MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
1331 MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
1332 MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
1333 "addl $16, %%eax \n\t"
1334 "cmpl %3, %%eax \n\t"
1336 ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
// Scalar tail for the columns the 16-byte SIMD loops did not cover.
1340 for(w= (width&(~15)); w < width; w++)
1342 dest[2*w+0] = src1[w];
1343 dest[2*w+1] = src2[w];
// Plain C version: whole line.
1346 for(w=0; w < width; w++)
1348 dest[2*w+0] = src1[w];
1349 dest[2*w+1] = src2[w];