+ unsigned y;
+ const unsigned chromWidth= width>>1;
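+ /* MMX path: convert two luma rows per pass, then one chroma row
+  * computed from the average of those two source rows (assumes height >= 2). */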
+#ifdef HAVE_MMX
+ for(y=0; y<height-2; y+=2)
+ {
+ unsigned i;
+ for(i=0; i<2; i++)
+ {
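+ /* luma pass: mm6 holds the packed BGR->Y coefficients and mm5 = w1111 lets
+  * pmaddwd sum adjacent words; the counter in %%eax runs from -width up to 0,
+  * 8 pixels (24 bytes via %%ebx) per iteration. */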
+ asm volatile(
+ "movl %2, %%eax \n\t"
+ "movq "MANGLE(bgr2YCoeff)", %%mm6 \n\t"
+ "movq "MANGLE(w1111)", %%mm5 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ "leal (%%eax, %%eax, 2), %%ebx \n\t"
+ ".balign 16 \n\t"
+ "1: \n\t"
+ PREFETCH" 64(%0, %%ebx) \n\t"
+ "movd (%0, %%ebx), %%mm0 \n\t"
+ "movd 3(%0, %%ebx), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd 6(%0, %%ebx), %%mm2 \n\t"
+ "movd 9(%0, %%ebx), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "pmaddwd %%mm6, %%mm0 \n\t"
+ "pmaddwd %%mm6, %%mm1 \n\t"
+ "pmaddwd %%mm6, %%mm2 \n\t"
+ "pmaddwd %%mm6, %%mm3 \n\t"
+#ifndef FAST_BGR2YV12
+ "psrad $8, %%mm0 \n\t"
+ "psrad $8, %%mm1 \n\t"
+ "psrad $8, %%mm2 \n\t"
+ "psrad $8, %%mm3 \n\t"
+#endif
+ "packssdw %%mm1, %%mm0 \n\t"
+ "packssdw %%mm3, %%mm2 \n\t"
+ "pmaddwd %%mm5, %%mm0 \n\t"
+ "pmaddwd %%mm5, %%mm2 \n\t"
+ "packssdw %%mm2, %%mm0 \n\t"
+ "psraw $7, %%mm0 \n\t"
+
+ "movd 12(%0, %%ebx), %%mm4 \n\t"
+ "movd 15(%0, %%ebx), %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "movd 18(%0, %%ebx), %%mm2 \n\t"
+ "movd 21(%0, %%ebx), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "pmaddwd %%mm6, %%mm4 \n\t"
+ "pmaddwd %%mm6, %%mm1 \n\t"
+ "pmaddwd %%mm6, %%mm2 \n\t"
+ "pmaddwd %%mm6, %%mm3 \n\t"
+#ifndef FAST_BGR2YV12
+ "psrad $8, %%mm4 \n\t"
+ "psrad $8, %%mm1 \n\t"
+ "psrad $8, %%mm2 \n\t"
+ "psrad $8, %%mm3 \n\t"
+#endif
+ "packssdw %%mm1, %%mm4 \n\t"
+ "packssdw %%mm3, %%mm2 \n\t"
+ "pmaddwd %%mm5, %%mm4 \n\t"
+ "pmaddwd %%mm5, %%mm2 \n\t"
+ "addl $24, %%ebx \n\t"
+ "packssdw %%mm2, %%mm4 \n\t"
+ "psraw $7, %%mm4 \n\t"
+
+ "packuswb %%mm4, %%mm0 \n\t"
+ "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"
+
+ MOVNTQ" %%mm0, (%1, %%eax) \n\t"
+ "addl $8, %%eax \n\t"
+ " js 1b \n\t"
+ : : "r" (src+width*3), "r" (ydst+width), "g" (-width)
+ : "%eax", "%ebx"
+ );
+ ydst += lumStride;
+ src += srcStride;
+ }
+ src -= srcStride*2;
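+ /* chroma pass over the same two rows: each 2x2 pixel block is averaged
+  * (PAVGB when MMX2/3DNow! is available, plain adds plus psrlw $2 otherwise),
+  * then turned into U/V via bgr2UCoeff/bgr2VCoeff; 4 chroma samples per loop. */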
+ asm volatile(
+ "movl %4, %%eax \n\t"
+ "movq "MANGLE(w1111)", %%mm5 \n\t"
+ "movq "MANGLE(bgr2UCoeff)", %%mm6 \n\t"
+ "pxor %%mm7, %%mm7 \n\t"
+ "leal (%%eax, %%eax, 2), %%ebx \n\t"
+ "addl %%ebx, %%ebx \n\t"
+ ".balign 16 \n\t"
+ "1: \n\t"
+ PREFETCH" 64(%0, %%ebx) \n\t"
+ PREFETCH" 64(%1, %%ebx) \n\t"
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+ "movq (%0, %%ebx), %%mm0 \n\t"
+ "movq (%1, %%ebx), %%mm1 \n\t"
+ "movq 6(%0, %%ebx), %%mm2 \n\t"
+ "movq 6(%1, %%ebx), %%mm3 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "psrlq $24, %%mm0 \n\t"
+ "psrlq $24, %%mm2 \n\t"
+ PAVGB" %%mm1, %%mm0 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+#else
+ "movd (%0, %%ebx), %%mm0 \n\t"
+ "movd (%1, %%ebx), %%mm1 \n\t"
+ "movd 3(%0, %%ebx), %%mm2 \n\t"
+ "movd 3(%1, %%ebx), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm2, %%mm0 \n\t"
+ "movd 6(%0, %%ebx), %%mm4 \n\t"
+ "movd 6(%1, %%ebx), %%mm1 \n\t"
+ "movd 9(%0, %%ebx), %%mm2 \n\t"
+ "movd 9(%1, %%ebx), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm1, %%mm4 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm4, %%mm2 \n\t"
+ "psrlw $2, %%mm0 \n\t"
+ "psrlw $2, %%mm2 \n\t"
+#endif
+ "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
+ "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
+
+ "pmaddwd %%mm0, %%mm1 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "pmaddwd %%mm6, %%mm0 \n\t"
+ "pmaddwd %%mm6, %%mm2 \n\t"
+#ifndef FAST_BGR2YV12
+ "psrad $8, %%mm0 \n\t"
+ "psrad $8, %%mm1 \n\t"
+ "psrad $8, %%mm2 \n\t"
+ "psrad $8, %%mm3 \n\t"
+#endif
+ "packssdw %%mm2, %%mm0 \n\t"
+ "packssdw %%mm3, %%mm1 \n\t"
+ "pmaddwd %%mm5, %%mm0 \n\t"
+ "pmaddwd %%mm5, %%mm1 \n\t"
+ "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
+ "psraw $7, %%mm0 \n\t"
+
+#if defined (HAVE_MMX2) || defined (HAVE_3DNOW)
+ "movq 12(%0, %%ebx), %%mm4 \n\t"
+ "movq 12(%1, %%ebx), %%mm1 \n\t"
+ "movq 18(%0, %%ebx), %%mm2 \n\t"
+ "movq 18(%1, %%ebx), %%mm3 \n\t"
+ PAVGB" %%mm1, %%mm4 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ "movq %%mm4, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "psrlq $24, %%mm4 \n\t"
+ "psrlq $24, %%mm2 \n\t"
+ PAVGB" %%mm1, %%mm4 \n\t"
+ PAVGB" %%mm3, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+#else
+ "movd 12(%0, %%ebx), %%mm4 \n\t"
+ "movd 12(%1, %%ebx), %%mm1 \n\t"
+ "movd 15(%0, %%ebx), %%mm2 \n\t"
+ "movd 15(%1, %%ebx), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm4 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm1, %%mm4 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm2, %%mm4 \n\t"
+ "movd 18(%0, %%ebx), %%mm5 \n\t"
+ "movd 18(%1, %%ebx), %%mm1 \n\t"
+ "movd 21(%0, %%ebx), %%mm2 \n\t"
+ "movd 21(%1, %%ebx), %%mm3 \n\t"
+ "punpcklbw %%mm7, %%mm5 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm2 \n\t"
+ "punpcklbw %%mm7, %%mm3 \n\t"
+ "paddw %%mm1, %%mm5 \n\t"
+ "paddw %%mm3, %%mm2 \n\t"
+ "paddw %%mm5, %%mm2 \n\t"
+ "movq "MANGLE(w1111)", %%mm5 \n\t"
+ "psrlw $2, %%mm4 \n\t"
+ "psrlw $2, %%mm2 \n\t"
+#endif
+ "movq "MANGLE(bgr2VCoeff)", %%mm1 \n\t"
+ "movq "MANGLE(bgr2VCoeff)", %%mm3 \n\t"
+
+ "pmaddwd %%mm4, %%mm1 \n\t"
+ "pmaddwd %%mm2, %%mm3 \n\t"
+ "pmaddwd %%mm6, %%mm4 \n\t"
+ "pmaddwd %%mm6, %%mm2 \n\t"
+#ifndef FAST_BGR2YV12
+ "psrad $8, %%mm4 \n\t"
+ "psrad $8, %%mm1 \n\t"
+ "psrad $8, %%mm2 \n\t"
+ "psrad $8, %%mm3 \n\t"
+#endif
+ "packssdw %%mm2, %%mm4 \n\t"
+ "packssdw %%mm3, %%mm1 \n\t"
+ "pmaddwd %%mm5, %%mm4 \n\t"
+ "pmaddwd %%mm5, %%mm1 \n\t"
+ "addl $24, %%ebx \n\t"
+ "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
+ "psraw $7, %%mm4 \n\t"
+
+ "movq %%mm0, %%mm1 \n\t"
+ "punpckldq %%mm4, %%mm0 \n\t"
+ "punpckhdq %%mm4, %%mm1 \n\t"
+ "packsswb %%mm1, %%mm0 \n\t"
+ "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
+
+ "movd %%mm0, (%2, %%eax) \n\t"
+ "punpckhdq %%mm0, %%mm0 \n\t"
+ "movd %%mm0, (%3, %%eax) \n\t"
+ "addl $4, %%eax \n\t"
+ " js 1b \n\t"
+ : : "r" (src+width*6), "r" (src+srcStride+width*6), "r" (udst+width), "r" (vdst+width), "g" (-width)
+ : "%eax", "%ebx"
+ );
+
+ udst += chromStride;
+ vdst += chromStride;
+ src += srcStride*2;
+ }
+
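+ /* leave MMX state and flush the non-temporal stores */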
+ asm volatile( EMMS" \n\t"
+ SFENCE" \n\t"
+ :::"memory");
+#else
+ y=0;
+#endif
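+ /* plain C version; when MMX is enabled it only has to finish the rows
+  * the loop above left over */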
+ for(; y<height; y+=2)
+ {
+ unsigned i;
+ for(i=0; i<chromWidth; i++)
+ {
+ unsigned int b= src[6*i+0];
+ unsigned int g= src[6*i+1];
+ unsigned int r= src[6*i+2];
+
+ unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
+ unsigned int V = ((RV*r + GV*g + BV*b)>>RGB2YUV_SHIFT) + 128;
+ unsigned int U = ((RU*r + GU*g + BU*b)>>RGB2YUV_SHIFT) + 128;
+
+ udst[i] = U;
+ vdst[i] = V;
+ ydst[2*i] = Y;
+
+ b= src[6*i+3];
+ g= src[6*i+4];
+ r= src[6*i+5];
+
+ Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
+ ydst[2*i+1] = Y;
+ }
+ ydst += lumStride;
+ src += srcStride;
+
+ for(i=0; i<chromWidth; i++)
+ {
+ unsigned int b= src[6*i+0];
+ unsigned int g= src[6*i+1];
+ unsigned int r= src[6*i+2];
+
+ unsigned int Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
+
+ ydst[2*i] = Y;
+
+ b= src[6*i+3];
+ g= src[6*i+4];
+ r= src[6*i+5];
+
+ Y = ((RY*r + GY*g + BY*b)>>RGB2YUV_SHIFT) + 16;
+ ydst[2*i+1] = Y;
+ }
+ udst += chromStride;
+ vdst += chromStride;
+ ydst += lumStride;
+ src += srcStride;
+ }
+}
+
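+/* interleave the bytes of src1 and src2 into dest:
+   dest[2*i] = src1[i], dest[2*i+1] = src2[i] for every row */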
+static inline void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
+ unsigned width, unsigned height, int src1Stride,
+ int src2Stride, int dstStride)
+{
+ unsigned h;
+
+ for(h=0; h < height; h++)
+ {
+ unsigned w;
+
+#ifdef HAVE_MMX
+#ifdef HAVE_SSE2
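+ /* SSE2: 16 bytes from each source per iteration; src1 is loaded twice so
+  * one copy can be unpacked low and one high against src2. movdqa assumes
+  * the rows are 16-byte aligned, and the loop itself assumes width >= 16. */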
+ asm(
+ "xorl %%eax, %%eax \n\t"
+ "1: \n\t"
+ PREFETCH" 64(%1, %%eax) \n\t"
+ PREFETCH" 64(%2, %%eax) \n\t"
+ "movdqa (%1, %%eax), %%xmm0 \n\t"
+ "movdqa (%1, %%eax), %%xmm1 \n\t"
+ "movdqa (%2, %%eax), %%xmm2 \n\t"
+ "punpcklbw %%xmm2, %%xmm0 \n\t"
+ "punpckhbw %%xmm2, %%xmm1 \n\t"
+ "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
+ "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
+ "addl $16, %%eax \n\t"
+ "cmpl %3, %%eax \n\t"
+ " jb 1b \n\t"
+ ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
+ : "memory", "%eax"
+ );
+#else
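+ /* MMX: same interleave, 16 bytes from each source per iteration
+  * (also assumes width >= 16), written as four MOVNTQ non-temporal stores */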
+ asm(
+ "xorl %%eax, %%eax \n\t"
+ "1: \n\t"
+ PREFETCH" 64(%1, %%eax) \n\t"
+ PREFETCH" 64(%2, %%eax) \n\t"
+ "movq (%1, %%eax), %%mm0 \n\t"
+ "movq 8(%1, %%eax), %%mm2 \n\t"
+ "movq %%mm0, %%mm1 \n\t"
+ "movq %%mm2, %%mm3 \n\t"
+ "movq (%2, %%eax), %%mm4 \n\t"
+ "movq 8(%2, %%eax), %%mm5 \n\t"
+ "punpcklbw %%mm4, %%mm0 \n\t"
+ "punpckhbw %%mm4, %%mm1 \n\t"
+ "punpcklbw %%mm5, %%mm2 \n\t"
+ "punpckhbw %%mm5, %%mm3 \n\t"
+ MOVNTQ" %%mm0, (%0, %%eax, 2) \n\t"
+ MOVNTQ" %%mm1, 8(%0, %%eax, 2) \n\t"
+ MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
+ MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
+ "addl $16, %%eax \n\t"
+ "cmpl %3, %%eax \n\t"
+ " jb 1b \n\t"
+ ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
+ : "memory", "%eax"
+ );
+#endif
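+ /* scalar tail for the last width%16 bytes */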
+ for(w= (width&(~15)); w < width; w++)
+ {
+ dest[2*w+0] = src1[w];
+ dest[2*w+1] = src2[w];
+ }
+#else
+ for(w=0; w < width; w++)
+ {
+ dest[2*w+0] = src1[w];
+ dest[2*w+1] = src2[w];
+ }
+#endif
+ dest += dstStride;
+ src1 += src1Stride;
+ src2 += src2Stride;
+ }
+#ifdef HAVE_MMX
+ asm(
+ EMMS" \n\t"
+ SFENCE" \n\t"
+ ::: "memory"
+ );
+#endif
+}
+
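+/* upscale two chroma planes by a factor of 2 in each direction
+   (e.g. the U/V planes of YVU9 to those of YV12): every source byte is
+   written twice per row and every source row feeds two destination rows */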
+static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
+ uint8_t *dst1, uint8_t *dst2,
+ unsigned width, unsigned height,
+ int srcStride1, int srcStride2,
+ int dstStride1, int dstStride2)
+{
+ unsigned int y,x,w,h;
+ w=width/2; h=height/2;
+#ifdef HAVE_MMX
+ asm volatile(
+ PREFETCH" %0\n\t"
+ PREFETCH" %1\n\t"
+ ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
+#endif
+ for(y=0;y<h;y++){
+ const uint8_t* s1=src1+srcStride1*(y>>1);
+ uint8_t* d=dst1+dstStride1*y;
+ x=0;
+#ifdef HAVE_MMX
+ for(;x<w-31;x+=32)
+ {
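+ /* 32 source bytes -> 64 destination bytes: punpck{l,h}bw of a register
+  * with a copy of itself doubles every byte */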
+ asm volatile(
+ PREFETCH" 32%1\n\t"
+ "movq %1, %%mm0\n\t"
+ "movq 8%1, %%mm2\n\t"
+ "movq 16%1, %%mm4\n\t"
+ "movq 24%1, %%mm6\n\t"
+ "movq %%mm0, %%mm1\n\t"
+ "movq %%mm2, %%mm3\n\t"
+ "movq %%mm4, %%mm5\n\t"
+ "movq %%mm6, %%mm7\n\t"
+ "punpcklbw %%mm0, %%mm0\n\t"
+ "punpckhbw %%mm1, %%mm1\n\t"
+ "punpcklbw %%mm2, %%mm2\n\t"
+ "punpckhbw %%mm3, %%mm3\n\t"
+ "punpcklbw %%mm4, %%mm4\n\t"
+ "punpckhbw %%mm5, %%mm5\n\t"
+ "punpcklbw %%mm6, %%mm6\n\t"
+ "punpckhbw %%mm7, %%mm7\n\t"
+ MOVNTQ" %%mm0, %0\n\t"
+ MOVNTQ" %%mm1, 8%0\n\t"
+ MOVNTQ" %%mm2, 16%0\n\t"
+ MOVNTQ" %%mm3, 24%0\n\t"
+ MOVNTQ" %%mm4, 32%0\n\t"
+ MOVNTQ" %%mm5, 40%0\n\t"
+ MOVNTQ" %%mm6, 48%0\n\t"
+ MOVNTQ" %%mm7, 56%0"
+ :"=m"(d[2*x])
+ :"m"(s1[x])
+ :"memory");
+ }
+#endif
+ for(;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
+ }
+ for(y=0;y<h;y++){
+ const uint8_t* s2=src2+srcStride2*(y>>1);
+ uint8_t* d=dst2+dstStride2*y;
+ x=0;
+#ifdef HAVE_MMX
+ for(;x<w-31;x+=32)
+ {
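+ /* same 2x byte doubling as above, for the second plane */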
+ asm volatile(
+ PREFETCH" 32%1\n\t"
+ "movq %1, %%mm0\n\t"
+ "movq 8%1, %%mm2\n\t"
+ "movq 16%1, %%mm4\n\t"
+ "movq 24%1, %%mm6\n\t"
+ "movq %%mm0, %%mm1\n\t"
+ "movq %%mm2, %%mm3\n\t"
+ "movq %%mm4, %%mm5\n\t"
+ "movq %%mm6, %%mm7\n\t"
+ "punpcklbw %%mm0, %%mm0\n\t"
+ "punpckhbw %%mm1, %%mm1\n\t"
+ "punpcklbw %%mm2, %%mm2\n\t"
+ "punpckhbw %%mm3, %%mm3\n\t"
+ "punpcklbw %%mm4, %%mm4\n\t"
+ "punpckhbw %%mm5, %%mm5\n\t"
+ "punpcklbw %%mm6, %%mm6\n\t"
+ "punpckhbw %%mm7, %%mm7\n\t"
+ MOVNTQ" %%mm0, %0\n\t"
+ MOVNTQ" %%mm1, 8%0\n\t"
+ MOVNTQ" %%mm2, 16%0\n\t"
+ MOVNTQ" %%mm3, 24%0\n\t"
+ MOVNTQ" %%mm4, 32%0\n\t"
+ MOVNTQ" %%mm5, 40%0\n\t"
+ MOVNTQ" %%mm6, 48%0\n\t"
+ MOVNTQ" %%mm7, 56%0"
+ :"=m"(d[2*x])
+ :"m"(s2[x])
+ :"memory");
+ }
+#endif
+ for(;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
+ }
+#ifdef HAVE_MMX
+ asm(
+ EMMS" \n\t"
+ SFENCE" \n\t"
+ ::: "memory"
+ );
+#endif
+}
+
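+/* pack planar YVU9 into YUY2 (YUYV): each chroma sample covers a 4x4 luma
+   block, so every chroma row is reused for four output rows and each sample
+   is written twice per row (once per YUYV pair) */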
+static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
+ uint8_t *dst,
+ unsigned width, unsigned height,
+ int srcStride1, int srcStride2,
+ int srcStride3, int dstStride)
+{
+ unsigned y,x,w,h;
+ w=width/2; h=height;
+ for(y=0;y<h;y++){
+ const uint8_t* yp=src1+srcStride1*y;
+ const uint8_t* up=src2+srcStride2*(y>>2);
+ const uint8_t* vp=src3+srcStride3*(y>>2);
+ uint8_t* d=dst+dstStride*y;
+ x=0;
+#ifdef HAVE_MMX
+ for(;x<w-7;x+=8)
+ {
+ asm volatile(
+ PREFETCH" 32(%1, %0)\n\t"
+ PREFETCH" 32(%2, %0)\n\t"
+ PREFETCH" 32(%3, %0)\n\t"
+ "movq (%1, %0, 4), %%mm0\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
+ "movq (%2, %0), %%mm1\n\t" /* U0U1U2U3U4U5U6U7 */
+ "movq (%3, %0), %%mm2\n\t" /* V0V1V2V3V4V5V6V7 */
+ "movq %%mm0, %%mm3\n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
+ "movq %%mm1, %%mm4\n\t" /* U0U1U2U3U4U5U6U7 */
+ "movq %%mm2, %%mm5\n\t" /* V0V1V2V3V4V5V6V7 */
+ "punpcklbw %%mm1, %%mm1\n\t" /* U0U0 U1U1 U2U2 U3U3 */
+ "punpcklbw %%mm2, %%mm2\n\t" /* V0V0 V1V1 V2V2 V3V3 */
+ "punpckhbw %%mm4, %%mm4\n\t" /* U4U4 U5U5 U6U6 U7U7 */
+ "punpckhbw %%mm5, %%mm5\n\t" /* V4V4 V5V5 V6V6 V7V7 */
+
+ "movq %%mm1, %%mm6\n\t"
+ "punpcklbw %%mm2, %%mm1\n\t" /* U0V0 U0V0 U1V1 U1V1*/
+ "punpcklbw %%mm1, %%mm0\n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
+ "punpckhbw %%mm1, %%mm3\n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
+ MOVNTQ" %%mm0, (%4, %0, 8)\n\t"
+ MOVNTQ" %%mm3, 8(%4, %0, 8)\n\t"
+
+ "punpckhbw %%mm2, %%mm6\n\t" /* U2V2 U2V2 U3V3 U3V3*/
+ "movq 8(%1, %0, 4), %%mm0\n\t"
+ "movq %%mm0, %%mm3\n\t"
+ "punpcklbw %%mm6, %%mm0\n\t" /* Y U2 Y V2 Y U2 Y V2*/
+ "punpckhbw %%mm6, %%mm3\n\t" /* Y U3 Y V3 Y U3 Y V3*/
+ MOVNTQ" %%mm0, 16(%4, %0, 8)\n\t"
+ MOVNTQ" %%mm3, 24(%4, %0, 8)\n\t"
+
+ "movq %%mm4, %%mm6\n\t"
+ "movq 16(%1, %0, 4), %%mm0\n\t"
+ "movq %%mm0, %%mm3\n\t"
+ "punpcklbw %%mm5, %%mm4\n\t"
+ "punpcklbw %%mm4, %%mm0\n\t" /* Y U4 Y V4 Y U4 Y V4*/
+ "punpckhbw %%mm4, %%mm3\n\t" /* Y U5 Y V5 Y U5 Y V5*/
+ MOVNTQ" %%mm0, 32(%4, %0, 8)\n\t"
+ MOVNTQ" %%mm3, 40(%4, %0, 8)\n\t"
+
+ "punpckhbw %%mm5, %%mm6\n\t"
+ "movq 24(%1, %0, 4), %%mm0\n\t"
+ "movq %%mm0, %%mm3\n\t"
+ "punpcklbw %%mm6, %%mm0\n\t" /* Y U6 Y V6 Y U6 Y V6*/
+ "punpckhbw %%mm6, %%mm3\n\t" /* Y U7 Y V7 Y U7 Y V7*/
+ MOVNTQ" %%mm0, 48(%4, %0, 8)\n\t"
+ MOVNTQ" %%mm3, 56(%4, %0, 8)\n\t"
+
+ : "+r" (x)
+ : "r"(yp), "r" (up), "r"(vp), "r"(d)
+ :"memory");
+ }
+#endif
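+ /* scalar version for the remaining pixels */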
+ for(; x<w; x++)
+ {
+ const int x2= x<<2;
+ d[8*x+0]=yp[x2];
+ d[8*x+1]=up[x];
+ d[8*x+2]=yp[x2+1];
+ d[8*x+3]=vp[x];
+ d[8*x+4]=yp[x2+2];
+ d[8*x+5]=up[x];
+ d[8*x+6]=yp[x2+3];
+ d[8*x+7]=vp[x];
+ }
+ }
+#ifdef HAVE_MMX
+ asm(
+ EMMS" \n\t"
+ SFENCE" \n\t"
+ ::: "memory"
+ );
+#endif