git.sesse.net Git - ffmpeg/blob - postproc/rgb2rgb.c

   1 /*
   2  *
   3  *  rgb2rgb.c, Software RGB to RGB converter
   4  *  extended with a Software PAL8 to RGB converter,
   5  *               Software YUV to YUV converter and
   6  *               Software YUV to RGB converter
   7  *  Written by Nick Kurshev.
   8  *  palette stuff & yuv stuff by Michael
   9  */
  10 #include <inttypes.h>
  11 #include "../config.h"
  12 #include "rgb2rgb.h"
  13 #include "../mmx_defs.h"
  14
  15 #ifdef HAVE_MMX
/*
 * 64-bit constants consumed by the inline-assembly (HAVE_MMX) code paths in
 * this file, either loaded into an MMX register with movq or used directly
 * as a memory operand. All of them are declared 8-byte aligned.
 */
  16 static const uint64_t mask32   __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL; /* low 3 bytes of each 32-bit half; rgb24to32 ANDs with it to zero every 4th byte */
  17 static const uint64_t mask24l  __attribute__((aligned(8))) = 0x0000000000FFFFFFULL; /* bits 0..23; used by rgb32to24 */
  18 static const uint64_t mask24h  __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL; /* bits 24..47; used by rgb32to24 */
  19 static const uint64_t mask15b  __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
  20 static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
  21 static const uint64_t mask15s  __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL; /* bits 5..15 of each 16-bit word; used by rgb15to16 */
  22 static const uint64_t red_mask  __attribute__((aligned(8))) = 0x0000f8000000f800ULL; /* 0xf800 in each 32-bit half; used by rgb24to16 */
  23 static const uint64_t green_mask __attribute__((aligned(8)))= 0x000007e0000007e0ULL; /* 0x07e0 in each 32-bit half; used by rgb24to16 */
  24 static const uint64_t blue_mask __attribute__((aligned(8))) = 0x0000001f0000001fULL; /* 0x001f in each 32-bit half; used by rgb24to16 */
  25 #endif
  26
  27 void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
  28 {
  29   uint8_t *dest = dst;
  30   const uint8_t *s = src;
  31   const uint8_t *end;
  32 #ifdef HAVE_MMX
  33   uint8_t *mm_end;
  34 #endif
  35   end = s + src_size;
  36 #ifdef HAVE_MMX
  37   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  38   mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
  39   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  40   if(mm_end == end) mm_end -= MMREG_SIZE*4;
  41   while(s < mm_end)
  42   {
  43     __asm __volatile(
  44         PREFETCH"       32%1\n\t"
  45         "movd   %1, %%mm0\n\t"
  46         "punpckldq 3%1, %%mm0\n\t"
  47         "movd   6%1, %%mm1\n\t"
  48         "punpckldq 9%1, %%mm1\n\t"
  49         "movd   12%1, %%mm2\n\t"
  50         "punpckldq 15%1, %%mm2\n\t"
  51         "movd   18%1, %%mm3\n\t"
  52         "punpckldq 21%1, %%mm3\n\t"
  53         "pand   %%mm7, %%mm0\n\t"
  54         "pand   %%mm7, %%mm1\n\t"
  55         "pand   %%mm7, %%mm2\n\t"
  56         "pand   %%mm7, %%mm3\n\t"
  57         MOVNTQ" %%mm0, %0\n\t"
  58         MOVNTQ" %%mm1, 8%0\n\t"
  59         MOVNTQ" %%mm2, 16%0\n\t"
  60         MOVNTQ" %%mm3, 24%0"
  61         :"=m"(*dest)
  62         :"m"(*s)
  63         :"memory");
  64     dest += 32;
  65     s += 24;
  66   }
  67   __asm __volatile(SFENCE:::"memory");
  68   __asm __volatile(EMMS:::"memory");
  69 #endif
  70   while(s < end)
  71   {
  72     *dest++ = *s++;
  73     *dest++ = *s++;
  74     *dest++ = *s++;
  75     *dest++ = 0;
  76   }
  77 }
  78
  79 void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
  80 {
  81   uint8_t *dest = dst;
  82   const uint8_t *s = src;
  83   const uint8_t *end;
  84 #ifdef HAVE_MMX
  85   uint8_t *mm_end;
  86 #endif
  87   end = s + src_size;
  88 #ifdef HAVE_MMX
  89   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  90   mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
  91   __asm __volatile(
  92         "movq   %0, %%mm7\n\t"
  93         "movq   %1, %%mm6"
  94         ::"m"(mask24l),"m"(mask24h):"memory");
  95   if(mm_end == end) mm_end -= MMREG_SIZE*2;
  96   while(s < mm_end)
  97   {
  98     __asm __volatile(
  99         PREFETCH"       32%1\n\t"
 100         "movq   %1, %%mm0\n\t"
 101         "movq   8%1, %%mm1\n\t"
 102         "movq   %%mm0, %%mm2\n\t"
 103         "movq   %%mm1, %%mm3\n\t"
 104         "psrlq  $8, %%mm2\n\t"
 105         "psrlq  $8, %%mm3\n\t"
 106         "pand   %%mm7, %%mm0\n\t"
 107         "pand   %%mm7, %%mm1\n\t"
 108         "pand   %%mm6, %%mm2\n\t"
 109         "pand   %%mm6, %%mm3\n\t"
 110         "por    %%mm2, %%mm0\n\t"
 111         "por    %%mm3, %%mm1\n\t"
 112         MOVNTQ" %%mm0, %0\n\t"
 113         MOVNTQ" %%mm1, 6%0"
 114         :"=m"(*dest)
 115         :"m"(*s)
 116         :"memory");
 117     dest += 12;
 118     s += 16;
 119   }
 120   __asm __volatile(SFENCE:::"memory");
 121   __asm __volatile(EMMS:::"memory");
 122 #endif
 123   while(s < end)
 124   {
 125     *dest++ = *s++;
 126     *dest++ = *s++;
 127     *dest++ = *s++;
 128     s++;
 129   }
 130 }
 131
 132 /*
 133  Original by Strepto/Astral
 134  ported to gcc & bugfixed : A'rpi
 135  MMX2, 3DNOW optimization by Nick Kurshev
 136  32bit c version, and and&add trick by Michael Niedermayer
 137 */
 138 void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
 139 {
 140 #ifdef HAVE_MMX
 141   register const char* s=src+src_size;
 142   register char* d=dst+src_size;
 143   register int offs=-src_size;
 144   __asm __volatile(PREFETCH"    %0"::"m"(*(s+offs)));
 145   __asm __volatile(
 146         "movq   %0, %%mm4\n\t"
 147         ::"m"(mask15s));
 148   while(offs<0)
 149   {
 150         __asm __volatile(
 151                 PREFETCH"       32%1\n\t"
 152                 "movq   %1, %%mm0\n\t"
 153                 "movq   8%1, %%mm2\n\t"
 154                 "movq   %%mm0, %%mm1\n\t"
 155                 "movq   %%mm2, %%mm3\n\t"
 156                 "pand   %%mm4, %%mm0\n\t"
 157                 "pand   %%mm4, %%mm2\n\t"
 158                 "paddw  %%mm1, %%mm0\n\t"
 159                 "paddw  %%mm3, %%mm2\n\t"
 160                 MOVNTQ" %%mm0, %0\n\t"
 161                 MOVNTQ" %%mm2, 8%0"
 162                 :"=m"(*(d+offs))
 163                 :"m"(*(s+offs))
 164                 );
 165         offs+=16;
 166   }
 167   __asm __volatile(SFENCE:::"memory");
 168   __asm __volatile(EMMS:::"memory");
 169 #else
 170 #if 0
 171    const uint16_t *s1=( uint16_t * )src;
 172    uint16_t *d1=( uint16_t * )dst;
 173    uint16_t *e=((uint8_t *)s1)+src_size;
 174    while( s1<e ){
 175      register int x=*( s1++ );
 176      /* rrrrrggggggbbbbb
 177         0rrrrrgggggbbbbb
 178         0111 1111 1110 0000=0x7FE0
 179         00000000000001 1111=0x001F */
 180      *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
 181    }
 182 #else
 183         const unsigned *s1=( unsigned * )src;
 184         unsigned *d1=( unsigned * )dst;
 185         int i;
 186         int size= src_size>>2;
 187         for(i=0; i<size; i++)
 188         {
 189                 register int x= s1[i];
 190 //              d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not allways be true
 191                 d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
 192
 193         }
 194 #endif
 195 #endif
 196 }
 197
 198 /**
 199  * Pallete is assumed to contain bgr32
 200  */
 201 void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 202 {
 203         unsigned i;
 204         for(i=0; i<num_pixels; i++)
 205                 ((unsigned *)dst)[i] = ((unsigned *)palette)[ src[i] ];
 206 }
 207
 208 /**
 209  * Pallete is assumed to contain bgr32
 210  */
 211 void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 212 {
 213         unsigned i;
 214 /*
 215         writes 1 byte o much and might cause alignment issues on some architectures?
 216         for(i=0; i<num_pixels; i++)
 217                 ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ];
 218 */
 219         for(i=0; i<num_pixels; i++)
 220         {
 221                 //FIXME slow?
 222                 dst[0]= palette[ src[i]*4+0 ];
 223                 dst[1]= palette[ src[i]*4+1 ];
 224                 dst[2]= palette[ src[i]*4+2 ];
 225                 dst+= 3;
 226         }
 227 }
 228
 229 void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
 230 {
 231         unsigned j,i,num_pixels=src_size/4;
 232         uint16_t *d = (uint16_t *)dst;
 233         for(i=0,j=0; j<num_pixels; i+=4,j++)
 234         {
 235                 const int b= src[i+0];
 236                 const int g= src[i+1];
 237                 const int r= src[i+2];
 238
 239                 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 240         }
 241 }
 242
 243 void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
 244 {
 245         unsigned j,i,num_pixels=src_size/4;
 246         uint16_t *d = (uint16_t *)dst;
 247         for(i=0,j=0; j<num_pixels; i+=4,j++)
 248         {
 249                 const int b= src[i+0];
 250                 const int g= src[i+1];
 251                 const int r= src[i+2];
 252
 253                 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
 254         }
 255 }
 256
 257 void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
 258 {
 259 #ifdef HAVE_MMX
 260         const uint8_t *s = src;
 261         const uint8_t *end,*mm_end;
 262         uint16_t *d = (uint16_t *)dst;
 263         end = s + src_size;
 264         mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
 265         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
 266         __asm __volatile(
 267             "movq       %0, %%mm7\n\t"
 268             "movq       %1, %%mm6\n\t"
 269             ::"m"(red_mask),"m"(green_mask));
 270         while(s < mm_end)
 271         {
 272             __asm __volatile(
 273                 PREFETCH" 32%1\n\t"
 274                 "movd   %1, %%mm0\n\t"
 275                 "movd   3%1, %%mm3\n\t"
 276                 "punpckldq 6%1, %%mm0\n\t"
 277                 "punpckldq 9%1, %%mm3\n\t"
 278                 "movq   %%mm0, %%mm1\n\t"
 279                 "movq   %%mm0, %%mm2\n\t"
 280                 "movq   %%mm3, %%mm4\n\t"
 281                 "movq   %%mm3, %%mm5\n\t"
 282                 "psrlq  $3, %%mm0\n\t"
 283                 "psrlq  $3, %%mm3\n\t"
 284                 "pand   %2, %%mm0\n\t"
 285                 "pand   %2, %%mm3\n\t"
 286                 "psrlq  $5, %%mm1\n\t"
 287                 "psrlq  $5, %%mm4\n\t"
 288                 "pand   %%mm6, %%mm1\n\t"
 289                 "pand   %%mm6, %%mm4\n\t"
 290                 "psrlq  $8, %%mm2\n\t"
 291                 "psrlq  $8, %%mm5\n\t"
 292                 "pand   %%mm7, %%mm2\n\t"
 293                 "pand   %%mm7, %%mm5\n\t"
 294                 "por    %%mm1, %%mm0\n\t"
 295                 "por    %%mm4, %%mm3\n\t"
 296                 "por    %%mm2, %%mm0\n\t"
 297                 "por    %%mm5, %%mm3\n\t"
 298                 "psllq  $16, %%mm3\n\t"
 299                 "por    %%mm3, %%mm0\n\t"
 300                 MOVNTQ" %%mm0, %0\n\t"
 301                 :"=m"(*d):"m"(*s),"m"(blue_mask):"memory");
 302                 d += 4;
 303                 s += 12;
 304         }
 305         while(s < end)
 306         {
 307                 const int b= *s++;
 308                 const int g= *s++;
 309                 const int r= *s++;
 310                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 311         }
 312         __asm __volatile(SFENCE:::"memory");
 313         __asm __volatile(EMMS:::"memory");
 314 #else
 315         unsigned j,i,num_pixels=src_size/3;
 316         uint16_t *d = (uint16_t *)dst;
 317         for(i=0,j=0; j<num_pixels; i+=3,j++)
 318         {
 319                 const int b= src[i+0];
 320                 const int g= src[i+1];
 321                 const int r= src[i+2];
 322
 323                 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 324         }
 325 #endif
 326 }
 327
 328 void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
 329 {
 330         unsigned j,i,num_pixels=src_size/3;
 331         uint16_t *d = (uint16_t *)dst;
 332         for(i=0,j=0; j<num_pixels; i+=3,j++)
 333         {
 334                 const int b= src[i+0];
 335                 const int g= src[i+1];
 336                 const int r= src[i+2];
 337
 338                 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
 339         }
 340 }
 341
 342 /**
 343  * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
 344  */
 345 void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 346 {
 347         unsigned i;
 348         for(i=0; i<num_pixels; i++)
 349                 ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
 350 }
 351
 352 /**
 353  * Pallete is assumed to contain bgr15, see rgb32to15 to convert the palette
 354  */
 355 void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 356 {
 357         unsigned i;
 358         for(i=0; i<num_pixels; i++)
 359                 ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
 360 }
 361 /**
 362  *
 363  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 364  * problem for anyone then tell me, and ill fix it)
 365  */
 366 void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
 367         unsigned int width, unsigned int height,
 368         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
 369 {
 370         int y;
 371         const int chromWidth= width>>1;
 372         for(y=0; y<height; y++)
 373         {
 374 #ifdef HAVE_MMX
 375 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
 376                 asm volatile(
 377                         "xorl %%eax, %%eax              \n\t"
 378                         "1:                             \n\t"
 379                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
 380                         PREFETCH" 32(%2, %%eax)         \n\t"
 381                         PREFETCH" 32(%3, %%eax)         \n\t"
 382                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
 383                         "movq %%mm0, %%mm2              \n\t" // U(0)
 384                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
 385                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
 386                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
 387
 388                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
 389                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
 390                         "movq %%mm3, %%mm4              \n\t" // Y(0)
 391                         "movq %%mm5, %%mm6              \n\t" // Y(8)
 392                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
 393                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
 394                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
 395                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
 396
 397                         MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
 398                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
 399                         MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
 400                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
 401
 402                         "addl $8, %%eax                 \n\t"
 403                         "cmpl %4, %%eax                 \n\t"
 404                         " jb 1b                         \n\t"
 405                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
 406                         : "%eax"
 407                 );
 408 #else
 409                 int i;
 410                 for(i=0; i<chromWidth; i++)
 411                 {
 412                         dst[4*i+0] = ysrc[2*i+0];
 413                         dst[4*i+1] = usrc[i];
 414                         dst[4*i+2] = ysrc[2*i+1];
 415                         dst[4*i+3] = vsrc[i];
 416                 }
 417 #endif
 418                 if(y&1)
 419                 {
 420                         usrc += chromStride;
 421                         vsrc += chromStride;
 422                 }
 423                 ysrc += lumStride;
 424                 dst += dstStride;
 425         }
 426 #ifdef HAVE_MMX
 427 asm(    EMMS" \n\t"
 428         SFENCE" \n\t"
 429         :::"memory");
 430 #endif
 431 }
 432
 433 /**
 434  *
 435  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 436  * problem for anyone then tell me, and ill fix it)
 437  */
 438 void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 439         unsigned int width, unsigned int height,
 440         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
 441 {
 442         int y;
 443         const int chromWidth= width>>1;
 444         for(y=0; y<height; y+=2)
 445         {
 446 #ifdef HAVE_MMX
 447                 asm volatile(
 448                         "xorl %%eax, %%eax              \n\t"
 449                         "pcmpeqw %%mm7, %%mm7           \n\t"
 450                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
 451                         "1:                             \n\t"
 452                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
 453                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
 454                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
 455                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
 456                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
 457                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
 458                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
 459                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
 460                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
 461                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
 462                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
 463
 464                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
 465
 466                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
 467                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
 468                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
 469                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
 470                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
 471                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
 472                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
 473                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
 474                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
 475                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
 476
 477                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
 478
 479                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
 480                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
 481                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
 482                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
 483                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
 484                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
 485                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
 486                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
 487
 488                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
 489                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
 490
 491                         "addl $8, %%eax                 \n\t"
 492                         "cmpl %4, %%eax                 \n\t"
 493                         " jb 1b                         \n\t"
 494                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
 495                         : "memory", "%eax"
 496                 );
 497
 498                 asm volatile(
 499                         "xorl %%eax, %%eax              \n\t"
 500                         "1:                             \n\t"
 501                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
 502                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
 503                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
 504                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
 505                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
 506                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
 507                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
 508                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
 509                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
 510                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
 511                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
 512
 513                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
 514                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
 515
 516                         "addl $8, %%eax                 \n\t"
 517                         "cmpl %4, %%eax                 \n\t"
 518                         " jb 1b                         \n\t"
 519
 520                         ::"r"(src+srcStride), "r"(ydst+lumStride), "r"(udst), "r"(vdst), "r" (chromWidth)
 521                         : "memory", "%eax"
 522                 );
 523 #else
 524                 int i;
 525                 for(i=0; i<chromWidth; i++)
 526                 {
 527                         ydst[2*i+0]     = src[4*i+0];
 528                         udst[i]         = src[4*i+1];
 529                         ydst[2*i+1]     = src[4*i+2];
 530                         vdst[i]         = src[4*i+3];
 531                 }
 532                 ydst += lumStride;
 533                 src  += srcStride;
 534
 535                 for(i=0; i<chromWidth; i++)
 536                 {
 537                         ydst[2*i+0]     = src[4*i+0];
 538                         ydst[2*i+1]     = src[4*i+2];
 539                 }
 540 #endif
 541                 udst += chromStride;
 542                 vdst += chromStride;
 543                 ydst += lumStride;
 544                 src  += srcStride;
 545         }
 546 #ifdef HAVE_MMX
 547 asm(    EMMS" \n\t"
 548         SFENCE" \n\t"
 549         :::"memory");
 550 #endif
 551 }