git.sesse.net Git - ffmpeg/blob - postproc/rgb2rgb_template.c

   1 /*
   2  *
   3  *  rgb2rgb.c, Software RGB to RGB convertor
   4  *  pluralize by Software PAL8 to RGB convertor
   5  *               Software YUV to YUV convertor
   6  *               Software YUV to RGB convertor
   7  *  Written by Nick Kurshev.
   8  *  palette stuff & yuv stuff by Michael
   9  */
  10 #include <inttypes.h>
  11 #include "../config.h"
  12 #include "rgb2rgb.h"
  13 #include "../mmx_defs.h"
  14
  15 #ifdef HAVE_MMX
  16 static const uint64_t mask32   __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
  17 static const uint64_t mask24l  __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
  18 static const uint64_t mask24h  __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
  19 static const uint64_t mask15b  __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
  20 static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
  21 static const uint64_t mask15s  __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
  22 #endif
  23
  24 void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
  25 {
  26   uint8_t *dest = dst;
  27   const uint8_t *s = src;
  28   const uint8_t *end;
  29 #ifdef HAVE_MMX
  30   uint8_t *mm_end;
  31 #endif
  32   end = s + src_size;
  33 #ifdef HAVE_MMX
  34   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  35   mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
  36   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  37   if(mm_end == end) mm_end -= MMREG_SIZE*2;
  38   while(s < mm_end)
  39   {
  40     __asm __volatile(
  41         PREFETCH"       32%1\n\t"
  42         "movd   %1, %%mm0\n\t"
  43         "punpckldq 3%1, %%mm0\n\t"
  44         "movd   6%1, %%mm1\n\t"
  45         "punpckldq 9%1, %%mm1\n\t"
  46         "movd   12%1, %%mm2\n\t"
  47         "punpckldq 15%1, %%mm2\n\t"
  48         "movd   18%1, %%mm3\n\t"
  49         "punpckldq 21%1, %%mm3\n\t"
  50         "pand   %%mm7, %%mm0\n\t"
  51         "pand   %%mm7, %%mm1\n\t"
  52         "pand   %%mm7, %%mm2\n\t"
  53         "pand   %%mm7, %%mm3\n\t"
  54         MOVNTQ" %%mm0, %0\n\t"
  55         MOVNTQ" %%mm1, 8%0\n\t"
  56         MOVNTQ" %%mm2, 16%0\n\t"
  57         MOVNTQ" %%mm3, 24%0"
  58         :"=m"(*dest)
  59         :"m"(*s)
  60         :"memory");
  61     dest += 32;
  62     s += 24;
  63   }
  64   __asm __volatile(SFENCE:::"memory");
  65   __asm __volatile(EMMS:::"memory");
  66 #endif
  67   while(s < end)
  68   {
  69     *dest++ = *s++;
  70     *dest++ = *s++;
  71     *dest++ = *s++;
  72     *dest++ = 0;
  73   }
  74 }
  75
  76 void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
  77 {
  78   uint8_t *dest = dst;
  79   const uint8_t *s = src;
  80   const uint8_t *end;
  81 #ifdef HAVE_MMX
  82   uint8_t *mm_end;
  83 #endif
  84   end = s + src_size;
  85 #ifdef HAVE_MMX
  86   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  87   mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
  88   __asm __volatile(
  89         "movq   %0, %%mm7\n\t"
  90         "movq   %1, %%mm6"
  91         ::"m"(mask24l),"m"(mask24h):"memory");
  92   if(mm_end == end) mm_end -= MMREG_SIZE*2;
  93   while(s < mm_end)
  94   {
  95     __asm __volatile(
  96         PREFETCH"       32%1\n\t"
  97         "movq   %1, %%mm0\n\t"
  98         "movq   8%1, %%mm1\n\t"
  99         "movq   %%mm0, %%mm2\n\t"
 100         "movq   %%mm1, %%mm3\n\t"
 101         "psrlq  $8, %%mm2\n\t"
 102         "psrlq  $8, %%mm3\n\t"
 103         "pand   %%mm7, %%mm0\n\t"
 104         "pand   %%mm7, %%mm1\n\t"
 105         "pand   %%mm6, %%mm2\n\t"
 106         "pand   %%mm6, %%mm3\n\t"
 107         "por    %%mm2, %%mm0\n\t"
 108         "por    %%mm3, %%mm1\n\t"
 109         MOVNTQ" %%mm0, %0\n\t"
 110         MOVNTQ" %%mm1, 6%0"
 111         :"=m"(*dest)
 112         :"m"(*s)
 113         :"memory");
 114     dest += 12;
 115     s += 16;
 116   }
 117   __asm __volatile(SFENCE:::"memory");
 118   __asm __volatile(EMMS:::"memory");
 119 #endif
 120   while(s < end)
 121   {
 122     *dest++ = *s++;
 123     *dest++ = *s++;
 124     *dest++ = *s++;
 125     s++;
 126   }
 127 }
 128
 129 /*
 130  Original by Strepto/Astral
 131  ported to gcc & bugfixed : A'rpi
 132  MMX2, 3DNOW optimization by Nick Kurshev
 133  32bit c version, and and&add trick by Michael Niedermayer
 134 */
 135 void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
 136 {
 137 #ifdef HAVE_MMX
 138   register const char* s=src+src_size;
 139   register char* d=dst+src_size;
 140   register int offs=-src_size;
 141   __asm __volatile(PREFETCH"    %0"::"m"(*(s+offs)));
 142   __asm __volatile(
 143         "movq   %0, %%mm4\n\t"
 144         ::"m"(mask15s));
 145   while(offs<0)
 146   {
 147         __asm __volatile(
 148                 PREFETCH"       32%1\n\t"
 149                 "movq   %1, %%mm0\n\t"
 150                 "movq   8%1, %%mm2\n\t"
 151                 "movq   %%mm0, %%mm1\n\t"
 152                 "movq   %%mm2, %%mm3\n\t"
 153                 "pand   %%mm4, %%mm0\n\t"
 154                 "pand   %%mm4, %%mm2\n\t"
 155                 "paddw  %%mm1, %%mm0\n\t"
 156                 "paddw  %%mm3, %%mm2\n\t"
 157                 MOVNTQ" %%mm0, %0\n\t"
 158                 MOVNTQ" %%mm2, 8%0"
 159                 :"=m"(*(d+offs))
 160                 :"m"(*(s+offs))
 161                 );
 162         offs+=16;
 163   }
 164   __asm __volatile(SFENCE:::"memory");
 165   __asm __volatile(EMMS:::"memory");
 166 #else
 167 #if 0
 168    const uint16_t *s1=( uint16_t * )src;
 169    uint16_t *d1=( uint16_t * )dst;
 170    uint16_t *e=((uint8_t *)s1)+src_size;
 171    while( s1<e ){
 172      register int x=*( s1++ );
 173      /* rrrrrggggggbbbbb
 174         0rrrrrgggggbbbbb
 175         0111 1111 1110 0000=0x7FE0
 176         00000000000001 1111=0x001F */
 177      *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
 178    }
 179 #else
 180         const unsigned *s1=( unsigned * )src;
 181         unsigned *d1=( unsigned * )dst;
 182         int i;
 183         int size= src_size>>2;
 184         for(i=0; i<size; i++)
 185         {
 186                 register int x= s1[i];
 187 //              d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not allways be true
 188                 d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
 189
 190         }
 191 #endif
 192 #endif
 193 }
 194
 195 /**
 196  * Pallete is assumed to contain bgr32
 197  */
 198 void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 199 {
 200         unsigned i;
 201         for(i=0; i<num_pixels; i++)
 202                 ((unsigned *)dst)[i] = ((unsigned *)palette)[ src[i] ];
 203 }
 204
 205 /**
 206  * Pallete is assumed to contain bgr32
 207  */
 208 void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 209 {
 210         unsigned i;
 211 /*
 212         writes 1 byte o much and might cause alignment issues on some architectures?
 213         for(i=0; i<num_pixels; i++)
 214                 ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ];
 215 */
 216         for(i=0; i<num_pixels; i++)
 217         {
 218                 //FIXME slow?
 219                 dst[0]= palette[ src[i]*4+0 ];
 220                 dst[1]= palette[ src[i]*4+1 ];
 221                 dst[2]= palette[ src[i]*4+2 ];
 222                 dst+= 3;
 223         }
 224 }
 225
 226 void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
 227 {
 228         unsigned j,i,num_pixels=src_size/4;
 229         uint16_t *d = (uint16_t *)dst;
 230         for(i=0,j=0; j<num_pixels; i+=4,j++)
 231         {
 232                 const int b= src[i+0];
 233                 const int g= src[i+1];
 234                 const int r= src[i+2];
 235
 236                 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 237         }
 238 }
 239
 240 void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
 241 {
 242         unsigned j,i,num_pixels=src_size/4;
 243         uint16_t *d = (uint16_t *)dst;
 244         for(i=0,j=0; j<num_pixels; i+=4,j++)
 245         {
 246                 const int b= src[i+0];
 247                 const int g= src[i+1];
 248                 const int r= src[i+2];
 249
 250                 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
 251         }
 252 }
 253
 254 void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
 255 {
 256         unsigned j,i,num_pixels=src_size/3;
 257         uint16_t *d = (uint16_t *)dst;
 258 #if 0/*def HAVE_MMX*/
 259         unsigned mm_npix;
 260         const uint64_t mm_fc = 0xFCFCFCFCFCFCFCFCULL, mm_f8 = 0xF8F8F8F8F8F8F8F8ULL;
 261         mm_npix = ((num_pixels)/(MMREG_SIZE*2))*(MMREG_SIZE*2);
 262         num_pixels -= mm_npix;
 263         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
 264         __asm __volatile(
 265             "movq       %0, %%mm7\n\t"
 266             "movq       %1, %%mm6\n\t"
 267             ::"m"(mm_fc),"m"(mm_f8));
 268
 269         for(j=0,i=0;j<mm_npix;j+=4,i+=12)
 270         {
 271             __asm __volatile(
 272                 PREFETCH" 32%1\n\t"
 273                 "movd   %1, %%mm0\n\t"
 274                 "punpckldq 3%1, %%mm0\n\t"
 275                 "movd   6%1, %%mm3\n\t"
 276                 "punpckldq 9%1, %%mm3\n\t"
 277                 "movq   %%mm0, %%mm1\n\t"
 278                 "movq   %%mm0, %%mm2\n\t"
 279                 "movq   %%mm3, %%mm4\n\t"
 280                 "movq   %%mm3, %%mm5\n\t"
 281                 "psrlq  $3, %%mm0\n\t"
 282                 "psrlq  $3, %%mm3\n\t"
 283                 "pand   %%mm7, %%mm1\n\t"
 284                 "pand   %%mm7, %%mm4\n\t"
 285                 "psllq  $3, %%mm1\n\t"
 286                 "psllq  $3, %%mm4\n\t"
 287                 "pand   %%mm6, %%mm2\n\t"
 288                 "pand   %%mm6, %%mm5\n\t"
 289                 "psllq  $8, %%mm2\n\t"
 290                 "psllq  $8, %%mm5\n\t"
 291                 "por    %%mm1, %%mm0\n\t"
 292                 "por    %%mm2, %%mm0\n\t"
 293                 "por    %%mm4, %%mm3\n\t"
 294                 "por    %%mm5, %%mm3\n\t"
 295                 "punpcklwd %%mm3, %%mm0\n\t"
 296                 MOVNTQ" %%mm0, %0\n\t"
 297                 :"=m"(d[j]):"m"(src[i]):"memory");
 298         }
 299 #endif
 300         for(i=0,j=0; j<num_pixels; i+=3,j++)
 301         {
 302                 const int b= src[i+0];
 303                 const int g= src[i+1];
 304                 const int r= src[i+2];
 305
 306                 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 307         }
 308 }
 309
 310 void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
 311 {
 312         unsigned j,i,num_pixels=src_size/3;
 313         uint16_t *d = (uint16_t *)dst;
 314         for(i=0,j=0; j<num_pixels; i+=3,j++)
 315         {
 316                 const int b= src[i+0];
 317                 const int g= src[i+1];
 318                 const int r= src[i+2];
 319
 320                 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
 321         }
 322 }
 323
 324 /**
 325  * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
 326  */
 327 void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 328 {
 329         unsigned i;
 330         for(i=0; i<num_pixels; i++)
 331                 ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
 332 }
 333
 334 /**
 335  * Pallete is assumed to contain bgr15, see rgb32to15 to convert the palette
 336  */
 337 void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 338 {
 339         unsigned i;
 340         for(i=0; i<num_pixels; i++)
 341                 ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
 342 }
 343 /**
 344  *
 345  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 346  * problem for anyone then tell me, and ill fix it)
 347  */
 348 void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
 349         unsigned int width, unsigned int height,
 350         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
 351 {
 352         int y;
 353         const int chromWidth= width>>1;
 354         for(y=0; y<height; y++)
 355         {
 356 #ifdef HAVE_MMX
 357 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
 358                 asm volatile(
 359                         "xorl %%eax, %%eax              \n\t"
 360                         "1:                             \n\t"
 361                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
 362                         PREFETCH" 32(%2, %%eax)         \n\t"
 363                         PREFETCH" 32(%3, %%eax)         \n\t"
 364                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
 365                         "movq %%mm0, %%mm2              \n\t" // U(0)
 366                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
 367                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
 368                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
 369
 370                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
 371                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
 372                         "movq %%mm3, %%mm4              \n\t" // Y(0)
 373                         "movq %%mm5, %%mm6              \n\t" // Y(8)
 374                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
 375                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
 376                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
 377                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
 378
 379                         MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
 380                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
 381                         MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
 382                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
 383
 384                         "addl $8, %%eax                 \n\t"
 385                         "cmpl %4, %%eax                 \n\t"
 386                         " jb 1b                         \n\t"
 387                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
 388                         : "%eax"
 389                 );
 390 #else
 391                 int i;
 392                 for(i=0; i<chromWidth; i++)
 393                 {
 394                         dst[4*i+0] = ysrc[2*i+0];
 395                         dst[4*i+1] = usrc[i];
 396                         dst[4*i+2] = ysrc[2*i+1];
 397                         dst[4*i+3] = vsrc[i];
 398                 }
 399 #endif
 400                 if(y&1)
 401                 {
 402                         usrc += chromStride;
 403                         vsrc += chromStride;
 404                 }
 405                 ysrc += lumStride;
 406                 dst += dstStride;
 407         }
 408 #ifdef HAVE_MMX
 409 asm(    EMMS" \n\t"
 410         SFENCE" \n\t"
 411         :::"memory");
 412 #endif
 413 }
 414
 415 /**
 416  *
 417  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 418  * problem for anyone then tell me, and ill fix it)
 419  */
 420 void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 421         unsigned int width, unsigned int height,
 422         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
 423 {
 424         int y;
 425         const int chromWidth= width>>1;
 426         for(y=0; y<height; y+=2)
 427         {
 428 #ifdef HAVE_MMX
 429                 asm volatile(
 430                         "xorl %%eax, %%eax              \n\t"
 431                         "pcmpeqw %%mm7, %%mm7           \n\t"
 432                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
 433                         "1:                             \n\t"
 434                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
 435                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
 436                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
 437                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
 438                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
 439                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
 440                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
 441                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
 442                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
 443                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
 444                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
 445
 446                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
 447
 448                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
 449                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
 450                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
 451                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
 452                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
 453                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
 454                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
 455                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
 456                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
 457                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
 458
 459                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
 460
 461                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
 462                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
 463                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
 464                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
 465                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
 466                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
 467                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
 468                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
 469
 470                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
 471                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
 472
 473                         "addl $8, %%eax                 \n\t"
 474                         "cmpl %4, %%eax                 \n\t"
 475                         " jb 1b                         \n\t"
 476                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
 477                         : "memory", "%eax"
 478                 );
 479
 480                 asm volatile(
 481                         "xorl %%eax, %%eax              \n\t"
 482                         "1:                             \n\t"
 483                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
 484                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
 485                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
 486                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
 487                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
 488                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
 489                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
 490                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
 491                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
 492                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
 493                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
 494
 495                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
 496                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
 497
 498                         "addl $8, %%eax                 \n\t"
 499                         "cmpl %4, %%eax                 \n\t"
 500                         " jb 1b                         \n\t"
 501
 502                         ::"r"(src+srcStride), "r"(ydst+lumStride), "r"(udst), "r"(vdst), "r" (chromWidth)
 503                         : "memory", "%eax"
 504                 );
 505 #else
 506                 int i;
 507                 for(i=0; i<chromWidth; i++)
 508                 {
 509                         ydst[2*i+0]     = src[4*i+0];
 510                         udst[i]         = src[4*i+1];
 511                         ydst[2*i+1]     = src[4*i+2];
 512                         vdst[i]         = src[4*i+3];
 513                 }
 514                 ydst += lumStride;
 515                 src  += srcStride;
 516
 517                 for(i=0; i<chromWidth; i++)
 518                 {
 519                         ydst[2*i+0]     = src[4*i+0];
 520                         ydst[2*i+1]     = src[4*i+2];
 521                 }
 522 #endif
 523                 udst += chromStride;
 524                 vdst += chromStride;
 525                 ydst += lumStride;
 526                 src  += srcStride;
 527         }
 528 #ifdef HAVE_MMX
 529 asm(    EMMS" \n\t"
 530         SFENCE" \n\t"
 531         :::"memory");
 532 #endif
 533 }