git.sesse.net Git - ffmpeg/blob - postproc/rgb2rgb.c

/*
 *
 *  rgb2rgb.c, Software RGB to RGB converter
 *  extended with  Software PAL8 to RGB converter
 *                 Software YUV to YUV converter
 *                 Software YUV to RGB converter
 *  Written by Nick Kurshev.
 *  palette stuff & yuv stuff by Michael
 */
  10 #include <inttypes.h>
  11 #include "../config.h"
  12 #include "rgb2rgb.h"
  13 #include "../mmx_defs.h"
  14
  15 #ifdef HAVE_MMX
  16 static const uint64_t mask32   __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
  17 static const uint64_t mask24l  __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
  18 static const uint64_t mask24h  __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
  19 static const uint64_t mask15b  __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
  20 static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
  21 static const uint64_t mask15s  __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
  22 static const uint64_t red_16mask  __attribute__((aligned(8))) = 0x0000f8000000f800ULL;
  23 static const uint64_t green_16mask __attribute__((aligned(8)))= 0x000007e0000007e0ULL;
  24 static const uint64_t blue_16mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
  25 static const uint64_t red_15mask  __attribute__((aligned(8))) = 0x00007c000000f800ULL;
  26 static const uint64_t green_15mask __attribute__((aligned(8)))= 0x000003e0000007e0ULL;
  27 static const uint64_t blue_15mask __attribute__((aligned(8))) = 0x0000001f0000001fULL;
  28 #endif
  29
  30 void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
  31 {
  32   uint8_t *dest = dst;
  33   const uint8_t *s = src;
  34   const uint8_t *end;
  35 #ifdef HAVE_MMX
  36   uint8_t *mm_end;
  37 #endif
  38   end = s + src_size;
  39 #ifdef HAVE_MMX
  40   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  41   mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*4))*(MMREG_SIZE*4));
  42   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  43   if(mm_end == end) mm_end -= MMREG_SIZE*4;
  44   while(s < mm_end)
  45   {
  46     __asm __volatile(
  47         PREFETCH"       32%1\n\t"
  48         "movd   %1, %%mm0\n\t"
  49         "punpckldq 3%1, %%mm0\n\t"
  50         "movd   6%1, %%mm1\n\t"
  51         "punpckldq 9%1, %%mm1\n\t"
  52         "movd   12%1, %%mm2\n\t"
  53         "punpckldq 15%1, %%mm2\n\t"
  54         "movd   18%1, %%mm3\n\t"
  55         "punpckldq 21%1, %%mm3\n\t"
  56         "pand   %%mm7, %%mm0\n\t"
  57         "pand   %%mm7, %%mm1\n\t"
  58         "pand   %%mm7, %%mm2\n\t"
  59         "pand   %%mm7, %%mm3\n\t"
  60         MOVNTQ" %%mm0, %0\n\t"
  61         MOVNTQ" %%mm1, 8%0\n\t"
  62         MOVNTQ" %%mm2, 16%0\n\t"
  63         MOVNTQ" %%mm3, 24%0"
  64         :"=m"(*dest)
  65         :"m"(*s)
  66         :"memory");
  67     dest += 32;
  68     s += 24;
  69   }
  70   __asm __volatile(SFENCE:::"memory");
  71   __asm __volatile(EMMS:::"memory");
  72 #endif
  73   while(s < end)
  74   {
  75     *dest++ = *s++;
  76     *dest++ = *s++;
  77     *dest++ = *s++;
  78     *dest++ = 0;
  79   }
  80 }
  81
  82 void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
  83 {
  84   uint8_t *dest = dst;
  85   const uint8_t *s = src;
  86   const uint8_t *end;
  87 #ifdef HAVE_MMX
  88   uint8_t *mm_end;
  89 #endif
  90   end = s + src_size;
  91 #ifdef HAVE_MMX
  92   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  93   mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
  94   __asm __volatile(
  95         "movq   %0, %%mm7\n\t"
  96         "movq   %1, %%mm6"
  97         ::"m"(mask24l),"m"(mask24h):"memory");
  98   while(s < mm_end)
  99   {
 100     __asm __volatile(
 101         PREFETCH"       32%1\n\t"
 102         "movq   %1, %%mm0\n\t"
 103         "movq   8%1, %%mm1\n\t"
 104         "movq   %%mm0, %%mm2\n\t"
 105         "movq   %%mm1, %%mm3\n\t"
 106         "psrlq  $8, %%mm2\n\t"
 107         "psrlq  $8, %%mm3\n\t"
 108         "pand   %%mm7, %%mm0\n\t"
 109         "pand   %%mm7, %%mm1\n\t"
 110         "pand   %%mm6, %%mm2\n\t"
 111         "pand   %%mm6, %%mm3\n\t"
 112         "por    %%mm2, %%mm0\n\t"
 113         "por    %%mm3, %%mm1\n\t"
 114         MOVNTQ" %%mm0, %0\n\t"
 115         MOVNTQ" %%mm1, 6%0"
 116         :"=m"(*dest)
 117         :"m"(*s)
 118         :"memory");
 119     dest += 12;
 120     s += 16;
 121   }
 122   __asm __volatile(SFENCE:::"memory");
 123   __asm __volatile(EMMS:::"memory");
 124 #endif
 125   while(s < end)
 126   {
 127     *dest++ = *s++;
 128     *dest++ = *s++;
 129     *dest++ = *s++;
 130     s++;
 131   }
 132 }
 133
 134 /*
 135  Original by Strepto/Astral
 136  ported to gcc & bugfixed : A'rpi
 137  MMX2, 3DNOW optimization by Nick Kurshev
 138  32bit c version, and and&add trick by Michael Niedermayer
 139 */
 140 void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
 141 {
 142 #ifdef HAVE_MMX
 143   register const char* s=src+src_size;
 144   register char* d=dst+src_size;
 145   register int offs=-src_size;
 146   __asm __volatile(PREFETCH"    %0"::"m"(*(s+offs)));
 147   __asm __volatile(
 148         "movq   %0, %%mm4\n\t"
 149         ::"m"(mask15s));
 150   while(offs<0)
 151   {
 152         __asm __volatile(
 153                 PREFETCH"       32%1\n\t"
 154                 "movq   %1, %%mm0\n\t"
 155                 "movq   8%1, %%mm2\n\t"
 156                 "movq   %%mm0, %%mm1\n\t"
 157                 "movq   %%mm2, %%mm3\n\t"
 158                 "pand   %%mm4, %%mm0\n\t"
 159                 "pand   %%mm4, %%mm2\n\t"
 160                 "paddw  %%mm1, %%mm0\n\t"
 161                 "paddw  %%mm3, %%mm2\n\t"
 162                 MOVNTQ" %%mm0, %0\n\t"
 163                 MOVNTQ" %%mm2, 8%0"
 164                 :"=m"(*(d+offs))
 165                 :"m"(*(s+offs))
 166                 );
 167         offs+=16;
 168   }
 169   __asm __volatile(SFENCE:::"memory");
 170   __asm __volatile(EMMS:::"memory");
 171 #else
 172 #if 0
 173    const uint16_t *s1=( uint16_t * )src;
 174    uint16_t *d1=( uint16_t * )dst;
 175    uint16_t *e=((uint8_t *)s1)+src_size;
 176    while( s1<e ){
 177      register int x=*( s1++ );
 178      /* rrrrrggggggbbbbb
 179         0rrrrrgggggbbbbb
 180         0111 1111 1110 0000=0x7FE0
 181         00000000000001 1111=0x001F */
 182      *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
 183    }
 184 #else
 185         const unsigned *s1=( unsigned * )src;
 186         unsigned *d1=( unsigned * )dst;
 187         int i;
 188         int size= src_size>>2;
 189         for(i=0; i<size; i++)
 190         {
 191                 register int x= s1[i];
 192 //              d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not allways be true
 193                 d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
 194
 195         }
 196 #endif
 197 #endif
 198 }
 199
 200 /**
 201  * Pallete is assumed to contain bgr32
 202  */
 203 void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 204 {
 205         unsigned i;
 206         for(i=0; i<num_pixels; i++)
 207                 ((unsigned *)dst)[i] = ((unsigned *)palette)[ src[i] ];
 208 }
 209
 210 /**
 211  * Pallete is assumed to contain bgr32
 212  */
 213 void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 214 {
 215         unsigned i;
 216 /*
 217         writes 1 byte o much and might cause alignment issues on some architectures?
 218         for(i=0; i<num_pixels; i++)
 219                 ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ];
 220 */
 221         for(i=0; i<num_pixels; i++)
 222         {
 223                 //FIXME slow?
 224                 dst[0]= palette[ src[i]*4+0 ];
 225                 dst[1]= palette[ src[i]*4+1 ];
 226                 dst[2]= palette[ src[i]*4+2 ];
 227                 dst+= 3;
 228         }
 229 }
 230
 231 void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
 232 {
 233 #ifdef HAVE_MMX
 234         const uint8_t *s = src;
 235         const uint8_t *end,*mm_end;
 236         uint16_t *d = (uint16_t *)dst;
 237         end = s + src_size;
 238         mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
 239         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
 240         __asm __volatile(
 241             "movq       %0, %%mm7\n\t"
 242             "movq       %1, %%mm6\n\t"
 243             ::"m"(red_16mask),"m"(green_16mask));
 244         while(s < mm_end)
 245         {
 246             __asm __volatile(
 247                 PREFETCH" 32%1\n\t"
 248                 "movd   %1, %%mm0\n\t"
 249                 "movd   4%1, %%mm3\n\t"
 250                 "punpckldq 8%1, %%mm0\n\t"
 251                 "punpckldq 12%1, %%mm3\n\t"
 252                 "movq   %%mm0, %%mm1\n\t"
 253                 "movq   %%mm0, %%mm2\n\t"
 254                 "movq   %%mm3, %%mm4\n\t"
 255                 "movq   %%mm3, %%mm5\n\t"
 256                 "psrlq  $3, %%mm0\n\t"
 257                 "psrlq  $3, %%mm3\n\t"
 258                 "pand   %2, %%mm0\n\t"
 259                 "pand   %2, %%mm3\n\t"
 260                 "psrlq  $5, %%mm1\n\t"
 261                 "psrlq  $5, %%mm4\n\t"
 262                 "pand   %%mm6, %%mm1\n\t"
 263                 "pand   %%mm6, %%mm4\n\t"
 264                 "psrlq  $8, %%mm2\n\t"
 265                 "psrlq  $8, %%mm5\n\t"
 266                 "pand   %%mm7, %%mm2\n\t"
 267                 "pand   %%mm7, %%mm5\n\t"
 268                 "por    %%mm1, %%mm0\n\t"
 269                 "por    %%mm4, %%mm3\n\t"
 270                 "por    %%mm2, %%mm0\n\t"
 271                 "por    %%mm5, %%mm3\n\t"
 272                 "psllq  $16, %%mm3\n\t"
 273                 "por    %%mm3, %%mm0\n\t"
 274                 MOVNTQ" %%mm0, %0\n\t"
 275                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
 276                 d += 4;
 277                 s += 16;
 278         }
 279         while(s < end)
 280         {
 281                 const int b= *s++;
 282                 const int g= *s++;
 283                 const int r= *s++;
 284                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 285         }
 286         __asm __volatile(SFENCE:::"memory");
 287         __asm __volatile(EMMS:::"memory");
 288 #else
 289         unsigned j,i,num_pixels=src_size/4;
 290         uint16_t *d = (uint16_t *)dst;
 291         for(i=0,j=0; j<num_pixels; i+=4,j++)
 292         {
 293                 const int b= src[i+0];
 294                 const int g= src[i+1];
 295                 const int r= src[i+2];
 296
 297                 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 298         }
 299 #endif
 300 }
 301
 302 void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
 303 {
 304 #ifdef HAVE_MMX
 305         const uint8_t *s = src;
 306         const uint8_t *end,*mm_end;
 307         uint16_t *d = (uint16_t *)dst;
 308         end = s + src_size;
 309         mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
 310         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
 311         __asm __volatile(
 312             "movq       %0, %%mm7\n\t"
 313             "movq       %1, %%mm6\n\t"
 314             ::"m"(red_15mask),"m"(green_15mask));
 315         while(s < mm_end)
 316         {
 317             __asm __volatile(
 318                 PREFETCH" 32%1\n\t"
 319                 "movd   %1, %%mm0\n\t"
 320                 "movd   4%1, %%mm3\n\t"
 321                 "punpckldq 8%1, %%mm0\n\t"
 322                 "punpckldq 12%1, %%mm3\n\t"
 323                 "movq   %%mm0, %%mm1\n\t"
 324                 "movq   %%mm0, %%mm2\n\t"
 325                 "movq   %%mm3, %%mm4\n\t"
 326                 "movq   %%mm3, %%mm5\n\t"
 327                 "psrlq  $3, %%mm0\n\t"
 328                 "psrlq  $3, %%mm3\n\t"
 329                 "pand   %2, %%mm0\n\t"
 330                 "pand   %2, %%mm3\n\t"
 331                 "psrlq  $6, %%mm1\n\t"
 332                 "psrlq  $6, %%mm4\n\t"
 333                 "pand   %%mm6, %%mm1\n\t"
 334                 "pand   %%mm6, %%mm4\n\t"
 335                 "psrlq  $9, %%mm2\n\t"
 336                 "psrlq  $9, %%mm5\n\t"
 337                 "pand   %%mm7, %%mm2\n\t"
 338                 "pand   %%mm7, %%mm5\n\t"
 339                 "por    %%mm1, %%mm0\n\t"
 340                 "por    %%mm4, %%mm3\n\t"
 341                 "por    %%mm2, %%mm0\n\t"
 342                 "por    %%mm5, %%mm3\n\t"
 343                 "psllq  $16, %%mm3\n\t"
 344                 "por    %%mm3, %%mm0\n\t"
 345                 MOVNTQ" %%mm0, %0\n\t"
 346                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
 347                 d += 4;
 348                 s += 16;
 349         }
 350         while(s < end)
 351         {
 352                 const int b= *s++;
 353                 const int g= *s++;
 354                 const int r= *s++;
 355                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
 356         }
 357         __asm __volatile(SFENCE:::"memory");
 358         __asm __volatile(EMMS:::"memory");
 359 #else
 360         unsigned j,i,num_pixels=src_size/4;
 361         uint16_t *d = (uint16_t *)dst;
 362         for(i=0,j=0; j<num_pixels; i+=4,j++)
 363         {
 364                 const int b= src[i+0];
 365                 const int g= src[i+1];
 366                 const int r= src[i+2];
 367
 368                 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
 369         }
 370 #endif
 371 }
 372
 373 void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
 374 {
 375 #ifdef HAVE_MMX
 376         const uint8_t *s = src;
 377         const uint8_t *end,*mm_end;
 378         uint16_t *d = (uint16_t *)dst;
 379         end = s + src_size;
 380         mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
 381         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
 382         __asm __volatile(
 383             "movq       %0, %%mm7\n\t"
 384             "movq       %1, %%mm6\n\t"
 385             ::"m"(red_16mask),"m"(green_16mask));
 386         if(mm_end == end) mm_end -= MMREG_SIZE*2;
 387         while(s < mm_end)
 388         {
 389             __asm __volatile(
 390                 PREFETCH" 32%1\n\t"
 391                 "movd   %1, %%mm0\n\t"
 392                 "movd   3%1, %%mm3\n\t"
 393                 "punpckldq 6%1, %%mm0\n\t"
 394                 "punpckldq 9%1, %%mm3\n\t"
 395                 "movq   %%mm0, %%mm1\n\t"
 396                 "movq   %%mm0, %%mm2\n\t"
 397                 "movq   %%mm3, %%mm4\n\t"
 398                 "movq   %%mm3, %%mm5\n\t"
 399                 "psrlq  $3, %%mm0\n\t"
 400                 "psrlq  $3, %%mm3\n\t"
 401                 "pand   %2, %%mm0\n\t"
 402                 "pand   %2, %%mm3\n\t"
 403                 "psrlq  $5, %%mm1\n\t"
 404                 "psrlq  $5, %%mm4\n\t"
 405                 "pand   %%mm6, %%mm1\n\t"
 406                 "pand   %%mm6, %%mm4\n\t"
 407                 "psrlq  $8, %%mm2\n\t"
 408                 "psrlq  $8, %%mm5\n\t"
 409                 "pand   %%mm7, %%mm2\n\t"
 410                 "pand   %%mm7, %%mm5\n\t"
 411                 "por    %%mm1, %%mm0\n\t"
 412                 "por    %%mm4, %%mm3\n\t"
 413                 "por    %%mm2, %%mm0\n\t"
 414                 "por    %%mm5, %%mm3\n\t"
 415                 "psllq  $16, %%mm3\n\t"
 416                 "por    %%mm3, %%mm0\n\t"
 417                 MOVNTQ" %%mm0, %0\n\t"
 418                 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory");
 419                 d += 4;
 420                 s += 12;
 421         }
 422         while(s < end)
 423         {
 424                 const int b= *s++;
 425                 const int g= *s++;
 426                 const int r= *s++;
 427                 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 428         }
 429         __asm __volatile(SFENCE:::"memory");
 430         __asm __volatile(EMMS:::"memory");
 431 #else
 432         unsigned j,i,num_pixels=src_size/3;
 433         uint16_t *d = (uint16_t *)dst;
 434         for(i=0,j=0; j<num_pixels; i+=3,j++)
 435         {
 436                 const int b= src[i+0];
 437                 const int g= src[i+1];
 438                 const int r= src[i+2];
 439
 440                 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 441         }
 442 #endif
 443 }
 444
 445 void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
 446 {
 447 #ifdef HAVE_MMX
 448         const uint8_t *s = src;
 449         const uint8_t *end,*mm_end;
 450         uint16_t *d = (uint16_t *)dst;
 451         end = s + src_size;
 452         mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
 453         __asm __volatile(PREFETCH"      %0"::"m"(*src):"memory");
 454         __asm __volatile(
 455             "movq       %0, %%mm7\n\t"
 456             "movq       %1, %%mm6\n\t"
 457             ::"m"(red_15mask),"m"(green_15mask));
 458         if(mm_end == end) mm_end -= MMREG_SIZE*2;
 459         while(s < mm_end)
 460         {
 461             __asm __volatile(
 462                 PREFETCH" 32%1\n\t"
 463                 "movd   %1, %%mm0\n\t"
 464                 "movd   3%1, %%mm3\n\t"
 465                 "punpckldq 6%1, %%mm0\n\t"
 466                 "punpckldq 9%1, %%mm3\n\t"
 467                 "movq   %%mm0, %%mm1\n\t"
 468                 "movq   %%mm0, %%mm2\n\t"
 469                 "movq   %%mm3, %%mm4\n\t"
 470                 "movq   %%mm3, %%mm5\n\t"
 471                 "psrlq  $3, %%mm0\n\t"
 472                 "psrlq  $3, %%mm3\n\t"
 473                 "pand   %2, %%mm0\n\t"
 474                 "pand   %2, %%mm3\n\t"
 475                 "psrlq  $6, %%mm1\n\t"
 476                 "psrlq  $6, %%mm4\n\t"
 477                 "pand   %%mm6, %%mm1\n\t"
 478                 "pand   %%mm6, %%mm4\n\t"
 479                 "psrlq  $9, %%mm2\n\t"
 480                 "psrlq  $9, %%mm5\n\t"
 481                 "pand   %%mm7, %%mm2\n\t"
 482                 "pand   %%mm7, %%mm5\n\t"
 483                 "por    %%mm1, %%mm0\n\t"
 484                 "por    %%mm4, %%mm3\n\t"
 485                 "por    %%mm2, %%mm0\n\t"
 486                 "por    %%mm5, %%mm3\n\t"
 487                 "psllq  $16, %%mm3\n\t"
 488                 "por    %%mm3, %%mm0\n\t"
 489                 MOVNTQ" %%mm0, %0\n\t"
 490                 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory");
 491                 d += 4;
 492                 s += 12;
 493         }
 494         while(s < end)
 495         {
 496                 const int b= *s++;
 497                 const int g= *s++;
 498                 const int r= *s++;
 499                 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
 500         }
 501         __asm __volatile(SFENCE:::"memory");
 502         __asm __volatile(EMMS:::"memory");
 503 #else
 504         unsigned j,i,num_pixels=src_size/3;
 505         uint16_t *d = (uint16_t *)dst;
 506         for(i=0,j=0; j<num_pixels; i+=3,j++)
 507         {
 508                 const int b= src[i+0];
 509                 const int g= src[i+1];
 510                 const int r= src[i+2];
 511
 512                 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
 513         }
 514 #endif
 515 }
 516
 517 /**
 518  * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
 519  */
 520 void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 521 {
 522         unsigned i;
 523         for(i=0; i<num_pixels; i++)
 524                 ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
 525 }
 526
 527 /**
 528  * Pallete is assumed to contain bgr15, see rgb32to15 to convert the palette
 529  */
 530 void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 531 {
 532         unsigned i;
 533         for(i=0; i<num_pixels; i++)
 534                 ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
 535 }
 536 /**
 537  *
 538  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 539  * problem for anyone then tell me, and ill fix it)
 540  */
 541 void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
 542         unsigned int width, unsigned int height,
 543         unsigned int lumStride, unsigned int chromStride, unsigned int dstStride)
 544 {
 545         int y;
 546         const int chromWidth= width>>1;
 547         for(y=0; y<height; y++)
 548         {
 549 #ifdef HAVE_MMX
 550 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
 551                 asm volatile(
 552                         "xorl %%eax, %%eax              \n\t"
 553                         "1:                             \n\t"
 554                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
 555                         PREFETCH" 32(%2, %%eax)         \n\t"
 556                         PREFETCH" 32(%3, %%eax)         \n\t"
 557                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
 558                         "movq %%mm0, %%mm2              \n\t" // U(0)
 559                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
 560                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
 561                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
 562
 563                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
 564                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
 565                         "movq %%mm3, %%mm4              \n\t" // Y(0)
 566                         "movq %%mm5, %%mm6              \n\t" // Y(8)
 567                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
 568                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
 569                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
 570                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
 571
 572                         MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
 573                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
 574                         MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
 575                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
 576
 577                         "addl $8, %%eax                 \n\t"
 578                         "cmpl %4, %%eax                 \n\t"
 579                         " jb 1b                         \n\t"
 580                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
 581                         : "%eax"
 582                 );
 583 #else
 584                 int i;
 585                 for(i=0; i<chromWidth; i++)
 586                 {
 587                         dst[4*i+0] = ysrc[2*i+0];
 588                         dst[4*i+1] = usrc[i];
 589                         dst[4*i+2] = ysrc[2*i+1];
 590                         dst[4*i+3] = vsrc[i];
 591                 }
 592 #endif
 593                 if(y&1)
 594                 {
 595                         usrc += chromStride;
 596                         vsrc += chromStride;
 597                 }
 598                 ysrc += lumStride;
 599                 dst += dstStride;
 600         }
 601 #ifdef HAVE_MMX
 602 asm(    EMMS" \n\t"
 603         SFENCE" \n\t"
 604         :::"memory");
 605 #endif
 606 }
 607
 608 /**
 609  *
 610  * height should be a multiple of 2 and width should be a multiple of 16 (if this is a
 611  * problem for anyone then tell me, and ill fix it)
 612  */
 613 void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
 614         unsigned int width, unsigned int height,
 615         unsigned int lumStride, unsigned int chromStride, unsigned int srcStride)
 616 {
 617         int y;
 618         const int chromWidth= width>>1;
 619         for(y=0; y<height; y+=2)
 620         {
 621 #ifdef HAVE_MMX
 622                 asm volatile(
 623                         "xorl %%eax, %%eax              \n\t"
 624                         "pcmpeqw %%mm7, %%mm7           \n\t"
 625                         "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
 626                         "1:                             \n\t"
 627                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
 628                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
 629                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
 630                         "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
 631                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
 632                         "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
 633                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
 634                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
 635                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
 636                         "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
 637                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
 638
 639                         MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
 640
 641                         "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
 642                         "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
 643                         "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
 644                         "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
 645                         "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
 646                         "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
 647                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
 648                         "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
 649                         "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
 650                         "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
 651
 652                         MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
 653
 654                         "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
 655                         "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
 656                         "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
 657                         "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
 658                         "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
 659                         "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
 660                         "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
 661                         "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
 662
 663                         MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
 664                         MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
 665
 666                         "addl $8, %%eax                 \n\t"
 667                         "cmpl %4, %%eax                 \n\t"
 668                         " jb 1b                         \n\t"
 669                         ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (chromWidth)
 670                         : "memory", "%eax"
 671                 );
 672
 673                 asm volatile(
 674                         "xorl %%eax, %%eax              \n\t"
 675                         "1:                             \n\t"
 676                         PREFETCH" 64(%0, %%eax, 4)      \n\t"
 677                         "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
 678                         "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
 679                         "movq 16(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(8)
 680                         "movq 24(%0, %%eax, 4), %%mm3   \n\t" // YUYV YUYV(12)
 681                         "pand %%mm7, %%mm0              \n\t" // Y0Y0 Y0Y0(0)
 682                         "pand %%mm7, %%mm1              \n\t" // Y0Y0 Y0Y0(4)
 683                         "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(8)
 684                         "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(12)
 685                         "packuswb %%mm1, %%mm0          \n\t" // YYYY YYYY(0)
 686                         "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(8)
 687
 688                         MOVNTQ" %%mm0, (%1, %%eax, 2)   \n\t"
 689                         MOVNTQ" %%mm2, 8(%1, %%eax, 2)  \n\t"
 690
 691                         "addl $8, %%eax                 \n\t"
 692                         "cmpl %4, %%eax                 \n\t"
 693                         " jb 1b                         \n\t"
 694
 695                         ::"r"(src+srcStride), "r"(ydst+lumStride), "r"(udst), "r"(vdst), "r" (chromWidth)
 696                         : "memory", "%eax"
 697                 );
 698 #else
 699                 int i;
 700                 for(i=0; i<chromWidth; i++)
 701                 {
 702                         ydst[2*i+0]     = src[4*i+0];
 703                         udst[i]         = src[4*i+1];
 704                         ydst[2*i+1]     = src[4*i+2];
 705                         vdst[i]         = src[4*i+3];
 706                 }
 707                 ydst += lumStride;
 708                 src  += srcStride;
 709
 710                 for(i=0; i<chromWidth; i++)
 711                 {
 712                         ydst[2*i+0]     = src[4*i+0];
 713                         ydst[2*i+1]     = src[4*i+2];
 714                 }
 715 #endif
 716                 udst += chromStride;
 717                 vdst += chromStride;
 718                 ydst += lumStride;
 719                 src  += srcStride;
 720         }
 721 #ifdef HAVE_MMX
 722 asm(    EMMS" \n\t"
 723         SFENCE" \n\t"
 724         :::"memory");
 725 #endif
 726 }