git.sesse.net Git - ffmpeg/blob - postproc/rgb2rgb.c

   1 /*
   2  *
   3  *  rgb2rgb.c, Software RGB to RGB convertor
   4  *  Written by Nick Kurshev.
   5  *  palette stuff & yuv stuff by Michael
   6  */
   7 #include <inttypes.h>
   8 #include "../config.h"
   9 #include "rgb2rgb.h"
  10 #include "../mmx_defs.h"
  11
  12 #ifdef HAVE_MMX
  13 static const uint64_t mask32   __attribute__((aligned(8))) = 0x00FFFFFF00FFFFFFULL;
  14 static const uint64_t mask24l  __attribute__((aligned(8))) = 0x0000000000FFFFFFULL;
  15 static const uint64_t mask24h  __attribute__((aligned(8))) = 0x0000FFFFFF000000ULL;
  16 static const uint64_t mask15b  __attribute__((aligned(8))) = 0x001F001F001F001FULL; /* 00000000 00011111  xxB */
  17 static const uint64_t mask15rg __attribute__((aligned(8))) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000  RGx */
  18 static const uint64_t mask15s  __attribute__((aligned(8))) = 0xFFE0FFE0FFE0FFE0ULL;
  19 #endif
  20
  21 void rgb24to32(const uint8_t *src,uint8_t *dst,unsigned src_size)
  22 {
  23   uint8_t *dest = dst;
  24   const uint8_t *s = src;
  25   const uint8_t *end;
  26 #ifdef HAVE_MMX
  27   uint8_t *mm_end;
  28 #endif
  29   end = s + src_size;
  30 #ifdef HAVE_MMX
  31   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  32   mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
  33   __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
  34   if(mm_end == end) mm_end -= MMREG_SIZE*2;
  35   while(s < mm_end)
  36   {
  37     __asm __volatile(
  38         PREFETCH"       32%1\n\t"
  39         "movd   %1, %%mm0\n\t"
  40         "movd   3%1, %%mm1\n\t"
  41         "movd   6%1, %%mm2\n\t"
  42         "movd   9%1, %%mm3\n\t"
  43         "punpckldq %%mm1, %%mm0\n\t"
  44         "punpckldq %%mm3, %%mm2\n\t"
  45         "pand   %%mm7, %%mm0\n\t"
  46         "pand   %%mm7, %%mm2\n\t"
  47         MOVNTQ" %%mm0, %0\n\t"
  48         MOVNTQ" %%mm2, 8%0"
  49         :"=m"(*dest)
  50         :"m"(*s)
  51         :"memory");
  52     dest += 16;
  53     s += 12;
  54   }
  55   __asm __volatile(SFENCE:::"memory");
  56   __asm __volatile(EMMS:::"memory");
  57 #endif
  58   while(s < end)
  59   {
  60     *dest++ = *s++;
  61     *dest++ = *s++;
  62     *dest++ = *s++;
  63     *dest++ = 0;
  64   }
  65 }
  66
  67 void rgb32to24(const uint8_t *src,uint8_t *dst,unsigned src_size)
  68 {
  69   uint8_t *dest = dst;
  70   const uint8_t *s = src;
  71   const uint8_t *end;
  72 #ifdef HAVE_MMX
  73   uint8_t *mm_end;
  74 #endif
  75   end = s + src_size;
  76 #ifdef HAVE_MMX
  77   __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
  78   mm_end = (uint8_t*)((((unsigned long)end)/(MMREG_SIZE*2))*(MMREG_SIZE*2));
  79   __asm __volatile(
  80         "movq   %0, %%mm7\n\t"
  81         "movq   %1, %%mm6"
  82         ::"m"(mask24l),"m"(mask24h):"memory");
  83   if(mm_end == end) mm_end -= MMREG_SIZE*2;
  84   while(s < mm_end)
  85   {
  86     __asm __volatile(
  87         PREFETCH"       32%1\n\t"
  88         "movq   %1, %%mm0\n\t"
  89         "movq   8%1, %%mm1\n\t"
  90         "movq   %%mm0, %%mm2\n\t"
  91         "movq   %%mm1, %%mm3\n\t"
  92         "psrlq  $8, %%mm2\n\t"
  93         "psrlq  $8, %%mm3\n\t"
  94         "pand   %%mm7, %%mm0\n\t"
  95         "pand   %%mm7, %%mm1\n\t"
  96         "pand   %%mm6, %%mm2\n\t"
  97         "pand   %%mm6, %%mm3\n\t"
  98         "por    %%mm2, %%mm0\n\t"
  99         "por    %%mm3, %%mm1\n\t"
 100         MOVNTQ" %%mm0, %0\n\t"
 101         MOVNTQ" %%mm1, 6%0"
 102         :"=m"(*dest)
 103         :"m"(*s)
 104         :"memory");
 105     dest += 12;
 106     s += 16;
 107   }
 108   __asm __volatile(SFENCE:::"memory");
 109   __asm __volatile(EMMS:::"memory");
 110 #endif
 111   while(s < end)
 112   {
 113     *dest++ = *s++;
 114     *dest++ = *s++;
 115     *dest++ = *s++;
 116     s++;
 117   }
 118 }
 119
 120 /*
 121  Original by Strepto/Astral
 122  ported to gcc & bugfixed : A'rpi
 123  MMX2, 3DNOW optimization by Nick Kurshev
 124  32bit c version, and and&add trick by Michael Niedermayer
 125 */
 126 void rgb15to16(const uint8_t *src,uint8_t *dst,unsigned src_size)
 127 {
 128 #ifdef HAVE_MMX
 129   register const char* s=src+src_size;
 130   register char* d=dst+src_size;
 131   register int offs=-src_size;
 132   __asm __volatile(PREFETCH"    %0"::"m"(*(s+offs)));
 133   __asm __volatile(
 134         "movq   %0, %%mm4\n\t"
 135         ::"m"(mask15s));
 136   while(offs<0)
 137   {
 138         __asm __volatile(
 139                 PREFETCH"       32%1\n\t"
 140                 "movq   %1, %%mm0\n\t"
 141                 "movq   8%1, %%mm2\n\t"
 142                 "movq   %%mm0, %%mm1\n\t"
 143                 "movq   %%mm2, %%mm3\n\t"
 144                 "pand   %%mm4, %%mm0\n\t"
 145                 "pand   %%mm4, %%mm2\n\t"
 146                 "paddw  %%mm1, %%mm0\n\t"
 147                 "paddw  %%mm3, %%mm2\n\t"
 148                 MOVNTQ" %%mm0, %0\n\t"
 149                 MOVNTQ" %%mm2, 8%0"
 150                 :"=m"(*(d+offs))
 151                 :"m"(*(s+offs))
 152                 );
 153         offs+=16;
 154   }
 155   __asm __volatile(SFENCE:::"memory");
 156   __asm __volatile(EMMS:::"memory");
 157 #else
 158 #if 0
 159    const uint16_t *s1=( uint16_t * )src;
 160    uint16_t *d1=( uint16_t * )dst;
 161    uint16_t *e=((uint8_t *)s1)+src_size;
 162    while( s1<e ){
 163      register int x=*( s1++ );
 164      /* rrrrrggggggbbbbb
 165         0rrrrrgggggbbbbb
 166         0111 1111 1110 0000=0x7FE0
 167         00000000000001 1111=0x001F */
 168      *( d1++ )=( x&0x001F )|( ( x&0x7FE0 )<<1 );
 169    }
 170 #else
 171         const unsigned *s1=( unsigned * )src;
 172         unsigned *d1=( unsigned * )dst;
 173         int i;
 174         int size= src_size>>2;
 175         for(i=0; i<size; i++)
 176         {
 177                 register int x= s1[i];
 178 //              d1[i] = x + (x&0x7FE07FE0); //faster but need msbit =0 which might not allways be true
 179                 d1[i] = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
 180
 181         }
 182 #endif
 183 #endif
 184 }
 185
 186 /**
 187  * Pallete is assumed to contain bgr32
 188  */
 189 void palette8torgb32(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 190 {
 191         unsigned i;
 192         for(i=0; i<num_pixels; i++)
 193                 ((unsigned *)dst)[i] = ((unsigned *)palette)[ src[i] ];
 194 }
 195
 196 /**
 197  * Pallete is assumed to contain bgr32
 198  */
 199 void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 200 {
 201         unsigned i;
 202 /*
 203         writes 1 byte o much and might cause alignment issues on some architectures?
 204         for(i=0; i<num_pixels; i++)
 205                 ((unsigned *)(&dst[i*3])) = ((unsigned *)palette)[ src[i] ];
 206 */
 207         for(i=0; i<num_pixels; i++)
 208         {
 209                 //FIXME slow?
 210                 dst[0]= palette[ src[i]*4+0 ];
 211                 dst[1]= palette[ src[i]*4+1 ];
 212                 dst[2]= palette[ src[i]*4+2 ];
 213                 dst+= 3;
 214         }
 215 }
 216
 217 void rgb32to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
 218 {
 219         unsigned j,i,num_pixels=src_size/4;
 220         uint16_t *d = (uint16_t *)dst;
 221         for(i=0,j=0; j<num_pixels; i+=4,j++)
 222         {
 223                 const int b= src[i+0];
 224                 const int g= src[i+1];
 225                 const int r= src[i+2];
 226
 227                 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 228         }
 229 }
 230
 231 void rgb32to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
 232 {
 233         unsigned j,i,num_pixels=src_size/4;
 234         uint16_t *d = (uint16_t *)dst;
 235         for(i=0,j=0; j<num_pixels; i+=4,j++)
 236         {
 237                 const int b= src[i+0];
 238                 const int g= src[i+1];
 239                 const int r= src[i+2];
 240
 241                 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
 242         }
 243 }
 244
 245 void rgb24to16(const uint8_t *src, uint8_t *dst, unsigned src_size)
 246 {
 247         unsigned j,i,num_pixels=src_size/3;
 248         uint16_t *d = (uint16_t *)dst;
 249         for(i=0,j=0; j<num_pixels; i+=3,j++)
 250         {
 251                 const int b= src[i+0];
 252                 const int g= src[i+1];
 253                 const int r= src[i+2];
 254
 255                 d[j]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
 256         }
 257 }
 258
 259 void rgb24to15(const uint8_t *src, uint8_t *dst, unsigned src_size)
 260 {
 261         unsigned j,i,num_pixels=src_size/3;
 262         uint16_t *d = (uint16_t *)dst;
 263         for(i=0,j=0; j<num_pixels; i+=3,j++)
 264         {
 265                 const int b= src[i+0];
 266                 const int g= src[i+1];
 267                 const int r= src[i+2];
 268
 269                 d[j]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
 270         }
 271 }
 272
 273 /**
 274  * Palette is assumed to contain bgr16, see rgb32to16 to convert the palette
 275  */
 276 void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 277 {
 278         unsigned i;
 279         for(i=0; i<num_pixels; i++)
 280                 ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
 281 }
 282
 283 /**
 284  * Pallete is assumed to contain bgr15, see rgb32to15 to convert the palette
 285  */
 286 void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette)
 287 {
 288         unsigned i;
 289         for(i=0; i<num_pixels; i++)
 290                 ((uint16_t *)dst)[i] = ((uint16_t *)palette)[ src[i] ];
 291 }
 292 /**
 293  *
 294  * width must be a multiple of 16 for the MMX version
 295  */
 296 void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
 297         int width, int height, int lumStride, int chromStride, int dstStride)
 298 {
 299         int y;
 300         const int chromWidth= width>>1;
 301         for(y=0; y<height; y++)
 302         {
 303 #ifdef HAVE_MMX
 304 //FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
 305                 asm volatile(
 306                         "xorl %%eax, %%eax              \n\t"
 307                         "1:                             \n\t"
 308                         PREFETCH" 32(%1, %%eax, 2)      \n\t"
 309                         PREFETCH" 32(%2, %%eax)         \n\t"
 310                         PREFETCH" 32(%3, %%eax)         \n\t"
 311                         "movq (%2, %%eax), %%mm0        \n\t" // U(0)
 312                         "movq %%mm0, %%mm2              \n\t" // U(0)
 313                         "movq (%3, %%eax), %%mm1        \n\t" // V(0)
 314                         "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
 315                         "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
 316
 317                         "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
 318                         "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
 319                         "movq %%mm3, %%mm4              \n\t" // Y(0)
 320                         "movq %%mm5, %%mm6              \n\t" // Y(8)
 321                         "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
 322                         "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
 323                         "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
 324                         "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
 325
 326                         MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
 327                         MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
 328                         MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
 329                         MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
 330
 331                         "addl $8, %%eax                 \n\t"
 332                         "cmpl %4, %%eax                 \n\t"
 333                         " jb 1b                         \n\t"
 334                         ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
 335                         : "%eax"
 336                 );
 337 #else
 338                 int i;
 339                 for(i=0; i<chromWidth; i++)
 340                 {
 341                         dst[4*i+0] = ysrc[2*i+0];
 342                         dst[4*i+1] = usrc[i];
 343                         dst[4*i+2] = ysrc[2*i+1];
 344                         dst[4*i+3] = vsrc[i];
 345                 }
 346 #endif
 347                 if(y&1)
 348                 {
 349                         usrc += chromStride;
 350                         vsrc += chromStride;
 351                 }
 352                 ysrc += lumStride;
 353                 dst += dstStride;
 354         }
 355 #ifdef HAVE_MMX
 356 asm(    EMMS" \n\t"
 357         SFENCE" \n\t"
 358         :::"memory");
 359 #endif
 360 }
 361
 362 void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, unsigned num_pixels)
 363 {
 364 #ifdef HAVE_MMX
 365         asm volatile(
 366                 "xorl %%eax, %%eax              \n\t"
 367                 "pcmpeqw %%mm7, %%mm7           \n\t"
 368                 "psrlw $8, %%mm7                \n\t" // FF,00,FF,00...
 369                 "1:                             \n\t"
 370                 PREFETCH" 64(%0, %%eax, 4)      \n\t"
 371                 "movq (%0, %%eax, 4), %%mm0     \n\t" // YUYV YUYV(0)
 372                 "movq 8(%0, %%eax, 4), %%mm1    \n\t" // YUYV YUYV(4)
 373                 "movq %%mm0, %%mm2              \n\t" // YUYV YUYV(0)
 374                 "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(4)
 375                 "psrlw $8, %%mm0                \n\t" // U0V0 U0V0(0)
 376                 "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(4)
 377                 "pand %%mm7, %%mm2              \n\t" // Y0Y0 Y0Y0(0)
 378                 "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(4)
 379                 "packuswb %%mm1, %%mm0          \n\t" // UVUV UVUV(0)
 380                 "packuswb %%mm3, %%mm2          \n\t" // YYYY YYYY(0)
 381
 382                 MOVNTQ" %%mm2, (%1, %%eax, 2)   \n\t"
 383
 384                 "movq 16(%0, %%eax, 4), %%mm1   \n\t" // YUYV YUYV(8)
 385                 "movq 24(%0, %%eax, 4), %%mm2   \n\t" // YUYV YUYV(12)
 386                 "movq %%mm1, %%mm3              \n\t" // YUYV YUYV(8)
 387                 "movq %%mm2, %%mm4              \n\t" // YUYV YUYV(12)
 388                 "psrlw $8, %%mm1                \n\t" // U0V0 U0V0(8)
 389                 "psrlw $8, %%mm2                \n\t" // U0V0 U0V0(12)
 390                 "pand %%mm7, %%mm3              \n\t" // Y0Y0 Y0Y0(8)
 391                 "pand %%mm7, %%mm4              \n\t" // Y0Y0 Y0Y0(12)
 392                 "packuswb %%mm2, %%mm1          \n\t" // UVUV UVUV(8)
 393                 "packuswb %%mm4, %%mm3          \n\t" // YYYY YYYY(8)
 394
 395                 MOVNTQ" %%mm3, 8(%1, %%eax, 2)  \n\t"
 396
 397                 "movq %%mm0, %%mm2              \n\t" // UVUV UVUV(0)
 398                 "movq %%mm1, %%mm3              \n\t" // UVUV UVUV(8)
 399                 "psrlw $8, %%mm0                \n\t" // V0V0 V0V0(0)
 400                 "psrlw $8, %%mm1                \n\t" // V0V0 V0V0(8)
 401                 "pand %%mm7, %%mm2              \n\t" // U0U0 U0U0(0)
 402                 "pand %%mm7, %%mm3              \n\t" // U0U0 U0U0(8)
 403                 "packuswb %%mm1, %%mm0          \n\t" // VVVV VVVV(0)
 404                 "packuswb %%mm3, %%mm2          \n\t" // UUUU UUUU(0)
 405
 406                 MOVNTQ" %%mm0, (%3, %%eax)      \n\t"
 407                 MOVNTQ" %%mm2, (%2, %%eax)      \n\t"
 408
 409                 "addl $8, %%eax                 \n\t"
 410                 "cmpl %4, %%eax                 \n\t"
 411                 " jb 1b                         \n\t"
 412                 EMMS" \n\t"
 413                 SFENCE
 414                 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "r" (num_pixels>>1)
 415                 : "memory", "%eax"
 416         );
 417 #else
 418         int i;
 419         num_pixels>>=1;
 420         for(i=0; i<num_pixels; i++)
 421         {
 422                  ydst[2*i+0]    = src[4*i+0];
 423                  udst[i]        = src[4*i+1];
 424                  ydst[2*i+1]    = src[4*i+2];
 425                  vdst[i]        = src[4*i+3];
 426         }
 427 #endif
 428 }