git.sesse.net Git - ffmpeg/blob - postproc/rgb2rgb_template.c

   1 /*
   2  *
   3  *  rgb2rgb.c, Software RGB to RGB convertor
   4  *  Written by Nick Kurshev.
   5  */
   6 #include <inttypes.h>
   7 #include "../config.h"
   8 #include "rgb2rgb.h"
   9 #include "../mmx_defs.h"
  10
  #ifdef HAVE_MMX
  /*
   * 64-bit bit masks used by the MMX code paths of this file.
   * Every mask is declared with 8-byte alignment.
   */

  /* rgb24to32: keeps the low three bytes of each 32-bit pixel, clears the fourth */
  static const uint64_t mask32   __attribute__((aligned(8))) = UINT64_C(0x00FFFFFF00FFFFFF);
  /* rgb32to24: selects bytes 0..2 of a quadword (first pixel of a pair) */
  static const uint64_t mask24l  __attribute__((aligned(8))) = UINT64_C(0x0000000000FFFFFF);
  /* rgb32to24: selects bytes 3..5 of a quadword (second pixel, after shifting right by 8) */
  static const uint64_t mask24h  __attribute__((aligned(8))) = UINT64_C(0x0000FFFFFF000000);
  /* 00000000 00011111  xxB -- low 5 bits of each 16-bit word; not referenced in the code visible here */
  static const uint64_t mask15b  __attribute__((aligned(8))) = UINT64_C(0x001F001F001F001F);
  /* 01111111 11100000  RGx -- bits 5..14 of each 16-bit word; not referenced in the code visible here */
  static const uint64_t mask15rg __attribute__((aligned(8))) = UINT64_C(0x7FE07FE07FE07FE0);
  /* rgb15to16: everything above the low 5 bits of each 16-bit word (the part that is added a second time) */
  static const uint64_t mask15s  __attribute__((aligned(8))) = UINT64_C(0xFFE0FFE0FFE0FFE0);
  #endif
  19
  /**
   * Expand packed 24-bit pixels to 32-bit pixels.
   *
   * The three bytes of every source pixel are copied unchanged and a zero
   * byte is appended as the fourth byte of the destination pixel.
   *
   * @param src      packed 3-byte pixels
   * @param dst      output buffer; receives 4 bytes per whole source pixel,
   *                 i.e. (src_size / 3) * 4 bytes
   * @param src_size size of src in bytes; a trailing partial pixel
   *                 (src_size % 3 bytes) is ignored instead of being read
   *                 past the end of the buffer
   */
  void rgb24to32(const uint8_t *src,uint8_t *dst,uint32_t src_size)
  {
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end = s + src_size;
  #ifdef HAVE_MMX
    __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 = mask that clears the fourth byte of each output pixel */
    __asm __volatile("movq        %0, %%mm7"::"m"(mask32):"memory");
    /*
     * One iteration converts 4 pixels (12 source bytes -> 16 output bytes).
     * The last movd loads source bytes 9..12, so 13 readable bytes are
     * required. The bound is derived from the number of bytes left rather
     * than from the alignment of the end address, so the loop can never run
     * past the end of the input.
     * NOTE(review): "3%1" etc. prepend a displacement to the memory operand
     * text; this presumably relies on the operand being emitted without a
     * displacement of its own -- confirm.
     */
    while(end - s >= 13)
    {
      __asm __volatile(
          PREFETCH"       32%1\n\t"
          "movd   %1, %%mm0\n\t"
          "movd   3%1, %%mm1\n\t"
          "movd   6%1, %%mm2\n\t"
          "movd   9%1, %%mm3\n\t"
          "punpckldq %%mm1, %%mm0\n\t"
          "punpckldq %%mm3, %%mm2\n\t"
          "pand   %%mm7, %%mm0\n\t"
          "pand   %%mm7, %%mm2\n\t"
          MOVNTQ" %%mm0, %0\n\t"
          MOVNTQ" %%mm2, 8%0"
          :"=m"(*dest)
          :"m"(*s)
          :"memory");
      dest += 16;
      s += 12;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
  #endif
    /* scalar path: remaining whole pixels only */
    while(end - s >= 3)
    {
      *dest++ = *s++;
      *dest++ = *s++;
      *dest++ = *s++;
      *dest++ = 0;
    }
  }
  65
  /**
   * Pack 32-bit pixels into 24-bit pixels.
   *
   * The first three bytes of every 4-byte source pixel are copied unchanged;
   * the fourth byte is dropped.
   *
   * @param src      4-byte pixels
   * @param dst      output buffer; receives 3 bytes per whole source pixel,
   *                 i.e. (src_size / 4) * 3 bytes
   * @param src_size size of src in bytes; a trailing partial pixel
   *                 (src_size % 4 bytes) is ignored instead of being read
   *                 past the end of the buffer
   */
  void rgb32to24(const uint8_t *src,uint8_t *dst,uint32_t src_size)
  {
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end = s + src_size;
  #ifdef HAVE_MMX
    __asm __volatile(PREFETCH"    %0"::"m"(*s):"memory");
    /* mm7 selects the first pixel of a quadword, mm6 the (shifted) second one */
    __asm __volatile(
          "movq   %0, %%mm7\n\t"
          "movq   %1, %%mm6"
          ::"m"(mask24l),"m"(mask24h):"memory");
    /*
     * One iteration reads 4 pixels (16 bytes) and issues two 8-byte stores at
     * dest and dest+6, i.e. it touches 14 output bytes although only 12 are
     * meaningful. The loop therefore only runs while at least one more whole
     * pixel (16 + 4 bytes) is left: that pixel is converted afterwards and
     * overwrites the two stray bytes, so nothing is written past the output.
     * The bound is derived from the number of bytes left, not from the
     * alignment of the end address, so the input is never over-read either.
     */
    while(end - s >= 20)
    {
      __asm __volatile(
          PREFETCH"       32%1\n\t"
          "movq   %1, %%mm0\n\t"
          "movq   8%1, %%mm1\n\t"
          "movq   %%mm0, %%mm2\n\t"
          "movq   %%mm1, %%mm3\n\t"
          "psrlq  $8, %%mm2\n\t"
          "psrlq  $8, %%mm3\n\t"
          "pand   %%mm7, %%mm0\n\t"
          "pand   %%mm7, %%mm1\n\t"
          "pand   %%mm6, %%mm2\n\t"
          "pand   %%mm6, %%mm3\n\t"
          "por    %%mm2, %%mm0\n\t"
          "por    %%mm3, %%mm1\n\t"
          MOVNTQ" %%mm0, %0\n\t"
          MOVNTQ" %%mm1, 6%0"
          :"=m"(*dest)
          :"m"(*s)
          :"memory");
      dest += 12;
      s += 16;
    }
    __asm __volatile(SFENCE:::"memory");
    __asm __volatile(EMMS:::"memory");
  #endif
    /* scalar path: remaining whole pixels only */
    while(end - s >= 4)
    {
      *dest++ = *s++;
      *dest++ = *s++;
      *dest++ = *s++;
      s++; /* skip the fourth byte */
    }
  }
 118
 /*
  Original by Strepto/Astral
  ported to gcc & bugfixed : A'rpi
  MMX2, 3DNOW optimization by Nick Kurshev
  32bit c version, and and&add trick by Michael Niedermayer
 */
 /**
  * Convert 15-bit pixels (0rrrrrgggggbbbbb) to 16-bit pixels
  * (rrrrrggggggbbbbb).
  *
  * The low 5 bits are kept and bits 5..14 are moved up by one position, which
  * leaves the new lowest green bit at 0. Bit 15 of the input is ignored.
  * The shift is done without shifting: adding (x & 0x7FE0) to x doubles
  * exactly that part of the pixel.
  *
  * src and dst are accessed through 32-bit (and, for a trailing pixel,
  * 16-bit) pointers, so they are expected to be suitably aligned.
  *
  * @param src      15-bit pixels, 2 bytes each
  * @param dst      output buffer, same size as the input
  * @param src_size size of src in bytes; a trailing odd byte is ignored
  */
 void rgb15to16(const uint8_t *src,uint8_t *dst,uint32_t src_size)
 {
   const uint8_t *s = src;
   uint8_t *d = dst;
   const uint8_t *end = src + src_size;
 #ifdef HAVE_MMX
   __asm __volatile(PREFETCH"    %0"::"m"(*s));
   __asm __volatile(
         "movq   %0, %%mm4\n\t"
         ::"m"(mask15s));
   /*
    * 8 pixels (16 bytes) per iteration. Only whole 16-byte chunks are handled
    * here so the loop cannot run past the end of either buffer; the rest is
    * left to the scalar code below. paddw wraps inside each 16-bit word, so a
    * set top bit is discarded exactly like in the scalar code.
    */
   while(end - s >= 16)
   {
         __asm __volatile(
                 PREFETCH"       32%1\n\t"
                 "movq   %1, %%mm0\n\t"
                 "movq   8%1, %%mm2\n\t"
                 "movq   %%mm0, %%mm1\n\t"
                 "movq   %%mm2, %%mm3\n\t"
                 "pand   %%mm4, %%mm0\n\t"
                 "pand   %%mm4, %%mm2\n\t"
                 "paddw  %%mm1, %%mm0\n\t"
                 "paddw  %%mm3, %%mm2\n\t"
                 MOVNTQ" %%mm0, %0\n\t"
                 MOVNTQ" %%mm2, 8%0"
                 :"=m"(*d)
                 :"m"(*s)
                 :"memory");
         s += 16;
         d += 16;
   }
   __asm __volatile(SFENCE:::"memory");
   __asm __volatile(EMMS:::"memory");
 #endif
   /*
    * Two pixels per 32-bit word. Unsigned arithmetic is required: the sum can
    * reach 0xFFDFFFDF, which does not fit into a signed 32-bit int.
    * x + (x&0x7FE07FE0) would be cheaper, but it is only correct when the
    * unused top bit of every pixel is 0, which might not always be true.
    */
   while(end - s >= 4)
   {
         const uint32_t x = *(const uint32_t *)s;
         *(uint32_t *)d = (x&0x7FFF7FFFU) + (x&0x7FE07FE0U);
         s += 4;
         d += 4;
   }
   /* odd number of pixels: convert the last one on its own */
   if(end - s >= 2)
   {
         const uint32_t x = *(const uint16_t *)s;
         *(uint16_t *)d = (uint16_t)((x&0x7FFFU) + (x&0x7FE0U));
   }
 }
 184
 /**
  * Expand 8-bit palette indices to 32-bit pixels.
  *
  * The palette is read as an array of 32-bit entries (assumed to contain
  * bgr32) and each source byte selects the entry that is stored to dst.
  *
  * @param src      palette indices, one byte per pixel
  * @param dst      output buffer, written as 32-bit words (4 bytes per pixel)
  * @param src_size number of pixels; nothing is written when it is <= 0
  * @param palette  lookup table, read as 32-bit entries
  */
 void palette8torgb32(uint8_t *src, uint8_t *dst, int src_size, uint8_t *palette)
 {
         const uint32_t *lut = (uint32_t *)palette;
         uint32_t *out = (uint32_t *)dst;
         const uint8_t *in = src;
         int left;

         for(left = src_size; left > 0; left--)
                 *out++ = lut[ *in++ ];
 }
 194
 /**
  * Expand 8-bit palette indices to packed 24-bit pixels.
  *
  * The palette holds 4 bytes per entry (assumed to contain bgr32); the first
  * three bytes of the selected entry are copied to dst.
  *
  * @param src      palette indices, one byte per pixel
  * @param dst      output buffer, 3 bytes per pixel
  * @param src_size number of pixels; nothing is written when it is <= 0
  * @param palette  lookup table with 4 bytes per entry
  */
 void palette8torgb24(uint8_t *src, uint8_t *dst, int src_size, uint8_t *palette)
 {
         uint8_t *out = dst;
         int n = 0;

         /*
          * Copied byte by byte on purpose: storing a whole 32-bit entry per
          * pixel would write 1 byte too much and might cause alignment issues
          * on some architectures.
          */
         while(n < src_size)
         {
                 //FIXME slow?
                 const uint8_t *entry = palette + src[n]*4;

                 *out++ = entry[0];
                 *out++ = entry[1];
                 *out++ = entry[2];
                 n++;
         }
 }
 215
 /**
  * Convert 32-bit pixels to 16-bit pixels (rrrrrggggggbbbbb).
  *
  * Source bytes 0, 1 and 2 of every pixel are taken as blue, green and red;
  * the fourth byte is ignored. The top 5/6/5 bits of blue/green/red are kept.
  *
  * @param src      4-byte pixels
  * @param dst      output buffer, written as consecutive 16-bit words
  *                 (2 bytes per pixel)
  * @param src_size size of src in bytes; a trailing partial pixel is ignored
  *                 and nothing is written when it is < 4
  */
 void rgb32to16(uint8_t *src, uint8_t *dst, int src_size)
 {
         uint16_t *d = (uint16_t *)dst;
         const int num_pixels = src_size / 4;
         int i;

         /*
          * i counts pixels, not source bytes: the destination must be indexed
          * by pixel, otherwise the results end up 4 slots apart and far past
          * the end of the output.
          */
         for(i=0; i<num_pixels; i++)
         {
                 const int b= src[4*i+0];
                 const int g= src[4*i+1];
                 const int r= src[4*i+2];

                 d[i]= (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
         }
 }
 228
 /**
  * Convert 32-bit pixels to 15-bit pixels (0rrrrrgggggbbbbb).
  *
  * Source bytes 0, 1 and 2 of every pixel are taken as blue, green and red;
  * the fourth byte is ignored. The top 5 bits of each component are kept:
  * blue in bits 0..4, green in bits 5..9, red in bits 10..14.
  *
  * @param src      4-byte pixels
  * @param dst      output buffer, written as consecutive 16-bit words
  *                 (2 bytes per pixel)
  * @param src_size size of src in bytes; a trailing partial pixel is ignored
  *                 and nothing is written when it is < 4
  */
 void rgb32to15(uint8_t *src, uint8_t *dst, int src_size)
 {
         uint16_t *d = (uint16_t *)dst;
         const int num_pixels = src_size / 4;
         int i;

         /* i counts pixels, not source bytes, so the output is contiguous */
         for(i=0; i<num_pixels; i++)
         {
                 const int b= src[4*i+0];
                 const int g= src[4*i+1];
                 const int r= src[4*i+2];

                 /*
                  * (g&0xF8) occupies bits 3..7, so it has to move up by 2 to
                  * land in bits 5..9; a shift by 3 would collide with red.
                  */
                 d[i]= (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
         }
 }
 241
 242
 /**
  * Expand 8-bit palette indices to 16-bit pixels.
  *
  * The palette is read as an array of 16-bit entries (assumed to contain
  * bgr16, see rgb32to16 to convert the palette); each source byte selects
  * the entry that is stored to dst.
  *
  * @param src      palette indices, one byte per pixel
  * @param dst      output buffer, written as 16-bit words (2 bytes per pixel)
  * @param src_size number of pixels; nothing is written when it is <= 0
  * @param palette  lookup table, read as 16-bit entries
  */
 void palette8torgb16(uint8_t *src, uint8_t *dst, int src_size, uint8_t *palette)
 {
         const uint16_t *table = (uint16_t *)palette;
         uint16_t *out = (uint16_t *)dst;
         const uint8_t *in = src;
         const uint8_t *stop;

         if(src_size <= 0)
                 return;

         stop = src + src_size;
         while(in != stop)
                 *out++ = table[ *in++ ];
 }
 252
 /**
  * Expand 8-bit palette indices to 15-bit pixels.
  *
  * The palette is read as an array of 16-bit entries (assumed to contain
  * bgr15, see rgb32to15 to convert the palette); each source byte selects
  * the entry that is stored to dst.
  *
  * @param src      palette indices, one byte per pixel
  * @param dst      output buffer, written as 16-bit words (2 bytes per pixel)
  * @param src_size number of pixels; nothing is written when it is <= 0
  * @param palette  lookup table, read as 16-bit entries
  */
 void palette8torgb15(uint8_t *src, uint8_t *dst, int src_size, uint8_t *palette)
 {
         const uint16_t *entries = (uint16_t *)palette;
         uint16_t *pixels = (uint16_t *)dst;
         int pos = 0;

         while(pos < src_size)
         {
                 pixels[pos] = entries[ src[pos] ];
                 pos++;
         }
 }