git.sesse.net Git - x264/blob - common/dct.c

   1 /*****************************************************************************
   2  * dct.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003 Laurent Aimar
   5  * $Id: dct.c,v 1.1 2004/06/03 19:27:06 fenrir Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include "common.h"
  25 #ifdef HAVE_MMXEXT
  26 #   include "i386/dct.h"
  27 #endif
  28 #ifdef ARCH_PPC
  29 #   include "ppc/dct.h"
  30 #endif
  31
/* Weight tables reordered through the zigzag scan tables.  Both arrays are
 * filled in by x264_dct_init_weights(): entry [j][i] is the value of
 * x264_dct4_weight2_tab / x264_dct8_weight2_tab at index
 * x264_zigzag_scan4[j][i] / x264_zigzag_scan8[j][i].  What the first index
 * (j = 0 or 1) selects is determined by those scan tables, which are
 * declared outside this file. */
  32 int x264_dct4_weight2_zigzag[2][16];
  33 int x264_dct8_weight2_zigzag[2][64];
  34
  35 static inline int clip_uint8( int a )
  36 {
  37     if (a&(~255))
  38         return (-a)>>31;
  39     else
  40         return a;
  41 }
  42
/*
 * XXX: every DC transform below takes a single array that is both its input
 * and its output, so intermediate results go through a local temporary.
 */
  46
  47 static void dct2x2dc( int16_t d[2][2] )
  48 {
  49     int tmp[2][2];
  50
  51     tmp[0][0] = d[0][0] + d[0][1];
  52     tmp[1][0] = d[0][0] - d[0][1];
  53     tmp[0][1] = d[1][0] + d[1][1];
  54     tmp[1][1] = d[1][0] - d[1][1];
  55
  56     d[0][0] = tmp[0][0] + tmp[0][1];
  57     d[1][0] = tmp[1][0] + tmp[1][1];
  58     d[0][1] = tmp[0][0] - tmp[0][1];
  59     d[1][1] = tmp[1][0] - tmp[1][1];
  60 }
  61
  62 static void dct4x4dc( int16_t d[4][4] )
  63 {
  64     int16_t tmp[4][4];
  65     int s01, s23;
  66     int d01, d23;
  67     int i;
  68
  69     for( i = 0; i < 4; i++ )
  70     {
  71         s01 = d[i][0] + d[i][1];
  72         d01 = d[i][0] - d[i][1];
  73         s23 = d[i][2] + d[i][3];
  74         d23 = d[i][2] - d[i][3];
  75
  76         tmp[0][i] = s01 + s23;
  77         tmp[1][i] = s01 - s23;
  78         tmp[2][i] = d01 - d23;
  79         tmp[3][i] = d01 + d23;
  80     }
  81
  82     for( i = 0; i < 4; i++ )
  83     {
  84         s01 = tmp[i][0] + tmp[i][1];
  85         d01 = tmp[i][0] - tmp[i][1];
  86         s23 = tmp[i][2] + tmp[i][3];
  87         d23 = tmp[i][2] - tmp[i][3];
  88
  89         d[i][0] = ( s01 + s23 + 1 ) >> 1;
  90         d[i][1] = ( s01 - s23 + 1 ) >> 1;
  91         d[i][2] = ( d01 - d23 + 1 ) >> 1;
  92         d[i][3] = ( d01 + d23 + 1 ) >> 1;
  93     }
  94 }
  95
  96 static void idct4x4dc( int16_t d[4][4] )
  97 {
  98     int16_t tmp[4][4];
  99     int s01, s23;
 100     int d01, d23;
 101     int i;
 102
 103     for( i = 0; i < 4; i++ )
 104     {
 105         s01 = d[i][0] + d[i][1];
 106         d01 = d[i][0] - d[i][1];
 107         s23 = d[i][2] + d[i][3];
 108         d23 = d[i][2] - d[i][3];
 109
 110         tmp[0][i] = s01 + s23;
 111         tmp[1][i] = s01 - s23;
 112         tmp[2][i] = d01 - d23;
 113         tmp[3][i] = d01 + d23;
 114     }
 115
 116     for( i = 0; i < 4; i++ )
 117     {
 118         s01 = tmp[i][0] + tmp[i][1];
 119         d01 = tmp[i][0] - tmp[i][1];
 120         s23 = tmp[i][2] + tmp[i][3];
 121         d23 = tmp[i][2] - tmp[i][3];
 122
 123         d[i][0] = s01 + s23;
 124         d[i][1] = s01 - s23;
 125         d[i][2] = d01 - d23;
 126         d[i][3] = d01 + d23;
 127     }
 128 }
 129
 130 static inline void pixel_sub_wxh( int16_t *diff, int i_size,
 131                                   uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
 132 {
 133     int y, x;
 134     for( y = 0; y < i_size; y++ )
 135     {
 136         for( x = 0; x < i_size; x++ )
 137         {
 138             diff[x + y*i_size] = pix1[x] - pix2[x];
 139         }
 140         pix1 += i_pix1;
 141         pix2 += i_pix2;
 142     }
 143 }
 144
 145 static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
 146 {
 147     int16_t d[4][4];
 148     int16_t tmp[4][4];
 149     int i;
 150
 151     pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 152
 153     for( i = 0; i < 4; i++ )
 154     {
 155         const int s03 = d[i][0] + d[i][3];
 156         const int s12 = d[i][1] + d[i][2];
 157         const int d03 = d[i][0] - d[i][3];
 158         const int d12 = d[i][1] - d[i][2];
 159
 160         tmp[0][i] =   s03 +   s12;
 161         tmp[1][i] = 2*d03 +   d12;
 162         tmp[2][i] =   s03 -   s12;
 163         tmp[3][i] =   d03 - 2*d12;
 164     }
 165
 166     for( i = 0; i < 4; i++ )
 167     {
 168         const int s03 = tmp[i][0] + tmp[i][3];
 169         const int s12 = tmp[i][1] + tmp[i][2];
 170         const int d03 = tmp[i][0] - tmp[i][3];
 171         const int d12 = tmp[i][1] - tmp[i][2];
 172
 173         dct[i][0] =   s03 +   s12;
 174         dct[i][1] = 2*d03 +   d12;
 175         dct[i][2] =   s03 -   s12;
 176         dct[i][3] =   d03 - 2*d12;
 177     }
 178 }
 179
 180 static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
 181 {
 182     sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
 183     sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
 184     sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 185     sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 186 }
 187
 188 static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
 189 {
 190     sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
 191     sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
 192     sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 193     sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 194 }
 195
 196
 197 static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
 198 {
 199     int16_t d[4][4];
 200     int16_t tmp[4][4];
 201     int x, y;
 202     int i;
 203
 204     for( i = 0; i < 4; i++ )
 205     {
 206         const int s02 =  dct[0][i]     +  dct[2][i];
 207         const int d02 =  dct[0][i]     -  dct[2][i];
 208         const int s13 =  dct[1][i]     + (dct[3][i]>>1);
 209         const int d13 = (dct[1][i]>>1) -  dct[3][i];
 210
 211         tmp[i][0] = s02 + s13;
 212         tmp[i][1] = d02 + d13;
 213         tmp[i][2] = d02 - d13;
 214         tmp[i][3] = s02 - s13;
 215     }
 216
 217     for( i = 0; i < 4; i++ )
 218     {
 219         const int s02 =  tmp[0][i]     +  tmp[2][i];
 220         const int d02 =  tmp[0][i]     -  tmp[2][i];
 221         const int s13 =  tmp[1][i]     + (tmp[3][i]>>1);
 222         const int d13 = (tmp[1][i]>>1) -  tmp[3][i];
 223
 224         d[0][i] = ( s02 + s13 + 32 ) >> 6;
 225         d[1][i] = ( d02 + d13 + 32 ) >> 6;
 226         d[2][i] = ( d02 - d13 + 32 ) >> 6;
 227         d[3][i] = ( s02 - s13 + 32 ) >> 6;
 228     }
 229
 230
 231     for( y = 0; y < 4; y++ )
 232     {
 233         for( x = 0; x < 4; x++ )
 234         {
 235             p_dst[x] = clip_uint8( p_dst[x] + d[y][x] );
 236         }
 237         p_dst += FDEC_STRIDE;
 238     }
 239 }
 240
 241 static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
 242 {
 243     add4x4_idct( &p_dst[0],               dct[0] );
 244     add4x4_idct( &p_dst[4],               dct[1] );
 245     add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
 246     add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 247 }
 248
 249 static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
 250 {
 251     add8x8_idct( &p_dst[0],               &dct[0] );
 252     add8x8_idct( &p_dst[8],               &dct[4] );
 253     add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
 254     add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
 255 }
 256
 257 /****************************************************************************
 258  * 8x8 transform:
 259  ****************************************************************************/
 260
 261 #define DCT8_1D {\
 262     const int s07 = SRC(0) + SRC(7);\
 263     const int s16 = SRC(1) + SRC(6);\
 264     const int s25 = SRC(2) + SRC(5);\
 265     const int s34 = SRC(3) + SRC(4);\
 266     const int a0 = s07 + s34;\
 267     const int a1 = s16 + s25;\
 268     const int a2 = s07 - s34;\
 269     const int a3 = s16 - s25;\
 270     const int d07 = SRC(0) - SRC(7);\
 271     const int d16 = SRC(1) - SRC(6);\
 272     const int d25 = SRC(2) - SRC(5);\
 273     const int d34 = SRC(3) - SRC(4);\
 274     const int a4 = d16 + d25 + (d07 + (d07>>1));\
 275     const int a5 = d07 - d34 - (d25 + (d25>>1));\
 276     const int a6 = d07 + d34 - (d16 + (d16>>1));\
 277     const int a7 = d16 - d25 + (d34 + (d34>>1));\
 278     DST(0) =  a0 + a1     ;\
 279     DST(1) =  a4 + (a7>>2);\
 280     DST(2) =  a2 + (a3>>1);\
 281     DST(3) =  a5 + (a6>>2);\
 282     DST(4) =  a0 - a1     ;\
 283     DST(5) =  a6 - (a5>>2);\
 284     DST(6) = (a2>>1) - a3 ;\
 285     DST(7) = (a4>>2) - a7 ;\
 286 }
 287
 288 static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 289 {
 290     int i;
 291     int16_t tmp[8][8];
 292
 293     pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 294
 295 #define SRC(x) tmp[x][i]
 296 #define DST(x) tmp[x][i]
 297     for( i = 0; i < 8; i++ )
 298         DCT8_1D
 299 #undef SRC
 300 #undef DST
 301
 302 #define SRC(x) tmp[i][x]
 303 #define DST(x) dct[x][i]
 304     for( i = 0; i < 8; i++ )
 305         DCT8_1D
 306 #undef SRC
 307 #undef DST
 308 }
 309
 310 static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
 311 {
 312     sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
 313     sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
 314     sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 315     sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 316 }
 317
 318 #define IDCT8_1D {\
 319     const int a0 =  SRC(0) + SRC(4);\
 320     const int a2 =  SRC(0) - SRC(4);\
 321     const int a4 = (SRC(2)>>1) - SRC(6);\
 322     const int a6 = (SRC(6)>>1) + SRC(2);\
 323     const int b0 = a0 + a6;\
 324     const int b2 = a2 + a4;\
 325     const int b4 = a2 - a4;\
 326     const int b6 = a0 - a6;\
 327     const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
 328     const int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
 329     const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
 330     const int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
 331     const int b1 = (a7>>2) + a1;\
 332     const int b3 =  a3 + (a5>>2);\
 333     const int b5 = (a3>>2) - a5;\
 334     const int b7 =  a7 - (a1>>2);\
 335     DST(0, b0 + b7);\
 336     DST(1, b2 + b5);\
 337     DST(2, b4 + b3);\
 338     DST(3, b6 + b1);\
 339     DST(4, b6 - b1);\
 340     DST(5, b4 - b3);\
 341     DST(6, b2 - b5);\
 342     DST(7, b0 - b7);\
 343 }
 344
 345 static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
 346 {
 347     int i;
 348
 349     dct[0][0] += 32; // rounding for the >>6 at the end
 350
 351 #define SRC(x)     dct[x][i]
 352 #define DST(x,rhs) dct[x][i] = (rhs)
 353     for( i = 0; i < 8; i++ )
 354         IDCT8_1D
 355 #undef SRC
 356 #undef DST
 357
 358 #define SRC(x)     dct[i][x]
 359 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
 360     for( i = 0; i < 8; i++ )
 361         IDCT8_1D
 362 #undef SRC
 363 #undef DST
 364 }
 365
 366 static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
 367 {
 368     add8x8_idct8( &dst[0],               dct[0] );
 369     add8x8_idct8( &dst[8],               dct[1] );
 370     add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
 371     add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
 372 }
 373
 374
 375 /****************************************************************************
 376  * x264_dct_init:
 377  ****************************************************************************/
 378 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
 379 {
 380     dctf->sub4x4_dct    = sub4x4_dct;
 381     dctf->add4x4_idct   = add4x4_idct;
 382
 383     dctf->sub8x8_dct    = sub8x8_dct;
 384     dctf->add8x8_idct   = add8x8_idct;
 385
 386     dctf->sub16x16_dct  = sub16x16_dct;
 387     dctf->add16x16_idct = add16x16_idct;
 388
 389     dctf->sub8x8_dct8   = sub8x8_dct8;
 390     dctf->add8x8_idct8  = add8x8_idct8;
 391
 392     dctf->sub16x16_dct8  = sub16x16_dct8;
 393     dctf->add16x16_idct8 = add16x16_idct8;
 394
 395     dctf->dct4x4dc  = dct4x4dc;
 396     dctf->idct4x4dc = idct4x4dc;
 397
 398     dctf->dct2x2dc  = dct2x2dc;
 399     dctf->idct2x2dc = dct2x2dc;
 400
 401 #ifdef HAVE_MMXEXT
 402     if( cpu&X264_CPU_MMX )
 403     {
 404         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
 405         dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
 406         dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
 407
 408         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
 409         dctf->add8x8_idct   = x264_add8x8_idct_mmx;
 410         dctf->add16x16_idct = x264_add16x16_idct_mmx;
 411
 412         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
 413         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
 414
 415 #ifndef ARCH_X86_64
 416         dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
 417         dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
 418
 419         dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
 420         dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
 421 #endif
 422     }
 423 #endif
 424
 425 #if defined(HAVE_SSE2) && defined(ARCH_X86_64)
 426     if( cpu&X264_CPU_SSE2 )
 427     {
 428         dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
 429         dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
 430
 431         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
 432         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 433     }
 434 #endif
 435
 436 #ifdef ARCH_PPC
 437     if( cpu&X264_CPU_ALTIVEC )
 438     {
 439         dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
 440         dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
 441         dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
 442     }
 443 #endif
 444 }
 445
 446 void x264_dct_init_weights( void )
 447 {
 448     int i, j;
 449     for( j=0; j<2; j++ )
 450     {
 451         for( i=0; i<16; i++ )
 452             x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
 453         for( i=0; i<64; i++ )
 454             x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
 455     }
 456 }
 457