git.sesse.net Git - x264/blob - common/dct.c

   1 /*****************************************************************************
   2  * dct.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Loren Merritt <lorenm@u.washington.edu>
   7  *          Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include "common.h"
  25 #ifdef HAVE_MMX
  26 #   include "x86/dct.h"
  27 #endif
  28 #ifdef ARCH_PPC
  29 #   include "ppc/dct.h"
  30 #endif
  31
  32 int x264_dct4_weight2_zigzag[2][16];
  33 int x264_dct8_weight2_zigzag[2][64];
  34
/*
 * XXX: the DC transforms below work in place (the input array is also the
 * output), so the first pass of each goes through a temporary buffer.
 */
  38
  39 static void dct4x4dc( int16_t d[4][4] )
  40 {
  41     int16_t tmp[4][4];
  42     int s01, s23;
  43     int d01, d23;
  44     int i;
  45
  46     for( i = 0; i < 4; i++ )
  47     {
  48         s01 = d[i][0] + d[i][1];
  49         d01 = d[i][0] - d[i][1];
  50         s23 = d[i][2] + d[i][3];
  51         d23 = d[i][2] - d[i][3];
  52
  53         tmp[0][i] = s01 + s23;
  54         tmp[1][i] = s01 - s23;
  55         tmp[2][i] = d01 - d23;
  56         tmp[3][i] = d01 + d23;
  57     }
  58
  59     for( i = 0; i < 4; i++ )
  60     {
  61         s01 = tmp[i][0] + tmp[i][1];
  62         d01 = tmp[i][0] - tmp[i][1];
  63         s23 = tmp[i][2] + tmp[i][3];
  64         d23 = tmp[i][2] - tmp[i][3];
  65
  66         d[i][0] = ( s01 + s23 + 1 ) >> 1;
  67         d[i][1] = ( s01 - s23 + 1 ) >> 1;
  68         d[i][2] = ( d01 - d23 + 1 ) >> 1;
  69         d[i][3] = ( d01 + d23 + 1 ) >> 1;
  70     }
  71 }
  72
  73 static void idct4x4dc( int16_t d[4][4] )
  74 {
  75     int16_t tmp[4][4];
  76     int s01, s23;
  77     int d01, d23;
  78     int i;
  79
  80     for( i = 0; i < 4; i++ )
  81     {
  82         s01 = d[i][0] + d[i][1];
  83         d01 = d[i][0] - d[i][1];
  84         s23 = d[i][2] + d[i][3];
  85         d23 = d[i][2] - d[i][3];
  86
  87         tmp[0][i] = s01 + s23;
  88         tmp[1][i] = s01 - s23;
  89         tmp[2][i] = d01 - d23;
  90         tmp[3][i] = d01 + d23;
  91     }
  92
  93     for( i = 0; i < 4; i++ )
  94     {
  95         s01 = tmp[i][0] + tmp[i][1];
  96         d01 = tmp[i][0] - tmp[i][1];
  97         s23 = tmp[i][2] + tmp[i][3];
  98         d23 = tmp[i][2] - tmp[i][3];
  99
 100         d[i][0] = s01 + s23;
 101         d[i][1] = s01 - s23;
 102         d[i][2] = d01 - d23;
 103         d[i][3] = d01 + d23;
 104     }
 105 }
 106
 107 static inline void pixel_sub_wxh( int16_t *diff, int i_size,
 108                                   uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
 109 {
 110     int y, x;
 111     for( y = 0; y < i_size; y++ )
 112     {
 113         for( x = 0; x < i_size; x++ )
 114         {
 115             diff[x + y*i_size] = pix1[x] - pix2[x];
 116         }
 117         pix1 += i_pix1;
 118         pix2 += i_pix2;
 119     }
 120 }
 121
 122 static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
 123 {
 124     int16_t d[4][4];
 125     int16_t tmp[4][4];
 126     int i;
 127
 128     pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 129
 130     for( i = 0; i < 4; i++ )
 131     {
 132         const int s03 = d[i][0] + d[i][3];
 133         const int s12 = d[i][1] + d[i][2];
 134         const int d03 = d[i][0] - d[i][3];
 135         const int d12 = d[i][1] - d[i][2];
 136
 137         tmp[0][i] =   s03 +   s12;
 138         tmp[1][i] = 2*d03 +   d12;
 139         tmp[2][i] =   s03 -   s12;
 140         tmp[3][i] =   d03 - 2*d12;
 141     }
 142
 143     for( i = 0; i < 4; i++ )
 144     {
 145         const int s03 = tmp[i][0] + tmp[i][3];
 146         const int s12 = tmp[i][1] + tmp[i][2];
 147         const int d03 = tmp[i][0] - tmp[i][3];
 148         const int d12 = tmp[i][1] - tmp[i][2];
 149
 150         dct[i][0] =   s03 +   s12;
 151         dct[i][1] = 2*d03 +   d12;
 152         dct[i][2] =   s03 -   s12;
 153         dct[i][3] =   d03 - 2*d12;
 154     }
 155 }
 156
 157 static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
 158 {
 159     sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
 160     sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
 161     sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 162     sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 163 }
 164
 165 static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
 166 {
 167     sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
 168     sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
 169     sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 170     sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 171 }
 172
 173 static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
 174 {
 175     int16_t d[4][4];
 176     int sum = 0;
 177
 178     pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 179
 180     sum += d[0][0] + d[0][1] + d[0][2] + d[0][3];
 181     sum += d[1][0] + d[1][1] + d[1][2] + d[1][3];
 182     sum += d[2][0] + d[2][1] + d[2][2] + d[2][3];
 183     sum += d[3][0] + d[3][1] + d[3][2] + d[3][3];
 184
 185     return sum;
 186 }
 187
 188 static void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
 189 {
 190     dct[0][0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
 191     dct[0][1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
 192     dct[1][0] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 193     dct[1][1] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 194 }
 195
 196 static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
 197 {
 198     int16_t d[4][4];
 199     int16_t tmp[4][4];
 200     int x, y;
 201     int i;
 202
 203     for( i = 0; i < 4; i++ )
 204     {
 205         const int s02 =  dct[0][i]     +  dct[2][i];
 206         const int d02 =  dct[0][i]     -  dct[2][i];
 207         const int s13 =  dct[1][i]     + (dct[3][i]>>1);
 208         const int d13 = (dct[1][i]>>1) -  dct[3][i];
 209
 210         tmp[i][0] = s02 + s13;
 211         tmp[i][1] = d02 + d13;
 212         tmp[i][2] = d02 - d13;
 213         tmp[i][3] = s02 - s13;
 214     }
 215
 216     for( i = 0; i < 4; i++ )
 217     {
 218         const int s02 =  tmp[0][i]     +  tmp[2][i];
 219         const int d02 =  tmp[0][i]     -  tmp[2][i];
 220         const int s13 =  tmp[1][i]     + (tmp[3][i]>>1);
 221         const int d13 = (tmp[1][i]>>1) -  tmp[3][i];
 222
 223         d[0][i] = ( s02 + s13 + 32 ) >> 6;
 224         d[1][i] = ( d02 + d13 + 32 ) >> 6;
 225         d[2][i] = ( d02 - d13 + 32 ) >> 6;
 226         d[3][i] = ( s02 - s13 + 32 ) >> 6;
 227     }
 228
 229
 230     for( y = 0; y < 4; y++ )
 231     {
 232         for( x = 0; x < 4; x++ )
 233         {
 234             p_dst[x] = x264_clip_uint8( p_dst[x] + d[y][x] );
 235         }
 236         p_dst += FDEC_STRIDE;
 237     }
 238 }
 239
 240 static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
 241 {
 242     add4x4_idct( &p_dst[0],               dct[0] );
 243     add4x4_idct( &p_dst[4],               dct[1] );
 244     add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
 245     add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 246 }
 247
 248 static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
 249 {
 250     add8x8_idct( &p_dst[0],               &dct[0] );
 251     add8x8_idct( &p_dst[8],               &dct[4] );
 252     add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
 253     add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
 254 }
 255
 256 /****************************************************************************
 257  * 8x8 transform:
 258  ****************************************************************************/
 259
 260 #define DCT8_1D {\
 261     const int s07 = SRC(0) + SRC(7);\
 262     const int s16 = SRC(1) + SRC(6);\
 263     const int s25 = SRC(2) + SRC(5);\
 264     const int s34 = SRC(3) + SRC(4);\
 265     const int a0 = s07 + s34;\
 266     const int a1 = s16 + s25;\
 267     const int a2 = s07 - s34;\
 268     const int a3 = s16 - s25;\
 269     const int d07 = SRC(0) - SRC(7);\
 270     const int d16 = SRC(1) - SRC(6);\
 271     const int d25 = SRC(2) - SRC(5);\
 272     const int d34 = SRC(3) - SRC(4);\
 273     const int a4 = d16 + d25 + (d07 + (d07>>1));\
 274     const int a5 = d07 - d34 - (d25 + (d25>>1));\
 275     const int a6 = d07 + d34 - (d16 + (d16>>1));\
 276     const int a7 = d16 - d25 + (d34 + (d34>>1));\
 277     DST(0) =  a0 + a1     ;\
 278     DST(1) =  a4 + (a7>>2);\
 279     DST(2) =  a2 + (a3>>1);\
 280     DST(3) =  a5 + (a6>>2);\
 281     DST(4) =  a0 - a1     ;\
 282     DST(5) =  a6 - (a5>>2);\
 283     DST(6) = (a2>>1) - a3 ;\
 284     DST(7) = (a4>>2) - a7 ;\
 285 }
 286
 287 static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 288 {
 289     int i;
 290     int16_t tmp[8][8];
 291
 292     pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 293
 294 #define SRC(x) tmp[x][i]
 295 #define DST(x) tmp[x][i]
 296     for( i = 0; i < 8; i++ )
 297         DCT8_1D
 298 #undef SRC
 299 #undef DST
 300
 301 #define SRC(x) tmp[i][x]
 302 #define DST(x) dct[x][i]
 303     for( i = 0; i < 8; i++ )
 304         DCT8_1D
 305 #undef SRC
 306 #undef DST
 307 }
 308
 309 static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
 310 {
 311     sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
 312     sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
 313     sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 314     sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 315 }
 316
 317 #define IDCT8_1D {\
 318     const int a0 =  SRC(0) + SRC(4);\
 319     const int a2 =  SRC(0) - SRC(4);\
 320     const int a4 = (SRC(2)>>1) - SRC(6);\
 321     const int a6 = (SRC(6)>>1) + SRC(2);\
 322     const int b0 = a0 + a6;\
 323     const int b2 = a2 + a4;\
 324     const int b4 = a2 - a4;\
 325     const int b6 = a0 - a6;\
 326     const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
 327     const int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
 328     const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
 329     const int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
 330     const int b1 = (a7>>2) + a1;\
 331     const int b3 =  a3 + (a5>>2);\
 332     const int b5 = (a3>>2) - a5;\
 333     const int b7 =  a7 - (a1>>2);\
 334     DST(0, b0 + b7);\
 335     DST(1, b2 + b5);\
 336     DST(2, b4 + b3);\
 337     DST(3, b6 + b1);\
 338     DST(4, b6 - b1);\
 339     DST(5, b4 - b3);\
 340     DST(6, b2 - b5);\
 341     DST(7, b0 - b7);\
 342 }
 343
 344 static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
 345 {
 346     int i;
 347
 348     dct[0][0] += 32; // rounding for the >>6 at the end
 349
 350 #define SRC(x)     dct[x][i]
 351 #define DST(x,rhs) dct[x][i] = (rhs)
 352     for( i = 0; i < 8; i++ )
 353         IDCT8_1D
 354 #undef SRC
 355 #undef DST
 356
 357 #define SRC(x)     dct[i][x]
 358 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
 359     for( i = 0; i < 8; i++ )
 360         IDCT8_1D
 361 #undef SRC
 362 #undef DST
 363 }
 364
 365 static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
 366 {
 367     add8x8_idct8( &dst[0],               dct[0] );
 368     add8x8_idct8( &dst[8],               dct[1] );
 369     add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
 370     add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
 371 }
 372
 373 static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
 374 {
 375     int i;
 376     dc = (dc + 32) >> 6;
 377     for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
 378     {
 379         p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
 380         p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
 381         p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
 382         p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
 383     }
 384 }
 385
 386 static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
 387 {
 388     add4x4_idct_dc( &p_dst[0],               dct[0][0] );
 389     add4x4_idct_dc( &p_dst[4],               dct[0][1] );
 390     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[1][0] );
 391     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
 392 }
 393
 394 static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] )
 395 {
 396     int i;
 397     for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE )
 398     {
 399         add4x4_idct_dc( &p_dst[ 0], dct[i][0] );
 400         add4x4_idct_dc( &p_dst[ 4], dct[i][1] );
 401         add4x4_idct_dc( &p_dst[ 8], dct[i][2] );
 402         add4x4_idct_dc( &p_dst[12], dct[i][3] );
 403     }
 404 }
 405
 406
 407 /****************************************************************************
 408  * x264_dct_init:
 409  ****************************************************************************/
 410 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
 411 {
 412     dctf->sub4x4_dct    = sub4x4_dct;
 413     dctf->add4x4_idct   = add4x4_idct;
 414
 415     dctf->sub8x8_dct    = sub8x8_dct;
 416     dctf->sub8x8_dct_dc = sub8x8_dct_dc;
 417     dctf->add8x8_idct   = add8x8_idct;
 418     dctf->add8x8_idct_dc = add8x8_idct_dc;
 419
 420     dctf->sub16x16_dct  = sub16x16_dct;
 421     dctf->add16x16_idct = add16x16_idct;
 422     dctf->add16x16_idct_dc = add16x16_idct_dc;
 423
 424     dctf->sub8x8_dct8   = sub8x8_dct8;
 425     dctf->add8x8_idct8  = add8x8_idct8;
 426
 427     dctf->sub16x16_dct8  = sub16x16_dct8;
 428     dctf->add16x16_idct8 = add16x16_idct8;
 429
 430     dctf->dct4x4dc  = dct4x4dc;
 431     dctf->idct4x4dc = idct4x4dc;
 432
 433 #ifdef HAVE_MMX
 434     if( cpu&X264_CPU_MMX )
 435     {
 436         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
 437         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
 438         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
 439         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
 440         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
 441         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
 442         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
 443
 444 #ifndef ARCH_X86_64
 445         dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
 446         dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
 447         dctf->add8x8_idct   = x264_add8x8_idct_mmx;
 448         dctf->add16x16_idct = x264_add16x16_idct_mmx;
 449
 450         dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
 451         dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
 452         dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
 453         dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
 454 #endif
 455     }
 456
 457     if( cpu&X264_CPU_SSE2 )
 458     {
 459         dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
 460         dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
 461         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
 462         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
 463         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 464
 465         dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
 466         dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
 467         dctf->add8x8_idct   = x264_add8x8_idct_sse2;
 468         dctf->add16x16_idct = x264_add16x16_idct_sse2;
 469         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
 470     }
 471
 472     if( cpu&X264_CPU_SSSE3 )
 473     {
 474         dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
 475         dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
 476         dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
 477         dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
 478         dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
 479         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
 480         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
 481     }
 482 #endif //HAVE_MMX
 483
 484 #ifdef ARCH_PPC
 485     if( cpu&X264_CPU_ALTIVEC )
 486     {
 487         dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
 488         dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
 489         dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
 490
 491         dctf->add4x4_idct   = x264_add4x4_idct_altivec;
 492         dctf->add8x8_idct   = x264_add8x8_idct_altivec;
 493         dctf->add16x16_idct = x264_add16x16_idct_altivec;
 494
 495         dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
 496         dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
 497
 498         dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
 499         dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
 500     }
 501 #endif
 502 }
 503
 504 void x264_dct_init_weights( void )
 505 {
 506     int i, j;
 507     for( j=0; j<2; j++ )
 508     {
 509         for( i=0; i<16; i++ )
 510             x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
 511         for( i=0; i<64; i++ )
 512             x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
 513     }
 514 }
 515
 516
 517 // gcc pessimizes multi-dimensional arrays here, even with constant indices
 518 #define ZIG(i,y,x) level[i] = dct[0][x*8+y];
 519 #define ZIGZAG8_FRAME\
 520     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
 521     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
 522     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
 523     ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
 524     ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
 525     ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
 526     ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
 527     ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
 528     ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
 529     ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
 530     ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
 531     ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
 532     ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
 533     ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
 534     ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
 535     ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\
 536
 537 #define ZIGZAG8_FIELD\
 538     ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
 539     ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
 540     ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
 541     ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
 542     ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
 543     ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
 544     ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
 545     ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
 546     ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
 547     ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
 548     ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
 549     ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
 550     ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
 551     ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
 552     ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
 553     ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
 554
 555 #define ZIGZAG4_FRAME\
 556     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
 557     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
 558     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
 559     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 560
 561 #define ZIGZAG4_FIELD\
 562     ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
 563     ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
 564     ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
 565     ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
 566
 567 static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
 568 {
 569     ZIGZAG8_FRAME
 570 }
 571
 572 static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
 573 {
 574     ZIGZAG8_FIELD
 575 }
 576
 577 #undef ZIG
 578 #define ZIG(i,y,x) level[i] = dct[0][x*4+y];
 579
 580 static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
 581 {
 582     ZIGZAG4_FRAME
 583 }
 584
 585 static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
 586 {
 587     *(uint32_t*)level = *(uint32_t*)dct;
 588     ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
 589     *(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);
 590     *(uint64_t*)(level+8) = *(uint64_t*)(*dct+8);
 591     *(uint64_t*)(level+12) = *(uint64_t*)(*dct+12);
 592 }
 593
 594 #undef ZIG
 595 #define ZIG(i,y,x) {\
 596     int oe = x+y*FENC_STRIDE;\
 597     int od = x+y*FDEC_STRIDE;\
 598     level[i] = p_src[oe] - p_dst[od];\
 599 }
 600 #define COPY4x4\
 601     *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
 602     *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
 603     *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
 604     *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
 605 #define COPY8x8\
 606     *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
 607     *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
 608     *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
 609     *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
 610     *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
 611     *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
 612     *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
 613     *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
 614
 615 static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
 616 {
 617     ZIGZAG4_FRAME
 618     COPY4x4
 619 }
 620
 621 static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
 622 {
 623     ZIGZAG4_FIELD
 624     COPY4x4
 625 }
 626
 627 static void zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
 628 {
 629     ZIGZAG8_FRAME
 630     COPY8x8
 631 }
 632 static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
 633 {
 634     ZIGZAG8_FIELD
 635     COPY8x8
 636 }
 637
 638 #undef ZIG
 639 #undef COPY4x4
 640
 641 static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
 642 {
 643     int i,j;
 644     for( i=0; i<4; i++ )
 645     {
 646         int nz = 0;
 647         for( j=0; j<16; j++ )
 648         {
 649             nz |= src[i+j*4];
 650             dst[i*16+j] = src[i+j*4];
 651         }
 652         nnz[(i&1) + (i>>1)*8] = !!nz;
 653     }
 654 }
 655
 656 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
 657 {
 658     if( b_interlaced )
 659     {
 660         pf->scan_8x8   = zigzag_scan_8x8_field;
 661         pf->scan_4x4   = zigzag_scan_4x4_field;
 662         pf->sub_8x8    = zigzag_sub_8x8_field;
 663         pf->sub_4x4    = zigzag_sub_4x4_field;
 664 #ifdef HAVE_MMX
 665         if( cpu&X264_CPU_MMXEXT )
 666             pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
 667 #endif
 668
 669 #ifdef ARCH_PPC
 670         if( cpu&X264_CPU_ALTIVEC )
 671             pf->scan_4x4   = x264_zigzag_scan_4x4_field_altivec;
 672 #endif
 673     }
 674     else
 675     {
 676         pf->scan_8x8   = zigzag_scan_8x8_frame;
 677         pf->scan_4x4   = zigzag_scan_4x4_frame;
 678         pf->sub_8x8    = zigzag_sub_8x8_frame;
 679         pf->sub_4x4    = zigzag_sub_4x4_frame;
 680 #ifdef HAVE_MMX
 681         if( cpu&X264_CPU_MMX )
 682             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
 683         if( cpu&X264_CPU_MMXEXT )
 684             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
 685         if( cpu&X264_CPU_SSE2_IS_FAST )
 686             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
 687         if( cpu&X264_CPU_SSSE3 )
 688         {
 689             pf->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
 690             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
 691             if( cpu&X264_CPU_SHUFFLE_IS_FAST )
 692                 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
 693         }
 694 #endif
 695
 696 #ifdef ARCH_PPC
 697         if( cpu&X264_CPU_ALTIVEC )
 698             pf->scan_4x4   = x264_zigzag_scan_4x4_frame_altivec;
 699 #endif
 700     }
 701
 702     pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
 703 #ifdef HAVE_MMX
 704     if( cpu&X264_CPU_MMX )
 705         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
 706     if( cpu&X264_CPU_SHUFFLE_IS_FAST )
 707         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
 708 #endif
 709 }