git.sesse.net Git - x264/blob - common/dct.c

   1 /*****************************************************************************
   2  * dct.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Loren Merritt <lorenm@u.washington.edu>
   7  *          Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include "common.h"
  25 #ifdef HAVE_MMX
  26 #   include "x86/dct.h"
  27 #endif
  28 #ifdef ARCH_PPC
  29 #   include "ppc/dct.h"
  30 #endif
  31
  32 int x264_dct4_weight2_zigzag[2][16];
  33 int x264_dct8_weight2_zigzag[2][64];
  34
/*
 * Note for all of the DC transforms below: they work in place, i.e. the
 * block passed in is overwritten with the transformed coefficients.
 */
  38
  39 static void dct2x2dc( int16_t d[2][2] )
  40 {
  41     int tmp[2][2];
  42
  43     tmp[0][0] = d[0][0] + d[0][1];
  44     tmp[1][0] = d[0][0] - d[0][1];
  45     tmp[0][1] = d[1][0] + d[1][1];
  46     tmp[1][1] = d[1][0] - d[1][1];
  47
  48     d[0][0] = tmp[0][0] + tmp[0][1];
  49     d[1][0] = tmp[1][0] + tmp[1][1];
  50     d[0][1] = tmp[0][0] - tmp[0][1];
  51     d[1][1] = tmp[1][0] - tmp[1][1];
  52 }
  53
  54 static void dct4x4dc( int16_t d[4][4] )
  55 {
  56     int16_t tmp[4][4];
  57     int s01, s23;
  58     int d01, d23;
  59     int i;
  60
  61     for( i = 0; i < 4; i++ )
  62     {
  63         s01 = d[i][0] + d[i][1];
  64         d01 = d[i][0] - d[i][1];
  65         s23 = d[i][2] + d[i][3];
  66         d23 = d[i][2] - d[i][3];
  67
  68         tmp[0][i] = s01 + s23;
  69         tmp[1][i] = s01 - s23;
  70         tmp[2][i] = d01 - d23;
  71         tmp[3][i] = d01 + d23;
  72     }
  73
  74     for( i = 0; i < 4; i++ )
  75     {
  76         s01 = tmp[i][0] + tmp[i][1];
  77         d01 = tmp[i][0] - tmp[i][1];
  78         s23 = tmp[i][2] + tmp[i][3];
  79         d23 = tmp[i][2] - tmp[i][3];
  80
  81         d[i][0] = ( s01 + s23 + 1 ) >> 1;
  82         d[i][1] = ( s01 - s23 + 1 ) >> 1;
  83         d[i][2] = ( d01 - d23 + 1 ) >> 1;
  84         d[i][3] = ( d01 + d23 + 1 ) >> 1;
  85     }
  86 }
  87
  88 static void idct4x4dc( int16_t d[4][4] )
  89 {
  90     int16_t tmp[4][4];
  91     int s01, s23;
  92     int d01, d23;
  93     int i;
  94
  95     for( i = 0; i < 4; i++ )
  96     {
  97         s01 = d[i][0] + d[i][1];
  98         d01 = d[i][0] - d[i][1];
  99         s23 = d[i][2] + d[i][3];
 100         d23 = d[i][2] - d[i][3];
 101
 102         tmp[0][i] = s01 + s23;
 103         tmp[1][i] = s01 - s23;
 104         tmp[2][i] = d01 - d23;
 105         tmp[3][i] = d01 + d23;
 106     }
 107
 108     for( i = 0; i < 4; i++ )
 109     {
 110         s01 = tmp[i][0] + tmp[i][1];
 111         d01 = tmp[i][0] - tmp[i][1];
 112         s23 = tmp[i][2] + tmp[i][3];
 113         d23 = tmp[i][2] - tmp[i][3];
 114
 115         d[i][0] = s01 + s23;
 116         d[i][1] = s01 - s23;
 117         d[i][2] = d01 - d23;
 118         d[i][3] = d01 + d23;
 119     }
 120 }
 121
 122 static inline void pixel_sub_wxh( int16_t *diff, int i_size,
 123                                   uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
 124 {
 125     int y, x;
 126     for( y = 0; y < i_size; y++ )
 127     {
 128         for( x = 0; x < i_size; x++ )
 129         {
 130             diff[x + y*i_size] = pix1[x] - pix2[x];
 131         }
 132         pix1 += i_pix1;
 133         pix2 += i_pix2;
 134     }
 135 }
 136
 137 static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
 138 {
 139     int16_t d[4][4];
 140     int16_t tmp[4][4];
 141     int i;
 142
 143     pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 144
 145     for( i = 0; i < 4; i++ )
 146     {
 147         const int s03 = d[i][0] + d[i][3];
 148         const int s12 = d[i][1] + d[i][2];
 149         const int d03 = d[i][0] - d[i][3];
 150         const int d12 = d[i][1] - d[i][2];
 151
 152         tmp[0][i] =   s03 +   s12;
 153         tmp[1][i] = 2*d03 +   d12;
 154         tmp[2][i] =   s03 -   s12;
 155         tmp[3][i] =   d03 - 2*d12;
 156     }
 157
 158     for( i = 0; i < 4; i++ )
 159     {
 160         const int s03 = tmp[i][0] + tmp[i][3];
 161         const int s12 = tmp[i][1] + tmp[i][2];
 162         const int d03 = tmp[i][0] - tmp[i][3];
 163         const int d12 = tmp[i][1] - tmp[i][2];
 164
 165         dct[i][0] =   s03 +   s12;
 166         dct[i][1] = 2*d03 +   d12;
 167         dct[i][2] =   s03 -   s12;
 168         dct[i][3] =   d03 - 2*d12;
 169     }
 170 }
 171
 172 static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
 173 {
 174     sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
 175     sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
 176     sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 177     sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 178 }
 179
 180 static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
 181 {
 182     sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
 183     sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
 184     sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 185     sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 186 }
 187
 188
 189 static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
 190 {
 191     int16_t d[4][4];
 192     int16_t tmp[4][4];
 193     int x, y;
 194     int i;
 195
 196     for( i = 0; i < 4; i++ )
 197     {
 198         const int s02 =  dct[0][i]     +  dct[2][i];
 199         const int d02 =  dct[0][i]     -  dct[2][i];
 200         const int s13 =  dct[1][i]     + (dct[3][i]>>1);
 201         const int d13 = (dct[1][i]>>1) -  dct[3][i];
 202
 203         tmp[i][0] = s02 + s13;
 204         tmp[i][1] = d02 + d13;
 205         tmp[i][2] = d02 - d13;
 206         tmp[i][3] = s02 - s13;
 207     }
 208
 209     for( i = 0; i < 4; i++ )
 210     {
 211         const int s02 =  tmp[0][i]     +  tmp[2][i];
 212         const int d02 =  tmp[0][i]     -  tmp[2][i];
 213         const int s13 =  tmp[1][i]     + (tmp[3][i]>>1);
 214         const int d13 = (tmp[1][i]>>1) -  tmp[3][i];
 215
 216         d[0][i] = ( s02 + s13 + 32 ) >> 6;
 217         d[1][i] = ( d02 + d13 + 32 ) >> 6;
 218         d[2][i] = ( d02 - d13 + 32 ) >> 6;
 219         d[3][i] = ( s02 - s13 + 32 ) >> 6;
 220     }
 221
 222
 223     for( y = 0; y < 4; y++ )
 224     {
 225         for( x = 0; x < 4; x++ )
 226         {
 227             p_dst[x] = x264_clip_uint8( p_dst[x] + d[y][x] );
 228         }
 229         p_dst += FDEC_STRIDE;
 230     }
 231 }
 232
 233 static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
 234 {
 235     add4x4_idct( &p_dst[0],               dct[0] );
 236     add4x4_idct( &p_dst[4],               dct[1] );
 237     add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
 238     add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 239 }
 240
 241 static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
 242 {
 243     add8x8_idct( &p_dst[0],               &dct[0] );
 244     add8x8_idct( &p_dst[8],               &dct[4] );
 245     add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
 246     add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
 247 }
 248
 249 /****************************************************************************
 250  * 8x8 transform:
 251  ****************************************************************************/
 252
 253 #define DCT8_1D {\
 254     const int s07 = SRC(0) + SRC(7);\
 255     const int s16 = SRC(1) + SRC(6);\
 256     const int s25 = SRC(2) + SRC(5);\
 257     const int s34 = SRC(3) + SRC(4);\
 258     const int a0 = s07 + s34;\
 259     const int a1 = s16 + s25;\
 260     const int a2 = s07 - s34;\
 261     const int a3 = s16 - s25;\
 262     const int d07 = SRC(0) - SRC(7);\
 263     const int d16 = SRC(1) - SRC(6);\
 264     const int d25 = SRC(2) - SRC(5);\
 265     const int d34 = SRC(3) - SRC(4);\
 266     const int a4 = d16 + d25 + (d07 + (d07>>1));\
 267     const int a5 = d07 - d34 - (d25 + (d25>>1));\
 268     const int a6 = d07 + d34 - (d16 + (d16>>1));\
 269     const int a7 = d16 - d25 + (d34 + (d34>>1));\
 270     DST(0) =  a0 + a1     ;\
 271     DST(1) =  a4 + (a7>>2);\
 272     DST(2) =  a2 + (a3>>1);\
 273     DST(3) =  a5 + (a6>>2);\
 274     DST(4) =  a0 - a1     ;\
 275     DST(5) =  a6 - (a5>>2);\
 276     DST(6) = (a2>>1) - a3 ;\
 277     DST(7) = (a4>>2) - a7 ;\
 278 }
 279
 280 static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 281 {
 282     int i;
 283     int16_t tmp[8][8];
 284
 285     pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 286
 287 #define SRC(x) tmp[x][i]
 288 #define DST(x) tmp[x][i]
 289     for( i = 0; i < 8; i++ )
 290         DCT8_1D
 291 #undef SRC
 292 #undef DST
 293
 294 #define SRC(x) tmp[i][x]
 295 #define DST(x) dct[x][i]
 296     for( i = 0; i < 8; i++ )
 297         DCT8_1D
 298 #undef SRC
 299 #undef DST
 300 }
 301
 302 static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
 303 {
 304     sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
 305     sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
 306     sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 307     sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 308 }
 309
 310 #define IDCT8_1D {\
 311     const int a0 =  SRC(0) + SRC(4);\
 312     const int a2 =  SRC(0) - SRC(4);\
 313     const int a4 = (SRC(2)>>1) - SRC(6);\
 314     const int a6 = (SRC(6)>>1) + SRC(2);\
 315     const int b0 = a0 + a6;\
 316     const int b2 = a2 + a4;\
 317     const int b4 = a2 - a4;\
 318     const int b6 = a0 - a6;\
 319     const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
 320     const int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
 321     const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
 322     const int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
 323     const int b1 = (a7>>2) + a1;\
 324     const int b3 =  a3 + (a5>>2);\
 325     const int b5 = (a3>>2) - a5;\
 326     const int b7 =  a7 - (a1>>2);\
 327     DST(0, b0 + b7);\
 328     DST(1, b2 + b5);\
 329     DST(2, b4 + b3);\
 330     DST(3, b6 + b1);\
 331     DST(4, b6 - b1);\
 332     DST(5, b4 - b3);\
 333     DST(6, b2 - b5);\
 334     DST(7, b0 - b7);\
 335 }
 336
 337 static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
 338 {
 339     int i;
 340
 341     dct[0][0] += 32; // rounding for the >>6 at the end
 342
 343 #define SRC(x)     dct[x][i]
 344 #define DST(x,rhs) dct[x][i] = (rhs)
 345     for( i = 0; i < 8; i++ )
 346         IDCT8_1D
 347 #undef SRC
 348 #undef DST
 349
 350 #define SRC(x)     dct[i][x]
 351 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
 352     for( i = 0; i < 8; i++ )
 353         IDCT8_1D
 354 #undef SRC
 355 #undef DST
 356 }
 357
 358 static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
 359 {
 360     add8x8_idct8( &dst[0],               dct[0] );
 361     add8x8_idct8( &dst[8],               dct[1] );
 362     add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
 363     add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
 364 }
 365
 366
 367 /****************************************************************************
 368  * x264_dct_init:
 369  ****************************************************************************/
 370 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
 371 {
 372     dctf->sub4x4_dct    = sub4x4_dct;
 373     dctf->add4x4_idct   = add4x4_idct;
 374
 375     dctf->sub8x8_dct    = sub8x8_dct;
 376     dctf->add8x8_idct   = add8x8_idct;
 377
 378     dctf->sub16x16_dct  = sub16x16_dct;
 379     dctf->add16x16_idct = add16x16_idct;
 380
 381     dctf->sub8x8_dct8   = sub8x8_dct8;
 382     dctf->add8x8_idct8  = add8x8_idct8;
 383
 384     dctf->sub16x16_dct8  = sub16x16_dct8;
 385     dctf->add16x16_idct8 = add16x16_idct8;
 386
 387     dctf->dct4x4dc  = dct4x4dc;
 388     dctf->idct4x4dc = idct4x4dc;
 389
 390     dctf->dct2x2dc  = dct2x2dc;
 391     dctf->idct2x2dc = dct2x2dc;
 392
 393 #ifdef HAVE_MMX
 394     if( cpu&X264_CPU_MMX )
 395     {
 396         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
 397         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
 398         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
 399         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
 400
 401 #ifndef ARCH_X86_64
 402         dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
 403         dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
 404         dctf->add8x8_idct   = x264_add8x8_idct_mmx;
 405         dctf->add16x16_idct = x264_add16x16_idct_mmx;
 406
 407         dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
 408         dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
 409         dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
 410         dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
 411 #endif
 412     }
 413
 414     if( cpu&X264_CPU_SSE2 )
 415     {
 416         dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
 417         dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
 418         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
 419         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 420
 421         dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
 422         dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
 423         dctf->add8x8_idct   = x264_add8x8_idct_sse2;
 424         dctf->add16x16_idct = x264_add16x16_idct_sse2;
 425     }
 426 #endif //HAVE_MMX
 427
 428 #ifdef ARCH_PPC
 429     if( cpu&X264_CPU_ALTIVEC )
 430     {
 431         dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
 432         dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
 433         dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
 434
 435         dctf->add4x4_idct   = x264_add4x4_idct_altivec;
 436         dctf->add8x8_idct   = x264_add8x8_idct_altivec;
 437         dctf->add16x16_idct = x264_add16x16_idct_altivec;
 438
 439         dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
 440         dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
 441
 442         dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
 443         dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
 444     }
 445 #endif
 446 }
 447
 448 void x264_dct_init_weights( void )
 449 {
 450     int i, j;
 451     for( j=0; j<2; j++ )
 452     {
 453         for( i=0; i<16; i++ )
 454             x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
 455         for( i=0; i<64; i++ )
 456             x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
 457     }
 458 }
 459
 460
 461 // gcc pessimizes multi-dimensional arrays here, even with constant indices
 462 #define ZIG(i,y,x) level[i] = dct[0][x*8+y];
 463
 464 static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
 465 {
 466     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
 467     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
 468     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)
 469     ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)
 470     ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)
 471     ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)
 472     ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)
 473     ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)
 474     ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)
 475     ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)
 476     ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)
 477     ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)
 478     ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)
 479     ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)
 480     ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)
 481     ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
 482 }
 483
 484 static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
 485 {
 486     ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)
 487     ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)
 488     ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)
 489     ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)
 490     ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)
 491     ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)
 492     ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)
 493     ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)
 494     ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)
 495     ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)
 496     ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)
 497     ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)
 498     ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)
 499     ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)
 500     ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)
 501     ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
 502 }
 503
 504 #undef ZIG
 505 #define ZIG(i,y,x) level[i] = dct[0][x*4+y];
 506
 507 static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
 508 {
 509     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
 510     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
 511     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
 512     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 513 }
 514
 515 static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
 516 {
 517     *(uint32_t*)level = *(uint32_t*)dct;
 518     ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
 519     *(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);
 520     *(uint64_t*)(level+8) = *(uint64_t*)(*dct+8);
 521     *(uint64_t*)(level+12) = *(uint64_t*)(*dct+12);
 522 }
 523
 524 #undef ZIG
 525 #define ZIG(i,y,x) {\
 526     int oe = x+y*FENC_STRIDE;\
 527     int od = x+y*FDEC_STRIDE;\
 528     level[i] = p_src[oe] - p_dst[od];\
 529 }
 530 #define COPY4x4\
 531     *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
 532     *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
 533     *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
 534     *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);\
 535
 536 static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
 537 {
 538     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)
 539     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
 540     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
 541     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 542     COPY4x4
 543 }
 544
 545 static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
 546 {
 547     ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)
 548     ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)
 549     ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)
 550     ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
 551     COPY4x4
 552 }
 553
 554 #undef ZIG
 555 #undef COPY4x4
 556
 557 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
 558 {
 559     if( b_interlaced )
 560     {
 561         pf->scan_8x8   = zigzag_scan_8x8_field;
 562         pf->scan_4x4   = zigzag_scan_4x4_field;
 563         pf->sub_4x4    = zigzag_sub_4x4_field;
 564 #ifdef HAVE_MMX
 565         if( cpu&X264_CPU_MMXEXT )
 566             pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
 567 #endif
 568
 569 #ifdef ARCH_PPC
 570         if( cpu&X264_CPU_ALTIVEC )
 571             pf->scan_4x4   = x264_zigzag_scan_4x4_field_altivec;
 572 #endif
 573     }
 574     else
 575     {
 576         pf->scan_8x8   = zigzag_scan_8x8_frame;
 577         pf->scan_4x4   = zigzag_scan_4x4_frame;
 578         pf->sub_4x4    = zigzag_sub_4x4_frame;
 579 #ifdef HAVE_MMX
 580         if( cpu&X264_CPU_SSSE3 )
 581             pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
 582 #endif
 583
 584 #ifdef ARCH_PPC
 585         if( cpu&X264_CPU_ALTIVEC )
 586             pf->scan_4x4   = x264_zigzag_scan_4x4_frame_altivec;
 587 #endif
 588     }
 589 }