git.sesse.net Git - x264/blob - common/dct.c

   1 /*****************************************************************************
   2  * dct.c: transform and zigzag
   3  *****************************************************************************
   4  * Copyright (C) 2003-2013 x264 project
   5  *
   6  * Authors: Loren Merritt <lorenm@u.washington.edu>
   7  *          Laurent Aimar <fenrir@via.ecp.fr>
   8  *          Henrik Gramner <hengar-6@student.ltu.se>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  23  *
  24  * This program is also available under a commercial proprietary license.
  25  * For more information, contact us at licensing@x264.com.
  26  *****************************************************************************/
  27
  28 #include "common.h"
  29 #if HAVE_MMX
  30 #   include "x86/dct.h"
  31 #endif
  32 #if ARCH_PPC
  33 #   include "ppc/dct.h"
  34 #endif
  35 #if ARCH_ARM
  36 #   include "arm/dct.h"
  37 #endif
  38
  39 /* the inverse of the scaling factors introduced by 8x8 fdct */
  40 /* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
  41 #define W(i) (i==0 ? FIX8(1.0000) :\
  42               i==1 ? FIX8(0.8859) :\
  43               i==2 ? FIX8(1.6000) :\
  44               i==3 ? FIX8(0.9415) :\
  45               i==4 ? FIX8(1.2651) :\
  46               i==5 ? FIX8(1.1910) :0)
  47 const uint32_t x264_dct8_weight_tab[64] = {
  48     W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
  49     W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
  50     W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
  51     W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
  52
  53     W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
  54     W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
  55     W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
  56     W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
  57 };
  58 #undef W
  59
  60 #define W(i) (i==0 ? FIX8(1.76777) :\
  61               i==1 ? FIX8(1.11803) :\
  62               i==2 ? FIX8(0.70711) :0)
  63 const uint32_t x264_dct4_weight_tab[16] = {
  64     W(0), W(1), W(0), W(1),
  65     W(1), W(2), W(1), W(2),
  66     W(0), W(1), W(0), W(1),
  67     W(1), W(2), W(1), W(2)
  68 };
  69 #undef W
  70
  71 /* inverse squared */
  72 #define W(i) (i==0 ? FIX8(3.125) :\
  73               i==1 ? FIX8(1.25) :\
  74               i==2 ? FIX8(0.5) :0)
  75 const uint32_t x264_dct4_weight2_tab[16] = {
  76     W(0), W(1), W(0), W(1),
  77     W(1), W(2), W(1), W(2),
  78     W(0), W(1), W(0), W(1),
  79     W(1), W(2), W(1), W(2)
  80 };
  81 #undef W
  82
  83 #define W(i) (i==0 ? FIX8(1.00000) :\
  84               i==1 ? FIX8(0.78487) :\
  85               i==2 ? FIX8(2.56132) :\
  86               i==3 ? FIX8(0.88637) :\
  87               i==4 ? FIX8(1.60040) :\
  88               i==5 ? FIX8(1.41850) :0)
  89 const uint32_t x264_dct8_weight2_tab[64] = {
  90     W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
  91     W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
  92     W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
  93     W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
  94
  95     W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
  96     W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
  97     W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
  98     W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
  99 };
 100 #undef W
 101
 102
 103 static void dct4x4dc( dctcoef d[16] )
 104 {
 105     dctcoef tmp[16];
 106
 107     for( int i = 0; i < 4; i++ )
 108     {
 109         int s01 = d[i*4+0] + d[i*4+1];
 110         int d01 = d[i*4+0] - d[i*4+1];
 111         int s23 = d[i*4+2] + d[i*4+3];
 112         int d23 = d[i*4+2] - d[i*4+3];
 113
 114         tmp[0*4+i] = s01 + s23;
 115         tmp[1*4+i] = s01 - s23;
 116         tmp[2*4+i] = d01 - d23;
 117         tmp[3*4+i] = d01 + d23;
 118     }
 119
 120     for( int i = 0; i < 4; i++ )
 121     {
 122         int s01 = tmp[i*4+0] + tmp[i*4+1];
 123         int d01 = tmp[i*4+0] - tmp[i*4+1];
 124         int s23 = tmp[i*4+2] + tmp[i*4+3];
 125         int d23 = tmp[i*4+2] - tmp[i*4+3];
 126
 127         d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
 128         d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
 129         d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
 130         d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
 131     }
 132 }
 133
 134 static void idct4x4dc( dctcoef d[16] )
 135 {
 136     dctcoef tmp[16];
 137
 138     for( int i = 0; i < 4; i++ )
 139     {
 140         int s01 = d[i*4+0] + d[i*4+1];
 141         int d01 = d[i*4+0] - d[i*4+1];
 142         int s23 = d[i*4+2] + d[i*4+3];
 143         int d23 = d[i*4+2] - d[i*4+3];
 144
 145         tmp[0*4+i] = s01 + s23;
 146         tmp[1*4+i] = s01 - s23;
 147         tmp[2*4+i] = d01 - d23;
 148         tmp[3*4+i] = d01 + d23;
 149     }
 150
 151     for( int i = 0; i < 4; i++ )
 152     {
 153         int s01 = tmp[i*4+0] + tmp[i*4+1];
 154         int d01 = tmp[i*4+0] - tmp[i*4+1];
 155         int s23 = tmp[i*4+2] + tmp[i*4+3];
 156         int d23 = tmp[i*4+2] - tmp[i*4+3];
 157
 158         d[i*4+0] = s01 + s23;
 159         d[i*4+1] = s01 - s23;
 160         d[i*4+2] = d01 - d23;
 161         d[i*4+3] = d01 + d23;
 162     }
 163 }
 164
 165 static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
 166 {
 167     int a0 = dct4x4[0][0] + dct4x4[1][0];
 168     int a1 = dct4x4[2][0] + dct4x4[3][0];
 169     int a2 = dct4x4[4][0] + dct4x4[5][0];
 170     int a3 = dct4x4[6][0] + dct4x4[7][0];
 171     int a4 = dct4x4[0][0] - dct4x4[1][0];
 172     int a5 = dct4x4[2][0] - dct4x4[3][0];
 173     int a6 = dct4x4[4][0] - dct4x4[5][0];
 174     int a7 = dct4x4[6][0] - dct4x4[7][0];
 175     int b0 = a0 + a1;
 176     int b1 = a2 + a3;
 177     int b2 = a4 + a5;
 178     int b3 = a6 + a7;
 179     int b4 = a0 - a1;
 180     int b5 = a2 - a3;
 181     int b6 = a4 - a5;
 182     int b7 = a6 - a7;
 183     dct[0] = b0 + b1;
 184     dct[1] = b2 + b3;
 185     dct[2] = b0 - b1;
 186     dct[3] = b2 - b3;
 187     dct[4] = b4 - b5;
 188     dct[5] = b6 - b7;
 189     dct[6] = b4 + b5;
 190     dct[7] = b6 + b7;
 191     dct4x4[0][0] = 0;
 192     dct4x4[1][0] = 0;
 193     dct4x4[2][0] = 0;
 194     dct4x4[3][0] = 0;
 195     dct4x4[4][0] = 0;
 196     dct4x4[5][0] = 0;
 197     dct4x4[6][0] = 0;
 198     dct4x4[7][0] = 0;
 199 }
 200
 201 static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
 202                                   pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
 203 {
 204     for( int y = 0; y < i_size; y++ )
 205     {
 206         for( int x = 0; x < i_size; x++ )
 207             diff[x + y*i_size] = pix1[x] - pix2[x];
 208         pix1 += i_pix1;
 209         pix2 += i_pix2;
 210     }
 211 }
 212
 213 static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
 214 {
 215     dctcoef d[16];
 216     dctcoef tmp[16];
 217
 218     pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 219
 220     for( int i = 0; i < 4; i++ )
 221     {
 222         int s03 = d[i*4+0] + d[i*4+3];
 223         int s12 = d[i*4+1] + d[i*4+2];
 224         int d03 = d[i*4+0] - d[i*4+3];
 225         int d12 = d[i*4+1] - d[i*4+2];
 226
 227         tmp[0*4+i] =   s03 +   s12;
 228         tmp[1*4+i] = 2*d03 +   d12;
 229         tmp[2*4+i] =   s03 -   s12;
 230         tmp[3*4+i] =   d03 - 2*d12;
 231     }
 232
 233     for( int i = 0; i < 4; i++ )
 234     {
 235         int s03 = tmp[i*4+0] + tmp[i*4+3];
 236         int s12 = tmp[i*4+1] + tmp[i*4+2];
 237         int d03 = tmp[i*4+0] - tmp[i*4+3];
 238         int d12 = tmp[i*4+1] - tmp[i*4+2];
 239
 240         dct[i*4+0] =   s03 +   s12;
 241         dct[i*4+1] = 2*d03 +   d12;
 242         dct[i*4+2] =   s03 -   s12;
 243         dct[i*4+3] =   d03 - 2*d12;
 244     }
 245 }
 246
 247 static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
 248 {
 249     sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
 250     sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
 251     sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 252     sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 253 }
 254
 255 static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
 256 {
 257     sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
 258     sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
 259     sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 260     sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 261 }
 262
 263 static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
 264 {
 265     int sum = 0;
 266     for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
 267         sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
 268              - pix2[0] - pix2[1] - pix2[2] - pix2[3];
 269     return sum;
 270 }
 271
 272 static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
 273 {
 274     dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
 275     dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
 276     dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 277     dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 278
 279     /* 2x2 DC transform */
 280     int d0 = dct[0] + dct[1];
 281     int d1 = dct[2] + dct[3];
 282     int d2 = dct[0] - dct[1];
 283     int d3 = dct[2] - dct[3];
 284     dct[0] = d0 + d1;
 285     dct[1] = d0 - d1;
 286     dct[2] = d2 + d3;
 287     dct[3] = d2 - d3;
 288 }
 289
 290 static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
 291 {
 292     int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
 293     int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
 294     int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
 295     int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
 296     int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
 297     int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
 298     int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
 299     int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );
 300
 301     /* 2x4 DC transform */
 302     int b0 = a0 + a1;
 303     int b1 = a2 + a3;
 304     int b2 = a4 + a5;
 305     int b3 = a6 + a7;
 306     int b4 = a0 - a1;
 307     int b5 = a2 - a3;
 308     int b6 = a4 - a5;
 309     int b7 = a6 - a7;
 310     a0 = b0 + b1;
 311     a1 = b2 + b3;
 312     a2 = b4 + b5;
 313     a3 = b6 + b7;
 314     a4 = b0 - b1;
 315     a5 = b2 - b3;
 316     a6 = b4 - b5;
 317     a7 = b6 - b7;
 318     dct[0] = a0 + a1;
 319     dct[1] = a2 + a3;
 320     dct[2] = a0 - a1;
 321     dct[3] = a2 - a3;
 322     dct[4] = a4 - a5;
 323     dct[5] = a6 - a7;
 324     dct[6] = a4 + a5;
 325     dct[7] = a6 + a7;
 326 }
 327
 328 static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
 329 {
 330     dctcoef d[16];
 331     dctcoef tmp[16];
 332
 333     for( int i = 0; i < 4; i++ )
 334     {
 335         int s02 =  dct[0*4+i]     +  dct[2*4+i];
 336         int d02 =  dct[0*4+i]     -  dct[2*4+i];
 337         int s13 =  dct[1*4+i]     + (dct[3*4+i]>>1);
 338         int d13 = (dct[1*4+i]>>1) -  dct[3*4+i];
 339
 340         tmp[i*4+0] = s02 + s13;
 341         tmp[i*4+1] = d02 + d13;
 342         tmp[i*4+2] = d02 - d13;
 343         tmp[i*4+3] = s02 - s13;
 344     }
 345
 346     for( int i = 0; i < 4; i++ )
 347     {
 348         int s02 =  tmp[0*4+i]     +  tmp[2*4+i];
 349         int d02 =  tmp[0*4+i]     -  tmp[2*4+i];
 350         int s13 =  tmp[1*4+i]     + (tmp[3*4+i]>>1);
 351         int d13 = (tmp[1*4+i]>>1) -  tmp[3*4+i];
 352
 353         d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
 354         d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
 355         d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
 356         d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
 357     }
 358
 359
 360     for( int y = 0; y < 4; y++ )
 361     {
 362         for( int x = 0; x < 4; x++ )
 363             p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
 364         p_dst += FDEC_STRIDE;
 365     }
 366 }
 367
 368 static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
 369 {
 370     add4x4_idct( &p_dst[0],               dct[0] );
 371     add4x4_idct( &p_dst[4],               dct[1] );
 372     add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
 373     add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 374 }
 375
 376 static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
 377 {
 378     add8x8_idct( &p_dst[0],               &dct[0] );
 379     add8x8_idct( &p_dst[8],               &dct[4] );
 380     add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
 381     add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
 382 }
 383
 384 /****************************************************************************
 385  * 8x8 transform:
 386  ****************************************************************************/
 387
 388 #define DCT8_1D {\
 389     int s07 = SRC(0) + SRC(7);\
 390     int s16 = SRC(1) + SRC(6);\
 391     int s25 = SRC(2) + SRC(5);\
 392     int s34 = SRC(3) + SRC(4);\
 393     int a0 = s07 + s34;\
 394     int a1 = s16 + s25;\
 395     int a2 = s07 - s34;\
 396     int a3 = s16 - s25;\
 397     int d07 = SRC(0) - SRC(7);\
 398     int d16 = SRC(1) - SRC(6);\
 399     int d25 = SRC(2) - SRC(5);\
 400     int d34 = SRC(3) - SRC(4);\
 401     int a4 = d16 + d25 + (d07 + (d07>>1));\
 402     int a5 = d07 - d34 - (d25 + (d25>>1));\
 403     int a6 = d07 + d34 - (d16 + (d16>>1));\
 404     int a7 = d16 - d25 + (d34 + (d34>>1));\
 405     DST(0) =  a0 + a1     ;\
 406     DST(1) =  a4 + (a7>>2);\
 407     DST(2) =  a2 + (a3>>1);\
 408     DST(3) =  a5 + (a6>>2);\
 409     DST(4) =  a0 - a1     ;\
 410     DST(5) =  a6 - (a5>>2);\
 411     DST(6) = (a2>>1) - a3 ;\
 412     DST(7) = (a4>>2) - a7 ;\
 413 }
 414
 415 static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
 416 {
 417     dctcoef tmp[64];
 418
 419     pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 420
 421 #define SRC(x) tmp[x*8+i]
 422 #define DST(x) tmp[x*8+i]
 423     for( int i = 0; i < 8; i++ )
 424         DCT8_1D
 425 #undef SRC
 426 #undef DST
 427
 428 #define SRC(x) tmp[i*8+x]
 429 #define DST(x) dct[x*8+i]
 430     for( int i = 0; i < 8; i++ )
 431         DCT8_1D
 432 #undef SRC
 433 #undef DST
 434 }
 435
 436 static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
 437 {
 438     sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
 439     sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
 440     sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 441     sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 442 }
 443
 444 #define IDCT8_1D {\
 445     int a0 =  SRC(0) + SRC(4);\
 446     int a2 =  SRC(0) - SRC(4);\
 447     int a4 = (SRC(2)>>1) - SRC(6);\
 448     int a6 = (SRC(6)>>1) + SRC(2);\
 449     int b0 = a0 + a6;\
 450     int b2 = a2 + a4;\
 451     int b4 = a2 - a4;\
 452     int b6 = a0 - a6;\
 453     int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
 454     int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
 455     int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
 456     int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
 457     int b1 = (a7>>2) + a1;\
 458     int b3 =  a3 + (a5>>2);\
 459     int b5 = (a3>>2) - a5;\
 460     int b7 =  a7 - (a1>>2);\
 461     DST(0, b0 + b7);\
 462     DST(1, b2 + b5);\
 463     DST(2, b4 + b3);\
 464     DST(3, b6 + b1);\
 465     DST(4, b6 - b1);\
 466     DST(5, b4 - b3);\
 467     DST(6, b2 - b5);\
 468     DST(7, b0 - b7);\
 469 }
 470
 471 static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
 472 {
 473     dct[0] += 32; // rounding for the >>6 at the end
 474
 475 #define SRC(x)     dct[x*8+i]
 476 #define DST(x,rhs) dct[x*8+i] = (rhs)
 477     for( int i = 0; i < 8; i++ )
 478         IDCT8_1D
 479 #undef SRC
 480 #undef DST
 481
 482 #define SRC(x)     dct[i*8+x]
 483 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
 484     for( int i = 0; i < 8; i++ )
 485         IDCT8_1D
 486 #undef SRC
 487 #undef DST
 488 }
 489
 490 static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
 491 {
 492     add8x8_idct8( &dst[0],               dct[0] );
 493     add8x8_idct8( &dst[8],               dct[1] );
 494     add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
 495     add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
 496 }
 497
 498 static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
 499 {
 500     dc = (dc + 32) >> 6;
 501     for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
 502     {
 503         p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
 504         p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
 505         p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
 506         p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
 507     }
 508 }
 509
 510 static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
 511 {
 512     add4x4_idct_dc( &p_dst[0],               dct[0] );
 513     add4x4_idct_dc( &p_dst[4],               dct[1] );
 514     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
 515     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 516 }
 517
 518 static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
 519 {
 520     for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
 521     {
 522         add4x4_idct_dc( &p_dst[ 0], dct[0] );
 523         add4x4_idct_dc( &p_dst[ 4], dct[1] );
 524         add4x4_idct_dc( &p_dst[ 8], dct[2] );
 525         add4x4_idct_dc( &p_dst[12], dct[3] );
 526     }
 527 }
 528
 529
 530 /****************************************************************************
 531  * x264_dct_init:
 532  ****************************************************************************/
 533 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
 534 {
 535     dctf->sub4x4_dct    = sub4x4_dct;
 536     dctf->add4x4_idct   = add4x4_idct;
 537
 538     dctf->sub8x8_dct    = sub8x8_dct;
 539     dctf->sub8x8_dct_dc = sub8x8_dct_dc;
 540     dctf->add8x8_idct   = add8x8_idct;
 541     dctf->add8x8_idct_dc = add8x8_idct_dc;
 542
 543     dctf->sub8x16_dct_dc = sub8x16_dct_dc;
 544
 545     dctf->sub16x16_dct  = sub16x16_dct;
 546     dctf->add16x16_idct = add16x16_idct;
 547     dctf->add16x16_idct_dc = add16x16_idct_dc;
 548
 549     dctf->sub8x8_dct8   = sub8x8_dct8;
 550     dctf->add8x8_idct8  = add8x8_idct8;
 551
 552     dctf->sub16x16_dct8  = sub16x16_dct8;
 553     dctf->add16x16_idct8 = add16x16_idct8;
 554
 555     dctf->dct4x4dc  = dct4x4dc;
 556     dctf->idct4x4dc = idct4x4dc;
 557
 558     dctf->dct2x4dc = dct2x4dc;
 559
 560 #if HIGH_BIT_DEPTH
 561 #if HAVE_MMX
 562     if( cpu&X264_CPU_MMX )
 563     {
 564         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
 565         dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
 566         dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
 567     }
 568     if( cpu&X264_CPU_SSE2 )
 569     {
 570         dctf->add4x4_idct     = x264_add4x4_idct_sse2;
 571         dctf->dct4x4dc        = x264_dct4x4dc_sse2;
 572         dctf->idct4x4dc       = x264_idct4x4dc_sse2;
 573         dctf->sub8x8_dct8     = x264_sub8x8_dct8_sse2;
 574         dctf->sub16x16_dct8   = x264_sub16x16_dct8_sse2;
 575         dctf->add8x8_idct     = x264_add8x8_idct_sse2;
 576         dctf->add16x16_idct   = x264_add16x16_idct_sse2;
 577         dctf->add8x8_idct8    = x264_add8x8_idct8_sse2;
 578         dctf->add16x16_idct8    = x264_add16x16_idct8_sse2;
 579         dctf->sub8x8_dct_dc   = x264_sub8x8_dct_dc_sse2;
 580         dctf->add8x8_idct_dc  = x264_add8x8_idct_dc_sse2;
 581         dctf->sub8x16_dct_dc  = x264_sub8x16_dct_dc_sse2;
 582         dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
 583     }
 584     if( cpu&X264_CPU_SSE4 )
 585     {
 586         dctf->sub8x8_dct8     = x264_sub8x8_dct8_sse4;
 587         dctf->sub16x16_dct8   = x264_sub16x16_dct8_sse4;
 588     }
 589     if( cpu&X264_CPU_AVX )
 590     {
 591         dctf->add4x4_idct     = x264_add4x4_idct_avx;
 592         dctf->dct4x4dc        = x264_dct4x4dc_avx;
 593         dctf->idct4x4dc       = x264_idct4x4dc_avx;
 594         dctf->sub8x8_dct8     = x264_sub8x8_dct8_avx;
 595         dctf->sub16x16_dct8   = x264_sub16x16_dct8_avx;
 596         dctf->add8x8_idct     = x264_add8x8_idct_avx;
 597         dctf->add16x16_idct   = x264_add16x16_idct_avx;
 598         dctf->add8x8_idct8    = x264_add8x8_idct8_avx;
 599         dctf->add16x16_idct8  = x264_add16x16_idct8_avx;
 600         dctf->add8x8_idct_dc  = x264_add8x8_idct_dc_avx;
 601         dctf->sub8x16_dct_dc  = x264_sub8x16_dct_dc_avx;
 602         dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
 603     }
 604 #endif // HAVE_MMX
 605 #else // !HIGH_BIT_DEPTH
 606 #if HAVE_MMX
 607     if( cpu&X264_CPU_MMX )
 608     {
 609         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
 610         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
 611         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
 612         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
 613         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
 614
 615 #if !ARCH_X86_64
 616         dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
 617         dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
 618         dctf->add8x8_idct   = x264_add8x8_idct_mmx;
 619         dctf->add16x16_idct = x264_add16x16_idct_mmx;
 620
 621         dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
 622         dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
 623         dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
 624         dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
 625 #endif
 626     }
 627
 628     if( cpu&X264_CPU_MMX2 )
 629     {
 630         dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_mmx2;
 631         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
 632     }
 633
 634     if( cpu&X264_CPU_SSE2 )
 635     {
 636         dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
 637         dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
 638         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
 639         dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
 640         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
 641         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 642
 643         if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
 644         {
 645             dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
 646             dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
 647             dctf->add8x8_idct   = x264_add8x8_idct_sse2;
 648             dctf->add16x16_idct = x264_add16x16_idct_sse2;
 649             dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
 650         }
 651     }
 652
 653     if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
 654     {
 655         dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
 656         if( !(cpu&X264_CPU_SLOW_ATOM) )
 657         {
 658             dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
 659             dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
 660             dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
 661             dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
 662             dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
 663             if( !(cpu&X264_CPU_SLOW_PSHUFB) )
 664             {
 665                 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
 666                 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
 667             }
 668         }
 669     }
 670
 671     if( cpu&X264_CPU_SSE4 )
 672         dctf->add4x4_idct   = x264_add4x4_idct_sse4;
 673
 674     if( cpu&X264_CPU_AVX )
 675     {
 676         dctf->add4x4_idct      = x264_add4x4_idct_avx;
 677         dctf->add8x8_idct      = x264_add8x8_idct_avx;
 678         dctf->add16x16_idct    = x264_add16x16_idct_avx;
 679         dctf->add8x8_idct8     = x264_add8x8_idct8_avx;
 680         dctf->add16x16_idct8   = x264_add16x16_idct8_avx;
 681         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
 682         dctf->sub8x8_dct       = x264_sub8x8_dct_avx;
 683         dctf->sub16x16_dct     = x264_sub16x16_dct_avx;
 684         dctf->sub8x8_dct8      = x264_sub8x8_dct8_avx;
 685         dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx;
 686     }
 687
 688     if( cpu&X264_CPU_XOP )
 689     {
 690         dctf->sub8x8_dct       = x264_sub8x8_dct_xop;
 691         dctf->sub16x16_dct     = x264_sub16x16_dct_xop;
 692     }
 693
 694     if( cpu&X264_CPU_AVX2 )
 695     {
 696         dctf->add8x8_idct      = x264_add8x8_idct_avx2;
 697         dctf->add16x16_idct    = x264_add16x16_idct_avx2;
 698         dctf->sub8x8_dct       = x264_sub8x8_dct_avx2;
 699         dctf->sub16x16_dct     = x264_sub16x16_dct_avx2;
 700         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
 701 #if ARCH_X86_64
 702         dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx2;
 703 #endif
 704     }
 705 #endif //HAVE_MMX
 706
 707 #if HAVE_ALTIVEC
 708     if( cpu&X264_CPU_ALTIVEC )
 709     {
 710         dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
 711         dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
 712         dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
 713
 714         dctf->add4x4_idct   = x264_add4x4_idct_altivec;
 715         dctf->add8x8_idct   = x264_add8x8_idct_altivec;
 716         dctf->add16x16_idct = x264_add16x16_idct_altivec;
 717
 718         dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
 719         dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
 720
 721         dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
 722         dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
 723     }
 724 #endif
 725
 726 #if HAVE_ARMV6
 727     if( cpu&X264_CPU_NEON )
 728     {
 729         dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
 730         dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
 731         dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
 732         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
 733         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
 734         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
 735         dctf->dct4x4dc      = x264_dct4x4dc_neon;
 736         dctf->idct4x4dc     = x264_idct4x4dc_neon;
 737
 738         dctf->add4x4_idct   = x264_add4x4_idct_neon;
 739         dctf->add8x8_idct   = x264_add8x8_idct_neon;
 740         dctf->add16x16_idct = x264_add16x16_idct_neon;
 741
 742         dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
 743         dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
 744
 745         dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
 746         dctf->add16x16_idct8= x264_add16x16_idct8_neon;
 747     }
 748 #endif
 749 #endif // HIGH_BIT_DEPTH
 750 }
 751
 752
 753 #define ZIG(i,y,x) level[i] = dct[x*8+y];
 754 #define ZIGZAG8_FRAME\
 755     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
 756     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
 757     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
 758     ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
 759     ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
 760     ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
 761     ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
 762     ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
 763     ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
 764     ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
 765     ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
 766     ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
 767     ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
 768     ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
 769     ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
 770     ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\
 771
 772 #define ZIGZAG8_FIELD\
 773     ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
 774     ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
 775     ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
 776     ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
 777     ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
 778     ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
 779     ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
 780     ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
 781     ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
 782     ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
 783     ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
 784     ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
 785     ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
 786     ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
 787     ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
 788     ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
 789
 790 #define ZIGZAG4_FRAME\
 791     ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
 792     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
 793     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
 794     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 795
 796 #define ZIGZAG4_FIELD\
 797     ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
 798     ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
 799     ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
 800     ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
 801
 802 static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
 803 {
 804     ZIGZAG8_FRAME
 805 }
 806
 807 static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
 808 {
 809     ZIGZAG8_FIELD
 810 }
 811
 812 #undef ZIG
 813 #define ZIG(i,y,x) level[i] = dct[x*4+y];
 814 #define ZIGDC(i,y,x) ZIG(i,y,x)
 815
 816 static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
 817 {
 818     ZIGZAG4_FRAME
 819 }
 820
 821 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
 822 {
 823     memcpy( level, dct, 2 * sizeof(dctcoef) );
 824     ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
 825     memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
 826 }
 827
 828 #undef ZIG
 829 #define ZIG(i,y,x) {\
 830     int oe = x+y*FENC_STRIDE;\
 831     int od = x+y*FDEC_STRIDE;\
 832     level[i] = p_src[oe] - p_dst[od];\
 833     nz |= level[i];\
 834 }
 835 #define COPY4x4\
 836     CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
 837     CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
 838     CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
 839     CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
 840 #define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
 841 #define COPY8x8\
 842     CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
 843     CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
 844     CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
 845     CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
 846     CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
 847     CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
 848     CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
 849     CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
 850
/* Compute the 4x4 residual p_src - p_dst and store it in level[] in the
 * order given by the ZIGZAG4_FRAME table (defined earlier in this file),
 * then run COPY4x4 on the same 4x4 area with p_dst as destination.
 * Returns 1 if any value accumulated into nz by the scan steps was
 * nonzero, 0 otherwise. */
static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    /* The differences must be taken before COPY4x4 touches p_dst. */
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}
 858
/* Compute the 4x4 residual p_src - p_dst and store it in level[] in the
 * order given by the ZIGZAG4_FIELD table (defined earlier in this file),
 * then run COPY4x4 on the same 4x4 area with p_dst as destination.
 * Returns 1 if any value accumulated into nz by the scan steps was
 * nonzero, 0 otherwise. */
static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    /* The differences must be taken before COPY4x4 touches p_dst. */
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}
 866
#undef ZIGDC
/* DC step for the zigzag_sub_4x4ac_* functions: the difference at (x,y)
 * is written to *dc instead of level[], and level[0] is set to 0.
 * Unlike ZIG, this step does not touch nz, so the DC value does not
 * influence the functions' return value. The index argument i is unused.
 * Expects p_src, p_dst, level and dc to be in scope at the expansion site. */
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}
 874
/* Like zigzag_sub_4x4_frame, but expanded with the ZIGDC definition that
 * routes the DC difference to *dc and zeroes level[0] (this presumes the
 * ZIGZAG4_FRAME table uses ZIGDC for its DC entry; the table is defined
 * earlier in this file). Returns 1 if any value accumulated into nz by the
 * ZIG steps was nonzero, 0 otherwise; the DC step does not contribute. */
static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    /* The differences must be taken before COPY4x4 touches p_dst. */
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}
 882
/* Like zigzag_sub_4x4_field, but expanded with the ZIGDC definition that
 * routes the DC difference to *dc and zeroes level[0] (this presumes the
 * ZIGZAG4_FIELD table uses ZIGDC for its DC entry; the table is defined
 * earlier in this file). Returns 1 if any value accumulated into nz by the
 * ZIG steps was nonzero, 0 otherwise; the DC step does not contribute. */
static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    /* The differences must be taken before COPY4x4 touches p_dst. */
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}
 890
/* Compute the 8x8 residual p_src - p_dst and store it in level[] in the
 * order given by the ZIGZAG8_FRAME table (defined earlier in this file),
 * then run COPY8x8 on the same 8x8 area with p_dst as destination.
 * Returns 1 if any value accumulated into nz by the scan steps was
 * nonzero, 0 otherwise. */
static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    /* The differences must be taken before COPY8x8 touches p_dst. */
    ZIGZAG8_FRAME
    COPY8x8
    return !!nz;
}
/* Compute the 8x8 residual p_src - p_dst and store it in level[] in the
 * order given by the ZIGZAG8_FIELD table (defined earlier in this file),
 * then run COPY8x8 on the same 8x8 area with p_dst as destination.
 * Returns 1 if any value accumulated into nz by the scan steps was
 * nonzero, 0 otherwise. */
static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    /* The differences must be taken before COPY8x8 touches p_dst. */
    ZIGZAG8_FIELD
    COPY8x8
    return !!nz;
}
 905
/* Retire the residual-scan helpers; the code that follows in this section
 * does not use them. */
#undef ZIG
#undef COPY4x4
 908
 909 static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
 910 {
 911     for( int i = 0; i < 4; i++ )
 912     {
 913         int nz = 0;
 914         for( int j = 0; j < 16; j++ )
 915         {
 916             nz |= src[i+j*4];
 917             dst[i*16+j] = src[i+j*4];
 918         }
 919         nnz[(i&1) + (i>>1)*8] = !!nz;
 920     }
 921 }
 922
 923 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
 924 {
 925     pf_interlaced->scan_8x8   = zigzag_scan_8x8_field;
 926     pf_progressive->scan_8x8  = zigzag_scan_8x8_frame;
 927     pf_interlaced->scan_4x4   = zigzag_scan_4x4_field;
 928     pf_progressive->scan_4x4  = zigzag_scan_4x4_frame;
 929     pf_interlaced->sub_8x8    = zigzag_sub_8x8_field;
 930     pf_progressive->sub_8x8   = zigzag_sub_8x8_frame;
 931     pf_interlaced->sub_4x4    = zigzag_sub_4x4_field;
 932     pf_progressive->sub_4x4   = zigzag_sub_4x4_frame;
 933     pf_interlaced->sub_4x4ac  = zigzag_sub_4x4ac_field;
 934     pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;
 935
 936 #if HIGH_BIT_DEPTH
 937 #if HAVE_MMX
 938     if( cpu&X264_CPU_SSE2 )
 939     {
 940         pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_sse2;
 941         pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
 942         pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
 943     }
 944     if( cpu&X264_CPU_SSE4 )
 945         pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
 946     if( cpu&X264_CPU_AVX )
 947         pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
 948 #if ARCH_X86_64
 949     if( cpu&X264_CPU_AVX )
 950     {
 951         pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
 952         pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
 953     }
 954 #endif // ARCH_X86_64
 955 #endif // HAVE_MMX
 956 #else
 957 #if HAVE_MMX
 958     if( cpu&X264_CPU_MMX )
 959         pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
 960     if( cpu&X264_CPU_MMX2 )
 961     {
 962         pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_mmx2;
 963         pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_mmx2;
 964         pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
 965     }
 966     if( cpu&X264_CPU_SSE2_IS_FAST )
 967         pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
 968     if( cpu&X264_CPU_SSSE3 )
 969     {
 970         pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_ssse3;
 971         pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
 972         pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
 973         pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
 974         pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
 975         if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
 976             pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
 977     }
 978     if( cpu&X264_CPU_AVX )
 979     {
 980         pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_avx;
 981         pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_avx;
 982 #if ARCH_X86_64
 983         pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
 984         pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
 985 #endif
 986         pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
 987     }
 988     if( cpu&X264_CPU_XOP )
 989     {
 990         pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
 991         pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
 992         pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop;
 993     }
 994 #endif // HAVE_MMX
 995 #if HAVE_ALTIVEC
 996     if( cpu&X264_CPU_ALTIVEC )
 997     {
 998         pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_altivec;
 999         pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
1000     }
1001 #endif
1002 #if HAVE_ARMV6
1003     if( cpu&X264_CPU_NEON )
1004         pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
1005 #endif
1006 #endif // HIGH_BIT_DEPTH
1007
1008     pf_interlaced->interleave_8x8_cavlc =
1009     pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
1010 #if HAVE_MMX
1011 #if HIGH_BIT_DEPTH
1012     if( cpu&X264_CPU_SSE2 )
1013     {
1014         pf_interlaced->interleave_8x8_cavlc =
1015         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
1016     }
1017     if( cpu&X264_CPU_AVX )
1018     {
1019         pf_interlaced->interleave_8x8_cavlc =
1020         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
1021     }
1022 #else
1023     if( cpu&X264_CPU_MMX )
1024     {
1025         pf_interlaced->interleave_8x8_cavlc =
1026         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
1027     }
1028     if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
1029     {
1030         pf_interlaced->interleave_8x8_cavlc =
1031         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
1032     }
1033
1034     if( cpu&X264_CPU_AVX )
1035     {
1036         pf_interlaced->interleave_8x8_cavlc =
1037         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
1038     }
1039
1040     if( cpu&X264_CPU_AVX2 )
1041     {
1042         pf_interlaced->interleave_8x8_cavlc =
1043         pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
1044     }
1045 #endif // HIGH_BIT_DEPTH
1046 #endif
1047 }