git.sesse.net Git - x264/blob - common/dct.c

   1 /*****************************************************************************
   2  * dct.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Loren Merritt <lorenm@u.washington.edu>
   7  *          Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include "common.h"
  25 #if HAVE_MMX
  26 #   include "x86/dct.h"
  27 #endif
  28 #if ARCH_PPC
  29 #   include "ppc/dct.h"
  30 #endif
  31 #if ARCH_ARM
  32 #   include "arm/dct.h"
  33 #endif
  34
  /* Weight tables for 4x4 and 8x8 blocks, permuted into scan order.
   * Row j holds x264_dct{4,8}_weight2_tab reordered through row j of
   * x264_zigzag_scan{4,8}; both rows are filled by x264_dct_init_weights(). */
  35 int x264_dct4_weight2_zigzag[2][16];
  36 int x264_dct8_weight2_zigzag[2][64];
  37
  38 static void dct4x4dc( dctcoef d[16] )
  39 {
  40     dctcoef tmp[16];
  41
  42     for( int i = 0; i < 4; i++ )
  43     {
  44         int s01 = d[i*4+0] + d[i*4+1];
  45         int d01 = d[i*4+0] - d[i*4+1];
  46         int s23 = d[i*4+2] + d[i*4+3];
  47         int d23 = d[i*4+2] - d[i*4+3];
  48
  49         tmp[0*4+i] = s01 + s23;
  50         tmp[1*4+i] = s01 - s23;
  51         tmp[2*4+i] = d01 - d23;
  52         tmp[3*4+i] = d01 + d23;
  53     }
  54
  55     for( int i = 0; i < 4; i++ )
  56     {
  57         int s01 = tmp[i*4+0] + tmp[i*4+1];
  58         int d01 = tmp[i*4+0] - tmp[i*4+1];
  59         int s23 = tmp[i*4+2] + tmp[i*4+3];
  60         int d23 = tmp[i*4+2] - tmp[i*4+3];
  61
  62         d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
  63         d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
  64         d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
  65         d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
  66     }
  67 }
  68
  69 static void idct4x4dc( dctcoef d[16] )
  70 {
  71     dctcoef tmp[16];
  72
  73     for( int i = 0; i < 4; i++ )
  74     {
  75         int s01 = d[i*4+0] + d[i*4+1];
  76         int d01 = d[i*4+0] - d[i*4+1];
  77         int s23 = d[i*4+2] + d[i*4+3];
  78         int d23 = d[i*4+2] - d[i*4+3];
  79
  80         tmp[0*4+i] = s01 + s23;
  81         tmp[1*4+i] = s01 - s23;
  82         tmp[2*4+i] = d01 - d23;
  83         tmp[3*4+i] = d01 + d23;
  84     }
  85
  86     for( int i = 0; i < 4; i++ )
  87     {
  88         int s01 = tmp[i*4+0] + tmp[i*4+1];
  89         int d01 = tmp[i*4+0] - tmp[i*4+1];
  90         int s23 = tmp[i*4+2] + tmp[i*4+3];
  91         int d23 = tmp[i*4+2] - tmp[i*4+3];
  92
  93         d[i*4+0] = s01 + s23;
  94         d[i*4+1] = s01 - s23;
  95         d[i*4+2] = d01 - d23;
  96         d[i*4+3] = d01 + d23;
  97     }
  98 }
  99
 100 static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
 101                                   pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
 102 {
 103     for( int y = 0; y < i_size; y++ )
 104     {
 105         for( int x = 0; x < i_size; x++ )
 106             diff[x + y*i_size] = pix1[x] - pix2[x];
 107         pix1 += i_pix1;
 108         pix2 += i_pix2;
 109     }
 110 }
 111
 112 static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
 113 {
 114     dctcoef d[16];
 115     dctcoef tmp[16];
 116
 117     pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 118
 119     for( int i = 0; i < 4; i++ )
 120     {
 121         int s03 = d[i*4+0] + d[i*4+3];
 122         int s12 = d[i*4+1] + d[i*4+2];
 123         int d03 = d[i*4+0] - d[i*4+3];
 124         int d12 = d[i*4+1] - d[i*4+2];
 125
 126         tmp[0*4+i] =   s03 +   s12;
 127         tmp[1*4+i] = 2*d03 +   d12;
 128         tmp[2*4+i] =   s03 -   s12;
 129         tmp[3*4+i] =   d03 - 2*d12;
 130     }
 131
 132     for( int i = 0; i < 4; i++ )
 133     {
 134         int s03 = tmp[i*4+0] + tmp[i*4+3];
 135         int s12 = tmp[i*4+1] + tmp[i*4+2];
 136         int d03 = tmp[i*4+0] - tmp[i*4+3];
 137         int d12 = tmp[i*4+1] - tmp[i*4+2];
 138
 139         dct[i*4+0] =   s03 +   s12;
 140         dct[i*4+1] = 2*d03 +   d12;
 141         dct[i*4+2] =   s03 -   s12;
 142         dct[i*4+3] =   d03 - 2*d12;
 143     }
 144 }
 145
 146 static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
 147 {
 148     sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
 149     sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
 150     sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 151     sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 152 }
 153
 154 static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
 155 {
 156     sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
 157     sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
 158     sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 159     sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 160 }
 161
 162 static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
 163 {
 164     dctcoef d[16];
 165     int sum = 0;
 166
 167     pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 168
 169     sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
 170     sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
 171
 172     return sum;
 173 }
 174
 175 static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
 176 {
 177     dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
 178     dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
 179     dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 180     dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 181
 182     /* 2x2 DC transform */
 183     int d0 = dct[0] + dct[1];
 184     int d1 = dct[2] + dct[3];
 185     int d2 = dct[0] - dct[1];
 186     int d3 = dct[2] - dct[3];
 187     dct[0] = d0 + d1;
 188     dct[2] = d2 + d3;
 189     dct[1] = d0 - d1;
 190     dct[3] = d2 - d3;
 191 }
 192
 193 static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
 194 {
 195     dctcoef d[16];
 196     dctcoef tmp[16];
 197
 198     for( int i = 0; i < 4; i++ )
 199     {
 200         int s02 =  dct[0*4+i]     +  dct[2*4+i];
 201         int d02 =  dct[0*4+i]     -  dct[2*4+i];
 202         int s13 =  dct[1*4+i]     + (dct[3*4+i]>>1);
 203         int d13 = (dct[1*4+i]>>1) -  dct[3*4+i];
 204
 205         tmp[i*4+0] = s02 + s13;
 206         tmp[i*4+1] = d02 + d13;
 207         tmp[i*4+2] = d02 - d13;
 208         tmp[i*4+3] = s02 - s13;
 209     }
 210
 211     for( int i = 0; i < 4; i++ )
 212     {
 213         int s02 =  tmp[0*4+i]     +  tmp[2*4+i];
 214         int d02 =  tmp[0*4+i]     -  tmp[2*4+i];
 215         int s13 =  tmp[1*4+i]     + (tmp[3*4+i]>>1);
 216         int d13 = (tmp[1*4+i]>>1) -  tmp[3*4+i];
 217
 218         d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
 219         d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
 220         d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
 221         d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
 222     }
 223
 224
 225     for( int y = 0; y < 4; y++ )
 226     {
 227         for( int x = 0; x < 4; x++ )
 228             p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
 229         p_dst += FDEC_STRIDE;
 230     }
 231 }
 232
 233 static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
 234 {
 235     add4x4_idct( &p_dst[0],               dct[0] );
 236     add4x4_idct( &p_dst[4],               dct[1] );
 237     add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
 238     add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 239 }
 240
 241 static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
 242 {
 243     add8x8_idct( &p_dst[0],               &dct[0] );
 244     add8x8_idct( &p_dst[8],               &dct[4] );
 245     add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
 246     add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
 247 }
 248
 249 /****************************************************************************
 250  * 8x8 transform:
 251  ****************************************************************************/
 252
 /* One 8-point forward transform pass.  The expansion site must define:
  *   SRC(n) - expression reading input sample n (0..7)
  *   DST(n) - lvalue receiving output coefficient n (0..7)
  * Every SRC() read happens before the first DST() store, so SRC and DST may
  * name the same storage (sub8x8_dct8 relies on this for its first pass).
  * The expansion is a braced block, keeping the temporaries local to each use;
  * no trailing ';' is needed after it. */
 253 #define DCT8_1D {\
 254     int s07 = SRC(0) + SRC(7);\
 255     int s16 = SRC(1) + SRC(6);\
 256     int s25 = SRC(2) + SRC(5);\
 257     int s34 = SRC(3) + SRC(4);\
 258     int a0 = s07 + s34;\
 259     int a1 = s16 + s25;\
 260     int a2 = s07 - s34;\
 261     int a3 = s16 - s25;\
 262     int d07 = SRC(0) - SRC(7);\
 263     int d16 = SRC(1) - SRC(6);\
 264     int d25 = SRC(2) - SRC(5);\
 265     int d34 = SRC(3) - SRC(4);\
 266     int a4 = d16 + d25 + (d07 + (d07>>1));\
 267     int a5 = d07 - d34 - (d25 + (d25>>1));\
 268     int a6 = d07 + d34 - (d16 + (d16>>1));\
 269     int a7 = d16 - d25 + (d34 + (d34>>1));\
 270     DST(0) =  a0 + a1     ;\
 271     DST(1) =  a4 + (a7>>2);\
 272     DST(2) =  a2 + (a3>>1);\
 273     DST(3) =  a5 + (a6>>2);\
 274     DST(4) =  a0 - a1     ;\
 275     DST(5) =  a6 - (a5>>2);\
 276     DST(6) = (a2>>1) - a3 ;\
 277     DST(7) = (a4>>2) - a7 ;\
 278 }
 279
 280 static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
 281 {
 282     dctcoef tmp[64];
 283
 284     pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 285
 286 #define SRC(x) tmp[x*8+i]
 287 #define DST(x) tmp[x*8+i]
 288     for( int i = 0; i < 8; i++ )
 289         DCT8_1D
 290 #undef SRC
 291 #undef DST
 292
 293 #define SRC(x) tmp[i*8+x]
 294 #define DST(x) dct[x*8+i]
 295     for( int i = 0; i < 8; i++ )
 296         DCT8_1D
 297 #undef SRC
 298 #undef DST
 299 }
 300
 301 static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
 302 {
 303     sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
 304     sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
 305     sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 306     sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 307 }
 308
 /* One 8-point inverse transform pass.  The expansion site must define:
  *   SRC(n)        - expression reading input coefficient n (0..7)
  *   DST(n, value) - stores value as output sample n (0..7)
  * Every SRC() read happens before the first DST() store, so the output may
  * overwrite the input (add8x8_idct8 relies on this for its first pass).
  * The expansion is a braced block, keeping the temporaries local to each use;
  * no trailing ';' is needed after it. */
 309 #define IDCT8_1D {\
 310     int a0 =  SRC(0) + SRC(4);\
 311     int a2 =  SRC(0) - SRC(4);\
 312     int a4 = (SRC(2)>>1) - SRC(6);\
 313     int a6 = (SRC(6)>>1) + SRC(2);\
 314     int b0 = a0 + a6;\
 315     int b2 = a2 + a4;\
 316     int b4 = a2 - a4;\
 317     int b6 = a0 - a6;\
 318     int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
 319     int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
 320     int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
 321     int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
 322     int b1 = (a7>>2) + a1;\
 323     int b3 =  a3 + (a5>>2);\
 324     int b5 = (a3>>2) - a5;\
 325     int b7 =  a7 - (a1>>2);\
 326     DST(0, b0 + b7);\
 327     DST(1, b2 + b5);\
 328     DST(2, b4 + b3);\
 329     DST(3, b6 + b1);\
 330     DST(4, b6 - b1);\
 331     DST(5, b4 - b3);\
 332     DST(6, b2 - b5);\
 333     DST(7, b0 - b7);\
 334 }
 335
 336 static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
 337 {
 338     dct[0] += 32; // rounding for the >>6 at the end
 339
 340 #define SRC(x)     dct[x*8+i]
 341 #define DST(x,rhs) dct[x*8+i] = (rhs)
 342     for( int i = 0; i < 8; i++ )
 343         IDCT8_1D
 344 #undef SRC
 345 #undef DST
 346
 347 #define SRC(x)     dct[i*8+x]
 348 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
 349     for( int i = 0; i < 8; i++ )
 350         IDCT8_1D
 351 #undef SRC
 352 #undef DST
 353 }
 354
 355 static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
 356 {
 357     add8x8_idct8( &dst[0],               dct[0] );
 358     add8x8_idct8( &dst[8],               dct[1] );
 359     add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
 360     add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
 361 }
 362
 363 static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
 364 {
 365     dc = (dc + 32) >> 6;
 366     for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
 367     {
 368         p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
 369         p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
 370         p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
 371         p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
 372     }
 373 }
 374
 375 static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
 376 {
 377     add4x4_idct_dc( &p_dst[0],               dct[0] );
 378     add4x4_idct_dc( &p_dst[4],               dct[1] );
 379     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
 380     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 381 }
 382
 383 static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
 384 {
 385     for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
 386     {
 387         add4x4_idct_dc( &p_dst[ 0], dct[0] );
 388         add4x4_idct_dc( &p_dst[ 4], dct[1] );
 389         add4x4_idct_dc( &p_dst[ 8], dct[2] );
 390         add4x4_idct_dc( &p_dst[12], dct[3] );
 391     }
 392 }
 393
 394
 395 /****************************************************************************
 396  * x264_dct_init:
 397  ****************************************************************************/
 /* Fills *dctf with transform function pointers.
  *
  * Every entry is first set to the C implementation defined in this file.
  * Unless X264_HIGH_BIT_DEPTH is set, entries are then replaced by
  * architecture-specific versions according to the flag bits in cpu.
  * The checks run in a fixed order and each one overwrites earlier
  * assignments, so when several flags are set the last matching block wins
  * for any entry it assigns. */
 398 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
 399 {
 /* Baseline: C implementations from this file. */
 400     dctf->sub4x4_dct    = sub4x4_dct;
 401     dctf->add4x4_idct   = add4x4_idct;
 402
 403     dctf->sub8x8_dct    = sub8x8_dct;
 404     dctf->sub8x8_dct_dc = sub8x8_dct_dc;
 405     dctf->add8x8_idct   = add8x8_idct;
 406     dctf->add8x8_idct_dc = add8x8_idct_dc;
 407
 408     dctf->sub16x16_dct  = sub16x16_dct;
 409     dctf->add16x16_idct = add16x16_idct;
 410     dctf->add16x16_idct_dc = add16x16_idct_dc;
 411
 412     dctf->sub8x8_dct8   = sub8x8_dct8;
 413     dctf->add8x8_idct8  = add8x8_idct8;
 414
 415     dctf->sub16x16_dct8  = sub16x16_dct8;
 416     dctf->add16x16_idct8 = add16x16_idct8;
 417
 418     dctf->dct4x4dc  = dct4x4dc;
 419     dctf->idct4x4dc = idct4x4dc;
 420
 /* All overrides below are compiled out for high-bit-depth builds. */
 421 #if !X264_HIGH_BIT_DEPTH
 422 #if HAVE_MMX
 423     if( cpu&X264_CPU_MMX )
 424     {
 425         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
 426         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
 427         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
 428         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
 429         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
 430         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
 /* NOTE(review): an _mmxext routine is selected under the plain MMX flag;
  * presumably MMXEXT is guaranteed whenever MMX is reported - confirm. */
 431         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
 432
 /* The remaining MMX versions are only selected on non-x86-64 builds. */
 433 #if !ARCH_X86_64
 434         dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
 435         dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
 436         dctf->add8x8_idct   = x264_add8x8_idct_mmx;
 437         dctf->add16x16_idct = x264_add16x16_idct_mmx;
 438
 439         dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
 440         dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
 441         dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
 442         dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
 443 #endif
 444     }
 445
 446     if( cpu&X264_CPU_SSE2 )
 447     {
 448         dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
 449         dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
 450         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
 451         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
 452         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 453
 454         dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
 455         dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
 456         dctf->add8x8_idct   = x264_add8x8_idct_sse2;
 457         dctf->add16x16_idct = x264_add16x16_idct_sse2;
 458         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
 459     }
 460
 /* SSSE3 versions are skipped when the SLOW_ATOM flag is set. */
 461     if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
 462     {
 463         dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
 464         dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
 465         dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
 466         dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
 467         dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
 468         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
 469         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
 470     }
 471
 472     if( cpu&X264_CPU_SSE4 )
 473         dctf->add4x4_idct   = x264_add4x4_idct_sse4;
 474
 475 #endif //HAVE_MMX
 476
 477 #if HAVE_ALTIVEC
 478     if( cpu&X264_CPU_ALTIVEC )
 479     {
 480         dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
 481         dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
 482         dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
 483
 484         dctf->add4x4_idct   = x264_add4x4_idct_altivec;
 485         dctf->add8x8_idct   = x264_add8x8_idct_altivec;
 486         dctf->add16x16_idct = x264_add16x16_idct_altivec;
 487
 488         dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
 489         dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
 490
 491         dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
 492         dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
 493     }
 494 #endif
 495
 496 #if HAVE_ARMV6
 497     if( cpu&X264_CPU_NEON )
 498     {
 499         dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
 500         dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
 501         dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
 502         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
 503         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
 504         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
 505         dctf->dct4x4dc      = x264_dct4x4dc_neon;
 506         dctf->idct4x4dc     = x264_idct4x4dc_neon;
 507
 508         dctf->add4x4_idct   = x264_add4x4_idct_neon;
 509         dctf->add8x8_idct   = x264_add8x8_idct_neon;
 510         dctf->add16x16_idct = x264_add16x16_idct_neon;
 511
 512         dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
 513         dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
 514
 515         dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
 516         dctf->add16x16_idct8= x264_add16x16_idct8_neon;
 517     }
 518 #endif
 519 #endif // !X264_HIGH_BIT_DEPTH
 520 }
 521
 522 void x264_dct_init_weights( void )
 523 {
 524     for( int j = 0; j < 2; j++ )
 525     {
 526         for( int i = 0; i < 16; i++ )
 527             x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
 528         for( int i = 0; i < 64; i++ )
 529             x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
 530     }
 531 }
 532
 533
 /* ZIG(i,y,x): copy the 8x8 coefficient stored at dct[x*8+y] to scan position
  * i of level[].  The trailing ';' is part of the macro because the tables
  * below list invocations back to back with no separator. */
 #define ZIG(i,y,x) level[i] = dct[(x)*8+(y)];

 /* 8x8 frame scan table: one ZIG(i,y,x) per output position i = 0..63.
  * It is expanded with whichever ZIG definition is current at the point of
  * use, so the same table serves both the plain scan and the residual
  * ("sub") variants.
  * The last line deliberately has no trailing backslash: a continuation there
  * would splice the following source line into the macro body. */
 #define ZIGZAG8_FRAME\
     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
     ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
     ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
     ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
     ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
     ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
     ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
     ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
     ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
     ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
     ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
     ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
     ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
     ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
 552
 /* 8x8 field scan table: one ZIG(i,y,x) per output position i = 0..63.
  * Expanded with whichever ZIG definition is current at the point of use. */
 553 #define ZIGZAG8_FIELD\
 554     ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
 555     ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
 556     ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
 557     ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
 558     ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
 559     ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
 560     ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
 561     ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
 562     ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
 563     ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
 564     ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
 565     ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
 566     ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
 567     ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
 568     ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
 569     ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
 570
 /* 4x4 frame scan table: one entry per output position i = 0..15.
  * Position 0 goes through ZIGDC rather than ZIG, so an expansion site can
  * give the first coefficient different handling by defining ZIGDC
  * separately; both macros must be defined before this table is expanded. */
 571 #define ZIGZAG4_FRAME\
 572     ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
 573     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
 574     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
 575     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 576
 /* 4x4 field scan table: one entry per output position i = 0..15.
  * Position 0 goes through ZIGDC rather than ZIG, so an expansion site can
  * give the first coefficient different handling by defining ZIGDC
  * separately; both macros must be defined before this table is expanded. */
 577 #define ZIGZAG4_FIELD\
 578     ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
 579     ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
 580     ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
 581     ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
 582
 /* Copies the 64 coefficients of dct[] into level[] in the order given by the
  * ZIGZAG8_FRAME table.  Relies on the 8x8 scan definition of ZIG being the
  * one in effect here (level[i] = dct[x*8+y]). */
 583 static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
 584 {
 585     ZIGZAG8_FRAME
 586 }
 587
 /* Copies the 64 coefficients of dct[] into level[] in the order given by the
  * ZIGZAG8_FIELD table.  Relies on the 8x8 scan definition of ZIG being the
  * one in effect here (level[i] = dct[x*8+y]). */
 588 static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
 589 {
 590     ZIGZAG8_FIELD
 591 }
 592
 /* Switch ZIG to the 4x4 layout: scan position i takes dct[x*4+y].
  * For the plain scan functions the first coefficient needs no special
  * handling, so ZIGDC simply forwards to ZIG. */
 593 #undef ZIG
 594 #define ZIG(i,y,x) level[i] = dct[x*4+y];
 595 #define ZIGDC(i,y,x) ZIG(i,y,x)
 596
 /* Copies the 16 coefficients of dct[] into level[] in the order given by the
  * ZIGZAG4_FRAME table.  Relies on the 4x4 scan definitions of ZIG and ZIGDC
  * being the ones in effect here (level[i] = dct[x*4+y]). */
 597 static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
 598 {
 599     ZIGZAG4_FRAME
 600 }
 601
 602 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
 603 {
 604     memcpy( level, dct, 2 * sizeof(dctcoef) );
 605     ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
 606     memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
 607 }
 608
 /* Switch ZIG to the residual ("sub") form used by the zigzag_sub_* functions:
  * scan position i receives p_src[x + y*FENC_STRIDE] - p_dst[x + y*FDEC_STRIDE],
  * and the value is OR-ed into nz so the caller can tell whether any residual
  * is non-zero.  The expansion site must provide level, p_src, p_dst and an
  * int named nz. */
 609 #undef ZIG
 610 #define ZIG(i,y,x) {\
 611     int oe = x+y*FENC_STRIDE;\
 612     int od = x+y*FDEC_STRIDE;\
 613     level[i] = p_src[oe] - p_dst[od];\
 614     nz |= level[i];\
 615 }
 /* Row-copy helpers for the zigzag_sub_* functions.  Each applies CPPIXEL_X4
  * (defined elsewhere; presumably copies 4 pixels from its second argument to
  * its first - confirm) to every row of the block, with p_dst advancing by
  * FDEC_STRIDE and p_src by FENC_STRIDE per row.
  *   COPY4x4    - 4 rows of 4 pixels
  *   CPPIXEL_X8 - two CPPIXEL_X4 on adjacent 4-pixel halves of one row
  *   COPY8x8    - 8 rows of 8 pixels */
 616 #define COPY4x4\
 617     CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
 618     CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
 619     CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
 620     CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
 621 #define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
 622 #define COPY8x8\
 623     CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
 624     CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
 625     CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
 626     CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
 627     CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
 628     CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
 629     CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
 630     CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
 631
 632 static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
 633 {
 634     int nz = 0;
 635     ZIGZAG4_FRAME
 636     COPY4x4
 637     return !!nz;
 638 }
 639
 640 static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
 641 {
 642     int nz = 0;
 643     ZIGZAG4_FIELD
 644     COPY4x4
 645     return !!nz;
 646 }
 647
 /* Switch ZIGDC to the form used by the zigzag_sub_4x4ac_* functions: the
  * residual at the table's first position is written to *dc instead of
  * level[], level[0] is set to 0, and the value is NOT OR-ed into nz.
  * The macro's i argument is unused; level[0] is addressed directly. */
 648 #undef ZIGDC
 649 #define ZIGDC(i,y,x) {\
 650     int oe = x+y*FENC_STRIDE;\
 651     int od = x+y*FDEC_STRIDE;\
 652     *dc = p_src[oe] - p_dst[od];\
 653     level[0] = 0;\
 654 }
 655
 656 static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
 657 {
 658     int nz = 0;
 659     ZIGZAG4_FRAME
 660     COPY4x4
 661     return !!nz;
 662 }
 663
 664 static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
 665 {
 666     int nz = 0;
 667     ZIGZAG4_FIELD
 668     COPY4x4
 669     return !!nz;
 670 }
 671
 672 static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
 673 {
 674     int nz = 0;
 675     ZIGZAG8_FRAME
 676     COPY8x8
 677     return !!nz;
 678 }
 679 static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
 680 {
 681     int nz = 0;
 682     ZIGZAG8_FIELD
 683     COPY8x8
 684     return !!nz;
 685 }
 686
 /* The zigzag helper macros are no longer needed past this point.
  * NOTE(review): ZIGDC, COPY8x8 and CPPIXEL_X8 are left defined here; nothing
  * visible below uses them - consider undefining them too. */
 687 #undef ZIG
 688 #undef COPY4x4
 689
 690 static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
 691 {
 692     for( int i = 0; i < 4; i++ )
 693     {
 694         int nz = 0;
 695         for( int j = 0; j < 16; j++ )
 696         {
 697             nz |= src[i+j*4];
 698             dst[i*16+j] = src[i+j*4];
 699         }
 700         nnz[(i&1) + (i>>1)*8] = !!nz;
 701     }
 702 }
 703
 /* Fills *pf with zigzag function pointers.
  *
  * scan_8x8, scan_4x4, sub_8x8, sub_4x4 and sub_4x4ac are set to the *_field
  * C routines when b_interlaced is non-zero and to the *_frame C routines
  * otherwise.  interleave_8x8_cavlc is set to the C routine regardless of
  * b_interlaced.  Unless X264_HIGH_BIT_DEPTH is set, entries are then
  * replaced by architecture-specific versions according to the flag bits in
  * cpu; the checks run in order and later ones overwrite earlier ones. */
 704 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
 705 {
 706     if( b_interlaced )
 707     {
 708         pf->scan_8x8   = zigzag_scan_8x8_field;
 709         pf->scan_4x4   = zigzag_scan_4x4_field;
 710         pf->sub_8x8    = zigzag_sub_8x8_field;
 711         pf->sub_4x4    = zigzag_sub_4x4_field;
 712         pf->sub_4x4ac  = zigzag_sub_4x4ac_field;
 713 #if !X264_HIGH_BIT_DEPTH
 714 #if HAVE_MMX
 715         if( cpu&X264_CPU_MMXEXT )
 716         {
 717             pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
 718             pf->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
 719         }
 720         if( cpu&X264_CPU_SSSE3 )
 721         {
 722             pf->sub_4x4  = x264_zigzag_sub_4x4_field_ssse3;
 723             pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
 724         }
 725 #endif
 726
 727 #if HAVE_ALTIVEC
 728         if( cpu&X264_CPU_ALTIVEC )
 729             pf->scan_4x4   = x264_zigzag_scan_4x4_field_altivec;
 730 #endif
 731 #endif // !X264_HIGH_BIT_DEPTH
 732     }
 733     else
 734     {
 735         pf->scan_8x8   = zigzag_scan_8x8_frame;
 736         pf->scan_4x4   = zigzag_scan_4x4_frame;
 737         pf->sub_8x8    = zigzag_sub_8x8_frame;
 738         pf->sub_4x4    = zigzag_sub_4x4_frame;
 739         pf->sub_4x4ac  = zigzag_sub_4x4ac_frame;
 740 #if !X264_HIGH_BIT_DEPTH
 741 #if HAVE_MMX
 /* scan_8x8 may be assigned up to three times below; the last matching
  * check (MMXEXT, then SSE2_IS_FAST, then SSSE3) decides. */
 742         if( cpu&X264_CPU_MMX )
 743             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
 744         if( cpu&X264_CPU_MMXEXT )
 745             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
 746         if( cpu&X264_CPU_SSE2_IS_FAST )
 747             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
 748         if( cpu&X264_CPU_SSSE3 )
 749         {
 750             pf->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
 751             pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
 752             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
 753             if( cpu&X264_CPU_SHUFFLE_IS_FAST )
 754                 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
 755         }
 756 #endif
 757
 758 #if HAVE_ALTIVEC
 759         if( cpu&X264_CPU_ALTIVEC )
 760             pf->scan_4x4   = x264_zigzag_scan_4x4_frame_altivec;
 761 #endif
 762 #if HAVE_ARMV6
 763         if( cpu&X264_CPU_NEON )
 764             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
 765 #endif
 766 #endif // !X264_HIGH_BIT_DEPTH
 767     }
 768
 /* The CAVLC interleave routine does not depend on b_interlaced. */
 769     pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
 770 #if !X264_HIGH_BIT_DEPTH
 771 #if HAVE_MMX
 772     if( cpu&X264_CPU_MMX )
 773         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
 /* NOTE(review): the _sse2 routine is gated on SHUFFLE_IS_FAST alone;
  * presumably that flag implies SSE2 - confirm. */
 774     if( cpu&X264_CPU_SHUFFLE_IS_FAST )
 775         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
 776 #endif
 777 #endif // !X264_HIGH_BIT_DEPTH
 778 }