git.sesse.net Git - x264/blob - common/dct.c

   1 /*****************************************************************************
   2  * dct.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Loren Merritt <lorenm@u.washington.edu>
   7  *          Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include "common.h"
  25 #ifdef HAVE_MMX
  26 #   include "x86/dct.h"
  27 #endif
  28 #ifdef ARCH_PPC
  29 #   include "ppc/dct.h"
  30 #endif
  31 #ifdef ARCH_ARM
  32 #   include "arm/dct.h"
  33 #endif
  34
  35 int x264_dct4_weight2_zigzag[2][16]; /* [j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ]; filled by x264_dct_init_weights() */
  36 int x264_dct8_weight2_zigzag[2][64]; /* [j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ]; filled by x264_dct_init_weights() */
  37
  38 static void dct4x4dc( int16_t d[16] )
  39 {
  40     int16_t tmp[16];
  41
  42     for( int i = 0; i < 4; i++ )
  43     {
  44         int s01 = d[i*4+0] + d[i*4+1];
  45         int d01 = d[i*4+0] - d[i*4+1];
  46         int s23 = d[i*4+2] + d[i*4+3];
  47         int d23 = d[i*4+2] - d[i*4+3];
  48
  49         tmp[0*4+i] = s01 + s23;
  50         tmp[1*4+i] = s01 - s23;
  51         tmp[2*4+i] = d01 - d23;
  52         tmp[3*4+i] = d01 + d23;
  53     }
  54
  55     for( int i = 0; i < 4; i++ )
  56     {
  57         int s01 = tmp[i*4+0] + tmp[i*4+1];
  58         int d01 = tmp[i*4+0] - tmp[i*4+1];
  59         int s23 = tmp[i*4+2] + tmp[i*4+3];
  60         int d23 = tmp[i*4+2] - tmp[i*4+3];
  61
  62         d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
  63         d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
  64         d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
  65         d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
  66     }
  67 }
  68
  69 static void idct4x4dc( int16_t d[16] )
  70 {
  71     int16_t tmp[16];
  72
  73     for( int i = 0; i < 4; i++ )
  74     {
  75         int s01 = d[i*4+0] + d[i*4+1];
  76         int d01 = d[i*4+0] - d[i*4+1];
  77         int s23 = d[i*4+2] + d[i*4+3];
  78         int d23 = d[i*4+2] - d[i*4+3];
  79
  80         tmp[0*4+i] = s01 + s23;
  81         tmp[1*4+i] = s01 - s23;
  82         tmp[2*4+i] = d01 - d23;
  83         tmp[3*4+i] = d01 + d23;
  84     }
  85
  86     for( int i = 0; i < 4; i++ )
  87     {
  88         int s01 = tmp[i*4+0] + tmp[i*4+1];
  89         int d01 = tmp[i*4+0] - tmp[i*4+1];
  90         int s23 = tmp[i*4+2] + tmp[i*4+3];
  91         int d23 = tmp[i*4+2] - tmp[i*4+3];
  92
  93         d[i*4+0] = s01 + s23;
  94         d[i*4+1] = s01 - s23;
  95         d[i*4+2] = d01 - d23;
  96         d[i*4+3] = d01 + d23;
  97     }
  98 }
  99
 100 static inline void pixel_sub_wxh( int16_t *diff, int i_size,
 101                                   uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
 102 {
 103     for( int y = 0; y < i_size; y++ )
 104     {
 105         for( int x = 0; x < i_size; x++ )
 106             diff[x + y*i_size] = pix1[x] - pix2[x];
 107         pix1 += i_pix1;
 108         pix2 += i_pix2;
 109     }
 110 }
 111
 112 static void sub4x4_dct( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
 113 {
 114     int16_t d[16];
 115     int16_t tmp[16];
 116
 117     pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 118
 119     for( int i = 0; i < 4; i++ )
 120     {
 121         int s03 = d[i*4+0] + d[i*4+3];
 122         int s12 = d[i*4+1] + d[i*4+2];
 123         int d03 = d[i*4+0] - d[i*4+3];
 124         int d12 = d[i*4+1] - d[i*4+2];
 125
 126         tmp[0*4+i] =   s03 +   s12;
 127         tmp[1*4+i] = 2*d03 +   d12;
 128         tmp[2*4+i] =   s03 -   s12;
 129         tmp[3*4+i] =   d03 - 2*d12;
 130     }
 131
 132     for( int i = 0; i < 4; i++ )
 133     {
 134         int s03 = tmp[i*4+0] + tmp[i*4+3];
 135         int s12 = tmp[i*4+1] + tmp[i*4+2];
 136         int d03 = tmp[i*4+0] - tmp[i*4+3];
 137         int d12 = tmp[i*4+1] - tmp[i*4+2];
 138
 139         dct[i*4+0] =   s03 +   s12;
 140         dct[i*4+1] = 2*d03 +   d12;
 141         dct[i*4+2] =   s03 -   s12;
 142         dct[i*4+3] =   d03 - 2*d12;
 143     }
 144 }
 145
 146 static void sub8x8_dct( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
 147 {
 148     sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
 149     sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
 150     sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 151     sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 152 }
 153
 154 static void sub16x16_dct( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
 155 {
 156     sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
 157     sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
 158     sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 159     sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 160 }
 161
 162 static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
 163 {
 164     int16_t d[16];
 165     int sum = 0;
 166
 167     pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 168
 169     sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
 170     sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
 171
 172     return sum;
 173 }
 174
 175 static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
 176 {
 177     dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
 178     dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
 179     dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 180     dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 181
 182     /* 2x2 DC transform */
 183     int d0 = dct[0] + dct[1];
 184     int d1 = dct[2] + dct[3];
 185     int d2 = dct[0] - dct[1];
 186     int d3 = dct[2] - dct[3];
 187     dct[0] = d0 + d1;
 188     dct[2] = d2 + d3;
 189     dct[1] = d0 - d1;
 190     dct[3] = d2 - d3;
 191 }
 192
 193 static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
 194 {
 195     int16_t d[16];
 196     int16_t tmp[16];
 197
 198     for( int i = 0; i < 4; i++ )
 199     {
 200         int s02 =  dct[0*4+i]     +  dct[2*4+i];
 201         int d02 =  dct[0*4+i]     -  dct[2*4+i];
 202         int s13 =  dct[1*4+i]     + (dct[3*4+i]>>1);
 203         int d13 = (dct[1*4+i]>>1) -  dct[3*4+i];
 204
 205         tmp[i*4+0] = s02 + s13;
 206         tmp[i*4+1] = d02 + d13;
 207         tmp[i*4+2] = d02 - d13;
 208         tmp[i*4+3] = s02 - s13;
 209     }
 210
 211     for( int i = 0; i < 4; i++ )
 212     {
 213         int s02 =  tmp[0*4+i]     +  tmp[2*4+i];
 214         int d02 =  tmp[0*4+i]     -  tmp[2*4+i];
 215         int s13 =  tmp[1*4+i]     + (tmp[3*4+i]>>1);
 216         int d13 = (tmp[1*4+i]>>1) -  tmp[3*4+i];
 217
 218         d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
 219         d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
 220         d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
 221         d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
 222     }
 223
 224
 225     for( int y = 0; y < 4; y++ )
 226     {
 227         for( int x = 0; x < 4; x++ )
 228             p_dst[x] = x264_clip_uint8( p_dst[x] + d[y*4+x] );
 229         p_dst += FDEC_STRIDE;
 230     }
 231 }
 232
 233 static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][16] )
 234 {
 235     add4x4_idct( &p_dst[0],               dct[0] );
 236     add4x4_idct( &p_dst[4],               dct[1] );
 237     add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
 238     add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 239 }
 240
 241 static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][16] )
 242 {
 243     add8x8_idct( &p_dst[0],               &dct[0] );
 244     add8x8_idct( &p_dst[8],               &dct[4] );
 245     add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
 246     add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
 247 }
 248
 249 /****************************************************************************
 250  * 8x8 transform:
 251  ****************************************************************************/
 252
 253 #define DCT8_1D {\
 254     int s07 = SRC(0) + SRC(7);\
 255     int s16 = SRC(1) + SRC(6);\
 256     int s25 = SRC(2) + SRC(5);\
 257     int s34 = SRC(3) + SRC(4);\
 258     int a0 = s07 + s34;\
 259     int a1 = s16 + s25;\
 260     int a2 = s07 - s34;\
 261     int a3 = s16 - s25;\
 262     int d07 = SRC(0) - SRC(7);\
 263     int d16 = SRC(1) - SRC(6);\
 264     int d25 = SRC(2) - SRC(5);\
 265     int d34 = SRC(3) - SRC(4);\
 266     int a4 = d16 + d25 + (d07 + (d07>>1));\
 267     int a5 = d07 - d34 - (d25 + (d25>>1));\
 268     int a6 = d07 + d34 - (d16 + (d16>>1));\
 269     int a7 = d16 - d25 + (d34 + (d34>>1));\
 270     DST(0) =  a0 + a1     ;\
 271     DST(1) =  a4 + (a7>>2);\
 272     DST(2) =  a2 + (a3>>1);\
 273     DST(3) =  a5 + (a6>>2);\
 274     DST(4) =  a0 - a1     ;\
 275     DST(5) =  a6 - (a5>>2);\
 276     DST(6) = (a2>>1) - a3 ;\
 277     DST(7) = (a4>>2) - a7 ;\
 278 }
 279
 280 static void sub8x8_dct8( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
 281 {
 282     int16_t tmp[64];
 283
 284     pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 285
 286 #define SRC(x) tmp[x*8+i]
 287 #define DST(x) tmp[x*8+i]
 288     for( int i = 0; i < 8; i++ )
 289         DCT8_1D
 290 #undef SRC
 291 #undef DST
 292
 293 #define SRC(x) tmp[i*8+x]
 294 #define DST(x) dct[x*8+i]
 295     for( int i = 0; i < 8; i++ )
 296         DCT8_1D
 297 #undef SRC
 298 #undef DST
 299 }
 300
 301 static void sub16x16_dct8( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
 302 {
 303     sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
 304     sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
 305     sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 306     sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 307 }
 308
 309 #define IDCT8_1D {\
 310     int a0 =  SRC(0) + SRC(4);\
 311     int a2 =  SRC(0) - SRC(4);\
 312     int a4 = (SRC(2)>>1) - SRC(6);\
 313     int a6 = (SRC(6)>>1) + SRC(2);\
 314     int b0 = a0 + a6;\
 315     int b2 = a2 + a4;\
 316     int b4 = a2 - a4;\
 317     int b6 = a0 - a6;\
 318     int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
 319     int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
 320     int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
 321     int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
 322     int b1 = (a7>>2) + a1;\
 323     int b3 =  a3 + (a5>>2);\
 324     int b5 = (a3>>2) - a5;\
 325     int b7 =  a7 - (a1>>2);\
 326     DST(0, b0 + b7);\
 327     DST(1, b2 + b5);\
 328     DST(2, b4 + b3);\
 329     DST(3, b6 + b1);\
 330     DST(4, b6 - b1);\
 331     DST(5, b4 - b3);\
 332     DST(6, b2 - b5);\
 333     DST(7, b0 - b7);\
 334 }
 335
 336 static void add8x8_idct8( uint8_t *dst, int16_t dct[64] )
 337 {
 338     dct[0] += 32; // rounding for the >>6 at the end
 339
 340 #define SRC(x)     dct[x*8+i]
 341 #define DST(x,rhs) dct[x*8+i] = (rhs)
 342     for( int i = 0; i < 8; i++ )
 343         IDCT8_1D
 344 #undef SRC
 345 #undef DST
 346
 347 #define SRC(x)     dct[i*8+x]
 348 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
 349     for( int i = 0; i < 8; i++ )
 350         IDCT8_1D
 351 #undef SRC
 352 #undef DST
 353 }
 354
 355 static void add16x16_idct8( uint8_t *dst, int16_t dct[4][64] )
 356 {
 357     add8x8_idct8( &dst[0],               dct[0] );
 358     add8x8_idct8( &dst[8],               dct[1] );
 359     add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
 360     add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
 361 }
 362
 363 static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
 364 {
 365     dc = (dc + 32) >> 6;
 366     for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
 367     {
 368         p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
 369         p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
 370         p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
 371         p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
 372     }
 373 }
 374
 375 static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[4] )
 376 {
 377     add4x4_idct_dc( &p_dst[0],               dct[0] );
 378     add4x4_idct_dc( &p_dst[4],               dct[1] );
 379     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
 380     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 381 }
 382
 383 static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[16] )
 384 {
 385     for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
 386     {
 387         add4x4_idct_dc( &p_dst[ 0], dct[0] );
 388         add4x4_idct_dc( &p_dst[ 4], dct[1] );
 389         add4x4_idct_dc( &p_dst[ 8], dct[2] );
 390         add4x4_idct_dc( &p_dst[12], dct[3] );
 391     }
 392 }
 393
 394
 395 /****************************************************************************
 396  * x264_dct_init:
 397  ****************************************************************************/
 398 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
 399 {
 400     dctf->sub4x4_dct    = sub4x4_dct;
 401     dctf->add4x4_idct   = add4x4_idct;
 402
 403     dctf->sub8x8_dct    = sub8x8_dct;
 404     dctf->sub8x8_dct_dc = sub8x8_dct_dc;
 405     dctf->add8x8_idct   = add8x8_idct;
 406     dctf->add8x8_idct_dc = add8x8_idct_dc;
 407
 408     dctf->sub16x16_dct  = sub16x16_dct;
 409     dctf->add16x16_idct = add16x16_idct;
 410     dctf->add16x16_idct_dc = add16x16_idct_dc;
 411
 412     dctf->sub8x8_dct8   = sub8x8_dct8;
 413     dctf->add8x8_idct8  = add8x8_idct8;
 414
 415     dctf->sub16x16_dct8  = sub16x16_dct8;
 416     dctf->add16x16_idct8 = add16x16_idct8;
 417
 418     dctf->dct4x4dc  = dct4x4dc;
 419     dctf->idct4x4dc = idct4x4dc;
 420
 421 #ifdef HAVE_MMX
 422     if( cpu&X264_CPU_MMX )
 423     {
 424         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
 425         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
 426         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
 427         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
 428         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
 429         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
 430         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
 431
 432 #ifndef ARCH_X86_64
 433         dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
 434         dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
 435         dctf->add8x8_idct   = x264_add8x8_idct_mmx;
 436         dctf->add16x16_idct = x264_add16x16_idct_mmx;
 437
 438         dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
 439         dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
 440         dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
 441         dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
 442 #endif
 443     }
 444
 445     if( cpu&X264_CPU_SSE2 )
 446     {
 447         dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
 448         dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
 449         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
 450         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
 451         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 452
 453         dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
 454         dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
 455         dctf->add8x8_idct   = x264_add8x8_idct_sse2;
 456         dctf->add16x16_idct = x264_add16x16_idct_sse2;
 457         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
 458     }
 459
 460     if( cpu&X264_CPU_SSSE3 )
 461     {
 462         dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
 463         dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
 464         dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
 465         dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
 466         dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
 467         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
 468         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
 469     }
 470
 471     if( cpu&X264_CPU_SSE4 )
 472         dctf->add4x4_idct   = x264_add4x4_idct_sse4;
 473
 474 #endif //HAVE_MMX
 475
 476 #ifdef HAVE_ALTIVEC
 477     if( cpu&X264_CPU_ALTIVEC )
 478     {
 479         dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
 480         dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
 481         dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
 482
 483         dctf->add4x4_idct   = x264_add4x4_idct_altivec;
 484         dctf->add8x8_idct   = x264_add8x8_idct_altivec;
 485         dctf->add16x16_idct = x264_add16x16_idct_altivec;
 486
 487         dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
 488         dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
 489
 490         dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
 491         dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
 492     }
 493 #endif
 494
 495 #ifdef HAVE_ARMV6
 496     if( cpu&X264_CPU_NEON )
 497     {
 498         dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
 499         dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
 500         dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
 501         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
 502         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
 503         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
 504         dctf->dct4x4dc      = x264_dct4x4dc_neon;
 505         dctf->idct4x4dc     = x264_idct4x4dc_neon;
 506
 507         dctf->add4x4_idct   = x264_add4x4_idct_neon;
 508         dctf->add8x8_idct   = x264_add8x8_idct_neon;
 509         dctf->add16x16_idct = x264_add16x16_idct_neon;
 510
 511         dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
 512         dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
 513
 514         dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
 515         dctf->add16x16_idct8= x264_add16x16_idct8_neon;
 516     }
 517 #endif
 518 }
 519
 520 void x264_dct_init_weights( void )
 521 {
 522     for( int j = 0; j < 2; j++ )
 523     {
 524         for( int i = 0; i < 16; i++ )
 525             x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
 526         for( int i = 0; i < 64; i++ )
 527             x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
 528     }
 529 }
 530
 531
 532 #define ZIG(i,y,x) level[i] = dct[x*8+y];
 533 #define ZIGZAG8_FRAME\
 534     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
 535     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
 536     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
 537     ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
 538     ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
 539     ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
 540     ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
 541     ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
 542     ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
 543     ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
 544     ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
 545     ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
 546     ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
 547     ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
 548     ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
 549     ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\
 550
 551 #define ZIGZAG8_FIELD\
 552     ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
 553     ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
 554     ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
 555     ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
 556     ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
 557     ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
 558     ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
 559     ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
 560     ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
 561     ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
 562     ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
 563     ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
 564     ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
 565     ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
 566     ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
 567     ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
 568
 569 #define ZIGZAG4_FRAME\
 570     ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
 571     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
 572     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
 573     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 574
 575 #define ZIGZAG4_FIELD\
 576     ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
 577     ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
 578     ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
 579     ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
 580
 581 static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[64] )
 582 {
         /* Fills level[0..63] from dct: ZIGZAG8_FRAME expands to 64 ZIG(i,y,x)
          * steps, and with the ZIG definition in effect here each step is
          * level[i] = dct[x*8+y]. */
 583     ZIGZAG8_FRAME
 584 }
 585
 586 static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[64] )
 587 {
         /* Fills level[0..63] from dct: ZIGZAG8_FIELD expands to 64 ZIG(i,y,x)
          * steps, and with the ZIG definition in effect here each step is
          * level[i] = dct[x*8+y]. */
 588     ZIGZAG8_FIELD
 589 }
 590
 591 #undef ZIG
 592 #define ZIG(i,y,x) level[i] = dct[x*4+y];
 593 #define ZIGDC(i,y,x) ZIG(i,y,x)
 594
 595 static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[16] )
 596 {
         /* Fills level[0..15] from dct: ZIGZAG4_FRAME expands to one ZIGDC and
          * 15 ZIG steps; with the definitions in effect here ZIGDC is the same
          * as ZIG and each step is level[i] = dct[x*4+y]. */
 597     ZIGZAG4_FRAME
 598 }
 599
 600 static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
 601 {
 602     CP32( level, dct );
 603     ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
 604     CP32( level+6, dct+6 );
 605     CP64( level+8, dct+8 );
 606     CP64( level+12, dct+12 );
 607 }
 608
 609 #undef ZIG
 610 #define ZIG(i,y,x) {\
 611     int oe = x+y*FENC_STRIDE;\
 612     int od = x+y*FDEC_STRIDE;\
 613     level[i] = p_src[oe] - p_dst[od];\
 614     nz |= level[i];\
 615 }
 616 #define COPY4x4\
 617     CP32( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
 618     CP32( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
 619     CP32( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
 620     CP32( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
 621 #define COPY8x8\
 622     CP64( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
 623     CP64( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
 624     CP64( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
 625     CP64( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
 626     CP64( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
 627     CP64( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
 628     CP64( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
 629     CP64( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
 630
 631 static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
 632 {
 633     int nz = 0;
 634     ZIGZAG4_FRAME
 635     COPY4x4
 636     return !!nz;
 637 }
 638
 639 static int zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
 640 {
 641     int nz = 0;
 642     ZIGZAG4_FIELD
 643     COPY4x4
 644     return !!nz;
 645 }
 646
 647 #undef ZIGDC
 648 #define ZIGDC(i,y,x) {\
 649     int oe = x+y*FENC_STRIDE;\
 650     int od = x+y*FDEC_STRIDE;\
 651     *dc = p_src[oe] - p_dst[od];\
 652     level[0] = 0;\
 653 }
 654
 655 static int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
 656 {
 657     int nz = 0;
 658     ZIGZAG4_FRAME
 659     COPY4x4
 660     return !!nz;
 661 }
 662
 663 static int zigzag_sub_4x4ac_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
 664 {
 665     int nz = 0;
 666     ZIGZAG4_FIELD
 667     COPY4x4
 668     return !!nz;
 669 }
 670
 671 static int zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
 672 {
 673     int nz = 0;
 674     ZIGZAG8_FRAME
 675     COPY8x8
 676     return !!nz;
 677 }
 678 static int zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
 679 {
 680     int nz = 0;
 681     ZIGZAG8_FIELD
 682     COPY8x8
 683     return !!nz;
 684 }
 685
 686 #undef ZIG
 687 #undef COPY4x4
 688
 689 static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
 690 {
 691     for( int i = 0; i < 4; i++ )
 692     {
 693         int nz = 0;
 694         for( int j = 0; j < 16; j++ )
 695         {
 696             nz |= src[i+j*4];
 697             dst[i*16+j] = src[i+j*4];
 698         }
 699         nnz[(i&1) + (i>>1)*8] = !!nz;
 700     }
 701 }
 702
 703 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
 704 {
 705     if( b_interlaced )
 706     {
 707         pf->scan_8x8   = zigzag_scan_8x8_field;
 708         pf->scan_4x4   = zigzag_scan_4x4_field;
 709         pf->sub_8x8    = zigzag_sub_8x8_field;
 710         pf->sub_4x4    = zigzag_sub_4x4_field;
 711         pf->sub_4x4ac  = zigzag_sub_4x4ac_field;
 712 #ifdef HAVE_MMX
 713         if( cpu&X264_CPU_MMXEXT )
 714         {
 715             pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
 716             pf->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
 717         }
 718         if( cpu&X264_CPU_SSSE3 )
 719         {
 720             pf->sub_4x4  = x264_zigzag_sub_4x4_field_ssse3;
 721             pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
 722         }
 723 #endif
 724
 725 #ifdef HAVE_ALTIVEC
 726         if( cpu&X264_CPU_ALTIVEC )
 727             pf->scan_4x4   = x264_zigzag_scan_4x4_field_altivec;
 728 #endif
 729     }
 730     else
 731     {
 732         pf->scan_8x8   = zigzag_scan_8x8_frame;
 733         pf->scan_4x4   = zigzag_scan_4x4_frame;
 734         pf->sub_8x8    = zigzag_sub_8x8_frame;
 735         pf->sub_4x4    = zigzag_sub_4x4_frame;
 736         pf->sub_4x4ac  = zigzag_sub_4x4ac_frame;
 737 #ifdef HAVE_MMX
 738         if( cpu&X264_CPU_MMX )
 739             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
 740         if( cpu&X264_CPU_MMXEXT )
 741             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
 742         if( cpu&X264_CPU_SSE2_IS_FAST )
 743             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
 744         if( cpu&X264_CPU_SSSE3 )
 745         {
 746             pf->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
 747             pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
 748             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
 749             if( cpu&X264_CPU_SHUFFLE_IS_FAST )
 750                 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
 751         }
 752 #endif
 753
 754 #ifdef HAVE_ALTIVEC
 755         if( cpu&X264_CPU_ALTIVEC )
 756             pf->scan_4x4   = x264_zigzag_scan_4x4_frame_altivec;
 757 #endif
 758 #ifdef HAVE_ARMV6
 759         if( cpu&X264_CPU_NEON )
 760             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
 761 #endif
 762     }
 763
 764     pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
 765 #ifdef HAVE_MMX
 766     if( cpu&X264_CPU_MMX )
 767         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
 768     if( cpu&X264_CPU_SHUFFLE_IS_FAST )
 769         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
 770 #endif
 771 }