git.sesse.net Git - x264/blob - common/dct.c

   1 /*****************************************************************************
   2  * dct.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Loren Merritt <lorenm@u.washington.edu>
   7  *          Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include "common.h"
  25 #ifdef HAVE_MMX
  26 #   include "x86/dct.h"
  27 #endif
  28 #ifdef ARCH_PPC
  29 #   include "ppc/dct.h"
  30 #endif
  31 #ifdef ARCH_ARM
  32 #   include "arm/dct.h"
  33 #endif
  34
/* Weight tables re-indexed through the zigzag scan tables.  They are filled
 * in by x264_dct_init_weights(); the first index selects which of the two
 * scan tables (x264_zigzag_scan4[j] / x264_zigzag_scan8[j]) was used. */
int x264_dct4_weight2_zigzag[2][16];
int x264_dct8_weight2_zigzag[2][64];
  37
/*
 * XXX All DC transforms below work in place: the input array is also the
 * output array, so intermediate results go through a temporary buffer.
 */
  41
  42 static void dct4x4dc( int16_t d[4][4] )
  43 {
  44     int16_t tmp[4][4];
  45     int s01, s23;
  46     int d01, d23;
  47     int i;
  48
  49     for( i = 0; i < 4; i++ )
  50     {
  51         s01 = d[i][0] + d[i][1];
  52         d01 = d[i][0] - d[i][1];
  53         s23 = d[i][2] + d[i][3];
  54         d23 = d[i][2] - d[i][3];
  55
  56         tmp[0][i] = s01 + s23;
  57         tmp[1][i] = s01 - s23;
  58         tmp[2][i] = d01 - d23;
  59         tmp[3][i] = d01 + d23;
  60     }
  61
  62     for( i = 0; i < 4; i++ )
  63     {
  64         s01 = tmp[i][0] + tmp[i][1];
  65         d01 = tmp[i][0] - tmp[i][1];
  66         s23 = tmp[i][2] + tmp[i][3];
  67         d23 = tmp[i][2] - tmp[i][3];
  68
  69         d[i][0] = ( s01 + s23 + 1 ) >> 1;
  70         d[i][1] = ( s01 - s23 + 1 ) >> 1;
  71         d[i][2] = ( d01 - d23 + 1 ) >> 1;
  72         d[i][3] = ( d01 + d23 + 1 ) >> 1;
  73     }
  74 }
  75
  76 static void idct4x4dc( int16_t d[4][4] )
  77 {
  78     int16_t tmp[4][4];
  79     int s01, s23;
  80     int d01, d23;
  81     int i;
  82
  83     for( i = 0; i < 4; i++ )
  84     {
  85         s01 = d[i][0] + d[i][1];
  86         d01 = d[i][0] - d[i][1];
  87         s23 = d[i][2] + d[i][3];
  88         d23 = d[i][2] - d[i][3];
  89
  90         tmp[0][i] = s01 + s23;
  91         tmp[1][i] = s01 - s23;
  92         tmp[2][i] = d01 - d23;
  93         tmp[3][i] = d01 + d23;
  94     }
  95
  96     for( i = 0; i < 4; i++ )
  97     {
  98         s01 = tmp[i][0] + tmp[i][1];
  99         d01 = tmp[i][0] - tmp[i][1];
 100         s23 = tmp[i][2] + tmp[i][3];
 101         d23 = tmp[i][2] - tmp[i][3];
 102
 103         d[i][0] = s01 + s23;
 104         d[i][1] = s01 - s23;
 105         d[i][2] = d01 - d23;
 106         d[i][3] = d01 + d23;
 107     }
 108 }
 109
 110 static inline void pixel_sub_wxh( int16_t *diff, int i_size,
 111                                   uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
 112 {
 113     int y, x;
 114     for( y = 0; y < i_size; y++ )
 115     {
 116         for( x = 0; x < i_size; x++ )
 117         {
 118             diff[x + y*i_size] = pix1[x] - pix2[x];
 119         }
 120         pix1 += i_pix1;
 121         pix2 += i_pix2;
 122     }
 123 }
 124
 125 static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
 126 {
 127     int16_t d[4][4];
 128     int16_t tmp[4][4];
 129     int i;
 130
 131     pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 132
 133     for( i = 0; i < 4; i++ )
 134     {
 135         const int s03 = d[i][0] + d[i][3];
 136         const int s12 = d[i][1] + d[i][2];
 137         const int d03 = d[i][0] - d[i][3];
 138         const int d12 = d[i][1] - d[i][2];
 139
 140         tmp[0][i] =   s03 +   s12;
 141         tmp[1][i] = 2*d03 +   d12;
 142         tmp[2][i] =   s03 -   s12;
 143         tmp[3][i] =   d03 - 2*d12;
 144     }
 145
 146     for( i = 0; i < 4; i++ )
 147     {
 148         const int s03 = tmp[i][0] + tmp[i][3];
 149         const int s12 = tmp[i][1] + tmp[i][2];
 150         const int d03 = tmp[i][0] - tmp[i][3];
 151         const int d12 = tmp[i][1] - tmp[i][2];
 152
 153         dct[i][0] =   s03 +   s12;
 154         dct[i][1] = 2*d03 +   d12;
 155         dct[i][2] =   s03 -   s12;
 156         dct[i][3] =   d03 - 2*d12;
 157     }
 158 }
 159
 160 static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
 161 {
 162     sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
 163     sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
 164     sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 165     sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 166 }
 167
 168 static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
 169 {
 170     sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
 171     sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
 172     sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 173     sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 174 }
 175
 176 static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
 177 {
 178     int16_t d[4][4];
 179     int sum = 0;
 180
 181     pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 182
 183     sum += d[0][0] + d[0][1] + d[0][2] + d[0][3];
 184     sum += d[1][0] + d[1][1] + d[1][2] + d[1][3];
 185     sum += d[2][0] + d[2][1] + d[2][2] + d[2][3];
 186     sum += d[3][0] + d[3][1] + d[3][2] + d[3][3];
 187
 188     return sum;
 189 }
 190
 191 static void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
 192 {
 193     dct[0][0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
 194     dct[0][1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
 195     dct[1][0] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 196     dct[1][1] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 197 }
 198
 199 static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
 200 {
 201     int16_t d[4][4];
 202     int16_t tmp[4][4];
 203     int x, y;
 204     int i;
 205
 206     for( i = 0; i < 4; i++ )
 207     {
 208         const int s02 =  dct[0][i]     +  dct[2][i];
 209         const int d02 =  dct[0][i]     -  dct[2][i];
 210         const int s13 =  dct[1][i]     + (dct[3][i]>>1);
 211         const int d13 = (dct[1][i]>>1) -  dct[3][i];
 212
 213         tmp[i][0] = s02 + s13;
 214         tmp[i][1] = d02 + d13;
 215         tmp[i][2] = d02 - d13;
 216         tmp[i][3] = s02 - s13;
 217     }
 218
 219     for( i = 0; i < 4; i++ )
 220     {
 221         const int s02 =  tmp[0][i]     +  tmp[2][i];
 222         const int d02 =  tmp[0][i]     -  tmp[2][i];
 223         const int s13 =  tmp[1][i]     + (tmp[3][i]>>1);
 224         const int d13 = (tmp[1][i]>>1) -  tmp[3][i];
 225
 226         d[0][i] = ( s02 + s13 + 32 ) >> 6;
 227         d[1][i] = ( d02 + d13 + 32 ) >> 6;
 228         d[2][i] = ( d02 - d13 + 32 ) >> 6;
 229         d[3][i] = ( s02 - s13 + 32 ) >> 6;
 230     }
 231
 232
 233     for( y = 0; y < 4; y++ )
 234     {
 235         for( x = 0; x < 4; x++ )
 236         {
 237             p_dst[x] = x264_clip_uint8( p_dst[x] + d[y][x] );
 238         }
 239         p_dst += FDEC_STRIDE;
 240     }
 241 }
 242
 243 static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
 244 {
 245     add4x4_idct( &p_dst[0],               dct[0] );
 246     add4x4_idct( &p_dst[4],               dct[1] );
 247     add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
 248     add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 249 }
 250
 251 static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
 252 {
 253     add8x8_idct( &p_dst[0],               &dct[0] );
 254     add8x8_idct( &p_dst[8],               &dct[4] );
 255     add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
 256     add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
 257 }
 258
 259 /****************************************************************************
 260  * 8x8 transform:
 261  ****************************************************************************/
 262
/*
 * One 8-point, one-dimensional forward transform stage.
 *
 * The code expanding this macro must define SRC(x), yielding input x
 * (0..7), and DST(x), an lvalue for output x (0..7).  All eight inputs are
 * read into locals before the first DST() store, so SRC and DST may refer
 * to the same storage.  The expansion is a single braced block.
 */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
 289
 290 static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 291 {
 292     int i;
 293     int16_t tmp[8][8];
 294
 295     pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 296
 297 #define SRC(x) tmp[x][i]
 298 #define DST(x) tmp[x][i]
 299     for( i = 0; i < 8; i++ )
 300         DCT8_1D
 301 #undef SRC
 302 #undef DST
 303
 304 #define SRC(x) tmp[i][x]
 305 #define DST(x) dct[x][i]
 306     for( i = 0; i < 8; i++ )
 307         DCT8_1D
 308 #undef SRC
 309 #undef DST
 310 }
 311
 312 static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
 313 {
 314     sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
 315     sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
 316     sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 317     sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 318 }
 319
/*
 * One 8-point, one-dimensional inverse transform stage.
 *
 * The code expanding this macro must define SRC(x), yielding input x
 * (0..7), and DST(x,rhs), which stores the value rhs as output x (0..7).
 * All eight inputs are read into locals before the first DST() is issued,
 * so SRC and DST may refer to the same storage.  The expansion is a single
 * braced block.
 */
#define IDCT8_1D {\
    const int a0 =  SRC(0) + SRC(4);\
    const int a2 =  SRC(0) - SRC(4);\
    const int a4 = (SRC(2)>>1) - SRC(6);\
    const int a6 = (SRC(6)>>1) + SRC(2);\
    const int b0 = a0 + a6;\
    const int b2 = a2 + a4;\
    const int b4 = a2 - a4;\
    const int b6 = a0 - a6;\
    const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    const int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    const int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    const int b1 = (a7>>2) + a1;\
    const int b3 =  a3 + (a5>>2);\
    const int b5 = (a3>>2) - a5;\
    const int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
 346
 347 static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
 348 {
 349     int i;
 350
 351     dct[0][0] += 32; // rounding for the >>6 at the end
 352
 353 #define SRC(x)     dct[x][i]
 354 #define DST(x,rhs) dct[x][i] = (rhs)
 355     for( i = 0; i < 8; i++ )
 356         IDCT8_1D
 357 #undef SRC
 358 #undef DST
 359
 360 #define SRC(x)     dct[i][x]
 361 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
 362     for( i = 0; i < 8; i++ )
 363         IDCT8_1D
 364 #undef SRC
 365 #undef DST
 366 }
 367
 368 static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
 369 {
 370     add8x8_idct8( &dst[0],               dct[0] );
 371     add8x8_idct8( &dst[8],               dct[1] );
 372     add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
 373     add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
 374 }
 375
 376 static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
 377 {
 378     int i;
 379     dc = (dc + 32) >> 6;
 380     for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
 381     {
 382         p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
 383         p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
 384         p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
 385         p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
 386     }
 387 }
 388
 389 static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
 390 {
 391     add4x4_idct_dc( &p_dst[0],               dct[0][0] );
 392     add4x4_idct_dc( &p_dst[4],               dct[0][1] );
 393     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[1][0] );
 394     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
 395 }
 396
 397 static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] )
 398 {
 399     int i;
 400     for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE )
 401     {
 402         add4x4_idct_dc( &p_dst[ 0], dct[i][0] );
 403         add4x4_idct_dc( &p_dst[ 4], dct[i][1] );
 404         add4x4_idct_dc( &p_dst[ 8], dct[i][2] );
 405         add4x4_idct_dc( &p_dst[12], dct[i][3] );
 406     }
 407 }
 408
 409
 410 /****************************************************************************
 411  * x264_dct_init:
 412  ****************************************************************************/
 413 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
 414 {
 415     dctf->sub4x4_dct    = sub4x4_dct;
 416     dctf->add4x4_idct   = add4x4_idct;
 417
 418     dctf->sub8x8_dct    = sub8x8_dct;
 419     dctf->sub8x8_dct_dc = sub8x8_dct_dc;
 420     dctf->add8x8_idct   = add8x8_idct;
 421     dctf->add8x8_idct_dc = add8x8_idct_dc;
 422
 423     dctf->sub16x16_dct  = sub16x16_dct;
 424     dctf->add16x16_idct = add16x16_idct;
 425     dctf->add16x16_idct_dc = add16x16_idct_dc;
 426
 427     dctf->sub8x8_dct8   = sub8x8_dct8;
 428     dctf->add8x8_idct8  = add8x8_idct8;
 429
 430     dctf->sub16x16_dct8  = sub16x16_dct8;
 431     dctf->add16x16_idct8 = add16x16_idct8;
 432
 433     dctf->dct4x4dc  = dct4x4dc;
 434     dctf->idct4x4dc = idct4x4dc;
 435
 436 #ifdef HAVE_MMX
 437     if( cpu&X264_CPU_MMX )
 438     {
 439         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
 440         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
 441         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
 442         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
 443         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
 444         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
 445         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
 446
 447 #ifndef ARCH_X86_64
 448         dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
 449         dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
 450         dctf->add8x8_idct   = x264_add8x8_idct_mmx;
 451         dctf->add16x16_idct = x264_add16x16_idct_mmx;
 452
 453         dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
 454         dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
 455         dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
 456         dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
 457 #endif
 458     }
 459
 460     if( cpu&X264_CPU_SSE2 )
 461     {
 462         dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
 463         dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
 464         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
 465         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
 466         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 467
 468         dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
 469         dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
 470         dctf->add8x8_idct   = x264_add8x8_idct_sse2;
 471         dctf->add16x16_idct = x264_add16x16_idct_sse2;
 472         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
 473     }
 474
 475     if( cpu&X264_CPU_SSSE3 )
 476     {
 477         dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
 478         dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
 479         dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
 480         dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
 481         dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
 482         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
 483         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
 484     }
 485 #endif //HAVE_MMX
 486
 487 #ifdef ARCH_PPC
 488     if( cpu&X264_CPU_ALTIVEC )
 489     {
 490         dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
 491         dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
 492         dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
 493
 494         dctf->add4x4_idct   = x264_add4x4_idct_altivec;
 495         dctf->add8x8_idct   = x264_add8x8_idct_altivec;
 496         dctf->add16x16_idct = x264_add16x16_idct_altivec;
 497
 498         dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
 499         dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
 500
 501         dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
 502         dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
 503     }
 504 #endif
 505
 506 #ifdef HAVE_ARMV6
 507     if( cpu&X264_CPU_NEON )
 508     {
 509         dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
 510         dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
 511         dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
 512         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
 513         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
 514         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
 515         dctf->dct4x4dc      = x264_dct4x4dc_neon;
 516         dctf->idct4x4dc     = x264_idct4x4dc_neon;
 517
 518         dctf->add4x4_idct   = x264_add4x4_idct_neon;
 519         dctf->add8x8_idct   = x264_add8x8_idct_neon;
 520         dctf->add16x16_idct = x264_add16x16_idct_neon;
 521
 522         dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
 523         dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
 524
 525         dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
 526         dctf->add16x16_idct8= x264_add16x16_idct8_neon;
 527     }
 528 #endif
 529 }
 530
 531 void x264_dct_init_weights( void )
 532 {
 533     int i, j;
 534     for( j=0; j<2; j++ )
 535     {
 536         for( i=0; i<16; i++ )
 537             x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
 538         for( i=0; i<64; i++ )
 539             x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
 540     }
 541 }
 542
 543
 544 // gcc pessimizes multi-dimensional arrays here, even with constant indices
 545 #define ZIG(i,y,x) level[i] = dct[0][x*8+y];
 546 #define ZIGZAG8_FRAME\
 547     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
 548     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
 549     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
 550     ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
 551     ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
 552     ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
 553     ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
 554     ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
 555     ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
 556     ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
 557     ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
 558     ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
 559     ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
 560     ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
 561     ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
 562     ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\
 563
/* Scan table for 8x8 blocks, field variant: one ZIG(i,y,x) entry per output
 * position i = 0..63.  What an entry does depends on the ZIG definition in
 * effect where the table is expanded (ZIG is redefined between uses). */
#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
 581
/* Scan table for 4x4 blocks, frame variant.  Position 0 goes through ZIGDC
 * rather than ZIG so that expansion sites can treat the first coefficient
 * differently; both macros are supplied by the expanding code. */
#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 587
/* Scan table for 4x4 blocks, field variant.  Position 0 goes through ZIGDC
 * rather than ZIG so that expansion sites can treat the first coefficient
 * differently; both macros are supplied by the expanding code. */
#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
 593
/* Copy the 64 coefficients of dct into level in the order given by the
 * ZIGZAG8_FRAME table (ZIG is the 8x8 copy form at this point). */
static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
{
    ZIGZAG8_FRAME
}
 598
/* Copy the 64 coefficients of dct into level in the order given by the
 * ZIGZAG8_FIELD table (ZIG is the 8x8 copy form at this point). */
static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
{
    ZIGZAG8_FIELD
}
 603
#undef ZIG
/* 4x4 copy form: output position i receives the coefficient at flat offset
 * x*4+y of dct.  ZIGDC is the same operation here, so position 0 is copied
 * like every other position. */
#define ZIG(i,y,x) level[i] = dct[0][x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)

/* Copy the 16 coefficients of dct into level in the order given by the
 * ZIGZAG4_FRAME table. */
static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
{
    ZIGZAG4_FRAME
}
 612
 613 static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
 614 {
 615     *(uint32_t*)level = *(uint32_t*)dct;
 616     ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
 617     *(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);
 618     *(uint64_t*)(level+8) = *(uint64_t*)(*dct+8);
 619     *(uint64_t*)(level+12) = *(uint64_t*)(*dct+12);
 620 }
 621
#undef ZIG
/* Subtract form of ZIG: output position i receives the difference between
 * the source pixel and the destination pixel at column x, row y (rows are
 * FENC_STRIDE / FDEC_STRIDE apart), and that value is OR-ed into nz so the
 * caller can tell whether any stored difference was non-zero.  Expects
 * level, p_src, p_dst and nz to be in scope. */
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
/* Copy a 4x4 block of bytes from p_src to p_dst, one 32-bit access per row.
 * NOTE(review): the casted accesses assume both pointers are suitably
 * aligned and are not covered by the C aliasing rules - confirm the build
 * flags / callers guarantee this. */
#define COPY4x4\
    *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
    *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
    *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
    *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
/* Copy an 8x8 block of bytes from p_src to p_dst, one 64-bit access per row.
 * NOTE(review): same alignment / aliasing assumption as COPY4x4. */
#define COPY8x8\
    *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
    *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
    *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
    *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
    *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
    *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
    *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
    *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
 643
/* Store the 4x4 differences p_src - p_dst into level in ZIGZAG4_FRAME order,
 * then copy the 4x4 source block over the destination.
 * Returns 1 if any stored difference was non-zero, 0 otherwise. */
static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}
 651
/* Store the 4x4 differences p_src - p_dst into level in ZIGZAG4_FIELD order,
 * then copy the 4x4 source block over the destination.
 * Returns 1 if any stored difference was non-zero, 0 otherwise. */
static int zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}
 659
#undef ZIGDC
/* DC-splitting form of ZIGDC: the difference at position 0 is written to
 * *dc instead of level[], level[0] is set to 0, and the value does not
 * contribute to nz. */
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}

/* Like zigzag_sub_4x4_frame(), but the first difference is returned through
 * *dc and level[0] is zeroed.  Returns 1 if any of the remaining 15 stored
 * differences was non-zero, 0 otherwise. */
static int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}
 675
/* Field-order counterpart of zigzag_sub_4x4ac_frame(): differences are stored
 * in ZIGZAG4_FIELD order, the first one goes to *dc and level[0] is zeroed.
 * Returns 1 if any of the remaining 15 stored differences was non-zero. */
static int zigzag_sub_4x4ac_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}
 683
/* Store the 8x8 differences p_src - p_dst into level in ZIGZAG8_FRAME order,
 * then copy the 8x8 source block over the destination.
 * Returns 1 if any stored difference was non-zero, 0 otherwise. */
static int zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
{
    int nz = 0;
    ZIGZAG8_FRAME
    COPY8x8
    return !!nz;
}
/* Store the 8x8 differences p_src - p_dst into level in ZIGZAG8_FIELD order,
 * then copy the 8x8 source block over the destination.
 * Returns 1 if any stored difference was non-zero, 0 otherwise. */
static int zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
{
    int nz = 0;
    ZIGZAG8_FIELD
    COPY8x8
    return !!nz;
}
 698
 699 #undef ZIG
 700 #undef COPY4x4
 701
 702 static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
 703 {
 704     int i,j;
 705     for( i=0; i<4; i++ )
 706     {
 707         int nz = 0;
 708         for( j=0; j<16; j++ )
 709         {
 710             nz |= src[i+j*4];
 711             dst[i*16+j] = src[i+j*4];
 712         }
 713         nnz[(i&1) + (i>>1)*8] = !!nz;
 714     }
 715 }
 716
 717 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
 718 {
 719     if( b_interlaced )
 720     {
 721         pf->scan_8x8   = zigzag_scan_8x8_field;
 722         pf->scan_4x4   = zigzag_scan_4x4_field;
 723         pf->sub_8x8    = zigzag_sub_8x8_field;
 724         pf->sub_4x4    = zigzag_sub_4x4_field;
 725         pf->sub_4x4ac  = zigzag_sub_4x4ac_field;
 726 #ifdef HAVE_MMX
 727         if( cpu&X264_CPU_MMXEXT )
 728             pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
 729         if( cpu&X264_CPU_SSSE3 )
 730         {
 731             pf->sub_4x4  = x264_zigzag_sub_4x4_field_ssse3;
 732             pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
 733         }
 734 #endif
 735
 736 #ifdef ARCH_PPC
 737         if( cpu&X264_CPU_ALTIVEC )
 738             pf->scan_4x4   = x264_zigzag_scan_4x4_field_altivec;
 739 #endif
 740     }
 741     else
 742     {
 743         pf->scan_8x8   = zigzag_scan_8x8_frame;
 744         pf->scan_4x4   = zigzag_scan_4x4_frame;
 745         pf->sub_8x8    = zigzag_sub_8x8_frame;
 746         pf->sub_4x4    = zigzag_sub_4x4_frame;
 747         pf->sub_4x4ac  = zigzag_sub_4x4ac_frame;
 748 #ifdef HAVE_MMX
 749         if( cpu&X264_CPU_MMX )
 750             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
 751         if( cpu&X264_CPU_MMXEXT )
 752             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
 753         if( cpu&X264_CPU_SSE2_IS_FAST )
 754             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
 755         if( cpu&X264_CPU_SSSE3 )
 756         {
 757             pf->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
 758             pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
 759             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
 760             if( cpu&X264_CPU_SHUFFLE_IS_FAST )
 761                 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
 762         }
 763 #endif
 764
 765 #ifdef ARCH_PPC
 766         if( cpu&X264_CPU_ALTIVEC )
 767             pf->scan_4x4   = x264_zigzag_scan_4x4_frame_altivec;
 768 #endif
 769 #ifdef HAVE_ARMV6
 770         if( cpu&X264_CPU_NEON )
 771             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
 772 #endif
 773     }
 774
 775     pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
 776 #ifdef HAVE_MMX
 777     if( cpu&X264_CPU_MMX )
 778         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
 779     if( cpu&X264_CPU_SHUFFLE_IS_FAST )
 780         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
 781 #endif
 782 }