git.sesse.net Git - x264/blob - common/dct.c

   1 /*****************************************************************************
   2  * dct.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Loren Merritt <lorenm@u.washington.edu>
   7  *          Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include "common.h"
  25 #ifdef HAVE_MMX
  26 #   include "x86/dct.h"
  27 #endif
  28 #ifdef ARCH_PPC
  29 #   include "ppc/dct.h"
  30 #endif
  31 #ifdef ARCH_ARM
  32 #   include "arm/dct.h"
  33 #endif
  34
  /* Weight tables reordered for scanning.  Filled in by x264_dct_init_weights():
   * entry [j][i] holds x264_dctN_weight2_tab[ x264_zigzag_scanN[j][i] ]. */
  35 int x264_dct4_weight2_zigzag[2][16];
  36 int x264_dct8_weight2_zigzag[2][64];
  37
  38 /*
  39  * XXX All DC transforms below work in place: the input buffer is also the output.
  40  */
  41
  /*
   * In-place 4x4 DC transform.
   *
   * Two butterfly passes are applied; the first writes its result transposed
   * into a scratch block, the second reads that scratch block row by row and
   * stores each output as (value + 1) >> 1 back into d.
   */
  static void dct4x4dc( int16_t d[4][4] )
  {
      int16_t scratch[4][4];
      int k;

      /* Pass 1: butterflies on each row of d, stored as a column of scratch. */
      for( k = 0; k < 4; k++ )
      {
          const int16_t *in = d[k];
          const int sum_lo  = in[0] + in[1];
          const int diff_lo = in[0] - in[1];
          const int sum_hi  = in[2] + in[3];
          const int diff_hi = in[2] - in[3];

          scratch[0][k] = sum_lo  + sum_hi;
          scratch[1][k] = sum_lo  - sum_hi;
          scratch[2][k] = diff_lo - diff_hi;
          scratch[3][k] = diff_lo + diff_hi;
      }

      /* Pass 2: same butterflies on each row of scratch, halved with rounding. */
      for( k = 0; k < 4; k++ )
      {
          const int16_t *in = scratch[k];
          const int sum_lo  = in[0] + in[1];
          const int diff_lo = in[0] - in[1];
          const int sum_hi  = in[2] + in[3];
          const int diff_hi = in[2] - in[3];

          d[k][0] = ( sum_lo  + sum_hi  + 1 ) >> 1;
          d[k][1] = ( sum_lo  - sum_hi  + 1 ) >> 1;
          d[k][2] = ( diff_lo - diff_hi + 1 ) >> 1;
          d[k][3] = ( diff_lo + diff_hi + 1 ) >> 1;
      }
  }
  75
  /*
   * In-place inverse 4x4 DC transform.
   *
   * Same two butterfly passes as dct4x4dc (first pass stored transposed in a
   * scratch block), but the second pass writes the sums back unscaled.
   */
  static void idct4x4dc( int16_t d[4][4] )
  {
      int16_t scratch[4][4];
      int k;

      /* Pass 1: butterflies on each row of d, stored as a column of scratch. */
      for( k = 0; k < 4; k++ )
      {
          const int16_t *in = d[k];
          const int sum_lo  = in[0] + in[1];
          const int diff_lo = in[0] - in[1];
          const int sum_hi  = in[2] + in[3];
          const int diff_hi = in[2] - in[3];

          scratch[0][k] = sum_lo  + sum_hi;
          scratch[1][k] = sum_lo  - sum_hi;
          scratch[2][k] = diff_lo - diff_hi;
          scratch[3][k] = diff_lo + diff_hi;
      }

      /* Pass 2: butterflies on each row of scratch, written back to d. */
      for( k = 0; k < 4; k++ )
      {
          const int16_t *in = scratch[k];
          const int sum_lo  = in[0] + in[1];
          const int diff_lo = in[0] - in[1];
          const int sum_hi  = in[2] + in[3];
          const int diff_hi = in[2] - in[3];

          d[k][0] = sum_lo  + sum_hi;
          d[k][1] = sum_lo  - sum_hi;
          d[k][2] = diff_lo - diff_hi;
          d[k][3] = diff_lo + diff_hi;
      }
  }
 109
 /*
  * Store the per-pixel difference pix1 - pix2 of an i_size x i_size block.
  *
  * diff   : output, written densely (row r starts at diff + r*i_size)
  * i_size : block width and height
  * pix1   : minuend rows, i_pix1 bytes apart
  * pix2   : subtrahend rows, i_pix2 bytes apart
  */
 static inline void pixel_sub_wxh( int16_t *diff, int i_size,
                                   uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
 {
     int16_t *out_row = diff;
     int row, col;

     for( row = 0; row < i_size; row++, out_row += i_size, pix1 += i_pix1, pix2 += i_pix2 )
         for( col = 0; col < i_size; col++ )
             out_row[col] = pix1[col] - pix2[col];
 }
 124
 125 static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
 126 {
 127     int16_t d[4][4];
 128     int16_t tmp[4][4];
 129     int i;
 130
 131     pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 132
 133     for( i = 0; i < 4; i++ )
 134     {
 135         const int s03 = d[i][0] + d[i][3];
 136         const int s12 = d[i][1] + d[i][2];
 137         const int d03 = d[i][0] - d[i][3];
 138         const int d12 = d[i][1] - d[i][2];
 139
 140         tmp[0][i] =   s03 +   s12;
 141         tmp[1][i] = 2*d03 +   d12;
 142         tmp[2][i] =   s03 -   s12;
 143         tmp[3][i] =   d03 - 2*d12;
 144     }
 145
 146     for( i = 0; i < 4; i++ )
 147     {
 148         const int s03 = tmp[i][0] + tmp[i][3];
 149         const int s12 = tmp[i][1] + tmp[i][2];
 150         const int d03 = tmp[i][0] - tmp[i][3];
 151         const int d12 = tmp[i][1] - tmp[i][2];
 152
 153         dct[i][0] =   s03 +   s12;
 154         dct[i][1] = 2*d03 +   d12;
 155         dct[i][2] =   s03 -   s12;
 156         dct[i][3] =   d03 - 2*d12;
 157     }
 158 }
 159
 160 static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
 161 {
 162     sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
 163     sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
 164     sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 165     sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 166 }
 167
 168 static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
 169 {
 170     sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
 171     sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
 172     sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 173     sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 174 }
 175
 176 static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
 177 {
 178     int16_t d[4][4];
 179     int sum = 0;
 180
 181     pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 182
 183     sum += d[0][0] + d[0][1] + d[0][2] + d[0][3];
 184     sum += d[1][0] + d[1][1] + d[1][2] + d[1][3];
 185     sum += d[2][0] + d[2][1] + d[2][2] + d[2][3];
 186     sum += d[3][0] + d[3][1] + d[3][2] + d[3][3];
 187
 188     return sum;
 189 }
 190
 191 static void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
 192 {
 193     dct[0][0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
 194     dct[0][1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
 195     dct[1][0] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 196     dct[1][1] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 197 }
 198
 199 static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
 200 {
 201     int16_t d[4][4];
 202     int16_t tmp[4][4];
 203     int x, y;
 204     int i;
 205
 206     for( i = 0; i < 4; i++ )
 207     {
 208         const int s02 =  dct[0][i]     +  dct[2][i];
 209         const int d02 =  dct[0][i]     -  dct[2][i];
 210         const int s13 =  dct[1][i]     + (dct[3][i]>>1);
 211         const int d13 = (dct[1][i]>>1) -  dct[3][i];
 212
 213         tmp[i][0] = s02 + s13;
 214         tmp[i][1] = d02 + d13;
 215         tmp[i][2] = d02 - d13;
 216         tmp[i][3] = s02 - s13;
 217     }
 218
 219     for( i = 0; i < 4; i++ )
 220     {
 221         const int s02 =  tmp[0][i]     +  tmp[2][i];
 222         const int d02 =  tmp[0][i]     -  tmp[2][i];
 223         const int s13 =  tmp[1][i]     + (tmp[3][i]>>1);
 224         const int d13 = (tmp[1][i]>>1) -  tmp[3][i];
 225
 226         d[0][i] = ( s02 + s13 + 32 ) >> 6;
 227         d[1][i] = ( d02 + d13 + 32 ) >> 6;
 228         d[2][i] = ( d02 - d13 + 32 ) >> 6;
 229         d[3][i] = ( s02 - s13 + 32 ) >> 6;
 230     }
 231
 232
 233     for( y = 0; y < 4; y++ )
 234     {
 235         for( x = 0; x < 4; x++ )
 236         {
 237             p_dst[x] = x264_clip_uint8( p_dst[x] + d[y][x] );
 238         }
 239         p_dst += FDEC_STRIDE;
 240     }
 241 }
 242
 243 static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
 244 {
 245     add4x4_idct( &p_dst[0],               dct[0] );
 246     add4x4_idct( &p_dst[4],               dct[1] );
 247     add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
 248     add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 249 }
 250
 251 static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
 252 {
 253     add8x8_idct( &p_dst[0],               &dct[0] );
 254     add8x8_idct( &p_dst[8],               &dct[4] );
 255     add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
 256     add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
 257 }
 258
 259 /****************************************************************************
 260  * 8x8 transform:
 261  ****************************************************************************/
 262
 /* One 8-point 1-D forward transform pass.
  * The expansion site must define SRC(x) (value of input x) and DST(x)
  * (lvalue for output x), x in 0..7.  Every SRC() is read into a local
  * before the first DST() is written, so SRC and DST may name the same
  * storage. */
 263 #define DCT8_1D {\
 264     const int s07 = SRC(0) + SRC(7);\
 265     const int s16 = SRC(1) + SRC(6);\
 266     const int s25 = SRC(2) + SRC(5);\
 267     const int s34 = SRC(3) + SRC(4);\
 268     const int a0 = s07 + s34;\
 269     const int a1 = s16 + s25;\
 270     const int a2 = s07 - s34;\
 271     const int a3 = s16 - s25;\
 272     const int d07 = SRC(0) - SRC(7);\
 273     const int d16 = SRC(1) - SRC(6);\
 274     const int d25 = SRC(2) - SRC(5);\
 275     const int d34 = SRC(3) - SRC(4);\
 276     const int a4 = d16 + d25 + (d07 + (d07>>1));\
 277     const int a5 = d07 - d34 - (d25 + (d25>>1));\
 278     const int a6 = d07 + d34 - (d16 + (d16>>1));\
 279     const int a7 = d16 - d25 + (d34 + (d34>>1));\
 280     DST(0) =  a0 + a1     ;\
 281     DST(1) =  a4 + (a7>>2);\
 282     DST(2) =  a2 + (a3>>1);\
 283     DST(3) =  a5 + (a6>>2);\
 284     DST(4) =  a0 - a1     ;\
 285     DST(5) =  a6 - (a5>>2);\
 286     DST(6) = (a2>>1) - a3 ;\
 287     DST(7) = (a4>>2) - a7 ;\
 288 }
 289
 290 static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
 291 {
 292     int i;
 293     int16_t tmp[8][8];
 294
 295     pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 296
 297 #define SRC(x) tmp[x][i]
 298 #define DST(x) tmp[x][i]
 299     for( i = 0; i < 8; i++ )
 300         DCT8_1D
 301 #undef SRC
 302 #undef DST
 303
 304 #define SRC(x) tmp[i][x]
 305 #define DST(x) dct[x][i]
 306     for( i = 0; i < 8; i++ )
 307         DCT8_1D
 308 #undef SRC
 309 #undef DST
 310 }
 311
 312 static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
 313 {
 314     sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
 315     sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
 316     sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 317     sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 318 }
 319
 /* One 8-point 1-D inverse transform pass.
  * The expansion site must define SRC(x) (value of input x) and
  * DST(x, rhs) (store rhs as output x), x in 0..7.  Every SRC() is read
  * into a local before the first DST() is issued, so the pass may run in
  * place. */
 320 #define IDCT8_1D {\
 321     const int a0 =  SRC(0) + SRC(4);\
 322     const int a2 =  SRC(0) - SRC(4);\
 323     const int a4 = (SRC(2)>>1) - SRC(6);\
 324     const int a6 = (SRC(6)>>1) + SRC(2);\
 325     const int b0 = a0 + a6;\
 326     const int b2 = a2 + a4;\
 327     const int b4 = a2 - a4;\
 328     const int b6 = a0 - a6;\
 329     const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
 330     const int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
 331     const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
 332     const int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
 333     const int b1 = (a7>>2) + a1;\
 334     const int b3 =  a3 + (a5>>2);\
 335     const int b5 = (a3>>2) - a5;\
 336     const int b7 =  a7 - (a1>>2);\
 337     DST(0, b0 + b7);\
 338     DST(1, b2 + b5);\
 339     DST(2, b4 + b3);\
 340     DST(3, b6 + b1);\
 341     DST(4, b6 - b1);\
 342     DST(5, b4 - b3);\
 343     DST(6, b2 - b5);\
 344     DST(7, b0 - b7);\
 345 }
 346
 347 static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
 348 {
 349     int i;
 350
 351     dct[0][0] += 32; // rounding for the >>6 at the end
 352
 353 #define SRC(x)     dct[x][i]
 354 #define DST(x,rhs) dct[x][i] = (rhs)
 355     for( i = 0; i < 8; i++ )
 356         IDCT8_1D
 357 #undef SRC
 358 #undef DST
 359
 360 #define SRC(x)     dct[i][x]
 361 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
 362     for( i = 0; i < 8; i++ )
 363         IDCT8_1D
 364 #undef SRC
 365 #undef DST
 366 }
 367
 368 static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
 369 {
 370     add8x8_idct8( &dst[0],               dct[0] );
 371     add8x8_idct8( &dst[8],               dct[1] );
 372     add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
 373     add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
 374 }
 375
 376 static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
 377 {
 378     int i;
 379     dc = (dc + 32) >> 6;
 380     for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
 381     {
 382         p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
 383         p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
 384         p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
 385         p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
 386     }
 387 }
 388
 389 static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
 390 {
 391     add4x4_idct_dc( &p_dst[0],               dct[0][0] );
 392     add4x4_idct_dc( &p_dst[4],               dct[0][1] );
 393     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[1][0] );
 394     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
 395 }
 396
 397 static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] )
 398 {
 399     int i;
 400     for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE )
 401     {
 402         add4x4_idct_dc( &p_dst[ 0], dct[i][0] );
 403         add4x4_idct_dc( &p_dst[ 4], dct[i][1] );
 404         add4x4_idct_dc( &p_dst[ 8], dct[i][2] );
 405         add4x4_idct_dc( &p_dst[12], dct[i][3] );
 406     }
 407 }
 408
 409
 410 /****************************************************************************
 411  * x264_dct_init:
 412  ****************************************************************************/
 /* Fill *dctf with transform function pointers.
  *
  * cpu  : bitmask tested against the X264_CPU_* flags below
  * dctf : table to fill; every member assigned here is first set to the C
  *        implementation from this file
  *
  * The architecture-specific sections then overwrite individual members.
  * Within a section the flag checks are sequential, so an assignment made
  * under a later check replaces one made under an earlier check. */
 413 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
 414 {
     /* Portable C defaults. */
 415     dctf->sub4x4_dct    = sub4x4_dct;
 416     dctf->add4x4_idct   = add4x4_idct;
 417
 418     dctf->sub8x8_dct    = sub8x8_dct;
 419     dctf->sub8x8_dct_dc = sub8x8_dct_dc;
 420     dctf->add8x8_idct   = add8x8_idct;
 421     dctf->add8x8_idct_dc = add8x8_idct_dc;
 422
 423     dctf->sub16x16_dct  = sub16x16_dct;
 424     dctf->add16x16_idct = add16x16_idct;
 425     dctf->add16x16_idct_dc = add16x16_idct_dc;
 426
 427     dctf->sub8x8_dct8   = sub8x8_dct8;
 428     dctf->add8x8_idct8  = add8x8_idct8;
 429
 430     dctf->sub16x16_dct8  = sub16x16_dct8;
 431     dctf->add16x16_idct8 = add16x16_idct8;
 432
 433     dctf->dct4x4dc  = dct4x4dc;
 434     dctf->idct4x4dc = idct4x4dc;
 435
 436 #ifdef HAVE_MMX
 437     if( cpu&X264_CPU_MMX )
 438     {
 439         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
 440         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
 441         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
 442         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
 443         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
 444         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
         /* NOTE(review): an _mmxext routine is installed under the plain MMX
          * check -- confirm this is intended. */
 445         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
 446
         /* These MMX versions are only installed when ARCH_X86_64 is not defined. */
 447 #ifndef ARCH_X86_64
 448         dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
 449         dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
 450         dctf->add8x8_idct   = x264_add8x8_idct_mmx;
 451         dctf->add16x16_idct = x264_add16x16_idct_mmx;
 452
 453         dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
 454         dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
 455         dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
 456         dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
 457 #endif
 458     }
 459
 460     if( cpu&X264_CPU_SSE2 )
 461     {
 462         dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
 463         dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
 464         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
 465         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
 466         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 467
 468         dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
 469         dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
 470         dctf->add8x8_idct   = x264_add8x8_idct_sse2;
 471         dctf->add16x16_idct = x264_add16x16_idct_sse2;
 472         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
 473     }
 474
 475     if( cpu&X264_CPU_SSSE3 )
 476     {
 477         dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
 478         dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
 479         dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
 480         dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
 481         dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
 482         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
 483         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
 484     }
 485
 486     if( cpu&X264_CPU_SSE4 )
 487         dctf->add4x4_idct   = x264_add4x4_idct_sse4;
 488
 489 #endif //HAVE_MMX
 490
 491 #ifdef ARCH_PPC
 492     if( cpu&X264_CPU_ALTIVEC )
 493     {
 494         dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
 495         dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
 496         dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
 497
 498         dctf->add4x4_idct   = x264_add4x4_idct_altivec;
 499         dctf->add8x8_idct   = x264_add8x8_idct_altivec;
 500         dctf->add16x16_idct = x264_add16x16_idct_altivec;
 501
 502         dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
 503         dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
 504
 505         dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
 506         dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
 507     }
 508 #endif
 509
     /* NOTE(review): this section is guarded by HAVE_ARMV6 while the header
      * include at the top of the file is guarded by ARCH_ARM -- confirm the
      * two macros are always defined together. */
 510 #ifdef HAVE_ARMV6
 511     if( cpu&X264_CPU_NEON )
 512     {
 513         dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
 514         dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
 515         dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
 516         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
 517         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
 518         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
 519         dctf->dct4x4dc      = x264_dct4x4dc_neon;
 520         dctf->idct4x4dc     = x264_idct4x4dc_neon;
 521
 522         dctf->add4x4_idct   = x264_add4x4_idct_neon;
 523         dctf->add8x8_idct   = x264_add8x8_idct_neon;
 524         dctf->add16x16_idct = x264_add16x16_idct_neon;
 525
 526         dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
 527         dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
 528
 529         dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
 530         dctf->add16x16_idct8= x264_add16x16_idct8_neon;
 531     }
 532 #endif
 533 }
 534
 535 void x264_dct_init_weights( void )
 536 {
 537     int i, j;
 538     for( j=0; j<2; j++ )
 539     {
 540         for( i=0; i<16; i++ )
 541             x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
 542         for( i=0; i<64; i++ )
 543             x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
 544     }
 545 }
 546
 547
 /* ZIG(i,y,x) for the 8x8 scan functions: copy one coefficient into
  * level[i], reading dct through its first row as a flat array at index
  * x*8+y. */
 548 // gcc pessimizes multi-dimensional arrays here, even with constant indices
 549 #define ZIG(i,y,x) level[i] = dct[0][x*8+y];
 /* 64 ZIG(i,y,x) invocations, i = 0..63, used by the *_8x8_frame functions.
  * What each entry does depends on the ZIG definition in effect where the
  * table is expanded. */
 550 #define ZIGZAG8_FRAME\
 551     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
 552     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
 553     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
 554     ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
 555     ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
 556     ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
 557     ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
 558     ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
 559     ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
 560     ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
 561     ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
 562     ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
 563     ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
 564     ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
 565     ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
 566     ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\
 567
 /* 64 ZIG(i,y,x) invocations, i = 0..63, used by the *_8x8_field functions.
  * What each entry does depends on the ZIG definition in effect where the
  * table is expanded. */
 568 #define ZIGZAG8_FIELD\
 569     ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
 570     ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
 571     ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
 572     ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
 573     ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
 574     ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
 575     ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
 576     ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
 577     ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
 578     ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
 579     ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
 580     ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
 581     ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
 582     ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
 583     ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
 584     ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
 585
 /* 16 entries, i = 0..15, used by the *_4x4*_frame functions.  Entry 0 goes
  * through ZIGDC rather than ZIG so that users of the table can treat it
  * differently from the other fifteen. */
 586 #define ZIGZAG4_FRAME\
 587     ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
 588     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
 589     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
 590     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 591
 /* 16 entries, i = 0..15, used by the *_4x4*_field functions.  Entry 0 goes
  * through ZIGDC rather than ZIG so that users of the table can treat it
  * differently from the other fifteen. */
 592 #define ZIGZAG4_FIELD\
 593     ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
 594     ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
 595     ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
 596     ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
 597
 /* Copy the 64 coefficients of dct into level in the order listed by
  * ZIGZAG8_FRAME (each entry expands to level[i] = dct[0][x*8+y]). */
 598 static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
 599 {
 600     ZIGZAG8_FRAME
 601 }
 602
 /* Copy the 64 coefficients of dct into level in the order listed by
  * ZIGZAG8_FIELD (each entry expands to level[i] = dct[0][x*8+y]). */
 603 static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
 604 {
 605     ZIGZAG8_FIELD
 606 }
 607
 /* Redefine ZIG for the 4x4 scan functions: flat index x*4+y instead of
  * x*8+y.  ZIGDC behaves exactly like ZIG here. */
 608 #undef ZIG
 609 #define ZIG(i,y,x) level[i] = dct[0][x*4+y];
 610 #define ZIGDC(i,y,x) ZIG(i,y,x)
 611
 /* Copy the 16 coefficients of dct into level in the order listed by
  * ZIGZAG4_FRAME (each entry expands to level[i] = dct[0][x*4+y]). */
 612 static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
 613 {
 614     ZIGZAG4_FRAME
 615 }
 616
 /*
  * Copy the 16 coefficients of a 4x4 block into level in field-scan order.
  *
  * Viewing dct as a flat array of 16 values, the output order is
  *     0 1 4 2 3 5 6 7 8 9 10 11 12 13 14 15
  * i.e. only positions 2..5 are permuted; everything else is copied straight
  * through.  This is the same order as the ZIGZAG4_FIELD table.
  *
  * The copies are done element by element as int16_t.  Copying pairs/quads
  * of coefficients through uint32_t* / uint64_t* casts would break the
  * strict-aliasing rule and silently require level and dct to be 4- and
  * 8-byte aligned, which the signature does not promise.
  */
 static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
 {
     const int16_t *coef = &dct[0][0];
     int k;

     level[0] = coef[0];
     level[1] = coef[1];
     level[2] = coef[4];
     level[3] = coef[2];
     level[4] = coef[3];
     level[5] = coef[5];
     for( k = 6; k < 16; k++ )
         level[k] = coef[k];
 }
 625
 626 #undef ZIG
 /* ZIG for the zigzag_sub_* functions: store the pixel difference
  * p_src[x + y*FENC_STRIDE] - p_dst[x + y*FDEC_STRIDE] in level[i] and OR it
  * into nz.  Expects level, p_src, p_dst and nz to be in scope at the
  * expansion site. */
 627 #define ZIG(i,y,x) {\
 628     int oe = x+y*FENC_STRIDE;\
 629     int od = x+y*FDEC_STRIDE;\
 630     level[i] = p_src[oe] - p_dst[od];\
 631     nz |= level[i];\
 632 }
 /* Copy four rows of 4 bytes from p_src (rows FENC_STRIDE apart) to p_dst
  * (rows FDEC_STRIDE apart), one 32-bit access per row. */
 633 #define COPY4x4\
 634     *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
 635     *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
 636     *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
 637     *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
 /* Copy eight rows of 8 bytes from p_src to p_dst, one 64-bit access per
  * row. */
 638 #define COPY8x8\
 639     *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
 640     *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
 641     *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
 642     *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
 643     *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
 644     *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
 645     *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
 646     *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
 647
 /* Write the 4x4 pixel differences p_src - p_dst into level in
  * ZIGZAG4_FRAME order, then copy the 4x4 source block over p_dst.
  * Returns 1 if any difference is non-zero, otherwise 0. */
 648 static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
 649 {
 650     int nz = 0;
 651     ZIGZAG4_FRAME
 652     COPY4x4
 653     return !!nz;
 654 }
 655
 /* Write the 4x4 pixel differences p_src - p_dst into level in
  * ZIGZAG4_FIELD order, then copy the 4x4 source block over p_dst.
  * Returns 1 if any difference is non-zero, otherwise 0. */
 656 static int zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
 657 {
 658     int nz = 0;
 659     ZIGZAG4_FIELD
 660     COPY4x4
 661     return !!nz;
 662 }
 663
 664 #undef ZIGDC
 /* ZIGDC for the zigzag_sub_4x4ac_* functions: the difference at (x,y) is
  * stored in *dc instead of level, level[0] is set to 0, and nz is left
  * untouched.  Expects dc, level, p_src and p_dst in scope. */
 665 #define ZIGDC(i,y,x) {\
 666     int oe = x+y*FENC_STRIDE;\
 667     int od = x+y*FDEC_STRIDE;\
 668     *dc = p_src[oe] - p_dst[od];\
 669     level[0] = 0;\
 670 }
 671
 /* Like zigzag_sub_4x4_frame, but the first table entry is written to *dc
  * and level[0] is zeroed.  The return value (1 if any difference is
  * non-zero, else 0) only reflects level[1..15]. */
 672 static int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
 673 {
 674     int nz = 0;
 675     ZIGZAG4_FRAME
 676     COPY4x4
 677     return !!nz;
 678 }
 679
 /* Like zigzag_sub_4x4_field, but the first table entry is written to *dc
  * and level[0] is zeroed.  The return value (1 if any difference is
  * non-zero, else 0) only reflects level[1..15]. */
 680 static int zigzag_sub_4x4ac_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
 681 {
 682     int nz = 0;
 683     ZIGZAG4_FIELD
 684     COPY4x4
 685     return !!nz;
 686 }
 687
 /* Write the 8x8 pixel differences p_src - p_dst into level in
  * ZIGZAG8_FRAME order, then copy the 8x8 source block over p_dst.
  * Returns 1 if any difference is non-zero, otherwise 0. */
 688 static int zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
 689 {
 690     int nz = 0;
 691     ZIGZAG8_FRAME
 692     COPY8x8
 693     return !!nz;
 694 }
 /* Write the 8x8 pixel differences p_src - p_dst into level in
  * ZIGZAG8_FIELD order, then copy the 8x8 source block over p_dst.
  * Returns 1 if any difference is non-zero, otherwise 0. */
 695 static int zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
 696 {
 697     int nz = 0;
 698     ZIGZAG8_FIELD
 699     COPY8x8
 700     return !!nz;
 701 }
 702
 703 #undef ZIG
 704 #undef COPY4x4
 705
 /*
  * De-interleave 64 coefficients into four runs of 16.
  *
  * Run b (0..3) takes every fourth input starting at src[b] and is written
  * to dst[16*b .. 16*b+15].  For each run, nnz[(b&1) + (b>>1)*8] is set to
  * 1 if the run contains a non-zero value and to 0 otherwise.
  */
 static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
 {
     int blk, k;
     for( blk = 0; blk < 4; blk++ )
     {
         const int16_t *in  = src + blk;
         int16_t       *out = dst + blk*16;
         int any = 0;

         for( k = 0; k < 16; k++ )
         {
             const int16_t coef = in[k*4];
             any |= coef;
             out[k] = coef;
         }
         nnz[(blk&1) + (blk>>1)*8] = !!any;
     }
 }
 720
 /* Fill *pf with zigzag function pointers.
  *
  * cpu          : bitmask tested against the X264_CPU_* flags below
  * pf           : table to fill
  * b_interlaced : non-zero selects the *_field functions, zero the *_frame
  *                functions
  *
  * Each branch installs the C implementations first; the flag checks that
  * follow are sequential, so an assignment under a later check replaces one
  * made under an earlier check. */
 721 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
 722 {
 723     if( b_interlaced )
 724     {
 725         pf->scan_8x8   = zigzag_scan_8x8_field;
 726         pf->scan_4x4   = zigzag_scan_4x4_field;
 727         pf->sub_8x8    = zigzag_sub_8x8_field;
 728         pf->sub_4x4    = zigzag_sub_4x4_field;
 729         pf->sub_4x4ac  = zigzag_sub_4x4ac_field;
 730 #ifdef HAVE_MMX
 731         if( cpu&X264_CPU_MMXEXT )
 732             pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
 733         if( cpu&X264_CPU_SSSE3 )
 734         {
 735             pf->sub_4x4  = x264_zigzag_sub_4x4_field_ssse3;
 736             pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
 737         }
 738 #endif
 739
 740 #ifdef ARCH_PPC
 741         if( cpu&X264_CPU_ALTIVEC )
 742             pf->scan_4x4   = x264_zigzag_scan_4x4_field_altivec;
 743 #endif
 744     }
 745     else
 746     {
 747         pf->scan_8x8   = zigzag_scan_8x8_frame;
 748         pf->scan_4x4   = zigzag_scan_4x4_frame;
 749         pf->sub_8x8    = zigzag_sub_8x8_frame;
 750         pf->sub_4x4    = zigzag_sub_4x4_frame;
 751         pf->sub_4x4ac  = zigzag_sub_4x4ac_frame;
 752 #ifdef HAVE_MMX
 753         if( cpu&X264_CPU_MMX )
 754             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
 755         if( cpu&X264_CPU_MMXEXT )
 756             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
 757         if( cpu&X264_CPU_SSE2_IS_FAST )
 758             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
 759         if( cpu&X264_CPU_SSSE3 )
 760         {
 761             pf->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
 762             pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
 763             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
 764             if( cpu&X264_CPU_SHUFFLE_IS_FAST )
 765                 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
 766         }
 767 #endif
 768
 769 #ifdef ARCH_PPC
 770         if( cpu&X264_CPU_ALTIVEC )
 771             pf->scan_4x4   = x264_zigzag_scan_4x4_frame_altivec;
 772 #endif
 773 #ifdef HAVE_ARMV6
 774         if( cpu&X264_CPU_NEON )
 775             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
 776 #endif
 777     }
 778
     /* The interleave function does not depend on b_interlaced. */
 779     pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
 780 #ifdef HAVE_MMX
 781     if( cpu&X264_CPU_MMX )
 782         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
     /* NOTE(review): the _sse2 routine is selected on SHUFFLE_IS_FAST alone;
      * presumably that flag implies SSE2 -- confirm. */
 783     if( cpu&X264_CPU_SHUFFLE_IS_FAST )
 784         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
 785 #endif
 786 }