git.sesse.net Git - x264/blob - common/dct.c

   1 /*****************************************************************************
   2  * dct.c: h264 encoder library
   3  *****************************************************************************
   4  * Copyright (C) 2003-2008 x264 project
   5  *
   6  * Authors: Loren Merritt <lorenm@u.washington.edu>
   7  *          Laurent Aimar <fenrir@via.ecp.fr>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  *
  19  * You should have received a copy of the GNU General Public License
  20  * along with this program; if not, write to the Free Software
  21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
  22  *****************************************************************************/
  23
  24 #include "common.h"
  25 #ifdef HAVE_MMX
  26 #   include "x86/dct.h"
  27 #endif
  28 #ifdef ARCH_PPC
  29 #   include "ppc/dct.h"
  30 #endif
  31 #ifdef ARCH_ARM
  32 #   include "arm/dct.h"
  33 #endif
  34
/* Per-coefficient weights stored in scan order.  Both tables are filled by
 * x264_dct_init_weights() from x264_dct{4,8}_weight2_tab, indexed through
 * x264_zigzag_scan{4,8}[j]; the first dimension selects which of the two
 * scan tables was used. */
int x264_dct4_weight2_zigzag[2][16];
int x264_dct8_weight2_zigzag[2][64];
  37
  38 static void dct4x4dc( int16_t d[16] )
  39 {
  40     int16_t tmp[16];
  41     int s01, s23;
  42     int d01, d23;
  43     int i;
  44
  45     for( i = 0; i < 4; i++ )
  46     {
  47         s01 = d[i*4+0] + d[i*4+1];
  48         d01 = d[i*4+0] - d[i*4+1];
  49         s23 = d[i*4+2] + d[i*4+3];
  50         d23 = d[i*4+2] - d[i*4+3];
  51
  52         tmp[0*4+i] = s01 + s23;
  53         tmp[1*4+i] = s01 - s23;
  54         tmp[2*4+i] = d01 - d23;
  55         tmp[3*4+i] = d01 + d23;
  56     }
  57
  58     for( i = 0; i < 4; i++ )
  59     {
  60         s01 = tmp[i*4+0] + tmp[i*4+1];
  61         d01 = tmp[i*4+0] - tmp[i*4+1];
  62         s23 = tmp[i*4+2] + tmp[i*4+3];
  63         d23 = tmp[i*4+2] - tmp[i*4+3];
  64
  65         d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
  66         d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
  67         d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
  68         d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
  69     }
  70 }
  71
  72 static void idct4x4dc( int16_t d[16] )
  73 {
  74     int16_t tmp[16];
  75     int s01, s23;
  76     int d01, d23;
  77     int i;
  78
  79     for( i = 0; i < 4; i++ )
  80     {
  81         s01 = d[i*4+0] + d[i*4+1];
  82         d01 = d[i*4+0] - d[i*4+1];
  83         s23 = d[i*4+2] + d[i*4+3];
  84         d23 = d[i*4+2] - d[i*4+3];
  85
  86         tmp[0*4+i] = s01 + s23;
  87         tmp[1*4+i] = s01 - s23;
  88         tmp[2*4+i] = d01 - d23;
  89         tmp[3*4+i] = d01 + d23;
  90     }
  91
  92     for( i = 0; i < 4; i++ )
  93     {
  94         s01 = tmp[i*4+0] + tmp[i*4+1];
  95         d01 = tmp[i*4+0] - tmp[i*4+1];
  96         s23 = tmp[i*4+2] + tmp[i*4+3];
  97         d23 = tmp[i*4+2] - tmp[i*4+3];
  98
  99         d[i*4+0] = s01 + s23;
 100         d[i*4+1] = s01 - s23;
 101         d[i*4+2] = d01 - d23;
 102         d[i*4+3] = d01 + d23;
 103     }
 104 }
 105
 106 static inline void pixel_sub_wxh( int16_t *diff, int i_size,
 107                                   uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
 108 {
 109     int y, x;
 110     for( y = 0; y < i_size; y++ )
 111     {
 112         for( x = 0; x < i_size; x++ )
 113         {
 114             diff[x + y*i_size] = pix1[x] - pix2[x];
 115         }
 116         pix1 += i_pix1;
 117         pix2 += i_pix2;
 118     }
 119 }
 120
 121 static void sub4x4_dct( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
 122 {
 123     int16_t d[16];
 124     int16_t tmp[16];
 125     int i;
 126
 127     pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 128
 129     for( i = 0; i < 4; i++ )
 130     {
 131         const int s03 = d[i*4+0] + d[i*4+3];
 132         const int s12 = d[i*4+1] + d[i*4+2];
 133         const int d03 = d[i*4+0] - d[i*4+3];
 134         const int d12 = d[i*4+1] - d[i*4+2];
 135
 136         tmp[0*4+i] =   s03 +   s12;
 137         tmp[1*4+i] = 2*d03 +   d12;
 138         tmp[2*4+i] =   s03 -   s12;
 139         tmp[3*4+i] =   d03 - 2*d12;
 140     }
 141
 142     for( i = 0; i < 4; i++ )
 143     {
 144         const int s03 = tmp[i*4+0] + tmp[i*4+3];
 145         const int s12 = tmp[i*4+1] + tmp[i*4+2];
 146         const int d03 = tmp[i*4+0] - tmp[i*4+3];
 147         const int d12 = tmp[i*4+1] - tmp[i*4+2];
 148
 149         dct[i*4+0] =   s03 +   s12;
 150         dct[i*4+1] = 2*d03 +   d12;
 151         dct[i*4+2] =   s03 -   s12;
 152         dct[i*4+3] =   d03 - 2*d12;
 153     }
 154 }
 155
 156 static void sub8x8_dct( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
 157 {
 158     sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
 159     sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
 160     sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 161     sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 162 }
 163
 164 static void sub16x16_dct( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
 165 {
 166     sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
 167     sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
 168     sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 169     sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 170 }
 171
 172 static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
 173 {
 174     int16_t d[16];
 175     int sum = 0;
 176
 177     pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 178
 179     sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
 180     sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
 181
 182     return sum;
 183 }
 184
 185 static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
 186 {
 187     dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
 188     dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
 189     dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
 190     dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
 191 }
 192
 193 static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
 194 {
 195     int16_t d[16];
 196     int16_t tmp[16];
 197     int x, y;
 198     int i;
 199
 200     for( i = 0; i < 4; i++ )
 201     {
 202         const int s02 =  dct[0*4+i]     +  dct[2*4+i];
 203         const int d02 =  dct[0*4+i]     -  dct[2*4+i];
 204         const int s13 =  dct[1*4+i]     + (dct[3*4+i]>>1);
 205         const int d13 = (dct[1*4+i]>>1) -  dct[3*4+i];
 206
 207         tmp[i*4+0] = s02 + s13;
 208         tmp[i*4+1] = d02 + d13;
 209         tmp[i*4+2] = d02 - d13;
 210         tmp[i*4+3] = s02 - s13;
 211     }
 212
 213     for( i = 0; i < 4; i++ )
 214     {
 215         const int s02 =  tmp[0*4+i]     +  tmp[2*4+i];
 216         const int d02 =  tmp[0*4+i]     -  tmp[2*4+i];
 217         const int s13 =  tmp[1*4+i]     + (tmp[3*4+i]>>1);
 218         const int d13 = (tmp[1*4+i]>>1) -  tmp[3*4+i];
 219
 220         d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
 221         d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
 222         d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
 223         d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
 224     }
 225
 226
 227     for( y = 0; y < 4; y++ )
 228     {
 229         for( x = 0; x < 4; x++ )
 230             p_dst[x] = x264_clip_uint8( p_dst[x] + d[y*4+x] );
 231         p_dst += FDEC_STRIDE;
 232     }
 233 }
 234
 235 static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][16] )
 236 {
 237     add4x4_idct( &p_dst[0],               dct[0] );
 238     add4x4_idct( &p_dst[4],               dct[1] );
 239     add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
 240     add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 241 }
 242
 243 static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][16] )
 244 {
 245     add8x8_idct( &p_dst[0],               &dct[0] );
 246     add8x8_idct( &p_dst[8],               &dct[4] );
 247     add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
 248     add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
 249 }
 250
 251 /****************************************************************************
 252  * 8x8 transform:
 253  ****************************************************************************/
 254
 255 #define DCT8_1D {\
 256     const int s07 = SRC(0) + SRC(7);\
 257     const int s16 = SRC(1) + SRC(6);\
 258     const int s25 = SRC(2) + SRC(5);\
 259     const int s34 = SRC(3) + SRC(4);\
 260     const int a0 = s07 + s34;\
 261     const int a1 = s16 + s25;\
 262     const int a2 = s07 - s34;\
 263     const int a3 = s16 - s25;\
 264     const int d07 = SRC(0) - SRC(7);\
 265     const int d16 = SRC(1) - SRC(6);\
 266     const int d25 = SRC(2) - SRC(5);\
 267     const int d34 = SRC(3) - SRC(4);\
 268     const int a4 = d16 + d25 + (d07 + (d07>>1));\
 269     const int a5 = d07 - d34 - (d25 + (d25>>1));\
 270     const int a6 = d07 + d34 - (d16 + (d16>>1));\
 271     const int a7 = d16 - d25 + (d34 + (d34>>1));\
 272     DST(0) =  a0 + a1     ;\
 273     DST(1) =  a4 + (a7>>2);\
 274     DST(2) =  a2 + (a3>>1);\
 275     DST(3) =  a5 + (a6>>2);\
 276     DST(4) =  a0 - a1     ;\
 277     DST(5) =  a6 - (a5>>2);\
 278     DST(6) = (a2>>1) - a3 ;\
 279     DST(7) = (a4>>2) - a7 ;\
 280 }
 281
 282 static void sub8x8_dct8( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
 283 {
 284     int i;
 285     int16_t tmp[64];
 286
 287     pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
 288
 289 #define SRC(x) tmp[x*8+i]
 290 #define DST(x) tmp[x*8+i]
 291     for( i = 0; i < 8; i++ )
 292         DCT8_1D
 293 #undef SRC
 294 #undef DST
 295
 296 #define SRC(x) tmp[i*8+x]
 297 #define DST(x) dct[x*8+i]
 298     for( i = 0; i < 8; i++ )
 299         DCT8_1D
 300 #undef SRC
 301 #undef DST
 302 }
 303
 304 static void sub16x16_dct8( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
 305 {
 306     sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
 307     sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
 308     sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
 309     sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
 310 }
 311
 312 #define IDCT8_1D {\
 313     const int a0 =  SRC(0) + SRC(4);\
 314     const int a2 =  SRC(0) - SRC(4);\
 315     const int a4 = (SRC(2)>>1) - SRC(6);\
 316     const int a6 = (SRC(6)>>1) + SRC(2);\
 317     const int b0 = a0 + a6;\
 318     const int b2 = a2 + a4;\
 319     const int b4 = a2 - a4;\
 320     const int b6 = a0 - a6;\
 321     const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
 322     const int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
 323     const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
 324     const int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
 325     const int b1 = (a7>>2) + a1;\
 326     const int b3 =  a3 + (a5>>2);\
 327     const int b5 = (a3>>2) - a5;\
 328     const int b7 =  a7 - (a1>>2);\
 329     DST(0, b0 + b7);\
 330     DST(1, b2 + b5);\
 331     DST(2, b4 + b3);\
 332     DST(3, b6 + b1);\
 333     DST(4, b6 - b1);\
 334     DST(5, b4 - b3);\
 335     DST(6, b2 - b5);\
 336     DST(7, b0 - b7);\
 337 }
 338
 339 static void add8x8_idct8( uint8_t *dst, int16_t dct[64] )
 340 {
 341     int i;
 342
 343     dct[0] += 32; // rounding for the >>6 at the end
 344
 345 #define SRC(x)     dct[x*8+i]
 346 #define DST(x,rhs) dct[x*8+i] = (rhs)
 347     for( i = 0; i < 8; i++ )
 348         IDCT8_1D
 349 #undef SRC
 350 #undef DST
 351
 352 #define SRC(x)     dct[i*8+x]
 353 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
 354     for( i = 0; i < 8; i++ )
 355         IDCT8_1D
 356 #undef SRC
 357 #undef DST
 358 }
 359
 360 static void add16x16_idct8( uint8_t *dst, int16_t dct[4][64] )
 361 {
 362     add8x8_idct8( &dst[0],               dct[0] );
 363     add8x8_idct8( &dst[8],               dct[1] );
 364     add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
 365     add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
 366 }
 367
 368 static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
 369 {
 370     int i;
 371     dc = (dc + 32) >> 6;
 372     for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
 373     {
 374         p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
 375         p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
 376         p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
 377         p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
 378     }
 379 }
 380
 381 static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[4] )
 382 {
 383     add4x4_idct_dc( &p_dst[0],               dct[0] );
 384     add4x4_idct_dc( &p_dst[4],               dct[1] );
 385     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
 386     add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
 387 }
 388
 389 static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[16] )
 390 {
 391     int i;
 392     for( i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
 393     {
 394         add4x4_idct_dc( &p_dst[ 0], dct[0] );
 395         add4x4_idct_dc( &p_dst[ 4], dct[1] );
 396         add4x4_idct_dc( &p_dst[ 8], dct[2] );
 397         add4x4_idct_dc( &p_dst[12], dct[3] );
 398     }
 399 }
 400
 401
 402 /****************************************************************************
 403  * x264_dct_init:
 404  ****************************************************************************/
 405 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
 406 {
 407     dctf->sub4x4_dct    = sub4x4_dct;
 408     dctf->add4x4_idct   = add4x4_idct;
 409
 410     dctf->sub8x8_dct    = sub8x8_dct;
 411     dctf->sub8x8_dct_dc = sub8x8_dct_dc;
 412     dctf->add8x8_idct   = add8x8_idct;
 413     dctf->add8x8_idct_dc = add8x8_idct_dc;
 414
 415     dctf->sub16x16_dct  = sub16x16_dct;
 416     dctf->add16x16_idct = add16x16_idct;
 417     dctf->add16x16_idct_dc = add16x16_idct_dc;
 418
 419     dctf->sub8x8_dct8   = sub8x8_dct8;
 420     dctf->add8x8_idct8  = add8x8_idct8;
 421
 422     dctf->sub16x16_dct8  = sub16x16_dct8;
 423     dctf->add16x16_idct8 = add16x16_idct8;
 424
 425     dctf->dct4x4dc  = dct4x4dc;
 426     dctf->idct4x4dc = idct4x4dc;
 427
 428 #ifdef HAVE_MMX
 429     if( cpu&X264_CPU_MMX )
 430     {
 431         dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
 432         dctf->add4x4_idct   = x264_add4x4_idct_mmx;
 433         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
 434         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
 435         dctf->dct4x4dc      = x264_dct4x4dc_mmx;
 436         dctf->idct4x4dc     = x264_idct4x4dc_mmx;
 437         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
 438
 439 #ifndef ARCH_X86_64
 440         dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
 441         dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
 442         dctf->add8x8_idct   = x264_add8x8_idct_mmx;
 443         dctf->add16x16_idct = x264_add16x16_idct_mmx;
 444
 445         dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
 446         dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
 447         dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
 448         dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
 449 #endif
 450     }
 451
 452     if( cpu&X264_CPU_SSE2 )
 453     {
 454         dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
 455         dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
 456         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
 457         dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
 458         dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
 459
 460         dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
 461         dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
 462         dctf->add8x8_idct   = x264_add8x8_idct_sse2;
 463         dctf->add16x16_idct = x264_add16x16_idct_sse2;
 464         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
 465     }
 466
 467     if( cpu&X264_CPU_SSSE3 )
 468     {
 469         dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
 470         dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
 471         dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
 472         dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
 473         dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
 474         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
 475         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
 476     }
 477
 478     if( cpu&X264_CPU_SSE4 )
 479         dctf->add4x4_idct   = x264_add4x4_idct_sse4;
 480
 481 #endif //HAVE_MMX
 482
 483 #ifdef ARCH_PPC
 484     if( cpu&X264_CPU_ALTIVEC )
 485     {
 486         dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
 487         dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
 488         dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
 489
 490         dctf->add4x4_idct   = x264_add4x4_idct_altivec;
 491         dctf->add8x8_idct   = x264_add8x8_idct_altivec;
 492         dctf->add16x16_idct = x264_add16x16_idct_altivec;
 493
 494         dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
 495         dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
 496
 497         dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
 498         dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
 499     }
 500 #endif
 501
 502 #ifdef HAVE_ARMV6
 503     if( cpu&X264_CPU_NEON )
 504     {
 505         dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
 506         dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
 507         dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
 508         dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
 509         dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
 510         dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
 511         dctf->dct4x4dc      = x264_dct4x4dc_neon;
 512         dctf->idct4x4dc     = x264_idct4x4dc_neon;
 513
 514         dctf->add4x4_idct   = x264_add4x4_idct_neon;
 515         dctf->add8x8_idct   = x264_add8x8_idct_neon;
 516         dctf->add16x16_idct = x264_add16x16_idct_neon;
 517
 518         dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
 519         dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
 520
 521         dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
 522         dctf->add16x16_idct8= x264_add16x16_idct8_neon;
 523     }
 524 #endif
 525 }
 526
 527 void x264_dct_init_weights( void )
 528 {
 529     int i, j;
 530     for( j=0; j<2; j++ )
 531     {
 532         for( i=0; i<16; i++ )
 533             x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
 534         for( i=0; i<64; i++ )
 535             x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
 536     }
 537 }
 538
 539
 540 #define ZIG(i,y,x) level[i] = dct[x*8+y];
 541 #define ZIGZAG8_FRAME\
 542     ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
 543     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
 544     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
 545     ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
 546     ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
 547     ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
 548     ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
 549     ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
 550     ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
 551     ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
 552     ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
 553     ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
 554     ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
 555     ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
 556     ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
 557     ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\
 558
 559 #define ZIGZAG8_FIELD\
 560     ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
 561     ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
 562     ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
 563     ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
 564     ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
 565     ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
 566     ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
 567     ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
 568     ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
 569     ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
 570     ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
 571     ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
 572     ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
 573     ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
 574     ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
 575     ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
 576
 577 #define ZIGZAG4_FRAME\
 578     ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
 579     ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
 580     ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
 581     ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
 582
 583 #define ZIGZAG4_FIELD\
 584     ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
 585     ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
 586     ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
 587     ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
 588
 589 static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[64] )
 590 {
 591     ZIGZAG8_FRAME
 592 }
 593
 594 static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[64] )
 595 {
 596     ZIGZAG8_FIELD
 597 }
 598
 599 #undef ZIG
 600 #define ZIG(i,y,x) level[i] = dct[x*4+y];
 601 #define ZIGDC(i,y,x) ZIG(i,y,x)
 602
 603 static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[16] )
 604 {
 605     ZIGZAG4_FRAME
 606 }
 607
 608 static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
 609 {
 610     CP32( level, dct );
 611     ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
 612     CP32( level+6, dct+6 );
 613     CP64( level+8, dct+8 );
 614     CP64( level+12, dct+12 );
 615 }
 616
 617 #undef ZIG
 618 #define ZIG(i,y,x) {\
 619     int oe = x+y*FENC_STRIDE;\
 620     int od = x+y*FDEC_STRIDE;\
 621     level[i] = p_src[oe] - p_dst[od];\
 622     nz |= level[i];\
 623 }
 624 #define COPY4x4\
 625     CP32( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
 626     CP32( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
 627     CP32( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
 628     CP32( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
 629 #define COPY8x8\
 630     CP64( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
 631     CP64( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
 632     CP64( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
 633     CP64( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
 634     CP64( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
 635     CP64( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
 636     CP64( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
 637     CP64( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
 638
 639 static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
 640 {
 641     int nz = 0;
 642     ZIGZAG4_FRAME
 643     COPY4x4
 644     return !!nz;
 645 }
 646
 647 static int zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
 648 {
 649     int nz = 0;
 650     ZIGZAG4_FIELD
 651     COPY4x4
 652     return !!nz;
 653 }
 654
 655 #undef ZIGDC
 656 #define ZIGDC(i,y,x) {\
 657     int oe = x+y*FENC_STRIDE;\
 658     int od = x+y*FDEC_STRIDE;\
 659     *dc = p_src[oe] - p_dst[od];\
 660     level[0] = 0;\
 661 }
 662
 663 static int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
 664 {
 665     int nz = 0;
 666     ZIGZAG4_FRAME
 667     COPY4x4
 668     return !!nz;
 669 }
 670
 671 static int zigzag_sub_4x4ac_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
 672 {
 673     int nz = 0;
 674     ZIGZAG4_FIELD
 675     COPY4x4
 676     return !!nz;
 677 }
 678
 679 static int zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
 680 {
 681     int nz = 0;
 682     ZIGZAG8_FRAME
 683     COPY8x8
 684     return !!nz;
 685 }
 686 static int zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
 687 {
 688     int nz = 0;
 689     ZIGZAG8_FIELD
 690     COPY8x8
 691     return !!nz;
 692 }
 693
/* Retire the scan helpers that are no longer needed below.
 * NOTE(review): COPY8x8, ZIGDC and the ZIGZAG* tables are not undefined
 * here and stay visible for the rest of the translation unit. */
#undef ZIG
#undef COPY4x4
 696
 697 static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
 698 {
 699     int i,j;
 700     for( i=0; i<4; i++ )
 701     {
 702         int nz = 0;
 703         for( j=0; j<16; j++ )
 704         {
 705             nz |= src[i+j*4];
 706             dst[i*16+j] = src[i+j*4];
 707         }
 708         nnz[(i&1) + (i>>1)*8] = !!nz;
 709     }
 710 }
 711
 712 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
 713 {
 714     if( b_interlaced )
 715     {
 716         pf->scan_8x8   = zigzag_scan_8x8_field;
 717         pf->scan_4x4   = zigzag_scan_4x4_field;
 718         pf->sub_8x8    = zigzag_sub_8x8_field;
 719         pf->sub_4x4    = zigzag_sub_4x4_field;
 720         pf->sub_4x4ac  = zigzag_sub_4x4ac_field;
 721 #ifdef HAVE_MMX
 722         if( cpu&X264_CPU_MMXEXT )
 723             pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
 724         if( cpu&X264_CPU_SSSE3 )
 725         {
 726             pf->sub_4x4  = x264_zigzag_sub_4x4_field_ssse3;
 727             pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
 728             pf->scan_8x8 = x264_zigzag_scan_8x8_field_ssse3;
 729         }
 730 #endif
 731
 732 #ifdef ARCH_PPC
 733         if( cpu&X264_CPU_ALTIVEC )
 734             pf->scan_4x4   = x264_zigzag_scan_4x4_field_altivec;
 735 #endif
 736     }
 737     else
 738     {
 739         pf->scan_8x8   = zigzag_scan_8x8_frame;
 740         pf->scan_4x4   = zigzag_scan_4x4_frame;
 741         pf->sub_8x8    = zigzag_sub_8x8_frame;
 742         pf->sub_4x4    = zigzag_sub_4x4_frame;
 743         pf->sub_4x4ac  = zigzag_sub_4x4ac_frame;
 744 #ifdef HAVE_MMX
 745         if( cpu&X264_CPU_MMX )
 746             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
 747         if( cpu&X264_CPU_MMXEXT )
 748             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
 749         if( cpu&X264_CPU_SSE2_IS_FAST )
 750             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
 751         if( cpu&X264_CPU_SSSE3 )
 752         {
 753             pf->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
 754             pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
 755             pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
 756             if( cpu&X264_CPU_SHUFFLE_IS_FAST )
 757                 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
 758         }
 759 #endif
 760
 761 #ifdef ARCH_PPC
 762         if( cpu&X264_CPU_ALTIVEC )
 763             pf->scan_4x4   = x264_zigzag_scan_4x4_frame_altivec;
 764 #endif
 765 #ifdef HAVE_ARMV6
 766         if( cpu&X264_CPU_NEON )
 767             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
 768 #endif
 769     }
 770
 771     pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
 772 #ifdef HAVE_MMX
 773     if( cpu&X264_CPU_MMX )
 774         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
 775     if( cpu&X264_CPU_SHUFFLE_IS_FAST )
 776         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
 777 #endif
 778 }