1 /*****************************************************************************
2 * dct.c: transform and zigzag
3 *****************************************************************************
4 * Copyright (C) 2003-2012 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
8 * Henrik Gramner <hengar-6@student.ltu.se>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
39 /* the inverse of the scaling factors introduced by 8x8 fdct */
40 /* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
41 #define W(i) (i==0 ? FIX8(1.0000) :\
42 i==1 ? FIX8(0.8859) :\
43 i==2 ? FIX8(1.6000) :\
44 i==3 ? FIX8(0.9415) :\
45 i==4 ? FIX8(1.2651) :\
46 i==5 ? FIX8(1.1910) :0)
47 const uint32_t x264_dct8_weight_tab[64] = {
48 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
49 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
50 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
51 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
53 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
54 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
55 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
56 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
60 #define W(i) (i==0 ? FIX8(1.76777) :\
61 i==1 ? FIX8(1.11803) :\
62 i==2 ? FIX8(0.70711) :0)
63 const uint32_t x264_dct4_weight_tab[16] = {
64 W(0), W(1), W(0), W(1),
65 W(1), W(2), W(1), W(2),
66 W(0), W(1), W(0), W(1),
67 W(1), W(2), W(1), W(2)
72 #define W(i) (i==0 ? FIX8(3.125) :\
75 const uint32_t x264_dct4_weight2_tab[16] = {
76 W(0), W(1), W(0), W(1),
77 W(1), W(2), W(1), W(2),
78 W(0), W(1), W(0), W(1),
79 W(1), W(2), W(1), W(2)
83 #define W(i) (i==0 ? FIX8(1.00000) :\
84 i==1 ? FIX8(0.78487) :\
85 i==2 ? FIX8(2.56132) :\
86 i==3 ? FIX8(0.88637) :\
87 i==4 ? FIX8(1.60040) :\
88 i==5 ? FIX8(1.41850) :0)
89 const uint32_t x264_dct8_weight2_tab[64] = {
90 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
91 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
92 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
93 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
95 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
96 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
97 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
98 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
103 static void dct4x4dc( dctcoef d[16] )
107 for( int i = 0; i < 4; i++ )
109 int s01 = d[i*4+0] + d[i*4+1];
110 int d01 = d[i*4+0] - d[i*4+1];
111 int s23 = d[i*4+2] + d[i*4+3];
112 int d23 = d[i*4+2] - d[i*4+3];
114 tmp[0*4+i] = s01 + s23;
115 tmp[1*4+i] = s01 - s23;
116 tmp[2*4+i] = d01 - d23;
117 tmp[3*4+i] = d01 + d23;
120 for( int i = 0; i < 4; i++ )
122 int s01 = tmp[i*4+0] + tmp[i*4+1];
123 int d01 = tmp[i*4+0] - tmp[i*4+1];
124 int s23 = tmp[i*4+2] + tmp[i*4+3];
125 int d23 = tmp[i*4+2] - tmp[i*4+3];
127 d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
128 d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
129 d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
130 d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
134 static void idct4x4dc( dctcoef d[16] )
138 for( int i = 0; i < 4; i++ )
140 int s01 = d[i*4+0] + d[i*4+1];
141 int d01 = d[i*4+0] - d[i*4+1];
142 int s23 = d[i*4+2] + d[i*4+3];
143 int d23 = d[i*4+2] - d[i*4+3];
145 tmp[0*4+i] = s01 + s23;
146 tmp[1*4+i] = s01 - s23;
147 tmp[2*4+i] = d01 - d23;
148 tmp[3*4+i] = d01 + d23;
151 for( int i = 0; i < 4; i++ )
153 int s01 = tmp[i*4+0] + tmp[i*4+1];
154 int d01 = tmp[i*4+0] - tmp[i*4+1];
155 int s23 = tmp[i*4+2] + tmp[i*4+3];
156 int d23 = tmp[i*4+2] - tmp[i*4+3];
158 d[i*4+0] = s01 + s23;
159 d[i*4+1] = s01 - s23;
160 d[i*4+2] = d01 - d23;
161 d[i*4+3] = d01 + d23;
/* 2x4 Hadamard transform of the DC coefficients of eight 4x4 blocks
 * (used for 4:2:2 chroma). Gathers dct4x4[i][0] and forms the first
 * butterfly stage: a0-a3 are pairwise sums, a4-a7 pairwise differences.
 * NOTE(review): the rest of the body (second butterfly stage, stores into
 * dct[0..7], and clearing of the per-block DCs) appears to have been lost
 * in extraction — the original line numbering jumps from 174 to 201. */
static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
int a0 = dct4x4[0][0] + dct4x4[1][0];
int a1 = dct4x4[2][0] + dct4x4[3][0];
int a2 = dct4x4[4][0] + dct4x4[5][0];
int a3 = dct4x4[6][0] + dct4x4[7][0];
int a4 = dct4x4[0][0] - dct4x4[1][0];
int a5 = dct4x4[2][0] - dct4x4[3][0];
int a6 = dct4x4[4][0] - dct4x4[5][0];
int a7 = dct4x4[6][0] - dct4x4[7][0];
201 static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
202 pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
204 for( int y = 0; y < i_size; y++ )
206 for( int x = 0; x < i_size; x++ )
207 diff[x + y*i_size] = pix1[x] - pix2[x];
213 static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
218 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
220 for( int i = 0; i < 4; i++ )
222 int s03 = d[i*4+0] + d[i*4+3];
223 int s12 = d[i*4+1] + d[i*4+2];
224 int d03 = d[i*4+0] - d[i*4+3];
225 int d12 = d[i*4+1] - d[i*4+2];
227 tmp[0*4+i] = s03 + s12;
228 tmp[1*4+i] = 2*d03 + d12;
229 tmp[2*4+i] = s03 - s12;
230 tmp[3*4+i] = d03 - 2*d12;
233 for( int i = 0; i < 4; i++ )
235 int s03 = tmp[i*4+0] + tmp[i*4+3];
236 int s12 = tmp[i*4+1] + tmp[i*4+2];
237 int d03 = tmp[i*4+0] - tmp[i*4+3];
238 int d12 = tmp[i*4+1] - tmp[i*4+2];
240 dct[i*4+0] = s03 + s12;
241 dct[i*4+1] = 2*d03 + d12;
242 dct[i*4+2] = s03 - s12;
243 dct[i*4+3] = d03 - 2*d12;
247 static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
249 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
250 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
251 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
252 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
255 static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
257 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
258 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
259 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
260 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
263 static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
266 for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
267 sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
268 - pix2[0] - pix2[1] - pix2[2] - pix2[3];
272 static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
274 dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
275 dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
276 dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
277 dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
279 /* 2x2 DC transform */
280 int d0 = dct[0] + dct[1];
281 int d1 = dct[2] + dct[3];
282 int d2 = dct[0] - dct[1];
283 int d3 = dct[2] - dct[3];
/* DC-only transform of an 8x16 residual (4:2:2 chroma): eight per-4x4 DC
 * sums, then a 2x4 Hadamard transform into dct[0..7].
 * NOTE(review): the 2x4 transform body is missing here — the original line
 * numbering jumps from 301 to 328, so the butterfly/store code was lost in
 * extraction and must be restored before this compiles. */
static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );
/* 2x4 DC transform */
328 static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
333 for( int i = 0; i < 4; i++ )
335 int s02 = dct[0*4+i] + dct[2*4+i];
336 int d02 = dct[0*4+i] - dct[2*4+i];
337 int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
338 int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
340 tmp[i*4+0] = s02 + s13;
341 tmp[i*4+1] = d02 + d13;
342 tmp[i*4+2] = d02 - d13;
343 tmp[i*4+3] = s02 - s13;
346 for( int i = 0; i < 4; i++ )
348 int s02 = tmp[0*4+i] + tmp[2*4+i];
349 int d02 = tmp[0*4+i] - tmp[2*4+i];
350 int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
351 int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
353 d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
354 d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
355 d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
356 d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
360 for( int y = 0; y < 4; y++ )
362 for( int x = 0; x < 4; x++ )
363 p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
364 p_dst += FDEC_STRIDE;
368 static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
370 add4x4_idct( &p_dst[0], dct[0] );
371 add4x4_idct( &p_dst[4], dct[1] );
372 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
373 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
376 static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
378 add8x8_idct( &p_dst[0], &dct[0] );
379 add8x8_idct( &p_dst[8], &dct[4] );
380 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
381 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/

/* One 8-point pass of the H.264 8x8 forward transform. SRC(x)/DST(x) are
 * defined by the caller to select row or column access. */
#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
415 static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
419 pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
421 #define SRC(x) tmp[x*8+i]
422 #define DST(x) tmp[x*8+i]
423 for( int i = 0; i < 8; i++ )
428 #define SRC(x) tmp[i*8+x]
429 #define DST(x) dct[x*8+i]
430 for( int i = 0; i < 8; i++ )
436 static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
438 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
439 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
440 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
441 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* One 8-point pass of the H.264 8x8 inverse transform. SRC(x) reads a
 * coefficient; DST(x,rhs) stores the result (the second pass clips and adds
 * to the prediction). */
#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
471 static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
473 dct[0] += 32; // rounding for the >>6 at the end
475 #define SRC(x) dct[x*8+i]
476 #define DST(x,rhs) dct[x*8+i] = (rhs)
477 for( int i = 0; i < 8; i++ )
482 #define SRC(x) dct[i*8+x]
483 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
484 for( int i = 0; i < 8; i++ )
490 static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
492 add8x8_idct8( &dst[0], dct[0] );
493 add8x8_idct8( &dst[8], dct[1] );
494 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
495 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
498 static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
501 for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
503 p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
504 p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
505 p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
506 p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
510 static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
512 add4x4_idct_dc( &p_dst[0], dct[0] );
513 add4x4_idct_dc( &p_dst[4], dct[1] );
514 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
515 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
518 static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
520 for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
522 add4x4_idct_dc( &p_dst[ 0], dct[0] );
523 add4x4_idct_dc( &p_dst[ 4], dct[1] );
524 add4x4_idct_dc( &p_dst[ 8], dct[2] );
525 add4x4_idct_dc( &p_dst[12], dct[3] );
530 /****************************************************************************
532 ****************************************************************************/
/* Fill the DCT function table: C fallbacks first, then per-CPU overrides
 * (x86 MMX/SSE2/SSSE3/SSE4/AVX/XOP, PPC AltiVec, ARM NEON), split by
 * HIGH_BIT_DEPTH.
 * NOTE(review): this function's opening/closing braces and its preprocessor
 * guards (apparently #if HAVE_MMX / #if !ARCH_X86_64 / #if HAVE_ALTIVEC /
 * #if HAVE_ARMV6 and matching #endif lines, plus the if-body braces) were
 * lost in extraction — the internal line numbering jumps repeatedly. The
 * statements below are preserved verbatim; the structure must be restored
 * from upstream before this compiles. */
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
    /* portable C implementations — always installed first */
    dctf->sub4x4_dct = sub4x4_dct;
    dctf->add4x4_idct = add4x4_idct;
    dctf->sub8x8_dct = sub8x8_dct;
    dctf->sub8x8_dct_dc = sub8x8_dct_dc;
    dctf->add8x8_idct = add8x8_idct;
    dctf->add8x8_idct_dc = add8x8_idct_dc;
    dctf->sub8x16_dct_dc = sub8x16_dct_dc;
    dctf->sub16x16_dct = sub16x16_dct;
    dctf->add16x16_idct = add16x16_idct;
    dctf->add16x16_idct_dc = add16x16_idct_dc;
    dctf->sub8x8_dct8 = sub8x8_dct8;
    dctf->add8x8_idct8 = add8x8_idct8;
    dctf->sub16x16_dct8 = sub16x16_dct8;
    dctf->add16x16_idct8 = add16x16_idct8;
    dctf->dct4x4dc = dct4x4dc;
    dctf->idct4x4dc = idct4x4dc;
    dctf->dct2x4dc = dct2x4dc;
    /* high-bit-depth x86 overrides */
    if( cpu&X264_CPU_MMX )
        dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
        dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
    if( cpu&X264_CPU_SSE2 )
        dctf->add4x4_idct = x264_add4x4_idct_sse2;
        dctf->dct4x4dc = x264_dct4x4dc_sse2;
        dctf->idct4x4dc = x264_idct4x4dc_sse2;
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
        dctf->add8x8_idct = x264_add8x8_idct_sse2;
        dctf->add16x16_idct = x264_add16x16_idct_sse2;
        dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
        dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
    if( cpu&X264_CPU_SSE4 )
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
    if( cpu&X264_CPU_AVX )
        dctf->add4x4_idct = x264_add4x4_idct_avx;
        dctf->dct4x4dc = x264_dct4x4dc_avx;
        dctf->idct4x4dc = x264_idct4x4dc_avx;
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
        dctf->add8x8_idct = x264_add8x8_idct_avx;
        dctf->add16x16_idct = x264_add16x16_idct_avx;
        dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx;
        dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
#else // !HIGH_BIT_DEPTH
    /* 8-bit-depth x86 overrides */
    if( cpu&X264_CPU_MMX )
        dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
        dctf->add4x4_idct = x264_add4x4_idct_mmx;
        dctf->dct4x4dc = x264_dct4x4dc_mmx;
        dctf->idct4x4dc = x264_idct4x4dc_mmx;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
        /* presumably guarded by #if !ARCH_X86_64 in the original — verify */
        dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
        dctf->add8x8_idct = x264_add8x8_idct_mmx;
        dctf->add16x16_idct = x264_add16x16_idct_mmx;
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
        dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
        dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
    if( cpu&X264_CPU_MMX2 )
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
    if( cpu&X264_CPU_SSE2 )
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
        dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
        dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
        dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
        dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
        dctf->add8x8_idct = x264_add8x8_idct_sse2;
        dctf->add16x16_idct = x264_add16x16_idct_sse2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
    /* SSSE3 versions are skipped on Atom, where they are slower */
    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
        dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
        dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
        dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
    if( cpu&X264_CPU_SSE4 )
        dctf->add4x4_idct = x264_add4x4_idct_sse4;
    if( cpu&X264_CPU_AVX )
        dctf->add4x4_idct = x264_add4x4_idct_avx;
        dctf->add8x8_idct = x264_add8x8_idct_avx;
        dctf->add16x16_idct = x264_add16x16_idct_avx;
        dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
        dctf->sub8x8_dct = x264_sub8x8_dct_avx;
        dctf->sub16x16_dct = x264_sub16x16_dct_avx;
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
    if( cpu&X264_CPU_XOP )
        dctf->sub8x8_dct = x264_sub8x8_dct_xop;
        dctf->sub16x16_dct = x264_sub16x16_dct_xop;
    /* PowerPC overrides */
    if( cpu&X264_CPU_ALTIVEC )
        dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
        dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
        dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
        dctf->add4x4_idct = x264_add4x4_idct_altivec;
        dctf->add8x8_idct = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
        dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
        dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
    /* ARM overrides */
    if( cpu&X264_CPU_NEON )
        dctf->sub4x4_dct = x264_sub4x4_dct_neon;
        dctf->sub8x8_dct = x264_sub8x8_dct_neon;
        dctf->sub16x16_dct = x264_sub16x16_dct_neon;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
        dctf->dct4x4dc = x264_dct4x4dc_neon;
        dctf->idct4x4dc = x264_idct4x4dc_neon;
        dctf->add4x4_idct = x264_add4x4_idct_neon;
        dctf->add8x8_idct = x264_add8x8_idct_neon;
        dctf->add16x16_idct = x264_add16x16_idct_neon;
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
        dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
        dctf->add16x16_idct8= x264_add16x16_idct8_neon;
#endif // HIGH_BIT_DEPTH
/* ZIG(i,y,x): copy coefficient (row y, col x) of the transposed-layout 8x8
 * dct into scan position i. ZIGZAG8_FRAME is the progressive (frame) 8x8
 * zigzag order of H.264. The final line must NOT end in a backslash or the
 * continuation would swallow the following #define. */
#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
/* Interlaced (field) 8x8 scan order of H.264: heavily column-biased near the
 * start because vertical frequencies dominate in field-coded macroblocks. */
#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
/* 4x4 scan orders; position 0 uses ZIGDC so the sub_4x4ac variants can
 * divert the DC coefficient. */
#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

/* field variant: column-first, as for the 8x8 field scan */
#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
781 static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
786 static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
/* Switch ZIG to 4x4 addressing; the previous 8x8 definition must be removed
 * first. In the plain scan functions ZIGDC behaves exactly like ZIG. */
#undef ZIG
#define ZIG(i,y,x) level[i] = dct[x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)
795 static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
800 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
802 memcpy( level, dct, 2 * sizeof(dctcoef) );
803 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
804 memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
/* Fused subtract+zigzag: ZIG now computes the residual directly from the
 * encoded (p_src) and predicted (p_dst) planes, accumulating a nonzero flag
 * into the caller's `nz`. The COPY macros then overwrite the prediction with
 * the encoded pixels (CPPIXEL_X4 is a 4-pixel copy defined elsewhere). */
#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
830 static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
838 static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
/* AC variant: ZIGDC diverts the DC residual to *dc and zeroes scan
 * position 0, so `level` carries only the AC coefficients. The previous
 * ZIGDC definition must be removed before redefining. */
#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}
854 static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
862 static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
870 static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
877 static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
888 static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
890 for( int i = 0; i < 4; i++ )
893 for( int j = 0; j < 16; j++ )
896 dst[i*16+j] = src[i+j*4];
898 nnz[(i&1) + (i>>1)*8] = !!nz;
/* Fill both zigzag function tables (progressive and interlaced): C fallbacks
 * first, then per-CPU overrides.
 * NOTE(review): as with x264_dct_init, the function braces and the
 * preprocessor guards (apparently #if HIGH_BIT_DEPTH / #if HAVE_MMX /
 * #if ARCH_X86_64 / #if HAVE_ALTIVEC / #if HAVE_ARMV6 with their #endif
 * lines, plus if-body braces) were lost in extraction; only #endif markers
 * at the end survive. Statements preserved verbatim; restore structure from
 * upstream before compiling. */
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
    /* portable C implementations */
    pf_interlaced->scan_8x8 = zigzag_scan_8x8_field;
    pf_progressive->scan_8x8 = zigzag_scan_8x8_frame;
    pf_interlaced->scan_4x4 = zigzag_scan_4x4_field;
    pf_progressive->scan_4x4 = zigzag_scan_4x4_frame;
    pf_interlaced->sub_8x8 = zigzag_sub_8x8_field;
    pf_progressive->sub_8x8 = zigzag_sub_8x8_frame;
    pf_interlaced->sub_4x4 = zigzag_sub_4x4_field;
    pf_progressive->sub_4x4 = zigzag_sub_4x4_frame;
    pf_interlaced->sub_4x4ac = zigzag_sub_4x4ac_field;
    pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;
    /* high-bit-depth x86 overrides */
    if( cpu&X264_CPU_SSE2 )
        pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    if( cpu&X264_CPU_SSE4 )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
    if( cpu&X264_CPU_AVX )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
    if( cpu&X264_CPU_AVX )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
#endif // ARCH_X86_64
    /* 8-bit-depth x86 overrides */
    if( cpu&X264_CPU_MMX )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
    if( cpu&X264_CPU_MMX2 )
        pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmx2;
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
    if( cpu&X264_CPU_SSE2_IS_FAST )
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    if( cpu&X264_CPU_SSSE3 )
        pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
        pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
    if( cpu&X264_CPU_AVX )
        pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_avx;
        pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_avx;
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
    if( cpu&X264_CPU_XOP )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop;
    /* PowerPC overrides */
    if( cpu&X264_CPU_ALTIVEC )
        pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
    /* ARM overrides */
    if( cpu&X264_CPU_NEON )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#endif // HIGH_BIT_DEPTH
    /* interleave: same function for both field and frame */
    pf_interlaced->interleave_8x8_cavlc =
    pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
    if( cpu&X264_CPU_SSE2 )
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    if( cpu&X264_CPU_AVX )
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    if( cpu&X264_CPU_MMX )
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    if( cpu&X264_CPU_AVX )
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
#endif // HIGH_BIT_DEPTH