1 /*****************************************************************************
2 * dct.c: transform and zigzag
3 *****************************************************************************
4 * Copyright (C) 2003-2013 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
8 * Henrik Gramner <hengar-6@student.ltu.se>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
39 /* the inverse of the scaling factors introduced by 8x8 fdct */
40 /* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
/* W(i): FIX8 fixed-point weight for symmetry class i (0..5); by symmetry the
 * 8x8 table below contains only these 6 distinct values, placed by position. */
41 #define W(i) (i==0 ? FIX8(1.0000) :\
42 i==1 ? FIX8(0.8859) :\
43 i==2 ? FIX8(1.6000) :\
44 i==3 ? FIX8(0.9415) :\
45 i==4 ? FIX8(1.2651) :\
46 i==5 ? FIX8(1.1910) :0)
47 const uint32_t x264_dct8_weight_tab[64] = {
48 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
49 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
50 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
51 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
/* pattern repeats with vertical period 4 */
53 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
54 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
55 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
56 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
/* NOTE(review): the closing "};" and "#undef W" are not visible in this chunk */
/* W(i): FIX8 weight for symmetry class i (0..2) of the 4x4 fdct scaling;
 * the 4x4 table has only 3 distinct values by symmetry. */
60 #define W(i) (i==0 ? FIX8(1.76777) :\
61 i==1 ? FIX8(1.11803) :\
62 i==2 ? FIX8(0.70711) :0)
63 const uint32_t x264_dct4_weight_tab[16] = {
64 W(0), W(1), W(0), W(1),
65 W(1), W(2), W(1), W(2),
66 W(0), W(1), W(0), W(1),
67 W(1), W(2), W(1), W(2)
/* NOTE(review): the closing "};" and "#undef W" are not visible in this chunk */
/* squared-weight variant of the 4x4 table (same symmetry layout as above).
 * NOTE(review): the i==1 and i==2 cases of this W() definition are not
 * visible in this chunk — confirm against the full file. */
72 #define W(i) (i==0 ? FIX8(3.125) :\
75 const uint32_t x264_dct4_weight2_tab[16] = {
76 W(0), W(1), W(0), W(1),
77 W(1), W(2), W(1), W(2),
78 W(0), W(1), W(0), W(1),
79 W(1), W(2), W(1), W(2)
/* NOTE(review): closing "};" not visible in this chunk */
/* squared-weight variant of the 8x8 table; same 6-class symmetry layout
 * as x264_dct8_weight_tab above. */
83 #define W(i) (i==0 ? FIX8(1.00000) :\
84 i==1 ? FIX8(0.78487) :\
85 i==2 ? FIX8(2.56132) :\
86 i==3 ? FIX8(0.88637) :\
87 i==4 ? FIX8(1.60040) :\
88 i==5 ? FIX8(1.41850) :0)
89 const uint32_t x264_dct8_weight2_tab[64] = {
90 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
91 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
92 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
93 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
/* pattern repeats with vertical period 4 */
95 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
96 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
97 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
98 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
/* NOTE(review): the closing "};" and "#undef W" are not visible in this chunk */
103 static void dct4x4dc( dctcoef d[16] )
/* In-place 4x4 Hadamard-style transform of the luma DC block:
 * pass 1 reads row i of d and writes column i of tmp (transposing butterfly),
 * pass 2 reads rows of tmp and writes d with +1 rounding before the >>1.
 * NOTE(review): the opening brace and "dctcoef tmp[16];" declaration are not
 * visible in this chunk. */
107 for( int i = 0; i < 4; i++ )
109 int s01 = d[i*4+0] + d[i*4+1];
110 int d01 = d[i*4+0] - d[i*4+1];
111 int s23 = d[i*4+2] + d[i*4+3];
112 int d23 = d[i*4+2] - d[i*4+3];
114 tmp[0*4+i] = s01 + s23;
115 tmp[1*4+i] = s01 - s23;
116 tmp[2*4+i] = d01 - d23;
117 tmp[3*4+i] = d01 + d23;
120 for( int i = 0; i < 4; i++ )
122 int s01 = tmp[i*4+0] + tmp[i*4+1];
123 int d01 = tmp[i*4+0] - tmp[i*4+1];
124 int s23 = tmp[i*4+2] + tmp[i*4+3];
125 int d23 = tmp[i*4+2] - tmp[i*4+3];
/* round-to-nearest for the /2 normalization of the forward DC transform */
127 d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
128 d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
129 d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
130 d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
134 static void idct4x4dc( dctcoef d[16] )
/* In-place inverse 4x4 Hadamard-style transform of the luma DC block.
 * Same butterfly structure as dct4x4dc but with no rounding/shift on the
 * second pass (normalization happens elsewhere in the pipeline).
 * NOTE(review): the opening brace and "dctcoef tmp[16];" declaration are not
 * visible in this chunk. */
138 for( int i = 0; i < 4; i++ )
140 int s01 = d[i*4+0] + d[i*4+1];
141 int d01 = d[i*4+0] - d[i*4+1];
142 int s23 = d[i*4+2] + d[i*4+3];
143 int d23 = d[i*4+2] - d[i*4+3];
145 tmp[0*4+i] = s01 + s23;
146 tmp[1*4+i] = s01 - s23;
147 tmp[2*4+i] = d01 - d23;
148 tmp[3*4+i] = d01 + d23;
151 for( int i = 0; i < 4; i++ )
153 int s01 = tmp[i*4+0] + tmp[i*4+1];
154 int d01 = tmp[i*4+0] - tmp[i*4+1];
155 int s23 = tmp[i*4+2] + tmp[i*4+3];
156 int d23 = tmp[i*4+2] - tmp[i*4+3];
158 d[i*4+0] = s01 + s23;
159 d[i*4+1] = s01 - s23;
160 d[i*4+2] = d01 - d23;
161 d[i*4+3] = d01 + d23;
165 static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
/* Gathers the DC coefficient (index [0]) of eight 4x4 blocks and starts a
 * 2x4 DC transform: a0-a3 are pairwise sums, a4-a7 pairwise differences.
 * NOTE(review): the remaining combine stage writing dct[0..7] (and clearing
 * the per-block DCs) is not visible in this chunk. */
167 int a0 = dct4x4[0][0] + dct4x4[1][0];
168 int a1 = dct4x4[2][0] + dct4x4[3][0];
169 int a2 = dct4x4[4][0] + dct4x4[5][0];
170 int a3 = dct4x4[6][0] + dct4x4[7][0];
171 int a4 = dct4x4[0][0] - dct4x4[1][0];
172 int a5 = dct4x4[2][0] - dct4x4[3][0];
173 int a6 = dct4x4[4][0] - dct4x4[5][0];
174 int a7 = dct4x4[6][0] - dct4x4[7][0];
/* Writes the element-wise residual pix1 - pix2 into diff (i_size x i_size).
 * i_pix1/i_pix2 are the row strides of the two pixel planes.
 * NOTE(review): the per-row advances "pix1 += i_pix1; pix2 += i_pix2;" are
 * not visible in this chunk; as shown every row would reread row 0 —
 * confirm the increments exist in the full file. */
201 static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
202 pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
204 for( int y = 0; y < i_size; y++ )
206 for( int x = 0; x < i_size; x++ )
207 diff[x + y*i_size] = pix1[x] - pix2[x];
213 static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
/* Forward 4x4 integer DCT of the residual pix1 - pix2.
 * Two 1-D butterfly passes (H.264 4x4 core transform: coefficients 1,2):
 * pass 1 reads rows of d into columns of tmp, pass 2 produces dct.
 * NOTE(review): the opening brace and the d[16]/tmp[16] declarations are
 * not visible in this chunk. */
218 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
220 for( int i = 0; i < 4; i++ )
222 int s03 = d[i*4+0] + d[i*4+3];
223 int s12 = d[i*4+1] + d[i*4+2];
224 int d03 = d[i*4+0] - d[i*4+3];
225 int d12 = d[i*4+1] - d[i*4+2];
227 tmp[0*4+i] = s03 + s12;
228 tmp[1*4+i] = 2*d03 + d12;
229 tmp[2*4+i] = s03 - s12;
230 tmp[3*4+i] = d03 - 2*d12;
233 for( int i = 0; i < 4; i++ )
235 int s03 = tmp[i*4+0] + tmp[i*4+3];
236 int s12 = tmp[i*4+1] + tmp[i*4+2];
237 int d03 = tmp[i*4+0] - tmp[i*4+3];
238 int d12 = tmp[i*4+1] - tmp[i*4+2];
240 dct[i*4+0] = s03 + s12;
241 dct[i*4+1] = 2*d03 + d12;
242 dct[i*4+2] = s03 - s12;
243 dct[i*4+3] = d03 - 2*d12;
247 static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
/* 8x8 residual DCT as four independent 4x4 DCTs, one per quadrant
 * (top-left, top-right, bottom-left, bottom-right). */
249 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
250 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
251 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
252 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
255 static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
/* 16x16 residual DCT as four 8x8 calls (each itself four 4x4 DCTs),
 * one per quadrant; dct[] holds 16 blocks in quadrant-major order. */
257 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
258 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
259 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
260 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
263 static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
/* Sum of the 4x4 residual pix1 - pix2 (i.e. the unnormalized DC term).
 * NOTE(review): the "int sum = 0;" declaration and the return statement are
 * not visible in this chunk. */
266 for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
267 sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
268 - pix2[0] - pix2[1] - pix2[2] - pix2[3];
272 static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
/* DC-only 8x8 transform: per-quadrant DC sums followed by a 2x2 Hadamard.
 * NOTE(review): the final stores combining d0..d3 back into dct[0..3] are
 * not visible in this chunk. */
274 dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
275 dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
276 dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
277 dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
279 /* 2x2 DC transform */
280 int d0 = dct[0] + dct[1];
281 int d1 = dct[2] + dct[3];
282 int d2 = dct[0] - dct[1];
283 int d3 = dct[2] - dct[3];
290 static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
/* DC-only 8x16 transform (4:2:2 chroma): eight per-4x4-block DC sums a0..a7
 * laid out as a 2x4 grid, then a 2x4 DC transform.
 * NOTE(review): the 2x4 transform body writing dct[0..7] is not visible in
 * this chunk. */
292 int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
293 int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
294 int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
295 int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
296 int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
297 int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
298 int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
299 int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );
301 /* 2x4 DC transform */
328 static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
/* Inverse 4x4 integer DCT of dct[], then adds the residual onto p_dst with
 * clipping to the valid pixel range. The two passes use the H.264 inverse
 * core transform (the >>1 on odd coefficients); the +32 >>6 is the final
 * rounding/normalization.
 * NOTE(review): the opening brace and the d[16]/tmp[16] declarations are
 * not visible in this chunk. */
333 for( int i = 0; i < 4; i++ )
335 int s02 = dct[0*4+i] + dct[2*4+i];
336 int d02 = dct[0*4+i] - dct[2*4+i];
337 int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
338 int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
340 tmp[i*4+0] = s02 + s13;
341 tmp[i*4+1] = d02 + d13;
342 tmp[i*4+2] = d02 - d13;
343 tmp[i*4+3] = s02 - s13;
346 for( int i = 0; i < 4; i++ )
348 int s02 = tmp[0*4+i] + tmp[2*4+i];
349 int d02 = tmp[0*4+i] - tmp[2*4+i];
350 int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
351 int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
353 d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
354 d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
355 d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
356 d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
/* reconstruct: add the decoded residual to the prediction, with clipping */
360 for( int y = 0; y < 4; y++ )
362 for( int x = 0; x < 4; x++ )
363 p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
364 p_dst += FDEC_STRIDE;
368 static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
/* 8x8 inverse DCT + reconstruction as four 4x4 calls, one per quadrant. */
370 add4x4_idct( &p_dst[0], dct[0] );
371 add4x4_idct( &p_dst[4], dct[1] );
372 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
373 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
376 static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
/* 16x16 inverse DCT + reconstruction as four 8x8 calls, one per quadrant. */
378 add8x8_idct( &p_dst[0], &dct[0] );
379 add8x8_idct( &p_dst[8], &dct[4] );
380 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
381 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
384 /****************************************************************************
386 ****************************************************************************/
/* One 8-point forward DCT pass, parameterized by SRC()/DST() macros so the
 * same code serves rows and columns. s* are symmetric sums, d* antisymmetric
 * differences; a4..a7 build the odd-half outputs.
 * NOTE(review): the "#define DCT8_1D" header line, the even-half terms
 * (a0..a3) and the DST(0)/DST(4) outputs are not visible in this chunk. */
389 int s07 = SRC(0) + SRC(7);\
390 int s16 = SRC(1) + SRC(6);\
391 int s25 = SRC(2) + SRC(5);\
392 int s34 = SRC(3) + SRC(4);\
397 int d07 = SRC(0) - SRC(7);\
398 int d16 = SRC(1) - SRC(6);\
399 int d25 = SRC(2) - SRC(5);\
400 int d34 = SRC(3) - SRC(4);\
401 int a4 = d16 + d25 + (d07 + (d07>>1));\
402 int a5 = d07 - d34 - (d25 + (d25>>1));\
403 int a6 = d07 + d34 - (d16 + (d16>>1));\
404 int a7 = d16 - d25 + (d34 + (d34>>1));\
406 DST(1) = a4 + (a7>>2);\
407 DST(2) = a2 + (a3>>1);\
408 DST(3) = a5 + (a6>>2);\
410 DST(5) = a6 - (a5>>2);\
411 DST(6) = (a2>>1) - a3 ;\
412 DST(7) = (a4>>2) - a7 ;\
415 static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
/* Forward 8x8 DCT of the residual: subtract, then apply DCT8_1D twice —
 * first over columns in place in tmp, then over rows writing transposed
 * output into dct (DST maps x to dct[x*8+i]).
 * NOTE(review): the opening brace, the tmp[64] declaration, the loop bodies
 * invoking DCT8_1D and the matching #undef lines are not visible here. */
419 pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
421 #define SRC(x) tmp[x*8+i]
422 #define DST(x) tmp[x*8+i]
423 for( int i = 0; i < 8; i++ )
428 #define SRC(x) tmp[i*8+x]
429 #define DST(x) dct[x*8+i]
430 for( int i = 0; i < 8; i++ )
436 static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
/* 16x16 residual DCT as four 8x8 DCTs, one per quadrant. */
438 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
439 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
440 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
441 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* One 8-point inverse DCT pass (SRC/DST-parameterized). a0/a2/a4/a6 form the
 * even half, a1/a3/a5/a7 + b1/b3/b5/b7 the odd half.
 * NOTE(review): the "#define IDCT8_1D" header line, the b0/b2/b4/b6 even
 * combinations and the final DST() outputs are not visible in this chunk. */
445 int a0 = SRC(0) + SRC(4);\
446 int a2 = SRC(0) - SRC(4);\
447 int a4 = (SRC(2)>>1) - SRC(6);\
448 int a6 = (SRC(6)>>1) + SRC(2);\
453 int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
454 int a3 = SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
455 int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
456 int a7 = SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
457 int b1 = (a7>>2) + a1;\
458 int b3 = a3 + (a5>>2);\
459 int b5 = (a3>>2) - a5;\
460 int b7 = a7 - (a1>>2);\
471 static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
/* Inverse 8x8 DCT + reconstruction. The DC coefficient is pre-biased by 32
 * so the final (rhs)>>6 in the second pass rounds to nearest. Pass 1 runs
 * IDCT8_1D over columns in place; pass 2 runs it over rows, clipping and
 * adding onto dst.
 * NOTE(review): the loop bodies invoking IDCT8_1D and the #undef lines are
 * not visible in this chunk. */
473 dct[0] += 32; // rounding for the >>6 at the end
475 #define SRC(x) dct[x*8+i]
476 #define DST(x,rhs) dct[x*8+i] = (rhs)
477 for( int i = 0; i < 8; i++ )
482 #define SRC(x) dct[i*8+x]
483 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
484 for( int i = 0; i < 8; i++ )
490 static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
/* 16x16 inverse DCT + reconstruction as four 8x8 calls, one per quadrant. */
492 add8x8_idct8( &dst[0], dct[0] );
493 add8x8_idct8( &dst[8], dct[1] );
494 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
495 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
/* Adds a single DC value to every pixel of a 4x4 block, with clipping.
 * (Used for blocks whose AC coefficients are all zero.) */
498 static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
501 for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
503 p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
504 p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
505 p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
506 p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
510 static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
/* DC-only 8x8 reconstruction: one DC value per 4x4 quadrant. */
512 add4x4_idct_dc( &p_dst[0], dct[0] );
513 add4x4_idct_dc( &p_dst[4], dct[1] );
514 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
515 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
518 static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
/* DC-only 16x16 reconstruction: 4 rows of 4 DC values, one per 4x4 block. */
520 for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
522 add4x4_idct_dc( &p_dst[ 0], dct[0] );
523 add4x4_idct_dc( &p_dst[ 4], dct[1] );
524 add4x4_idct_dc( &p_dst[ 8], dct[2] );
525 add4x4_idct_dc( &p_dst[12], dct[3] );
530 /****************************************************************************
532 ****************************************************************************/
/* Fills the DCT function-pointer table: scalar C defaults first, then
 * progressively overridden by SIMD implementations based on cpu flags.
 * NOTE(review): the surrounding #if HAVE_MMX / #if HIGH_BIT_DEPTH / #endif
 * preprocessor lines, the ALTIVEC/NEON guards, and the closing brace are
 * not visible in this chunk — the conditional structure below is inferred
 * from the visible #else/#endif markers and should be confirmed. */
533 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
/* scalar C defaults */
535 dctf->sub4x4_dct = sub4x4_dct;
536 dctf->add4x4_idct = add4x4_idct;
538 dctf->sub8x8_dct = sub8x8_dct;
539 dctf->sub8x8_dct_dc = sub8x8_dct_dc;
540 dctf->add8x8_idct = add8x8_idct;
541 dctf->add8x8_idct_dc = add8x8_idct_dc;
543 dctf->sub8x16_dct_dc = sub8x16_dct_dc;
545 dctf->sub16x16_dct = sub16x16_dct;
546 dctf->add16x16_idct = add16x16_idct;
547 dctf->add16x16_idct_dc = add16x16_idct_dc;
549 dctf->sub8x8_dct8 = sub8x8_dct8;
550 dctf->add8x8_idct8 = add8x8_idct8;
552 dctf->sub16x16_dct8 = sub16x16_dct8;
553 dctf->add16x16_idct8 = add16x16_idct8;
555 dctf->dct4x4dc = dct4x4dc;
556 dctf->idct4x4dc = idct4x4dc;
558 dctf->dct2x4dc = dct2x4dc;
/* x86 overrides — this branch pairs with the "#else // !HIGH_BIT_DEPTH"
 * below, so it is presumably the HIGH_BIT_DEPTH path */
562 if( cpu&X264_CPU_MMX )
564 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
565 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
566 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
568 if( cpu&X264_CPU_SSE2 )
570 dctf->add4x4_idct = x264_add4x4_idct_sse2;
571 dctf->dct4x4dc = x264_dct4x4dc_sse2;
572 dctf->idct4x4dc = x264_idct4x4dc_sse2;
573 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
574 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
575 dctf->add8x8_idct = x264_add8x8_idct_sse2;
576 dctf->add16x16_idct = x264_add16x16_idct_sse2;
577 dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
578 dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;
579 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
580 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
581 dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
582 dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
584 if( cpu&X264_CPU_SSE4 )
586 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4;
587 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
589 if( cpu&X264_CPU_AVX )
591 dctf->add4x4_idct = x264_add4x4_idct_avx;
592 dctf->dct4x4dc = x264_dct4x4dc_avx;
593 dctf->idct4x4dc = x264_idct4x4dc_avx;
594 dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
595 dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
596 dctf->add8x8_idct = x264_add8x8_idct_avx;
597 dctf->add16x16_idct = x264_add16x16_idct_avx;
598 dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
599 dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
600 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
601 dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx;
602 dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
605 #else // !HIGH_BIT_DEPTH
/* 8-bit-depth x86 overrides */
607 if( cpu&X264_CPU_MMX )
609 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
610 dctf->add4x4_idct = x264_add4x4_idct_mmx;
611 dctf->dct4x4dc = x264_dct4x4dc_mmx;
612 dctf->idct4x4dc = x264_idct4x4dc_mmx;
613 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
616 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
617 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
618 dctf->add8x8_idct = x264_add8x8_idct_mmx;
619 dctf->add16x16_idct = x264_add16x16_idct_mmx;
621 dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
622 dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
623 dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
624 dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
628 if( cpu&X264_CPU_MMX2 )
630 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2;
631 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
634 if( cpu&X264_CPU_SSE2 )
636 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
637 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
638 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
639 dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
640 dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
641 dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
/* on CPUs where SSE2 is slow, keep the MMX versions for these */
643 if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
645 dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
646 dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
647 dctf->add8x8_idct = x264_add8x8_idct_sse2;
648 dctf->add16x16_idct = x264_add16x16_idct_sse2;
649 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
653 if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
655 dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
656 if( !(cpu&X264_CPU_SLOW_ATOM) )
658 dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
659 dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
660 dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
661 dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
662 dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
663 if( !(cpu&X264_CPU_SLOW_PSHUFB) )
665 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
666 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
671 if( cpu&X264_CPU_SSE4 )
672 dctf->add4x4_idct = x264_add4x4_idct_sse4;
674 if( cpu&X264_CPU_AVX )
676 dctf->add4x4_idct = x264_add4x4_idct_avx;
677 dctf->add8x8_idct = x264_add8x8_idct_avx;
678 dctf->add16x16_idct = x264_add16x16_idct_avx;
679 dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
680 dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
681 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
682 dctf->sub8x8_dct = x264_sub8x8_dct_avx;
683 dctf->sub16x16_dct = x264_sub16x16_dct_avx;
684 dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
685 dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
688 if( cpu&X264_CPU_XOP )
690 dctf->sub8x8_dct = x264_sub8x8_dct_xop;
691 dctf->sub16x16_dct = x264_sub16x16_dct_xop;
694 if( cpu&X264_CPU_AVX2 )
696 dctf->add8x8_idct = x264_add8x8_idct_avx2;
697 dctf->add16x16_idct = x264_add16x16_idct_avx2;
698 dctf->sub8x8_dct = x264_sub8x8_dct_avx2;
699 dctf->sub16x16_dct = x264_sub16x16_dct_avx2;
701 dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
/* PowerPC overrides — NOTE(review): the enclosing #if/#endif for HAVE_ALTIVEC
 * is not visible in this chunk */
707 if( cpu&X264_CPU_ALTIVEC )
709 dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
710 dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
711 dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
713 dctf->add4x4_idct = x264_add4x4_idct_altivec;
714 dctf->add8x8_idct = x264_add8x8_idct_altivec;
715 dctf->add16x16_idct = x264_add16x16_idct_altivec;
717 dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
718 dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
720 dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
721 dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
/* ARM overrides — NOTE(review): the enclosing #if/#endif for HAVE_ARMV6 or
 * similar is not visible in this chunk */
726 if( cpu&X264_CPU_NEON )
728 dctf->sub4x4_dct = x264_sub4x4_dct_neon;
729 dctf->sub8x8_dct = x264_sub8x8_dct_neon;
730 dctf->sub16x16_dct = x264_sub16x16_dct_neon;
731 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
732 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
733 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
734 dctf->dct4x4dc = x264_dct4x4dc_neon;
735 dctf->idct4x4dc = x264_idct4x4dc_neon;
737 dctf->add4x4_idct = x264_add4x4_idct_neon;
738 dctf->add8x8_idct = x264_add8x8_idct_neon;
739 dctf->add16x16_idct = x264_add16x16_idct_neon;
741 dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
742 dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
744 dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
745 dctf->add16x16_idct8= x264_add16x16_idct8_neon;
748 #endif // HIGH_BIT_DEPTH
/* Zigzag scan orders. ZIG(i,y,x) copies coefficient (y,x) of a column-major
 * 8x8 dct[] into scan position i of level[]; the ZIGZAG* macros below list
 * the full scan orders for frame (progressive) and field (interlaced)
 * coding. ZIGDC marks position 0 so the AC-only variants can redefine it. */
752 #define ZIG(i,y,x) level[i] = dct[x*8+y];
753 #define ZIGZAG8_FRAME\
754 ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
755 ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
756 ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
757 ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
758 ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
759 ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
760 ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
761 ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
762 ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
763 ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
764 ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
765 ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
766 ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
767 ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
768 ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
769 ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\
771 #define ZIGZAG8_FIELD\
772 ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
773 ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
774 ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
775 ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
776 ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
777 ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
778 ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
779 ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
780 ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
781 ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
782 ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
783 ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
784 ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
785 ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
786 ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
787 ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
789 #define ZIGZAG4_FRAME\
790 ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
791 ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
792 ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
793 ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
795 #define ZIGZAG4_FIELD\
796 ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
797 ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
798 ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
799 ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
/* Plain zigzag reorderings of a transformed block into scan order.
 * NOTE(review): the bodies of the 8x8 and 4x4-frame scans (presumably the
 * ZIGZAG8_FRAME / ZIGZAG8_FIELD / ZIGZAG4_FRAME macro expansions) are not
 * visible in this chunk. */
801 static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
806 static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
/* 4x4 variant of ZIG: dct[] is indexed column-major with stride 4 */
812 #define ZIG(i,y,x) level[i] = dct[x*4+y];
813 #define ZIGDC(i,y,x) ZIG(i,y,x)
815 static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
820 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
/* in the 4x4 field scan only positions 2..5 are permuted: the first two and
 * last ten coefficients are already in scan order, so memcpy them */
822 memcpy( level, dct, 2 * sizeof(dctcoef) );
823 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
824 memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
/* Fused subtract+zigzag: this ZIG computes the residual p_src - p_dst at
 * pixel (y,x) directly into scan position i. The COPY macros then write the
 * encoded block back into the decode plane (p_dst) row by row.
 * NOTE(review): this ZIG's closing lines (presumably a nonzero-accumulator
 * and the closing brace) and the "#define COPY4x4"/"#define COPY8x8" header
 * lines are not visible in this chunk. */
828 #define ZIG(i,y,x) {\
829 int oe = x+y*FENC_STRIDE;\
830 int od = x+y*FDEC_STRIDE;\
831 level[i] = p_src[oe] - p_dst[od];\
835 CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
836 CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
837 CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
838 CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
/* 8-pixel row copy built from two 4-pixel copies */
839 #define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
841 CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
842 CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
843 CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
844 CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
845 CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
846 CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
847 CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
848 CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
/* Fused subtract + zigzag + copy-back functions. The ac variants write the
 * DC coefficient to *dc separately (via the ZIGDC redefinition below) so
 * level[] holds only AC terms.
 * NOTE(review): all function bodies (presumably ZIGZAG*/COPY* expansions
 * plus a nonzero-flag return) are not visible in this chunk — confirm the
 * int return semantics against the full file. */
850 static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
858 static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
/* ZIGDC for the ac variants: route the DC residual to *dc instead of level[0] */
867 #define ZIGDC(i,y,x) {\
868 int oe = x+y*FENC_STRIDE;\
869 int od = x+y*FDEC_STRIDE;\
870 *dc = p_src[oe] - p_dst[od];\
874 static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
882 static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
890 static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
897 static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
908 static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
/* Deinterleaves an 8x8 block's coefficients into four 4x4 groups for CAVLC
 * (dst[i*16+j] = src[i+j*4]) and records a per-group nonzero flag in nnz[]
 * at 4x4-block positions (i&1, i>>1).
 * NOTE(review): the declaration/accumulation of "nz" inside the outer loop
 * is not visible in this chunk. */
910 for( int i = 0; i < 4; i++ )
913 for( int j = 0; j < 16; j++ )
916 dst[i*16+j] = src[i+j*4];
918 nnz[(i&1) + (i>>1)*8] = !!nz;
/* Fills the two zigzag function-pointer tables (progressive + interlaced):
 * scalar C defaults first, then SIMD overrides based on cpu flags.
 * NOTE(review): the surrounding #if HAVE_MMX / #if HIGH_BIT_DEPTH /
 * ARCH_X86_64 preprocessor lines and the closing brace are not visible in
 * this chunk — the conditional structure is inferred from the visible
 * #endif comments and should be confirmed. */
922 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
/* scalar C defaults */
924 pf_interlaced->scan_8x8 = zigzag_scan_8x8_field;
925 pf_progressive->scan_8x8 = zigzag_scan_8x8_frame;
926 pf_interlaced->scan_4x4 = zigzag_scan_4x4_field;
927 pf_progressive->scan_4x4 = zigzag_scan_4x4_frame;
928 pf_interlaced->sub_8x8 = zigzag_sub_8x8_field;
929 pf_progressive->sub_8x8 = zigzag_sub_8x8_frame;
930 pf_interlaced->sub_4x4 = zigzag_sub_4x4_field;
931 pf_progressive->sub_4x4 = zigzag_sub_4x4_frame;
932 pf_interlaced->sub_4x4ac = zigzag_sub_4x4ac_field;
933 pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;
/* first x86 override group — paired with "#endif // ARCH_X86_64" below,
 * presumably the HIGH_BIT_DEPTH path */
937 if( cpu&X264_CPU_SSE2 )
939 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
940 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
941 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
943 if( cpu&X264_CPU_SSE4 )
944 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
945 if( cpu&X264_CPU_AVX )
946 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
948 if( cpu&X264_CPU_AVX )
950 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
951 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
953 #endif // ARCH_X86_64
/* 8-bit-depth x86 overrides */
957 if( cpu&X264_CPU_MMX )
958 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
959 if( cpu&X264_CPU_MMX2 )
961 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmx2;
962 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2;
963 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
965 if( cpu&X264_CPU_SSE2_IS_FAST )
966 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
967 if( cpu&X264_CPU_SSSE3 )
969 pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
970 pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
971 pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
972 pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
973 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
974 if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
975 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
977 if( cpu&X264_CPU_AVX )
979 pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_avx;
980 pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_avx;
982 pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
983 pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
985 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
987 if( cpu&X264_CPU_XOP )
989 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
990 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
991 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop;
/* PowerPC overrides — NOTE(review): enclosing #if HAVE_ALTIVEC not visible */
995 if( cpu&X264_CPU_ALTIVEC )
997 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
998 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
/* ARM overrides — NOTE(review): enclosing #if not visible */
1002 if( cpu&X264_CPU_NEON )
1003 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
1005 #endif // HIGH_BIT_DEPTH
/* interleave_8x8_cavlc is bit-depth independent in C; same pointer for both */
1007 pf_interlaced->interleave_8x8_cavlc =
1008 pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
/* the two sse2/avx interleave groups below are presumably the
 * HIGH_BIT_DEPTH and !HIGH_BIT_DEPTH branches of a #if not visible here */
1011 if( cpu&X264_CPU_SSE2 )
1013 pf_interlaced->interleave_8x8_cavlc =
1014 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
1016 if( cpu&X264_CPU_AVX )
1018 pf_interlaced->interleave_8x8_cavlc =
1019 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
1022 if( cpu&X264_CPU_MMX )
1024 pf_interlaced->interleave_8x8_cavlc =
1025 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
1027 if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
1029 pf_interlaced->interleave_8x8_cavlc =
1030 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
1033 if( cpu&X264_CPU_AVX )
1035 pf_interlaced->interleave_8x8_cavlc =
1036 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
1039 if( cpu&X264_CPU_AVX2 )
1041 pf_interlaced->interleave_8x8_cavlc =
1042 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
1044 #endif // HIGH_BIT_DEPTH