1 /*****************************************************************************
2 * dct.c: transform and zigzag
3 *****************************************************************************
4 * Copyright (C) 2003-2013 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
8 * Henrik Gramner <hengar-6@student.ltu.se>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
39 /* the inverse of the scaling factors introduced by 8x8 fdct */
40 /* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
41 #define W(i) (i==0 ? FIX8(1.0000) :\
42 i==1 ? FIX8(0.8859) :\
43 i==2 ? FIX8(1.6000) :\
44 i==3 ? FIX8(0.9415) :\
45 i==4 ? FIX8(1.2651) :\
46 i==5 ? FIX8(1.1910) :0)
47 const uint32_t x264_dct8_weight_tab[64] = {
48 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
49 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
50 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
51 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
53 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
54 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
55 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
56 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
60 #define W(i) (i==0 ? FIX8(1.76777) :\
61 i==1 ? FIX8(1.11803) :\
62 i==2 ? FIX8(0.70711) :0)
63 const uint32_t x264_dct4_weight_tab[16] = {
64 W(0), W(1), W(0), W(1),
65 W(1), W(2), W(1), W(2),
66 W(0), W(1), W(0), W(1),
67 W(1), W(2), W(1), W(2)
72 #define W(i) (i==0 ? FIX8(3.125) :\
75 const uint32_t x264_dct4_weight2_tab[16] = {
76 W(0), W(1), W(0), W(1),
77 W(1), W(2), W(1), W(2),
78 W(0), W(1), W(0), W(1),
79 W(1), W(2), W(1), W(2)
83 #define W(i) (i==0 ? FIX8(1.00000) :\
84 i==1 ? FIX8(0.78487) :\
85 i==2 ? FIX8(2.56132) :\
86 i==3 ? FIX8(0.88637) :\
87 i==4 ? FIX8(1.60040) :\
88 i==5 ? FIX8(1.41850) :0)
89 const uint32_t x264_dct8_weight2_tab[64] = {
90 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
91 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
92 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
93 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
95 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
96 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
97 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
98 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
103 static void dct4x4dc( dctcoef d[16] )
107 for( int i = 0; i < 4; i++ )
109 int s01 = d[i*4+0] + d[i*4+1];
110 int d01 = d[i*4+0] - d[i*4+1];
111 int s23 = d[i*4+2] + d[i*4+3];
112 int d23 = d[i*4+2] - d[i*4+3];
114 tmp[0*4+i] = s01 + s23;
115 tmp[1*4+i] = s01 - s23;
116 tmp[2*4+i] = d01 - d23;
117 tmp[3*4+i] = d01 + d23;
120 for( int i = 0; i < 4; i++ )
122 int s01 = tmp[i*4+0] + tmp[i*4+1];
123 int d01 = tmp[i*4+0] - tmp[i*4+1];
124 int s23 = tmp[i*4+2] + tmp[i*4+3];
125 int d23 = tmp[i*4+2] - tmp[i*4+3];
127 d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
128 d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
129 d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
130 d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
134 static void idct4x4dc( dctcoef d[16] )
138 for( int i = 0; i < 4; i++ )
140 int s01 = d[i*4+0] + d[i*4+1];
141 int d01 = d[i*4+0] - d[i*4+1];
142 int s23 = d[i*4+2] + d[i*4+3];
143 int d23 = d[i*4+2] - d[i*4+3];
145 tmp[0*4+i] = s01 + s23;
146 tmp[1*4+i] = s01 - s23;
147 tmp[2*4+i] = d01 - d23;
148 tmp[3*4+i] = d01 + d23;
151 for( int i = 0; i < 4; i++ )
153 int s01 = tmp[i*4+0] + tmp[i*4+1];
154 int d01 = tmp[i*4+0] - tmp[i*4+1];
155 int s23 = tmp[i*4+2] + tmp[i*4+3];
156 int d23 = tmp[i*4+2] - tmp[i*4+3];
158 d[i*4+0] = s01 + s23;
159 d[i*4+1] = s01 - s23;
160 d[i*4+2] = d01 - d23;
161 d[i*4+3] = d01 + d23;
/* First stage of the 2x4 DC transform: gathers the DC coefficient of each
 * of the eight 4x4 sub-blocks and forms pairwise sums/differences.
 * NOTE(review): the remainder of this function (the butterflies combining
 * a0..a7 into dct[0..7]) lies outside this excerpt — do not assume the
 * function is complete as shown. */
static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
int a0 = dct4x4[0][0] + dct4x4[1][0];
int a1 = dct4x4[2][0] + dct4x4[3][0];
int a2 = dct4x4[4][0] + dct4x4[5][0];
int a3 = dct4x4[6][0] + dct4x4[7][0];
int a4 = dct4x4[0][0] - dct4x4[1][0];
int a5 = dct4x4[2][0] - dct4x4[3][0];
int a6 = dct4x4[4][0] - dct4x4[5][0];
int a7 = dct4x4[6][0] - dct4x4[7][0];
201 static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
202 pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
204 for( int y = 0; y < i_size; y++ )
206 for( int x = 0; x < i_size; x++ )
207 diff[x + y*i_size] = pix1[x] - pix2[x];
213 static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
218 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
220 for( int i = 0; i < 4; i++ )
222 int s03 = d[i*4+0] + d[i*4+3];
223 int s12 = d[i*4+1] + d[i*4+2];
224 int d03 = d[i*4+0] - d[i*4+3];
225 int d12 = d[i*4+1] - d[i*4+2];
227 tmp[0*4+i] = s03 + s12;
228 tmp[1*4+i] = 2*d03 + d12;
229 tmp[2*4+i] = s03 - s12;
230 tmp[3*4+i] = d03 - 2*d12;
233 for( int i = 0; i < 4; i++ )
235 int s03 = tmp[i*4+0] + tmp[i*4+3];
236 int s12 = tmp[i*4+1] + tmp[i*4+2];
237 int d03 = tmp[i*4+0] - tmp[i*4+3];
238 int d12 = tmp[i*4+1] - tmp[i*4+2];
240 dct[i*4+0] = s03 + s12;
241 dct[i*4+1] = 2*d03 + d12;
242 dct[i*4+2] = s03 - s12;
243 dct[i*4+3] = d03 - 2*d12;
247 static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
249 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
250 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
251 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
252 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
255 static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
257 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
258 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
259 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
260 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
263 static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
266 for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
267 sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
268 - pix2[0] - pix2[1] - pix2[2] - pix2[3];
/* Per-4x4 DC sums of an 8x8 residual block, then a 2x2 Hadamard transform
 * of the four DC values.
 * NOTE(review): the final writes of the transformed d0..d3 back into
 * dct[0..3] lie outside this excerpt — do not assume the function is
 * complete as shown. */
static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
/* 2x2 DC transform */
int d0 = dct[0] + dct[1];
int d1 = dct[2] + dct[3];
int d2 = dct[0] - dct[1];
int d3 = dct[2] - dct[3];
/* Per-4x4 DC sums of an 8x16 residual block (eight 4x4 sub-blocks).
 * NOTE(review): the 2x4 DC transform that combines a0..a7 into dct[0..7]
 * lies outside this excerpt — do not assume the function is complete as
 * shown. */
static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );
/* 2x4 DC transform */
328 static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
333 for( int i = 0; i < 4; i++ )
335 int s02 = dct[0*4+i] + dct[2*4+i];
336 int d02 = dct[0*4+i] - dct[2*4+i];
337 int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
338 int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
340 tmp[i*4+0] = s02 + s13;
341 tmp[i*4+1] = d02 + d13;
342 tmp[i*4+2] = d02 - d13;
343 tmp[i*4+3] = s02 - s13;
346 for( int i = 0; i < 4; i++ )
348 int s02 = tmp[0*4+i] + tmp[2*4+i];
349 int d02 = tmp[0*4+i] - tmp[2*4+i];
350 int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
351 int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
353 d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
354 d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
355 d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
356 d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
360 for( int y = 0; y < 4; y++ )
362 for( int x = 0; x < 4; x++ )
363 p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
364 p_dst += FDEC_STRIDE;
368 static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
370 add4x4_idct( &p_dst[0], dct[0] );
371 add4x4_idct( &p_dst[4], dct[1] );
372 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
373 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
376 static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
378 add8x8_idct( &p_dst[0], &dct[0] );
379 add8x8_idct( &p_dst[8], &dct[4] );
380 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
381 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
384 /****************************************************************************
386 ****************************************************************************/
/* One 1-D pass of the H.264 8x8 forward transform over SRC()/DST().
 * Fix: the `#define DCT8_1D` header, the a0..a3 definitions and the
 * DST(0)/DST(4) lines were lost in this copy; restored from the standard
 * H.264 8x8 butterfly. */
#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
415 static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
419 pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
421 #define SRC(x) tmp[x*8+i]
422 #define DST(x) tmp[x*8+i]
423 for( int i = 0; i < 8; i++ )
428 #define SRC(x) tmp[i*8+x]
429 #define DST(x) dct[x*8+i]
430 for( int i = 0; i < 8; i++ )
436 static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
438 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
439 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
440 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
441 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* One 1-D pass of the H.264 8x8 inverse transform over SRC()/DST(x,rhs).
 * Fix: the `#define IDCT8_1D` header, the even-part b0/b2/b4/b6 lines and
 * the final DST() lines were lost in this copy; restored from the standard
 * H.264 8x8 inverse butterfly. */
#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
471 static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
473 dct[0] += 32; // rounding for the >>6 at the end
475 #define SRC(x) dct[x*8+i]
476 #define DST(x,rhs) dct[x*8+i] = (rhs)
477 for( int i = 0; i < 8; i++ )
482 #define SRC(x) dct[i*8+x]
483 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
484 for( int i = 0; i < 8; i++ )
490 static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
492 add8x8_idct8( &dst[0], dct[0] );
493 add8x8_idct8( &dst[8], dct[1] );
494 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
495 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
498 static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
501 for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
503 p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
504 p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
505 p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
506 p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
510 static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
512 add4x4_idct_dc( &p_dst[0], dct[0] );
513 add4x4_idct_dc( &p_dst[4], dct[1] );
514 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
515 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
518 static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
520 for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
522 add4x4_idct_dc( &p_dst[ 0], dct[0] );
523 add4x4_idct_dc( &p_dst[ 4], dct[1] );
524 add4x4_idct_dc( &p_dst[ 8], dct[2] );
525 add4x4_idct_dc( &p_dst[12], dct[3] );
530 /****************************************************************************
532 ****************************************************************************/
/* Populate the DCT function table with the plain C implementations, then
 * override entries with platform-specific assembly according to the cpu
 * capability flags.
 * NOTE(review): the preprocessor guards that bracket the per-architecture
 * sections (#if HAVE_MMX, #if HIGH_BIT_DEPTH, #if !ARCH_X86_64,
 * #if HAVE_ALTIVEC, #if HAVE_ARMV6, plus braces) are elided in this
 * excerpt — only one #else and the final #endif survive below. The guard
 * structure must be confirmed against the upstream file. */
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
/* C reference implementations — always installed first */
dctf->sub4x4_dct = sub4x4_dct;
dctf->add4x4_idct = add4x4_idct;
dctf->sub8x8_dct = sub8x8_dct;
dctf->sub8x8_dct_dc = sub8x8_dct_dc;
dctf->add8x8_idct = add8x8_idct;
dctf->add8x8_idct_dc = add8x8_idct_dc;
dctf->sub8x16_dct_dc = sub8x16_dct_dc;
dctf->sub16x16_dct = sub16x16_dct;
dctf->add16x16_idct = add16x16_idct;
dctf->add16x16_idct_dc = add16x16_idct_dc;
dctf->sub8x8_dct8 = sub8x8_dct8;
dctf->add8x8_idct8 = add8x8_idct8;
dctf->sub16x16_dct8 = sub16x16_dct8;
dctf->add16x16_idct8 = add16x16_idct8;
dctf->dct4x4dc = dct4x4dc;
dctf->idct4x4dc = idct4x4dc;
dctf->dct2x4dc = dct2x4dc;
/* x86 overrides — high-bit-depth branch (see #else below) */
if( cpu&X264_CPU_MMX )
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
if( cpu&X264_CPU_SSE2 )
dctf->add4x4_idct = x264_add4x4_idct_sse2;
dctf->dct4x4dc = x264_dct4x4dc_sse2;
dctf->idct4x4dc = x264_idct4x4dc_sse2;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
if( cpu&X264_CPU_SSE4 )
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
if( cpu&X264_CPU_AVX )
dctf->add4x4_idct = x264_add4x4_idct_avx;
dctf->dct4x4dc = x264_dct4x4dc_avx;
dctf->idct4x4dc = x264_idct4x4dc_avx;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
dctf->add8x8_idct = x264_add8x8_idct_avx;
dctf->add16x16_idct = x264_add16x16_idct_avx;
dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx;
dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
#else // !HIGH_BIT_DEPTH
/* x86 overrides — 8-bit-depth branch */
if( cpu&X264_CPU_MMX )
dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
dctf->add4x4_idct = x264_add4x4_idct_mmx;
dctf->dct4x4dc = x264_dct4x4dc_mmx;
dctf->idct4x4dc = x264_idct4x4dc_mmx;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
/* presumably guarded by #if !ARCH_X86_64 upstream — verify */
dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
dctf->add8x8_idct = x264_add8x8_idct_mmx;
dctf->add16x16_idct = x264_add16x16_idct_mmx;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
if( cpu&X264_CPU_MMX2 )
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
if( cpu&X264_CPU_SSE2 )
dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
dctf->add8x8_idct = x264_add8x8_idct_sse2;
dctf->add16x16_idct = x264_add16x16_idct_sse2;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
if( !(cpu&X264_CPU_SLOW_ATOM) )
dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
if( !(cpu&X264_CPU_SLOW_PSHUFB) )
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
if( cpu&X264_CPU_SSE4 )
dctf->add4x4_idct = x264_add4x4_idct_sse4;
if( cpu&X264_CPU_AVX )
dctf->add4x4_idct = x264_add4x4_idct_avx;
dctf->add8x8_idct = x264_add8x8_idct_avx;
dctf->add16x16_idct = x264_add16x16_idct_avx;
dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
dctf->sub8x8_dct = x264_sub8x8_dct_avx;
dctf->sub16x16_dct = x264_sub16x16_dct_avx;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
if( cpu&X264_CPU_XOP )
dctf->sub8x8_dct = x264_sub8x8_dct_xop;
dctf->sub16x16_dct = x264_sub16x16_dct_xop;
if( cpu&X264_CPU_AVX2 )
dctf->add8x8_idct = x264_add8x8_idct_avx2;
dctf->add16x16_idct = x264_add16x16_idct_avx2;
dctf->sub8x8_dct = x264_sub8x8_dct_avx2;
dctf->sub16x16_dct = x264_sub16x16_dct_avx2;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
/* PowerPC overrides */
if( cpu&X264_CPU_ALTIVEC )
dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
dctf->add4x4_idct = x264_add4x4_idct_altivec;
dctf->add8x8_idct = x264_add8x8_idct_altivec;
dctf->add16x16_idct = x264_add16x16_idct_altivec;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
/* ARM overrides */
if( cpu&X264_CPU_NEON )
dctf->sub4x4_dct = x264_sub4x4_dct_neon;
dctf->sub8x8_dct = x264_sub8x8_dct_neon;
dctf->sub16x16_dct = x264_sub16x16_dct_neon;
dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
dctf->dct4x4dc = x264_dct4x4dc_neon;
dctf->idct4x4dc = x264_idct4x4dc_neon;
dctf->add4x4_idct = x264_add4x4_idct_neon;
dctf->add8x8_idct = x264_add8x8_idct_neon;
dctf->add16x16_idct = x264_add16x16_idct_neon;
dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
dctf->add16x16_idct8= x264_add16x16_idct8_neon;
#endif // HIGH_BIT_DEPTH
/* Zigzag scan tables. ZIG(i,y,x) moves the coefficient at (row y, col x)
 * of the (transposed) 8x8 block to scan position i; the ZIGZAG* macros
 * expand to the full scan for frame (progressive) or field (interlaced)
 * coding. */
#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\
#define ZIGZAG8_FIELD\
ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
/* 4x4 scans: position 0 uses ZIGDC so the DC coefficient can be handled
 * separately by the sub_4x4ac variants. */
#define ZIGZAG4_FRAME\
ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
#define ZIGZAG4_FIELD\
ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
802 static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
807 static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
813 #define ZIG(i,y,x) level[i] = dct[x*4+y];
814 #define ZIGDC(i,y,x) ZIG(i,y,x)
816 static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
821 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
823 memcpy( level, dct, 2 * sizeof(dctcoef) );
824 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
825 memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
/* ZIG for the combined subtract+zigzag functions: computes the residual at
 * (y,x) directly from the source (encode) and reconstruction (decode)
 * planes while scanning.
 * NOTE(review): the tail of this ZIG macro (presumably a nonzero
 * accumulation and closing brace) and the `#define COPY4x4`/`#define
 * COPY8x8` headers for the pixel-copy blocks below were lost in this
 * excerpt — confirm against the upstream file before relying on these
 * macros. */
#define ZIG(i,y,x) {\
int oe = x+y*FENC_STRIDE;\
int od = x+y*FDEC_STRIDE;\
level[i] = p_src[oe] - p_dst[od];\
CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
/* Combined subtract + zigzag scan: each function computes the residual
 * p_src - p_dst while scanning it into level[], copies the predicted
 * pixels into the reconstruction, and returns a block-nonzero flag. The
 * 4x4ac variants additionally write the DC coefficient through *dc
 * (via the ZIGDC macro below) instead of placing it in level[].
 * NOTE(review): every function body here (the ZIGZAG*/COPY* invocations
 * and the `return` statements) lies outside this excerpt — these are
 * signatures only as shown. */
static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
/* ZIGDC for the ac variants: route the DC residual to *dc */
#define ZIGDC(i,y,x) {\
int oe = x+y*FENC_STRIDE;\
int od = x+y*FDEC_STRIDE;\
*dc = p_src[oe] - p_dst[od];\
static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
909 static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
911 for( int i = 0; i < 4; i++ )
914 for( int j = 0; j < 16; j++ )
917 dst[i*16+j] = src[i+j*4];
919 nnz[(i&1) + (i>>1)*8] = !!nz;
/* Populate the progressive and interlaced zigzag function tables with the
 * C implementations, then override with platform assembly by cpu flags.
 * NOTE(review): the preprocessor guards around the per-architecture
 * sections (#if HAVE_MMX, #if ARCH_X86_64, #if HIGH_BIT_DEPTH,
 * #if HAVE_ALTIVEC, ARM guards, plus braces) are elided in this excerpt —
 * only two #endif markers survive below. Confirm the guard structure
 * against the upstream file. */
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
/* C reference implementations — always installed first */
pf_interlaced->scan_8x8 = zigzag_scan_8x8_field;
pf_progressive->scan_8x8 = zigzag_scan_8x8_frame;
pf_interlaced->scan_4x4 = zigzag_scan_4x4_field;
pf_progressive->scan_4x4 = zigzag_scan_4x4_frame;
pf_interlaced->sub_8x8 = zigzag_sub_8x8_field;
pf_progressive->sub_8x8 = zigzag_sub_8x8_frame;
pf_interlaced->sub_4x4 = zigzag_sub_4x4_field;
pf_progressive->sub_4x4 = zigzag_sub_4x4_frame;
pf_interlaced->sub_4x4ac = zigzag_sub_4x4ac_field;
pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;
/* x86 overrides — high-bit-depth branch */
if( cpu&X264_CPU_SSE2 )
pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
if( cpu&X264_CPU_SSE4 )
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
if( cpu&X264_CPU_AVX )
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
if( cpu&X264_CPU_AVX )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
#endif // ARCH_X86_64
/* x86 overrides — 8-bit-depth branch */
if( cpu&X264_CPU_MMX )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
if( cpu&X264_CPU_MMX2 )
pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmx2;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
if( cpu&X264_CPU_SSE2_IS_FAST )
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
if( cpu&X264_CPU_SSSE3 )
pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
if( cpu&X264_CPU_AVX )
pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_avx;
pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_avx;
pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
if( cpu&X264_CPU_XOP )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop;
/* PowerPC overrides */
if( cpu&X264_CPU_ALTIVEC )
pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
/* ARM overrides */
if( cpu&X264_CPU_NEON )
pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#endif // HIGH_BIT_DEPTH
/* interleave_8x8_cavlc: same function for both scan modes */
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
if( cpu&X264_CPU_SSE2 )
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
if( cpu&X264_CPU_AVX )
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
if( cpu&X264_CPU_MMX )
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
if( cpu&X264_CPU_AVX )
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
if( cpu&X264_CPU_AVX2 )
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
#endif // HIGH_BIT_DEPTH