/*****************************************************************************
 * dct.c: transform and zigzag
 *****************************************************************************
 * Copyright (C) 2003-2012 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *          Henrik Gramner <hengar-6@student.ltu.se>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at licensing@x264.com.
 *****************************************************************************/

#include "common.h"
#if HAVE_MMX
#   include "x86/dct.h"
#endif
#if ARCH_PPC
#   include "ppc/dct.h"
#endif
#if ARCH_ARM
#   include "arm/dct.h"
#endif

/* the inverse of the scaling factors introduced by 8x8 fdct */
/* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
#define W(i) (i==0 ? FIX8(1.0000) :\
              i==1 ? FIX8(0.8859) :\
              i==2 ? FIX8(1.6000) :\
              i==3 ? FIX8(0.9415) :\
              i==4 ? FIX8(1.2651) :\
              i==5 ? FIX8(1.1910) :0)
const uint32_t x264_dct8_weight_tab[64] = {
    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),

    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
};
#undef W

#define W(i) (i==0 ? FIX8(1.76777) :\
              i==1 ? FIX8(1.11803) :\
              i==2 ? FIX8(0.70711) :0)
const uint32_t x264_dct4_weight_tab[16] = {
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2),
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2)
};
#undef W

/* the squares of the above scaling factors */
#define W(i) (i==0 ? FIX8(3.125) :\
              i==1 ? FIX8(1.25) :\
              i==2 ? FIX8(0.5) :0)
const uint32_t x264_dct4_weight2_tab[16] = {
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2),
    W(0), W(1), W(0), W(1),
    W(1), W(2), W(1), W(2)
};
#undef W

#define W(i) (i==0 ? FIX8(1.00000) :\
              i==1 ? FIX8(0.78487) :\
              i==2 ? FIX8(2.56132) :\
              i==3 ? FIX8(0.88637) :\
              i==4 ? FIX8(1.60040) :\
              i==5 ? FIX8(1.41850) :0)
const uint32_t x264_dct8_weight2_tab[64] = {
    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),

    W(0), W(3), W(4), W(3),  W(0), W(3), W(4), W(3),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1),
    W(4), W(5), W(2), W(5),  W(4), W(5), W(2), W(5),
    W(3), W(1), W(5), W(1),  W(3), W(1), W(5), W(1)
};
#undef W
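
/* Illustration (added, not from the original source): FIX8 stores a
 * real-valued scale in Q8 fixed point, so FIX8(1.0) == 256 and applying an
 * entry costs one multiply and one shift, e.g.
 *
 *     uint32_t w = ( cost * x264_dct4_weight2_tab[idx] ) >> 8;
 *
 * where cost and idx are hypothetical names; the real consumer is the
 * trellis quantizer, whose asm reads these tables as 32-bit loads. */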

static void dct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s01 = d[i*4+0] + d[i*4+1];
        int d01 = d[i*4+0] - d[i*4+1];
        int s23 = d[i*4+2] + d[i*4+3];
        int d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s01 = tmp[i*4+0] + tmp[i*4+1];
        int d01 = tmp[i*4+0] - tmp[i*4+1];
        int s23 = tmp[i*4+2] + tmp[i*4+3];
        int d23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
        d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
        d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
        d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
    }
}

static void idct4x4dc( dctcoef d[16] )
{
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s01 = d[i*4+0] + d[i*4+1];
        int d01 = d[i*4+0] - d[i*4+1];
        int s23 = d[i*4+2] + d[i*4+3];
        int d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s01 = tmp[i*4+0] + tmp[i*4+1];
        int d01 = tmp[i*4+0] - tmp[i*4+1];
        int s23 = tmp[i*4+2] + tmp[i*4+3];
        int d23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = s01 + s23;
        d[i*4+1] = s01 - s23;
        d[i*4+2] = d01 - d23;
        d[i*4+3] = d01 + d23;
    }
}
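
/* Note (added): dct4x4dc/idct4x4dc are a 4x4 Walsh-Hadamard pair. The
 * forward pass halves with rounding ((x+1)>>1) while the inverse does not
 * shift, so a round trip scales the input by 8 (up to rounding); that gain
 * is absorbed by the quant/dequant stage that runs between the two. */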

static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
{
    int a0 = dct4x4[0][0] + dct4x4[1][0];
    int a1 = dct4x4[2][0] + dct4x4[3][0];
    int a2 = dct4x4[4][0] + dct4x4[5][0];
    int a3 = dct4x4[6][0] + dct4x4[7][0];
    int a4 = dct4x4[0][0] - dct4x4[1][0];
    int a5 = dct4x4[2][0] - dct4x4[3][0];
    int a6 = dct4x4[4][0] - dct4x4[5][0];
    int a7 = dct4x4[6][0] - dct4x4[7][0];
    /* second stage and DC clearing below are reconstructed, assuming a 4-pt
     * Hadamard down each column with the same output ordering as the other
     * DC transforms in this file */
    int b0 = a0 + a1;
    int b1 = a2 + a3;
    int b2 = a4 + a5;
    int b3 = a6 + a7;
    int b4 = a0 - a1;
    int b5 = a2 - a3;
    int b6 = a4 - a5;
    int b7 = a6 - a7;
    dct[0] = b0 + b1;
    dct[1] = b2 + b3;
    dct[2] = b0 - b1;
    dct[3] = b2 - b3;
    dct[4] = b4 - b5;
    dct[5] = b6 - b7;
    dct[6] = b4 + b5;
    dct[7] = b6 + b7;
    /* the DC of each 4x4 block now lives in dct[], so clear it */
    dct4x4[0][0] = 0;
    dct4x4[1][0] = 0;
    dct4x4[2][0] = 0;
    dct4x4[3][0] = 0;
    dct4x4[4][0] = 0;
    dct4x4[5][0] = 0;
    dct4x4[6][0] = 0;
    dct4x4[7][0] = 0;
}

static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
                                  pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
    for( int y = 0; y < i_size; y++ )
    {
        for( int x = 0; x < i_size; x++ )
            diff[x + y*i_size] = pix1[x] - pix2[x];
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}

static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
{
    dctcoef d[16];
    dctcoef tmp[16];

    pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

    for( int i = 0; i < 4; i++ )
    {
        int s03 = d[i*4+0] + d[i*4+3];
        int s12 = d[i*4+1] + d[i*4+2];
        int d03 = d[i*4+0] - d[i*4+3];
        int d12 = d[i*4+1] - d[i*4+2];

        tmp[0*4+i] =   s03 +   s12;
        tmp[1*4+i] = 2*d03 +   d12;
        tmp[2*4+i] =   s03 -   s12;
        tmp[3*4+i] =   d03 - 2*d12;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s03 = tmp[i*4+0] + tmp[i*4+3];
        int s12 = tmp[i*4+1] + tmp[i*4+2];
        int d03 = tmp[i*4+0] - tmp[i*4+3];
        int d12 = tmp[i*4+1] - tmp[i*4+2];

        dct[i*4+0] =   s03 +   s12;
        dct[i*4+1] = 2*d03 +   d12;
        dct[i*4+2] =   s03 -   s12;
        dct[i*4+3] =   d03 - 2*d12;
    }
}
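
/* Note (added): each 1-D pass of sub4x4_dct applies the H.264 integer core
 * transform
 *     [ 1  1  1  1 ]
 *     [ 2  1 -1 -2 ]
 *     [ 1 -1 -1  1 ]
 *     [ 1 -2  2 -1 ]
 * once along each dimension; the rows' unequal norms are compensated later,
 * in (de)quantization, not here. */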

static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
{
    sub4x4_dct( dct[0], &pix1[0],               &pix2[0] );
    sub4x4_dct( dct[1], &pix1[4],               &pix2[4] );
    sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
    sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
}

static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
{
    sub8x8_dct( &dct[ 0], &pix1[0],               &pix2[0] );
    sub8x8_dct( &dct[ 4], &pix1[8],               &pix2[8] );
    sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}

static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
{
    int sum = 0;
    for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
        sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
             - pix2[0] - pix2[1] - pix2[2] - pix2[3];
    return sum;
}

static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
{
    dct[0] = sub4x4_dct_dc( &pix1[0],               &pix2[0] );
    dct[1] = sub4x4_dct_dc( &pix1[4],               &pix2[4] );
    dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
    dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );

    /* 2x2 DC transform */
    int d0 = dct[0] + dct[1];
    int d1 = dct[2] + dct[3];
    int d2 = dct[0] - dct[1];
    int d3 = dct[2] - dct[3];
    /* output assignments reconstructed: raster order DC, H, V, HV */
    dct[0] = d0 + d1;
    dct[1] = d2 + d3;
    dct[2] = d0 - d1;
    dct[3] = d2 - d3;
}

static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
{
    int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
    int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
    int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
    int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
    int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
    int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
    int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
    int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );

    /* 2x4 DC transform */
    /* (butterflies below reconstructed: 2-pt horizontal x 4-pt vertical
     * Hadamard, output ordered to match dct2x4dc above) */
    int b0 = a0 + a1;
    int b1 = a2 + a3;
    int b2 = a4 + a5;
    int b3 = a6 + a7;
    int b4 = a0 - a1;
    int b5 = a2 - a3;
    int b6 = a4 - a5;
    int b7 = a6 - a7;
    a0 = b0 + b1;
    a1 = b2 + b3;
    a2 = b4 + b5;
    a3 = b6 + b7;
    a4 = b0 - b1;
    a5 = b2 - b3;
    a6 = b4 - b5;
    a7 = b6 - b7;
    dct[0] = a0 + a1;
    dct[1] = a2 + a3;
    dct[2] = a0 - a1;
    dct[3] = a2 - a3;
    dct[4] = a4 - a5;
    dct[5] = a6 - a7;
    dct[6] = a4 + a5;
    dct[7] = a6 + a7;
}

static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
{
    dctcoef d[16];
    dctcoef tmp[16];

    for( int i = 0; i < 4; i++ )
    {
        int s02 =  dct[0*4+i]     +  dct[2*4+i];
        int d02 =  dct[0*4+i]     -  dct[2*4+i];
        int s13 =  dct[1*4+i]     + (dct[3*4+i]>>1);
        int d13 = (dct[1*4+i]>>1) -  dct[3*4+i];

        tmp[i*4+0] = s02 + s13;
        tmp[i*4+1] = d02 + d13;
        tmp[i*4+2] = d02 - d13;
        tmp[i*4+3] = s02 - s13;
    }

    for( int i = 0; i < 4; i++ )
    {
        int s02 =  tmp[0*4+i]     +  tmp[2*4+i];
        int d02 =  tmp[0*4+i]     -  tmp[2*4+i];
        int s13 =  tmp[1*4+i]     + (tmp[3*4+i]>>1);
        int d13 = (tmp[1*4+i]>>1) -  tmp[3*4+i];

        d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
        d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
        d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
        d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
    }

    for( int y = 0; y < 4; y++ )
    {
        for( int x = 0; x < 4; x++ )
            p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
        p_dst += FDEC_STRIDE;
    }
}
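
/* Note (added): the inverse transform stays integer-only by folding the 1/2
 * weights of the odd basis vectors into shifts (the >>1 above), and the
 * final ( x + 32 ) >> 6 removes the 2^6 gain that dequantization applied to
 * the coefficients before this function runs. */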

static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
{
    add4x4_idct( &p_dst[0],               dct[0] );
    add4x4_idct( &p_dst[4],               dct[1] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
{
    add8x8_idct( &p_dst[0],               &dct[0] );
    add8x8_idct( &p_dst[8],               &dct[4] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
}

/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/

#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}

static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
{
    dctcoef tmp[64];

    pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

#define SRC(x) tmp[x*8+i]
#define DST(x) tmp[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) tmp[i*8+x]
#define DST(x) dct[x*8+i]
    for( int i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
}

static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
{
    sub8x8_dct8( dct[0], &pix1[0],               &pix2[0] );
    sub8x8_dct8( dct[1], &pix1[8],               &pix2[8] );
    sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
}
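
/* Note (added): DCT8_1D (above) and IDCT8_1D (below) read and write
 * exclusively through the SRC()/DST() macros, so one butterfly body serves
 * both the column pass and the row pass (and can write straight into a
 * different destination array) just by redefining two macros per loop. */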

#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}

static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
{
    dct[0] += 32; // rounding for the >>6 at the end

#define SRC(x)     dct[x*8+i]
#define DST(x,rhs) dct[x*8+i] = (rhs)
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST

#define SRC(x)     dct[i*8+x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
    for( int i = 0; i < 8; i++ )
        IDCT8_1D
#undef SRC
#undef DST
}

static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
{
    add8x8_idct8( &dst[0],               dct[0] );
    add8x8_idct8( &dst[8],               dct[1] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
}

static inline void add4x4_idct_dc( pixel *p_dst, dctcoef dc )
{
    dc = ( dc + 32 ) >> 6;
    for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
    {
        p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
        p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
        p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
        p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
    }
}

static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
{
    add4x4_idct_dc( &p_dst[0],               dct[0] );
    add4x4_idct_dc( &p_dst[4],               dct[1] );
    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
}

static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
{
    for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
    {
        add4x4_idct_dc( &p_dst[ 0], dct[0] );
        add4x4_idct_dc( &p_dst[ 4], dct[1] );
        add4x4_idct_dc( &p_dst[ 8], dct[2] );
        add4x4_idct_dc( &p_dst[12], dct[3] );
    }
}

/****************************************************************************
 * x264_dct_init:
 ****************************************************************************/
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
{
    dctf->sub4x4_dct    = sub4x4_dct;
    dctf->add4x4_idct   = add4x4_idct;

    dctf->sub8x8_dct    = sub8x8_dct;
    dctf->sub8x8_dct_dc = sub8x8_dct_dc;
    dctf->add8x8_idct   = add8x8_idct;
    dctf->add8x8_idct_dc = add8x8_idct_dc;

    dctf->sub8x16_dct_dc = sub8x16_dct_dc;

    dctf->sub16x16_dct  = sub16x16_dct;
    dctf->add16x16_idct = add16x16_idct;
    dctf->add16x16_idct_dc = add16x16_idct_dc;

    dctf->sub8x8_dct8   = sub8x8_dct8;
    dctf->add8x8_idct8  = add8x8_idct8;

    dctf->sub16x16_dct8  = sub16x16_dct8;
    dctf->add16x16_idct8 = add16x16_idct8;

    dctf->dct4x4dc  = dct4x4dc;
    dctf->idct4x4dc = idct4x4dc;

    dctf->dct2x4dc = dct2x4dc;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct   = x264_sub4x4_dct_mmx;
        dctf->sub8x8_dct   = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
    }
    if( cpu&X264_CPU_SSE2 )
    {
        dctf->add4x4_idct     = x264_add4x4_idct_sse2;
        dctf->dct4x4dc        = x264_dct4x4dc_sse2;
        dctf->idct4x4dc       = x264_idct4x4dc_sse2;
        dctf->sub8x8_dct8     = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8   = x264_sub16x16_dct8_sse2;
        dctf->add8x8_idct     = x264_add8x8_idct_sse2;
        dctf->add16x16_idct   = x264_add16x16_idct_sse2;
        dctf->add8x8_idct8    = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8  = x264_add16x16_idct8_sse2;
        dctf->sub8x8_dct_dc   = x264_sub8x8_dct_dc_sse2;
        dctf->add8x8_idct_dc  = x264_add8x8_idct_dc_sse2;
        dctf->sub8x16_dct_dc  = x264_sub8x16_dct_dc_sse2;
        dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
    }
    if( cpu&X264_CPU_SSE4 )
    {
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse4;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
    }
    if( cpu&X264_CPU_AVX )
    {
        dctf->add4x4_idct     = x264_add4x4_idct_avx;
        dctf->dct4x4dc        = x264_dct4x4dc_avx;
        dctf->idct4x4dc       = x264_idct4x4dc_avx;
        dctf->sub8x8_dct8     = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8   = x264_sub16x16_dct8_avx;
        dctf->add8x8_idct     = x264_add8x8_idct_avx;
        dctf->add16x16_idct   = x264_add16x16_idct_avx;
        dctf->add8x8_idct8    = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8  = x264_add16x16_idct8_avx;
        dctf->add8x8_idct_dc  = x264_add8x8_idct_dc_avx;
        dctf->sub8x16_dct_dc  = x264_sub8x16_dct_dc_avx;
        dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
    }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
        dctf->add4x4_idct   = x264_add4x4_idct_mmx;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
        dctf->dct4x4dc      = x264_dct4x4dc_mmx;
        dctf->idct4x4dc     = x264_idct4x4dc_mmx;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;

#if !ARCH_X86_64
        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
        dctf->add16x16_idct = x264_add16x16_idct_mmx;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
        dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
        dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
#endif
    }

    if( cpu&X264_CPU_SSE2 )
    {
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
        dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
        dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8= x264_add16x16_idct8_sse2;

        dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
        dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
        dctf->add8x8_idct   = x264_add8x8_idct_sse2;
        dctf->add16x16_idct = x264_add16x16_idct_sse2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
    }

    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
        dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
        dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
    }

    if( cpu&X264_CPU_SSE4 )
        dctf->add4x4_idct = x264_add4x4_idct_sse4;

    if( cpu&X264_CPU_AVX )
    {
        dctf->add4x4_idct      = x264_add4x4_idct_avx;
        dctf->add8x8_idct      = x264_add8x8_idct_avx;
        dctf->add16x16_idct    = x264_add16x16_idct_avx;
        dctf->add8x8_idct8     = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8   = x264_add16x16_idct8_avx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
        dctf->sub8x8_dct       = x264_sub8x8_dct_avx;
        dctf->sub16x16_dct     = x264_sub16x16_dct_avx;
        dctf->sub8x8_dct8      = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx;
    }

    if( cpu&X264_CPU_XOP )
    {
        dctf->sub8x8_dct   = x264_sub8x8_dct_xop;
        dctf->sub16x16_dct = x264_sub16x16_dct_xop;
    }
#endif //HAVE_MMX

#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
        dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
        dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;

        dctf->add4x4_idct   = x264_add4x4_idct_altivec;
        dctf->add8x8_idct   = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;

        dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
        dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
    }
#endif

#if HAVE_ARMV6
    if( cpu&X264_CPU_NEON )
    {
        dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
        dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
        dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
        dctf->dct4x4dc      = x264_dct4x4dc_neon;
        dctf->idct4x4dc     = x264_idct4x4dc_neon;

        dctf->add4x4_idct   = x264_add4x4_idct_neon;
        dctf->add8x8_idct   = x264_add8x8_idct_neon;
        dctf->add16x16_idct = x264_add16x16_idct_neon;

        dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;

        dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
        dctf->add16x16_idct8= x264_add16x16_idct8_neon;
    }
#endif
#endif // HIGH_BIT_DEPTH
}
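
/* Usage sketch (added, illustrative only): callers fill the dispatch table
 * once at init and then always go through the function pointers, so CPU
 * detection costs nothing per block:
 *
 *     x264_dct_function_t dctf;
 *     x264_dct_init( x264_cpu_detect(), &dctf );
 *     dctf.sub4x4_dct( dct, fenc, fdec );  // residual + forward transform
 *     //    ... quant / dequant ...
 *     dctf.add4x4_idct( fdec, dct );       // inverse transform + reconstruct
 *
 * fenc/fdec stand for FENC_STRIDE/FDEC_STRIDE-pitched pixel buffers and dct
 * for a 16-entry dctcoef array; all three are hypothetical variables here. */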

#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)

#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)

#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
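
/* Note (added): ZIG(i,y,x) copies the coefficient at transform position
 * (y,x) into scan position i, so each ZIGZAG* macro above is a fully
 * unrolled scan with its table baked into the code; e.g. with the 8x8
 * definition of ZIG, ZIG(1,0,1) expands to level[1] = dct[1*8+0];. */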

static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FRAME
}

static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
{
    ZIGZAG8_FIELD
}

#undef ZIG
#define ZIG(i,y,x) level[i] = dct[x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)

static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
{
    ZIGZAG4_FRAME
}

static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
{
    memcpy( level, dct, 2 * sizeof(dctcoef) );
    ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
    memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
}

#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );

static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}

static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}

#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}

static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FRAME
    COPY4x4
    return !!nz;
}

static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
{
    int nz = 0;
    ZIGZAG4_FIELD
    COPY4x4
    return !!nz;
}

static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FRAME
    COPY8x8
    return !!nz;
}

static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
{
    int nz = 0;
    ZIGZAG8_FIELD
    COPY8x8
    return !!nz;
}

#undef ZIG
#undef ZIGDC
#undef COPY4x4
#undef COPY8x8

static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
{
    for( int i = 0; i < 4; i++ )
    {
        int nz = 0;
        for( int j = 0; j < 16; j++ )
        {
            nz |= src[i+j*4];
            dst[i*16+j] = src[i+j*4];
        }
        nnz[(i&1) + (i>>1)*8] = !!nz;
    }
}
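
/* Note (added): CAVLC can only code 4x4 blocks, so the 64 coefficients of
 * an 8x8 transform are split into four interleaved 4x4 scans (every 4th
 * coefficient lands in the same sub-block), and each sub-block gets its own
 * nonzero flag at the proper position in the nnz map. */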

void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
{
    pf_interlaced->scan_8x8   = zigzag_scan_8x8_field;
    pf_progressive->scan_8x8  = zigzag_scan_8x8_frame;
    pf_interlaced->scan_4x4   = zigzag_scan_4x4_field;
    pf_progressive->scan_4x4  = zigzag_scan_4x4_frame;
    pf_interlaced->sub_8x8    = zigzag_sub_8x8_field;
    pf_progressive->sub_8x8   = zigzag_sub_8x8_frame;
    pf_interlaced->sub_4x4    = zigzag_sub_4x4_field;
    pf_progressive->sub_4x4   = zigzag_sub_4x4_frame;
    pf_interlaced->sub_4x4ac  = zigzag_sub_4x4ac_field;
    pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;

#if HIGH_BIT_DEPTH
#if HAVE_MMX
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_sse2;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    }
    if( cpu&X264_CPU_SSE4 )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
    if( cpu&X264_CPU_AVX )
        pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
#if ARCH_X86_64
    if( cpu&X264_CPU_AVX )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
    }
#endif // ARCH_X86_64
#endif // HAVE_MMX
#else
#if HAVE_MMX
    if( cpu&X264_CPU_MMX )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
    if( cpu&X264_CPU_MMX2 )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_mmx2;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_mmx2;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
    }
    if( cpu&X264_CPU_SSE2_IS_FAST )
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    if( cpu&X264_CPU_SSSE3 )
    {
        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_ssse3;
        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->sub_4x4   = x264_zigzag_sub_4x4_field_avx;
        pf_progressive->sub_4x4  = x264_zigzag_sub_4x4_frame_avx;

        pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
        pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;

        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
            pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
    }
    if( cpu&X264_CPU_XOP )
    {
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
        pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
        pf_interlaced->scan_8x8  = x264_zigzag_scan_8x8_field_xop;
    }
#endif // HAVE_MMX
#if HAVE_ALTIVEC
    if( cpu&X264_CPU_ALTIVEC )
    {
        pf_interlaced->scan_4x4  = x264_zigzag_scan_4x4_field_altivec;
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
    }
#endif
#if HAVE_ARMV6
    if( cpu&X264_CPU_NEON )
        pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#endif
#endif // HIGH_BIT_DEPTH

    pf_interlaced->interleave_8x8_cavlc =
    pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
#if HAVE_MMX
#if HIGH_BIT_DEPTH
    if( cpu&X264_CPU_SSE2 )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }
    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }
#else
    if( cpu&X264_CPU_MMX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
    }
    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
    }

    if( cpu&X264_CPU_AVX )
    {
        pf_interlaced->interleave_8x8_cavlc =
        pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
    }
#endif // HIGH_BIT_DEPTH
#endif // HAVE_MMX
}