1 /*****************************************************************************
2 * dct.c: transform and zigzag
3 *****************************************************************************
4 * Copyright (C) 2003-2016 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
8 * Henrik Gramner <henrik@gramner.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
39 # include "aarch64/dct.h"
42 # include "mips/dct.h"
45 /* the inverse of the scaling factors introduced by 8x8 fdct */
46 /* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
47 #define W(i) (i==0 ? FIX8(1.0000) :\
48 i==1 ? FIX8(0.8859) :\
49 i==2 ? FIX8(1.6000) :\
50 i==3 ? FIX8(0.9415) :\
51 i==4 ? FIX8(1.2651) :\
52 i==5 ? FIX8(1.1910) :0)
53 const uint32_t x264_dct8_weight_tab[64] = {
54 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
55 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
56 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
57 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
59 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
60 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
61 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
62 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
66 #define W(i) (i==0 ? FIX8(1.76777) :\
67 i==1 ? FIX8(1.11803) :\
68 i==2 ? FIX8(0.70711) :0)
69 const uint32_t x264_dct4_weight_tab[16] = {
70 W(0), W(1), W(0), W(1),
71 W(1), W(2), W(1), W(2),
72 W(0), W(1), W(0), W(1),
73 W(1), W(2), W(1), W(2)
78 #define W(i) (i==0 ? FIX8(3.125) :\
81 const uint32_t x264_dct4_weight2_tab[16] = {
82 W(0), W(1), W(0), W(1),
83 W(1), W(2), W(1), W(2),
84 W(0), W(1), W(0), W(1),
85 W(1), W(2), W(1), W(2)
89 #define W(i) (i==0 ? FIX8(1.00000) :\
90 i==1 ? FIX8(0.78487) :\
91 i==2 ? FIX8(2.56132) :\
92 i==3 ? FIX8(0.88637) :\
93 i==4 ? FIX8(1.60040) :\
94 i==5 ? FIX8(1.41850) :0)
95 const uint32_t x264_dct8_weight2_tab[64] = {
96 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
97 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
98 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
99 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
101 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
102 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
103 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
104 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
109 static void dct4x4dc( dctcoef d[16] )
113 for( int i = 0; i < 4; i++ )
115 int s01 = d[i*4+0] + d[i*4+1];
116 int d01 = d[i*4+0] - d[i*4+1];
117 int s23 = d[i*4+2] + d[i*4+3];
118 int d23 = d[i*4+2] - d[i*4+3];
120 tmp[0*4+i] = s01 + s23;
121 tmp[1*4+i] = s01 - s23;
122 tmp[2*4+i] = d01 - d23;
123 tmp[3*4+i] = d01 + d23;
126 for( int i = 0; i < 4; i++ )
128 int s01 = tmp[i*4+0] + tmp[i*4+1];
129 int d01 = tmp[i*4+0] - tmp[i*4+1];
130 int s23 = tmp[i*4+2] + tmp[i*4+3];
131 int d23 = tmp[i*4+2] - tmp[i*4+3];
133 d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
134 d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
135 d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
136 d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
140 static void idct4x4dc( dctcoef d[16] )
144 for( int i = 0; i < 4; i++ )
146 int s01 = d[i*4+0] + d[i*4+1];
147 int d01 = d[i*4+0] - d[i*4+1];
148 int s23 = d[i*4+2] + d[i*4+3];
149 int d23 = d[i*4+2] - d[i*4+3];
151 tmp[0*4+i] = s01 + s23;
152 tmp[1*4+i] = s01 - s23;
153 tmp[2*4+i] = d01 - d23;
154 tmp[3*4+i] = d01 + d23;
157 for( int i = 0; i < 4; i++ )
159 int s01 = tmp[i*4+0] + tmp[i*4+1];
160 int d01 = tmp[i*4+0] - tmp[i*4+1];
161 int s23 = tmp[i*4+2] + tmp[i*4+3];
162 int d23 = tmp[i*4+2] - tmp[i*4+3];
164 d[i*4+0] = s01 + s23;
165 d[i*4+1] = s01 - s23;
166 d[i*4+2] = d01 - d23;
167 d[i*4+3] = d01 + d23;
171 static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
173 int a0 = dct4x4[0][0] + dct4x4[1][0];
174 int a1 = dct4x4[2][0] + dct4x4[3][0];
175 int a2 = dct4x4[4][0] + dct4x4[5][0];
176 int a3 = dct4x4[6][0] + dct4x4[7][0];
177 int a4 = dct4x4[0][0] - dct4x4[1][0];
178 int a5 = dct4x4[2][0] - dct4x4[3][0];
179 int a6 = dct4x4[4][0] - dct4x4[5][0];
180 int a7 = dct4x4[6][0] - dct4x4[7][0];
207 static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
208 pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
210 for( int y = 0; y < i_size; y++ )
212 for( int x = 0; x < i_size; x++ )
213 diff[x + y*i_size] = pix1[x] - pix2[x];
219 static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
224 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
226 for( int i = 0; i < 4; i++ )
228 int s03 = d[i*4+0] + d[i*4+3];
229 int s12 = d[i*4+1] + d[i*4+2];
230 int d03 = d[i*4+0] - d[i*4+3];
231 int d12 = d[i*4+1] - d[i*4+2];
233 tmp[0*4+i] = s03 + s12;
234 tmp[1*4+i] = 2*d03 + d12;
235 tmp[2*4+i] = s03 - s12;
236 tmp[3*4+i] = d03 - 2*d12;
239 for( int i = 0; i < 4; i++ )
241 int s03 = tmp[i*4+0] + tmp[i*4+3];
242 int s12 = tmp[i*4+1] + tmp[i*4+2];
243 int d03 = tmp[i*4+0] - tmp[i*4+3];
244 int d12 = tmp[i*4+1] - tmp[i*4+2];
246 dct[i*4+0] = s03 + s12;
247 dct[i*4+1] = 2*d03 + d12;
248 dct[i*4+2] = s03 - s12;
249 dct[i*4+3] = d03 - 2*d12;
253 static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
255 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
256 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
257 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
258 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
261 static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
263 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
264 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
265 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
266 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
269 static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
272 for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
273 sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
274 - pix2[0] - pix2[1] - pix2[2] - pix2[3];
278 static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
280 dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
281 dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
282 dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
283 dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
285 /* 2x2 DC transform */
286 int d0 = dct[0] + dct[1];
287 int d1 = dct[2] + dct[3];
288 int d2 = dct[0] - dct[1];
289 int d3 = dct[2] - dct[3];
296 static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
298 int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
299 int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
300 int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
301 int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
302 int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
303 int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
304 int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
305 int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );
307 /* 2x4 DC transform */
334 static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
339 for( int i = 0; i < 4; i++ )
341 int s02 = dct[0*4+i] + dct[2*4+i];
342 int d02 = dct[0*4+i] - dct[2*4+i];
343 int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
344 int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
346 tmp[i*4+0] = s02 + s13;
347 tmp[i*4+1] = d02 + d13;
348 tmp[i*4+2] = d02 - d13;
349 tmp[i*4+3] = s02 - s13;
352 for( int i = 0; i < 4; i++ )
354 int s02 = tmp[0*4+i] + tmp[2*4+i];
355 int d02 = tmp[0*4+i] - tmp[2*4+i];
356 int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
357 int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
359 d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
360 d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
361 d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
362 d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
366 for( int y = 0; y < 4; y++ )
368 for( int x = 0; x < 4; x++ )
369 p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
370 p_dst += FDEC_STRIDE;
374 static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
376 add4x4_idct( &p_dst[0], dct[0] );
377 add4x4_idct( &p_dst[4], dct[1] );
378 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
379 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
382 static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
384 add8x8_idct( &p_dst[0], &dct[0] );
385 add8x8_idct( &p_dst[8], &dct[4] );
386 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
387 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/

/* One pass of the forward 8x8 H.264 integer DCT; SRC/DST are #defined by
 * the caller to select row or column addressing. */
#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
421 static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
425 pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
427 #define SRC(x) tmp[x*8+i]
428 #define DST(x) tmp[x*8+i]
429 for( int i = 0; i < 8; i++ )
434 #define SRC(x) tmp[i*8+x]
435 #define DST(x) dct[x*8+i]
436 for( int i = 0; i < 8; i++ )
442 static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
444 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
445 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
446 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
447 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* One pass of the inverse 8x8 H.264 integer DCT; SRC reads a coefficient,
 * DST(x,rhs) stores/accumulates the result — both #defined by the caller. */
#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
477 static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
479 dct[0] += 32; // rounding for the >>6 at the end
481 #define SRC(x) dct[x*8+i]
482 #define DST(x,rhs) dct[x*8+i] = (rhs)
483 for( int i = 0; i < 8; i++ )
488 #define SRC(x) dct[i*8+x]
489 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
490 for( int i = 0; i < 8; i++ )
496 static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
498 add8x8_idct8( &dst[0], dct[0] );
499 add8x8_idct8( &dst[8], dct[1] );
500 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
501 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
504 static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
507 for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
509 p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
510 p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
511 p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
512 p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
516 static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
518 add4x4_idct_dc( &p_dst[0], dct[0] );
519 add4x4_idct_dc( &p_dst[4], dct[1] );
520 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
521 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
524 static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
526 for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
528 add4x4_idct_dc( &p_dst[ 0], dct[0] );
529 add4x4_idct_dc( &p_dst[ 4], dct[1] );
530 add4x4_idct_dc( &p_dst[ 8], dct[2] );
531 add4x4_idct_dc( &p_dst[12], dct[3] );
536 /****************************************************************************
538 ****************************************************************************/
539 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
541 dctf->sub4x4_dct = sub4x4_dct;
542 dctf->add4x4_idct = add4x4_idct;
544 dctf->sub8x8_dct = sub8x8_dct;
545 dctf->sub8x8_dct_dc = sub8x8_dct_dc;
546 dctf->add8x8_idct = add8x8_idct;
547 dctf->add8x8_idct_dc = add8x8_idct_dc;
549 dctf->sub8x16_dct_dc = sub8x16_dct_dc;
551 dctf->sub16x16_dct = sub16x16_dct;
552 dctf->add16x16_idct = add16x16_idct;
553 dctf->add16x16_idct_dc = add16x16_idct_dc;
555 dctf->sub8x8_dct8 = sub8x8_dct8;
556 dctf->add8x8_idct8 = add8x8_idct8;
558 dctf->sub16x16_dct8 = sub16x16_dct8;
559 dctf->add16x16_idct8 = add16x16_idct8;
561 dctf->dct4x4dc = dct4x4dc;
562 dctf->idct4x4dc = idct4x4dc;
564 dctf->dct2x4dc = dct2x4dc;
568 if( cpu&X264_CPU_MMX )
570 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
571 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
572 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
574 if( cpu&X264_CPU_SSE2 )
576 dctf->add4x4_idct = x264_add4x4_idct_sse2;
577 dctf->dct4x4dc = x264_dct4x4dc_sse2;
578 dctf->idct4x4dc = x264_idct4x4dc_sse2;
579 dctf->dct2x4dc = x264_dct2x4dc_sse2;
580 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
581 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
582 dctf->add8x8_idct = x264_add8x8_idct_sse2;
583 dctf->add16x16_idct = x264_add16x16_idct_sse2;
584 dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
585 dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;
586 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
587 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
588 dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
589 dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
591 if( cpu&X264_CPU_SSE4 )
593 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4;
594 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
596 if( cpu&X264_CPU_AVX )
598 dctf->add4x4_idct = x264_add4x4_idct_avx;
599 dctf->dct4x4dc = x264_dct4x4dc_avx;
600 dctf->idct4x4dc = x264_idct4x4dc_avx;
601 dctf->dct2x4dc = x264_dct2x4dc_avx;
602 dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
603 dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
604 dctf->add8x8_idct = x264_add8x8_idct_avx;
605 dctf->add16x16_idct = x264_add16x16_idct_avx;
606 dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
607 dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
608 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
609 dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx;
610 dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
613 #else // !HIGH_BIT_DEPTH
615 if( cpu&X264_CPU_MMX )
617 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
618 dctf->add4x4_idct = x264_add4x4_idct_mmx;
619 dctf->idct4x4dc = x264_idct4x4dc_mmx;
620 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
623 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
624 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
625 dctf->add8x8_idct = x264_add8x8_idct_mmx;
626 dctf->add16x16_idct = x264_add16x16_idct_mmx;
628 dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
629 dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
630 dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
631 dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
635 if( cpu&X264_CPU_MMX2 )
637 dctf->dct4x4dc = x264_dct4x4dc_mmx2;
638 dctf->dct2x4dc = x264_dct2x4dc_mmx2;
639 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2;
640 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
643 if( cpu&X264_CPU_SSE2 )
645 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
646 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
647 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
648 dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
649 dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
650 dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
652 if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
654 dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
655 dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
656 dctf->add8x8_idct = x264_add8x8_idct_sse2;
657 dctf->add16x16_idct = x264_add16x16_idct_sse2;
658 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
662 if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
664 dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
665 if( !(cpu&X264_CPU_SLOW_ATOM) )
667 dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
668 dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
669 dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
670 dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
671 dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
672 if( !(cpu&X264_CPU_SLOW_PSHUFB) )
674 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
675 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
680 if( cpu&X264_CPU_SSE4 )
681 dctf->add4x4_idct = x264_add4x4_idct_sse4;
683 if( cpu&X264_CPU_AVX )
685 dctf->add4x4_idct = x264_add4x4_idct_avx;
686 dctf->add8x8_idct = x264_add8x8_idct_avx;
687 dctf->add16x16_idct = x264_add16x16_idct_avx;
688 dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
689 dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
690 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
691 dctf->sub8x8_dct = x264_sub8x8_dct_avx;
692 dctf->sub16x16_dct = x264_sub16x16_dct_avx;
693 dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
694 dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
697 if( cpu&X264_CPU_XOP )
699 dctf->sub8x8_dct = x264_sub8x8_dct_xop;
700 dctf->sub16x16_dct = x264_sub16x16_dct_xop;
703 if( cpu&X264_CPU_AVX2 )
705 dctf->add8x8_idct = x264_add8x8_idct_avx2;
706 dctf->add16x16_idct = x264_add16x16_idct_avx2;
707 dctf->sub8x8_dct = x264_sub8x8_dct_avx2;
708 dctf->sub16x16_dct = x264_sub16x16_dct_avx2;
709 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
711 dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
717 if( cpu&X264_CPU_ALTIVEC )
719 dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
720 dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
721 dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
723 dctf->add4x4_idct = x264_add4x4_idct_altivec;
724 dctf->add8x8_idct = x264_add8x8_idct_altivec;
725 dctf->add16x16_idct = x264_add16x16_idct_altivec;
727 dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
728 dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
730 dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
731 dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
735 #if HAVE_ARMV6 || ARCH_AARCH64
736 if( cpu&X264_CPU_NEON )
738 dctf->sub4x4_dct = x264_sub4x4_dct_neon;
739 dctf->sub8x8_dct = x264_sub8x8_dct_neon;
740 dctf->sub16x16_dct = x264_sub16x16_dct_neon;
741 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
742 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
743 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
744 dctf->dct4x4dc = x264_dct4x4dc_neon;
745 dctf->idct4x4dc = x264_idct4x4dc_neon;
747 dctf->add4x4_idct = x264_add4x4_idct_neon;
748 dctf->add8x8_idct = x264_add8x8_idct_neon;
749 dctf->add16x16_idct = x264_add16x16_idct_neon;
751 dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
752 dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
754 dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
755 dctf->add16x16_idct8= x264_add16x16_idct8_neon;
756 dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
761 if( cpu&X264_CPU_MSA )
763 dctf->sub4x4_dct = x264_sub4x4_dct_msa;
764 dctf->sub8x8_dct = x264_sub8x8_dct_msa;
765 dctf->sub16x16_dct = x264_sub16x16_dct_msa;
766 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_msa;
767 dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_msa;
768 dctf->dct4x4dc = x264_dct4x4dc_msa;
769 dctf->idct4x4dc = x264_idct4x4dc_msa;
770 dctf->add4x4_idct = x264_add4x4_idct_msa;
771 dctf->add8x8_idct = x264_add8x8_idct_msa;
772 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_msa;
773 dctf->add16x16_idct = x264_add16x16_idct_msa;
774 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa;
775 dctf->add8x8_idct8 = x264_add8x8_idct8_msa;
776 dctf->add16x16_idct8 = x264_add16x16_idct8_msa;
780 #endif // HIGH_BIT_DEPTH
/* zigzag scan orders: ZIG(scan_index, row, col). The 8x8 dct[] is stored
 * column-major here (dct[x*8+y]). */
#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)

#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)

#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
833 static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
838 static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
844 #define ZIG(i,y,x) level[i] = dct[x*4+y];
845 #define ZIGDC(i,y,x) ZIG(i,y,x)
847 static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
852 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
854 memcpy( level, dct, 2 * sizeof(dctcoef) );
855 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
856 memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
/* fused subtract+zigzag: each ZIG computes one residual coefficient in scan
 * order and accumulates it into nz (declared by the enclosing function) */
#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
/* copy the encoded pixels into the reconstruction buffer */
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
882 static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
890 static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
/* AC variant: route the DC residual to *dc and keep level[0] zero so only
 * the AC coefficients are scanned. */
#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}
906 static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
914 static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
922 static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
929 static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
940 static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
942 for( int i = 0; i < 4; i++ )
945 for( int j = 0; j < 16; j++ )
948 dst[i*16+j] = src[i+j*4];
950 nnz[(i&1) + (i>>1)*8] = !!nz;
954 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
956 pf_interlaced->scan_8x8 = zigzag_scan_8x8_field;
957 pf_progressive->scan_8x8 = zigzag_scan_8x8_frame;
958 pf_interlaced->scan_4x4 = zigzag_scan_4x4_field;
959 pf_progressive->scan_4x4 = zigzag_scan_4x4_frame;
960 pf_interlaced->sub_8x8 = zigzag_sub_8x8_field;
961 pf_progressive->sub_8x8 = zigzag_sub_8x8_frame;
962 pf_interlaced->sub_4x4 = zigzag_sub_4x4_field;
963 pf_progressive->sub_4x4 = zigzag_sub_4x4_frame;
964 pf_interlaced->sub_4x4ac = zigzag_sub_4x4ac_field;
965 pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;
969 if( cpu&X264_CPU_SSE2 )
971 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
972 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
973 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
975 if( cpu&X264_CPU_SSE4 )
976 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
977 if( cpu&X264_CPU_AVX )
978 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
980 if( cpu&X264_CPU_AVX )
982 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
983 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
985 #endif // ARCH_X86_64
989 if( cpu&X264_CPU_MMX )
990 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
991 if( cpu&X264_CPU_MMX2 )
993 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmx2;
994 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2;
995 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
997 if( cpu&X264_CPU_SSE2_IS_FAST )
998 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
999 if( cpu&X264_CPU_SSSE3 )
1001 pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
1002 pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
1003 pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
1004 pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
1005 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
1006 if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
1007 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
1009 if( cpu&X264_CPU_AVX )
1011 pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_avx;
1012 pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_avx;
1014 pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
1015 pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
1017 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
1019 if( cpu&X264_CPU_XOP )
1021 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
1022 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
1023 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop;
1027 if( cpu&X264_CPU_ALTIVEC )
1029 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
1030 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
1033 #if HAVE_ARMV6 || ARCH_AARCH64
1034 if( cpu&X264_CPU_NEON )
1036 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
1038 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_neon;
1039 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_neon;
1040 pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_neon;
1041 pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_neon;
1042 pf_interlaced->sub_8x8 = x264_zigzag_sub_8x8_field_neon;
1043 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_neon;
1044 pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_neon;
1045 pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon;
1046 pf_progressive->sub_8x8 = x264_zigzag_sub_8x8_frame_neon;
1047 #endif // ARCH_AARCH64
1049 #endif // HAVE_ARMV6 || ARCH_AARCH64
1050 #endif // HIGH_BIT_DEPTH
1052 pf_interlaced->interleave_8x8_cavlc =
1053 pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
1056 if( cpu&X264_CPU_SSE2 )
1058 pf_interlaced->interleave_8x8_cavlc =
1059 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
1061 if( cpu&X264_CPU_AVX )
1063 pf_interlaced->interleave_8x8_cavlc =
1064 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
1067 if( cpu&X264_CPU_MMX )
1069 pf_interlaced->interleave_8x8_cavlc =
1070 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
1072 if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
1074 pf_interlaced->interleave_8x8_cavlc =
1075 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
1078 if( cpu&X264_CPU_AVX )
1080 pf_interlaced->interleave_8x8_cavlc =
1081 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
1084 if( cpu&X264_CPU_AVX2 )
1086 pf_interlaced->interleave_8x8_cavlc =
1087 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
1089 #endif // HIGH_BIT_DEPTH
1093 if( cpu&X264_CPU_NEON )
1095 pf_interlaced->interleave_8x8_cavlc =
1096 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
1098 #endif // ARCH_AARCH64
1099 #endif // !HIGH_BIT_DEPTH
1102 if( cpu&X264_CPU_MSA )
1104 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa;