1 /*****************************************************************************
2 * dct.c: transform and zigzag
3 *****************************************************************************
4 * Copyright (C) 2003-2016 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
8 * Henrik Gramner <henrik@gramner.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
39 # include "aarch64/dct.h"
42 # include "mips/dct.h"
45 /* the inverse of the scaling factors introduced by 8x8 fdct */
46 /* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
47 #define W(i) (i==0 ? FIX8(1.0000) :\
48 i==1 ? FIX8(0.8859) :\
49 i==2 ? FIX8(1.6000) :\
50 i==3 ? FIX8(0.9415) :\
51 i==4 ? FIX8(1.2651) :\
52 i==5 ? FIX8(1.1910) :0)
53 const uint32_t x264_dct8_weight_tab[64] = {
54 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
55 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
56 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
57 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
59 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
60 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
61 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
62 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
66 #define W(i) (i==0 ? FIX8(1.76777) :\
67 i==1 ? FIX8(1.11803) :\
68 i==2 ? FIX8(0.70711) :0)
69 const uint32_t x264_dct4_weight_tab[16] = {
70 W(0), W(1), W(0), W(1),
71 W(1), W(2), W(1), W(2),
72 W(0), W(1), W(0), W(1),
73 W(1), W(2), W(1), W(2)
78 #define W(i) (i==0 ? FIX8(3.125) :\
81 const uint32_t x264_dct4_weight2_tab[16] = {
82 W(0), W(1), W(0), W(1),
83 W(1), W(2), W(1), W(2),
84 W(0), W(1), W(0), W(1),
85 W(1), W(2), W(1), W(2)
89 #define W(i) (i==0 ? FIX8(1.00000) :\
90 i==1 ? FIX8(0.78487) :\
91 i==2 ? FIX8(2.56132) :\
92 i==3 ? FIX8(0.88637) :\
93 i==4 ? FIX8(1.60040) :\
94 i==5 ? FIX8(1.41850) :0)
95 const uint32_t x264_dct8_weight2_tab[64] = {
96 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
97 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
98 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
99 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
101 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
102 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
103 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
104 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
109 static void dct4x4dc( dctcoef d[16] )
113 for( int i = 0; i < 4; i++ )
115 int s01 = d[i*4+0] + d[i*4+1];
116 int d01 = d[i*4+0] - d[i*4+1];
117 int s23 = d[i*4+2] + d[i*4+3];
118 int d23 = d[i*4+2] - d[i*4+3];
120 tmp[0*4+i] = s01 + s23;
121 tmp[1*4+i] = s01 - s23;
122 tmp[2*4+i] = d01 - d23;
123 tmp[3*4+i] = d01 + d23;
126 for( int i = 0; i < 4; i++ )
128 int s01 = tmp[i*4+0] + tmp[i*4+1];
129 int d01 = tmp[i*4+0] - tmp[i*4+1];
130 int s23 = tmp[i*4+2] + tmp[i*4+3];
131 int d23 = tmp[i*4+2] - tmp[i*4+3];
133 d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
134 d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
135 d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
136 d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
140 static void idct4x4dc( dctcoef d[16] )
144 for( int i = 0; i < 4; i++ )
146 int s01 = d[i*4+0] + d[i*4+1];
147 int d01 = d[i*4+0] - d[i*4+1];
148 int s23 = d[i*4+2] + d[i*4+3];
149 int d23 = d[i*4+2] - d[i*4+3];
151 tmp[0*4+i] = s01 + s23;
152 tmp[1*4+i] = s01 - s23;
153 tmp[2*4+i] = d01 - d23;
154 tmp[3*4+i] = d01 + d23;
157 for( int i = 0; i < 4; i++ )
159 int s01 = tmp[i*4+0] + tmp[i*4+1];
160 int d01 = tmp[i*4+0] - tmp[i*4+1];
161 int s23 = tmp[i*4+2] + tmp[i*4+3];
162 int d23 = tmp[i*4+2] - tmp[i*4+3];
164 d[i*4+0] = s01 + s23;
165 d[i*4+1] = s01 - s23;
166 d[i*4+2] = d01 - d23;
167 d[i*4+3] = d01 + d23;
/* 2x4 DC transform over the DC coefficients of eight 4x4 blocks (4:2:2
 * chroma).  NOTE(review): only the first butterfly stage (a0..a7 sums and
 * differences of the per-block DCs) is present in this extraction — the
 * output stage writing dct[0..7] appears to have been truncated; restore it
 * from upstream before building. */
static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
    int a0 = dct4x4[0][0] + dct4x4[1][0];
    int a1 = dct4x4[2][0] + dct4x4[3][0];
    int a2 = dct4x4[4][0] + dct4x4[5][0];
    int a3 = dct4x4[6][0] + dct4x4[7][0];
    int a4 = dct4x4[0][0] - dct4x4[1][0];
    int a5 = dct4x4[2][0] - dct4x4[3][0];
    int a6 = dct4x4[4][0] - dct4x4[5][0];
    int a7 = dct4x4[6][0] - dct4x4[7][0];
207 static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
208 pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
210 for( int y = 0; y < i_size; y++ )
212 for( int x = 0; x < i_size; x++ )
213 diff[x + y*i_size] = pix1[x] - pix2[x];
219 static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
224 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
226 for( int i = 0; i < 4; i++ )
228 int s03 = d[i*4+0] + d[i*4+3];
229 int s12 = d[i*4+1] + d[i*4+2];
230 int d03 = d[i*4+0] - d[i*4+3];
231 int d12 = d[i*4+1] - d[i*4+2];
233 tmp[0*4+i] = s03 + s12;
234 tmp[1*4+i] = 2*d03 + d12;
235 tmp[2*4+i] = s03 - s12;
236 tmp[3*4+i] = d03 - 2*d12;
239 for( int i = 0; i < 4; i++ )
241 int s03 = tmp[i*4+0] + tmp[i*4+3];
242 int s12 = tmp[i*4+1] + tmp[i*4+2];
243 int d03 = tmp[i*4+0] - tmp[i*4+3];
244 int d12 = tmp[i*4+1] - tmp[i*4+2];
246 dct[i*4+0] = s03 + s12;
247 dct[i*4+1] = 2*d03 + d12;
248 dct[i*4+2] = s03 - s12;
249 dct[i*4+3] = d03 - 2*d12;
253 static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
255 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
256 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
257 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
258 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
261 static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
263 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
264 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
265 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
266 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
269 static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
272 for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
273 sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
274 - pix2[0] - pix2[1] - pix2[2] - pix2[3];
278 static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
280 dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
281 dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
282 dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
283 dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
285 /* 2x2 DC transform */
286 int d0 = dct[0] + dct[1];
287 int d1 = dct[2] + dct[3];
288 int d2 = dct[0] - dct[1];
289 int d3 = dct[2] - dct[3];
/* DC-only 8x16 transform (4:2:2 chroma): one DC per 4x4 sub-block, eight in
 * total, followed by a 2x4 transform of those DCs.
 * NOTE(review): only the per-block DC extraction (a0..a7) and the
 * "2x4 DC transform" marker survive in this extraction — the butterfly stage
 * writing dct[0..7] appears to have been truncated; restore from upstream
 * before building. */
static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
    int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
    int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
    int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
    int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
    int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
    int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
    int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
    int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );

    /* 2x4 DC transform */
334 static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
339 for( int i = 0; i < 4; i++ )
341 int s02 = dct[0*4+i] + dct[2*4+i];
342 int d02 = dct[0*4+i] - dct[2*4+i];
343 int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
344 int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
346 tmp[i*4+0] = s02 + s13;
347 tmp[i*4+1] = d02 + d13;
348 tmp[i*4+2] = d02 - d13;
349 tmp[i*4+3] = s02 - s13;
352 for( int i = 0; i < 4; i++ )
354 int s02 = tmp[0*4+i] + tmp[2*4+i];
355 int d02 = tmp[0*4+i] - tmp[2*4+i];
356 int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
357 int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
359 d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
360 d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
361 d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
362 d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
366 for( int y = 0; y < 4; y++ )
368 for( int x = 0; x < 4; x++ )
369 p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
370 p_dst += FDEC_STRIDE;
374 static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
376 add4x4_idct( &p_dst[0], dct[0] );
377 add4x4_idct( &p_dst[4], dct[1] );
378 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
379 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
382 static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
384 add8x8_idct( &p_dst[0], &dct[0] );
385 add8x8_idct( &p_dst[8], &dct[4] );
386 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
387 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/
/* One 1-D pass of the 8x8 forward transform over SRC(0..7) into DST(0..7).
 * NOTE(review): the #define header, the even-half terms (a0..a3) and the
 * DST(0)/DST(4) outputs were truncated in this extraction and have been
 * reconstructed from the standard H.264 8x8 forward transform — verify
 * against upstream. */
#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
421 static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
425 pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
427 #define SRC(x) tmp[x*8+i]
428 #define DST(x) tmp[x*8+i]
429 for( int i = 0; i < 8; i++ )
434 #define SRC(x) tmp[i*8+x]
435 #define DST(x) dct[x*8+i]
436 for( int i = 0; i < 8; i++ )
442 static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
444 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
445 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
446 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
447 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* One 1-D pass of the 8x8 inverse transform over SRC(0..7) into DST(0..7).
 * NOTE(review): the #define header, the even-half combinations (b0/b2/b4/b6)
 * and the DST output lines were truncated in this extraction and have been
 * reconstructed from the standard H.264 8x8 inverse transform — verify
 * against upstream. */
#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
477 static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
479 dct[0] += 32; // rounding for the >>6 at the end
481 #define SRC(x) dct[x*8+i]
482 #define DST(x,rhs) dct[x*8+i] = (rhs)
483 for( int i = 0; i < 8; i++ )
488 #define SRC(x) dct[i*8+x]
489 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
490 for( int i = 0; i < 8; i++ )
496 static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
498 add8x8_idct8( &dst[0], dct[0] );
499 add8x8_idct8( &dst[8], dct[1] );
500 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
501 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
504 static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
507 for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
509 p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
510 p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
511 p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
512 p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
516 static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
518 add4x4_idct_dc( &p_dst[0], dct[0] );
519 add4x4_idct_dc( &p_dst[4], dct[1] );
520 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
521 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
524 static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
526 for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
528 add4x4_idct_dc( &p_dst[ 0], dct[0] );
529 add4x4_idct_dc( &p_dst[ 4], dct[1] );
530 add4x4_idct_dc( &p_dst[ 8], dct[2] );
531 add4x4_idct_dc( &p_dst[12], dct[3] );
/****************************************************************************
 * x264_dct_init:
 ****************************************************************************/
539 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
541 dctf->sub4x4_dct = sub4x4_dct;
542 dctf->add4x4_idct = add4x4_idct;
544 dctf->sub8x8_dct = sub8x8_dct;
545 dctf->sub8x8_dct_dc = sub8x8_dct_dc;
546 dctf->add8x8_idct = add8x8_idct;
547 dctf->add8x8_idct_dc = add8x8_idct_dc;
549 dctf->sub8x16_dct_dc = sub8x16_dct_dc;
551 dctf->sub16x16_dct = sub16x16_dct;
552 dctf->add16x16_idct = add16x16_idct;
553 dctf->add16x16_idct_dc = add16x16_idct_dc;
555 dctf->sub8x8_dct8 = sub8x8_dct8;
556 dctf->add8x8_idct8 = add8x8_idct8;
558 dctf->sub16x16_dct8 = sub16x16_dct8;
559 dctf->add16x16_idct8 = add16x16_idct8;
561 dctf->dct4x4dc = dct4x4dc;
562 dctf->idct4x4dc = idct4x4dc;
564 dctf->dct2x4dc = dct2x4dc;
568 if( cpu&X264_CPU_MMX )
570 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
571 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
572 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
574 if( cpu&X264_CPU_SSE2 )
576 dctf->add4x4_idct = x264_add4x4_idct_sse2;
577 dctf->dct4x4dc = x264_dct4x4dc_sse2;
578 dctf->idct4x4dc = x264_idct4x4dc_sse2;
579 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
580 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
581 dctf->add8x8_idct = x264_add8x8_idct_sse2;
582 dctf->add16x16_idct = x264_add16x16_idct_sse2;
583 dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
584 dctf->add16x16_idct8 = x264_add16x16_idct8_sse2;
585 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
586 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
587 dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_sse2;
588 dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
590 if( cpu&X264_CPU_SSE4 )
592 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse4;
593 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse4;
595 if( cpu&X264_CPU_AVX )
597 dctf->add4x4_idct = x264_add4x4_idct_avx;
598 dctf->dct4x4dc = x264_dct4x4dc_avx;
599 dctf->idct4x4dc = x264_idct4x4dc_avx;
600 dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
601 dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
602 dctf->add8x8_idct = x264_add8x8_idct_avx;
603 dctf->add16x16_idct = x264_add16x16_idct_avx;
604 dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
605 dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
606 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
607 dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_avx;
608 dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
611 #else // !HIGH_BIT_DEPTH
613 if( cpu&X264_CPU_MMX )
615 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
616 dctf->add4x4_idct = x264_add4x4_idct_mmx;
617 dctf->idct4x4dc = x264_idct4x4dc_mmx;
618 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
621 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
622 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
623 dctf->add8x8_idct = x264_add8x8_idct_mmx;
624 dctf->add16x16_idct = x264_add16x16_idct_mmx;
626 dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
627 dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
628 dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
629 dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
633 if( cpu&X264_CPU_MMX2 )
635 dctf->dct4x4dc = x264_dct4x4dc_mmx2;
636 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx2;
637 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
640 if( cpu&X264_CPU_SSE2 )
642 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
643 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
644 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
645 dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
646 dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
647 dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
649 if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
651 dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
652 dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
653 dctf->add8x8_idct = x264_add8x8_idct_sse2;
654 dctf->add16x16_idct = x264_add16x16_idct_sse2;
655 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
659 if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
661 dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
662 if( !(cpu&X264_CPU_SLOW_ATOM) )
664 dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
665 dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
666 dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
667 dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
668 dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
669 if( !(cpu&X264_CPU_SLOW_PSHUFB) )
671 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
672 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
677 if( cpu&X264_CPU_SSE4 )
678 dctf->add4x4_idct = x264_add4x4_idct_sse4;
680 if( cpu&X264_CPU_AVX )
682 dctf->add4x4_idct = x264_add4x4_idct_avx;
683 dctf->add8x8_idct = x264_add8x8_idct_avx;
684 dctf->add16x16_idct = x264_add16x16_idct_avx;
685 dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
686 dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
687 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
688 dctf->sub8x8_dct = x264_sub8x8_dct_avx;
689 dctf->sub16x16_dct = x264_sub16x16_dct_avx;
690 dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
691 dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
694 if( cpu&X264_CPU_XOP )
696 dctf->sub8x8_dct = x264_sub8x8_dct_xop;
697 dctf->sub16x16_dct = x264_sub16x16_dct_xop;
700 if( cpu&X264_CPU_AVX2 )
702 dctf->add8x8_idct = x264_add8x8_idct_avx2;
703 dctf->add16x16_idct = x264_add16x16_idct_avx2;
704 dctf->sub8x8_dct = x264_sub8x8_dct_avx2;
705 dctf->sub16x16_dct = x264_sub16x16_dct_avx2;
706 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
708 dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx2;
714 if( cpu&X264_CPU_ALTIVEC )
716 dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
717 dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
718 dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
720 dctf->add4x4_idct = x264_add4x4_idct_altivec;
721 dctf->add8x8_idct = x264_add8x8_idct_altivec;
722 dctf->add16x16_idct = x264_add16x16_idct_altivec;
724 dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
725 dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
727 dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
728 dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
732 #if HAVE_ARMV6 || ARCH_AARCH64
733 if( cpu&X264_CPU_NEON )
735 dctf->sub4x4_dct = x264_sub4x4_dct_neon;
736 dctf->sub8x8_dct = x264_sub8x8_dct_neon;
737 dctf->sub16x16_dct = x264_sub16x16_dct_neon;
738 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
739 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
740 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
741 dctf->dct4x4dc = x264_dct4x4dc_neon;
742 dctf->idct4x4dc = x264_idct4x4dc_neon;
744 dctf->add4x4_idct = x264_add4x4_idct_neon;
745 dctf->add8x8_idct = x264_add8x8_idct_neon;
746 dctf->add16x16_idct = x264_add16x16_idct_neon;
748 dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
749 dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
751 dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
752 dctf->add16x16_idct8= x264_add16x16_idct8_neon;
753 dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
758 if( cpu&X264_CPU_MSA )
760 dctf->sub4x4_dct = x264_sub4x4_dct_msa;
761 dctf->sub8x8_dct = x264_sub8x8_dct_msa;
762 dctf->sub16x16_dct = x264_sub16x16_dct_msa;
763 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_msa;
764 dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_msa;
765 dctf->dct4x4dc = x264_dct4x4dc_msa;
766 dctf->idct4x4dc = x264_idct4x4dc_msa;
767 dctf->add4x4_idct = x264_add4x4_idct_msa;
768 dctf->add8x8_idct = x264_add8x8_idct_msa;
769 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_msa;
770 dctf->add16x16_idct = x264_add16x16_idct_msa;
771 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa;
772 dctf->add8x8_idct8 = x264_add8x8_idct8_msa;
773 dctf->add16x16_idct8 = x264_add16x16_idct8_msa;
777 #endif // HIGH_BIT_DEPTH
/* ZIG(i,y,x): place coefficient (row y, column x) of a column-major 8x8
 * block (dct[x*8+y]) at zigzag position i.  The final macro line previously
 * ended with a stray '\' continuation, which would splice the following
 * source line into the macro — removed. */
#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
/* Field (interlaced) scan order for 8x8 blocks. */
#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
/* Progressive (frame) scan order for 4x4 blocks; position 0 uses ZIGDC so
 * the DC coefficient can be handled specially by some users. */
#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
/* Field (interlaced) scan order for 4x4 blocks. */
#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
830 static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
835 static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
841 #define ZIG(i,y,x) level[i] = dct[x*4+y];
842 #define ZIGDC(i,y,x) ZIG(i,y,x)
844 static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
849 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
851 memcpy( level, dct, 2 * sizeof(dctcoef) );
852 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
853 memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
/* Subtracting variant of ZIG: computes the residual pixel directly from the
 * fenc/fdec planes while scanning, and accumulates nonzero status into nz.
 * NOTE(review): the nz accumulation and closing brace were truncated in this
 * extraction and have been reconstructed — the zigzag_sub_* functions below
 * rely on nz being updated here. */
#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
/* Copy a 4x4 pixel block from the fenc plane into the fdec plane.
 * NOTE(review): the '#define COPY4x4\' header line was truncated in this
 * extraction and has been restored. */
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
/* Copy an 8x8 pixel block from the fenc plane into the fdec plane.
 * NOTE(review): the '#define COPY8x8\' header line was truncated in this
 * extraction and has been restored. */
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
879 static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
887 static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
/* AC-only variant of ZIGDC: route the DC residual out through *dc and leave
 * level[0] zero so the scanned block carries only AC coefficients.
 * NOTE(review): the tail of this macro was truncated in this extraction and
 * has been reconstructed — verify against upstream. */
#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}
903 static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
911 static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
919 static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
926 static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
937 static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
939 for( int i = 0; i < 4; i++ )
942 for( int j = 0; j < 16; j++ )
945 dst[i*16+j] = src[i+j*4];
947 nnz[(i&1) + (i>>1)*8] = !!nz;
951 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
953 pf_interlaced->scan_8x8 = zigzag_scan_8x8_field;
954 pf_progressive->scan_8x8 = zigzag_scan_8x8_frame;
955 pf_interlaced->scan_4x4 = zigzag_scan_4x4_field;
956 pf_progressive->scan_4x4 = zigzag_scan_4x4_frame;
957 pf_interlaced->sub_8x8 = zigzag_sub_8x8_field;
958 pf_progressive->sub_8x8 = zigzag_sub_8x8_frame;
959 pf_interlaced->sub_4x4 = zigzag_sub_4x4_field;
960 pf_progressive->sub_4x4 = zigzag_sub_4x4_frame;
961 pf_interlaced->sub_4x4ac = zigzag_sub_4x4ac_field;
962 pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;
966 if( cpu&X264_CPU_SSE2 )
968 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
969 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
970 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
972 if( cpu&X264_CPU_SSE4 )
973 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
974 if( cpu&X264_CPU_AVX )
975 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
977 if( cpu&X264_CPU_AVX )
979 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
980 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
982 #endif // ARCH_X86_64
986 if( cpu&X264_CPU_MMX )
987 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
988 if( cpu&X264_CPU_MMX2 )
990 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmx2;
991 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2;
992 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
994 if( cpu&X264_CPU_SSE2_IS_FAST )
995 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
996 if( cpu&X264_CPU_SSSE3 )
998 pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
999 pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
1000 pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
1001 pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
1002 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
1003 if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
1004 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
1006 if( cpu&X264_CPU_AVX )
1008 pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_avx;
1009 pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_avx;
1011 pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
1012 pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
1014 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
1016 if( cpu&X264_CPU_XOP )
1018 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
1019 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
1020 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop;
1024 if( cpu&X264_CPU_ALTIVEC )
1026 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
1027 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
1030 #if HAVE_ARMV6 || ARCH_AARCH64
1031 if( cpu&X264_CPU_NEON )
1033 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
1035 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_neon;
1036 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_neon;
1037 pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_neon;
1038 pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_neon;
1039 pf_interlaced->sub_8x8 = x264_zigzag_sub_8x8_field_neon;
1040 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_neon;
1041 pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_neon;
1042 pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon;
1043 pf_progressive->sub_8x8 = x264_zigzag_sub_8x8_frame_neon;
1044 #endif // ARCH_AARCH64
1046 #endif // HAVE_ARMV6 || ARCH_AARCH64
1047 #endif // HIGH_BIT_DEPTH
1049 pf_interlaced->interleave_8x8_cavlc =
1050 pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
1053 if( cpu&X264_CPU_SSE2 )
1055 pf_interlaced->interleave_8x8_cavlc =
1056 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
1058 if( cpu&X264_CPU_AVX )
1060 pf_interlaced->interleave_8x8_cavlc =
1061 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
1064 if( cpu&X264_CPU_MMX )
1066 pf_interlaced->interleave_8x8_cavlc =
1067 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
1069 if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
1071 pf_interlaced->interleave_8x8_cavlc =
1072 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
1075 if( cpu&X264_CPU_AVX )
1077 pf_interlaced->interleave_8x8_cavlc =
1078 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
1081 if( cpu&X264_CPU_AVX2 )
1083 pf_interlaced->interleave_8x8_cavlc =
1084 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
1086 #endif // HIGH_BIT_DEPTH
1090 if( cpu&X264_CPU_NEON )
1092 pf_interlaced->interleave_8x8_cavlc =
1093 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
1095 #endif // ARCH_AARCH64
1096 #endif // !HIGH_BIT_DEPTH
1099 if( cpu&X264_CPU_MSA )
1101 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa;