1 /*****************************************************************************
2 * dct.c: transform and zigzag
3 *****************************************************************************
4 * Copyright (C) 2003-2015 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
8 * Henrik Gramner <henrik@gramner.com>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
24 * This program is also available under a commercial proprietary license.
25 * For more information, contact us at licensing@x264.com.
26 *****************************************************************************/
39 # include "aarch64/dct.h"
42 # include "mips/dct.h"
45 /* the inverse of the scaling factors introduced by 8x8 fdct */
46 /* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
47 #define W(i) (i==0 ? FIX8(1.0000) :\
48 i==1 ? FIX8(0.8859) :\
49 i==2 ? FIX8(1.6000) :\
50 i==3 ? FIX8(0.9415) :\
51 i==4 ? FIX8(1.2651) :\
52 i==5 ? FIX8(1.1910) :0)
53 const uint32_t x264_dct8_weight_tab[64] = {
54 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
55 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
56 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
57 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
59 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
60 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
61 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
62 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
66 #define W(i) (i==0 ? FIX8(1.76777) :\
67 i==1 ? FIX8(1.11803) :\
68 i==2 ? FIX8(0.70711) :0)
69 const uint32_t x264_dct4_weight_tab[16] = {
70 W(0), W(1), W(0), W(1),
71 W(1), W(2), W(1), W(2),
72 W(0), W(1), W(0), W(1),
73 W(1), W(2), W(1), W(2)
78 #define W(i) (i==0 ? FIX8(3.125) :\
81 const uint32_t x264_dct4_weight2_tab[16] = {
82 W(0), W(1), W(0), W(1),
83 W(1), W(2), W(1), W(2),
84 W(0), W(1), W(0), W(1),
85 W(1), W(2), W(1), W(2)
89 #define W(i) (i==0 ? FIX8(1.00000) :\
90 i==1 ? FIX8(0.78487) :\
91 i==2 ? FIX8(2.56132) :\
92 i==3 ? FIX8(0.88637) :\
93 i==4 ? FIX8(1.60040) :\
94 i==5 ? FIX8(1.41850) :0)
95 const uint32_t x264_dct8_weight2_tab[64] = {
96 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
97 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
98 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
99 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
101 W(0), W(3), W(4), W(3), W(0), W(3), W(4), W(3),
102 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1),
103 W(4), W(5), W(2), W(5), W(4), W(5), W(2), W(5),
104 W(3), W(1), W(5), W(1), W(3), W(1), W(5), W(1)
109 static void dct4x4dc( dctcoef d[16] )
113 for( int i = 0; i < 4; i++ )
115 int s01 = d[i*4+0] + d[i*4+1];
116 int d01 = d[i*4+0] - d[i*4+1];
117 int s23 = d[i*4+2] + d[i*4+3];
118 int d23 = d[i*4+2] - d[i*4+3];
120 tmp[0*4+i] = s01 + s23;
121 tmp[1*4+i] = s01 - s23;
122 tmp[2*4+i] = d01 - d23;
123 tmp[3*4+i] = d01 + d23;
126 for( int i = 0; i < 4; i++ )
128 int s01 = tmp[i*4+0] + tmp[i*4+1];
129 int d01 = tmp[i*4+0] - tmp[i*4+1];
130 int s23 = tmp[i*4+2] + tmp[i*4+3];
131 int d23 = tmp[i*4+2] - tmp[i*4+3];
133 d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
134 d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
135 d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
136 d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
140 static void idct4x4dc( dctcoef d[16] )
144 for( int i = 0; i < 4; i++ )
146 int s01 = d[i*4+0] + d[i*4+1];
147 int d01 = d[i*4+0] - d[i*4+1];
148 int s23 = d[i*4+2] + d[i*4+3];
149 int d23 = d[i*4+2] - d[i*4+3];
151 tmp[0*4+i] = s01 + s23;
152 tmp[1*4+i] = s01 - s23;
153 tmp[2*4+i] = d01 - d23;
154 tmp[3*4+i] = d01 + d23;
157 for( int i = 0; i < 4; i++ )
159 int s01 = tmp[i*4+0] + tmp[i*4+1];
160 int d01 = tmp[i*4+0] - tmp[i*4+1];
161 int s23 = tmp[i*4+2] + tmp[i*4+3];
162 int d23 = tmp[i*4+2] - tmp[i*4+3];
164 d[i*4+0] = s01 + s23;
165 d[i*4+1] = s01 - s23;
166 d[i*4+2] = d01 - d23;
167 d[i*4+3] = d01 + d23;
/* 2x4 transform of the DC coefficients of eight 4x4 blocks (dct4x4[i][0]),
 * writing the result into dct[8].
 * NOTE(review): only the first butterfly stage (a0..a7) is present in this
 * copy — the output stage writing dct[0..7] and the clearing of the
 * dct4x4[i][0] inputs appear to have been lost; restore from upstream
 * before relying on this function. */
static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
    /* sums of adjacent DC pairs */
    int a0 = dct4x4[0][0] + dct4x4[1][0];
    int a1 = dct4x4[2][0] + dct4x4[3][0];
    int a2 = dct4x4[4][0] + dct4x4[5][0];
    int a3 = dct4x4[6][0] + dct4x4[7][0];
    /* differences of the same pairs */
    int a4 = dct4x4[0][0] - dct4x4[1][0];
    int a5 = dct4x4[2][0] - dct4x4[3][0];
    int a6 = dct4x4[4][0] - dct4x4[5][0];
    int a7 = dct4x4[6][0] - dct4x4[7][0];
207 static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
208 pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
210 for( int y = 0; y < i_size; y++ )
212 for( int x = 0; x < i_size; x++ )
213 diff[x + y*i_size] = pix1[x] - pix2[x];
219 static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
224 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
226 for( int i = 0; i < 4; i++ )
228 int s03 = d[i*4+0] + d[i*4+3];
229 int s12 = d[i*4+1] + d[i*4+2];
230 int d03 = d[i*4+0] - d[i*4+3];
231 int d12 = d[i*4+1] - d[i*4+2];
233 tmp[0*4+i] = s03 + s12;
234 tmp[1*4+i] = 2*d03 + d12;
235 tmp[2*4+i] = s03 - s12;
236 tmp[3*4+i] = d03 - 2*d12;
239 for( int i = 0; i < 4; i++ )
241 int s03 = tmp[i*4+0] + tmp[i*4+3];
242 int s12 = tmp[i*4+1] + tmp[i*4+2];
243 int d03 = tmp[i*4+0] - tmp[i*4+3];
244 int d12 = tmp[i*4+1] - tmp[i*4+2];
246 dct[i*4+0] = s03 + s12;
247 dct[i*4+1] = 2*d03 + d12;
248 dct[i*4+2] = s03 - s12;
249 dct[i*4+3] = d03 - 2*d12;
253 static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
255 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
256 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
257 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
258 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
261 static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
263 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
264 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
265 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
266 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
269 static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
272 for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
273 sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
274 - pix2[0] - pix2[1] - pix2[2] - pix2[3];
278 static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
280 dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
281 dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
282 dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
283 dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
285 /* 2x2 DC transform */
286 int d0 = dct[0] + dct[1];
287 int d1 = dct[2] + dct[3];
288 int d2 = dct[0] - dct[1];
289 int d3 = dct[2] - dct[3];
/* DC-only 8x16 transform (4:2:2 chroma): residual DC of each of the eight
 * 4x4 blocks, then a 2x4 transform over the eight DC values.
 * NOTE(review): the entire "2x4 DC transform" body (the writes to dct[0..7])
 * is missing from this copy — only the per-block DC gathering remains;
 * restore from upstream before relying on this function. */
static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
    /* DC of each 4x4 block, in column-major order over the 2x4 grid */
    int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
    int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
    int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
    int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
    int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
    int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
    int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
    int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );

    /* 2x4 DC transform */
334 static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
339 for( int i = 0; i < 4; i++ )
341 int s02 = dct[0*4+i] + dct[2*4+i];
342 int d02 = dct[0*4+i] - dct[2*4+i];
343 int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
344 int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
346 tmp[i*4+0] = s02 + s13;
347 tmp[i*4+1] = d02 + d13;
348 tmp[i*4+2] = d02 - d13;
349 tmp[i*4+3] = s02 - s13;
352 for( int i = 0; i < 4; i++ )
354 int s02 = tmp[0*4+i] + tmp[2*4+i];
355 int d02 = tmp[0*4+i] - tmp[2*4+i];
356 int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
357 int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
359 d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
360 d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
361 d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
362 d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
366 for( int y = 0; y < 4; y++ )
368 for( int x = 0; x < 4; x++ )
369 p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
370 p_dst += FDEC_STRIDE;
374 static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
376 add4x4_idct( &p_dst[0], dct[0] );
377 add4x4_idct( &p_dst[4], dct[1] );
378 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
379 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
382 static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
384 add8x8_idct( &p_dst[0], &dct[0] );
385 add8x8_idct( &p_dst[8], &dct[4] );
386 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
387 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/
/* One 8-point forward transform pass (H.264 8x8 integer DCT); SRC/DST are
 * defined by the caller to select row vs. column access.
 * NOTE(review): the macro header, the even-part a0..a3 lines and the
 * DST(0)/DST(4) outputs were missing in this copy and have been
 * reconstructed from the standard even/odd butterfly structure. */
#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
421 static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
425 pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
427 #define SRC(x) tmp[x*8+i]
428 #define DST(x) tmp[x*8+i]
429 for( int i = 0; i < 8; i++ )
434 #define SRC(x) tmp[i*8+x]
435 #define DST(x) dct[x*8+i]
436 for( int i = 0; i < 8; i++ )
442 static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
444 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
445 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
446 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
447 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* One 8-point inverse transform pass (H.264 8x8 integer IDCT); SRC/DST
 * are defined by the caller — DST takes (index, value) so the final pass
 * can clip and store in one expression.
 * NOTE(review): the macro header, the even-part b0/b2/b4/b6 lines and all
 * DST outputs were missing in this copy and have been reconstructed from
 * the standard even/odd recombination. */
#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
477 static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
479 dct[0] += 32; // rounding for the >>6 at the end
481 #define SRC(x) dct[x*8+i]
482 #define DST(x,rhs) dct[x*8+i] = (rhs)
483 for( int i = 0; i < 8; i++ )
488 #define SRC(x) dct[i*8+x]
489 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
490 for( int i = 0; i < 8; i++ )
496 static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
498 add8x8_idct8( &dst[0], dct[0] );
499 add8x8_idct8( &dst[8], dct[1] );
500 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
501 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
504 static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
507 for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
509 p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
510 p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
511 p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
512 p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
516 static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
518 add4x4_idct_dc( &p_dst[0], dct[0] );
519 add4x4_idct_dc( &p_dst[4], dct[1] );
520 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
521 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
524 static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
526 for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
528 add4x4_idct_dc( &p_dst[ 0], dct[0] );
529 add4x4_idct_dc( &p_dst[ 4], dct[1] );
530 add4x4_idct_dc( &p_dst[ 8], dct[2] );
531 add4x4_idct_dc( &p_dst[12], dct[3] );
/****************************************************************************
 * x264_dct_init:
 ****************************************************************************/
/* Fill the dct function table with the portable C implementations, then
 * override entries with the fastest asm version the given cpu flags allow.
 * NOTE(review): several brace and preprocessor-guard lines (e.g. the
 * matching "#if HIGH_BIT_DEPTH" / "#if HAVE_MMX" / "#endif"s implied by the
 * "#else // !HIGH_BIT_DEPTH" below) appear to have been lost in this copy;
 * the statement sequence itself is preserved unchanged. */
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
    /* portable C defaults */
    dctf->sub4x4_dct    = sub4x4_dct;
    dctf->add4x4_idct   = add4x4_idct;
    dctf->sub8x8_dct    = sub8x8_dct;
    dctf->sub8x8_dct_dc = sub8x8_dct_dc;
    dctf->add8x8_idct   = add8x8_idct;
    dctf->add8x8_idct_dc = add8x8_idct_dc;
    dctf->sub8x16_dct_dc = sub8x16_dct_dc;
    dctf->sub16x16_dct    = sub16x16_dct;
    dctf->add16x16_idct   = add16x16_idct;
    dctf->add16x16_idct_dc = add16x16_idct_dc;
    dctf->sub8x8_dct8   = sub8x8_dct8;
    dctf->add8x8_idct8  = add8x8_idct8;
    dctf->sub16x16_dct8  = sub16x16_dct8;
    dctf->add16x16_idct8 = add16x16_idct8;
    dctf->dct4x4dc  = dct4x4dc;
    dctf->idct4x4dc = idct4x4dc;
    dctf->dct2x4dc = dct2x4dc;
    /* x86 dispatch, high-bit-depth branch (see the #else below) */
    if( cpu&X264_CPU_MMX )
        dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
    if( cpu&X264_CPU_SSE2 )
        dctf->add4x4_idct     = x264_add4x4_idct_sse2;
        dctf->dct4x4dc        = x264_dct4x4dc_sse2;
        dctf->idct4x4dc       = x264_idct4x4dc_sse2;
        dctf->sub8x8_dct8     = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8   = x264_sub16x16_dct8_sse2;
        dctf->add8x8_idct     = x264_add8x8_idct_sse2;
        dctf->add16x16_idct   = x264_add16x16_idct_sse2;
        dctf->add8x8_idct8    = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8  = x264_add16x16_idct8_sse2;
        dctf->sub8x8_dct_dc   = x264_sub8x8_dct_dc_sse2;
        dctf->add8x8_idct_dc  = x264_add8x8_idct_dc_sse2;
        dctf->sub8x16_dct_dc  = x264_sub8x16_dct_dc_sse2;
        dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
    if( cpu&X264_CPU_SSE4 )
        dctf->sub8x8_dct8     = x264_sub8x8_dct8_sse4;
        dctf->sub16x16_dct8   = x264_sub16x16_dct8_sse4;
    if( cpu&X264_CPU_AVX )
        dctf->add4x4_idct     = x264_add4x4_idct_avx;
        dctf->dct4x4dc        = x264_dct4x4dc_avx;
        dctf->idct4x4dc       = x264_idct4x4dc_avx;
        dctf->sub8x8_dct8     = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8   = x264_sub16x16_dct8_avx;
        dctf->add8x8_idct     = x264_add8x8_idct_avx;
        dctf->add16x16_idct   = x264_add16x16_idct_avx;
        dctf->add8x8_idct8    = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8  = x264_add16x16_idct8_avx;
        dctf->add8x8_idct_dc  = x264_add8x8_idct_dc_avx;
        dctf->sub8x16_dct_dc  = x264_sub8x16_dct_dc_avx;
        dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
#else // !HIGH_BIT_DEPTH
    /* x86 dispatch, 8-bit branch */
    if( cpu&X264_CPU_MMX )
        dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
        dctf->add4x4_idct   = x264_add4x4_idct_mmx;
        dctf->idct4x4dc     = x264_idct4x4dc_mmx;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmx2;
        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
        dctf->add16x16_idct = x264_add16x16_idct_mmx;
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
        dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
        dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
    if( cpu&X264_CPU_MMX2 )
        dctf->dct4x4dc         = x264_dct4x4dc_mmx2;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_mmx2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx2;
    if( cpu&X264_CPU_SSE2 )
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
        dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_sse2;
        dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
        /* on chips where sse2 is slower than mmx for these, keep mmx */
        if( !(cpu&X264_CPU_SSE2_IS_SLOW) )
            dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
            dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
            dctf->add8x8_idct   = x264_add8x8_idct_sse2;
            dctf->add16x16_idct = x264_add16x16_idct_sse2;
            dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
        dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_ssse3;
        if( !(cpu&X264_CPU_SLOW_ATOM) )
            dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
            dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
            dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
            dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
            dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
            if( !(cpu&X264_CPU_SLOW_PSHUFB) )
                dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
                dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
    if( cpu&X264_CPU_SSE4 )
        dctf->add4x4_idct   = x264_add4x4_idct_sse4;
    if( cpu&X264_CPU_AVX )
        dctf->add4x4_idct      = x264_add4x4_idct_avx;
        dctf->add8x8_idct      = x264_add8x8_idct_avx;
        dctf->add16x16_idct    = x264_add16x16_idct_avx;
        dctf->add8x8_idct8     = x264_add8x8_idct8_avx;
        dctf->add16x16_idct8   = x264_add16x16_idct8_avx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
        dctf->sub8x8_dct       = x264_sub8x8_dct_avx;
        dctf->sub16x16_dct     = x264_sub16x16_dct_avx;
        dctf->sub8x8_dct8      = x264_sub8x8_dct8_avx;
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx;
    if( cpu&X264_CPU_XOP )
        dctf->sub8x8_dct       = x264_sub8x8_dct_xop;
        dctf->sub16x16_dct     = x264_sub16x16_dct_xop;
    if( cpu&X264_CPU_AVX2 )
        dctf->add8x8_idct      = x264_add8x8_idct_avx2;
        dctf->add16x16_idct    = x264_add16x16_idct_avx2;
        dctf->sub8x8_dct       = x264_sub8x8_dct_avx2;
        dctf->sub16x16_dct     = x264_sub16x16_dct_avx2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx2;
        dctf->sub16x16_dct8    = x264_sub16x16_dct8_avx2;
    /* PowerPC dispatch */
    if( cpu&X264_CPU_ALTIVEC )
        dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
        dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
        dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
        dctf->add4x4_idct   = x264_add4x4_idct_altivec;
        dctf->add8x8_idct   = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
        dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
        dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
/* ARM dispatch */
#if HAVE_ARMV6 || ARCH_AARCH64
    if( cpu&X264_CPU_NEON )
        dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
        dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
        dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
        dctf->dct4x4dc      = x264_dct4x4dc_neon;
        dctf->idct4x4dc     = x264_idct4x4dc_neon;
        dctf->add4x4_idct   = x264_add4x4_idct_neon;
        dctf->add8x8_idct   = x264_add8x8_idct_neon;
        dctf->add16x16_idct = x264_add16x16_idct_neon;
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
        dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
        dctf->add16x16_idct8= x264_add16x16_idct8_neon;
        dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
    /* MIPS dispatch */
    if( cpu&X264_CPU_MSA )
        dctf->sub4x4_dct       = x264_sub4x4_dct_msa;
        dctf->sub8x8_dct       = x264_sub8x8_dct_msa;
        dctf->sub16x16_dct     = x264_sub16x16_dct_msa;
        dctf->sub8x8_dct_dc    = x264_sub8x8_dct_dc_msa;
        dctf->sub8x16_dct_dc   = x264_sub8x16_dct_dc_msa;
        dctf->dct4x4dc         = x264_dct4x4dc_msa;
        dctf->idct4x4dc        = x264_idct4x4dc_msa;
        dctf->add4x4_idct      = x264_add4x4_idct_msa;
        dctf->add8x8_idct      = x264_add8x8_idct_msa;
        dctf->add8x8_idct_dc   = x264_add8x8_idct_dc_msa;
        dctf->add16x16_idct    = x264_add16x16_idct_msa;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa;
        dctf->add8x8_idct8     = x264_add8x8_idct8_msa;
        dctf->add16x16_idct8   = x264_add16x16_idct8_msa;
#endif // HIGH_BIT_DEPTH
/* ZIG(i,y,x): store dct coefficient at (row y, col x) into scan slot i. */
#define ZIG(i,y,x) level[i] = dct[x*8+y];
/* 8x8 zigzag scan order for progressive (frame) macroblocks.
 * NOTE: no trailing backslash on the last line — the original's trailing
 * continuation would splice the next #define into this macro now that the
 * separating blank line is gone. */
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
/* 8x8 zigzag scan order for interlaced (field) macroblocks. */
#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
/* 4x4 zigzag scan for progressive blocks; slot 0 goes through ZIGDC so the
 * DC coefficient can be handled specially by some expansions. */
#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
/* 4x4 zigzag scan for interlaced (field) blocks. */
#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
832 static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
837 static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
/* Switch ZIG to 4x4 indexing for the 4x4 scans; ZIGDC defaults to a plain
 * copy. The #undef avoids redefining the 8x8 version in place. */
#undef ZIG
#define ZIG(i,y,x) level[i] = dct[x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)
846 static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
851 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
853 memcpy( level, dct, 2 * sizeof(dctcoef) );
854 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
855 memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
/* Macros for the combined subtract+zigzag+copy functions below:
 * ZIG computes the residual p_src - p_dst for coordinate (y,x), stores it
 * at scan slot i and accumulates it into nz for the coded-block flag.
 * COPY4x4/COPY8x8 then propagate the encoded pixels into the
 * reconstruction buffer.
 * NOTE(review): the #undef, the nz accumulation, the macro-closing brace
 * and the CPPIXEL_X4/COPY4x4/COPY8x8 headers were missing in this copy and
 * have been restored; CPPIXEL_X4's definition in terms of MPIXEL_X4
 * follows upstream — confirm against the project headers. */
#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
#define CPPIXEL_X4(dst,src) ( MPIXEL_X4(dst) = MPIXEL_X4(src) )
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
881 static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
889 static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
/* For the AC-only variants: route the DC residual into *dc and leave scan
 * slot 0 zeroed so only AC coefficients remain in level[].
 * NOTE(review): the #undef and the macro tail (level[0] = 0 and the
 * closing brace) were missing in this copy and have been restored. */
#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}
905 static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
913 static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
921 static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
928 static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
939 static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
941 for( int i = 0; i < 4; i++ )
944 for( int j = 0; j < 16; j++ )
947 dst[i*16+j] = src[i+j*4];
949 nnz[(i&1) + (i>>1)*8] = !!nz;
953 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zigzag_function_t *pf_interlaced )
955 pf_interlaced->scan_8x8 = zigzag_scan_8x8_field;
956 pf_progressive->scan_8x8 = zigzag_scan_8x8_frame;
957 pf_interlaced->scan_4x4 = zigzag_scan_4x4_field;
958 pf_progressive->scan_4x4 = zigzag_scan_4x4_frame;
959 pf_interlaced->sub_8x8 = zigzag_sub_8x8_field;
960 pf_progressive->sub_8x8 = zigzag_sub_8x8_frame;
961 pf_interlaced->sub_4x4 = zigzag_sub_4x4_field;
962 pf_progressive->sub_4x4 = zigzag_sub_4x4_frame;
963 pf_interlaced->sub_4x4ac = zigzag_sub_4x4ac_field;
964 pf_progressive->sub_4x4ac = zigzag_sub_4x4ac_frame;
968 if( cpu&X264_CPU_SSE2 )
970 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
971 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
972 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
974 if( cpu&X264_CPU_SSE4 )
975 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
976 if( cpu&X264_CPU_AVX )
977 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
979 if( cpu&X264_CPU_AVX )
981 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
982 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
984 #endif // ARCH_X86_64
988 if( cpu&X264_CPU_MMX )
989 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
990 if( cpu&X264_CPU_MMX2 )
992 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_mmx2;
993 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_mmx2;
994 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_mmx2;
996 if( cpu&X264_CPU_SSE2_IS_FAST )
997 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
998 if( cpu&X264_CPU_SSSE3 )
1000 pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
1001 pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
1002 pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_ssse3;
1003 pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
1004 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
1005 if( !(cpu&X264_CPU_SLOW_SHUFFLE) )
1006 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
1008 if( cpu&X264_CPU_AVX )
1010 pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_avx;
1011 pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_avx;
1013 pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_avx;
1014 pf_progressive->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
1016 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
1018 if( cpu&X264_CPU_XOP )
1020 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_xop;
1021 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_xop;
1022 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_xop;
1026 if( cpu&X264_CPU_ALTIVEC )
1028 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
1029 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
1032 #if HAVE_ARMV6 || ARCH_AARCH64
1033 if( cpu&X264_CPU_NEON )
1035 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
1037 pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_neon;
1038 pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_neon;
1039 pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_neon;
1040 pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_neon;
1041 pf_interlaced->sub_8x8 = x264_zigzag_sub_8x8_field_neon;
1042 pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_neon;
1043 pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_neon;
1044 pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon;
1045 pf_progressive->sub_8x8 = x264_zigzag_sub_8x8_frame_neon;
1046 #endif // ARCH_AARCH64
1048 #endif // HAVE_ARMV6 || ARCH_AARCH64
1049 #endif // HIGH_BIT_DEPTH
1051 pf_interlaced->interleave_8x8_cavlc =
1052 pf_progressive->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
1055 if( cpu&X264_CPU_SSE2 )
1057 pf_interlaced->interleave_8x8_cavlc =
1058 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
1060 if( cpu&X264_CPU_AVX )
1062 pf_interlaced->interleave_8x8_cavlc =
1063 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
1066 if( cpu&X264_CPU_MMX )
1068 pf_interlaced->interleave_8x8_cavlc =
1069 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
1071 if( (cpu&X264_CPU_SSE2) && !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SSE2_IS_SLOW)) )
1073 pf_interlaced->interleave_8x8_cavlc =
1074 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
1077 if( cpu&X264_CPU_AVX )
1079 pf_interlaced->interleave_8x8_cavlc =
1080 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
1083 if( cpu&X264_CPU_AVX2 )
1085 pf_interlaced->interleave_8x8_cavlc =
1086 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx2;
1088 #endif // HIGH_BIT_DEPTH
1092 if( cpu&X264_CPU_NEON )
1094 pf_interlaced->interleave_8x8_cavlc =
1095 pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
1097 #endif // ARCH_AARCH64
1098 #endif // !HIGH_BIT_DEPTH
1101 if( cpu&X264_CPU_MSA )
1103 pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa;