1 /*****************************************************************************
2 * dct.c: transform and zigzag
3 *****************************************************************************
4 * Copyright (C) 2003-2011 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
/* Zigzag-ordered copies of the dct weight tables; filled in once at
 * startup by x264_dct_init_weights() below. */
38 int x264_dct4_weight2_zigzag[2][16];
39 int x264_dct8_weight2_zigzag[2][64];
/* Forward 4x4 Hadamard transform of the 16 luma DC coefficients, in place.
 * Pass 1 applies butterflies along one axis and stores the result
 * transposed; pass 2 repeats the butterflies and rounds with (+1)>>1. */
41 static void dct4x4dc( dctcoef d[16] )
45 for( int i = 0; i < 4; i++ )
47 int s01 = d[i*4+0] + d[i*4+1];
48 int d01 = d[i*4+0] - d[i*4+1];
49 int s23 = d[i*4+2] + d[i*4+3];
50 int d23 = d[i*4+2] - d[i*4+3];
/* stored transposed so pass 2 can iterate the same way */
52 tmp[0*4+i] = s01 + s23;
53 tmp[1*4+i] = s01 - s23;
54 tmp[2*4+i] = d01 - d23;
55 tmp[3*4+i] = d01 + d23;
58 for( int i = 0; i < 4; i++ )
60 int s01 = tmp[i*4+0] + tmp[i*4+1];
61 int d01 = tmp[i*4+0] - tmp[i*4+1];
62 int s23 = tmp[i*4+2] + tmp[i*4+3];
63 int d23 = tmp[i*4+2] - tmp[i*4+3];
/* +1 bias gives round-to-nearest for the final >>1 normalization */
65 d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
66 d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
67 d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
68 d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
/* Inverse 4x4 Hadamard transform of the luma DC coefficients, in place.
 * Same butterfly structure as dct4x4dc, but without the rounding shift
 * in the visible passes. */
72 static void idct4x4dc( dctcoef d[16] )
76 for( int i = 0; i < 4; i++ )
78 int s01 = d[i*4+0] + d[i*4+1];
79 int d01 = d[i*4+0] - d[i*4+1];
80 int s23 = d[i*4+2] + d[i*4+3];
81 int d23 = d[i*4+2] - d[i*4+3];
/* transposed intermediate, as in the forward transform */
83 tmp[0*4+i] = s01 + s23;
84 tmp[1*4+i] = s01 - s23;
85 tmp[2*4+i] = d01 - d23;
86 tmp[3*4+i] = d01 + d23;
89 for( int i = 0; i < 4; i++ )
91 int s01 = tmp[i*4+0] + tmp[i*4+1];
92 int d01 = tmp[i*4+0] - tmp[i*4+1];
93 int s23 = tmp[i*4+2] + tmp[i*4+3];
94 int d23 = tmp[i*4+2] - tmp[i*4+3];
/* diff = pix1 - pix2 over an i_size x i_size square.
 * i_pix1 / i_pix2 are the row strides of the two pixel planes;
 * diff is written densely (stride == i_size). */
103 static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
104 pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
106 for( int y = 0; y < i_size; y++ )
108 for( int x = 0; x < i_size; x++ )
109 diff[x + y*i_size] = pix1[x] - pix2[x];
/* dct = forward 4x4 integer transform of (pix1 - pix2).
 * pix1 is the encode plane (FENC_STRIDE), pix2 the decode/prediction
 * plane (FDEC_STRIDE).  Two 1-D passes with the H.264 {1,2} butterfly
 * weights; the first pass stores its result transposed. */
115 static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
120 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
122 for( int i = 0; i < 4; i++ )
124 int s03 = d[i*4+0] + d[i*4+3];
125 int s12 = d[i*4+1] + d[i*4+2];
126 int d03 = d[i*4+0] - d[i*4+3];
127 int d12 = d[i*4+1] - d[i*4+2];
/* transposed intermediate */
129 tmp[0*4+i] = s03 + s12;
130 tmp[1*4+i] = 2*d03 + d12;
131 tmp[2*4+i] = s03 - s12;
132 tmp[3*4+i] = d03 - 2*d12;
135 for( int i = 0; i < 4; i++ )
137 int s03 = tmp[i*4+0] + tmp[i*4+3];
138 int s12 = tmp[i*4+1] + tmp[i*4+2];
139 int d03 = tmp[i*4+0] - tmp[i*4+3];
140 int d12 = tmp[i*4+1] - tmp[i*4+2];
142 dct[i*4+0] = s03 + s12;
143 dct[i*4+1] = 2*d03 + d12;
144 dct[i*4+2] = s03 - s12;
145 dct[i*4+3] = d03 - 2*d12;
/* 8x8 residual transform as four independent 4x4 DCTs,
 * one per quadrant in raster order. */
149 static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
151 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
152 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
153 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
154 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
/* 16x16 residual transform: sixteen 4x4 DCTs, grouped as four 8x8
 * quadrants of four blocks each. */
157 static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
159 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
160 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
161 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
162 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* DC of the 4x4 residual: sum of all 16 differences (pix1 - pix2). */
165 static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
170 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
/* accumulate all 16 residual samples */
172 sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
173 sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
/* DC-only 8x8 transform (for chroma): take the DC of each 4x4
 * quadrant, then apply a 2x2 Hadamard transform to the four DCs. */
178 static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
180 dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
181 dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
182 dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
183 dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
185 /* 2x2 DC transform */
186 int d0 = dct[0] + dct[1];
187 int d1 = dct[2] + dct[3];
188 int d2 = dct[0] - dct[1];
189 int d3 = dct[2] - dct[3];
/* Inverse 4x4 integer transform of dct, added into p_dst with clipping.
 * Two 1-D passes using the H.264 half-coefficient (>>1) butterflies;
 * the +32 bias implements round-to-nearest for the final >>6. */
196 static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
201 for( int i = 0; i < 4; i++ )
203 int s02 = dct[0*4+i] + dct[2*4+i];
204 int d02 = dct[0*4+i] - dct[2*4+i];
205 int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
206 int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
/* transposed intermediate */
208 tmp[i*4+0] = s02 + s13;
209 tmp[i*4+1] = d02 + d13;
210 tmp[i*4+2] = d02 - d13;
211 tmp[i*4+3] = s02 - s13;
214 for( int i = 0; i < 4; i++ )
216 int s02 = tmp[0*4+i] + tmp[2*4+i];
217 int d02 = tmp[0*4+i] - tmp[2*4+i];
218 int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
219 int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
221 d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
222 d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
223 d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
224 d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
/* add the reconstructed residual to the prediction, clipped to pixel range */
228 for( int y = 0; y < 4; y++ )
230 for( int x = 0; x < 4; x++ )
231 p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
232 p_dst += FDEC_STRIDE;
/* Inverse 8x8: four 4x4 inverse transforms, one per quadrant. */
236 static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
238 add4x4_idct( &p_dst[0], dct[0] );
239 add4x4_idct( &p_dst[4], dct[1] );
240 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
241 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
/* Inverse 16x16: four 8x8 quadrants of four 4x4 inverse transforms. */
244 static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
246 add8x8_idct( &p_dst[0], &dct[0] );
247 add8x8_idct( &p_dst[8], &dct[4] );
248 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
249 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
252 /****************************************************************************
 * 8x8 transform
254 ****************************************************************************/
257 int s07 = SRC(0) + SRC(7);\
258 int s16 = SRC(1) + SRC(6);\
259 int s25 = SRC(2) + SRC(5);\
260 int s34 = SRC(3) + SRC(4);\
265 int d07 = SRC(0) - SRC(7);\
266 int d16 = SRC(1) - SRC(6);\
267 int d25 = SRC(2) - SRC(5);\
268 int d34 = SRC(3) - SRC(4);\
269 int a4 = d16 + d25 + (d07 + (d07>>1));\
270 int a5 = d07 - d34 - (d25 + (d25>>1));\
271 int a6 = d07 + d34 - (d16 + (d16>>1));\
272 int a7 = d16 - d25 + (d34 + (d34>>1));\
274 DST(1) = a4 + (a7>>2);\
275 DST(2) = a2 + (a3>>1);\
276 DST(3) = a5 + (a6>>2);\
278 DST(5) = a6 - (a5>>2);\
279 DST(6) = (a2>>1) - a3 ;\
280 DST(7) = (a4>>2) - a7 ;\
/* Residual + forward 8x8 DCT: subtract the 8x8 prediction, then apply
 * DCT8_1D twice, steering the data flow through the SRC/DST macros. */
283 static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
287 pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
/* pass 1: down column i, in place in tmp */
289 #define SRC(x) tmp[x*8+i]
290 #define DST(x) tmp[x*8+i]
291 for( int i = 0; i < 8; i++ )
/* pass 2: along row i, written transposed into dct */
296 #define SRC(x) tmp[i*8+x]
297 #define DST(x) dct[x*8+i]
298 for( int i = 0; i < 8; i++ )
/* 16x16 residual via four independent 8x8 DCTs, one per quadrant. */
304 static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
306 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
307 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
308 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
309 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
313 int a0 = SRC(0) + SRC(4);\
314 int a2 = SRC(0) - SRC(4);\
315 int a4 = (SRC(2)>>1) - SRC(6);\
316 int a6 = (SRC(6)>>1) + SRC(2);\
321 int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
322 int a3 = SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
323 int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
324 int a7 = SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
325 int b1 = (a7>>2) + a1;\
326 int b3 = a3 + (a5>>2);\
327 int b5 = (a3>>2) - a5;\
328 int b7 = a7 - (a1>>2);\
/* Inverse 8x8 DCT added into dst with clipping.  Biasing dct[0] by 32
 * up front pre-rounds every output sample for the final >>6 shift. */
339 static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
341 dct[0] += 32; // rounding for the >>6 at the end
/* pass 1: down column i, in place in dct */
343 #define SRC(x) dct[x*8+i]
344 #define DST(x,rhs) dct[x*8+i] = (rhs)
345 for( int i = 0; i < 8; i++ )
/* pass 2: along row i, scaled, clipped and accumulated into dst */
350 #define SRC(x) dct[i*8+x]
351 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
352 for( int i = 0; i < 8; i++ )
/* Inverse 16x16: four 8x8 inverse transforms, one per quadrant. */
358 static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
360 add8x8_idct8( &dst[0], dct[0] );
361 add8x8_idct8( &dst[8], dct[1] );
362 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
363 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
/* DC-only inverse 4x4: add the single dc value to every pixel of the
 * 4x4 block, clipped to the valid pixel range. */
366 static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
369 for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
371 p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
372 p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
373 p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
374 p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
/* DC-only inverse 8x8: one dc per 4x4 quadrant. */
378 static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
380 add4x4_idct_dc( &p_dst[0], dct[0] );
381 add4x4_idct_dc( &p_dst[4], dct[1] );
382 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
383 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
/* DC-only inverse 16x16: sixteen dc values, one per 4x4 block,
 * processed a row of four blocks at a time. */
386 static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
388 for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
390 add4x4_idct_dc( &p_dst[ 0], dct[0] );
391 add4x4_idct_dc( &p_dst[ 4], dct[1] );
392 add4x4_idct_dc( &p_dst[ 8], dct[2] );
393 add4x4_idct_dc( &p_dst[12], dct[3] );
398 /****************************************************************************
 * x264_dct_init
400 ****************************************************************************/
/* Fill the dct function table: install the plain C implementations,
 * then override entries with the fastest variant the given cpu flags
 * support.  NOTE(review): the CPU-specific sections below sit inside
 * preprocessor conditionals (HAVE_MMX / HIGH_BIT_DEPTH / ALTIVEC / NEON)
 * that are not fully visible in this excerpt. */
401 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
/* C reference implementations — always valid fallbacks */
403 dctf->sub4x4_dct = sub4x4_dct;
404 dctf->add4x4_idct = add4x4_idct;
406 dctf->sub8x8_dct = sub8x8_dct;
407 dctf->sub8x8_dct_dc = sub8x8_dct_dc;
408 dctf->add8x8_idct = add8x8_idct;
409 dctf->add8x8_idct_dc = add8x8_idct_dc;
411 dctf->sub16x16_dct = sub16x16_dct;
412 dctf->add16x16_idct = add16x16_idct;
413 dctf->add16x16_idct_dc = add16x16_idct_dc;
415 dctf->sub8x8_dct8 = sub8x8_dct8;
416 dctf->add8x8_idct8 = add8x8_idct8;
418 dctf->sub16x16_dct8 = sub16x16_dct8;
419 dctf->add16x16_idct8 = add16x16_idct8;
421 dctf->dct4x4dc = dct4x4dc;
422 dctf->idct4x4dc = idct4x4dc;
/* x86 overrides, high-bit-depth branch */
426 if( cpu&X264_CPU_MMX )
428 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
429 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
430 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
432 if( cpu&X264_CPU_SSE2 )
434 dctf->add4x4_idct = x264_add4x4_idct_sse2;
435 dctf->dct4x4dc = x264_dct4x4dc_sse2;
436 dctf->idct4x4dc = x264_idct4x4dc_sse2;
437 dctf->add8x8_idct = x264_add8x8_idct_sse2;
438 dctf->add16x16_idct = x264_add16x16_idct_sse2;
439 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_sse2;
440 dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
442 if( cpu&X264_CPU_AVX )
444 dctf->add4x4_idct = x264_add4x4_idct_avx;
445 dctf->dct4x4dc = x264_dct4x4dc_avx;
446 dctf->idct4x4dc = x264_idct4x4dc_avx;
447 dctf->add8x8_idct = x264_add8x8_idct_avx;
448 dctf->add16x16_idct = x264_add16x16_idct_avx;
449 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_avx;
450 dctf->add16x16_idct_dc= x264_add16x16_idct_dc_avx;
453 #else // !HIGH_BIT_DEPTH
/* x86 overrides, 8-bit branch */
455 if( cpu&X264_CPU_MMX )
457 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
458 dctf->add4x4_idct = x264_add4x4_idct_mmx;
459 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
460 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
461 dctf->dct4x4dc = x264_dct4x4dc_mmx;
462 dctf->idct4x4dc = x264_idct4x4dc_mmx;
463 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
466 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
467 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
468 dctf->add8x8_idct = x264_add8x8_idct_mmx;
469 dctf->add16x16_idct = x264_add16x16_idct_mmx;
471 dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
472 dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
473 dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
474 dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
478 if( cpu&X264_CPU_SSE2 )
480 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
481 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
482 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
483 dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
484 dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
486 dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
487 dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
488 dctf->add8x8_idct = x264_add8x8_idct_sse2;
489 dctf->add16x16_idct = x264_add16x16_idct_sse2;
490 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
/* SSSE3 versions are slower than SSE2 on Atom, hence the extra check */
493 if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
495 dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
496 dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
497 dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
498 dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
499 dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
500 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
501 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
504 if( cpu&X264_CPU_SSE4 )
505 dctf->add4x4_idct = x264_add4x4_idct_sse4;
507 if( cpu&X264_CPU_AVX )
509 dctf->add4x4_idct = x264_add4x4_idct_avx;
510 dctf->add8x8_idct = x264_add8x8_idct_avx;
511 dctf->add16x16_idct = x264_add16x16_idct_avx;
512 dctf->add8x8_idct8 = x264_add8x8_idct8_avx;
513 dctf->add16x16_idct8 = x264_add16x16_idct8_avx;
514 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_avx;
515 dctf->sub8x8_dct = x264_sub8x8_dct_avx;
516 dctf->sub16x16_dct = x264_sub16x16_dct_avx;
517 dctf->sub8x8_dct8 = x264_sub8x8_dct8_avx;
518 dctf->sub16x16_dct8 = x264_sub16x16_dct8_avx;
/* PowerPC AltiVec overrides */
523 if( cpu&X264_CPU_ALTIVEC )
525 dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
526 dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
527 dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
529 dctf->add4x4_idct = x264_add4x4_idct_altivec;
530 dctf->add8x8_idct = x264_add8x8_idct_altivec;
531 dctf->add16x16_idct = x264_add16x16_idct_altivec;
533 dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
534 dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
536 dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
537 dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
/* ARM NEON overrides */
542 if( cpu&X264_CPU_NEON )
544 dctf->sub4x4_dct = x264_sub4x4_dct_neon;
545 dctf->sub8x8_dct = x264_sub8x8_dct_neon;
546 dctf->sub16x16_dct = x264_sub16x16_dct_neon;
547 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
548 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
549 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
550 dctf->dct4x4dc = x264_dct4x4dc_neon;
551 dctf->idct4x4dc = x264_idct4x4dc_neon;
553 dctf->add4x4_idct = x264_add4x4_idct_neon;
554 dctf->add8x8_idct = x264_add8x8_idct_neon;
555 dctf->add16x16_idct = x264_add16x16_idct_neon;
557 dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
558 dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
560 dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
561 dctf->add16x16_idct8= x264_add16x16_idct8_neon;
564 #endif // HIGH_BIT_DEPTH
/* Precompute the zigzag-ordered weight tables (frame order j=0,
 * field order j=1) by permuting the raster-order weight tables
 * through the corresponding zigzag scan. */
567 void x264_dct_init_weights( void )
569 for( int j = 0; j < 2; j++ )
571 for( int i = 0; i < 16; i++ )
572 x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
573 for( int i = 0; i < 64; i++ )
574 x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
/* ZIG(i,y,x): copy the 8x8 coefficient at (row y, col x) into zigzag
 * position i.  dct is stored column-major here (x*8+y). */
579 #define ZIG(i,y,x) level[i] = dct[x*8+y];
/* 8x8 zigzag scan order for progressive (frame) macroblocks */
580 #define ZIGZAG8_FRAME\
581 ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
582 ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
583 ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
584 ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
585 ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
586 ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
587 ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
588 ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
589 ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
590 ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
591 ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
592 ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
593 ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
594 ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
595 ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
596 ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\
/* 8x8 scan order for interlaced (field) macroblocks */
598 #define ZIGZAG8_FIELD\
599 ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
600 ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
601 ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
602 ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
603 ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
604 ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
605 ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
606 ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
607 ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
608 ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
609 ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
610 ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
611 ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
612 ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
613 ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
614 ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
/* 4x4 zigzag scan, frame order; position 0 goes through ZIGDC so the
 * DC coefficient can be redirected when scanning AC-only blocks. */
616 #define ZIGZAG4_FRAME\
617 ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
618 ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
619 ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
620 ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
/* 4x4 scan, field (interlaced) order */
622 #define ZIGZAG4_FIELD\
623 ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
624 ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
625 ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
626 ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
/* Reorder an 8x8 coefficient block into frame zigzag order. */
628 static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
/* Reorder an 8x8 coefficient block into field scan order. */
633 static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
/* 4x4 variants of the scan macros: dct indexed as x*4+y */
639 #define ZIG(i,y,x) level[i] = dct[x*4+y];
640 #define ZIGDC(i,y,x) ZIG(i,y,x)
/* Reorder a 4x4 coefficient block into frame zigzag order. */
642 static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
/* Field scan of a 4x4 block: only positions 2..5 differ from raster
 * order, so the head and tail are plain memcpys. */
647 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
649 memcpy( level, dct, 2 * sizeof(dctcoef) );
650 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
651 memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
/* sub+zigzag variant of ZIG: compute the residual sample (p_src -
 * p_dst) at (y,x) directly into zigzag position i. */
655 #define ZIG(i,y,x) {\
656 int oe = x+y*FENC_STRIDE;\
657 int od = x+y*FDEC_STRIDE;\
658 level[i] = p_src[oe] - p_dst[od];\
662 CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
663 CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
664 CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
665 CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
/* copy 8 pixels as two 4-pixel copies */
666 #define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
668 CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
669 CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
670 CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
671 CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
672 CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
673 CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
674 CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
675 CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
/* Fused subtract + frame zigzag of a 4x4 block; returns nonzero iff any
 * coefficient is nonzero (per the visible sibling signatures). */
677 static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
/* Same, field scan order. */
685 static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
/* ZIGDC for the AC variants: route the residual at (y,x) to *dc
 * instead of the level array. */
694 #define ZIGDC(i,y,x) {\
695 int oe = x+y*FENC_STRIDE;\
696 int od = x+y*FDEC_STRIDE;\
697 *dc = p_src[oe] - p_dst[od];\
701 static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
709 static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
/* Fused subtract + zigzag for 8x8 blocks, frame and field order. */
717 static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
724 static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
/* De-interleave an 8x8 coefficient block into four 4x4 blocks for
 * CAVLC (dst block i takes every 4th coefficient starting at i), and
 * record per-4x4 nonzero flags in nnz laid out on an 8-wide grid.
 * NOTE(review): `nz` is declared/accumulated on lines elided from this
 * excerpt — presumably OR-ing the coefficients of block i. */
735 static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
737 for( int i = 0; i < 4; i++ )
740 for( int j = 0; j < 16; j++ )
743 dst[i*16+j] = src[i+j*4];
745 nnz[(i&1) + (i>>1)*8] = !!nz;
/* Fill the zigzag function table: field-order scans when b_interlaced,
 * frame-order otherwise, then override with CPU-specific versions.
 * NOTE(review): the sections below are guarded by preprocessor
 * conditionals (HAVE_MMX / HIGH_BIT_DEPTH / ARCH_X86_64 / ALTIVEC /
 * NEON) not fully visible in this excerpt. */
749 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
/* interlaced: field scan order */
753 pf->scan_8x8 = zigzag_scan_8x8_field;
754 pf->scan_4x4 = zigzag_scan_4x4_field;
755 pf->sub_8x8 = zigzag_sub_8x8_field;
756 pf->sub_4x4 = zigzag_sub_4x4_field;
757 pf->sub_4x4ac = zigzag_sub_4x4ac_field;
760 if( cpu&X264_CPU_SSE2 )
761 pf->scan_4x4 = x264_zigzag_scan_4x4_field_sse2;
762 if( cpu&X264_CPU_SSE4 )
763 pf->scan_8x8 = x264_zigzag_scan_8x8_field_sse4;
764 if( cpu&X264_CPU_AVX )
765 pf->scan_8x8 = x264_zigzag_scan_8x8_field_avx;
769 if( cpu&X264_CPU_MMXEXT )
771 pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
772 pf->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
774 if( cpu&X264_CPU_SSSE3 )
776 pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
777 pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
779 if( cpu&X264_CPU_AVX )
781 pf->sub_4x4 = x264_zigzag_sub_4x4_field_avx;
783 pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_avx;
788 if( cpu&X264_CPU_ALTIVEC )
789 pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
791 #endif // HIGH_BIT_DEPTH
/* progressive: frame scan order */
795 pf->scan_8x8 = zigzag_scan_8x8_frame;
796 pf->scan_4x4 = zigzag_scan_4x4_frame;
797 pf->sub_8x8 = zigzag_sub_8x8_frame;
798 pf->sub_4x4 = zigzag_sub_4x4_frame;
799 pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
802 if( cpu&X264_CPU_SSE2 )
804 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_sse2;
805 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
808 if( cpu&X264_CPU_AVX )
810 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
811 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_avx;
813 #endif // ARCH_X86_64
817 if( cpu&X264_CPU_MMX )
818 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
819 if( cpu&X264_CPU_MMXEXT )
820 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
821 if( cpu&X264_CPU_SSE2_IS_FAST )
822 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
823 if( cpu&X264_CPU_SSSE3 )
825 pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
826 pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
827 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
828 if( cpu&X264_CPU_SHUFFLE_IS_FAST )
829 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
831 if( cpu&X264_CPU_AVX )
833 pf->sub_4x4 = x264_zigzag_sub_4x4_frame_avx;
835 pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_avx;
837 if( cpu&X264_CPU_SHUFFLE_IS_FAST )
838 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_avx;
842 if( cpu&X264_CPU_ALTIVEC )
843 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
846 if( cpu&X264_CPU_NEON )
847 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
849 #endif // HIGH_BIT_DEPTH
/* interleave is scan-order independent */
852 pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
855 if( cpu&X264_CPU_SSE2 )
856 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
857 if( cpu&X264_CPU_AVX )
858 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
860 if( cpu&X264_CPU_MMX )
861 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
862 if( cpu&X264_CPU_SHUFFLE_IS_FAST )
863 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
864 if( cpu&X264_CPU_AVX )
865 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_avx;
866 #endif // HIGH_BIT_DEPTH