1 /*****************************************************************************
2 * dct.c: transform and zigzag
3 *****************************************************************************
4 * Copyright (C) 2003-2010 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
/* Weight tables stored in zigzag scan order. They are filled by
 * x264_dct_init_weights() from x264_dct4_weight2_tab / x264_dct8_weight2_tab.
 * The first index selects one of the two scan tables (x264_zigzag_scan4/8[j]),
 * the second index is the scan position. */
38 int x264_dct4_weight2_zigzag[2][16];
39 int x264_dct8_weight2_zigzag[2][64];
/* In-place 4x4 transform of the 16 coefficients in d[], built only from sums
 * and differences.
 * Pass 1 combines each group d[i*4+0..3] and stores the four results transposed
 * into tmp. Pass 2 applies the same combination to tmp and stores each result
 * back into d[] after rounding and halving, (v + 1) >> 1.
 * NOTE(review): the declaration of tmp is not visible in this view. */
41 static void dct4x4dc( dctcoef d[16] )
/* pass 1: d -> tmp (transposed) */
45 for( int i = 0; i < 4; i++ )
47 int s01 = d[i*4+0] + d[i*4+1];
48 int d01 = d[i*4+0] - d[i*4+1];
49 int s23 = d[i*4+2] + d[i*4+3];
50 int d23 = d[i*4+2] - d[i*4+3];
52 tmp[0*4+i] = s01 + s23;
53 tmp[1*4+i] = s01 - s23;
54 tmp[2*4+i] = d01 - d23;
55 tmp[3*4+i] = d01 + d23;
/* pass 2: tmp -> d, with (v + 1) >> 1 applied to every output */
58 for( int i = 0; i < 4; i++ )
60 int s01 = tmp[i*4+0] + tmp[i*4+1];
61 int d01 = tmp[i*4+0] - tmp[i*4+1];
62 int s23 = tmp[i*4+2] + tmp[i*4+3];
63 int d23 = tmp[i*4+2] - tmp[i*4+3];
65 d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
66 d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
67 d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
68 d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
/* 4x4 sum/difference transform of the 16 coefficients in d[].
 * Pass 1 is identical to the first pass of dct4x4dc(): each group d[i*4+0..3]
 * is combined and written transposed into tmp. Pass 2 forms the same four
 * intermediate sums/differences from tmp.
 * NOTE(review): the declaration of tmp and the statements that store the
 * pass-2 results are not visible in this view, so the output scaling
 * cannot be confirmed from here. */
72 static void idct4x4dc( dctcoef d[16] )
/* pass 1: d -> tmp (transposed) */
76 for( int i = 0; i < 4; i++ )
78 int s01 = d[i*4+0] + d[i*4+1];
79 int d01 = d[i*4+0] - d[i*4+1];
80 int s23 = d[i*4+2] + d[i*4+3];
81 int d23 = d[i*4+2] - d[i*4+3];
83 tmp[0*4+i] = s01 + s23;
84 tmp[1*4+i] = s01 - s23;
85 tmp[2*4+i] = d01 - d23;
86 tmp[3*4+i] = d01 + d23;
/* pass 2: intermediates computed from tmp (stores not visible here) */
89 for( int i = 0; i < 4; i++ )
91 int s01 = tmp[i*4+0] + tmp[i*4+1];
92 int d01 = tmp[i*4+0] - tmp[i*4+1];
93 int s23 = tmp[i*4+2] + tmp[i*4+3];
94 int d23 = tmp[i*4+2] - tmp[i*4+3];
/* Writes the per-pixel difference pix1[x] - pix2[x] into diff, laid out as an
 * i_size x i_size block with row y starting at diff[y*i_size].
 * NOTE(review): i_pix1 / i_pix2 are presumably the row strides of pix1 / pix2;
 * the statements that advance the two pixel pointers after each row are not
 * visible in this view — confirm. */
103 static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
104 pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
106 for( int y = 0; y < i_size; y++ )
108 for( int x = 0; x < i_size; x++ )
109 diff[x + y*i_size] = pix1[x] - pix2[x];
/* Computes the 4x4 difference pix1 - pix2 (via pixel_sub_wxh with FENC_STRIDE for
 * pix1 and FDEC_STRIDE for pix2) and transforms it into dct[16].
 * Each pass forms s03/s12/d03/d12 from a group of four values and outputs
 *   s03 + s12,  2*d03 + d12,  s03 - s12,  d03 - 2*d12.
 * Pass 1 reads d and writes tmp transposed; pass 2 reads tmp and writes dct.
 * NOTE(review): the declarations of d and tmp are not visible in this view. */
115 static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
120 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
/* pass 1: d -> tmp (transposed) */
122 for( int i = 0; i < 4; i++ )
124 int s03 = d[i*4+0] + d[i*4+3];
125 int s12 = d[i*4+1] + d[i*4+2];
126 int d03 = d[i*4+0] - d[i*4+3];
127 int d12 = d[i*4+1] - d[i*4+2];
129 tmp[0*4+i] = s03 + s12;
130 tmp[1*4+i] = 2*d03 + d12;
131 tmp[2*4+i] = s03 - s12;
132 tmp[3*4+i] = d03 - 2*d12;
/* pass 2: tmp -> dct */
135 for( int i = 0; i < 4; i++ )
137 int s03 = tmp[i*4+0] + tmp[i*4+3];
138 int s12 = tmp[i*4+1] + tmp[i*4+2];
139 int d03 = tmp[i*4+0] - tmp[i*4+3];
140 int d12 = tmp[i*4+1] - tmp[i*4+2];
142 dct[i*4+0] = s03 + s12;
143 dct[i*4+1] = 2*d03 + d12;
144 dct[i*4+2] = s03 - s12;
145 dct[i*4+3] = d03 - 2*d12;
149 static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
151 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
152 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
153 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
154 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
157 static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
159 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
160 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
161 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
162 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* Computes the 4x4 difference pix1 - pix2 into d (FENC_STRIDE rows for pix1,
 * FDEC_STRIDE rows for pix2) and accumulates all 16 differences into sum.
 * NOTE(review): the declarations of d and sum and the return statement are not
 * visible in this view; presumably sum starts at 0 and is the return value — confirm. */
165 static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
170 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
172 sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
173 sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
/* Stores the sub4x4_dct_dc() result of each 4x4 quadrant of an 8x8 area in
 * dct[0..3] (top-left, top-right, bottom-left, bottom-right), then forms the
 * sums and differences d0..d3 of those four values for a 2x2 transform.
 * NOTE(review): the statements that write the 2x2 result back into dct[] are
 * not visible in this view. */
178 static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
180 dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
181 dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
182 dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
183 dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
185 /* 2x2 DC transform */
186 int d0 = dct[0] + dct[1];
187 int d1 = dct[2] + dct[3];
188 int d2 = dct[0] - dct[1];
189 int d3 = dct[2] - dct[3];
/* Inverse-transforms the 16 coefficients in dct[] and adds the result to the
 * 4x4 pixel block at p_dst (rows FDEC_STRIDE apart), clipping each sum with
 * x264_clip_pixel().
 * Pass 1 reads dct with stride 4 (dct[k*4+i]) and writes tmp[i*4+k]; pass 2
 * reads tmp the same way and writes d after rounding, (v + 32) >> 6.
 * The odd-indexed inputs are combined using >>1 halving.
 * NOTE(review): the declarations of d and tmp are not visible in this view. */
196 static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
/* pass 1: dct -> tmp */
201 for( int i = 0; i < 4; i++ )
203 int s02 = dct[0*4+i] + dct[2*4+i];
204 int d02 = dct[0*4+i] - dct[2*4+i];
205 int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
206 int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
208 tmp[i*4+0] = s02 + s13;
209 tmp[i*4+1] = d02 + d13;
210 tmp[i*4+2] = d02 - d13;
211 tmp[i*4+3] = s02 - s13;
/* pass 2: tmp -> d, rounding by adding 32 before the >> 6 */
214 for( int i = 0; i < 4; i++ )
216 int s02 = tmp[0*4+i] + tmp[2*4+i];
217 int d02 = tmp[0*4+i] - tmp[2*4+i];
218 int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
219 int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
221 d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
222 d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
223 d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
224 d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
/* add the residual d to the destination pixels, one row per iteration */
228 for( int y = 0; y < 4; y++ )
230 for( int x = 0; x < 4; x++ )
231 p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
232 p_dst += FDEC_STRIDE;
236 static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
238 add4x4_idct( &p_dst[0], dct[0] );
239 add4x4_idct( &p_dst[4], dct[1] );
240 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
241 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
244 static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
246 add8x8_idct( &p_dst[0], &dct[0] );
247 add8x8_idct( &p_dst[8], &dct[4] );
248 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
249 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
252 /****************************************************************************
254 ****************************************************************************/
/* Fragment of a multi-line macro body implementing one 8-point transform pass.
 * Inputs are read through SRC(n) and outputs written through DST(n), both
 * supplied by the user of the macro (see sub8x8_dct8).
 * s07..s34 are sums and d07..d34 differences of mirrored input pairs; a4..a7
 * feed the odd outputs DST(1/3/5/7), a2/a3 feed DST(2)/DST(6).
 * NOTE(review): the #define line, the definitions of a0..a3 and the DST(0) /
 * DST(4) outputs are not visible in this view. */
257 int s07 = SRC(0) + SRC(7);\
258 int s16 = SRC(1) + SRC(6);\
259 int s25 = SRC(2) + SRC(5);\
260 int s34 = SRC(3) + SRC(4);\
265 int d07 = SRC(0) - SRC(7);\
266 int d16 = SRC(1) - SRC(6);\
267 int d25 = SRC(2) - SRC(5);\
268 int d34 = SRC(3) - SRC(4);\
269 int a4 = d16 + d25 + (d07 + (d07>>1));\
270 int a5 = d07 - d34 - (d25 + (d25>>1));\
271 int a6 = d07 + d34 - (d16 + (d16>>1));\
272 int a7 = d16 - d25 + (d34 + (d34>>1));\
274 DST(1) = a4 + (a7>>2);\
275 DST(2) = a2 + (a3>>1);\
276 DST(3) = a5 + (a6>>2);\
278 DST(5) = a6 - (a5>>2);\
279 DST(6) = (a2>>1) - a3 ;\
280 DST(7) = (a4>>2) - a7 ;\
/* Computes the 8x8 difference pix1 - pix2 into tmp (FENC_STRIDE rows for pix1,
 * FDEC_STRIDE rows for pix2) and transforms it into dct[64] in two passes of
 * eight iterations each.
 * NOTE(review): the declaration of tmp and the loop bodies are not visible in
 * this view; presumably each iteration expands the 8-point macro that uses
 * SRC/DST — confirm. */
283 static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
287 pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
/* pass 1: reads and writes tmp in place, element x at tmp[x*8+i] */
289 #define SRC(x) tmp[x*8+i]
290 #define DST(x) tmp[x*8+i]
291 for( int i = 0; i < 8; i++ )
/* pass 2: reads tmp[i*8+x] and writes dct[x*8+i] */
296 #define SRC(x) tmp[i*8+x]
297 #define DST(x) dct[x*8+i]
298 for( int i = 0; i < 8; i++ )
304 static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
306 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
307 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
308 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
309 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* Fragment of a multi-line macro body implementing one 8-point inverse pass.
 * Inputs are read through SRC(n), supplied by the user of the macro
 * (see add8x8_idct8). a0/a2/a4/a6 are formed from the even inputs 0,2,4,6;
 * a1/a3/a5/a7 from the odd inputs 1,3,5,7; b1/b3/b5/b7 mix the odd terms using
 * >>2 scaling.
 * NOTE(review): the #define line, the even-side b terms and the DST outputs are
 * not visible in this view. */
313 int a0 = SRC(0) + SRC(4);\
314 int a2 = SRC(0) - SRC(4);\
315 int a4 = (SRC(2)>>1) - SRC(6);\
316 int a6 = (SRC(6)>>1) + SRC(2);\
321 int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
322 int a3 = SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
323 int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
324 int a7 = SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
325 int b1 = (a7>>2) + a1;\
326 int b3 = a3 + (a5>>2);\
327 int b5 = (a3>>2) - a5;\
328 int b7 = a7 - (a1>>2);\
/* Inverse-transforms the 64 coefficients in dct[] and adds the result to the 8x8
 * pixel block at dst (rows FDEC_STRIDE apart), clipping with x264_clip_pixel().
 * The dct[] array is modified: dct[0] is biased by 32 and the first pass
 * writes its results back into dct[].
 * NOTE(review): the loop bodies are not visible in this view; presumably each
 * iteration expands the 8-point inverse macro that uses SRC/DST — confirm. */
339 static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
341 dct[0] += 32; // rounding for the >>6 at the end
/* pass 1: in place on dct, element x at dct[x*8+i] */
343 #define SRC(x) dct[x*8+i]
344 #define DST(x,rhs) dct[x*8+i] = (rhs)
345 for( int i = 0; i < 8; i++ )
/* pass 2: reads dct[i*8+x]; each output is shifted right by 6, added to the
 * destination pixel at column i, row x, and clipped */
350 #define SRC(x) dct[i*8+x]
351 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
352 for( int i = 0; i < 8; i++ )
358 static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
360 add8x8_idct8( &dst[0], dct[0] );
361 add8x8_idct8( &dst[8], dct[1] );
362 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
363 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
/* Adds the single value dc to every pixel of the 4x4 block at p_dst (rows
 * FDEC_STRIDE apart), clipping each result with x264_clip_pixel().
 * NOTE(review): a line between the signature and the loop is not visible in
 * this view, so any rounding or scaling applied to dc before it is added
 * cannot be confirmed from here. */
366 static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
/* one row of four pixels per iteration */
369 for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
371 p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
372 p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
373 p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
374 p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
378 static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
380 add4x4_idct_dc( &p_dst[0], dct[0] );
381 add4x4_idct_dc( &p_dst[4], dct[1] );
382 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
383 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
386 static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
388 for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
390 add4x4_idct_dc( &p_dst[ 0], dct[0] );
391 add4x4_idct_dc( &p_dst[ 4], dct[1] );
392 add4x4_idct_dc( &p_dst[ 8], dct[2] );
393 add4x4_idct_dc( &p_dst[12], dct[3] );
398 /****************************************************************************
400 ****************************************************************************/
/* Fills dctf with transform function pointers: first the C implementations
 * defined in this file, then SIMD versions selected by the bits set in cpu.
 * The overrides are plain assignments executed in sequence, so when several
 * groups match, the last assignment to a given field wins.
 * NOTE(review): apart from X264_HIGH_BIT_DEPTH, the preprocessor guards and the
 * braces that scope each per-architecture group are not visible in this view. */
401 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
/* C implementations */
403 dctf->sub4x4_dct = sub4x4_dct;
404 dctf->add4x4_idct = add4x4_idct;
406 dctf->sub8x8_dct = sub8x8_dct;
407 dctf->sub8x8_dct_dc = sub8x8_dct_dc;
408 dctf->add8x8_idct = add8x8_idct;
409 dctf->add8x8_idct_dc = add8x8_idct_dc;
411 dctf->sub16x16_dct = sub16x16_dct;
412 dctf->add16x16_idct = add16x16_idct;
413 dctf->add16x16_idct_dc = add16x16_idct_dc;
415 dctf->sub8x8_dct8 = sub8x8_dct8;
416 dctf->add8x8_idct8 = add8x8_idct8;
418 dctf->sub16x16_dct8 = sub16x16_dct8;
419 dctf->add16x16_idct8 = add16x16_idct8;
421 dctf->dct4x4dc = dct4x4dc;
422 dctf->idct4x4dc = idct4x4dc;
/* SIMD overrides are compiled only when X264_HIGH_BIT_DEPTH is not set */
424 #if !X264_HIGH_BIT_DEPTH
/* MMX group (note that sub8x8_dct_dc is given the _mmxext routine here) */
426 if( cpu&X264_CPU_MMX )
428 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
429 dctf->add4x4_idct = x264_add4x4_idct_mmx;
430 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
431 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
432 dctf->dct4x4dc = x264_dct4x4dc_mmx;
433 dctf->idct4x4dc = x264_idct4x4dc_mmx;
434 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
437 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
438 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
439 dctf->add8x8_idct = x264_add8x8_idct_mmx;
440 dctf->add16x16_idct = x264_add16x16_idct_mmx;
442 dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
443 dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
444 dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
445 dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
/* SSE2 group */
449 if( cpu&X264_CPU_SSE2 )
451 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
452 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
453 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
454 dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
455 dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
457 dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
458 dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
459 dctf->add8x8_idct = x264_add8x8_idct_sse2;
460 dctf->add16x16_idct = x264_add16x16_idct_sse2;
461 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
/* SSSE3 group, skipped when the SLOW_ATOM flag is set */
464 if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
466 dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
467 dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
468 dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
469 dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
470 dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
471 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
472 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
/* SSE4: only add4x4_idct is replaced */
475 if( cpu&X264_CPU_SSE4 )
476 dctf->add4x4_idct = x264_add4x4_idct_sse4;
/* AltiVec group */
481 if( cpu&X264_CPU_ALTIVEC )
483 dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
484 dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
485 dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
487 dctf->add4x4_idct = x264_add4x4_idct_altivec;
488 dctf->add8x8_idct = x264_add8x8_idct_altivec;
489 dctf->add16x16_idct = x264_add16x16_idct_altivec;
491 dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
492 dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
494 dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
495 dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
/* NEON group */
500 if( cpu&X264_CPU_NEON )
502 dctf->sub4x4_dct = x264_sub4x4_dct_neon;
503 dctf->sub8x8_dct = x264_sub8x8_dct_neon;
504 dctf->sub16x16_dct = x264_sub16x16_dct_neon;
505 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
506 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
507 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
508 dctf->dct4x4dc = x264_dct4x4dc_neon;
509 dctf->idct4x4dc = x264_idct4x4dc_neon;
511 dctf->add4x4_idct = x264_add4x4_idct_neon;
512 dctf->add8x8_idct = x264_add8x8_idct_neon;
513 dctf->add16x16_idct = x264_add16x16_idct_neon;
515 dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
516 dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
518 dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
519 dctf->add16x16_idct8= x264_add16x16_idct8_neon;
522 #endif // !X264_HIGH_BIT_DEPTH
525 void x264_dct_init_weights( void )
527 for( int j = 0; j < 2; j++ )
529 for( int i = 0; i < 16; i++ )
530 x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
531 for( int i = 0; i < 64; i++ )
532 x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
/* ZIG(i,y,x): store into scan position i of level[] the coefficient found at
 * dct[x*8+y]. The trailing ';' is part of the macro so entries can be listed
 * back to back. Arguments are parenthesized so that expression arguments index
 * correctly. */
#define ZIG(i,y,x) level[(i)] = dct[(x)*8+(y)];
/* 8x8 scan table: entry ZIG(i,y,x) maps scan position i to coefficient (y,x).
 * The last line deliberately has no line-continuation backslash, so the macro
 * ends with the table and cannot absorb whatever line follows it. */
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
/* Second 8x8 scan table: entry ZIG(i,y,x) maps scan position i to coefficient
 * (y,x). Compared with ZIGZAG8_FRAME this order advances through y faster than
 * through x. What each entry expands to depends on the ZIG definition in
 * effect at the point of use. */
556 #define ZIGZAG8_FIELD\
557 ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
558 ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
559 ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
560 ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
561 ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
562 ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
563 ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
564 ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
565 ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
566 ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
567 ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
568 ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
569 ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
570 ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
571 ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
572 ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
/* 4x4 scan table: entry (i,y,x) maps scan position i to coefficient (y,x).
 * Position 0 goes through ZIGDC instead of ZIG so that users of the table can
 * treat the first coefficient differently. */
574 #define ZIGZAG4_FRAME\
575 ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
576 ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
577 ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
578 ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
/* Second 4x4 scan table: entry (i,y,x) maps scan position i to coefficient
 * (y,x). Position 0 goes through ZIGDC instead of ZIG. From position 6 onwards
 * the entries run through y = 0..3 for each x in turn. */
580 #define ZIGZAG4_FIELD\
581 ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
582 ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
583 ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
584 ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
/* 8x8 scan functions taking a 64-entry level[] and a 64-entry dct[].
 * NOTE(review): the function bodies are not visible in this view; presumably
 * they expand ZIGZAG8_FRAME and ZIGZAG8_FIELD respectively, copying dct[] into
 * level[] in scan order — confirm. */
586 static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
591 static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
/* 4x4 variant of ZIG(i,y,x): store into scan position i of level[] the
 * coefficient found at dct[x*4+y]. The trailing ';' is part of the macro so
 * entries can be listed back to back. Arguments are parenthesized so that
 * expression arguments index correctly. */
#define ZIG(i,y,x) level[(i)] = dct[(x)*4+(y)];
/* For plain scans the first coefficient is handled like every other one. */
#define ZIGDC(i,y,x) ZIG(i,y,x)
/* 4x4 scan function taking a 16-entry level[] and a 16-entry dct[].
 * NOTE(review): the function body is not visible in this view; presumably it
 * expands ZIGZAG4_FRAME — confirm. */
600 static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
605 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
607 memcpy( level, dct, 2 * sizeof(dctcoef) );
608 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
609 memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
/* ZIG variant used by the zigzag_sub_* functions: scan position i of level[]
 * receives the difference p_src[x + y*FENC_STRIDE] - p_dst[x + y*FDEC_STRIDE].
 * The CPPIXEL_X4 lines that follow operate on rows 0..3, passing the p_dst row
 * as first argument and the p_src row as second.
 * NOTE(review): the remainder of the ZIG body and the #define line introducing
 * the four-row CPPIXEL_X4 sequence are not visible in this view. */
613 #define ZIG(i,y,x) {\
614 int oe = x+y*FENC_STRIDE;\
615 int od = x+y*FDEC_STRIDE;\
616 level[i] = p_src[oe] - p_dst[od];\
620 CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
621 CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
622 CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
623 CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
/* Applies CPPIXEL_X4 (defined elsewhere) twice: once at dst/src and once four
 * elements further on, combined into one comma expression.
 * Arguments are parenthesized so that the "+4" offset binds to the whole
 * argument even when a caller passes a conditional or other low-precedence
 * expression. */
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4((dst),(src)), CPPIXEL_X4((dst)+4,(src)+4) )
/* Eight-row sequence of CPPIXEL_X8 invocations covering rows 0..7, each passing
 * the p_dst row (FDEC_STRIDE apart) as first argument and the p_src row
 * (FENC_STRIDE apart) as second.
 * NOTE(review): the #define line that names this macro body is not visible in
 * this view. */
626 CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
627 CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
628 CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
629 CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
630 CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
631 CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
632 CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
633 CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
/* 4x4 "sub" scan functions: take a 16-entry level[], a read-only p_src and a
 * writable p_dst, and return an int.
 * NOTE(review): the function bodies are not visible in this view; presumably
 * they expand ZIGZAG4_FRAME / ZIGZAG4_FIELD with the subtracting ZIG variant —
 * the meaning of the return value cannot be confirmed from here. */
635 static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
643 static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
/* ZIGDC variant used by the zigzag_sub_4x4ac_* functions: the difference
 * p_src[x + y*FENC_STRIDE] - p_dst[x + y*FDEC_STRIDE] is stored through the dc
 * pointer rather than into level[i].
 * NOTE(review): the remainder of this macro body is not visible in this view. */
652 #define ZIGDC(i,y,x) {\
653 int oe = x+y*FENC_STRIDE;\
654 int od = x+y*FDEC_STRIDE;\
655 *dc = p_src[oe] - p_dst[od];\
/* 4x4 "sub" scan functions with a separate dc output pointer; otherwise the
 * same parameters as zigzag_sub_4x4_frame / zigzag_sub_4x4_field.
 * NOTE(review): the function bodies are not visible in this view; presumably
 * the first coefficient goes to *dc via the ZIGDC variant defined just above —
 * confirm. */
659 static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
667 static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
/* 8x8 "sub" scan functions: take a 64-entry level[], a read-only p_src and a
 * writable p_dst, and return an int.
 * NOTE(review): the function bodies are not visible in this view; presumably
 * they expand ZIGZAG8_FRAME / ZIGZAG8_FIELD with the subtracting ZIG variant —
 * confirm. */
675 static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
682 static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
/* Splits the 64 coefficients in src into four groups of 16 in dst: group i
 * takes every fourth source coefficient starting at i, dst[i*16+j] = src[i+j*4].
 * For each group a 0/1 flag (!!nz) is written to nnz[(i&1) + (i>>1)*8].
 * NOTE(review): the declaration and accumulation of nz are not visible in this
 * view; presumably nz collects whether any coefficient in the group is
 * nonzero — confirm. */
693 static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
695 for( int i = 0; i < 4; i++ )
698 for( int j = 0; j < 16; j++ )
701 dst[i*16+j] = src[i+j*4];
703 nnz[(i&1) + (i>>1)*8] = !!nz;
/* Fills pf with zigzag function pointers: C implementations from this file
 * first, then SIMD versions selected by the bits set in cpu. Overrides are
 * plain assignments in sequence, so the last matching assignment to a field wins.
 * Two parallel groups are visible, one installing the *_field functions and
 * one installing the *_frame functions.
 * NOTE(review): the condition that selects between the two groups (presumably
 * b_interlaced) and the per-architecture preprocessor guards and braces are
 * not visible in this view — confirm. */
707 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
/* field variants: C implementations, then SIMD overrides */
711 pf->scan_8x8 = zigzag_scan_8x8_field;
712 pf->scan_4x4 = zigzag_scan_4x4_field;
713 pf->sub_8x8 = zigzag_sub_8x8_field;
714 pf->sub_4x4 = zigzag_sub_4x4_field;
715 pf->sub_4x4ac = zigzag_sub_4x4ac_field;
716 #if !X264_HIGH_BIT_DEPTH
718 if( cpu&X264_CPU_MMXEXT )
720 pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
721 pf->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
723 if( cpu&X264_CPU_SSSE3 )
725 pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
726 pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
731 if( cpu&X264_CPU_ALTIVEC )
732 pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
734 #endif // !X264_HIGH_BIT_DEPTH
/* frame variants: C implementations, then SIMD overrides */
738 pf->scan_8x8 = zigzag_scan_8x8_frame;
739 pf->scan_4x4 = zigzag_scan_4x4_frame;
740 pf->sub_8x8 = zigzag_sub_8x8_frame;
741 pf->sub_4x4 = zigzag_sub_4x4_frame;
742 pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
743 #if !X264_HIGH_BIT_DEPTH
745 if( cpu&X264_CPU_MMX )
746 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
747 if( cpu&X264_CPU_MMXEXT )
748 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
749 if( cpu&X264_CPU_SSE2_IS_FAST )
750 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
751 if( cpu&X264_CPU_SSSE3 )
753 pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
754 pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
755 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
756 if( cpu&X264_CPU_SHUFFLE_IS_FAST )
757 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
762 if( cpu&X264_CPU_ALTIVEC )
763 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
766 if( cpu&X264_CPU_NEON )
767 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
769 #endif // !X264_HIGH_BIT_DEPTH
/* 8x8 CAVLC interleave: C implementation, then SIMD overrides */
772 pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
773 #if !X264_HIGH_BIT_DEPTH
775 if( cpu&X264_CPU_MMX )
776 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
777 if( cpu&X264_CPU_SHUFFLE_IS_FAST )
778 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
780 #endif // !X264_HIGH_BIT_DEPTH