1 /*****************************************************************************
2 * dct.c: transform and zigzag
3 *****************************************************************************
4 * Copyright (C) 2003-2010 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
23 * This program is also available under a commercial proprietary license.
24 * For more information, contact us at licensing@x264.com.
25 *****************************************************************************/
/* Zigzag-ordered copies of the squared DCT weight tables, filled in by
 * x264_dct_init_weights().  First index selects one of two scan orders
 * (presumably [0]=frame, [1]=field — confirm against x264_zigzag_scan4/8). */
int x264_dct4_weight2_zigzag[2][16];
int x264_dct8_weight2_zigzag[2][64];
41 static void dct4x4dc( dctcoef d[16] )
45 for( int i = 0; i < 4; i++ )
47 int s01 = d[i*4+0] + d[i*4+1];
48 int d01 = d[i*4+0] - d[i*4+1];
49 int s23 = d[i*4+2] + d[i*4+3];
50 int d23 = d[i*4+2] - d[i*4+3];
52 tmp[0*4+i] = s01 + s23;
53 tmp[1*4+i] = s01 - s23;
54 tmp[2*4+i] = d01 - d23;
55 tmp[3*4+i] = d01 + d23;
58 for( int i = 0; i < 4; i++ )
60 int s01 = tmp[i*4+0] + tmp[i*4+1];
61 int d01 = tmp[i*4+0] - tmp[i*4+1];
62 int s23 = tmp[i*4+2] + tmp[i*4+3];
63 int d23 = tmp[i*4+2] - tmp[i*4+3];
65 d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
66 d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
67 d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
68 d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
72 static void idct4x4dc( dctcoef d[16] )
76 for( int i = 0; i < 4; i++ )
78 int s01 = d[i*4+0] + d[i*4+1];
79 int d01 = d[i*4+0] - d[i*4+1];
80 int s23 = d[i*4+2] + d[i*4+3];
81 int d23 = d[i*4+2] - d[i*4+3];
83 tmp[0*4+i] = s01 + s23;
84 tmp[1*4+i] = s01 - s23;
85 tmp[2*4+i] = d01 - d23;
86 tmp[3*4+i] = d01 + d23;
89 for( int i = 0; i < 4; i++ )
91 int s01 = tmp[i*4+0] + tmp[i*4+1];
92 int d01 = tmp[i*4+0] - tmp[i*4+1];
93 int s23 = tmp[i*4+2] + tmp[i*4+3];
94 int d23 = tmp[i*4+2] - tmp[i*4+3];
103 static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
104 pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
106 for( int y = 0; y < i_size; y++ )
108 for( int x = 0; x < i_size; x++ )
109 diff[x + y*i_size] = pix1[x] - pix2[x];
115 static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
120 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
122 for( int i = 0; i < 4; i++ )
124 int s03 = d[i*4+0] + d[i*4+3];
125 int s12 = d[i*4+1] + d[i*4+2];
126 int d03 = d[i*4+0] - d[i*4+3];
127 int d12 = d[i*4+1] - d[i*4+2];
129 tmp[0*4+i] = s03 + s12;
130 tmp[1*4+i] = 2*d03 + d12;
131 tmp[2*4+i] = s03 - s12;
132 tmp[3*4+i] = d03 - 2*d12;
135 for( int i = 0; i < 4; i++ )
137 int s03 = tmp[i*4+0] + tmp[i*4+3];
138 int s12 = tmp[i*4+1] + tmp[i*4+2];
139 int d03 = tmp[i*4+0] - tmp[i*4+3];
140 int d12 = tmp[i*4+1] - tmp[i*4+2];
142 dct[i*4+0] = s03 + s12;
143 dct[i*4+1] = 2*d03 + d12;
144 dct[i*4+2] = s03 - s12;
145 dct[i*4+3] = d03 - 2*d12;
149 static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
151 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
152 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
153 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
154 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
157 static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
159 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
160 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
161 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
162 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
165 static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
170 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
172 sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
173 sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
178 static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
180 dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
181 dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
182 dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
183 dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
185 /* 2x2 DC transform */
186 int d0 = dct[0] + dct[1];
187 int d1 = dct[2] + dct[3];
188 int d2 = dct[0] - dct[1];
189 int d3 = dct[2] - dct[3];
196 static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
201 for( int i = 0; i < 4; i++ )
203 int s02 = dct[0*4+i] + dct[2*4+i];
204 int d02 = dct[0*4+i] - dct[2*4+i];
205 int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
206 int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
208 tmp[i*4+0] = s02 + s13;
209 tmp[i*4+1] = d02 + d13;
210 tmp[i*4+2] = d02 - d13;
211 tmp[i*4+3] = s02 - s13;
214 for( int i = 0; i < 4; i++ )
216 int s02 = tmp[0*4+i] + tmp[2*4+i];
217 int d02 = tmp[0*4+i] - tmp[2*4+i];
218 int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
219 int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
221 d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
222 d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
223 d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
224 d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
228 for( int y = 0; y < 4; y++ )
230 for( int x = 0; x < 4; x++ )
231 p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
232 p_dst += FDEC_STRIDE;
236 static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
238 add4x4_idct( &p_dst[0], dct[0] );
239 add4x4_idct( &p_dst[4], dct[1] );
240 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
241 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
244 static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
246 add8x8_idct( &p_dst[0], &dct[0] );
247 add8x8_idct( &p_dst[8], &dct[4] );
248 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
249 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
252 /****************************************************************************
254 ****************************************************************************/
/* One 1-D pass of the 8x8 forward transform, reading via SRC(x) and writing
 * via DST(x) (both #defined by the caller for row vs column passes).
 * NOTE(review): this copy lost the #define header, the a0-a3 even-half
 * terms, and the DST(0)/DST(4) outputs — restored per upstream x264;
 * verify against the reference implementation. */
#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
283 static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
287 pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
289 #define SRC(x) tmp[x*8+i]
290 #define DST(x) tmp[x*8+i]
291 for( int i = 0; i < 8; i++ )
296 #define SRC(x) tmp[i*8+x]
297 #define DST(x) dct[x*8+i]
298 for( int i = 0; i < 8; i++ )
304 static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
306 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
307 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
308 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
309 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* One 1-D pass of the 8x8 inverse transform; SRC/DST are #defined by the
 * caller (DST takes the value as a second argument so the final pass can
 * clip and store to pixels).
 * NOTE(review): this copy lost the #define header, the b0/b2/b4/b6 even
 * butterflies, and all eight DST outputs — restored per upstream x264;
 * verify against the reference implementation. */
#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
339 static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
341 dct[0] += 32; // rounding for the >>6 at the end
343 #define SRC(x) dct[x*8+i]
344 #define DST(x,rhs) dct[x*8+i] = (rhs)
345 for( int i = 0; i < 8; i++ )
350 #define SRC(x) dct[i*8+x]
351 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
352 for( int i = 0; i < 8; i++ )
358 static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
360 add8x8_idct8( &dst[0], dct[0] );
361 add8x8_idct8( &dst[8], dct[1] );
362 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
363 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
366 static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
369 for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
371 p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
372 p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
373 p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
374 p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
378 static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
380 add4x4_idct_dc( &p_dst[0], dct[0] );
381 add4x4_idct_dc( &p_dst[4], dct[1] );
382 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
383 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
386 static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
388 for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
390 add4x4_idct_dc( &p_dst[ 0], dct[0] );
391 add4x4_idct_dc( &p_dst[ 4], dct[1] );
392 add4x4_idct_dc( &p_dst[ 8], dct[2] );
393 add4x4_idct_dc( &p_dst[12], dct[3] );
398 /****************************************************************************
400 ****************************************************************************/
/* Fill the DCT/IDCT function-pointer table: C reference implementations
 * first, then override entries with SIMD versions according to the cpu
 * capability flags.
 * NOTE(review): this copy appears truncated — the function's braces and
 * the preprocessor guards (#if HIGH_BIT_DEPTH / HAVE_MMX / ARCH_PPC /
 * HAVE_ARMV6 etc.) that originally bracketed the x86 / AltiVec / NEON
 * sections are missing (an orphan "#else // !HIGH_BIT_DEPTH" and
 * "#endif // HIGH_BIT_DEPTH" remain).  Restore the guards from upstream
 * before building. */
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
    /* Portable C implementations — always-valid defaults. */
    dctf->sub4x4_dct    = sub4x4_dct;
    dctf->add4x4_idct   = add4x4_idct;
    dctf->sub8x8_dct    = sub8x8_dct;
    dctf->sub8x8_dct_dc = sub8x8_dct_dc;
    dctf->add8x8_idct   = add8x8_idct;
    dctf->add8x8_idct_dc = add8x8_idct_dc;
    dctf->sub16x16_dct    = sub16x16_dct;
    dctf->add16x16_idct   = add16x16_idct;
    dctf->add16x16_idct_dc = add16x16_idct_dc;
    dctf->sub8x8_dct8   = sub8x8_dct8;
    dctf->add8x8_idct8  = add8x8_idct8;
    dctf->sub16x16_dct8  = sub16x16_dct8;
    dctf->add16x16_idct8 = add16x16_idct8;
    dctf->dct4x4dc = dct4x4dc;
    dctf->idct4x4dc = idct4x4dc;
    /* x86 overrides — high-bit-depth branch (guard lost in this copy). */
    if( cpu&X264_CPU_MMX )
        dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
    if( cpu&X264_CPU_SSE2 )
        dctf->add4x4_idct     = x264_add4x4_idct_sse2;
        dctf->dct4x4dc        = x264_dct4x4dc_sse2;
        dctf->idct4x4dc       = x264_idct4x4dc_sse2;
        dctf->add8x8_idct     = x264_add8x8_idct_sse2;
        dctf->add16x16_idct   = x264_add16x16_idct_sse2;
        dctf->add8x8_idct_dc  = x264_add8x8_idct_dc_sse2;
        dctf->add16x16_idct_dc= x264_add16x16_idct_dc_sse2;
#else // !HIGH_BIT_DEPTH
    /* x86 overrides — 8-bit branch. */
    if( cpu&X264_CPU_MMX )
        dctf->sub4x4_dct    = x264_sub4x4_dct_mmx;
        dctf->add4x4_idct   = x264_add4x4_idct_mmx;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
        dctf->dct4x4dc      = x264_dct4x4dc_mmx;
        dctf->idct4x4dc     = x264_idct4x4dc_mmx;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
        dctf->sub8x8_dct    = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct  = x264_sub16x16_dct_mmx;
        dctf->add8x8_idct   = x264_add8x8_idct_mmx;
        dctf->add16x16_idct = x264_add16x16_idct_mmx;
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_mmx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
        dctf->add8x8_idct8  = x264_add8x8_idct8_mmx;
        dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
    if( cpu&X264_CPU_SSE2 )
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
        dctf->add8x8_idct8  = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
        dctf->sub8x8_dct    = x264_sub8x8_dct_sse2;
        dctf->sub16x16_dct  = x264_sub16x16_dct_sse2;
        dctf->add8x8_idct   = x264_add8x8_idct_sse2;
        dctf->add16x16_idct = x264_add16x16_idct_sse2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
    /* SSSE3 versions are skipped on Atom, where they are slower. */
    if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
        dctf->sub4x4_dct    = x264_sub4x4_dct_ssse3;
        dctf->sub8x8_dct    = x264_sub8x8_dct_ssse3;
        dctf->sub16x16_dct  = x264_sub16x16_dct_ssse3;
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_ssse3;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
    if( cpu&X264_CPU_SSE4 )
        dctf->add4x4_idct   = x264_add4x4_idct_sse4;
    /* PowerPC AltiVec overrides. */
    if( cpu&X264_CPU_ALTIVEC )
        dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
        dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
        dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
        dctf->add4x4_idct   = x264_add4x4_idct_altivec;
        dctf->add8x8_idct   = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
        dctf->add8x8_idct8  = x264_add8x8_idct8_altivec;
        dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
    /* ARM NEON overrides. */
    if( cpu&X264_CPU_NEON )
        dctf->sub4x4_dct    = x264_sub4x4_dct_neon;
        dctf->sub8x8_dct    = x264_sub8x8_dct_neon;
        dctf->sub16x16_dct  = x264_sub16x16_dct_neon;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
        dctf->dct4x4dc      = x264_dct4x4dc_neon;
        dctf->idct4x4dc     = x264_idct4x4dc_neon;
        dctf->add4x4_idct   = x264_add4x4_idct_neon;
        dctf->add8x8_idct   = x264_add8x8_idct_neon;
        dctf->add16x16_idct = x264_add16x16_idct_neon;
        dctf->sub8x8_dct8   = x264_sub8x8_dct8_neon;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
        dctf->add8x8_idct8  = x264_add8x8_idct8_neon;
        dctf->add16x16_idct8= x264_add16x16_idct8_neon;
#endif // HIGH_BIT_DEPTH
544 void x264_dct_init_weights( void )
546 for( int j = 0; j < 2; j++ )
548 for( int i = 0; i < 16; i++ )
549 x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
550 for( int i = 0; i < 64; i++ )
551 x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
/* ZIG(i,y,x): copy dct element at row y, column x (column-major 8x8
 * storage) into position i of the scan-ordered output. */
#define ZIG(i,y,x) level[i] = dct[x*8+y];
/* 8x8 frame (progressive) zigzag scan order.
 * NOTE(review): the last line of this macro ended with a stray
 * line-continuation backslash that spliced the following #define into this
 * macro; removed. */
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
/* 8x8 field (interlaced) scan order — biased toward vertical frequencies. */
#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
/* 4x4 frame scan order; position 0 uses ZIGDC so DC can be handled
 * specially (e.g. extracted by the sub_4x4ac variants). */
#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
/* 4x4 field scan order. */
#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
605 static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
610 static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
/* Redefine ZIG for 4x4 blocks (column-major 4x4 storage); in plain scans
 * the DC position is treated like any other coefficient.
 * FIX: ZIG was redefined with a different replacement list without an
 * intervening #undef — a constraint violation per C11 6.10.3p2. */
#undef ZIG
#define ZIG(i,y,x) level[i] = dct[x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)
619 static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
624 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
626 memcpy( level, dct, 2 * sizeof(dctcoef) );
627 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
628 memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
/* Redefine ZIG for the combined subtract-and-scan helpers: compute the
 * residual p_src - p_dst at (x,y) directly into scan position i, and
 * accumulate a nonzero flag in the caller's `nz`.
 * NOTE(review): the #undef, the `nz |=` accumulation, and the closing
 * brace were lost in this copy — restored per upstream x264. */
#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
/* Copy the encoded source into the reconstruction buffer row by row
 * (the sub+scan helpers double as lossless reconstruction).
 * NOTE(review): the #define headers for COPY4x4/COPY8x8 were lost in this
 * copy and are restored; CPPIXEL_X4 (single 4-pixel row copy) must be
 * defined earlier in the file — its bit-depth-dependent definition also
 * appears to have been lost, restore it from upstream. */
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
654 static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
662 static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
/* ZIGDC for the AC-only variants: route the DC residual to *dc and zero
 * scan position 0 so level[] carries AC coefficients only.
 * NOTE(review): the #undef, the level[0] = 0 store, and the closing brace
 * were lost in this copy — restored per upstream x264. */
#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}
678 static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
686 static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
694 static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
701 static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
712 static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
714 for( int i = 0; i < 4; i++ )
717 for( int j = 0; j < 16; j++ )
720 dst[i*16+j] = src[i+j*4];
722 nnz[(i&1) + (i>>1)*8] = !!nz;
/* Fill the zigzag/scan function-pointer table, choosing field or frame
 * scans by b_interlaced, then override with SIMD versions per cpu flags.
 * NOTE(review): this copy appears truncated — the function's braces, the
 * `if( b_interlaced ) { ... } else { ... }` branch structure implied by the
 * duplicate field/frame assignment groups, and the #if HAVE_MMX /
 * HIGH_BIT_DEPTH / ARCH guards are missing (orphan "#endif" comments
 * remain).  Restore from upstream before building. */
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
    /* Field (interlaced) scan variants. */
    pf->scan_8x8   = zigzag_scan_8x8_field;
    pf->scan_4x4   = zigzag_scan_4x4_field;
    pf->sub_8x8    = zigzag_sub_8x8_field;
    pf->sub_4x4    = zigzag_sub_4x4_field;
    pf->sub_4x4ac  = zigzag_sub_4x4ac_field;
    if( cpu&X264_CPU_MMXEXT )
        pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
        pf->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
    if( cpu&X264_CPU_SSSE3 )
        pf->sub_4x4  = x264_zigzag_sub_4x4_field_ssse3;
        pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
    if( cpu&X264_CPU_ALTIVEC )
        pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
#endif // !HIGH_BIT_DEPTH
    /* Frame (progressive) scan variants. */
    pf->scan_8x8   = zigzag_scan_8x8_frame;
    pf->scan_4x4   = zigzag_scan_4x4_frame;
    pf->sub_8x8    = zigzag_sub_8x8_frame;
    pf->sub_4x4    = zigzag_sub_4x4_frame;
    pf->sub_4x4ac  = zigzag_sub_4x4ac_frame;
    if( cpu&X264_CPU_MMX )
        pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
    if( cpu&X264_CPU_MMXEXT )
        pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
    if( cpu&X264_CPU_SSE2_IS_FAST )
        pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    if( cpu&X264_CPU_SSSE3 )
        pf->sub_4x4  = x264_zigzag_sub_4x4_frame_ssse3;
        pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
        pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
            pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
    if( cpu&X264_CPU_ALTIVEC )
        pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
    if( cpu&X264_CPU_NEON )
        pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
#endif // !HIGH_BIT_DEPTH
    /* CAVLC interleave helper (independent of interlacing). */
    pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
    if( cpu&X264_CPU_MMX )
        pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
        pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
#endif // !HIGH_BIT_DEPTH