1 /*****************************************************************************
2 * dct.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 *****************************************************************************/
/* Weight tables reordered into zigzag scan order; filled in by
 * x264_dct_init_weights().  First index: [0] = frame scan, [1] = field scan. */
int x264_dct4_weight2_zigzag[2][16];
int x264_dct8_weight2_zigzag[2][64];
38 static void dct4x4dc( dctcoef d[16] )
42 for( int i = 0; i < 4; i++ )
44 int s01 = d[i*4+0] + d[i*4+1];
45 int d01 = d[i*4+0] - d[i*4+1];
46 int s23 = d[i*4+2] + d[i*4+3];
47 int d23 = d[i*4+2] - d[i*4+3];
49 tmp[0*4+i] = s01 + s23;
50 tmp[1*4+i] = s01 - s23;
51 tmp[2*4+i] = d01 - d23;
52 tmp[3*4+i] = d01 + d23;
55 for( int i = 0; i < 4; i++ )
57 int s01 = tmp[i*4+0] + tmp[i*4+1];
58 int d01 = tmp[i*4+0] - tmp[i*4+1];
59 int s23 = tmp[i*4+2] + tmp[i*4+3];
60 int d23 = tmp[i*4+2] - tmp[i*4+3];
62 d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
63 d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
64 d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
65 d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
69 static void idct4x4dc( dctcoef d[16] )
73 for( int i = 0; i < 4; i++ )
75 int s01 = d[i*4+0] + d[i*4+1];
76 int d01 = d[i*4+0] - d[i*4+1];
77 int s23 = d[i*4+2] + d[i*4+3];
78 int d23 = d[i*4+2] - d[i*4+3];
80 tmp[0*4+i] = s01 + s23;
81 tmp[1*4+i] = s01 - s23;
82 tmp[2*4+i] = d01 - d23;
83 tmp[3*4+i] = d01 + d23;
86 for( int i = 0; i < 4; i++ )
88 int s01 = tmp[i*4+0] + tmp[i*4+1];
89 int d01 = tmp[i*4+0] - tmp[i*4+1];
90 int s23 = tmp[i*4+2] + tmp[i*4+3];
91 int d23 = tmp[i*4+2] - tmp[i*4+3];
100 static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
101 pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
103 for( int y = 0; y < i_size; y++ )
105 for( int x = 0; x < i_size; x++ )
106 diff[x + y*i_size] = pix1[x] - pix2[x];
112 static void sub4x4_dct( dctcoef dct[16], pixel *pix1, pixel *pix2 )
117 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
119 for( int i = 0; i < 4; i++ )
121 int s03 = d[i*4+0] + d[i*4+3];
122 int s12 = d[i*4+1] + d[i*4+2];
123 int d03 = d[i*4+0] - d[i*4+3];
124 int d12 = d[i*4+1] - d[i*4+2];
126 tmp[0*4+i] = s03 + s12;
127 tmp[1*4+i] = 2*d03 + d12;
128 tmp[2*4+i] = s03 - s12;
129 tmp[3*4+i] = d03 - 2*d12;
132 for( int i = 0; i < 4; i++ )
134 int s03 = tmp[i*4+0] + tmp[i*4+3];
135 int s12 = tmp[i*4+1] + tmp[i*4+2];
136 int d03 = tmp[i*4+0] - tmp[i*4+3];
137 int d12 = tmp[i*4+1] - tmp[i*4+2];
139 dct[i*4+0] = s03 + s12;
140 dct[i*4+1] = 2*d03 + d12;
141 dct[i*4+2] = s03 - s12;
142 dct[i*4+3] = d03 - 2*d12;
146 static void sub8x8_dct( dctcoef dct[4][16], pixel *pix1, pixel *pix2 )
148 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
149 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
150 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
151 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
154 static void sub16x16_dct( dctcoef dct[16][16], pixel *pix1, pixel *pix2 )
156 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
157 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
158 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
159 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
162 static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
167 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
169 sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
170 sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
175 static void sub8x8_dct_dc( dctcoef dct[4], pixel *pix1, pixel *pix2 )
177 dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
178 dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
179 dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
180 dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
182 /* 2x2 DC transform */
183 int d0 = dct[0] + dct[1];
184 int d1 = dct[2] + dct[3];
185 int d2 = dct[0] - dct[1];
186 int d3 = dct[2] - dct[3];
193 static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
198 for( int i = 0; i < 4; i++ )
200 int s02 = dct[0*4+i] + dct[2*4+i];
201 int d02 = dct[0*4+i] - dct[2*4+i];
202 int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
203 int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
205 tmp[i*4+0] = s02 + s13;
206 tmp[i*4+1] = d02 + d13;
207 tmp[i*4+2] = d02 - d13;
208 tmp[i*4+3] = s02 - s13;
211 for( int i = 0; i < 4; i++ )
213 int s02 = tmp[0*4+i] + tmp[2*4+i];
214 int d02 = tmp[0*4+i] - tmp[2*4+i];
215 int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
216 int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
218 d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
219 d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
220 d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
221 d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
225 for( int y = 0; y < 4; y++ )
227 for( int x = 0; x < 4; x++ )
228 p_dst[x] = x264_clip_pixel( p_dst[x] + d[y*4+x] );
229 p_dst += FDEC_STRIDE;
233 static void add8x8_idct( pixel *p_dst, dctcoef dct[4][16] )
235 add4x4_idct( &p_dst[0], dct[0] );
236 add4x4_idct( &p_dst[4], dct[1] );
237 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
238 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
241 static void add16x16_idct( pixel *p_dst, dctcoef dct[16][16] )
243 add8x8_idct( &p_dst[0], &dct[0] );
244 add8x8_idct( &p_dst[8], &dct[4] );
245 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
246 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
/****************************************************************************
 * 8x8 transform
 ****************************************************************************/
/* One 8-point forward transform butterfly (H.264 High Profile 8x8 DCT).
 * SRC/DST are defined by the caller to select row or column access, so the
 * same macro performs both 1-D passes. */
#define DCT8_1D {\
    int s07 = SRC(0) + SRC(7);\
    int s16 = SRC(1) + SRC(6);\
    int s25 = SRC(2) + SRC(5);\
    int s34 = SRC(3) + SRC(4);\
    int a0 = s07 + s34;\
    int a1 = s16 + s25;\
    int a2 = s07 - s34;\
    int a3 = s16 - s25;\
    int d07 = SRC(0) - SRC(7);\
    int d16 = SRC(1) - SRC(6);\
    int d25 = SRC(2) - SRC(5);\
    int d34 = SRC(3) - SRC(4);\
    int a4 = d16 + d25 + (d07 + (d07>>1));\
    int a5 = d07 - d34 - (d25 + (d25>>1));\
    int a6 = d07 + d34 - (d16 + (d16>>1));\
    int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
280 static void sub8x8_dct8( dctcoef dct[64], pixel *pix1, pixel *pix2 )
284 pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
286 #define SRC(x) tmp[x*8+i]
287 #define DST(x) tmp[x*8+i]
288 for( int i = 0; i < 8; i++ )
293 #define SRC(x) tmp[i*8+x]
294 #define DST(x) dct[x*8+i]
295 for( int i = 0; i < 8; i++ )
301 static void sub16x16_dct8( dctcoef dct[4][64], pixel *pix1, pixel *pix2 )
303 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
304 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
305 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
306 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* One 8-point inverse transform butterfly (H.264 High Profile 8x8 IDCT).
 * SRC(x) reads input x; DST(x,rhs) stores rhs at output x — the two-argument
 * form lets the caller fold clipping/descaling into the store. */
#define IDCT8_1D {\
    int a0 =  SRC(0) + SRC(4);\
    int a2 =  SRC(0) - SRC(4);\
    int a4 = (SRC(2)>>1) - SRC(6);\
    int a6 = (SRC(6)>>1) + SRC(2);\
    int b0 = a0 + a6;\
    int b2 = a2 + a4;\
    int b4 = a2 - a4;\
    int b6 = a0 - a6;\
    int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    int b1 = (a7>>2) + a1;\
    int b3 =  a3 + (a5>>2);\
    int b5 = (a3>>2) - a5;\
    int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
336 static void add8x8_idct8( pixel *dst, dctcoef dct[64] )
338 dct[0] += 32; // rounding for the >>6 at the end
340 #define SRC(x) dct[x*8+i]
341 #define DST(x,rhs) dct[x*8+i] = (rhs)
342 for( int i = 0; i < 8; i++ )
347 #define SRC(x) dct[i*8+x]
348 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_pixel( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
349 for( int i = 0; i < 8; i++ )
355 static void add16x16_idct8( pixel *dst, dctcoef dct[4][64] )
357 add8x8_idct8( &dst[0], dct[0] );
358 add8x8_idct8( &dst[8], dct[1] );
359 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
360 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
363 static void inline add4x4_idct_dc( pixel *p_dst, dctcoef dc )
366 for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
368 p_dst[0] = x264_clip_pixel( p_dst[0] + dc );
369 p_dst[1] = x264_clip_pixel( p_dst[1] + dc );
370 p_dst[2] = x264_clip_pixel( p_dst[2] + dc );
371 p_dst[3] = x264_clip_pixel( p_dst[3] + dc );
375 static void add8x8_idct_dc( pixel *p_dst, dctcoef dct[4] )
377 add4x4_idct_dc( &p_dst[0], dct[0] );
378 add4x4_idct_dc( &p_dst[4], dct[1] );
379 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
380 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
383 static void add16x16_idct_dc( pixel *p_dst, dctcoef dct[16] )
385 for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
387 add4x4_idct_dc( &p_dst[ 0], dct[0] );
388 add4x4_idct_dc( &p_dst[ 4], dct[1] );
389 add4x4_idct_dc( &p_dst[ 8], dct[2] );
390 add4x4_idct_dc( &p_dst[12], dct[3] );
/****************************************************************************
 * x264_dct_init
 ****************************************************************************/
398 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
400 dctf->sub4x4_dct = sub4x4_dct;
401 dctf->add4x4_idct = add4x4_idct;
403 dctf->sub8x8_dct = sub8x8_dct;
404 dctf->sub8x8_dct_dc = sub8x8_dct_dc;
405 dctf->add8x8_idct = add8x8_idct;
406 dctf->add8x8_idct_dc = add8x8_idct_dc;
408 dctf->sub16x16_dct = sub16x16_dct;
409 dctf->add16x16_idct = add16x16_idct;
410 dctf->add16x16_idct_dc = add16x16_idct_dc;
412 dctf->sub8x8_dct8 = sub8x8_dct8;
413 dctf->add8x8_idct8 = add8x8_idct8;
415 dctf->sub16x16_dct8 = sub16x16_dct8;
416 dctf->add16x16_idct8 = add16x16_idct8;
418 dctf->dct4x4dc = dct4x4dc;
419 dctf->idct4x4dc = idct4x4dc;
421 #if !X264_HIGH_BIT_DEPTH
423 if( cpu&X264_CPU_MMX )
425 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
426 dctf->add4x4_idct = x264_add4x4_idct_mmx;
427 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
428 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
429 dctf->dct4x4dc = x264_dct4x4dc_mmx;
430 dctf->idct4x4dc = x264_idct4x4dc_mmx;
431 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
434 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
435 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
436 dctf->add8x8_idct = x264_add8x8_idct_mmx;
437 dctf->add16x16_idct = x264_add16x16_idct_mmx;
439 dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
440 dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
441 dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
442 dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
446 if( cpu&X264_CPU_SSE2 )
448 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
449 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
450 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
451 dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
452 dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
454 dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
455 dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
456 dctf->add8x8_idct = x264_add8x8_idct_sse2;
457 dctf->add16x16_idct = x264_add16x16_idct_sse2;
458 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
461 if( (cpu&X264_CPU_SSSE3) && !(cpu&X264_CPU_SLOW_ATOM) )
463 dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
464 dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
465 dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
466 dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
467 dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
468 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
469 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
472 if( cpu&X264_CPU_SSE4 )
473 dctf->add4x4_idct = x264_add4x4_idct_sse4;
478 if( cpu&X264_CPU_ALTIVEC )
480 dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
481 dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
482 dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
484 dctf->add4x4_idct = x264_add4x4_idct_altivec;
485 dctf->add8x8_idct = x264_add8x8_idct_altivec;
486 dctf->add16x16_idct = x264_add16x16_idct_altivec;
488 dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
489 dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
491 dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
492 dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
497 if( cpu&X264_CPU_NEON )
499 dctf->sub4x4_dct = x264_sub4x4_dct_neon;
500 dctf->sub8x8_dct = x264_sub8x8_dct_neon;
501 dctf->sub16x16_dct = x264_sub16x16_dct_neon;
502 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
503 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
504 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
505 dctf->dct4x4dc = x264_dct4x4dc_neon;
506 dctf->idct4x4dc = x264_idct4x4dc_neon;
508 dctf->add4x4_idct = x264_add4x4_idct_neon;
509 dctf->add8x8_idct = x264_add8x8_idct_neon;
510 dctf->add16x16_idct = x264_add16x16_idct_neon;
512 dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
513 dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
515 dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
516 dctf->add16x16_idct8= x264_add16x16_idct8_neon;
519 #endif // !X264_HIGH_BIT_DEPTH
522 void x264_dct_init_weights( void )
524 for( int j = 0; j < 2; j++ )
526 for( int i = 0; i < 16; i++ )
527 x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
528 for( int i = 0; i < 64; i++ )
529 x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
/* ZIG(i,y,x): place coefficient (y,x) of an 8x8 block at zigzag index i.
 * The dct array is addressed flat as dct[x*8+y]. */
#define ZIG(i,y,x) level[i] = dct[x*8+y];
/* 8x8 zigzag scan order for progressive (frame) macroblocks. */
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\
/* 8x8 zigzag scan order for interlaced (field) macroblocks. */
#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
/* 4x4 zigzag scan orders.  Index 0 goes through ZIGDC so the AC-only
 * (chroma) variants can redefine how the DC position is handled. */
#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
/* 4x4 field scan: vertically biased for interlaced content. */
#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
583 static void zigzag_scan_8x8_frame( dctcoef level[64], dctcoef dct[64] )
588 static void zigzag_scan_8x8_field( dctcoef level[64], dctcoef dct[64] )
/* Switch ZIG to 4x4 addressing for the 4x4 scan functions below.
 * ZIGDC defaults to a plain ZIG; the AC-only variants redefine it later. */
#undef ZIG
#define ZIG(i,y,x) level[i] = dct[x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)
597 static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
602 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
604 memcpy( level, dct, 2 * sizeof(dctcoef) );
605 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
606 memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
/* Fused subtract-and-scan: ZIG now subtracts reconstructed from source
 * pixels at (y,x), stores the residual at zigzag index i, and accumulates
 * a nonzero flag into the caller's nz.  COPY4x4/COPY8x8 then copy the
 * source block into the reconstruction buffer (skip-block reconstruction). */
#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
#define COPY4x4\
    CPPIXEL_X4( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
/* CPPIXEL_X4 copies 4 pixels; assumed provided by a common header — confirm. */
#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
#define COPY8x8\
    CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CPPIXEL_X8( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
632 static int zigzag_sub_4x4_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst )
640 static int zigzag_sub_4x4_field( dctcoef level[16], const pixel *p_src, pixel *p_dst )
/* AC-only variant: extract the DC difference to *dc and zero level[0],
 * leaving only the 15 AC coefficients in the scan. */
#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}
656 static int zigzag_sub_4x4ac_frame( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
664 static int zigzag_sub_4x4ac_field( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc )
672 static int zigzag_sub_8x8_frame( dctcoef level[64], const pixel *p_src, pixel *p_dst )
679 static int zigzag_sub_8x8_field( dctcoef level[64], const pixel *p_src, pixel *p_dst )
690 static void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
692 for( int i = 0; i < 4; i++ )
695 for( int j = 0; j < 16; j++ )
698 dst[i*16+j] = src[i+j*4];
700 nnz[(i&1) + (i>>1)*8] = !!nz;
704 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
708 pf->scan_8x8 = zigzag_scan_8x8_field;
709 pf->scan_4x4 = zigzag_scan_4x4_field;
710 pf->sub_8x8 = zigzag_sub_8x8_field;
711 pf->sub_4x4 = zigzag_sub_4x4_field;
712 pf->sub_4x4ac = zigzag_sub_4x4ac_field;
713 #if !X264_HIGH_BIT_DEPTH
715 if( cpu&X264_CPU_MMXEXT )
717 pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
718 pf->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
720 if( cpu&X264_CPU_SSSE3 )
722 pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
723 pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
728 if( cpu&X264_CPU_ALTIVEC )
729 pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
731 #endif // !X264_HIGH_BIT_DEPTH
735 pf->scan_8x8 = zigzag_scan_8x8_frame;
736 pf->scan_4x4 = zigzag_scan_4x4_frame;
737 pf->sub_8x8 = zigzag_sub_8x8_frame;
738 pf->sub_4x4 = zigzag_sub_4x4_frame;
739 pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
740 #if !X264_HIGH_BIT_DEPTH
742 if( cpu&X264_CPU_MMX )
743 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
744 if( cpu&X264_CPU_MMXEXT )
745 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
746 if( cpu&X264_CPU_SSE2_IS_FAST )
747 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
748 if( cpu&X264_CPU_SSSE3 )
750 pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
751 pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
752 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
753 if( cpu&X264_CPU_SHUFFLE_IS_FAST )
754 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
759 if( cpu&X264_CPU_ALTIVEC )
760 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
763 if( cpu&X264_CPU_NEON )
764 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
766 #endif // !X264_HIGH_BIT_DEPTH
769 pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
770 #if !X264_HIGH_BIT_DEPTH
772 if( cpu&X264_CPU_MMX )
773 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
774 if( cpu&X264_CPU_SHUFFLE_IS_FAST )
775 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
777 #endif // !X264_HIGH_BIT_DEPTH