1 /*****************************************************************************
2 * dct.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 *****************************************************************************/
/* Zigzag-ordered copies of the squared dct weight tables, filled in at
 * runtime by x264_dct_init_weights(). Index [j] selects the scan order
 * (presumably [0]=frame, [1]=field, matching x264_zigzag_scan4/8 --
 * confirm against the table declarations). */
int x264_dct4_weight2_zigzag[2][16];
int x264_dct8_weight2_zigzag[2][64];
/* In-place 4x4 Hadamard transform of the luma DC coefficients
 * (two separable 1-D butterfly passes; the second pass adds the
 * (x+1)>>1 rounding). */
static void dct4x4dc( int16_t d[16] )
{
    int tmp[16];

    /* Pass 1: 1-D butterflies on each row of d, written transposed into tmp. */
    for( int row = 0; row < 4; row++ )
    {
        int sum01 = d[row*4+0] + d[row*4+1];
        int dif01 = d[row*4+0] - d[row*4+1];
        int sum23 = d[row*4+2] + d[row*4+3];
        int dif23 = d[row*4+2] - d[row*4+3];

        tmp[0*4+row] = sum01 + sum23;
        tmp[1*4+row] = sum01 - sum23;
        tmp[2*4+row] = dif01 - dif23;
        tmp[3*4+row] = dif01 + dif23;
    }

    /* Pass 2: same butterflies on the transposed data, with rounding, back into d. */
    for( int row = 0; row < 4; row++ )
    {
        int sum01 = tmp[row*4+0] + tmp[row*4+1];
        int dif01 = tmp[row*4+0] - tmp[row*4+1];
        int sum23 = tmp[row*4+2] + tmp[row*4+3];
        int dif23 = tmp[row*4+2] - tmp[row*4+3];

        d[row*4+0] = ( sum01 + sum23 + 1 ) >> 1;
        d[row*4+1] = ( sum01 - sum23 + 1 ) >> 1;
        d[row*4+2] = ( dif01 - dif23 + 1 ) >> 1;
        d[row*4+3] = ( dif01 + dif23 + 1 ) >> 1;
    }
}
/* In-place 4x4 inverse Hadamard transform of the DC coefficients.
 * Same two-pass butterfly structure as dct4x4dc above.
 * NOTE(review): the second pass's final stores into d[] are not visible in
 * this chunk -- presumably they mirror the first pass without the >>1
 * rounding; confirm against the full source. */
static void idct4x4dc( int16_t d[16] )
    /* Pass 1: butterflies on each row, written transposed into tmp. */
    for( int i = 0; i < 4; i++ )
        int s01 = d[i*4+0] + d[i*4+1];
        int d01 = d[i*4+0] - d[i*4+1];
        int s23 = d[i*4+2] + d[i*4+3];
        int d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    /* Pass 2: butterflies on the transposed data. */
    for( int i = 0; i < 4; i++ )
        int s01 = tmp[i*4+0] + tmp[i*4+1];
        int d01 = tmp[i*4+0] - tmp[i*4+1];
        int s23 = tmp[i*4+2] + tmp[i*4+3];
        int d23 = tmp[i*4+2] - tmp[i*4+3];
/* Compute an i_size x i_size block of pixel differences (pix1 - pix2) into
 * diff, stored densely with i_size entries per row. i_pix1/i_pix2 are the
 * strides of the two pixel planes.
 * NOTE(review): the per-row advance of pix1/pix2 by their strides is not
 * visible in this chunk -- as shown here every y would re-read row 0;
 * confirm the stride increments against the full source. */
static inline void pixel_sub_wxh( int16_t *diff, int i_size,
                                  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
    for( int y = 0; y < i_size; y++ )
        for( int x = 0; x < i_size; x++ )
            diff[x + y*i_size] = pix1[x] - pix2[x];
112 static void sub4x4_dct( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
117 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
119 for( int i = 0; i < 4; i++ )
121 int s03 = d[i*4+0] + d[i*4+3];
122 int s12 = d[i*4+1] + d[i*4+2];
123 int d03 = d[i*4+0] - d[i*4+3];
124 int d12 = d[i*4+1] - d[i*4+2];
126 tmp[0*4+i] = s03 + s12;
127 tmp[1*4+i] = 2*d03 + d12;
128 tmp[2*4+i] = s03 - s12;
129 tmp[3*4+i] = d03 - 2*d12;
132 for( int i = 0; i < 4; i++ )
134 int s03 = tmp[i*4+0] + tmp[i*4+3];
135 int s12 = tmp[i*4+1] + tmp[i*4+2];
136 int d03 = tmp[i*4+0] - tmp[i*4+3];
137 int d12 = tmp[i*4+1] - tmp[i*4+2];
139 dct[i*4+0] = s03 + s12;
140 dct[i*4+1] = 2*d03 + d12;
141 dct[i*4+2] = s03 - s12;
142 dct[i*4+3] = d03 - 2*d12;
146 static void sub8x8_dct( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
148 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
149 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
150 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
151 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
/* Forward DCT of a 16x16 residual as four 8x8 quadrants (each itself four
 * 4x4 sub-blocks): dct[0..3]=top-left, [4..7]=top-right, [8..11]=bottom-left,
 * [12..15]=bottom-right. */
static void sub16x16_dct( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
    sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
    sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
    sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* Sum of all 16 residual samples of a 4x4 block (its unnormalized DC term).
 * NOTE(review): the declarations of d[16]/sum and the return statement are
 * not visible in this chunk -- presumably "return sum;"; confirm. */
static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
    pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
    /* Accumulate the 16 differences. */
    sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
    sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
/* DC-only forward transform of an 8x8 residual: one DC per 4x4 sub-block,
 * followed by a 2x2 Hadamard of the four DCs.
 * NOTE(review): the final stores of d0..d3 combinations back into dct[] are
 * not visible in this chunk -- confirm against the full source. */
static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
    dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
    dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
    dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
    dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );

    /* 2x2 DC transform */
    int d0 = dct[0] + dct[1];
    int d1 = dct[2] + dct[3];
    int d2 = dct[0] - dct[1];
    int d3 = dct[2] - dct[3];
/* Inverse 4x4 integer DCT of dct, with +32 >>6 rounding, added onto the
 * reconstructed pixels at p_dst (FDEC_STRIDE-strided) with clipping to
 * [0,255] via x264_clip_uint8.
 * NOTE(review): the declarations of the locals d[16]/tmp[16] are not visible
 * in this chunk. */
static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
    /* Horizontal pass: 1-D inverse DCT on each column of dct, into tmp. */
    for( int i = 0; i < 4; i++ )
        int s02 = dct[0*4+i] + dct[2*4+i];
        int d02 = dct[0*4+i] - dct[2*4+i];
        int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
        int d13 = (dct[1*4+i]>>1) - dct[3*4+i];

        tmp[i*4+0] = s02 + s13;
        tmp[i*4+1] = d02 + d13;
        tmp[i*4+2] = d02 - d13;
        tmp[i*4+3] = s02 - s13;
    /* Vertical pass with final rounding (+32, >>6) into the residual d. */
    for( int i = 0; i < 4; i++ )
        int s02 = tmp[0*4+i] + tmp[2*4+i];
        int d02 = tmp[0*4+i] - tmp[2*4+i];
        int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
        int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];

        d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
        d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
        d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
        d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
    /* Add the reconstructed residual onto the prediction, clipped to 8 bits. */
    for( int y = 0; y < 4; y++ )
        for( int x = 0; x < 4; x++ )
            p_dst[x] = x264_clip_uint8( p_dst[x] + d[y*4+x] );
        p_dst += FDEC_STRIDE;
233 static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][16] )
235 add4x4_idct( &p_dst[0], dct[0] );
236 add4x4_idct( &p_dst[4], dct[1] );
237 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
238 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
/* Inverse-transform-and-add of a 16x16 block as four 8x8 quadrants
 * (same dct[] layout as sub16x16_dct). */
static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][16] )
    add8x8_idct( &p_dst[0], &dct[0] );
    add8x8_idct( &p_dst[8], &dct[4] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
    add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
/****************************************************************************
 * 8x8 transform
 ****************************************************************************/
/* Tail of the DCT8_1D 8-point 1-D DCT macro. SRC()/DST() are (re)defined by
 * each user to select row-wise or column-wise access.
 * NOTE(review): the macro header and its even-half terms (a0..a3 and the
 * DST(0)/DST(4) stores) are not visible in this chunk -- confirm against
 * the full source before editing. */\
int s07 = SRC(0) + SRC(7);\
int s16 = SRC(1) + SRC(6);\
int s25 = SRC(2) + SRC(5);\
int s34 = SRC(3) + SRC(4);\
/* odd-half differences and the a4..a7 rotation terms */\
int d07 = SRC(0) - SRC(7);\
int d16 = SRC(1) - SRC(6);\
int d25 = SRC(2) - SRC(5);\
int d34 = SRC(3) - SRC(4);\
int a4 = d16 + d25 + (d07 + (d07>>1));\
int a5 = d07 - d34 - (d25 + (d25>>1));\
int a6 = d07 + d34 - (d16 + (d16>>1));\
int a7 = d16 - d25 + (d34 + (d34>>1));\
/* odd-index outputs (a2/a3 come from the elided even half) */\
DST(1) = a4 + (a7>>2);\
DST(2) = a2 + (a3>>1);\
DST(3) = a5 + (a6>>2);\
DST(5) = a6 - (a5>>2);\
DST(6) = (a2>>1) - a3 ;\
DST(7) = (a4>>2) - a7 ;\
/* Forward 8x8 integer DCT of the residual between pix1 (FENC_STRIDE) and
 * pix2 (FDEC_STRIDE), written to dct. Rows first, then columns, both via
 * the DCT8_1D macro with SRC/DST rebound per pass.
 * NOTE(review): the tmp[64] declaration, the DCT8_1D invocations inside the
 * loops and the matching #undef SRC/DST lines are not visible in this
 * chunk. */
static void sub8x8_dct8( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
    pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );

    /* Pass 1: transform each column of tmp in place. */
#define SRC(x) tmp[x*8+i]
#define DST(x) tmp[x*8+i]
    for( int i = 0; i < 8; i++ )

    /* Pass 2: transform each row of tmp into the transposed output dct. */
#define SRC(x) tmp[i*8+x]
#define DST(x) dct[x*8+i]
    for( int i = 0; i < 8; i++ )
/* Forward 8x8 DCT of a 16x16 residual as four 8x8 quadrants:
 * dct[0]=top-left, [1]=top-right, [2]=bottom-left, [3]=bottom-right. */
static void sub16x16_dct8( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
    sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
    sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
    sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
    sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* Fragments of the IDCT8_1D 8-point 1-D inverse-DCT macro: even-half terms
 * (a0..a6) and odd-half terms (a1..a7 combined into b1..b7).
 * NOTE(review): the macro header, the remaining even-half combinations and
 * the final DST() stores are not visible in this chunk -- confirm against
 * the full source before editing. */\
int a0 = SRC(0) + SRC(4);\
int a2 = SRC(0) - SRC(4);\
int a4 = (SRC(2)>>1) - SRC(6);\
int a6 = (SRC(6)>>1) + SRC(2);\
/* odd half, built from the odd input coefficients */\
int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
int a3 = SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
int a7 = SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
int b1 = (a7>>2) + a1;\
int b3 = a3 + (a5>>2);\
int b5 = (a3>>2) - a5;\
int b7 = a7 - (a1>>2);\
/* Inverse 8x8 integer DCT of dct, added onto the reconstruction at dst
 * (FDEC_STRIDE-strided) with clipping. The rounding bias is folded into
 * dct[0] up front so the final >>6 rounds correctly.
 * NOTE(review): the IDCT8_1D invocations inside the loops and the #undef
 * SRC/DST lines are not visible in this chunk. */
static void add8x8_idct8( uint8_t *dst, int16_t dct[64] )
    dct[0] += 32; // rounding for the >>6 at the end

    /* Pass 1: inverse transform each column of dct in place. */
#define SRC(x) dct[x*8+i]
#define DST(x,rhs) dct[x*8+i] = (rhs)
    for( int i = 0; i < 8; i++ )

    /* Pass 2: inverse transform each row, scale by >>6, add to dst clipped. */
#define SRC(x) dct[i*8+x]
#define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
    for( int i = 0; i < 8; i++ )
/* Inverse 8x8 transform-and-add of a 16x16 block as four 8x8 quadrants. */
static void add16x16_idct8( uint8_t *dst, int16_t dct[4][64] )
    add8x8_idct8( &dst[0], dct[0] );
    add8x8_idct8( &dst[8], dct[1] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
    add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
/* Add a single DC value to every pixel of a 4x4 block in place, clipped
 * to [0,255].
 * NOTE(review): a rounding/scale step on dc (e.g. (dc+32)>>6) is not visible
 * between the signature and the loop in this chunk -- confirm whether it was
 * elided. Style nit: "static inline void" is the conventional specifier
 * order ("static void inline" is legal but unusual). */
static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
    for( int i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
        p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
        p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
        p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
        p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
/* DC-only reconstruction of an 8x8 block: one DC per 4x4 sub-block,
 * 2x2 layout as elsewhere in this file. */
static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[4] )
    add4x4_idct_dc( &p_dst[0], dct[0] );
    add4x4_idct_dc( &p_dst[4], dct[1] );
    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
    add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
/* DC-only reconstruction of a 16x16 block: 16 DCs consumed in raster order,
 * four per row of 4x4 sub-blocks. */
static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[16] )
    for( int i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
        add4x4_idct_dc( &p_dst[ 0], dct[0] );
        add4x4_idct_dc( &p_dst[ 4], dct[1] );
        add4x4_idct_dc( &p_dst[ 8], dct[2] );
        add4x4_idct_dc( &p_dst[12], dct[3] );
395 /****************************************************************************
397 ****************************************************************************/
/* Fill the DCT/IDCT function-pointer table: plain C implementations first,
 * then override entries with platform-specific versions for each CPU
 * capability flag that is set (later, more specific sets win).
 * NOTE(review): the braces of the if-blocks and the compile-time guards
 * (#ifdef HAVE_MMX / ARCH_PPC / HAVE_ARMV6 etc.) around the SIMD sections
 * are not visible in this chunk -- confirm against the full source. */
void x264_dct_init( int cpu, x264_dct_function_t *dctf )
    /* scalar C defaults */
    dctf->sub4x4_dct = sub4x4_dct;
    dctf->add4x4_idct = add4x4_idct;

    dctf->sub8x8_dct = sub8x8_dct;
    dctf->sub8x8_dct_dc = sub8x8_dct_dc;
    dctf->add8x8_idct = add8x8_idct;
    dctf->add8x8_idct_dc = add8x8_idct_dc;

    dctf->sub16x16_dct = sub16x16_dct;
    dctf->add16x16_idct = add16x16_idct;
    dctf->add16x16_idct_dc = add16x16_idct_dc;

    dctf->sub8x8_dct8 = sub8x8_dct8;
    dctf->add8x8_idct8 = add8x8_idct8;

    dctf->sub16x16_dct8 = sub16x16_dct8;
    dctf->add16x16_idct8 = add16x16_idct8;

    dctf->dct4x4dc = dct4x4dc;
    dctf->idct4x4dc = idct4x4dc;

    /* x86 MMX overrides (note: the dct_dc routine registered here is the
     * mmxext variant, guarded only by the MMX flag). */
    if( cpu&X264_CPU_MMX )
        dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
        dctf->add4x4_idct = x264_add4x4_idct_mmx;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
        dctf->dct4x4dc = x264_dct4x4dc_mmx;
        dctf->idct4x4dc = x264_idct4x4dc_mmx;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;

        dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
        dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
        dctf->add8x8_idct = x264_add8x8_idct_mmx;
        dctf->add16x16_idct = x264_add16x16_idct_mmx;

        dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
        dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
        dctf->add16x16_idct8= x264_add16x16_idct8_mmx;

    /* SSE2 overrides */
    if( cpu&X264_CPU_SSE2 )
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
        dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
        dctf->add16x16_idct8= x264_add16x16_idct8_sse2;

        dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
        dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
        dctf->add8x8_idct = x264_add8x8_idct_sse2;
        dctf->add16x16_idct = x264_add16x16_idct_sse2;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;

    /* SSSE3 overrides */
    if( cpu&X264_CPU_SSSE3 )
        dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
        dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
        dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
        dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;

    /* SSE4 override */
    if( cpu&X264_CPU_SSE4 )
        dctf->add4x4_idct = x264_add4x4_idct_sse4;

    /* PowerPC AltiVec overrides */
    if( cpu&X264_CPU_ALTIVEC )
        dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
        dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
        dctf->sub16x16_dct = x264_sub16x16_dct_altivec;

        dctf->add4x4_idct = x264_add4x4_idct_altivec;
        dctf->add8x8_idct = x264_add8x8_idct_altivec;
        dctf->add16x16_idct = x264_add16x16_idct_altivec;

        dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;

        dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
        dctf->add16x16_idct8= x264_add16x16_idct8_altivec;

    /* ARM NEON overrides */
    if( cpu&X264_CPU_NEON )
        dctf->sub4x4_dct = x264_sub4x4_dct_neon;
        dctf->sub8x8_dct = x264_sub8x8_dct_neon;
        dctf->sub16x16_dct = x264_sub16x16_dct_neon;
        dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
        dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
        dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
        dctf->dct4x4dc = x264_dct4x4dc_neon;
        dctf->idct4x4dc = x264_idct4x4dc_neon;

        dctf->add4x4_idct = x264_add4x4_idct_neon;
        dctf->add8x8_idct = x264_add8x8_idct_neon;
        dctf->add16x16_idct = x264_add16x16_idct_neon;

        dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
        dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;

        dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
        dctf->add16x16_idct8= x264_add16x16_idct8_neon;
/* Build the zigzag-ordered weight tables declared at the top of this file,
 * by permuting the raster-order weight tabs through the two scan orders
 * (j = 0/1 selects the scan variant in x264_zigzag_scan4/8). */
void x264_dct_init_weights( void )
    for( int j = 0; j < 2; j++ )
        for( int i = 0; i < 16; i++ )
            x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
        for( int i = 0; i < 64; i++ )
            x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
/* ZIG(i,y,x): place coefficient (row y, col x) of an 8x8 column-major dct
 * block at position i of the scanned output. */
#define ZIG(i,y,x) level[i] = dct[x*8+y];
/* Progressive (frame) 8x8 zigzag scan order, per H.264. */
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)\

/* Interlaced (field) 8x8 scan order. */
#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)

/* 4x4 frame scan; position 0 uses ZIGDC so the DC can be handled specially. */
#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

/* 4x4 field scan. */
#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
/* Scan an 8x8 coefficient block into frame-order zigzag.
 * NOTE(review): the function bodies (ZIGZAG8_FRAME / ZIGZAG8_FIELD
 * expansions) are not visible in this chunk. */
static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[64] )

/* Field (interlaced) 8x8 scan variant. */
static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[64] )

/* Rebind ZIG/ZIGDC for 4x4 blocks (column-major source, stride 4). */
#define ZIG(i,y,x) level[i] = dct[x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)

/* 4x4 frame scan (body elided in this chunk). */
static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[16] )

/* 4x4 field scan: only the first six coefficients are actually reordered;
 * the tail of the field scan order is already raster order, so it is
 * block-copied with CP32/CP64.
 * NOTE(review): the copy of coefficients 0-1 is not visible in this chunk. */
static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
    ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
    CP32( level+6, dct+6 );
    CP64( level+8, dct+8 );
    CP64( level+12, dct+12 );
/* ZIG for the zigzag_sub_* functions: while scanning, compute the residual
 * p_src - p_dst for one coefficient position.
 * NOTE(review): the macro's closing lines (nonzero accumulation and the
 * closing brace) are not visible in this chunk. */
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
/* COPY4x4 body: copy the 4x4 source block onto the reconstruction plane */\
    CP32( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CP32( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CP32( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CP32( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );

/* COPY8x8 body: same for an 8x8 block (macro header elided in this chunk) */
    CP64( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CP64( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CP64( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CP64( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CP64( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CP64( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CP64( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CP64( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );

/* Fused subtract + zigzag-scan of a 4x4 block; the int return is presumably
 * a "has nonzero coefficients" flag -- confirm (bodies elided in this
 * chunk). */
static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )

static int zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )

/* ZIGDC: like ZIG but routes the DC coefficient into *dc for the AC-only
 * scans (closing lines elided in this chunk). */
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
/* AC-only and 8x8 variants (bodies elided in this chunk) */\
static int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )

static int zigzag_sub_4x4ac_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )

static int zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )

static int zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
/* Reorder an 8x8 block's coefficients from four interleaved 4x4 scans
 * (src, stride-4 interleave) into four consecutive 4x4 groups (dst), and
 * record a per-4x4 nonzero flag into nnz at the CAVLC layout offsets.
 * NOTE(review): the initialization and accumulation of nz (e.g. nz |=
 * src[...]) are not visible in this chunk -- confirm against the full
 * source. */
static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
    for( int i = 0; i < 4; i++ )
        for( int j = 0; j < 16; j++ )
            dst[i*16+j] = src[i+j*4];
        /* map sub-block i to its (x,y) position in the nnz grid */
        nnz[(i&1) + (i>>1)*8] = !!nz;
/* Fill the zigzag-scan function-pointer table, selecting field (interlaced)
 * or frame (progressive) scan variants, then CPU-specific overrides.
 * NOTE(review): the "if( b_interlaced ) { ... } else { ... }" braces and the
 * #ifdef HAVE_MMX / ARCH_PPC / HAVE_ARMV6 guards are not visible in this
 * chunk; the first group below is presumably the interlaced path and the
 * second the progressive path -- confirm against the full source. */
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
    /* field (interlaced) C defaults */
    pf->scan_8x8 = zigzag_scan_8x8_field;
    pf->scan_4x4 = zigzag_scan_4x4_field;
    pf->sub_8x8 = zigzag_sub_8x8_field;
    pf->sub_4x4 = zigzag_sub_4x4_field;
    pf->sub_4x4ac = zigzag_sub_4x4ac_field;
    if( cpu&X264_CPU_MMXEXT )
        pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
        pf->scan_8x8 = x264_zigzag_scan_8x8_field_mmxext;
    if( cpu&X264_CPU_SSSE3 )
        pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
        pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
    if( cpu&X264_CPU_ALTIVEC )
        pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;

    /* frame (progressive) C defaults */
    pf->scan_8x8 = zigzag_scan_8x8_frame;
    pf->scan_4x4 = zigzag_scan_4x4_frame;
    pf->sub_8x8 = zigzag_sub_8x8_frame;
    pf->sub_4x4 = zigzag_sub_4x4_frame;
    pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
    if( cpu&X264_CPU_MMX )
        pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
    if( cpu&X264_CPU_MMXEXT )
        pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
    if( cpu&X264_CPU_SSE2_IS_FAST )
        pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
    if( cpu&X264_CPU_SSSE3 )
        pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
        pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
        pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
            pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
    if( cpu&X264_CPU_ALTIVEC )
        pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
    if( cpu&X264_CPU_NEON )
        pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;

    /* common to both paths */
    pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
    if( cpu&X264_CPU_MMX )
        pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
    if( cpu&X264_CPU_SHUFFLE_IS_FAST )
        pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;