/*****************************************************************************
 * dct.c: h264 encoder library
 *****************************************************************************
 * Copyright (C) 2003-2008 x264 project
 *
 * Authors: Loren Merritt <lorenm@u.washington.edu>
 *          Laurent Aimar <fenrir@via.ecp.fr>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *****************************************************************************/
/* Zigzag-ordered copies of the dct weight tables, one set per scan type
 * (frame/field).  Filled in by x264_dct_init_weights(). */
int x264_dct4_weight2_zigzag[2][16];
int x264_dct8_weight2_zigzag[2][64];
/* In-place 4x4 Hadamard transform of the luma DC coefficients.
 * First pass operates on rows writing columns of tmp, second pass
 * finishes the transform with a +1 rounding and >>1 normalization. */
static void dct4x4dc( int16_t d[16] )
{
    int16_t tmp[16];
    int s01, s23;
    int d01, d23;
    int i;

    for( i = 0; i < 4; i++ )
    {
        s01 = d[i*4+0] + d[i*4+1];
        d01 = d[i*4+0] - d[i*4+1];
        s23 = d[i*4+2] + d[i*4+3];
        d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }

    for( i = 0; i < 4; i++ )
    {
        s01 = tmp[i*4+0] + tmp[i*4+1];
        d01 = tmp[i*4+0] - tmp[i*4+1];
        s23 = tmp[i*4+2] + tmp[i*4+3];
        d23 = tmp[i*4+2] - tmp[i*4+3];

        /* +1 then >>1: rounding division by 2 (overall scale of the 2D transform) */
        d[i*4+0] = ( s01 + s23 + 1 ) >> 1;
        d[i*4+1] = ( s01 - s23 + 1 ) >> 1;
        d[i*4+2] = ( d01 - d23 + 1 ) >> 1;
        d[i*4+3] = ( d01 + d23 + 1 ) >> 1;
    }
}
/* In-place inverse 4x4 Hadamard transform of the luma DC coefficients.
 * Same butterfly as the forward transform but without the final rounding
 * shift.  The degraded listing dropped the d[i*4+0] store; restored. */
static void idct4x4dc( int16_t d[16] )
{
    int16_t tmp[16];
    int s01, s23;
    int d01, d23;
    int i;

    for( i = 0; i < 4; i++ )
    {
        s01 = d[i*4+0] + d[i*4+1];
        d01 = d[i*4+0] - d[i*4+1];
        s23 = d[i*4+2] + d[i*4+3];
        d23 = d[i*4+2] - d[i*4+3];

        tmp[0*4+i] = s01 + s23;
        tmp[1*4+i] = s01 - s23;
        tmp[2*4+i] = d01 - d23;
        tmp[3*4+i] = d01 + d23;
    }

    for( i = 0; i < 4; i++ )
    {
        s01 = tmp[i*4+0] + tmp[i*4+1];
        d01 = tmp[i*4+0] - tmp[i*4+1];
        s23 = tmp[i*4+2] + tmp[i*4+3];
        d23 = tmp[i*4+2] - tmp[i*4+3];

        d[i*4+0] = s01 + s23;
        d[i*4+1] = s01 - s23;
        d[i*4+2] = d01 - d23;
        d[i*4+3] = d01 + d23;
    }
}
/* Compute the i_size x i_size difference block diff = pix1 - pix2.
 * pix1/pix2 are addressed with their own strides i_pix1/i_pix2;
 * diff is stored densely (stride == i_size). */
static inline void pixel_sub_wxh( int16_t *diff, int i_size,
                                  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int x, y;
    for( y = 0; y < i_size; y++ )
    {
        for( x = 0; x < i_size; x++ )
            diff[x + y*i_size] = pix1[x] - pix2[x];
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}
121 static void sub4x4_dct( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 )
127 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
129 for( i = 0; i < 4; i++ )
131 const int s03 = d[i*4+0] + d[i*4+3];
132 const int s12 = d[i*4+1] + d[i*4+2];
133 const int d03 = d[i*4+0] - d[i*4+3];
134 const int d12 = d[i*4+1] - d[i*4+2];
136 tmp[0*4+i] = s03 + s12;
137 tmp[1*4+i] = 2*d03 + d12;
138 tmp[2*4+i] = s03 - s12;
139 tmp[3*4+i] = d03 - 2*d12;
142 for( i = 0; i < 4; i++ )
144 const int s03 = tmp[i*4+0] + tmp[i*4+3];
145 const int s12 = tmp[i*4+1] + tmp[i*4+2];
146 const int d03 = tmp[i*4+0] - tmp[i*4+3];
147 const int d12 = tmp[i*4+1] - tmp[i*4+2];
149 dct[i*4+0] = s03 + s12;
150 dct[i*4+1] = 2*d03 + d12;
151 dct[i*4+2] = s03 - s12;
152 dct[i*4+3] = d03 - 2*d12;
156 static void sub8x8_dct( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 )
158 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
159 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
160 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
161 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
164 static void sub16x16_dct( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 )
166 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
167 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
168 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
169 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
172 static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
177 pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
179 sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
180 sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
185 static void sub8x8_dct_dc( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 )
187 dct[0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
188 dct[1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
189 dct[2] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
190 dct[3] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
193 static void add4x4_idct( uint8_t *p_dst, int16_t dct[16] )
200 for( i = 0; i < 4; i++ )
202 const int s02 = dct[0*4+i] + dct[2*4+i];
203 const int d02 = dct[0*4+i] - dct[2*4+i];
204 const int s13 = dct[1*4+i] + (dct[3*4+i]>>1);
205 const int d13 = (dct[1*4+i]>>1) - dct[3*4+i];
207 tmp[i*4+0] = s02 + s13;
208 tmp[i*4+1] = d02 + d13;
209 tmp[i*4+2] = d02 - d13;
210 tmp[i*4+3] = s02 - s13;
213 for( i = 0; i < 4; i++ )
215 const int s02 = tmp[0*4+i] + tmp[2*4+i];
216 const int d02 = tmp[0*4+i] - tmp[2*4+i];
217 const int s13 = tmp[1*4+i] + (tmp[3*4+i]>>1);
218 const int d13 = (tmp[1*4+i]>>1) - tmp[3*4+i];
220 d[0*4+i] = ( s02 + s13 + 32 ) >> 6;
221 d[1*4+i] = ( d02 + d13 + 32 ) >> 6;
222 d[2*4+i] = ( d02 - d13 + 32 ) >> 6;
223 d[3*4+i] = ( s02 - s13 + 32 ) >> 6;
227 for( y = 0; y < 4; y++ )
229 for( x = 0; x < 4; x++ )
230 p_dst[x] = x264_clip_uint8( p_dst[x] + d[y*4+x] );
231 p_dst += FDEC_STRIDE;
235 static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][16] )
237 add4x4_idct( &p_dst[0], dct[0] );
238 add4x4_idct( &p_dst[4], dct[1] );
239 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
240 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
243 static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][16] )
245 add8x8_idct( &p_dst[0], &dct[0] );
246 add8x8_idct( &p_dst[8], &dct[4] );
247 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
248 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
/****************************************************************************
 * 8x8 transform
 ****************************************************************************/
/* One-dimensional 8-point forward DCT butterfly.  SRC(x)/DST(x) must be
 * defined by the caller to address the input/output vector.  The listing
 * dropped the #define head and the DST(0)/DST(4) stores; restored. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
282 static void sub8x8_dct8( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 )
287 pixel_sub_wxh( tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
289 #define SRC(x) tmp[x*8+i]
290 #define DST(x) tmp[x*8+i]
291 for( i = 0; i < 8; i++ )
296 #define SRC(x) tmp[i*8+x]
297 #define DST(x) dct[x*8+i]
298 for( i = 0; i < 8; i++ )
304 static void sub16x16_dct8( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
306 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
307 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
308 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
309 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* One-dimensional 8-point inverse DCT butterfly.  SRC(x) reads the input;
 * DST(x,rhs) stores the output (two-argument form so the caller can fold
 * in clipping/shifting).  The listing dropped the #define head and all
 * DST stores; restored. */
#define IDCT8_1D {\
    const int a0 =  SRC(0) + SRC(4);\
    const int a2 =  SRC(0) - SRC(4);\
    const int a4 = (SRC(2)>>1) - SRC(6);\
    const int a6 = (SRC(6)>>1) + SRC(2);\
    const int b0 = a0 + a6;\
    const int b2 = a2 + a4;\
    const int b4 = a2 - a4;\
    const int b6 = a0 - a6;\
    const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    const int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    const int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    const int b1 = (a7>>2) + a1;\
    const int b3 =  a3 + (a5>>2);\
    const int b5 = (a3>>2) - a5;\
    const int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
339 static void add8x8_idct8( uint8_t *dst, int16_t dct[64] )
343 dct[0] += 32; // rounding for the >>6 at the end
345 #define SRC(x) dct[x*8+i]
346 #define DST(x,rhs) dct[x*8+i] = (rhs)
347 for( i = 0; i < 8; i++ )
352 #define SRC(x) dct[i*8+x]
353 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
354 for( i = 0; i < 8; i++ )
360 static void add16x16_idct8( uint8_t *dst, int16_t dct[4][64] )
362 add8x8_idct8( &dst[0], dct[0] );
363 add8x8_idct8( &dst[8], dct[1] );
364 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
365 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
368 static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
372 for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
374 p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
375 p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
376 p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
377 p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
381 static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[4] )
383 add4x4_idct_dc( &p_dst[0], dct[0] );
384 add4x4_idct_dc( &p_dst[4], dct[1] );
385 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[2] );
386 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[3] );
389 static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[16] )
392 for( i = 0; i < 4; i++, dct += 4, p_dst += 4*FDEC_STRIDE )
394 add4x4_idct_dc( &p_dst[ 0], dct[0] );
395 add4x4_idct_dc( &p_dst[ 4], dct[1] );
396 add4x4_idct_dc( &p_dst[ 8], dct[2] );
397 add4x4_idct_dc( &p_dst[12], dct[3] );
/****************************************************************************
 * x264_dct_init: fill the function-pointer table with C or SIMD versions
 ****************************************************************************/
405 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
407 dctf->sub4x4_dct = sub4x4_dct;
408 dctf->add4x4_idct = add4x4_idct;
410 dctf->sub8x8_dct = sub8x8_dct;
411 dctf->sub8x8_dct_dc = sub8x8_dct_dc;
412 dctf->add8x8_idct = add8x8_idct;
413 dctf->add8x8_idct_dc = add8x8_idct_dc;
415 dctf->sub16x16_dct = sub16x16_dct;
416 dctf->add16x16_idct = add16x16_idct;
417 dctf->add16x16_idct_dc = add16x16_idct_dc;
419 dctf->sub8x8_dct8 = sub8x8_dct8;
420 dctf->add8x8_idct8 = add8x8_idct8;
422 dctf->sub16x16_dct8 = sub16x16_dct8;
423 dctf->add16x16_idct8 = add16x16_idct8;
425 dctf->dct4x4dc = dct4x4dc;
426 dctf->idct4x4dc = idct4x4dc;
429 if( cpu&X264_CPU_MMX )
431 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
432 dctf->add4x4_idct = x264_add4x4_idct_mmx;
433 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
434 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
435 dctf->dct4x4dc = x264_dct4x4dc_mmx;
436 dctf->idct4x4dc = x264_idct4x4dc_mmx;
437 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
440 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
441 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
442 dctf->add8x8_idct = x264_add8x8_idct_mmx;
443 dctf->add16x16_idct = x264_add16x16_idct_mmx;
445 dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
446 dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
447 dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
448 dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
452 if( cpu&X264_CPU_SSE2 )
454 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
455 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
456 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
457 dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
458 dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
460 dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
461 dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
462 dctf->add8x8_idct = x264_add8x8_idct_sse2;
463 dctf->add16x16_idct = x264_add16x16_idct_sse2;
464 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
467 if( cpu&X264_CPU_SSSE3 )
469 dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
470 dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
471 dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
472 dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
473 dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
474 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
475 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
478 if( cpu&X264_CPU_SSE4 )
479 dctf->add4x4_idct = x264_add4x4_idct_sse4;
484 if( cpu&X264_CPU_ALTIVEC )
486 dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
487 dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
488 dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
490 dctf->add4x4_idct = x264_add4x4_idct_altivec;
491 dctf->add8x8_idct = x264_add8x8_idct_altivec;
492 dctf->add16x16_idct = x264_add16x16_idct_altivec;
494 dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
495 dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
497 dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
498 dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
503 if( cpu&X264_CPU_NEON )
505 dctf->sub4x4_dct = x264_sub4x4_dct_neon;
506 dctf->sub8x8_dct = x264_sub8x8_dct_neon;
507 dctf->sub16x16_dct = x264_sub16x16_dct_neon;
508 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
509 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
510 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
511 dctf->dct4x4dc = x264_dct4x4dc_neon;
512 dctf->idct4x4dc = x264_idct4x4dc_neon;
514 dctf->add4x4_idct = x264_add4x4_idct_neon;
515 dctf->add8x8_idct = x264_add8x8_idct_neon;
516 dctf->add16x16_idct = x264_add16x16_idct_neon;
518 dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
519 dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
521 dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
522 dctf->add16x16_idct8= x264_add16x16_idct8_neon;
527 void x264_dct_init_weights( void )
532 for( i=0; i<16; i++ )
533 x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
534 for( i=0; i<64; i++ )
535 x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
/* ZIG(i,y,x): copy coefficient at (row y, column x) of an 8x8 block into
 * zigzag position i.  ZIGZAG8_FRAME enumerates the progressive-scan order. */
#define ZIG(i,y,x) level[i] = dct[x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
/* Interlaced (field) scan order for an 8x8 block; same ZIG contract. */
#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
/* Progressive (frame) scan order for a 4x4 block; position 0 goes
 * through ZIGDC so sub-variants can special-case the DC coefficient. */
#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
/* Interlaced (field) scan order for a 4x4 block. */
#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
589 static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[64] )
594 static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[64] )
/* Redefine ZIG for 4x4 blocks (stride 4 instead of 8); #undef the 8x8
 * version first to avoid a redefinition warning. */
#undef ZIG
#define ZIG(i,y,x) level[i] = dct[x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)
603 static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[16] )
608 static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
611 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
612 CP32( level+6, dct+6 );
613 CP64( level+8, dct+8 );
614 CP64( level+12, dct+12 );
/* Redefine ZIG for the combined subtract+scan functions: compute the
 * residual p_src - p_dst at (y,x), store it at zigzag position i, and
 * accumulate a nonzero flag.  The nz accumulation and closing brace were
 * dropped in extraction; restored from upstream. */
#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
/* Copy the 4x4 encoded block into the reconstruction buffer
 * (used after subtract+scan, since the residual becomes zero). */
#define COPY4x4\
    CP32( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CP32( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CP32( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CP32( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
/* Copy the 8x8 encoded block into the reconstruction buffer. */
#define COPY8x8\
    CP64( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
    CP64( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
    CP64( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
    CP64( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );\
    CP64( p_dst+4*FDEC_STRIDE, p_src+4*FENC_STRIDE );\
    CP64( p_dst+5*FDEC_STRIDE, p_src+5*FENC_STRIDE );\
    CP64( p_dst+6*FDEC_STRIDE, p_src+6*FENC_STRIDE );\
    CP64( p_dst+7*FDEC_STRIDE, p_src+7*FENC_STRIDE );
639 static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
647 static int zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
/* AC-only variant: the DC residual is routed to *dc and level[0] is
 * zeroed so only AC coefficients remain in level[].
 * NOTE(review): level[0] = 0 and the closing brace reconstructed from
 * upstream — confirm. */
#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}
663 static int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
671 static int zigzag_sub_4x4ac_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
679 static int zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
686 static int zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
/* Deinterleave an 8x8 block's coefficients into four 4x4 groups for CAVLC
 * coding: dst[i*16+j] = src[i+j*4], and record a nonzero flag per group in
 * nnz (at the 4x4-block positions 0,1,8,9).  The outer loop and the nz
 * accumulation were dropped in extraction; restored. */
static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
{
    int i, j;
    for( i = 0; i < 4; i++ )
    {
        int nz = 0;
        for( j = 0; j < 16; j++ )
        {
            nz |= src[i+j*4];
            dst[i*16+j] = src[i+j*4];
        }
        nnz[(i&1) + (i>>1)*8] = !!nz;
    }
}
712 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
716 pf->scan_8x8 = zigzag_scan_8x8_field;
717 pf->scan_4x4 = zigzag_scan_4x4_field;
718 pf->sub_8x8 = zigzag_sub_8x8_field;
719 pf->sub_4x4 = zigzag_sub_4x4_field;
720 pf->sub_4x4ac = zigzag_sub_4x4ac_field;
722 if( cpu&X264_CPU_MMXEXT )
723 pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
724 if( cpu&X264_CPU_SSSE3 )
726 pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
727 pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
728 pf->scan_8x8 = x264_zigzag_scan_8x8_field_ssse3;
733 if( cpu&X264_CPU_ALTIVEC )
734 pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
739 pf->scan_8x8 = zigzag_scan_8x8_frame;
740 pf->scan_4x4 = zigzag_scan_4x4_frame;
741 pf->sub_8x8 = zigzag_sub_8x8_frame;
742 pf->sub_4x4 = zigzag_sub_4x4_frame;
743 pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
745 if( cpu&X264_CPU_MMX )
746 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
747 if( cpu&X264_CPU_MMXEXT )
748 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
749 if( cpu&X264_CPU_SSE2_IS_FAST )
750 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
751 if( cpu&X264_CPU_SSSE3 )
753 pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
754 pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
755 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
756 if( cpu&X264_CPU_SHUFFLE_IS_FAST )
757 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
762 if( cpu&X264_CPU_ALTIVEC )
763 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
766 if( cpu&X264_CPU_NEON )
767 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
771 pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
773 if( cpu&X264_CPU_MMX )
774 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
775 if( cpu&X264_CPU_SHUFFLE_IS_FAST )
776 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;