1 /*****************************************************************************
2 * dct.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 *****************************************************************************/
/* Squared quant weights reordered into zigzag scan order; [0] = frame scan,
 * [1] = field scan. Filled at runtime by x264_dct_init_weights(). */
int x264_dct4_weight2_zigzag[2][16];
int x264_dct8_weight2_zigzag[2][64];
/*
 * XXX For all dct dc: the input may be the same array as the output, so the
 * transforms below must be safe for in-place operation.
 */
/* 2-D Hadamard transform of the 4x4 luma DC block, in place.
 * Safe when input aliases output: rows are staged in a temporary first. */
static void dct4x4dc( int16_t d[4][4] )
{
    int16_t stage[4][4];
    int row, col;

    /* First pass: butterfly each row, writing transposed into stage. */
    for( row = 0; row < 4; row++ )
    {
        int sum01 = d[row][0] + d[row][1];
        int dif01 = d[row][0] - d[row][1];
        int sum23 = d[row][2] + d[row][3];
        int dif23 = d[row][2] - d[row][3];

        stage[0][row] = sum01 + sum23;
        stage[1][row] = sum01 - sum23;
        stage[2][row] = dif01 - dif23;
        stage[3][row] = dif01 + dif23;
    }

    /* Second pass: butterfly the columns (now rows of stage) with +1
     * rounding before the >>1 normalization. */
    for( col = 0; col < 4; col++ )
    {
        int sum01 = stage[col][0] + stage[col][1];
        int dif01 = stage[col][0] - stage[col][1];
        int sum23 = stage[col][2] + stage[col][3];
        int dif23 = stage[col][2] - stage[col][3];

        d[col][0] = ( sum01 + sum23 + 1 ) >> 1;
        d[col][1] = ( sum01 - sum23 + 1 ) >> 1;
        d[col][2] = ( dif01 - dif23 + 1 ) >> 1;
        d[col][3] = ( dif01 + dif23 + 1 ) >> 1;
    }
}
/* Inverse 2-D Hadamard transform of the 4x4 luma DC block, in place.
 * Same butterfly as dct4x4dc but without the rounding/normalization
 * (scaling is folded into dequantization). Safe for in-place use. */
static void idct4x4dc( int16_t d[4][4] )
{
    int16_t tmp[4][4];
    int s01, s23;
    int d01, d23;
    int i;

    for( i = 0; i < 4; i++ )
    {
        s01 = d[i][0] + d[i][1];
        d01 = d[i][0] - d[i][1];
        s23 = d[i][2] + d[i][3];
        d23 = d[i][2] - d[i][3];

        tmp[0][i] = s01 + s23;
        tmp[1][i] = s01 - s23;
        tmp[2][i] = d01 - d23;
        tmp[3][i] = d01 + d23;
    }

    for( i = 0; i < 4; i++ )
    {
        s01 = tmp[i][0] + tmp[i][1];
        d01 = tmp[i][0] - tmp[i][1];
        s23 = tmp[i][2] + tmp[i][3];
        d23 = tmp[i][2] - tmp[i][3];

        /* final stores (restored; they were missing from this copy) */
        d[i][0] = s01 + s23;
        d[i][1] = s01 - s23;
        d[i][2] = d01 - d23;
        d[i][3] = d01 + d23;
    }
}
/* Write the element-wise difference pix1 - pix2 of two i_size x i_size
 * pixel blocks into diff (row-major, stride i_size).
 * i_pix1 / i_pix2 are the strides of the two source blocks in bytes.
 * The per-row pointer advances were missing from this copy and are restored. */
static inline void pixel_sub_wxh( int16_t *diff, int i_size,
                                  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int y, x;
    for( y = 0; y < i_size; y++ )
    {
        for( x = 0; x < i_size; x++ )
        {
            diff[x + y*i_size] = pix1[x] - pix2[x];
        }
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}
125 static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
131 pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
133 for( i = 0; i < 4; i++ )
135 const int s03 = d[i][0] + d[i][3];
136 const int s12 = d[i][1] + d[i][2];
137 const int d03 = d[i][0] - d[i][3];
138 const int d12 = d[i][1] - d[i][2];
140 tmp[0][i] = s03 + s12;
141 tmp[1][i] = 2*d03 + d12;
142 tmp[2][i] = s03 - s12;
143 tmp[3][i] = d03 - 2*d12;
146 for( i = 0; i < 4; i++ )
148 const int s03 = tmp[i][0] + tmp[i][3];
149 const int s12 = tmp[i][1] + tmp[i][2];
150 const int d03 = tmp[i][0] - tmp[i][3];
151 const int d12 = tmp[i][1] - tmp[i][2];
153 dct[i][0] = s03 + s12;
154 dct[i][1] = 2*d03 + d12;
155 dct[i][2] = s03 - s12;
156 dct[i][3] = d03 - 2*d12;
160 static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
162 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
163 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
164 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
165 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
168 static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
170 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
171 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
172 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
173 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
176 static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
181 pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
183 sum += d[0][0] + d[0][1] + d[0][2] + d[0][3];
184 sum += d[1][0] + d[1][1] + d[1][2] + d[1][3];
185 sum += d[2][0] + d[2][1] + d[2][2] + d[2][3];
186 sum += d[3][0] + d[3][1] + d[3][2] + d[3][3];
191 static void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
193 dct[0][0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
194 dct[0][1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
195 dct[1][0] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
196 dct[1][1] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
199 static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
206 for( i = 0; i < 4; i++ )
208 const int s02 = dct[0][i] + dct[2][i];
209 const int d02 = dct[0][i] - dct[2][i];
210 const int s13 = dct[1][i] + (dct[3][i]>>1);
211 const int d13 = (dct[1][i]>>1) - dct[3][i];
213 tmp[i][0] = s02 + s13;
214 tmp[i][1] = d02 + d13;
215 tmp[i][2] = d02 - d13;
216 tmp[i][3] = s02 - s13;
219 for( i = 0; i < 4; i++ )
221 const int s02 = tmp[0][i] + tmp[2][i];
222 const int d02 = tmp[0][i] - tmp[2][i];
223 const int s13 = tmp[1][i] + (tmp[3][i]>>1);
224 const int d13 = (tmp[1][i]>>1) - tmp[3][i];
226 d[0][i] = ( s02 + s13 + 32 ) >> 6;
227 d[1][i] = ( d02 + d13 + 32 ) >> 6;
228 d[2][i] = ( d02 - d13 + 32 ) >> 6;
229 d[3][i] = ( s02 - s13 + 32 ) >> 6;
233 for( y = 0; y < 4; y++ )
235 for( x = 0; x < 4; x++ )
237 p_dst[x] = x264_clip_uint8( p_dst[x] + d[y][x] );
239 p_dst += FDEC_STRIDE;
243 static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
245 add4x4_idct( &p_dst[0], dct[0] );
246 add4x4_idct( &p_dst[4], dct[1] );
247 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
248 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
251 static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
253 add8x8_idct( &p_dst[0], &dct[0] );
254 add8x8_idct( &p_dst[8], &dct[4] );
255 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
256 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/
/* 1-D 8-point forward DCT butterfly over SRC()/DST() accessor macros.
 * The #define line and the DST(0)/DST(4) outputs were missing from this
 * copy and are restored. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
290 static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
295 pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
297 #define SRC(x) tmp[x][i]
298 #define DST(x) tmp[x][i]
299 for( i = 0; i < 8; i++ )
304 #define SRC(x) tmp[i][x]
305 #define DST(x) dct[x][i]
306 for( i = 0; i < 8; i++ )
312 static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
314 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
315 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
316 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
317 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* 1-D 8-point inverse DCT butterfly over SRC()/DST(x,rhs) accessor macros.
 * The #define line and the eight DST(...) outputs were missing from this
 * copy and are restored. */
#define IDCT8_1D {\
    const int a0 =  SRC(0) + SRC(4);\
    const int a2 =  SRC(0) - SRC(4);\
    const int a4 = (SRC(2)>>1) - SRC(6);\
    const int a6 = (SRC(6)>>1) + SRC(2);\
    const int b0 = a0 + a6;\
    const int b2 = a2 + a4;\
    const int b4 = a2 - a4;\
    const int b6 = a0 - a6;\
    const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    const int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    const int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    const int b1 = (a7>>2) + a1;\
    const int b3 =  a3 + (a5>>2);\
    const int b5 = (a3>>2) - a5;\
    const int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
347 static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
351 dct[0][0] += 32; // rounding for the >>6 at the end
353 #define SRC(x) dct[x][i]
354 #define DST(x,rhs) dct[x][i] = (rhs)
355 for( i = 0; i < 8; i++ )
360 #define SRC(x) dct[i][x]
361 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
362 for( i = 0; i < 8; i++ )
368 static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
370 add8x8_idct8( &dst[0], dct[0] );
371 add8x8_idct8( &dst[8], dct[1] );
372 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
373 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
376 static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
380 for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
382 p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
383 p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
384 p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
385 p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
389 static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
391 add4x4_idct_dc( &p_dst[0], dct[0][0] );
392 add4x4_idct_dc( &p_dst[4], dct[0][1] );
393 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[1][0] );
394 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
397 static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] )
400 for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE )
402 add4x4_idct_dc( &p_dst[ 0], dct[i][0] );
403 add4x4_idct_dc( &p_dst[ 4], dct[i][1] );
404 add4x4_idct_dc( &p_dst[ 8], dct[i][2] );
405 add4x4_idct_dc( &p_dst[12], dct[i][3] );
/****************************************************************************
 * x264_dct_init:
 ****************************************************************************/
413 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
415 dctf->sub4x4_dct = sub4x4_dct;
416 dctf->add4x4_idct = add4x4_idct;
418 dctf->sub8x8_dct = sub8x8_dct;
419 dctf->sub8x8_dct_dc = sub8x8_dct_dc;
420 dctf->add8x8_idct = add8x8_idct;
421 dctf->add8x8_idct_dc = add8x8_idct_dc;
423 dctf->sub16x16_dct = sub16x16_dct;
424 dctf->add16x16_idct = add16x16_idct;
425 dctf->add16x16_idct_dc = add16x16_idct_dc;
427 dctf->sub8x8_dct8 = sub8x8_dct8;
428 dctf->add8x8_idct8 = add8x8_idct8;
430 dctf->sub16x16_dct8 = sub16x16_dct8;
431 dctf->add16x16_idct8 = add16x16_idct8;
433 dctf->dct4x4dc = dct4x4dc;
434 dctf->idct4x4dc = idct4x4dc;
437 if( cpu&X264_CPU_MMX )
439 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
440 dctf->add4x4_idct = x264_add4x4_idct_mmx;
441 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
442 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
443 dctf->dct4x4dc = x264_dct4x4dc_mmx;
444 dctf->idct4x4dc = x264_idct4x4dc_mmx;
445 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
448 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
449 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
450 dctf->add8x8_idct = x264_add8x8_idct_mmx;
451 dctf->add16x16_idct = x264_add16x16_idct_mmx;
453 dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
454 dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
455 dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
456 dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
460 if( cpu&X264_CPU_SSE2 )
462 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
463 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
464 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
465 dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
466 dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
468 dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
469 dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
470 dctf->add8x8_idct = x264_add8x8_idct_sse2;
471 dctf->add16x16_idct = x264_add16x16_idct_sse2;
472 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
475 if( cpu&X264_CPU_SSSE3 )
477 dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
478 dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
479 dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
480 dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
481 dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
482 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
483 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
486 if( cpu&X264_CPU_SSE4 )
487 dctf->add4x4_idct = x264_add4x4_idct_sse4;
492 if( cpu&X264_CPU_ALTIVEC )
494 dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
495 dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
496 dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
498 dctf->add4x4_idct = x264_add4x4_idct_altivec;
499 dctf->add8x8_idct = x264_add8x8_idct_altivec;
500 dctf->add16x16_idct = x264_add16x16_idct_altivec;
502 dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
503 dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
505 dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
506 dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
511 if( cpu&X264_CPU_NEON )
513 dctf->sub4x4_dct = x264_sub4x4_dct_neon;
514 dctf->sub8x8_dct = x264_sub8x8_dct_neon;
515 dctf->sub16x16_dct = x264_sub16x16_dct_neon;
516 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_neon;
517 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_neon;
518 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_neon;
519 dctf->dct4x4dc = x264_dct4x4dc_neon;
520 dctf->idct4x4dc = x264_idct4x4dc_neon;
522 dctf->add4x4_idct = x264_add4x4_idct_neon;
523 dctf->add8x8_idct = x264_add8x8_idct_neon;
524 dctf->add16x16_idct = x264_add16x16_idct_neon;
526 dctf->sub8x8_dct8 = x264_sub8x8_dct8_neon;
527 dctf->sub16x16_dct8 = x264_sub16x16_dct8_neon;
529 dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
530 dctf->add16x16_idct8= x264_add16x16_idct8_neon;
535 void x264_dct_init_weights( void )
540 for( i=0; i<16; i++ )
541 x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
542 for( i=0; i<64; i++ )
543 x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
// gcc pessimizes multi-dimensional arrays here, even with constant indices
/* ZIG(i,y,x): store 8x8 coefficient (row y, col x) at scan position i.
 * The table below is the progressive (frame) 8x8 zigzag order.
 * Fix: the last table line must not end in a '\' or it would splice into
 * the following #define. */
#define ZIG(i,y,x) level[i] = dct[0][x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
/* Interlaced (field) 8x8 zigzag scan order; same ZIG accessor as above. */
#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
/* Progressive (frame) 4x4 zigzag scan; position 0 goes through ZIGDC so the
 * DC coefficient can be handled specially by the sub_4x4ac variants. */
#define ZIGZAG4_FRAME\
    ZIGDC( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
/* Interlaced (field) 4x4 zigzag scan; see ZIGZAG4_FRAME for the ZIGDC note. */
#define ZIGZAG4_FIELD\
    ZIGDC( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
598 static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
603 static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
/* Switch the ZIG accessor to 4x4 blocks; #undef first to avoid a macro
 * redefinition warning/error against the 8x8 version above. */
#undef ZIG
#undef ZIGDC
#define ZIG(i,y,x) level[i] = dct[0][x*4+y];
#define ZIGDC(i,y,x) ZIG(i,y,x)
612 static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
617 static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
619 *(uint32_t*)level = *(uint32_t*)dct;
620 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
621 *(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);
622 *(uint64_t*)(level+8) = *(uint64_t*)(*dct+8);
623 *(uint64_t*)(level+12) = *(uint64_t*)(*dct+12);
/* ZIG for the fused subtract+zigzag path: computes the pixel difference
 * directly into level[] and accumulates nonzero status into nz.
 * The nz accumulation and closing brace were missing from this copy. */
#undef ZIG
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
    nz |= level[i];\
}
/* Copy a 4x4 pixel block from p_src (FENC) to p_dst (FDEC) as 32-bit words;
 * the #define opener was missing from this copy. */
#define COPY4x4\
    *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
    *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
    *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
    *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
/* Copy an 8x8 pixel block from p_src (FENC) to p_dst (FDEC) as 64-bit words;
 * the #define opener was missing from this copy. */
#define COPY8x8\
    *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
    *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
    *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
    *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
    *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
    *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
    *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
    *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
648 static int zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
656 static int zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
/* ZIGDC for the AC-only subtract+zigzag path: the DC difference goes to *dc
 * and level[0] is zeroed so only AC coefficients remain in level[].
 * The level[0] clear and closing brace were missing from this copy. */
#undef ZIGDC
#define ZIGDC(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    *dc = p_src[oe] - p_dst[od];\
    level[0] = 0;\
}
672 static int zigzag_sub_4x4ac_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
680 static int zigzag_sub_4x4ac_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst, int16_t *dc )
688 static int zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
695 static int zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
/* Deinterleave an 8x8 block's 64 coefficients into four 16-coefficient
 * groups for CAVLC (dst[i*16+j] = src[i+j*4]) and record per-group nonzero
 * flags in nnz at positions 0,1,8,9.
 * The declarations, outer loop and nz accumulation were missing from this
 * copy and are restored. */
static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
{
    int i, j;
    for( i = 0; i < 4; i++ )
    {
        int nz = 0;
        for( j = 0; j < 16; j++ )
        {
            nz |= src[i+j*4];
            dst[i*16+j] = src[i+j*4];
        }
        nnz[(i&1) + (i>>1)*8] = !!nz;
    }
}
721 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
725 pf->scan_8x8 = zigzag_scan_8x8_field;
726 pf->scan_4x4 = zigzag_scan_4x4_field;
727 pf->sub_8x8 = zigzag_sub_8x8_field;
728 pf->sub_4x4 = zigzag_sub_4x4_field;
729 pf->sub_4x4ac = zigzag_sub_4x4ac_field;
731 if( cpu&X264_CPU_MMXEXT )
732 pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
733 if( cpu&X264_CPU_SSSE3 )
735 pf->sub_4x4 = x264_zigzag_sub_4x4_field_ssse3;
736 pf->sub_4x4ac= x264_zigzag_sub_4x4ac_field_ssse3;
741 if( cpu&X264_CPU_ALTIVEC )
742 pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
747 pf->scan_8x8 = zigzag_scan_8x8_frame;
748 pf->scan_4x4 = zigzag_scan_4x4_frame;
749 pf->sub_8x8 = zigzag_sub_8x8_frame;
750 pf->sub_4x4 = zigzag_sub_4x4_frame;
751 pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
753 if( cpu&X264_CPU_MMX )
754 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
755 if( cpu&X264_CPU_MMXEXT )
756 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
757 if( cpu&X264_CPU_SSE2_IS_FAST )
758 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
759 if( cpu&X264_CPU_SSSE3 )
761 pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
762 pf->sub_4x4ac= x264_zigzag_sub_4x4ac_frame_ssse3;
763 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
764 if( cpu&X264_CPU_SHUFFLE_IS_FAST )
765 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
770 if( cpu&X264_CPU_ALTIVEC )
771 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
774 if( cpu&X264_CPU_NEON )
775 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
779 pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
781 if( cpu&X264_CPU_MMX )
782 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
783 if( cpu&X264_CPU_SHUFFLE_IS_FAST )
784 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;