1 /*****************************************************************************
2 * dct.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 *****************************************************************************/
/* Zigzag-ordered copies of the per-coefficient weight tables, filled in by
 * x264_dct_init_weights().  Index [0] = frame scan, [1] = field scan. */
int x264_dct4_weight2_zigzag[2][16];
int x264_dct8_weight2_zigzag[2][64];

/*
 * XXX For all dct dc : input could be equal to output so ...
 */
/* In-place 2D Hadamard-style transform of the 4x4 DC coefficient block.
 * Rounding (+1 >> 1) is applied on the second pass only. */
static void dct4x4dc( int16_t d[4][4] )
{
    int16_t tmp[4][4];
    int s01, s23;
    int d01, d23;
    int i;

    /* first 1D pass over the rows; result is stored transposed in tmp */
    for( i = 0; i < 4; i++ )
    {
        s01 = d[i][0] + d[i][1];
        d01 = d[i][0] - d[i][1];
        s23 = d[i][2] + d[i][3];
        d23 = d[i][2] - d[i][3];

        tmp[0][i] = s01 + s23;
        tmp[1][i] = s01 - s23;
        tmp[2][i] = d01 - d23;
        tmp[3][i] = d01 + d23;
    }

    /* second 1D pass, with rounding, written back into d */
    for( i = 0; i < 4; i++ )
    {
        s01 = tmp[i][0] + tmp[i][1];
        d01 = tmp[i][0] - tmp[i][1];
        s23 = tmp[i][2] + tmp[i][3];
        d23 = tmp[i][2] - tmp[i][3];

        d[i][0] = ( s01 + s23 + 1 ) >> 1;
        d[i][1] = ( s01 - s23 + 1 ) >> 1;
        d[i][2] = ( d01 - d23 + 1 ) >> 1;
        d[i][3] = ( d01 + d23 + 1 ) >> 1;
    }
}
/* In-place inverse of dct4x4dc: same butterfly structure, no rounding shift. */
static void idct4x4dc( int16_t d[4][4] )
{
    int16_t tmp[4][4];
    int s01, s23;
    int d01, d23;
    int i;

    /* first 1D pass over the rows; result is stored transposed in tmp */
    for( i = 0; i < 4; i++ )
    {
        s01 = d[i][0] + d[i][1];
        d01 = d[i][0] - d[i][1];
        s23 = d[i][2] + d[i][3];
        d23 = d[i][2] - d[i][3];

        tmp[0][i] = s01 + s23;
        tmp[1][i] = s01 - s23;
        tmp[2][i] = d01 - d23;
        tmp[3][i] = d01 + d23;
    }

    /* second 1D pass, written back into d */
    for( i = 0; i < 4; i++ )
    {
        s01 = tmp[i][0] + tmp[i][1];
        d01 = tmp[i][0] - tmp[i][1];
        s23 = tmp[i][2] + tmp[i][3];
        d23 = tmp[i][2] - tmp[i][3];

        d[i][0] = s01 + s23;
        d[i][1] = s01 - s23;
        d[i][2] = d01 - d23;
        d[i][3] = d01 + d23;
    }
}
/* diff = pix1 - pix2 over an i_size x i_size block.
 * i_pix1 / i_pix2 are the row strides of the two pixel planes;
 * diff is stored contiguously with row stride i_size. */
static inline void pixel_sub_wxh( int16_t *diff, int i_size,
                                  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int x, y;
    for( y = 0; y < i_size; y++ )
    {
        for( x = 0; x < i_size; x++ )
            diff[x + y*i_size] = pix1[x] - pix2[x];
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}
122 static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
128 pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
130 for( i = 0; i < 4; i++ )
132 const int s03 = d[i][0] + d[i][3];
133 const int s12 = d[i][1] + d[i][2];
134 const int d03 = d[i][0] - d[i][3];
135 const int d12 = d[i][1] - d[i][2];
137 tmp[0][i] = s03 + s12;
138 tmp[1][i] = 2*d03 + d12;
139 tmp[2][i] = s03 - s12;
140 tmp[3][i] = d03 - 2*d12;
143 for( i = 0; i < 4; i++ )
145 const int s03 = tmp[i][0] + tmp[i][3];
146 const int s12 = tmp[i][1] + tmp[i][2];
147 const int d03 = tmp[i][0] - tmp[i][3];
148 const int d12 = tmp[i][1] - tmp[i][2];
150 dct[i][0] = s03 + s12;
151 dct[i][1] = 2*d03 + d12;
152 dct[i][2] = s03 - s12;
153 dct[i][3] = d03 - 2*d12;
157 static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
159 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
160 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
161 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
162 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
165 static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
167 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
168 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
169 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
170 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
173 static int sub4x4_dct_dc( uint8_t *pix1, uint8_t *pix2 )
178 pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
180 sum += d[0][0] + d[0][1] + d[0][2] + d[0][3];
181 sum += d[1][0] + d[1][1] + d[1][2] + d[1][3];
182 sum += d[2][0] + d[2][1] + d[2][2] + d[2][3];
183 sum += d[3][0] + d[3][1] + d[3][2] + d[3][3];
188 static void sub8x8_dct_dc( int16_t dct[2][2], uint8_t *pix1, uint8_t *pix2 )
190 dct[0][0] = sub4x4_dct_dc( &pix1[0], &pix2[0] );
191 dct[0][1] = sub4x4_dct_dc( &pix1[4], &pix2[4] );
192 dct[1][0] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
193 dct[1][1] = sub4x4_dct_dc( &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
196 static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
203 for( i = 0; i < 4; i++ )
205 const int s02 = dct[0][i] + dct[2][i];
206 const int d02 = dct[0][i] - dct[2][i];
207 const int s13 = dct[1][i] + (dct[3][i]>>1);
208 const int d13 = (dct[1][i]>>1) - dct[3][i];
210 tmp[i][0] = s02 + s13;
211 tmp[i][1] = d02 + d13;
212 tmp[i][2] = d02 - d13;
213 tmp[i][3] = s02 - s13;
216 for( i = 0; i < 4; i++ )
218 const int s02 = tmp[0][i] + tmp[2][i];
219 const int d02 = tmp[0][i] - tmp[2][i];
220 const int s13 = tmp[1][i] + (tmp[3][i]>>1);
221 const int d13 = (tmp[1][i]>>1) - tmp[3][i];
223 d[0][i] = ( s02 + s13 + 32 ) >> 6;
224 d[1][i] = ( d02 + d13 + 32 ) >> 6;
225 d[2][i] = ( d02 - d13 + 32 ) >> 6;
226 d[3][i] = ( s02 - s13 + 32 ) >> 6;
230 for( y = 0; y < 4; y++ )
232 for( x = 0; x < 4; x++ )
234 p_dst[x] = x264_clip_uint8( p_dst[x] + d[y][x] );
236 p_dst += FDEC_STRIDE;
240 static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
242 add4x4_idct( &p_dst[0], dct[0] );
243 add4x4_idct( &p_dst[4], dct[1] );
244 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
245 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
248 static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
250 add8x8_idct( &p_dst[0], &dct[0] );
251 add8x8_idct( &p_dst[8], &dct[4] );
252 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
253 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
/****************************************************************************
 * 8x8 transform:
 ****************************************************************************/

/* One 8-point 1D forward transform butterfly.  SRC()/DST() are #defined by
 * the caller, so the same body serves both the horizontal and the vertical
 * pass (and can write transposed). */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
287 static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
292 pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
294 #define SRC(x) tmp[x][i]
295 #define DST(x) tmp[x][i]
296 for( i = 0; i < 8; i++ )
301 #define SRC(x) tmp[i][x]
302 #define DST(x) dct[x][i]
303 for( i = 0; i < 8; i++ )
309 static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
311 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
312 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
313 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
314 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* One 8-point 1D inverse transform butterfly.  SRC(x) and DST(x,rhs) are
 * #defined by the caller; DST takes the expression so the final pass can
 * fold in the descale and clip. */
#define IDCT8_1D {\
    const int a0 =  SRC(0) + SRC(4);\
    const int a2 =  SRC(0) - SRC(4);\
    const int a4 = (SRC(2)>>1) - SRC(6);\
    const int a6 = (SRC(6)>>1) + SRC(2);\
    const int b0 = a0 + a6;\
    const int b2 = a2 + a4;\
    const int b4 = a2 - a4;\
    const int b6 = a0 - a6;\
    const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    const int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    const int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    const int b1 = (a7>>2) + a1;\
    const int b3 =  a3 + (a5>>2);\
    const int b5 = (a3>>2) - a5;\
    const int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
344 static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
348 dct[0][0] += 32; // rounding for the >>6 at the end
350 #define SRC(x) dct[x][i]
351 #define DST(x,rhs) dct[x][i] = (rhs)
352 for( i = 0; i < 8; i++ )
357 #define SRC(x) dct[i][x]
358 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
359 for( i = 0; i < 8; i++ )
365 static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
367 add8x8_idct8( &dst[0], dct[0] );
368 add8x8_idct8( &dst[8], dct[1] );
369 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
370 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
373 static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
377 for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
379 p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
380 p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
381 p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
382 p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
386 static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
388 add4x4_idct_dc( &p_dst[0], dct[0][0] );
389 add4x4_idct_dc( &p_dst[4], dct[0][1] );
390 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[1][0] );
391 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
394 static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] )
397 for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE )
399 add4x4_idct_dc( &p_dst[ 0], dct[i][0] );
400 add4x4_idct_dc( &p_dst[ 4], dct[i][1] );
401 add4x4_idct_dc( &p_dst[ 8], dct[i][2] );
402 add4x4_idct_dc( &p_dst[12], dct[i][3] );
/****************************************************************************
 * x264_dct_init:
 ****************************************************************************/
410 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
412 dctf->sub4x4_dct = sub4x4_dct;
413 dctf->add4x4_idct = add4x4_idct;
415 dctf->sub8x8_dct = sub8x8_dct;
416 dctf->sub8x8_dct_dc = sub8x8_dct_dc;
417 dctf->add8x8_idct = add8x8_idct;
418 dctf->add8x8_idct_dc = add8x8_idct_dc;
420 dctf->sub16x16_dct = sub16x16_dct;
421 dctf->add16x16_idct = add16x16_idct;
422 dctf->add16x16_idct_dc = add16x16_idct_dc;
424 dctf->sub8x8_dct8 = sub8x8_dct8;
425 dctf->add8x8_idct8 = add8x8_idct8;
427 dctf->sub16x16_dct8 = sub16x16_dct8;
428 dctf->add16x16_idct8 = add16x16_idct8;
430 dctf->dct4x4dc = dct4x4dc;
431 dctf->idct4x4dc = idct4x4dc;
434 if( cpu&X264_CPU_MMX )
436 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
437 dctf->add4x4_idct = x264_add4x4_idct_mmx;
438 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
439 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
440 dctf->dct4x4dc = x264_dct4x4dc_mmx;
441 dctf->idct4x4dc = x264_idct4x4dc_mmx;
442 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_mmxext;
445 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
446 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
447 dctf->add8x8_idct = x264_add8x8_idct_mmx;
448 dctf->add16x16_idct = x264_add16x16_idct_mmx;
450 dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
451 dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
452 dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
453 dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
457 if( cpu&X264_CPU_SSE2 )
459 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
460 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
461 dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_sse2;
462 dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
463 dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
465 dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
466 dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
467 dctf->add8x8_idct = x264_add8x8_idct_sse2;
468 dctf->add16x16_idct = x264_add16x16_idct_sse2;
469 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
472 if( cpu&X264_CPU_SSSE3 )
474 dctf->sub4x4_dct = x264_sub4x4_dct_ssse3;
475 dctf->sub8x8_dct = x264_sub8x8_dct_ssse3;
476 dctf->sub16x16_dct = x264_sub16x16_dct_ssse3;
477 dctf->sub8x8_dct8 = x264_sub8x8_dct8_ssse3;
478 dctf->sub16x16_dct8 = x264_sub16x16_dct8_ssse3;
479 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
480 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
485 if( cpu&X264_CPU_ALTIVEC )
487 dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
488 dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
489 dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
491 dctf->add4x4_idct = x264_add4x4_idct_altivec;
492 dctf->add8x8_idct = x264_add8x8_idct_altivec;
493 dctf->add16x16_idct = x264_add16x16_idct_altivec;
495 dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
496 dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
498 dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
499 dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
504 void x264_dct_init_weights( void )
509 for( i=0; i<16; i++ )
510 x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
511 for( i=0; i<64; i++ )
512 x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
// gcc pessimizes multi-dimensional arrays here, even with constant indices
/* ZIG(i,y,x): level[i] = coefficient at row y, column x of an 8x8 block
 * (dct[0] flattens the 2D array to avoid the pessimization above). */
#define ZIG(i,y,x) level[i] = dct[0][x*8+y];
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
/* 8x8 zig-zag order for field (interlaced) macroblocks. */
#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
/* 4x4 zig-zag order for progressive (frame) macroblocks. */
#define ZIGZAG4_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)

/* 4x4 zig-zag order for field (interlaced) macroblocks. */
#define ZIGZAG4_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
567 static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
572 static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
578 #define ZIG(i,y,x) level[i] = dct[0][x*4+y];
580 static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
585 static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
587 *(uint32_t*)level = *(uint32_t*)dct;
588 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
589 *(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);
590 *(uint64_t*)(level+8) = *(uint64_t*)(*dct+8);
591 *(uint64_t*)(level+12) = *(uint64_t*)(*dct+12);
#undef ZIG
/* Subtract-and-scan variant: the coefficient is the residual between the
 * encode plane (p_src, FENC_STRIDE) and the decode plane (p_dst, FDEC_STRIDE). */
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
}
/* Copy the source block into the decode plane after scanning
 * (wide moves; same aliasing caveat as the field scan above). */
#define COPY4x4\
    *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
    *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
    *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
    *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
#define COPY8x8\
    *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
    *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
    *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
    *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
    *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
    *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
    *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
    *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
615 static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
621 static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
627 static void zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
632 static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
/* Deinterleave an 8x8 block's coefficients into four 16-coef runs for CAVLC:
 * dst[i*16+j] = src[i+j*4].  nnz gets a 0/1 "any nonzero" flag per run,
 * stored at the 8x8-grid positions (i&1) + (i>>1)*8. */
static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
{
    int i, j;
    for( i = 0; i < 4; i++ )
    {
        int nz = 0;
        for( j = 0; j < 16; j++ )
        {
            nz |= src[i+j*4];
            dst[i*16+j] = src[i+j*4];
        }
        nnz[(i&1) + (i>>1)*8] = !!nz;
    }
}
656 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
660 pf->scan_8x8 = zigzag_scan_8x8_field;
661 pf->scan_4x4 = zigzag_scan_4x4_field;
662 pf->sub_8x8 = zigzag_sub_8x8_field;
663 pf->sub_4x4 = zigzag_sub_4x4_field;
665 if( cpu&X264_CPU_MMXEXT )
666 pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
670 if( cpu&X264_CPU_ALTIVEC )
671 pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
676 pf->scan_8x8 = zigzag_scan_8x8_frame;
677 pf->scan_4x4 = zigzag_scan_4x4_frame;
678 pf->sub_8x8 = zigzag_sub_8x8_frame;
679 pf->sub_4x4 = zigzag_sub_4x4_frame;
681 if( cpu&X264_CPU_MMX )
682 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
683 if( cpu&X264_CPU_MMXEXT )
684 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
685 if( cpu&X264_CPU_SSE2_IS_FAST )
686 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
687 if( cpu&X264_CPU_SSSE3 )
689 pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
690 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
691 if( cpu&X264_CPU_SHUFFLE_IS_FAST )
692 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
697 if( cpu&X264_CPU_ALTIVEC )
698 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
702 pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
704 if( cpu&X264_CPU_MMX )
705 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
706 if( cpu&X264_CPU_SHUFFLE_IS_FAST )
707 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;