1 /*****************************************************************************
2 * dct.c: h264 encoder library
3 *****************************************************************************
4 * Copyright (C) 2003-2008 x264 project
6 * Authors: Loren Merritt <lorenm@u.washington.edu>
7 * Laurent Aimar <fenrir@via.ecp.fr>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 *****************************************************************************/
/* Zigzag-ordered copies of the dct weight tables, one per scan order
 * (index j = 0/1); filled in by x264_dct_init_weights(). */
int x264_dct4_weight2_zigzag[2][16];
int x264_dct8_weight2_zigzag[2][64];
/* XXX For all dct dc : input could be equal to output so ... */
/* 4x4 transform of DC coefficients: butterfly add/sub passes with a
 * rounded >>1 on the second pass.
 * Works in place (see XXX above): results go through tmp before being
 * written back to d. */
static void dct4x4dc( int16_t d[4][4] )
{
    int16_t tmp[4][4];
    int s01, s23;
    int d01, d23;
    int i;

    /* First pass over rows of d, storing transposed into tmp. */
    for( i = 0; i < 4; i++ )
    {
        s01 = d[i][0] + d[i][1];
        d01 = d[i][0] - d[i][1];
        s23 = d[i][2] + d[i][3];
        d23 = d[i][2] - d[i][3];

        tmp[0][i] = s01 + s23;
        tmp[1][i] = s01 - s23;
        tmp[2][i] = d01 - d23;
        tmp[3][i] = d01 + d23;
    }

    /* Second pass with rounding ( +1 >> 1 ). */
    for( i = 0; i < 4; i++ )
    {
        s01 = tmp[i][0] + tmp[i][1];
        d01 = tmp[i][0] - tmp[i][1];
        s23 = tmp[i][2] + tmp[i][3];
        d23 = tmp[i][2] - tmp[i][3];

        d[i][0] = ( s01 + s23 + 1 ) >> 1;
        d[i][1] = ( s01 - s23 + 1 ) >> 1;
        d[i][2] = ( d01 - d23 + 1 ) >> 1;
        d[i][3] = ( d01 + d23 + 1 ) >> 1;
    }
}
/* Inverse 4x4 transform of DC coefficients: same butterfly network as
 * dct4x4dc but with no rounding/shift on the output stage.
 * Works in place; tmp holds the intermediate (transposed) pass. */
static void idct4x4dc( int16_t d[4][4] )
{
    int16_t tmp[4][4];
    int s01, s23;
    int d01, d23;
    int i;

    for( i = 0; i < 4; i++ )
    {
        s01 = d[i][0] + d[i][1];
        d01 = d[i][0] - d[i][1];
        s23 = d[i][2] + d[i][3];
        d23 = d[i][2] - d[i][3];

        tmp[0][i] = s01 + s23;
        tmp[1][i] = s01 - s23;
        tmp[2][i] = d01 - d23;
        tmp[3][i] = d01 + d23;
    }

    for( i = 0; i < 4; i++ )
    {
        s01 = tmp[i][0] + tmp[i][1];
        d01 = tmp[i][0] - tmp[i][1];
        s23 = tmp[i][2] + tmp[i][3];
        d23 = tmp[i][2] - tmp[i][3];

        d[i][0] = s01 + s23;
        d[i][1] = s01 - s23;
        d[i][2] = d01 - d23;
        d[i][3] = d01 + d23;
    }
}
/* Compute the i_size x i_size residual: diff = pix1 - pix2.
 * diff is row-major with row stride i_size; pix1/pix2 advance by their
 * own strides (i_pix1/i_pix2) after each row. */
static inline void pixel_sub_wxh( int16_t *diff, int i_size,
                                  uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2 )
{
    int y, x;
    for( y = 0; y < i_size; y++ )
    {
        for( x = 0; x < i_size; x++ )
            diff[x + y*i_size] = pix1[x] - pix2[x];
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
}
122 static void sub4x4_dct( int16_t dct[4][4], uint8_t *pix1, uint8_t *pix2 )
128 pixel_sub_wxh( (int16_t*)d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
130 for( i = 0; i < 4; i++ )
132 const int s03 = d[i][0] + d[i][3];
133 const int s12 = d[i][1] + d[i][2];
134 const int d03 = d[i][0] - d[i][3];
135 const int d12 = d[i][1] - d[i][2];
137 tmp[0][i] = s03 + s12;
138 tmp[1][i] = 2*d03 + d12;
139 tmp[2][i] = s03 - s12;
140 tmp[3][i] = d03 - 2*d12;
143 for( i = 0; i < 4; i++ )
145 const int s03 = tmp[i][0] + tmp[i][3];
146 const int s12 = tmp[i][1] + tmp[i][2];
147 const int d03 = tmp[i][0] - tmp[i][3];
148 const int d12 = tmp[i][1] - tmp[i][2];
150 dct[i][0] = s03 + s12;
151 dct[i][1] = 2*d03 + d12;
152 dct[i][2] = s03 - s12;
153 dct[i][3] = d03 - 2*d12;
157 static void sub8x8_dct( int16_t dct[4][4][4], uint8_t *pix1, uint8_t *pix2 )
159 sub4x4_dct( dct[0], &pix1[0], &pix2[0] );
160 sub4x4_dct( dct[1], &pix1[4], &pix2[4] );
161 sub4x4_dct( dct[2], &pix1[4*FENC_STRIDE+0], &pix2[4*FDEC_STRIDE+0] );
162 sub4x4_dct( dct[3], &pix1[4*FENC_STRIDE+4], &pix2[4*FDEC_STRIDE+4] );
165 static void sub16x16_dct( int16_t dct[16][4][4], uint8_t *pix1, uint8_t *pix2 )
167 sub8x8_dct( &dct[ 0], &pix1[0], &pix2[0] );
168 sub8x8_dct( &dct[ 4], &pix1[8], &pix2[8] );
169 sub8x8_dct( &dct[ 8], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
170 sub8x8_dct( &dct[12], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
174 static void add4x4_idct( uint8_t *p_dst, int16_t dct[4][4] )
181 for( i = 0; i < 4; i++ )
183 const int s02 = dct[0][i] + dct[2][i];
184 const int d02 = dct[0][i] - dct[2][i];
185 const int s13 = dct[1][i] + (dct[3][i]>>1);
186 const int d13 = (dct[1][i]>>1) - dct[3][i];
188 tmp[i][0] = s02 + s13;
189 tmp[i][1] = d02 + d13;
190 tmp[i][2] = d02 - d13;
191 tmp[i][3] = s02 - s13;
194 for( i = 0; i < 4; i++ )
196 const int s02 = tmp[0][i] + tmp[2][i];
197 const int d02 = tmp[0][i] - tmp[2][i];
198 const int s13 = tmp[1][i] + (tmp[3][i]>>1);
199 const int d13 = (tmp[1][i]>>1) - tmp[3][i];
201 d[0][i] = ( s02 + s13 + 32 ) >> 6;
202 d[1][i] = ( d02 + d13 + 32 ) >> 6;
203 d[2][i] = ( d02 - d13 + 32 ) >> 6;
204 d[3][i] = ( s02 - s13 + 32 ) >> 6;
208 for( y = 0; y < 4; y++ )
210 for( x = 0; x < 4; x++ )
212 p_dst[x] = x264_clip_uint8( p_dst[x] + d[y][x] );
214 p_dst += FDEC_STRIDE;
218 static void add8x8_idct( uint8_t *p_dst, int16_t dct[4][4][4] )
220 add4x4_idct( &p_dst[0], dct[0] );
221 add4x4_idct( &p_dst[4], dct[1] );
222 add4x4_idct( &p_dst[4*FDEC_STRIDE+0], dct[2] );
223 add4x4_idct( &p_dst[4*FDEC_STRIDE+4], dct[3] );
226 static void add16x16_idct( uint8_t *p_dst, int16_t dct[16][4][4] )
228 add8x8_idct( &p_dst[0], &dct[0] );
229 add8x8_idct( &p_dst[8], &dct[4] );
230 add8x8_idct( &p_dst[8*FDEC_STRIDE+0], &dct[8] );
231 add8x8_idct( &p_dst[8*FDEC_STRIDE+8], &dct[12] );
/****************************************************************************
 * 8x8 transform
 ****************************************************************************/
/* One 8-point forward transform pass (butterfly network with >>1 / >>2
 * scaled taps).  Expanded with caller-supplied SRC(x)/DST(x) macros so the
 * same code serves both the vertical and the horizontal pass. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0) =  a0 + a1     ;\
    DST(1) =  a4 + (a7>>2);\
    DST(2) =  a2 + (a3>>1);\
    DST(3) =  a5 + (a6>>2);\
    DST(4) =  a0 - a1     ;\
    DST(5) =  a6 - (a5>>2);\
    DST(6) = (a2>>1) - a3 ;\
    DST(7) = (a4>>2) - a7 ;\
}
265 static void sub8x8_dct8( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
270 pixel_sub_wxh( (int16_t*)tmp, 8, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
272 #define SRC(x) tmp[x][i]
273 #define DST(x) tmp[x][i]
274 for( i = 0; i < 8; i++ )
279 #define SRC(x) tmp[i][x]
280 #define DST(x) dct[x][i]
281 for( i = 0; i < 8; i++ )
287 static void sub16x16_dct8( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
289 sub8x8_dct8( dct[0], &pix1[0], &pix2[0] );
290 sub8x8_dct8( dct[1], &pix1[8], &pix2[8] );
291 sub8x8_dct8( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
292 sub8x8_dct8( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
/* One 8-point inverse transform pass.  DST(x,rhs) lets the caller decide
 * how results are stored (in place, or clipped into the pixel buffer). */
#define IDCT8_1D {\
    const int a0 =  SRC(0) + SRC(4);\
    const int a2 =  SRC(0) - SRC(4);\
    const int a4 = (SRC(2)>>1) - SRC(6);\
    const int a6 = (SRC(6)>>1) + SRC(2);\
    const int b0 = a0 + a6;\
    const int b2 = a2 + a4;\
    const int b4 = a2 - a4;\
    const int b6 = a0 - a6;\
    const int a1 = -SRC(3) + SRC(5) - SRC(7) - (SRC(7)>>1);\
    const int a3 =  SRC(1) + SRC(7) - SRC(3) - (SRC(3)>>1);\
    const int a5 = -SRC(1) + SRC(7) + SRC(5) + (SRC(5)>>1);\
    const int a7 =  SRC(3) + SRC(5) + SRC(1) + (SRC(1)>>1);\
    const int b1 = (a7>>2) + a1;\
    const int b3 =  a3 + (a5>>2);\
    const int b5 = (a3>>2) - a5;\
    const int b7 =  a7 - (a1>>2);\
    DST(0, b0 + b7);\
    DST(1, b2 + b5);\
    DST(2, b4 + b3);\
    DST(3, b6 + b1);\
    DST(4, b6 - b1);\
    DST(5, b4 - b3);\
    DST(6, b2 - b5);\
    DST(7, b0 - b7);\
}
322 static void add8x8_idct8( uint8_t *dst, int16_t dct[8][8] )
326 dct[0][0] += 32; // rounding for the >>6 at the end
328 #define SRC(x) dct[x][i]
329 #define DST(x,rhs) dct[x][i] = (rhs)
330 for( i = 0; i < 8; i++ )
335 #define SRC(x) dct[i][x]
336 #define DST(x,rhs) dst[i + x*FDEC_STRIDE] = x264_clip_uint8( dst[i + x*FDEC_STRIDE] + ((rhs) >> 6) );
337 for( i = 0; i < 8; i++ )
343 static void add16x16_idct8( uint8_t *dst, int16_t dct[4][8][8] )
345 add8x8_idct8( &dst[0], dct[0] );
346 add8x8_idct8( &dst[8], dct[1] );
347 add8x8_idct8( &dst[8*FDEC_STRIDE+0], dct[2] );
348 add8x8_idct8( &dst[8*FDEC_STRIDE+8], dct[3] );
351 static void inline add4x4_idct_dc( uint8_t *p_dst, int16_t dc )
355 for( i = 0; i < 4; i++, p_dst += FDEC_STRIDE )
357 p_dst[0] = x264_clip_uint8( p_dst[0] + dc );
358 p_dst[1] = x264_clip_uint8( p_dst[1] + dc );
359 p_dst[2] = x264_clip_uint8( p_dst[2] + dc );
360 p_dst[3] = x264_clip_uint8( p_dst[3] + dc );
364 static void add8x8_idct_dc( uint8_t *p_dst, int16_t dct[2][2] )
366 add4x4_idct_dc( &p_dst[0], dct[0][0] );
367 add4x4_idct_dc( &p_dst[4], dct[0][1] );
368 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+0], dct[1][0] );
369 add4x4_idct_dc( &p_dst[4*FDEC_STRIDE+4], dct[1][1] );
372 static void add16x16_idct_dc( uint8_t *p_dst, int16_t dct[4][4] )
375 for( i = 0; i < 4; i++, p_dst += 4*FDEC_STRIDE )
377 add4x4_idct_dc( &p_dst[ 0], dct[i][0] );
378 add4x4_idct_dc( &p_dst[ 4], dct[i][1] );
379 add4x4_idct_dc( &p_dst[ 8], dct[i][2] );
380 add4x4_idct_dc( &p_dst[12], dct[i][3] );
/****************************************************************************
 * x264_dct_init:
 ****************************************************************************/
388 void x264_dct_init( int cpu, x264_dct_function_t *dctf )
390 dctf->sub4x4_dct = sub4x4_dct;
391 dctf->add4x4_idct = add4x4_idct;
393 dctf->sub8x8_dct = sub8x8_dct;
394 dctf->add8x8_idct = add8x8_idct;
395 dctf->add8x8_idct_dc = add8x8_idct_dc;
397 dctf->sub16x16_dct = sub16x16_dct;
398 dctf->add16x16_idct = add16x16_idct;
399 dctf->add16x16_idct_dc = add16x16_idct_dc;
401 dctf->sub8x8_dct8 = sub8x8_dct8;
402 dctf->add8x8_idct8 = add8x8_idct8;
404 dctf->sub16x16_dct8 = sub16x16_dct8;
405 dctf->add16x16_idct8 = add16x16_idct8;
407 dctf->dct4x4dc = dct4x4dc;
408 dctf->idct4x4dc = idct4x4dc;
411 if( cpu&X264_CPU_MMX )
413 dctf->sub4x4_dct = x264_sub4x4_dct_mmx;
414 dctf->add4x4_idct = x264_add4x4_idct_mmx;
415 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_mmx;
416 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_mmx;
417 dctf->dct4x4dc = x264_dct4x4dc_mmx;
418 dctf->idct4x4dc = x264_idct4x4dc_mmx;
421 dctf->sub8x8_dct = x264_sub8x8_dct_mmx;
422 dctf->sub16x16_dct = x264_sub16x16_dct_mmx;
423 dctf->add8x8_idct = x264_add8x8_idct_mmx;
424 dctf->add16x16_idct = x264_add16x16_idct_mmx;
426 dctf->sub8x8_dct8 = x264_sub8x8_dct8_mmx;
427 dctf->sub16x16_dct8 = x264_sub16x16_dct8_mmx;
428 dctf->add8x8_idct8 = x264_add8x8_idct8_mmx;
429 dctf->add16x16_idct8= x264_add16x16_idct8_mmx;
433 if( cpu&X264_CPU_SSE2 )
435 dctf->sub8x8_dct8 = x264_sub8x8_dct8_sse2;
436 dctf->sub16x16_dct8 = x264_sub16x16_dct8_sse2;
437 dctf->add8x8_idct8 = x264_add8x8_idct8_sse2;
438 dctf->add16x16_idct8= x264_add16x16_idct8_sse2;
440 dctf->sub8x8_dct = x264_sub8x8_dct_sse2;
441 dctf->sub16x16_dct = x264_sub16x16_dct_sse2;
442 dctf->add8x8_idct = x264_add8x8_idct_sse2;
443 dctf->add16x16_idct = x264_add16x16_idct_sse2;
444 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_sse2;
447 if( cpu&X264_CPU_SSSE3 )
449 dctf->add8x8_idct_dc = x264_add8x8_idct_dc_ssse3;
450 dctf->add16x16_idct_dc = x264_add16x16_idct_dc_ssse3;
455 if( cpu&X264_CPU_ALTIVEC )
457 dctf->sub4x4_dct = x264_sub4x4_dct_altivec;
458 dctf->sub8x8_dct = x264_sub8x8_dct_altivec;
459 dctf->sub16x16_dct = x264_sub16x16_dct_altivec;
461 dctf->add4x4_idct = x264_add4x4_idct_altivec;
462 dctf->add8x8_idct = x264_add8x8_idct_altivec;
463 dctf->add16x16_idct = x264_add16x16_idct_altivec;
465 dctf->sub8x8_dct8 = x264_sub8x8_dct8_altivec;
466 dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
468 dctf->add8x8_idct8 = x264_add8x8_idct8_altivec;
469 dctf->add16x16_idct8= x264_add16x16_idct8_altivec;
474 void x264_dct_init_weights( void )
479 for( i=0; i<16; i++ )
480 x264_dct4_weight2_zigzag[j][i] = x264_dct4_weight2_tab[ x264_zigzag_scan4[j][i] ];
481 for( i=0; i<64; i++ )
482 x264_dct8_weight2_zigzag[j][i] = x264_dct8_weight2_tab[ x264_zigzag_scan8[j][i] ];
// gcc pessimizes multi-dimensional arrays here, even with constant indices
#define ZIG(i,y,x) level[i] = dct[0][x*8+y];
/* Frame (progressive) 8x8 zigzag scan: ZIG(scan_index, y, x).
 * No trailing '\' on the last line - the macro must end here, not swallow
 * whatever line follows. */
#define ZIGZAG8_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,4,0) ZIG(11,3,1)\
    ZIG(12,2,2) ZIG(13,1,3) ZIG(14,0,4) ZIG(15,0,5)\
    ZIG(16,1,4) ZIG(17,2,3) ZIG(18,3,2) ZIG(19,4,1)\
    ZIG(20,5,0) ZIG(21,6,0) ZIG(22,5,1) ZIG(23,4,2)\
    ZIG(24,3,3) ZIG(25,2,4) ZIG(26,1,5) ZIG(27,0,6)\
    ZIG(28,0,7) ZIG(29,1,6) ZIG(30,2,5) ZIG(31,3,4)\
    ZIG(32,4,3) ZIG(33,5,2) ZIG(34,6,1) ZIG(35,7,0)\
    ZIG(36,7,1) ZIG(37,6,2) ZIG(38,5,3) ZIG(39,4,4)\
    ZIG(40,3,5) ZIG(41,2,6) ZIG(42,1,7) ZIG(43,2,7)\
    ZIG(44,3,6) ZIG(45,4,5) ZIG(46,5,4) ZIG(47,6,3)\
    ZIG(48,7,2) ZIG(49,7,3) ZIG(50,6,4) ZIG(51,5,5)\
    ZIG(52,4,6) ZIG(53,3,7) ZIG(54,4,7) ZIG(55,5,6)\
    ZIG(56,6,5) ZIG(57,7,4) ZIG(58,7,5) ZIG(59,6,6)\
    ZIG(60,5,7) ZIG(61,6,7) ZIG(62,7,6) ZIG(63,7,7)
/* Field (interlaced) 8x8 zigzag scan: ZIG(scan_index, y, x). */
#define ZIGZAG8_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,2,0) ZIG( 3,0,1)\
    ZIG( 4,1,1) ZIG( 5,3,0) ZIG( 6,4,0) ZIG( 7,2,1)\
    ZIG( 8,0,2) ZIG( 9,3,1) ZIG(10,5,0) ZIG(11,6,0)\
    ZIG(12,7,0) ZIG(13,4,1) ZIG(14,1,2) ZIG(15,0,3)\
    ZIG(16,2,2) ZIG(17,5,1) ZIG(18,6,1) ZIG(19,7,1)\
    ZIG(20,3,2) ZIG(21,1,3) ZIG(22,0,4) ZIG(23,2,3)\
    ZIG(24,4,2) ZIG(25,5,2) ZIG(26,6,2) ZIG(27,7,2)\
    ZIG(28,3,3) ZIG(29,1,4) ZIG(30,0,5) ZIG(31,2,4)\
    ZIG(32,4,3) ZIG(33,5,3) ZIG(34,6,3) ZIG(35,7,3)\
    ZIG(36,3,4) ZIG(37,1,5) ZIG(38,0,6) ZIG(39,2,5)\
    ZIG(40,4,4) ZIG(41,5,4) ZIG(42,6,4) ZIG(43,7,4)\
    ZIG(44,3,5) ZIG(45,1,6) ZIG(46,2,6) ZIG(47,4,5)\
    ZIG(48,5,5) ZIG(49,6,5) ZIG(50,7,5) ZIG(51,3,6)\
    ZIG(52,0,7) ZIG(53,1,7) ZIG(54,4,6) ZIG(55,5,6)\
    ZIG(56,6,6) ZIG(57,7,6) ZIG(58,2,7) ZIG(59,3,7)\
    ZIG(60,4,7) ZIG(61,5,7) ZIG(62,6,7) ZIG(63,7,7)
/* Frame (progressive) 4x4 zigzag scan: ZIG(scan_index, y, x). */
#define ZIGZAG4_FRAME\
    ZIG( 0,0,0) ZIG( 1,0,1) ZIG( 2,1,0) ZIG( 3,2,0)\
    ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)\
    ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)\
    ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
/* Field (interlaced) 4x4 zigzag scan: ZIG(scan_index, y, x). */
#define ZIGZAG4_FIELD\
    ZIG( 0,0,0) ZIG( 1,1,0) ZIG( 2,0,1) ZIG( 3,2,0)\
    ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)\
    ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)\
    ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
537 static void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[8][8] )
542 static void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[8][8] )
548 #define ZIG(i,y,x) level[i] = dct[0][x*4+y];
550 static void zigzag_scan_4x4_frame( int16_t level[16], int16_t dct[4][4] )
555 static void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[4][4] )
557 *(uint32_t*)level = *(uint32_t*)dct;
558 ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
559 *(uint32_t*)(level+6) = *(uint32_t*)(*dct+6);
560 *(uint64_t*)(level+8) = *(uint64_t*)(*dct+8);
561 *(uint64_t*)(level+12) = *(uint64_t*)(*dct+12);
#undef ZIG
/* Zigzag-scan one residual coefficient straight from the pixel domain:
 * level[i] = source pixel (x,y) - predicted pixel (x,y). */
#define ZIG(i,y,x) {\
    int oe = x+y*FENC_STRIDE;\
    int od = x+y*FDEC_STRIDE;\
    level[i] = p_src[oe] - p_dst[od];\
}
/* Copy a 4x4 block of source pixels over p_dst, one 32-bit move per row. */
#define COPY4x4\
    *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
    *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
    *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
    *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);
/* Copy an 8x8 block of source pixels over p_dst, one 64-bit move per row. */
#define COPY8x8\
    *(uint64_t*)(p_dst+0*FDEC_STRIDE) = *(uint64_t*)(p_src+0*FENC_STRIDE);\
    *(uint64_t*)(p_dst+1*FDEC_STRIDE) = *(uint64_t*)(p_src+1*FENC_STRIDE);\
    *(uint64_t*)(p_dst+2*FDEC_STRIDE) = *(uint64_t*)(p_src+2*FENC_STRIDE);\
    *(uint64_t*)(p_dst+3*FDEC_STRIDE) = *(uint64_t*)(p_src+3*FENC_STRIDE);\
    *(uint64_t*)(p_dst+4*FDEC_STRIDE) = *(uint64_t*)(p_src+4*FENC_STRIDE);\
    *(uint64_t*)(p_dst+5*FDEC_STRIDE) = *(uint64_t*)(p_src+5*FENC_STRIDE);\
    *(uint64_t*)(p_dst+6*FDEC_STRIDE) = *(uint64_t*)(p_src+6*FENC_STRIDE);\
    *(uint64_t*)(p_dst+7*FDEC_STRIDE) = *(uint64_t*)(p_src+7*FENC_STRIDE);
585 static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
591 static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
597 static void zigzag_sub_8x8_frame( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
602 static void zigzag_sub_8x8_field( int16_t level[64], const uint8_t *p_src, uint8_t *p_dst )
/* De-interleave 64 8x8 coefficients (src) into four 16-coefficient 4x4
 * blocks (dst) for CAVLC, recording a per-4x4-block nonzero flag in nnz
 * at positions 0, 1, 8 and 9. */
static void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
{
    int i, j;
    for( i = 0; i < 4; i++ )
    {
        int nz = 0;
        for( j = 0; j < 16; j++ )
        {
            nz |= src[i+j*4];
            dst[i*16+j] = src[i+j*4];
        }
        nnz[(i&1) + (i>>1)*8] = !!nz;
    }
}
626 void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
630 pf->scan_8x8 = zigzag_scan_8x8_field;
631 pf->scan_4x4 = zigzag_scan_4x4_field;
632 pf->sub_8x8 = zigzag_sub_8x8_field;
633 pf->sub_4x4 = zigzag_sub_4x4_field;
635 if( cpu&X264_CPU_MMXEXT )
636 pf->scan_4x4 = x264_zigzag_scan_4x4_field_mmxext;
640 if( cpu&X264_CPU_ALTIVEC )
641 pf->scan_4x4 = x264_zigzag_scan_4x4_field_altivec;
646 pf->scan_8x8 = zigzag_scan_8x8_frame;
647 pf->scan_4x4 = zigzag_scan_4x4_frame;
648 pf->sub_8x8 = zigzag_sub_8x8_frame;
649 pf->sub_4x4 = zigzag_sub_4x4_frame;
651 if( cpu&X264_CPU_MMX )
652 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
653 if( cpu&X264_CPU_MMXEXT )
654 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_mmxext;
655 if( cpu&X264_CPU_SSE2_IS_FAST )
656 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_sse2;
657 if( cpu&X264_CPU_SSSE3 )
659 pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
660 pf->scan_8x8 = x264_zigzag_scan_8x8_frame_ssse3;
662 if( cpu&X264_CPU_PHADD_IS_FAST )
663 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_ssse3;
667 if( cpu&X264_CPU_ALTIVEC )
668 pf->scan_4x4 = x264_zigzag_scan_4x4_frame_altivec;
672 pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
674 if( cpu&X264_CPU_MMX )
675 pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;