4 #include "common/common.h"
5 #include "common/cpu.h"
/* Global scratch buffers shared by every check_* routine below;
 * allocated and filled with random data in main(). */
7 /* buf1, buf2: initialised to random data and shouldn't write into them */
8 uint8_t * buf1, * buf2;
9 /* buf3, buf4: used to store output */
10 uint8_t * buf3, * buf4;
/* Print one " - name [OK|FAILED]" summary line based on the caller-scope
 * flag `ok`.  Relies on `ok` (and presumably `used_asm` / `ret`) being
 * declared by the enclosing test function.
 * NOTE(review): some lines of this macro body are elided in this extract
 * (embedded source line numbers jump from 12 to 14 to 18). */
12 #define report( name ) { \
14 fprintf( stderr, " - %-21s [%s]\n", name, ok ? "OK" : "FAILED" ); \
/* Validate the pixel-metric function tables (sad/ssd/satd, sad_x3/sad_x4,
 * intra satd/sa8d, ssim, esa ads): every optimized entry that differs from
 * the already-validated reference table is run on random buffers at many
 * alignments and compared against the plain-C result.  Returns 0 on
 * success, nonzero if any comparison failed.
 * NOTE(review): this extract elides some source lines (embedded original
 * line numbers are non-contiguous), so bodies below are partial. */
18 static int check_pixel( int cpu_ref, int cpu_new )
20 x264_pixel_function_t pixel_c;
21 x264_pixel_function_t pixel_ref;
22 x264_pixel_function_t pixel_asm;
/* C-only prediction functions: used to generate the reference predicted
 * blocks for the intra-satd tests below. */
23 x264_predict_t predict_16x16[4+3];
24 x264_predict_t predict_8x8c[4+3];
25 x264_predict_t predict_4x4[9+3];
26 x264_predict8x8_t predict_8x8[9+3];
27 DECLARE_ALIGNED( uint8_t, edge[33], 16 );
29 int ret = 0, ok, used_asm;
/* pixel_c = plain C, pixel_ref = previously validated cpu flags,
 * pixel_asm = the new flags under test. */
32 x264_pixel_init( 0, &pixel_c );
33 x264_pixel_init( cpu_ref, &pixel_ref );
34 x264_pixel_init( cpu_new, &pixel_asm );
35 x264_predict_16x16_init( 0, predict_16x16 );
36 x264_predict_8x8c_init( 0, predict_8x8c );
37 x264_predict_8x8_init( 0, predict_8x8 );
38 x264_predict_4x4_init( 0, predict_4x4 );
/* Precompute the filtered prediction edge once; consumed by the
 * intra_sa8d_x3_8x8 test via TEST_INTRA_SATD's i8x8 path. */
39 x264_predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
/* Run asm vs C for each of the 7 partition sizes; the inner j loop slides
 * the second operand through 64 offsets to exercise every alignment. */
41 #define TEST_PIXEL( name ) \
42 for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
45 if( pixel_asm.name[i] != pixel_ref.name[i] ) \
47 for( j=0; j<64; j++ ) \
50 res_c = pixel_c.name[i]( buf1, 32, buf2+j, 16 ); \
51 res_asm = pixel_asm.name[i]( buf1, 32, buf2+j, 16 ); \
52 if( res_c != res_asm ) \
55 fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
61 report( "pixel " #name " :" );
/* sad_x3/sad_x4 compute 3 or 4 SADs at once; reference values come from
 * N separate calls to the scalar C sad at the same four offsets
 * (0, +30, +1, +99). */
68 #define TEST_PIXEL_X( N ) \
69 for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
71 int res_c[4]={0}, res_asm[4]={0}; \
72 if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \
74 for( j=0; j<64; j++) \
76 uint8_t *pix2 = buf2+j; \
78 res_c[0] = pixel_c.sad[i]( buf1, 16, pix2, 32 ); \
79 res_c[1] = pixel_c.sad[i]( buf1, 16, pix2+30, 32 ); \
80 res_c[2] = pixel_c.sad[i]( buf1, 16, pix2+1, 32 ); \
83 res_c[3] = pixel_c.sad[i]( buf1, 16, pix2+99, 32 ); \
84 pixel_asm.sad_x4[i]( buf1, pix2, pix2+30, pix2+1, pix2+99, 32, res_asm ); \
87 pixel_asm.sad_x3[i]( buf1, pix2, pix2+30, pix2+1, 32, res_asm ); \
88 if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
91 fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
92 i, res_c[0], res_c[1], res_c[2], res_c[3], \
93 res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
98 report( "pixel sad_x"#N" :" );
/* intra_satd_x3_*: the optimized function scores 3 prediction modes at
 * once; the reference is built by running each C predictor into buf3 and
 * scoring it with the C satd/sa8d.  i8x8 selects the edge-array calling
 * convention used by intra_sa8d_x3_8x8. */
103 #define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
104 if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
106 int res_c[3], res_asm[3]; \
108 memcpy( buf3, buf2, 1024 ); \
109 for( i=0; i<3; i++ ) \
111 pred[i]( buf3+40, ##__VA_ARGS__ ); \
112 res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \
114 pixel_asm.name( buf1+40, i8x8 ? edge : buf3+40, res_asm ); \
115 if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
118 fprintf( stderr, #name": %d,%d,%d != %d,%d,%d [FAILED]\n", \
119 res_c[0], res_c[1], res_c[2], \
120 res_asm[0], res_asm[1], res_asm[2] ); \
124 ok = 1; used_asm = 0;
125 TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
126 TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8], 0 );
127 TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4], 0 );
128 TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge );
129 report( "intra satd_x3 :" );
/* ssim: whole-plane result is a float, so compare with a small absolute
 * tolerance instead of exact equality.
 * NOTE(review): res_c/res_a declarations are in elided lines — presumably
 * doubles or floats; confirm against the full source. */
131 if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
132 pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
136 x264_cpu_restore( cpu_new );
137 res_c = x264_pixel_ssim_wxh( &pixel_c, buf1+2, 32, buf2+2, 32, 32, 28 );
138 res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28 );
139 if( fabs(res_c - res_a) > 1e-7 )
142 fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
147 ok = 1; used_asm = 0;
/* esa ads: feed random sums/dc/thresh into each of the 4 ads widths and
 * require that C and asm return both the same mv count and the same mv
 * list.  NOTE(review): cost_mv setup appears to be in elided lines. */
148 for( i=0; i<32; i++ )
150 for( i=0; i<100; i++ )
151 if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
153 DECLARE_ALIGNED( uint16_t, sums[72], 16 );
154 DECLARE_ALIGNED( int, dc[4], 16 );
155 int16_t mvs_a[32], mvs_c[32];
157 int thresh = rand() & 0x3fff;
158 for( j=0; j<72; j++ )
159 sums[j] = rand() & 0x3fff;
161 dc[j] = rand() & 0x3fff;
163 mvn_c = pixel_c.ads[i&3]( dc, sums, 32, cost_mv, mvs_c, 28, thresh );
164 mvn_a = pixel_asm.ads[i&3]( dc, sums, 32, cost_mv, mvs_a, 28, thresh );
165 if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
168 printf("c%d: ", i&3);
169 for(j=0; j<mvn_c; j++)
170 printf("%d ", mvs_c[j]);
171 printf("\na%d: ", i&3);
172 for(j=0; j<mvn_a; j++)
173 printf("%d ", mvs_a[j]);
177 report( "esa ads:" );
/* Validate the DCT/IDCT function table and the zigzag scan functions:
 * forward/inverse 4x4 and 8x8 transforms, the DC-only 4x4/2x2 transforms,
 * and frame/field zigzag scan + sub variants.  Returns 0 on success,
 * nonzero if any comparison failed.
 * NOTE(review): this extract elides some source lines, so bodies below
 * are partial. */
182 static int check_dct( int cpu_ref, int cpu_new )
184 x264_dct_function_t dct_c;
185 x264_dct_function_t dct_ref;
186 x264_dct_function_t dct_asm;
187 x264_quant_function_t qf;
188 int ret = 0, ok, used_asm, i;
189 int16_t dct1[16][4][4] __attribute__((aligned(16)));
190 int16_t dct2[16][4][4] __attribute__((aligned(16)));
191 int16_t dct4[16][4][4] __attribute__((aligned(16)));
192 int16_t dct8[4][8][8] __attribute__((aligned(16)));
196 x264_dct_init( 0, &dct_c );
197 x264_dct_init( cpu_ref, &dct_ref);
198 x264_dct_init( cpu_new, &dct_asm );
/* Minimal encoder context with zero deadzone and a flat quant matrix so
 * quant/dequant below only rescale coefficients.
 * NOTE(review): the declaration/allocation of `h` is in elided lines. */
200 memset( h, 0, sizeof(*h) );
201 h->pps = h->pps_array;
202 x264_param_default( &h->param );
203 h->param.analyse.i_luma_deadzone[0] = 0;
204 h->param.analyse.i_luma_deadzone[1] = 0;
205 h->param.analyse.b_transform_8x8 = 1;
207 h->pps->scaling_list[i] = x264_cqm_flat16;
209 x264_quant_init( h, 0, &qf );
/* Forward transforms: run C and asm on the same source/ref pair and
 * require bit-identical coefficient output. */
211 #define TEST_DCT( name, t1, t2, size ) \
212 if( dct_asm.name != dct_ref.name ) \
215 dct_c.name( t1, buf1, buf2 ); \
216 dct_asm.name( t2, buf1, buf2 ); \
217 if( memcmp( t1, t2, size ) ) \
220 fprintf( stderr, #name " [FAILED]\n" ); \
223 ok = 1; used_asm = 0;
224 TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 );
225 TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 );
226 TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 );
227 report( "sub_dct4 :" );
229 ok = 1; used_asm = 0;
230 TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64*2 );
231 TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*2*4 );
232 report( "sub_dct8 :" );
235 // fdct and idct are denormalized by different factors, so quant/dequant
236 // is needed to force the coefs into the right range.
237 dct_c.sub16x16_dct( dct4, buf1, buf2 );
238 dct_c.sub16x16_dct8( dct8, buf1, buf2 );
239 for( i=0; i<16; i++ )
241 qf.quant_4x4( dct4[i], h->quant4_mf[CQM_4IY][20], h->quant4_bias[CQM_4IY][20] );
242 qf.dequant_4x4( dct4[i], h->dequant4_mf[CQM_4IY], 20 );
246 qf.quant_8x8( dct8[i], h->quant8_mf[CQM_8IY][20], h->quant8_bias[CQM_8IY][20] );
247 qf.dequant_8x8( dct8[i], h->dequant8_mf[CQM_8IY], 20 );
/* Inverse transforms: add the (quantized then dequantized) coefficients
 * onto identical pixel buffers and require identical reconstructions. */
250 #define TEST_IDCT( name, src ) \
251 if( dct_asm.name != dct_ref.name ) \
254 memcpy( buf3, buf1, 32*32 ); \
255 memcpy( buf4, buf1, 32*32 ); \
256 memcpy( dct1, src, 512 ); \
257 memcpy( dct2, src, 512 ); \
258 dct_c.name( buf3, (void*)dct1 ); \
259 dct_asm.name( buf4, (void*)dct2 ); \
260 if( memcmp( buf3, buf4, 32*32 ) ) \
263 fprintf( stderr, #name " [FAILED]\n" ); \
266 ok = 1; used_asm = 0;
267 TEST_IDCT( add4x4_idct, dct4 );
268 TEST_IDCT( add8x8_idct, dct4 );
269 TEST_IDCT( add16x16_idct, dct4 );
270 report( "add_idct4 :" );
272 ok = 1; used_asm = 0;
273 TEST_IDCT( add8x8_idct8, dct8 );
274 TEST_IDCT( add16x16_idct8, dct8 );
275 report( "add_idct8 :" );
278 ok = 1; used_asm = 0;
/* DC-only transforms use small fixed test vectors rather than random
 * data; the in-place result must match byte-for-byte. */
279 if( dct_asm.dct4x4dc != dct_ref.dct4x4dc )
281 int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
282 int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
284 dct_c.dct4x4dc( dct1 );
285 dct_asm.dct4x4dc( dct2 );
286 if( memcmp( dct1, dct2, 32 ) )
289 fprintf( stderr, " - dct4x4dc : [FAILED]\n" );
292 if( dct_asm.idct4x4dc != dct_ref.idct4x4dc )
294 int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
295 int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
297 dct_c.idct4x4dc( dct1 );
298 dct_asm.idct4x4dc( dct2 );
299 if( memcmp( dct1, dct2, 32 ) )
302 fprintf( stderr, " - idct4x4dc : [FAILED]\n" );
305 report( "(i)dct4x4dc :" );
307 ok = 1; used_asm = 0;
308 if( dct_asm.dct2x2dc != dct_ref.dct2x2dc )
310 int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
311 int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
313 dct_c.dct2x2dc( dct1 );
314 dct_asm.dct2x2dc( dct2 );
315 if( memcmp( dct1, dct2, 4*2 ) )
318 fprintf( stderr, " - dct2x2dc : [FAILED]\n" );
321 if( dct_asm.idct2x2dc != dct_ref.idct2x2dc )
323 int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
324 int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
326 dct_c.idct2x2dc( dct1 );
327 dct_asm.idct2x2dc( dct2 );
328 if( memcmp( dct1, dct2, 4*2 ) )
331 fprintf( stderr, " - idct2x2dc : [FAILED]\n" );
334 report( "(i)dct2x2dc :" );
/* Zigzag scan tests (still inside check_dct): compare the scan order and
 * the combined subtract+scan variants for both frame and field modes. */
336 x264_zigzag_function_t zigzag_c;
337 x264_zigzag_function_t zigzag_ref;
338 x264_zigzag_function_t zigzag_asm;
340 int32_t level1[64] __attribute__((aligned(16)));
341 int32_t level2[64] __attribute__((aligned(16)));
/* TEST_ZIGZAG_SCAN checks pure reordering of coefficients;
 * TEST_ZIGZAG_SUB (defined just below) additionally checks the residual
 * written back into buf3/buf4. */
343 #define TEST_ZIGZAG_SCAN( name, t1, t2, dct, size ) \
344 if( zigzag_asm.name != zigzag_ref.name ) \
347 zigzag_c.name( t1, dct ); \
348 zigzag_asm.name( t2, dct ); \
349 if( memcmp( t1, t2, size ) ) \
352 fprintf( stderr, #name " [FAILED]\n" ); \
356 #define TEST_ZIGZAG_SUB( name, t1, t2, size ) \
357 if( zigzag_asm.name != zigzag_ref.name ) \
360 memcpy( buf3, buf1, 16*FDEC_STRIDE ); \
361 memcpy( buf4, buf1, 16*FDEC_STRIDE ); \
362 zigzag_c.name( t1, buf2, buf3 ); \
363 zigzag_asm.name( t2, buf2, buf4 ); \
364 if( memcmp( t1, t2, size )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) ) \
367 fprintf( stderr, #name " [FAILED]\n" ); \
371 x264_zigzag_init( 0, &zigzag_c, 0 );
372 x264_zigzag_init( cpu_ref, &zigzag_ref, 0 );
373 x264_zigzag_init( cpu_new, &zigzag_asm, 0 );
375 ok = 1; used_asm = 0;
376 TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64*4 );
377 TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16*4 );
378 TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15*4 );
379 TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16*4 );
380 TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15*4 );
381 report( "zigzag_frame :" );
/* Re-init with the third argument set to 1 and repeat for the field
 * (interlaced) scan order. */
383 x264_zigzag_init( 0, &zigzag_c, 1 );
384 x264_zigzag_init( cpu_ref, &zigzag_ref, 1 );
385 x264_zigzag_init( cpu_new, &zigzag_asm, 1 );
387 ok = 1; used_asm = 0;
388 TEST_ZIGZAG_SCAN( scan_8x8, level1, level2, (void*)dct1, 64*4 );
389 TEST_ZIGZAG_SCAN( scan_4x4, level1, level2, dct1[0], 16*4 );
390 TEST_ZIGZAG_SCAN( scan_4x4ac, level1, level2, dct1[0], 15*4 );
391 TEST_ZIGZAG_SUB( sub_4x4, level1, level2, 16*4 );
392 TEST_ZIGZAG_SUB( sub_4x4ac, level1, level2, 15*4 );
393 report( "zigzag_field :" );
394 #undef TEST_ZIGZAG_SCAN
395 #undef TEST_ZIGZAG_SUB
/* Validate motion compensation: subpel luma/chroma interpolation
 * (mc_luma/get_ref/mc_chroma over a range of motion vectors), the
 * avg/avg_weight averaging functions, and the half-pel filter.
 * Returns 0 on success, nonzero if any comparison failed.
 * NOTE(review): this extract elides some source lines, so bodies below
 * are partial. */
400 static int check_mc( int cpu_ref, int cpu_new )
402 x264_mc_functions_t mc_c;
403 x264_mc_functions_t mc_ref;
404 x264_mc_functions_t mc_a;
405 x264_pixel_function_t pixel;
/* src/src2 point into the random buf1 plane at a stride of 32; dst1/dst2
 * receive the C and asm outputs respectively. */
407 uint8_t *src = &buf1[2*32+2];
408 uint8_t *src2[4] = { &buf1[2*32+2], &buf1[6*32+2],
409 &buf1[10*32+2], &buf1[14*32+2] };
410 uint8_t *dst1 = &buf3[2*32];
411 uint8_t *dst2 = &buf4[2*32];
413 int dx, dy, i, j, k, w;
414 int ret = 0, ok, used_asm;
416 x264_mc_init( 0, &mc_c );
417 x264_mc_init( cpu_ref, &mc_ref );
418 x264_mc_init( cpu_new, &mc_a );
419 x264_pixel_init( 0, &pixel );
/* Luma MC at (dx,dy): whole output buffers (pre-filled with 0xCD) are
 * compared so out-of-block writes are caught too.  get_ref may return a
 * pointer into its own buffer with its own stride, so that path compares
 * row by row against the C mc_luma output.  A second macro below
 * (MC_TEST_CHROMA) does the same for chroma, ignoring the documented
 * garbage to the right of width-2 blocks. */
421 #define MC_TEST_LUMA( w, h ) \
422 if( mc_a.mc_luma != mc_ref.mc_luma && !(w&(w-1)) && h<=16 ) \
425 memset(buf3, 0xCD, 1024); \
426 memset(buf4, 0xCD, 1024); \
427 mc_c.mc_luma( dst1, 32, src2, 16, dx, dy, w, h ); \
428 mc_a.mc_luma( dst2, 32, src2, 16, dx, dy, w, h ); \
429 if( memcmp( buf3, buf4, 1024 ) ) \
431 fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
435 if( mc_a.get_ref != mc_ref.get_ref ) \
437 uint8_t *ref = dst2; \
438 int ref_stride = 32; \
440 memset(buf3, 0xCD, 1024); \
441 memset(buf4, 0xCD, 1024); \
442 mc_c.mc_luma( dst1, 32, src2, 16, dx, dy, w, h ); \
443 ref = mc_a.get_ref( ref, &ref_stride, src2, 16, dx, dy, w, h ); \
444 for( i=0; i<h; i++ ) \
445 if( memcmp( dst1+i*32, ref+i*ref_stride, w ) ) \
447 fprintf( stderr, "get_ref[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
453 #define MC_TEST_CHROMA( w, h ) \
454 if( mc_a.mc_chroma != mc_ref.mc_chroma ) \
457 memset(buf3, 0xCD, 1024); \
458 memset(buf4, 0xCD, 1024); \
459 mc_c.mc_chroma( dst1, 16, src, 32, dx, dy, w, h ); \
460 mc_a.mc_chroma( dst2, 16, src, 32, dx, dy, w, h ); \
461 /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
462 for( j=0; j<h; j++ ) \
463 for( i=w; i<4; i++ ) \
464 dst2[i+j*16] = dst1[i+j*16]; \
465 if( memcmp( buf3, buf4, 1024 ) ) \
467 fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
471 ok = 1; used_asm = 0;
/* Sweep all luma mvs in [-8,8) x [-8,8) across every partition size. */
472 for( dy = -8; dy < 8; dy++ )
473 for( dx = -8; dx < 8; dx++ )
475 MC_TEST_LUMA( 20, 18 );
476 MC_TEST_LUMA( 16, 16 );
477 MC_TEST_LUMA( 16, 8 );
478 MC_TEST_LUMA( 12, 10 );
479 MC_TEST_LUMA( 8, 16 );
480 MC_TEST_LUMA( 8, 8 );
481 MC_TEST_LUMA( 8, 4 );
482 MC_TEST_LUMA( 4, 8 );
483 MC_TEST_LUMA( 4, 4 );
485 report( "mc luma :" );
487 ok = 1; used_asm = 0;
/* Chroma uses eighth-pel mvs; [-1,9) covers all subpel phases plus the
 * adjacent fullpel positions. */
488 for( dy = -1; dy < 9; dy++ )
489 for( dx = -1; dx < 9; dx++ )
491 MC_TEST_CHROMA( 8, 8 );
492 MC_TEST_CHROMA( 8, 4 );
493 MC_TEST_CHROMA( 4, 8 );
494 MC_TEST_CHROMA( 4, 4 );
495 MC_TEST_CHROMA( 4, 2 );
496 MC_TEST_CHROMA( 2, 4 );
497 MC_TEST_CHROMA( 2, 2 );
499 report( "mc chroma :" );
501 #undef MC_TEST_CHROMA
/* Averaging functions (avg, and avg_weight with an explicit weight):
 * run each of the 10 table entries on identical buffers and compare the
 * full 1024-byte output. */
503 #define MC_TEST_AVG( name, ... ) \
504 for( i = 0, ok = 1, used_asm = 0; i < 10; i++ ) \
506 memcpy( buf3, buf1, 1024 ); \
507 memcpy( buf4, buf1, 1024 ); \
508 if( mc_a.name[i] != mc_ref.name[i] ) \
511 mc_c.name[i]( buf3, 32, buf2, 16, ##__VA_ARGS__ ); \
512 mc_a.name[i]( buf4, 32, buf2, 16, ##__VA_ARGS__ ); \
513 if( memcmp( buf3, buf4, 1024 ) ) \
516 fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
521 report( "mc avg :" );
522 ok = 1; used_asm = 0;
/* Weighted prediction: sweep weights well beyond the nominal range
 * (-64..128) and stop early once a failure is found (&& ok). */
523 for( w = -64; w <= 128 && ok; w++ )
524 MC_TEST_AVG( avg_weight, w );
525 report( "mc wpredb :" );
/* Half-pel filter: produces the h/v/c planes in one call; the first two
 * pixels of each line are skipped in the comparison (see FIXME below). */
527 if( mc_a.hpel_filter != mc_ref.hpel_filter )
529 uint8_t *src = buf1+16+2*64;
530 uint8_t *dstc[3] = { buf3+16, buf3+16+16*64, buf3+16+32*64 };
531 uint8_t *dsta[3] = { buf4+16, buf4+16+16*64, buf4+16+32*64 };
532 ok = 1; used_asm = 1;
533 memset( buf3, 0, 4096 );
534 memset( buf4, 0, 4096 );
535 mc_c.hpel_filter( dstc[0], dstc[1], dstc[2], src, 64, 48, 10 );
536 mc_a.hpel_filter( dsta[0], dsta[1], dsta[2], src, 64, 48, 10 );
538 for( j=0; j<10; j++ )
539 //FIXME ideally the first pixels would match too, but they aren't actually used
540 if( memcmp( dstc[i]+j*64+2, dsta[i]+j*64+2, 46 ) )
543 fprintf( stderr, "hpel filter differs at plane %c line %d\n", "hvc"[i], j );
544 for( k=0; k<48; k++ )
545 printf("%02x%s", dstc[i][j*64+k], (k+1)&3 ? "" : " ");
547 for( k=0; k<48; k++ )
548 printf("%02x%s", dsta[i][j*64+k], (k+1)&3 ? "" : " ");
552 report( "hpel filter :" );
/* Validate the deblocking filter functions (h/v, luma/chroma, normal and
 * intra variants) across 36 synthetic alpha/beta/tc0 strength levels and
 * two pixel-value distributions.  Returns 0 on success, nonzero on any
 * mismatch.  NOTE(review): this extract elides some source lines (e.g.
 * the declarations of i, j, tcs and the alphas/betas fill), so bodies
 * below are partial. */
558 static int check_deblock( int cpu_ref, int cpu_new )
560 x264_deblock_function_t db_c;
561 x264_deblock_function_t db_ref;
562 x264_deblock_function_t db_a;
563 int ret = 0, ok = 1, used_asm = 0;
564 int alphas[36], betas[36];
568 x264_deblock_init( 0, &db_c );
569 x264_deblock_init( cpu_ref, &db_ref );
570 x264_deblock_init( cpu_new, &db_a );
572 /* not exactly the real values of a,b,tc but close enough */
/* Build a descending table of filter strengths; tcs[i] holds the 4 tc0
 * values passed to the non-intra filter variants.
 * NOTE(review): the computation of `c` and the alphas/betas assignments
 * are in elided lines. */
574 for( i = 35; i >= 0; i-- )
578 tcs[i][0] = tcs[i][2] = (c+6)/10;
579 tcs[i][1] = tcs[i][3] = (c+9)/20;
/* For each strength level: fill buf3 with random pixels (alternating a
 * narrow 0x0f and full 0xff range to exercise different failure modes),
 * copy to buf4, run C on buf3 and asm on buf4 at the same row offset and
 * stride, and require identical whole-buffer results. */
584 #define TEST_DEBLOCK( name, ... ) \
585 for( i = 0; i < 36; i++ ) \
587 for( j = 0; j < 1024; j++ ) \
588 /* two distributions of random to excersize different failure modes */\
589 buf3[j] = rand() & (i&1 ? 0xf : 0xff ); \
590 memcpy( buf4, buf3, 1024 ); \
591 if( db_a.name != db_ref.name ) \
594 db_c.name( &buf3[8*32], 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
595 db_a.name( &buf4[8*32], 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
596 if( memcmp( buf3, buf4, 1024 ) ) \
599 fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
605 TEST_DEBLOCK( deblock_h_luma, tcs[i] );
606 TEST_DEBLOCK( deblock_v_luma, tcs[i] );
607 TEST_DEBLOCK( deblock_h_chroma, tcs[i] );
608 TEST_DEBLOCK( deblock_v_chroma, tcs[i] );
609 TEST_DEBLOCK( deblock_h_luma_intra );
610 TEST_DEBLOCK( deblock_v_luma_intra );
611 TEST_DEBLOCK( deblock_h_chroma_intra );
612 TEST_DEBLOCK( deblock_v_chroma_intra );
614 report( "deblock :" );
/* Validate quant/dequant (4x4, 8x8 and the DC-only variants) over every
 * qp from 51 down to 1 and several quantization-matrix configurations.
 * Pass/fail is accumulated separately for quant (oks[0]) and dequant
 * (oks[1]).  Returns 0 on success, nonzero on any mismatch.
 * NOTE(review): this extract elides some source lines (e.g. declarations
 * of h, i, i_cqm, qp and the x264_cqm_init call), so bodies below are
 * partial. */
619 static int check_quant( int cpu_ref, int cpu_new )
621 x264_quant_function_t qf_c;
622 x264_quant_function_t qf_ref;
623 x264_quant_function_t qf_a;
624 int16_t dct1[64] __attribute__((__aligned__(16)));
625 int16_t dct2[64] __attribute__((__aligned__(16)));
626 uint8_t cqm_buf[64] __attribute__((__aligned__(16)));
627 int ret = 0, ok, used_asm;
628 int oks[2] = {1,1}, used_asms[2] = {0,0};
/* Minimal encoder context; qp_min 26 and 8x8 transform enabled.
 * NOTE(review): `h` is declared/allocated in elided lines. */
632 memset( h, 0, sizeof(*h) );
633 h->pps = h->pps_array;
634 x264_param_default( &h->param );
635 h->param.rc.i_qp_min = 26;
636 h->param.analyse.b_transform_8x8 = 1;
/* Repeat the whole test for several scaling-list setups: flat (i_cqm==0),
 * JVT defaults (i_cqm==1), and a random custom matrix (values 10..255). */
638 for( i_cqm = 0; i_cqm < 4; i_cqm++ )
641 for( i = 0; i < 6; i++ )
642 h->pps->scaling_list[i] = x264_cqm_flat16;
643 else if( i_cqm == 1 )
644 for( i = 0; i < 6; i++ )
645 h->pps->scaling_list[i] = x264_cqm_jvt[i];
649 for( i = 0; i < 64; i++ )
650 cqm_buf[i] = 10 + rand() % 246;
652 for( i = 0; i < 64; i++ )
654 for( i = 0; i < 6; i++ )
655 h->pps->scaling_list[i] = cqm_buf;
659 x264_quant_init( h, 0, &qf_c );
660 x264_quant_init( h, cpu_ref, &qf_ref );
661 x264_quant_init( h, cpu_new, &qf_a );
/* INIT_QUANT8/INIT_QUANT4 fill dct1==dct2 with random coefficients whose
 * magnitude is scaled per position (scale1d) to mimic real transform
 * output.  TEST_QUANT_DC and TEST_QUANT then run C vs asm in place on the
 * two copies for every qp and compare the results bit-exactly. */
663 #define INIT_QUANT8() \
665 static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \
667 for( y = 0; y < 8; y++ ) \
668 for( x = 0; x < 8; x++ ) \
670 unsigned int scale = (255*scale1d[y]*scale1d[x])/16; \
671 dct1[y*8+x] = dct2[y*8+x] = (rand()%(2*scale+1))-scale; \
675 #define INIT_QUANT4() \
677 static const int scale1d[4] = {4,6,4,6}; \
679 for( y = 0; y < 4; y++ ) \
680 for( x = 0; x < 4; x++ ) \
682 unsigned int scale = 255*scale1d[y]*scale1d[x]; \
683 dct1[y*4+x] = dct2[y*4+x] = (rand()%(2*scale+1))-scale; \
687 #define TEST_QUANT_DC( name, cqm ) \
688 if( qf_a.name != qf_ref.name ) \
691 for( qp = 51; qp > 0; qp-- ) \
693 for( i = 0; i < 16; i++ ) \
694 dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \
695 qf_c.name( (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
696 qf_a.name( (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
697 if( memcmp( dct1, dct2, 16*2 ) ) \
700 fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
706 #define TEST_QUANT( qname, block, w ) \
707 if( qf_a.qname != qf_ref.qname ) \
710 for( qp = 51; qp > 0; qp-- ) \
713 qf_c.qname( (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
714 qf_a.qname( (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
715 if( memcmp( dct1, dct2, w*w*2 ) ) \
718 fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
724 TEST_QUANT( quant_8x8, CQM_8IY, 8 );
725 TEST_QUANT( quant_8x8, CQM_8PY, 8 );
726 TEST_QUANT( quant_4x4, CQM_4IY, 4 );
727 TEST_QUANT( quant_4x4, CQM_4PY, 4 );
728 TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] );
729 TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] );
/* Dequant: first quantize with the (already validated) C quant, then
 * dequantize the same data with C and asm and compare. */
731 #define TEST_DEQUANT( qname, dqname, block, w ) \
732 if( qf_a.dqname != qf_ref.dqname ) \
735 for( qp = 51; qp > 0; qp-- ) \
738 qf_c.qname( (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
739 memcpy( dct2, dct1, w*w*2 ); \
740 qf_c.dqname( (void*)dct1, h->dequant##w##_mf[block], qp ); \
741 qf_a.dqname( (void*)dct2, h->dequant##w##_mf[block], qp ); \
742 if( memcmp( dct1, dct2, w*w*2 ) ) \
745 fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
751 TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8IY, 8 );
752 TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8PY, 8 );
753 TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4IY, 4 );
754 TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4PY, 4 );
756 x264_cqm_delete( h );
/* Report quant (oks[0]/used_asms[0]) and dequant (oks[1]/used_asms[1])
 * outcomes separately. */
759 ok = oks[0]; used_asm = used_asms[0];
762 ok = oks[1]; used_asm = used_asms[1];
763 report( "dequant :" );
/* Validate the intra prediction functions themselves (as opposed to the
 * intra-satd scoring tested in check_pixel): every 4x4/8x8/8x8c/16x16
 * predictor in the asm table that differs from the reference table must
 * reproduce the C predictor's output exactly; on failure the edge pixels
 * and both outputs are dumped.  Returns 0 on success, nonzero otherwise.
 * NOTE(review): this extract elides some source lines, so bodies below
 * are partial. */
768 static int check_intra( int cpu_ref, int cpu_new )
770 int ret = 0, ok = 1, used_asm = 0;
772 DECLARE_ALIGNED( uint8_t, edge[33], 16 );
/* Three parallel tables of prediction functions: C, reference cpu flags,
 * and the new flags under test. */
775 x264_predict_t predict_16x16[4+3];
776 x264_predict_t predict_8x8c[4+3];
777 x264_predict8x8_t predict_8x8[9+3];
778 x264_predict_t predict_4x4[9+3];
779 } ip_c, ip_ref, ip_a;
781 x264_predict_16x16_init( 0, ip_c.predict_16x16 );
782 x264_predict_8x8c_init( 0, ip_c.predict_8x8c );
783 x264_predict_8x8_init( 0, ip_c.predict_8x8 );
784 x264_predict_4x4_init( 0, ip_c.predict_4x4 );
786 x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 );
787 x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c );
788 x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8 );
789 x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 );
791 x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 );
792 x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c );
793 x264_predict_8x8_init( cpu_new, ip_a.predict_8x8 );
794 x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );
/* Precompute the filtered edge consumed by the 8x8 predictors. */
796 x264_predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
/* Run C into buf3 and asm into buf4 from identical random starting
 * contents and require the whole 32*20 region to match; on mismatch dump
 * the edge pixels and both predicted blocks in hex. */
798 #define INTRA_TEST( name, dir, w, ... ) \
799 if( ip_a.name[dir] != ip_ref.name[dir] )\
802 memcpy( buf3, buf1, 32*20 );\
803 memcpy( buf4, buf1, 32*20 );\
804 ip_c.name[dir]( buf3+48, ##__VA_ARGS__ );\
805 ip_a.name[dir]( buf4+48, ##__VA_ARGS__ );\
806 if( memcmp( buf3, buf4, 32*20 ) )\
808 fprintf( stderr, #name "[%d] : [FAILED]\n", dir );\
811 for(k=-1; k<16; k++)\
812 printf("%2x ", edge[16+k]);\
815 printf("%2x ", edge[14-j]);\
817 printf("%2x ", buf4[48+k+j*32]);\
824 printf("%2x ", buf3[48+k+j*32]);\
830 for( i = 0; i < 12; i++ )
831 INTRA_TEST( predict_4x4, i, 4 );
832 for( i = 0; i < 7; i++ )
833 INTRA_TEST( predict_8x8c, i, 8 );
834 for( i = 0; i < 7; i++ )
835 INTRA_TEST( predict_16x16, i, 16 );
836 for( i = 0; i < 12; i++ )
837 INTRA_TEST( predict_8x8, i, 8, edge );
839 report( "intra pred :" );
/* Run every check_* suite for one (reference, new) cpu-flag pair.
 * Returns the sum of their results: 0 iff every suite passed. */
843 int check_all( int cpu_ref, int cpu_new )
845 return check_pixel( cpu_ref, cpu_new )
846 + check_dct( cpu_ref, cpu_new )
847 + check_mc( cpu_ref, cpu_new )
848 + check_intra( cpu_ref, cpu_new )
849 + check_deblock( cpu_ref, cpu_new )
850 + check_quant( cpu_ref, cpu_new );
/* Announce a new cpu capability set and run all checks, comparing the new
 * flag combination against the previously tested one.
 * NOTE(review): the lines that actually merge `flags` into *cpu_ref and
 * *cpu_new are elided in this extract (embedded line numbers jump from
 * 853 to 857) — confirm against the full source. */
853 int add_flags( int *cpu_ref, int *cpu_new, int flags, const char *name )
857 fprintf( stderr, "x264: %s\n", name );
858 return check_all( *cpu_ref, *cpu_new );
/* Test driver: allocate the shared buffers, seed the RNG (from argv[1] or
 * the current time), fill the inputs with random bytes, then run the full
 * check suite for each progressively larger cpu-capability set that the
 * host supports (MMXEXT -> cacheline variants -> SSE2 -> SSE3 -> SSSE3,
 * or Altivec), accumulating failures in `ret`.
 * NOTE(review): this extract elides some source lines and the end of
 * main (return statement / final braces) is not visible here. */
861 int main(int argc, char *argv[])
864 int cpu0 = 0, cpu1 = 0;
/* buf1/buf2: random inputs; buf3/buf4: larger output scratch (the hpel
 * filter test uses up to 4096 bytes). */
867 buf1 = x264_malloc( 1024 ); /* 32 x 32 */
868 buf2 = x264_malloc( 1024 );
869 buf3 = x264_malloc( 4096 );
870 buf4 = x264_malloc( 4096 );
/* Seed is reproducible: pass it as argv[1] to replay a failing run. */
872 i = ( argc > 1 ) ? atoi(argv[1]) : x264_mdate();
873 fprintf( stderr, "x264: using random seed %u\n", i );
876 for( i = 0; i < 1024; i++ )
878 buf1[i] = rand() & 0xFF;
879 buf2[i] = rand() & 0xFF;
880 buf3[i] = buf4[i] = 0;
/* Each add_flags call tests the new flag set against the previous one,
 * so only the newly enabled code paths are (re)verified; cacheline-split
 * variants are masked back out before moving to the next tier. */
884 if( x264_cpu_detect() & X264_CPU_MMXEXT )
886 ret |= add_flags( &cpu0, &cpu1, X264_CPU_MMX | X264_CPU_MMXEXT, "MMXEXT" );
887 ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "MMXEXT Cache64" );
888 cpu1 &= ~X264_CPU_CACHELINE_64;
889 ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32, "MMXEXT Cache32" );
891 if( x264_cpu_detect() & X264_CPU_SSE2 )
893 cpu1 &= ~(X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_32);
894 ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE | X264_CPU_SSE2, "SSE2" );
895 ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSE2 Cache64" );
897 if( x264_cpu_detect() & X264_CPU_SSE3 )
898 ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3, "SSE3" );
899 if( x264_cpu_detect() & X264_CPU_SSSE3 )
901 cpu1 &= ~X264_CPU_CACHELINE_SPLIT;
902 ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSSE3, "SSSE3" );
903 ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_SPLIT|X264_CPU_CACHELINE_64, "SSSE3 Cache64" );
906 if( x264_cpu_detect() & X264_CPU_ALTIVEC )
908 fprintf( stderr, "x264: ALTIVEC against C\n" );
909 ret = check_all( 0, X264_CPU_ALTIVEC );
915 fprintf( stderr, "x264: All tests passed Yeah :)\n" );
918 fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );