git.sesse.net Git - x264/blob - tools/checkasm.c

   1 #include <stdlib.h>
   2 #include <math.h>
   3
   4 #include "common/common.h"
   5 #include "common/cpu.h"
   6 #ifdef HAVE_MMX
   7 #include "common/i386/pixel.h"
   8 #include "common/i386/dct.h"
   9 #include "common/i386/mc.h"
  10 #endif
  11 #ifdef ARCH_PPC
  12 #include "common/ppc/pixel.h"
  13 #include "common/ppc/mc.h"
  14 #endif
  15
  16 /* buf1, buf2: initialised to random data; the tests should not write into them */
  17 uint8_t * buf1, * buf2;
  18 /* buf3, buf4: used to store output */
  19 uint8_t * buf3, * buf4;
  20 /* buf5: temp */
  21 uint8_t * buf5;
  22
  23 #define report( name ) { \
  24     if( used_asm ) \
  25         fprintf( stderr, " - %-21s [%s]\n", name, ok ? "OK" : "FAILED" ); \
  26     if( !ok ) ret = -1; \
  27 }
  28
  29 static int check_pixel( int cpu_ref, int cpu_new )
  30 {
  31     x264_pixel_function_t pixel_c;
  32     x264_pixel_function_t pixel_ref;
  33     x264_pixel_function_t pixel_asm;
  34     x264_predict_t predict_16x16[4+3];
  35     x264_predict_t predict_8x8c[4+3];
  36     x264_predict_t predict_4x4[9+3];
  37     x264_predict8x8_t predict_8x8[9+3];
  38     DECLARE_ALIGNED( uint8_t, edge[33], 8 );
  39     int ret = 0, ok, used_asm;
  40     int i, j;
  41
  42     x264_pixel_init( 0, &pixel_c );
  43     x264_pixel_init( cpu_ref, &pixel_ref );
  44     x264_pixel_init( cpu_new, &pixel_asm );
  45     x264_predict_16x16_init( 0, predict_16x16 );
  46     x264_predict_8x8c_init( 0, predict_8x8c );
  47     x264_predict_8x8_init( 0, predict_8x8 );
  48     x264_predict_4x4_init( 0, predict_4x4 );
  49     x264_predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
  50
  51 #define TEST_PIXEL( name ) \
  52     for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
  53     { \
  54         int res_c, res_asm; \
  55         if( pixel_asm.name[i] != pixel_ref.name[i] ) \
  56         { \
  57             used_asm = 1; \
  58             res_c   = pixel_c.name[i]( buf1, 32, buf2, 16 ); \
  59             res_asm = pixel_asm.name[i]( buf1, 32, buf2, 16 ); \
  60             if( res_c != res_asm ) \
  61             { \
  62                 ok = 0; \
  63                 fprintf( stderr, #name "[%d]: %d != %d [FAILED]\n", i, res_c, res_asm ); \
  64             } \
  65         } \
  66     } \
  67     report( "pixel " #name " :" );
  68
  69     TEST_PIXEL( sad );
  70     TEST_PIXEL( ssd );
  71     TEST_PIXEL( satd );
  72     TEST_PIXEL( sa8d );
  73
  74 #define TEST_PIXEL_X( N ) \
  75     for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
  76     { \
  77         int res_c[4]={0}, res_asm[4]={0}; \
  78         if( pixel_asm.sad_x##N[i] && pixel_asm.sad_x##N[i] != pixel_ref.sad_x##N[i] ) \
  79         { \
  80             used_asm = 1; \
  81             res_c[0] = pixel_c.sad[i]( buf1, 16, buf2, 32 ); \
  82             res_c[1] = pixel_c.sad[i]( buf1, 16, buf2+30, 32 ); \
  83             res_c[2] = pixel_c.sad[i]( buf1, 16, buf2+1, 32 ); \
  84             if(N==4) \
  85             { \
  86                 res_c[3] = pixel_c.sad[i]( buf1, 16, buf2+99, 32 ); \
  87                 pixel_asm.sad_x4[i]( buf1, buf2, buf2+30, buf2+1, buf2+99, 32, res_asm ); \
  88             } \
  89             else \
  90                 pixel_asm.sad_x3[i]( buf1, buf2, buf2+30, buf2+1, 32, res_asm ); \
  91             if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
  92             { \
  93                 ok = 0; \
  94                 fprintf( stderr, "sad_x"#N"[%d]: %d,%d,%d,%d != %d,%d,%d,%d [FAILED]\n", \
  95                          i, res_c[0], res_c[1], res_c[2], res_c[3], \
  96                          res_asm[0], res_asm[1], res_asm[2], res_asm[3] ); \
  97             } \
  98         } \
  99     } \
 100     report( "pixel sad_x"#N" :" );
 101
 102     TEST_PIXEL_X(3);
 103     TEST_PIXEL_X(4);
 104
 105 #define TEST_INTRA_SATD( name, pred, satd, i8x8, ... ) \
 106     if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
 107     { \
 108         int res_c[3], res_asm[3]; \
 109         used_asm = 1; \
 110         memcpy( buf3, buf2, 1024 ); \
 111         for( i=0; i<3; i++ ) \
 112         { \
 113             pred[i]( buf3+40, ##__VA_ARGS__ ); \
 114             res_c[i] = pixel_c.satd( buf1+40, 16, buf3+40, 32 ); \
 115         } \
 116         pixel_asm.name( buf1+40, i8x8 ? edge : buf3+40, res_asm ); \
 117         if( memcmp(res_c, res_asm, sizeof(res_c)) ) \
 118         { \
 119             ok = 0; \
 120             fprintf( stderr, #name": %d,%d,%d != %d,%d,%d [FAILED]\n", \
 121                      res_c[0], res_c[1], res_c[2], \
 122                      res_asm[0], res_asm[1], res_asm[2] ); \
 123         } \
 124     }
 125
 126     ok = 1; used_asm = 0;
 127     TEST_INTRA_SATD( intra_satd_x3_16x16, predict_16x16, satd[PIXEL_16x16], 0 );
 128     TEST_INTRA_SATD( intra_satd_x3_8x8c, predict_8x8c, satd[PIXEL_8x8], 0 );
 129     TEST_INTRA_SATD( intra_satd_x3_4x4, predict_4x4, satd[PIXEL_4x4], 0 );
 130     TEST_INTRA_SATD( intra_sa8d_x3_8x8, predict_8x8, sa8d[PIXEL_8x8], 1, edge );
 131     report( "intra satd_x3 :" );
 132
 133     if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
 134         pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
 135     {
 136         float res_c, res_a;
 137         ok = 1;
 138         x264_cpu_restore( cpu_new );
 139         res_c = x264_pixel_ssim_wxh( &pixel_c,   buf1+2, 32, buf2+2, 32, 32, 28 );
 140         res_a = x264_pixel_ssim_wxh( &pixel_asm, buf1+2, 32, buf2+2, 32, 32, 28 );
 141         if( fabs(res_c - res_a) > 1e-8 )
 142         {
 143             ok = 0;
 144             fprintf( stderr, "ssim: %.7f != %.7f [FAILED]\n", res_c, res_a );
 145         }
 146         report( "ssim :" );
 147     }
 148
 149     ok = 1; used_asm = 0;
 150     for( i=0; i<4; i++ )
 151         if( pixel_asm.ads[i] != pixel_ref.ads[i] )
 152         {
 153             uint16_t res_a[32], res_c[32];
 154             uint16_t sums[72];
 155             int dc[4];
 156             for( j=0; j<72; j++ )
 157                 sums[j] = rand() & 0x3fff;
 158             for( j=0; j<4; j++ )
 159                 dc[j] = rand() & 0x3fff;
 160             used_asm = 1;
 161             pixel_c.ads[i]( dc, sums, 32, res_c, 32 );
 162             pixel_asm.ads[i]( dc, sums, 32, res_a, 32 );
 163             if( memcmp(res_a, res_c, sizeof(res_c)) )
 164                 ok = 0;
 165         }
 166     report( "esa ads:" );
 167
 168     return ret;
 169 }
 170
 171 static int check_dct( int cpu_ref, int cpu_new )
 172 {
 173     x264_dct_function_t dct_c;
 174     x264_dct_function_t dct_ref;
 175     x264_dct_function_t dct_asm;
 176     int ret = 0, ok, used_asm;
 177     int16_t dct1[16][4][4] __attribute__((aligned(16)));
 178     int16_t dct2[16][4][4] __attribute__((aligned(16)));
 179
 180     x264_dct_init( 0, &dct_c );
 181     x264_dct_init( cpu_ref, &dct_ref);
 182     x264_dct_init( cpu_new, &dct_asm );
 183 #define TEST_DCT( name, t1, t2, size ) \
 184     if( dct_asm.name != dct_ref.name ) \
 185     { \
 186         used_asm = 1; \
 187         dct_c.name( t1, buf1, buf2 ); \
 188         dct_asm.name( t2, buf1, buf2 ); \
 189         if( memcmp( t1, t2, size ) ) \
 190         { \
 191             ok = 0; \
 192             fprintf( stderr, #name " [FAILED]\n" ); \
 193         } \
 194     }
 195     ok = 1; used_asm = 0;
 196     TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 );
 197     TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 );
 198     TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 );
 199     report( "sub_dct4 :" );
 200
 201     ok = 1; used_asm = 0;
 202     TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64*2 );
 203     TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*2*4 );
 204     report( "sub_dct8 :" );
 205 #undef TEST_DCT
 206
 207     /* copy coefs because idct8 modifies them in place */
 208     memcpy( buf5, dct1, 512 );
 209
 210 #define TEST_IDCT( name ) \
 211     if( dct_asm.name != dct_ref.name ) \
 212     { \
 213         used_asm = 1; \
 214         memcpy( buf3, buf1, 32*32 ); \
 215         memcpy( buf4, buf1, 32*32 ); \
 216         memcpy( dct1, buf5, 512 ); \
 217         memcpy( dct2, buf5, 512 ); \
 218         dct_c.name( buf3, (void*)dct1 ); \
 219         dct_asm.name( buf4, (void*)dct2 ); \
 220         if( memcmp( buf3, buf4, 32*32 ) ) \
 221         { \
 222             ok = 0; \
 223             fprintf( stderr, #name " [FAILED]\n" ); \
 224         } \
 225     }
 226     ok = 1; used_asm = 0;
 227     TEST_IDCT( add4x4_idct );
 228     TEST_IDCT( add8x8_idct );
 229     TEST_IDCT( add16x16_idct );
 230     report( "add_idct4 :" );
 231
 232     ok = 1; used_asm = 0;
 233     TEST_IDCT( add8x8_idct8 );
 234     TEST_IDCT( add16x16_idct8 );
 235     report( "add_idct8 :" );
 236 #undef TEST_IDCT
 237
 238     ok = 1; used_asm = 0;
 239     if( dct_asm.dct4x4dc != dct_ref.dct4x4dc )
 240     {
 241         int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
 242         int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
 243         used_asm = 1;
 244         dct_c.dct4x4dc( dct1 );
 245         dct_asm.dct4x4dc( dct2 );
 246         if( memcmp( dct1, dct2, 32 ) )
 247         {
 248             ok = 0;
 249             fprintf( stderr, " - dct4x4dc :        [FAILED]\n" );
 250         }
 251     }
 252     if( dct_asm.dct4x4dc != dct_ref.dct4x4dc )
 253     {
 254         int16_t dct1[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
 255         int16_t dct2[4][4] __attribute((aligned(16))) = { {-12, 42, 23, 67},{2, 90, 89,56}, {67,43,-76,91},{56,-78,-54,1}};
 256         used_asm = 1;
 257         dct_c.idct4x4dc( dct1 );
 258         dct_asm.idct4x4dc( dct2 );
 259         if( memcmp( dct1, dct2, 32 ) )
 260         {
 261             ok = 0;
 262             fprintf( stderr, " - idct4x4dc :        [FAILED]\n" );
 263         }
 264     }
 265     report( "(i)dct4x4dc :" );
 266
 267     ok = 1; used_asm = 0;
 268     if( dct_asm.dct2x2dc != dct_ref.dct2x2dc )
 269     {
 270         int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
 271         int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
 272         used_asm = 1;
 273         dct_c.dct2x2dc( dct1 );
 274         dct_asm.dct2x2dc( dct2 );
 275         if( memcmp( dct1, dct2, 4*2 ) )
 276         {
 277             ok = 0;
 278             fprintf( stderr, " - dct2x2dc :        [FAILED]\n" );
 279         }
 280     }
 281     if( dct_asm.idct2x2dc != dct_ref.idct2x2dc )
 282     {
 283         int16_t dct1[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
 284         int16_t dct2[2][2] __attribute((aligned(16))) = { {-12, 42},{2, 90}};
 285         used_asm = 1;
 286         dct_c.idct2x2dc( dct1 );
 287         dct_asm.idct2x2dc( dct2 );
 288         if( memcmp( dct1, dct2, 4*2 ) )
 289         {
 290             ok = 0;
 291             fprintf( stderr, " - idct2x2dc :       [FAILED]\n" );
 292         }
 293     }
 294     report( "(i)dct2x2dc :" );
 295
 296     return ret;
 297 }
 298
 299 static int check_mc( int cpu_ref, int cpu_new )
 300 {
 301     x264_mc_functions_t mc_c;
 302     x264_mc_functions_t mc_ref;
 303     x264_mc_functions_t mc_a;
 304     x264_pixel_function_t pixel;
 305
 306     uint8_t *src     = &buf1[2*32+2];
 307     uint8_t *src2[4] = { &buf1[2*32+2],  &buf1[7*32+2],
 308                          &buf1[12*32+2], &buf1[17*32+2] };
 309     uint8_t *dst1    = &buf3[2*32+2];
 310     uint8_t *dst2    = &buf4[2*32+2];
 311
 312     int dx, dy, i, j, w;
 313     int ret = 0, ok, used_asm;
 314
 315     x264_mc_init( 0, &mc_c );
 316     x264_mc_init( cpu_ref, &mc_ref );
 317     x264_mc_init( cpu_new, &mc_a );
 318     x264_pixel_init( 0, &pixel );
 319
 320 #define MC_TEST_LUMA( w, h ) \
 321         if( mc_a.mc_luma != mc_ref.mc_luma ) \
 322         { \
 323             used_asm = 1; \
 324             memset(buf3, 0xCD, 1024); \
 325             memset(buf4, 0xCD, 1024); \
 326             mc_c.mc_luma( dst1, 16, src2, 32, dx, dy, w, h ); \
 327             mc_a.mc_luma( dst2, 16, src2, 32, dx, dy, w, h ); \
 328             if( memcmp( buf3, buf4, 1024 ) ) \
 329             { \
 330                 fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
 331                 ok = 0; \
 332             } \
 333         } \
 334         if( mc_a.get_ref != mc_ref.get_ref ) \
 335         { \
 336             uint8_t *ref = dst2; \
 337             int ref_stride = 16; \
 338             used_asm = 1; \
 339             memset(buf3, 0xCD, 1024); \
 340             memset(buf4, 0xCD, 1024); \
 341             mc_c.mc_luma( dst1, 16, src2, 32, dx, dy, w, h ); \
 342             ref = mc_a.get_ref( ref, &ref_stride, src2, 32, dx, dy, w, h ); \
 343             if( pixel.sad[PIXEL_##w##x##h]( dst1, 16, ref, ref_stride ) ) \
 344             { \
 345                 fprintf( stderr, "get_ref[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
 346                 ok = 0; \
 347             } \
 348         }
 349
 350 #define MC_TEST_CHROMA( w, h ) \
 351         if( mc_a.mc_chroma != mc_ref.mc_chroma ) \
 352         { \
 353             used_asm = 1; \
 354             memset(buf3, 0xCD, 1024); \
 355             memset(buf4, 0xCD, 1024); \
 356             mc_c.mc_chroma( dst1, 16, src, 32, dx, dy, w, h ); \
 357             mc_a.mc_chroma( dst2, 16, src, 32, dx, dy, w, h ); \
 358             /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
 359             for( j=0; j<h; j++ ) \
 360                 for( i=w; i<4; i++ ) \
 361                     dst2[i+j*16] = dst1[i+j*16]; \
 362             if( memcmp( buf3, buf4, 1024 ) ) \
 363             { \
 364                 fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
 365                 ok = 0; \
 366             } \
 367         }
 368     ok = 1; used_asm = 0;
 369     for( dy = -8; dy < 8; dy++ )
 370         for( dx = -8; dx < 8; dx++ )
 371         {
 372             MC_TEST_LUMA( 16, 16 );
 373             MC_TEST_LUMA( 16, 8 );
 374             MC_TEST_LUMA( 8, 16 );
 375             MC_TEST_LUMA( 8, 8 );
 376             MC_TEST_LUMA( 8, 4 );
 377             MC_TEST_LUMA( 4, 8 );
 378             MC_TEST_LUMA( 4, 4 );
 379         }
 380     report( "mc luma :" );
 381
 382     ok = 1; used_asm = 0;
 383     for( dy = -1; dy < 9; dy++ )
 384         for( dx = -1; dx < 9; dx++ )
 385         {
 386             MC_TEST_CHROMA( 8, 8 );
 387             MC_TEST_CHROMA( 8, 4 );
 388             MC_TEST_CHROMA( 4, 8 );
 389             MC_TEST_CHROMA( 4, 4 );
 390             MC_TEST_CHROMA( 4, 2 );
 391             MC_TEST_CHROMA( 2, 4 );
 392             MC_TEST_CHROMA( 2, 2 );
 393         }
 394     report( "mc chroma :" );
 395 #undef MC_TEST_LUMA
 396 #undef MC_TEST_CHROMA
 397
 398 #define MC_TEST_AVG( name, ... ) \
 399     for( i = 0, ok = 1, used_asm = 0; i < 10; i++ ) \
 400     { \
 401         memcpy( buf3, buf1, 1024 ); \
 402         memcpy( buf4, buf1, 1024 ); \
 403         if( mc_a.name[i] != mc_ref.name[i] ) \
 404         { \
 405             used_asm = 1; \
 406             mc_c.name[i]( buf3, 32, buf2, 16, ##__VA_ARGS__ ); \
 407             mc_a.name[i]( buf4, 32, buf2, 16, ##__VA_ARGS__ ); \
 408             if( memcmp( buf3, buf4, 1024 ) )               \
 409             { \
 410                 ok = 0; \
 411                 fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
 412             } \
 413         } \
 414     }
 415     MC_TEST_AVG( avg );
 416     report( "mc avg :" );
 417     ok = 1; used_asm = 0;
 418     for( w = -64; w <= 128 && ok; w++ )
 419         MC_TEST_AVG( avg_weight, w );
 420     report( "mc wpredb :" );
 421
 422     return ret;
 423 }
 424
 425 static int check_deblock( int cpu_ref, int cpu_new )
 426 {
 427     x264_deblock_function_t db_c;
 428     x264_deblock_function_t db_ref;
 429     x264_deblock_function_t db_a;
 430     int ret = 0, ok = 1, used_asm = 0;
 431     int alphas[36], betas[36];
 432     int8_t tcs[36][4];
 433     int a, c, i, j;
 434
 435     x264_deblock_init( 0, &db_c );
 436     x264_deblock_init( cpu_ref, &db_ref );
 437     x264_deblock_init( cpu_new, &db_a );
 438
 439     /* not exactly the real values of a,b,tc but close enough */
 440     a = 255; c = 250;
 441     for( i = 35; i >= 0; i-- )
 442     {
 443         alphas[i] = a;
 444         betas[i] = (i+1)/2;
 445         tcs[i][0] = tcs[i][2] = (c+6)/10;
 446         tcs[i][1] = tcs[i][3] = (c+9)/20;
 447         a = a*9/10;
 448         c = c*9/10;
 449     }
 450
 451 #define TEST_DEBLOCK( name, ... ) \
 452     for( i = 0; i < 36; i++ ) \
 453     { \
 454         for( j = 0; j < 1024; j++ ) \
 455             /* two distributions of random to excersize different failure modes */\
 456             buf1[j] = rand() & (i&1 ? 0xf : 0xff ); \
 457         memcpy( buf3, buf1, 1024 ); \
 458         memcpy( buf4, buf1, 1024 ); \
 459         if( db_a.name != db_ref.name ) \
 460         { \
 461             used_asm = 1; \
 462             db_c.name( &buf3[8*32], 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
 463             db_a.name( &buf4[8*32], 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
 464             if( memcmp( buf3, buf4, 1024 ) )               \
 465             { \
 466                 ok = 0; \
 467                 fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
 468                 break; \
 469             } \
 470         } \
 471     }
 472
 473     TEST_DEBLOCK( deblock_h_luma, tcs[i] );
 474     TEST_DEBLOCK( deblock_v_luma, tcs[i] );
 475     TEST_DEBLOCK( deblock_h_chroma, tcs[i] );
 476     TEST_DEBLOCK( deblock_v_chroma, tcs[i] );
 477     TEST_DEBLOCK( deblock_h_luma_intra );
 478     TEST_DEBLOCK( deblock_v_luma_intra );
 479     TEST_DEBLOCK( deblock_h_chroma_intra );
 480     TEST_DEBLOCK( deblock_v_chroma_intra );
 481
 482     report( "deblock :" );
 483
 484     return ret;
 485 }
 486
 487 static int check_quant( int cpu_ref, int cpu_new )
 488 {
 489     x264_quant_function_t qf_c;
 490     x264_quant_function_t qf_ref;
 491     x264_quant_function_t qf_a;
 492     int16_t dct1[64]    __attribute__((__aligned__(16)));
 493     int16_t dct2[64]    __attribute__((__aligned__(16)));
 494     uint8_t cqm_buf[64] __attribute__((__aligned__(16)));
 495     int ret = 0, ok, used_asm;
 496     int oks[2] = {1,1}, used_asms[2] = {0,0};
 497     int i, i_cqm, qp;
 498     x264_t h_buf;
 499     x264_t *h = &h_buf;
 500     h->pps = h->pps_array;
 501     x264_param_default( &h->param );
 502     h->param.rc.i_qp_min = 26;
 503
 504     for( i_cqm = 0; i_cqm < 4; i_cqm++ )
 505     {
 506         if( i_cqm == 0 )
 507             for( i = 0; i < 6; i++ )
 508                 h->pps->scaling_list[i] = x264_cqm_flat16;
 509         else if( i_cqm == 1 )
 510             for( i = 0; i < 6; i++ )
 511                 h->pps->scaling_list[i] = x264_cqm_jvt[i];
 512         else
 513         {
 514             if( i_cqm == 2 )
 515                 for( i = 0; i < 64; i++ )
 516                     cqm_buf[i] = 10 + rand() % 246;
 517             else
 518                 for( i = 0; i < 64; i++ )
 519                     cqm_buf[i] = 1;
 520             for( i = 0; i < 6; i++ )
 521                 h->pps->scaling_list[i] = cqm_buf;
 522         }
 523
 524         x264_cqm_init( h );
 525         x264_quant_init( h, 0, &qf_c );
 526         x264_quant_init( h, cpu_ref, &qf_ref );
 527         x264_quant_init( h, cpu_new, &qf_a );
 528
 529 #define INIT_QUANT8() \
 530         { \
 531             static const int scale1d[8] = {32,31,24,31,32,31,24,31}; \
 532             int x, y; \
 533             for( y = 0; y < 8; y++ ) \
 534                 for( x = 0; x < 8; x++ ) \
 535                 { \
 536                     unsigned int scale = (255*scale1d[y]*scale1d[x])/16; \
 537                     dct1[y*8+x] = dct2[y*8+x] = (rand()%(2*scale+1))-scale; \
 538                 } \
 539         }
 540
 541 #define INIT_QUANT4() \
 542         { \
 543             static const int scale1d[4] = {4,6,4,6}; \
 544             int x, y; \
 545             for( y = 0; y < 4; y++ ) \
 546                 for( x = 0; x < 4; x++ ) \
 547                 { \
 548                     unsigned int scale = 255*scale1d[y]*scale1d[x]; \
 549                     dct1[y*4+x] = dct2[y*4+x] = (rand()%(2*scale+1))-scale; \
 550                 } \
 551         }
 552
 553 #define TEST_QUANT_DC( name, cqm ) \
 554         if( qf_a.name != qf_ref.name ) \
 555         { \
 556             used_asms[0] = 1; \
 557             for( qp = 51; qp > 0; qp-- ) \
 558             { \
 559                 for( i = 0; i < 16; i++ ) \
 560                     dct1[i] = dct2[i] = (rand() & 0x1fff) - 0xfff; \
 561                 qf_c.name( (void*)dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
 562                 qf_a.name( (void*)dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
 563                 if( memcmp( dct1, dct2, 16*2 ) )       \
 564                 { \
 565                     oks[0] = 0; \
 566                     fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
 567                     break; \
 568                 } \
 569             } \
 570         }
 571
 572 #define TEST_QUANT( qname, block, w ) \
 573         if( qf_a.qname != qf_ref.qname ) \
 574         { \
 575             used_asms[0] = 1; \
 576             for( qp = 51; qp > 0; qp-- ) \
 577             { \
 578                 INIT_QUANT##w() \
 579                 qf_c.qname( (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
 580                 qf_a.qname( (void*)dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
 581                 if( memcmp( dct1, dct2, w*w*2 ) ) \
 582                 { \
 583                     oks[0] = 0; \
 584                     fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
 585                     break; \
 586                 } \
 587             } \
 588         }
 589
 590         TEST_QUANT( quant_8x8, CQM_8IY, 8 );
 591         TEST_QUANT( quant_8x8, CQM_8PY, 8 );
 592         TEST_QUANT( quant_4x4, CQM_4IY, 4 );
 593         TEST_QUANT( quant_4x4, CQM_4PY, 4 );
 594         TEST_QUANT_DC( quant_4x4_dc, **h->quant4_mf[CQM_4IY] );
 595         TEST_QUANT_DC( quant_2x2_dc, **h->quant4_mf[CQM_4IC] );
 596
 597 #define TEST_DEQUANT( qname, dqname, block, w ) \
 598         if( qf_a.dqname != qf_ref.dqname ) \
 599         { \
 600             used_asms[1] = 1; \
 601             for( qp = 51; qp > 0; qp-- ) \
 602             { \
 603                 INIT_QUANT##w() \
 604                 qf_c.qname( (void*)dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
 605                 memcpy( dct2, dct1, w*w*2 ); \
 606                 qf_c.dqname( (void*)dct1, h->dequant##w##_mf[block], qp ); \
 607                 qf_a.dqname( (void*)dct2, h->dequant##w##_mf[block], qp ); \
 608                 if( memcmp( dct1, dct2, w*w*2 ) ) \
 609                 { \
 610                     oks[1] = 0; \
 611                     fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
 612                     break; \
 613                 } \
 614             } \
 615         }
 616
 617         TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8IY, 8 );
 618         TEST_DEQUANT( quant_8x8, dequant_8x8, CQM_8PY, 8 );
 619         TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4IY, 4 );
 620         TEST_DEQUANT( quant_4x4, dequant_4x4, CQM_4PY, 4 );
 621     }
 622
 623     ok = oks[0]; used_asm = used_asms[0];
 624     report( "quant :" );
 625
 626     ok = oks[1]; used_asm = used_asms[1];
 627     report( "dequant :" );
 628
 629     return ret;
 630 }
 631
 632 static int check_intra( int cpu_ref, int cpu_new )
 633 {
 634     int ret = 0, ok = 1, used_asm = 0;
 635     int i;
 636     DECLARE_ALIGNED( uint8_t, edge[33], 8 );
 637     struct
 638     {
 639         x264_predict_t      predict_16x16[4+3];
 640         x264_predict_t      predict_8x8c[4+3];
 641         x264_predict8x8_t   predict_8x8[9+3];
 642         x264_predict_t      predict_4x4[9+3];
 643     } ip_c, ip_ref, ip_a;
 644
 645     x264_predict_16x16_init( 0, ip_c.predict_16x16 );
 646     x264_predict_8x8c_init( 0, ip_c.predict_8x8c );
 647     x264_predict_8x8_init( 0, ip_c.predict_8x8 );
 648     x264_predict_4x4_init( 0, ip_c.predict_4x4 );
 649
 650     x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 );
 651     x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c );
 652     x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8 );
 653     x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 );
 654
 655     x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 );
 656     x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c );
 657     x264_predict_8x8_init( cpu_new, ip_a.predict_8x8 );
 658     x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );
 659
 660     x264_predict_8x8_filter( buf1+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
 661
 662 #define INTRA_TEST( name, dir, ... ) \
 663     if( ip_a.name[dir] != ip_ref.name[dir] )\
 664     { \
 665         used_asm = 1; \
 666         memcpy( buf3, buf1, 32*20 );\
 667         memcpy( buf4, buf1, 32*20 );\
 668         ip_c.name[dir]( buf3+48, ##__VA_ARGS__ );\
 669         ip_a.name[dir]( buf4+48, ##__VA_ARGS__ );\
 670         if( memcmp( buf3, buf4, 32*20 ) )\
 671         {\
 672             fprintf( stderr, #name "[%d] :  [FAILED]\n", dir );\
 673             ok = 0;\
 674             int j,k;\
 675             for(k=-1; k<16; k++)\
 676                 printf("%2x ", edge[16+k]);\
 677             printf("\n");\
 678             for(j=0; j<8; j++){\
 679                 printf("%2x ", edge[j]);\
 680                 for(k=0; k<8; k++)\
 681                     printf("%2x ", buf4[48+k+j*32]);\
 682                 printf("\n");\
 683             }\
 684             printf("\n");\
 685             for(j=0; j<8; j++){\
 686                 printf("   ");\
 687                 for(k=0; k<8; k++)\
 688                     printf("%2x ", buf3[48+k+j*32]);\
 689                 printf("\n");\
 690             }\
 691         }\
 692     }
 693
 694     for( i = 0; i < 12; i++ )
 695         INTRA_TEST( predict_4x4, i );
 696     for( i = 0; i < 7; i++ )
 697         INTRA_TEST( predict_8x8c, i );
 698     for( i = 0; i < 7; i++ )
 699         INTRA_TEST( predict_16x16, i );
 700     for( i = 0; i < 12; i++ )
 701         INTRA_TEST( predict_8x8, i, edge );
 702
 703     report( "intra pred :" );
 704     return ret;
 705 }
 706
 707 int check_all( int cpu_ref, int cpu_new )
 708 {
 709     return check_pixel( cpu_ref, cpu_new )
 710          + check_dct( cpu_ref, cpu_new )
 711          + check_mc( cpu_ref, cpu_new )
 712          + check_intra( cpu_ref, cpu_new )
 713          + check_deblock( cpu_ref, cpu_new )
 714          + check_quant( cpu_ref, cpu_new );
 715 }
 716
 717 int main(int argc, char *argv[])
 718 {
 719     int ret = 0;
 720     int cpu0 = 0, cpu1 = 0;
 721     int i;
 722
 723     buf1 = x264_malloc( 1024 ); /* 32 x 32 */
 724     buf2 = x264_malloc( 1024 );
 725     buf3 = x264_malloc( 1024 );
 726     buf4 = x264_malloc( 1024 );
 727     buf5 = x264_malloc( 1024 );
 728
 729     i = ( argc > 1 ) ? atoi(argv[1]) : x264_mdate();
 730     fprintf( stderr, "x264: using random seed %u\n", i );
 731     srand( i );
 732
 733     for( i = 0; i < 1024; i++ )
 734     {
 735         buf1[i] = rand() & 0xFF;
 736         buf2[i] = rand() & 0xFF;
 737         buf3[i] = buf4[i] = 0;
 738     }
 739
 740 #ifdef HAVE_MMX
 741     fprintf( stderr, "x264: MMXEXT against C\n" );
 742     cpu1 = X264_CPU_MMX | X264_CPU_MMXEXT;
 743     ret = check_all( 0, cpu1 );
 744
 745     if( x264_cpu_detect() & X264_CPU_SSE2 )
 746     {
 747         fprintf( stderr, "\nx264: SSE2 against C\n" );
 748         cpu0 = cpu1;
 749         cpu1 |= X264_CPU_SSE | X264_CPU_SSE2;
 750         ret |= check_all( cpu0, cpu1 );
 751
 752         if( x264_cpu_detect() & X264_CPU_SSSE3 )
 753         {
 754             fprintf( stderr, "\nx264: SSSE3 against C\n" );
 755             cpu0 = cpu1;
 756             cpu1 |= X264_CPU_SSE3 | X264_CPU_SSSE3;
 757             ret |= check_all( cpu0, cpu1 );
 758         }
 759     }
 760 #elif ARCH_PPC
 761     if( x264_cpu_detect() & X264_CPU_ALTIVEC )
 762     {
 763         fprintf( stderr, "x264: ALTIVEC against C\n" );
 764         ret = check_all( 0, X264_CPU_ALTIVEC );
 765     }
 766 #endif
 767
 768     if( ret == 0 )
 769     {
 770         fprintf( stderr, "x264: All tests passed Yeah :)\n" );
 771         return 0;
 772     }
 773     fprintf( stderr, "x264: at least one test has failed. Go and fix that Right Now!\n" );
 774     return -1;
 775 }
 776