git.sesse.net Git - ffmpeg/blob - libavcodec/dct-test.c

   1 /*
   2  * (c) 2001 Fabrice Bellard
   3  *     2007 Marc Hoffman <marc.hoffman@analog.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * DCT test (c) 2001 Fabrice Bellard
  25  * Started from sample code by Juan J. Sierralta P.
  26  */
  27
  28 #include "config.h"
  29 #include <stdlib.h>
  30 #include <stdio.h>
  31 #include <string.h>
  32 #if HAVE_UNISTD_H
  33 #include <unistd.h>
  34 #endif
  35 #include <math.h>
  36
  37 #include "libavutil/cpu.h"
  38 #include "libavutil/common.h"
  39 #include "libavutil/lfg.h"
  40 #include "libavutil/time.h"
  41
  42 #include "simple_idct.h"
  43 #include "aandcttab.h"
  44 #include "faandct.h"
  45 #include "faanidct.h"
  46 #include "x86/idct_xvid.h"
  47 #include "dctref.h"
  48
  49 #undef printf
  50
  51 void ff_mmx_idct(DCTELEM *data);
  52 void ff_mmxext_idct(DCTELEM *data);
  53
  54 // BFIN
  55 void ff_bfin_idct(DCTELEM *block);
  56 void ff_bfin_fdct(DCTELEM *block);
  57
  58 // ALTIVEC
  59 void ff_fdct_altivec(DCTELEM *block);
  60
  61 // ARM
  62 void ff_j_rev_dct_arm(DCTELEM *data);
  63 void ff_simple_idct_arm(DCTELEM *data);
  64 void ff_simple_idct_armv5te(DCTELEM *data);
  65 void ff_simple_idct_armv6(DCTELEM *data);
  66 void ff_simple_idct_neon(DCTELEM *data);
  67
  68 void ff_simple_idct_axp(DCTELEM *data);
  69
  70 struct algo {
  71     const char *name;
  72     void (*func)(DCTELEM *block);
  73     enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
  74                      SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
  75     int mm_support;
  76     int nonspec;
  77 };
  78
  79 static int cpu_flags;
  80
  81 static const struct algo fdct_tab[] = {
  82     { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
  83     { "FAAN",           ff_faandct,            NO_PERM    },
  84     { "IJG-AAN-INT",    ff_fdct_ifast,         SCALE_PERM },
  85     { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },
  86
  87 #if HAVE_MMX_INLINE
  88     { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
  89     { "MMXEXT",         ff_fdct_mmx2,          NO_PERM,   AV_CPU_FLAG_MMXEXT  },
  90     { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
  91 #endif
  92
  93 #if HAVE_ALTIVEC
  94     { "altivecfdct",    ff_fdct_altivec,       NO_PERM,   AV_CPU_FLAG_ALTIVEC },
  95 #endif
  96
  97 #if ARCH_BFIN
  98     { "BFINfdct",       ff_bfin_fdct,          NO_PERM  },
  99 #endif
 100
 101     { 0 }
 102 };
 103
 104 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
 105 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
 106                                 DCTELEM *block, int16_t *qmat);
 107
 108 static void ff_prores_idct_put_10_sse2_wrap(DCTELEM *dst){
 109     DECLARE_ALIGNED(16, static int16_t, qmat)[64];
 110     DECLARE_ALIGNED(16, static int16_t, tmp)[64];
 111     int i;
 112
 113     for(i=0; i<64; i++){
 114         qmat[i]=4;
 115         tmp[i]= dst[i];
 116     }
 117     ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
 118 }
 119 #endif
 120
 121 static const struct algo idct_tab[] = {
 122     { "FAANI",          ff_faanidct,           NO_PERM  },
 123     { "REF-DBL",        ff_ref_idct,           NO_PERM  },
 124     { "INT",            ff_j_rev_dct,          MMX_PERM },
 125     { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },
 126
 127 #if HAVE_MMX_INLINE
 128 #if CONFIG_GPL
 129     { "LIBMPEG2-MMX",   ff_mmx_idct,           MMX_PERM,  AV_CPU_FLAG_MMX,  1 },
 130     { "LIBMPEG2-MMX2",  ff_mmxext_idct,        MMX_PERM,  AV_CPU_FLAG_MMX2, 1 },
 131 #endif
 132     { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
 133     { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
 134     { "XVID-MMXEXT",    ff_idct_xvid_mmx2,     NO_PERM,   AV_CPU_FLAG_MMXEXT, 1 },
 135     { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
 136 #if ARCH_X86_64 && HAVE_YASM
 137     { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap,     TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
 138 #endif
 139 #endif
 140
 141 #if ARCH_BFIN
 142     { "BFINidct",       ff_bfin_idct,          NO_PERM  },
 143 #endif
 144
 145 #if ARCH_ARM
 146     { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
 147     { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
 148 #endif
 149 #if HAVE_ARMV5TE
 150     { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM  },
 151 #endif
 152 #if HAVE_ARMV6
 153     { "SIMPLE-ARMV6",   ff_simple_idct_armv6,  MMX_PERM },
 154 #endif
 155 #if HAVE_NEON
 156     { "SIMPLE-NEON",    ff_simple_idct_neon,   PARTTRANS_PERM },
 157 #endif
 158
 159 #if ARCH_ALPHA
 160     { "SIMPLE-ALPHA",   ff_simple_idct_axp,    NO_PERM },
 161 #endif
 162
 163     { 0 }
 164 };
 165
 166 #define AANSCALE_BITS 12
 167
 168 #define NB_ITS 20000
 169 #define NB_ITS_SPEED 50000
 170
 171 static short idct_mmx_perm[64];
 172
 173 static short idct_simple_mmx_perm[64] = {
 174     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 175     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 176     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 177     0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 178     0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 179     0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 180     0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 181     0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 182 };
 183
 184 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
 185
 186 static void idct_mmx_init(void)
 187 {
 188     int i;
 189
 190     /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
 191     for (i = 0; i < 64; i++) {
 192         idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 193     }
 194 }
 195
 196 DECLARE_ALIGNED(16, static DCTELEM, block)[64];  /* buffer the transforms under test are run on */
 197 DECLARE_ALIGNED(8,  static DCTELEM, block1)[64]; /* generated input; in dct_error() also receives the reference output */
 198
 199 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
 200 {
 201     int i, j;
 202
 203     memset(block, 0, 64 * sizeof(*block));
 204
 205     switch (test) {
 206     case 0:
 207         for (i = 0; i < 64; i++)
 208             block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
 209         if (is_idct) {
 210             ff_ref_fdct(block);
 211             for (i = 0; i < 64; i++)
 212                 block[i] >>= 3;
 213         }
 214         break;
 215     case 1:
 216         j = av_lfg_get(prng) % 10 + 1;
 217         for (i = 0; i < j; i++) {
 218             int idx = av_lfg_get(prng) % 64;
 219             block[idx] = av_lfg_get(prng) % (2*vals) -vals;
 220         }
 221         break;
 222     case 2:
 223         block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
 224         block[63] = (block[0] & 1) ^ 1;
 225         break;
 226     }
 227 }
 228
 229 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
 230 {
 231     int i;
 232
 233     if (perm == MMX_PERM) {
 234         for (i = 0; i < 64; i++)
 235             dst[idct_mmx_perm[i]] = src[i];
 236     } else if (perm == MMX_SIMPLE_PERM) {
 237         for (i = 0; i < 64; i++)
 238             dst[idct_simple_mmx_perm[i]] = src[i];
 239     } else if (perm == SSE2_PERM) {
 240         for (i = 0; i < 64; i++)
 241             dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
 242     } else if (perm == PARTTRANS_PERM) {
 243         for (i = 0; i < 64; i++)
 244             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
 245     } else if (perm == TRANSPOSE_PERM) {
 246         for (i = 0; i < 64; i++)
 247             dst[(i>>3) | ((i<<3)&0x38)] = src[i];
 248     } else {
 249         for (i = 0; i < 64; i++)
 250             dst[i] = src[i];
 251     }
 252 }
 253
 254 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
 255 {
 256     void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
 257     int it, i, scale;
 258     int err_inf, v;
 259     int64_t err2, ti, ti1, it1, err_sum = 0;
 260     int64_t sysErr[64], sysErrMax = 0;
 261     int maxout = 0;
 262     int blockSumErrMax = 0, blockSumErr;
 263     AVLFG prng;
 264     const int vals=1<<bits;
 265     double omse, ome;
 266     int spec_err;
 267
 268     av_lfg_init(&prng, 1);
 269
 270     err_inf = 0;
 271     err2 = 0;
 272     for (i = 0; i < 64; i++)
 273         sysErr[i] = 0;
 274     for (it = 0; it < NB_ITS; it++) {
 275         init_block(block1, test, is_idct, &prng, vals);
 276         permute(block, block1, dct->format);
 277
 278         dct->func(block);
 279         emms_c();
 280
 281         if (dct->format == SCALE_PERM) {
 282             for (i = 0; i < 64; i++) {
 283                 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
 284                 block[i] = (block[i] * scale) >> AANSCALE_BITS;
 285             }
 286         }
 287
 288         ref(block1);
 289
 290         blockSumErr = 0;
 291         for (i = 0; i < 64; i++) {
 292             int err = block[i] - block1[i];
 293             err_sum += err;
 294             v = abs(err);
 295             if (v > err_inf)
 296                 err_inf = v;
 297             err2 += v * v;
 298             sysErr[i] += block[i] - block1[i];
 299             blockSumErr += v;
 300             if (abs(block[i]) > maxout)
 301                 maxout = abs(block[i]);
 302         }
 303         if (blockSumErrMax < blockSumErr)
 304             blockSumErrMax = blockSumErr;
 305     }
 306     for (i = 0; i < 64; i++)
 307         sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
 308
 309     for (i = 0; i < 64; i++) {
 310         if (i % 8 == 0)
 311             printf("\n");
 312         printf("%7d ", (int) sysErr[i]);
 313     }
 314     printf("\n");
 315
 316     omse = (double) err2 / NB_ITS / 64;
 317     ome  = (double) err_sum / NB_ITS / 64;
 318
 319     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
 320
 321     printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
 322            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
 323            omse, ome, (double) sysErrMax / NB_ITS,
 324            maxout, blockSumErrMax);
 325
 326     if (spec_err && !dct->nonspec)
 327         return 1;
 328
 329     if (!speed)
 330         return 0;
 331
 332     /* speed test */
 333
 334     init_block(block, test, is_idct, &prng, vals);
 335     permute(block1, block, dct->format);
 336
 337     ti = av_gettime();
 338     it1 = 0;
 339     do {
 340         for (it = 0; it < NB_ITS_SPEED; it++) {
 341             memcpy(block, block1, sizeof(block));
 342             dct->func(block);
 343         }
 344         emms_c();
 345         it1 += NB_ITS_SPEED;
 346         ti1 = av_gettime() - ti;
 347     } while (ti1 < 1000000);
 348
 349     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
 350            (double) it1 * 1000.0 / (double) ti1);
 351
 352     return 0;
 353 }
 354
 355 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];  /* 8x8 output of the idct248_put function under test */
 356 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64]; /* 8x8 output of idct248_ref() */
 357
 358 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
 359 {
 360     static int init;
 361     static double c8[8][8];
 362     static double c4[4][4];
 363     double block1[64], block2[64], block3[64];
 364     double s, sum, v;
 365     int i, j, k;
 366
 367     if (!init) {
 368         init = 1;
 369
 370         for (i = 0; i < 8; i++) {
 371             sum = 0;
 372             for (j = 0; j < 8; j++) {
 373                 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
 374                 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
 375                 sum += c8[i][j] * c8[i][j];
 376             }
 377         }
 378
 379         for (i = 0; i < 4; i++) {
 380             sum = 0;
 381             for (j = 0; j < 4; j++) {
 382                 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
 383                 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
 384                 sum += c4[i][j] * c4[i][j];
 385             }
 386         }
 387     }
 388
 389     /* butterfly */
 390     s = 0.5 * sqrt(2.0);
 391     for (i = 0; i < 4; i++) {
 392         for (j = 0; j < 8; j++) {
 393             block1[8 * (2 * i) + j] =
 394                 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
 395             block1[8 * (2 * i + 1) + j] =
 396                 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
 397         }
 398     }
 399
 400     /* idct8 on lines */
 401     for (i = 0; i < 8; i++) {
 402         for (j = 0; j < 8; j++) {
 403             sum = 0;
 404             for (k = 0; k < 8; k++)
 405                 sum += c8[k][j] * block1[8 * i + k];
 406             block2[8 * i + j] = sum;
 407         }
 408     }
 409
 410     /* idct4 */
 411     for (i = 0; i < 8; i++) {
 412         for (j = 0; j < 4; j++) {
 413             /* top */
 414             sum = 0;
 415             for (k = 0; k < 4; k++)
 416                 sum += c4[k][j] * block2[8 * (2 * k) + i];
 417             block3[8 * (2 * j) + i] = sum;
 418
 419             /* bottom */
 420             sum = 0;
 421             for (k = 0; k < 4; k++)
 422                 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
 423             block3[8 * (2 * j + 1) + i] = sum;
 424         }
 425     }
 426
 427     /* clamp and store the result */
 428     for (i = 0; i < 8; i++) {
 429         for (j = 0; j < 8; j++) {
 430             v = block3[8 * i + j];
 431             if      (v < 0)   v = 0;
 432             else if (v > 255) v = 255;
 433             dest[i * linesize + j] = (int) rint(v);
 434         }
 435     }
 436 }
 437
 438 static void idct248_error(const char *name,
 439                           void (*idct248_put)(uint8_t *dest, int line_size,
 440                                               int16_t *block),
 441                           int speed)
 442 {
 443     int it, i, it1, ti, ti1, err_max, v;
 444     AVLFG prng;
 445
 446     av_lfg_init(&prng, 1);
 447
 448     /* just one test to see if code is correct (precision is less
 449        important here) */
 450     err_max = 0;
 451     for (it = 0; it < NB_ITS; it++) {
 452         /* XXX: use forward transform to generate values */
 453         for (i = 0; i < 64; i++)
 454             block1[i] = av_lfg_get(&prng) % 256 - 128;
 455         block1[0] += 1024;
 456
 457         for (i = 0; i < 64; i++)
 458             block[i] = block1[i];
 459         idct248_ref(img_dest1, 8, block);
 460
 461         for (i = 0; i < 64; i++)
 462             block[i] = block1[i];
 463         idct248_put(img_dest, 8, block);
 464
 465         for (i = 0; i < 64; i++) {
 466             v = abs((int) img_dest[i] - (int) img_dest1[i]);
 467             if (v == 255)
 468                 printf("%d %d\n", img_dest[i], img_dest1[i]);
 469             if (v > err_max)
 470                 err_max = v;
 471         }
 472 #if 0
 473         printf("ref=\n");
 474         for(i=0;i<8;i++) {
 475             int j;
 476             for(j=0;j<8;j++) {
 477                 printf(" %3d", img_dest1[i*8+j]);
 478             }
 479             printf("\n");
 480         }
 481
 482         printf("out=\n");
 483         for(i=0;i<8;i++) {
 484             int j;
 485             for(j=0;j<8;j++) {
 486                 printf(" %3d", img_dest[i*8+j]);
 487             }
 488             printf("\n");
 489         }
 490 #endif
 491     }
 492     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 493
 494     if (!speed)
 495         return;
 496
 497     ti = av_gettime();
 498     it1 = 0;
 499     do {
 500         for (it = 0; it < NB_ITS_SPEED; it++) {
 501             for (i = 0; i < 64; i++)
 502                 block[i] = block1[i];
 503             idct248_put(img_dest, 8, block);
 504         }
 505         emms_c();
 506         it1 += NB_ITS_SPEED;
 507         ti1 = av_gettime() - ti;
 508     } while (ti1 < 1000000);
 509
 510     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
 511            (double) it1 * 1000.0 / (double) ti1);
 512 }
 513
 514 static void help(void)
 515 {
 516     printf("dct-test [-i] [<test-number>] [<bits>]\n"
 517            "test-number 0 -> test with random matrixes\n"
 518            "            1 -> test with random sparse matrixes\n"
 519            "            2 -> do 3. test from mpeg4 std\n"
 520            "bits        Number of time domain bits to use, 8 is default\n"
 521            "-i          test IDCT implementations\n"
 522            "-4          test IDCT248 implementations\n"
 523            "-t          speed test\n");
 524 }
 525
 526 #if !HAVE_GETOPT
 527 #include "compat/getopt.c"
 528 #endif
 529
 530 int main(int argc, char **argv)
 531 {
 532     int test_idct = 0, test_248_dct = 0;
 533     int c, i;
 534     int test = 1;
 535     int speed = 0;
 536     int err = 0;
 537     int bits=8;
 538
 539     cpu_flags = av_get_cpu_flags();
 540
 541     ff_ref_dct_init();
 542     idct_mmx_init();
 543
 544     for (;;) {
 545         c = getopt(argc, argv, "ih4t");
 546         if (c == -1)
 547             break;
 548         switch (c) {
 549         case 'i':
 550             test_idct = 1;
 551             break;
 552         case '4':
 553             test_248_dct = 1;
 554             break;
 555         case 't':
 556             speed = 1;
 557             break;
 558         default:
 559         case 'h':
 560             help();
 561             return 0;
 562         }
 563     }
 564
 565     if (optind < argc)
 566         test = atoi(argv[optind]);
 567     if(optind+1 < argc) bits= atoi(argv[optind+1]);
 568
 569     printf("ffmpeg DCT/IDCT test\n");
 570
 571     if (test_248_dct) {
 572         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
 573     } else {
 574         const struct algo *algos = test_idct ? idct_tab : fdct_tab;
 575         for (i = 0; algos[i].name; i++)
 576             if (!(~cpu_flags & algos[i].mm_support)) {
 577                 err |= dct_error(&algos[i], test, test_idct, speed, bits);
 578             }
 579     }
 580
 581     return err;
 582 }