git.sesse.net Git - ffmpeg/blob - libavcodec/dct-test.c

   1 /*
   2  * (c) 2001 Fabrice Bellard
   3  *     2007 Marc Hoffman <marc.hoffman@analog.com>
   4  *
   5  * This file is part of Libav.
   6  *
   7  * Libav is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * Libav is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with Libav; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * DCT test (c) 2001 Fabrice Bellard
  25  * Started from sample code by Juan J. Sierralta P.
  26  */
  27
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30 #include <string.h>
  31 #include <sys/time.h>
  32 #include <unistd.h>
  33 #include <math.h>
  34
  35 #include "libavutil/cpu.h"
  36 #include "libavutil/common.h"
  37 #include "libavutil/lfg.h"
  38
  39 #include "simple_idct.h"
  40 #include "aandcttab.h"
  41 #include "faandct.h"
  42 #include "faanidct.h"
  43 #include "x86/idct_xvid.h"
  44 #include "dctref.h"
  45
  46 #undef printf
  47
  48 void ff_mmx_idct(DCTELEM *data);
  49 void ff_mmxext_idct(DCTELEM *data);
  50
  51 void odivx_idct_c(short *block);
  52
  53 // BFIN
  54 void ff_bfin_idct(DCTELEM *block);
  55 void ff_bfin_fdct(DCTELEM *block);
  56
  57 // ALTIVEC
  58 void fdct_altivec(DCTELEM *block);
  59 //void idct_altivec(DCTELEM *block);?? no routine
  60
  61 // ARM
  62 void ff_j_rev_dct_arm(DCTELEM *data);
  63 void ff_simple_idct_arm(DCTELEM *data);
  64 void ff_simple_idct_armv5te(DCTELEM *data);
  65 void ff_simple_idct_armv6(DCTELEM *data);
  66 void ff_simple_idct_neon(DCTELEM *data);
  67
  68 void ff_simple_idct_axp(DCTELEM *data);
  69
  70 struct algo {
  71     const char *name;
  72     void (*func)(DCTELEM *block);
  73     enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
  74                      SSE2_PERM, PARTTRANS_PERM } format;
  75     int mm_support;
  76     int nonspec;
  77 };
  78
  79 #ifndef FAAN_POSTSCALE
  80 #define FAAN_SCALE SCALE_PERM
  81 #else
  82 #define FAAN_SCALE NO_PERM
  83 #endif
  84
  85 static int cpu_flags;
  86
  87 static const struct algo fdct_tab[] = {
  88     { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
  89     { "FAAN",           ff_faandct,            FAAN_SCALE },
  90     { "IJG-AAN-INT",    fdct_ifast,            SCALE_PERM },
  91     { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },
  92
  93 #if HAVE_MMX
  94     { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
  95     { "MMX2",           ff_fdct_mmx2,          NO_PERM,   AV_CPU_FLAG_MMX2    },
  96     { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
  97 #endif
  98
  99 #if HAVE_ALTIVEC
 100     { "altivecfdct",    fdct_altivec,          NO_PERM,   AV_CPU_FLAG_ALTIVEC },
 101 #endif
 102
 103 #if ARCH_BFIN
 104     { "BFINfdct",       ff_bfin_fdct,          NO_PERM  },
 105 #endif
 106
 107     { 0 }
 108 };
 109
 110 static const struct algo idct_tab[] = {
 111     { "FAANI",          ff_faanidct,           NO_PERM  },
 112     { "REF-DBL",        ff_ref_idct,           NO_PERM  },
 113     { "INT",            j_rev_dct,             MMX_PERM },
 114     { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },
 115
 116 #if HAVE_MMX
 117 #if CONFIG_GPL
 118     { "LIBMPEG2-MMX",   ff_mmx_idct,           MMX_PERM,  AV_CPU_FLAG_MMX,  1 },
 119     { "LIBMPEG2-MMX2",  ff_mmxext_idct,        MMX_PERM,  AV_CPU_FLAG_MMX2, 1 },
 120 #endif
 121     { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
 122     { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
 123     { "XVID-MMX2",      ff_idct_xvid_mmx2,     NO_PERM,   AV_CPU_FLAG_MMX2, 1 },
 124     { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
 125 #endif
 126
 127 #if ARCH_BFIN
 128     { "BFINidct",       ff_bfin_idct,          NO_PERM  },
 129 #endif
 130
 131 #if ARCH_ARM
 132     { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
 133     { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
 134 #endif
 135 #if HAVE_ARMV5TE
 136     { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM  },
 137 #endif
 138 #if HAVE_ARMV6
 139     { "SIMPLE-ARMV6",   ff_simple_idct_armv6,  MMX_PERM },
 140 #endif
 141 #if HAVE_NEON
 142     { "SIMPLE-NEON",    ff_simple_idct_neon,   PARTTRANS_PERM },
 143 #endif
 144
 145 #if ARCH_ALPHA
 146     { "SIMPLE-ALPHA",   ff_simple_idct_axp,    NO_PERM },
 147 #endif
 148
 149     { 0 }
 150 };
 151
 152 #define AANSCALE_BITS 12
 153
 154 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
 155
 156 static int64_t gettime(void)
 157 {
 158     struct timeval tv;
 159     gettimeofday(&tv, NULL);
 160     return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
 161 }
 162
 163 #define NB_ITS 20000
 164 #define NB_ITS_SPEED 50000
 165
 166 static short idct_mmx_perm[64];
 167
 168 static short idct_simple_mmx_perm[64] = {
 169     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 170     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 171     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 172     0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 173     0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 174     0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 175     0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 176     0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 177 };
 178
 179 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
 180
 181 static void idct_mmx_init(void)
 182 {
 183     int i;
 184
 185     /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
 186     for (i = 0; i < 64; i++) {
 187         idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 188     }
 189 }
 190
 191 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
 192 DECLARE_ALIGNED(8,  static DCTELEM, block1)[64];
 193
 194 static inline void mmx_emms(void)
 195 {
 196 #if HAVE_MMX
 197     if (cpu_flags & AV_CPU_FLAG_MMX)
 198         __asm__ volatile ("emms\n\t");
 199 #endif
 200 }
 201
 202 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
 203 {
 204     int i, j;
 205
 206     memset(block, 0, 64 * sizeof(*block));
 207
 208     switch (test) {
 209     case 0:
 210         for (i = 0; i < 64; i++)
 211             block[i] = (av_lfg_get(prng) % 512) - 256;
 212         if (is_idct) {
 213             ff_ref_fdct(block);
 214             for (i = 0; i < 64; i++)
 215                 block[i] >>= 3;
 216         }
 217         break;
 218     case 1:
 219         j = av_lfg_get(prng) % 10 + 1;
 220         for (i = 0; i < j; i++)
 221             block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
 222         break;
 223     case 2:
 224         block[ 0] = av_lfg_get(prng) % 4096 - 2048;
 225         block[63] = (block[0] & 1) ^ 1;
 226         break;
 227     }
 228 }
 229
 230 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
 231 {
 232     int i;
 233
 234     if (perm == MMX_PERM) {
 235         for (i = 0; i < 64; i++)
 236             dst[idct_mmx_perm[i]] = src[i];
 237     } else if (perm == MMX_SIMPLE_PERM) {
 238         for (i = 0; i < 64; i++)
 239             dst[idct_simple_mmx_perm[i]] = src[i];
 240     } else if (perm == SSE2_PERM) {
 241         for (i = 0; i < 64; i++)
 242             dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
 243     } else if (perm == PARTTRANS_PERM) {
 244         for (i = 0; i < 64; i++)
 245             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
 246     } else {
 247         for (i = 0; i < 64; i++)
 248             dst[i] = src[i];
 249     }
 250 }
 251
 252 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
 253 {
 254     void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
 255     int it, i, scale;
 256     int err_inf, v;
 257     int64_t err2, ti, ti1, it1, err_sum = 0;
 258     int64_t sysErr[64], sysErrMax = 0;
 259     int maxout = 0;
 260     int blockSumErrMax = 0, blockSumErr;
 261     AVLFG prng;
 262     double omse, ome;
 263     int spec_err;
 264
 265     av_lfg_init(&prng, 1);
 266
 267     err_inf = 0;
 268     err2 = 0;
 269     for (i = 0; i < 64; i++)
 270         sysErr[i] = 0;
 271     for (it = 0; it < NB_ITS; it++) {
 272         init_block(block1, test, is_idct, &prng);
 273         permute(block, block1, dct->format);
 274
 275         dct->func(block);
 276         mmx_emms();
 277
 278         if (dct->format == SCALE_PERM) {
 279             for (i = 0; i < 64; i++) {
 280                 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
 281                 block[i] = (block[i] * scale) >> AANSCALE_BITS;
 282             }
 283         }
 284
 285         ref(block1);
 286
 287         blockSumErr = 0;
 288         for (i = 0; i < 64; i++) {
 289             int err = block[i] - block1[i];
 290             err_sum += err;
 291             v = abs(err);
 292             if (v > err_inf)
 293                 err_inf = v;
 294             err2 += v * v;
 295             sysErr[i] += block[i] - block1[i];
 296             blockSumErr += v;
 297             if (abs(block[i]) > maxout)
 298                 maxout = abs(block[i]);
 299         }
 300         if (blockSumErrMax < blockSumErr)
 301             blockSumErrMax = blockSumErr;
 302     }
 303     for (i = 0; i < 64; i++)
 304         sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
 305
 306     for (i = 0; i < 64; i++) {
 307         if (i % 8 == 0)
 308             printf("\n");
 309         printf("%7d ", (int) sysErr[i]);
 310     }
 311     printf("\n");
 312
 313     omse = (double) err2 / NB_ITS / 64;
 314     ome  = (double) err_sum / NB_ITS / 64;
 315
 316     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
 317
 318     printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
 319            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
 320            omse, ome, (double) sysErrMax / NB_ITS,
 321            maxout, blockSumErrMax);
 322
 323     if (spec_err && !dct->nonspec)
 324         return 1;
 325
 326     if (!speed)
 327         return 0;
 328
 329     /* speed test */
 330     init_block(block, test, is_idct, &prng);
 331     permute(block1, block, dct->format);
 332
 333     ti = gettime();
 334     it1 = 0;
 335     do {
 336         for (it = 0; it < NB_ITS_SPEED; it++) {
 337             memcpy(block, block1, sizeof(block));
 338             dct->func(block);
 339         }
 340         it1 += NB_ITS_SPEED;
 341         ti1 = gettime() - ti;
 342     } while (ti1 < 1000000);
 343     mmx_emms();
 344
 345     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
 346            (double) it1 * 1000.0 / (double) ti1);
 347
 348     return 0;
 349 }
 350
 351 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
 352 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
 353
 354 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
 355 {
 356     static int init;
 357     static double c8[8][8];
 358     static double c4[4][4];
 359     double block1[64], block2[64], block3[64];
 360     double s, sum, v;
 361     int i, j, k;
 362
 363     if (!init) {
 364         init = 1;
 365
 366         for (i = 0; i < 8; i++) {
 367             sum = 0;
 368             for (j = 0; j < 8; j++) {
 369                 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
 370                 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
 371                 sum += c8[i][j] * c8[i][j];
 372             }
 373         }
 374
 375         for (i = 0; i < 4; i++) {
 376             sum = 0;
 377             for (j = 0; j < 4; j++) {
 378                 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
 379                 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
 380                 sum += c4[i][j] * c4[i][j];
 381             }
 382         }
 383     }
 384
 385     /* butterfly */
 386     s = 0.5 * sqrt(2.0);
 387     for (i = 0; i < 4; i++) {
 388         for (j = 0; j < 8; j++) {
 389             block1[8 * (2 * i) + j] =
 390                 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
 391             block1[8 * (2 * i + 1) + j] =
 392                 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
 393         }
 394     }
 395
 396     /* idct8 on lines */
 397     for (i = 0; i < 8; i++) {
 398         for (j = 0; j < 8; j++) {
 399             sum = 0;
 400             for (k = 0; k < 8; k++)
 401                 sum += c8[k][j] * block1[8 * i + k];
 402             block2[8 * i + j] = sum;
 403         }
 404     }
 405
 406     /* idct4 */
 407     for (i = 0; i < 8; i++) {
 408         for (j = 0; j < 4; j++) {
 409             /* top */
 410             sum = 0;
 411             for (k = 0; k < 4; k++)
 412                 sum += c4[k][j] * block2[8 * (2 * k) + i];
 413             block3[8 * (2 * j) + i] = sum;
 414
 415             /* bottom */
 416             sum = 0;
 417             for (k = 0; k < 4; k++)
 418                 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
 419             block3[8 * (2 * j + 1) + i] = sum;
 420         }
 421     }
 422
 423     /* clamp and store the result */
 424     for (i = 0; i < 8; i++) {
 425         for (j = 0; j < 8; j++) {
 426             v = block3[8 * i + j];
 427             if      (v < 0)   v = 0;
 428             else if (v > 255) v = 255;
 429             dest[i * linesize + j] = (int) rint(v);
 430         }
 431     }
 432 }
 433
 434 static void idct248_error(const char *name,
 435                           void (*idct248_put)(uint8_t *dest, int line_size,
 436                                               int16_t *block),
 437                           int speed)
 438 {
 439     int it, i, it1, ti, ti1, err_max, v;
 440     AVLFG prng;
 441
 442     av_lfg_init(&prng, 1);
 443
 444     /* just one test to see if code is correct (precision is less
 445        important here) */
 446     err_max = 0;
 447     for (it = 0; it < NB_ITS; it++) {
 448         /* XXX: use forward transform to generate values */
 449         for (i = 0; i < 64; i++)
 450             block1[i] = av_lfg_get(&prng) % 256 - 128;
 451         block1[0] += 1024;
 452
 453         for (i = 0; i < 64; i++)
 454             block[i] = block1[i];
 455         idct248_ref(img_dest1, 8, block);
 456
 457         for (i = 0; i < 64; i++)
 458             block[i] = block1[i];
 459         idct248_put(img_dest, 8, block);
 460
 461         for (i = 0; i < 64; i++) {
 462             v = abs((int) img_dest[i] - (int) img_dest1[i]);
 463             if (v == 255)
 464                 printf("%d %d\n", img_dest[i], img_dest1[i]);
 465             if (v > err_max)
 466                 err_max = v;
 467         }
 468     }
 469     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 470
 471     if (!speed)
 472         return;
 473
 474     ti = gettime();
 475     it1 = 0;
 476     do {
 477         for (it = 0; it < NB_ITS_SPEED; it++) {
 478             for (i = 0; i < 64; i++)
 479                 block[i] = block1[i];
 480             idct248_put(img_dest, 8, block);
 481         }
 482         it1 += NB_ITS_SPEED;
 483         ti1 = gettime() - ti;
 484     } while (ti1 < 1000000);
 485     mmx_emms();
 486
 487     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
 488            (double) it1 * 1000.0 / (double) ti1);
 489 }
 490
 491 static void help(void)
 492 {
 493     printf("dct-test [-i] [<test-number>]\n"
 494            "test-number 0 -> test with random matrixes\n"
 495            "            1 -> test with random sparse matrixes\n"
 496            "            2 -> do 3. test from mpeg4 std\n"
 497            "-i          test IDCT implementations\n"
 498            "-4          test IDCT248 implementations\n"
 499            "-t          speed test\n");
 500 }
 501
 502 int main(int argc, char **argv)
 503 {
 504     int test_idct = 0, test_248_dct = 0;
 505     int c, i;
 506     int test = 1;
 507     int speed = 0;
 508     int err = 0;
 509
 510     cpu_flags = av_get_cpu_flags();
 511
 512     ff_ref_dct_init();
 513     idct_mmx_init();
 514
 515     for (i = 0; i < 256; i++)
 516         cropTbl[i + MAX_NEG_CROP] = i;
 517     for (i = 0; i < MAX_NEG_CROP; i++) {
 518         cropTbl[i] = 0;
 519         cropTbl[i + MAX_NEG_CROP + 256] = 255;
 520     }
 521
 522     for (;;) {
 523         c = getopt(argc, argv, "ih4t");
 524         if (c == -1)
 525             break;
 526         switch (c) {
 527         case 'i':
 528             test_idct = 1;
 529             break;
 530         case '4':
 531             test_248_dct = 1;
 532             break;
 533         case 't':
 534             speed = 1;
 535             break;
 536         default:
 537         case 'h':
 538             help();
 539             return 0;
 540         }
 541     }
 542
 543     if (optind < argc)
 544         test = atoi(argv[optind]);
 545
 546     printf("Libav DCT/IDCT test\n");
 547
 548     if (test_248_dct) {
 549         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
 550     } else {
 551         const struct algo *algos = test_idct ? idct_tab : fdct_tab;
 552         for (i = 0; algos[i].name; i++)
 553             if (!(~cpu_flags & algos[i].mm_support)) {
 554                 err |= dct_error(&algos[i], test, test_idct, speed);
 555             }
 556     }
 557
 558     return err;
 559 }