git.sesse.net Git - ffmpeg/blob - libavcodec/dct-test.c

   1 /*
   2  * (c) 2001 Fabrice Bellard
   3  *     2007 Marc Hoffman <marc.hoffman@analog.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * DCT test (c) 2001 Fabrice Bellard
  25  * Started from sample code by Juan J. Sierralta P.
  26  */
  27
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30 #include <string.h>
  31 #include <sys/time.h>
  32 #include <unistd.h>
  33 #include <math.h>
  34
  35 #include "libavutil/cpu.h"
  36 #include "libavutil/common.h"
  37 #include "libavutil/lfg.h"
  38
  39 #include "simple_idct.h"
  40 #include "aandcttab.h"
  41 #include "faandct.h"
  42 #include "faanidct.h"
  43 #include "x86/idct_xvid.h"
  44 #include "dctref.h"
  45
  46 #undef printf
  47
  48 void ff_mmx_idct(DCTELEM *data);
  49 void ff_mmxext_idct(DCTELEM *data);
  50
  51 void odivx_idct_c(short *block);
  52
  53 // BFIN
  54 void ff_bfin_idct(DCTELEM *block);
  55 void ff_bfin_fdct(DCTELEM *block);
  56
  57 // ALTIVEC
  58 void fdct_altivec(DCTELEM *block);
  59 //void idct_altivec(DCTELEM *block);?? no routine
  60
  61 // ARM
  62 void ff_j_rev_dct_arm(DCTELEM *data);
  63 void ff_simple_idct_arm(DCTELEM *data);
  64 void ff_simple_idct_armv5te(DCTELEM *data);
  65 void ff_simple_idct_armv6(DCTELEM *data);
  66 void ff_simple_idct_neon(DCTELEM *data);
  67
  68 void ff_simple_idct_axp(DCTELEM *data);
  69
  70 struct algo {
  71     const char *name;
  72     void (*func)(DCTELEM *block);
  73     enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
  74                      SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
  75     int mm_support;
  76     int nonspec;
  77 };
  78
  79 #ifndef FAAN_POSTSCALE
  80 #define FAAN_SCALE SCALE_PERM
  81 #else
  82 #define FAAN_SCALE NO_PERM
  83 #endif
  84
  85 static int cpu_flags;
  86
  87 static const struct algo fdct_tab[] = {
  88     { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
  89     { "FAAN",           ff_faandct,            FAAN_SCALE },
  90     { "IJG-AAN-INT",    fdct_ifast,            SCALE_PERM },
  91     { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },
  92
  93 #if HAVE_MMX
  94     { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
  95     { "MMX2",           ff_fdct_mmx2,          NO_PERM,   AV_CPU_FLAG_MMX2    },
  96     { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
  97 #endif
  98
  99 #if HAVE_ALTIVEC
 100     { "altivecfdct",    fdct_altivec,          NO_PERM,   AV_CPU_FLAG_ALTIVEC },
 101 #endif
 102
 103 #if ARCH_BFIN
 104     { "BFINfdct",       ff_bfin_fdct,          NO_PERM  },
 105 #endif
 106
 107     { 0 }
 108 };
 109
 110 #if HAVE_MMX
 111 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
 112                                 DCTELEM *block, int16_t *qmat);
 113
 114 static void ff_prores_idct_put_10_sse2_wrap(uint16_t *dst){
 115     int16_t qmat[64]; int i;
 116     int16_t tmp[64];
 117
 118     for(i=0; i<64; i++){
 119         qmat[i]=4;
 120         tmp[i]= dst[i];
 121     }
 122     ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
 123 }
 124 #endif
 125
 126 static const struct algo idct_tab[] = {
 127     { "FAANI",          ff_faanidct,           NO_PERM  },
 128     { "REF-DBL",        ff_ref_idct,           NO_PERM  },
 129     { "INT",            j_rev_dct,             MMX_PERM },
 130     { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },
 131
 132 #if HAVE_MMX
 133 #if CONFIG_GPL
 134     { "LIBMPEG2-MMX",   ff_mmx_idct,           MMX_PERM,  AV_CPU_FLAG_MMX,  1 },
 135     { "LIBMPEG2-MMX2",  ff_mmxext_idct,        MMX_PERM,  AV_CPU_FLAG_MMX2, 1 },
 136 #endif
 137     { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
 138     { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
 139     { "XVID-MMX2",      ff_idct_xvid_mmx2,     NO_PERM,   AV_CPU_FLAG_MMX2, 1 },
 140     { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
 141 #if ARCH_X86_64
 142     { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap,     TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
 143 #endif
 144 #endif
 145
 146 #if ARCH_BFIN
 147     { "BFINidct",       ff_bfin_idct,          NO_PERM  },
 148 #endif
 149
 150 #if ARCH_ARM
 151     { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
 152     { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
 153 #endif
 154 #if HAVE_ARMV5TE
 155     { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM  },
 156 #endif
 157 #if HAVE_ARMV6
 158     { "SIMPLE-ARMV6",   ff_simple_idct_armv6,  MMX_PERM },
 159 #endif
 160 #if HAVE_NEON
 161     { "SIMPLE-NEON",    ff_simple_idct_neon,   PARTTRANS_PERM },
 162 #endif
 163
 164 #if ARCH_ALPHA
 165     { "SIMPLE-ALPHA",   ff_simple_idct_axp,    NO_PERM },
 166 #endif
 167
 168     { 0 }
 169 };
 170
 171 #define AANSCALE_BITS 12
 172
 173 static uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
 174
 175 static int64_t gettime(void)
 176 {
 177     struct timeval tv;
 178     gettimeofday(&tv, NULL);
 179     return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
 180 }
 181
 182 #define NB_ITS 20000
 183 #define NB_ITS_SPEED 50000
 184
 185 static short idct_mmx_perm[64];
 186
 187 static short idct_simple_mmx_perm[64] = {
 188     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 189     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 190     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 191     0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 192     0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 193     0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 194     0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 195     0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 196 };
 197
 198 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
 199
 200 static void idct_mmx_init(void)
 201 {
 202     int i;
 203
 204     /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
 205     for (i = 0; i < 64; i++) {
 206         idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 207     }
 208 }
 209
 210 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
 211 DECLARE_ALIGNED(8,  static DCTELEM, block1)[64];
 212
 213 static inline void mmx_emms(void)
 214 {
 215 #if HAVE_MMX
 216     if (cpu_flags & AV_CPU_FLAG_MMX)
 217         __asm__ volatile ("emms\n\t");
 218 #endif
 219 }
 220
 221 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
 222 {
 223     int i, j;
 224
 225     memset(block, 0, 64 * sizeof(*block));
 226
 227     switch (test) {
 228     case 0:
 229         for (i = 0; i < 64; i++)
 230             block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
 231         if (is_idct) {
 232             ff_ref_fdct(block);
 233             for (i = 0; i < 64; i++)
 234                 block[i] >>= 3;
 235         }
 236         break;
 237     case 1:
 238         j = av_lfg_get(prng) % 10 + 1;
 239         for (i = 0; i < j; i++)
 240             block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
 241         break;
 242     case 2:
 243         block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
 244         block[63] = (block[0] & 1) ^ 1;
 245         break;
 246     }
 247 }
 248
 249 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
 250 {
 251     int i;
 252
 253     if (perm == MMX_PERM) {
 254         for (i = 0; i < 64; i++)
 255             dst[idct_mmx_perm[i]] = src[i];
 256     } else if (perm == MMX_SIMPLE_PERM) {
 257         for (i = 0; i < 64; i++)
 258             dst[idct_simple_mmx_perm[i]] = src[i];
 259     } else if (perm == SSE2_PERM) {
 260         for (i = 0; i < 64; i++)
 261             dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
 262     } else if (perm == PARTTRANS_PERM) {
 263         for (i = 0; i < 64; i++)
 264             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
 265     } else if (perm == TRANSPOSE_PERM) {
 266         for (i = 0; i < 64; i++)
 267             dst[(i>>3) | ((i<<3)&0x38)] = src[i];
 268     } else {
 269         for (i = 0; i < 64; i++)
 270             dst[i] = src[i];
 271     }
 272 }
 273
 274 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
 275 {
 276     void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
 277     int it, i, scale;
 278     int err_inf, v;
 279     int64_t err2, ti, ti1, it1, err_sum = 0;
 280     int64_t sysErr[64], sysErrMax = 0;
 281     int maxout = 0;
 282     int blockSumErrMax = 0, blockSumErr;
 283     AVLFG prng;
 284     const int vals=1<<bits;
 285     double omse, ome;
 286     int spec_err;
 287
 288     av_lfg_init(&prng, 1);
 289
 290     err_inf = 0;
 291     err2 = 0;
 292     for (i = 0; i < 64; i++)
 293         sysErr[i] = 0;
 294     for (it = 0; it < NB_ITS; it++) {
 295         init_block(block1, test, is_idct, &prng, vals);
 296         permute(block, block1, dct->format);
 297
 298         dct->func(block);
 299         mmx_emms();
 300
 301         if (dct->format == SCALE_PERM) {
 302             for (i = 0; i < 64; i++) {
 303                 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
 304                 block[i] = (block[i] * scale) >> AANSCALE_BITS;
 305             }
 306         }
 307
 308         ref(block1);
 309
 310         blockSumErr = 0;
 311         for (i = 0; i < 64; i++) {
 312             int err = block[i] - block1[i];
 313             err_sum += err;
 314             v = abs(err);
 315             if (v > err_inf)
 316                 err_inf = v;
 317             err2 += v * v;
 318             sysErr[i] += block[i] - block1[i];
 319             blockSumErr += v;
 320             if (abs(block[i]) > maxout)
 321                 maxout = abs(block[i]);
 322         }
 323         if (blockSumErrMax < blockSumErr)
 324             blockSumErrMax = blockSumErr;
 325     }
 326     for (i = 0; i < 64; i++)
 327         sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
 328
 329     for (i = 0; i < 64; i++) {
 330         if (i % 8 == 0)
 331             printf("\n");
 332         printf("%7d ", (int) sysErr[i]);
 333     }
 334     printf("\n");
 335
 336     omse = (double) err2 / NB_ITS / 64;
 337     ome  = (double) err_sum / NB_ITS / 64;
 338
 339     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
 340
 341     printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
 342            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
 343            omse, ome, (double) sysErrMax / NB_ITS,
 344            maxout, blockSumErrMax);
 345
 346     if (spec_err && !dct->nonspec)
 347         return 1;
 348
 349     if (!speed)
 350         return 0;
 351
 352     /* speed test */
 353
 354     init_block(block, test, is_idct, &prng, vals);
 355     permute(block1, block, dct->format);
 356
 357     ti = gettime();
 358     it1 = 0;
 359     do {
 360         for (it = 0; it < NB_ITS_SPEED; it++) {
 361             memcpy(block, block1, sizeof(block));
 362             dct->func(block);
 363         }
 364         it1 += NB_ITS_SPEED;
 365         ti1 = gettime() - ti;
 366     } while (ti1 < 1000000);
 367     mmx_emms();
 368
 369     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
 370            (double) it1 * 1000.0 / (double) ti1);
 371
 372     return 0;
 373 }
 374
 375 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
 376 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
 377
 378 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
 379 {
 380     static int init;
 381     static double c8[8][8];
 382     static double c4[4][4];
 383     double block1[64], block2[64], block3[64];
 384     double s, sum, v;
 385     int i, j, k;
 386
 387     if (!init) {
 388         init = 1;
 389
 390         for (i = 0; i < 8; i++) {
 391             sum = 0;
 392             for (j = 0; j < 8; j++) {
 393                 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
 394                 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
 395                 sum += c8[i][j] * c8[i][j];
 396             }
 397         }
 398
 399         for (i = 0; i < 4; i++) {
 400             sum = 0;
 401             for (j = 0; j < 4; j++) {
 402                 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
 403                 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
 404                 sum += c4[i][j] * c4[i][j];
 405             }
 406         }
 407     }
 408
 409     /* butterfly */
 410     s = 0.5 * sqrt(2.0);
 411     for (i = 0; i < 4; i++) {
 412         for (j = 0; j < 8; j++) {
 413             block1[8 * (2 * i) + j] =
 414                 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
 415             block1[8 * (2 * i + 1) + j] =
 416                 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
 417         }
 418     }
 419
 420     /* idct8 on lines */
 421     for (i = 0; i < 8; i++) {
 422         for (j = 0; j < 8; j++) {
 423             sum = 0;
 424             for (k = 0; k < 8; k++)
 425                 sum += c8[k][j] * block1[8 * i + k];
 426             block2[8 * i + j] = sum;
 427         }
 428     }
 429
 430     /* idct4 */
 431     for (i = 0; i < 8; i++) {
 432         for (j = 0; j < 4; j++) {
 433             /* top */
 434             sum = 0;
 435             for (k = 0; k < 4; k++)
 436                 sum += c4[k][j] * block2[8 * (2 * k) + i];
 437             block3[8 * (2 * j) + i] = sum;
 438
 439             /* bottom */
 440             sum = 0;
 441             for (k = 0; k < 4; k++)
 442                 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
 443             block3[8 * (2 * j + 1) + i] = sum;
 444         }
 445     }
 446
 447     /* clamp and store the result */
 448     for (i = 0; i < 8; i++) {
 449         for (j = 0; j < 8; j++) {
 450             v = block3[8 * i + j];
 451             if      (v < 0)   v = 0;
 452             else if (v > 255) v = 255;
 453             dest[i * linesize + j] = (int) rint(v);
 454         }
 455     }
 456 }
 457
 458 static void idct248_error(const char *name,
 459                           void (*idct248_put)(uint8_t *dest, int line_size,
 460                                               int16_t *block),
 461                           int speed)
 462 {
 463     int it, i, it1, ti, ti1, err_max, v;
 464     AVLFG prng;
 465
 466     av_lfg_init(&prng, 1);
 467
 468     /* just one test to see if code is correct (precision is less
 469        important here) */
 470     err_max = 0;
 471     for (it = 0; it < NB_ITS; it++) {
 472         /* XXX: use forward transform to generate values */
 473         for (i = 0; i < 64; i++)
 474             block1[i] = av_lfg_get(&prng) % 256 - 128;
 475         block1[0] += 1024;
 476
 477         for (i = 0; i < 64; i++)
 478             block[i] = block1[i];
 479         idct248_ref(img_dest1, 8, block);
 480
 481         for (i = 0; i < 64; i++)
 482             block[i] = block1[i];
 483         idct248_put(img_dest, 8, block);
 484
 485         for (i = 0; i < 64; i++) {
 486             v = abs((int) img_dest[i] - (int) img_dest1[i]);
 487             if (v == 255)
 488                 printf("%d %d\n", img_dest[i], img_dest1[i]);
 489             if (v > err_max)
 490                 err_max = v;
 491         }
 492 #if 0
 493         printf("ref=\n");
 494         for(i=0;i<8;i++) {
 495             int j;
 496             for(j=0;j<8;j++) {
 497                 printf(" %3d", img_dest1[i*8+j]);
 498             }
 499             printf("\n");
 500         }
 501
 502         printf("out=\n");
 503         for(i=0;i<8;i++) {
 504             int j;
 505             for(j=0;j<8;j++) {
 506                 printf(" %3d", img_dest[i*8+j]);
 507             }
 508             printf("\n");
 509         }
 510 #endif
 511     }
 512     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 513
 514     if (!speed)
 515         return;
 516
 517     ti = gettime();
 518     it1 = 0;
 519     do {
 520         for (it = 0; it < NB_ITS_SPEED; it++) {
 521             for (i = 0; i < 64; i++)
 522                 block[i] = block1[i];
 523             idct248_put(img_dest, 8, block);
 524         }
 525         it1 += NB_ITS_SPEED;
 526         ti1 = gettime() - ti;
 527     } while (ti1 < 1000000);
 528     mmx_emms();
 529
 530     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
 531            (double) it1 * 1000.0 / (double) ti1);
 532 }
 533
 534 static void help(void)
 535 {
 536     printf("dct-test [-i] [<test-number>] [<bits>]\n"
 537            "test-number 0 -> test with random matrixes\n"
 538            "            1 -> test with random sparse matrixes\n"
 539            "            2 -> do 3. test from mpeg4 std\n"
 540            "bits        Number of time domain bits to use, 8 is default\n"
 541            "-i          test IDCT implementations\n"
 542            "-4          test IDCT248 implementations\n"
 543            "-t          speed test\n");
 544 }
 545
 546 int main(int argc, char **argv)
 547 {
 548     int test_idct = 0, test_248_dct = 0;
 549     int c, i;
 550     int test = 1;
 551     int speed = 0;
 552     int err = 0;
 553     int bits=8;
 554
 555     cpu_flags = av_get_cpu_flags();
 556
 557     ff_ref_dct_init();
 558     idct_mmx_init();
 559
 560     for (i = 0; i < 256; i++)
 561         cropTbl[i + MAX_NEG_CROP] = i;
 562     for (i = 0; i < MAX_NEG_CROP; i++) {
 563         cropTbl[i] = 0;
 564         cropTbl[i + MAX_NEG_CROP + 256] = 255;
 565     }
 566
 567     for (;;) {
 568         c = getopt(argc, argv, "ih4t");
 569         if (c == -1)
 570             break;
 571         switch (c) {
 572         case 'i':
 573             test_idct = 1;
 574             break;
 575         case '4':
 576             test_248_dct = 1;
 577             break;
 578         case 't':
 579             speed = 1;
 580             break;
 581         default:
 582         case 'h':
 583             help();
 584             return 0;
 585         }
 586     }
 587
 588     if (optind < argc)
 589         test = atoi(argv[optind]);
 590     if(optind+1 < argc) bits= atoi(argv[optind+1]);
 591
 592     printf("ffmpeg DCT/IDCT test\n");
 593
 594     if (test_248_dct) {
 595         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
 596     } else {
 597         const struct algo *algos = test_idct ? idct_tab : fdct_tab;
 598         for (i = 0; algos[i].name; i++)
 599             if (!(~cpu_flags & algos[i].mm_support)) {
 600                 err |= dct_error(&algos[i], test, test_idct, speed, bits);
 601             }
 602     }
 603
 604     return err;
 605 }