git.sesse.net Git - ffmpeg/blob - libavcodec/dct-test.c

   1 /*
   2  * (c) 2001 Fabrice Bellard
   3  *     2007 Marc Hoffman <marc.hoffman@analog.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * DCT test (c) 2001 Fabrice Bellard
  25  * Started from sample code by Juan J. Sierralta P.
  26  */
  27
  28 #include "config.h"
  29 #include <stdlib.h>
  30 #include <stdio.h>
  31 #include <string.h>
  32 #if HAVE_UNISTD_H
  33 #include <unistd.h>
  34 #endif
  35 #include <math.h>
  36
  37 #include "libavutil/cpu.h"
  38 #include "libavutil/common.h"
  39 #include "libavutil/lfg.h"
  40 #include "libavutil/time.h"
  41
  42 #include "dct.h"
  43 #include "simple_idct.h"
  44 #include "aandcttab.h"
  45 #include "faandct.h"
  46 #include "faanidct.h"
  47 #include "x86/idct_xvid.h"
  48 #include "dctref.h"
  49
  50 #undef printf
  51
/*
 * Prototypes for architecture-specific transforms. They are declared locally
 * so that the tables below can reference them when the matching ARCH_ or
 * HAVE_ build option is enabled.
 */
// BFIN
void ff_bfin_idct(int16_t *block);
void ff_bfin_fdct(int16_t *block);

// ALTIVEC
void ff_fdct_altivec(int16_t *block);

// ARM
void ff_j_rev_dct_arm(int16_t *data);
void ff_simple_idct_arm(int16_t *data);
void ff_simple_idct_armv5te(int16_t *data);
void ff_simple_idct_armv6(int16_t *data);
void ff_simple_idct_neon(int16_t *data);

// ALPHA (referenced by idct_tab under ARCH_ALPHA)
void ff_simple_idct_axp(int16_t *data);
  67
  68 struct algo {
  69     const char *name;
  70     void (*func)(int16_t *block);
  71     enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
  72                      SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
  73     int mm_support;
  74     int nonspec;
  75 };
  76
  77 static int cpu_flags;
  78
/**
 * Forward DCT implementations under test.
 *
 * Entries inside preprocessor conditionals are only compiled in when the
 * corresponding build option is enabled. Entries that carry an AV_CPU_FLAG_*
 * value are additionally skipped by main() unless cpu_flags contains that
 * flag. The table ends with an all-zero entry (name == NULL), which is what
 * main() uses to stop iterating.
 */
static const struct algo fdct_tab[] = {
    { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
    { "FAAN",           ff_faandct,            NO_PERM    },
    { "IJG-AAN-INT",    ff_fdct_ifast,         SCALE_PERM },
    { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },

#if HAVE_MMX_INLINE
    { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
#endif
#if HAVE_MMXEXT_INLINE
    { "MMXEXT",         ff_fdct_mmxext,        NO_PERM,   AV_CPU_FLAG_MMXEXT  },
#endif
#if HAVE_SSE2_INLINE
    { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
#endif

#if HAVE_ALTIVEC
    { "altivecfdct",    ff_fdct_altivec,       NO_PERM,   AV_CPU_FLAG_ALTIVEC },
#endif

#if ARCH_BFIN
    { "BFINfdct",       ff_bfin_fdct,          NO_PERM  },
#endif

    /* sentinel */
    { 0 }
};
 105
 106 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
 107 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
 108                                 int16_t *block, int16_t *qmat);
 109
 110 static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst){
 111     DECLARE_ALIGNED(16, static int16_t, qmat)[64];
 112     DECLARE_ALIGNED(16, static int16_t, tmp)[64];
 113     int i;
 114
 115     for(i=0; i<64; i++){
 116         qmat[i]=4;
 117         tmp[i]= dst[i];
 118     }
 119     ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
 120 }
 121 #endif
 122
/**
 * Inverse DCT implementations under test.
 *
 * The third field of each entry is the coefficient layout that permute()
 * applies before the transform is called. A trailing 1 sets nonspec, so the
 * entry is still measured and reported, but exceeding the accuracy limits in
 * dct_error() does not make the run fail. Entries with an AV_CPU_FLAG_* value
 * are skipped by main() unless cpu_flags contains that flag. The table ends
 * with an all-zero entry (name == NULL).
 */
static const struct algo idct_tab[] = {
    { "FAANI",          ff_faanidct,           NO_PERM  },
    { "REF-DBL",        ff_ref_idct,           NO_PERM  },
    { "INT",            ff_j_rev_dct,          MMX_PERM },
    { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },

#if HAVE_MMX_INLINE
    { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
    { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
#endif
#if HAVE_MMXEXT_INLINE
    { "XVID-MMXEXT",    ff_idct_xvid_mmxext,   NO_PERM,   AV_CPU_FLAG_MMXEXT, 1 },
#endif
#if HAVE_SSE2_INLINE
    { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
#if ARCH_X86_64 && HAVE_YASM
    { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap,     TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
#endif
#endif

#if ARCH_BFIN
    { "BFINidct",       ff_bfin_idct,          NO_PERM  },
#endif

#if ARCH_ARM
    { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
    { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
#endif
#if HAVE_ARMV5TE
    { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM,   AV_CPU_FLAG_ARMV5TE },
#endif
#if HAVE_ARMV6
    { "SIMPLE-ARMV6",   ff_simple_idct_armv6,  MMX_PERM,  AV_CPU_FLAG_ARMV6   },
#endif
#if HAVE_NEON
    { "SIMPLE-NEON",    ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
#endif

#if ARCH_ALPHA
    { "SIMPLE-ALPHA",   ff_simple_idct_axp,    NO_PERM },
#endif

    /* sentinel */
    { 0 }
};
 167
/* Fractional bits of the fixed-point factor that dct_error() applies to the
 * output of SCALE_PERM transforms. */
#define AANSCALE_BITS 12

/* Number of generated blocks per accuracy test. */
#define NB_ITS 20000
/* Number of transforms executed between two clock reads in the speed tests. */
#define NB_ITS_SPEED 50000
 172
 173 static short idct_mmx_perm[64];
 174
 175 static short idct_simple_mmx_perm[64] = {
 176     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 177     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 178     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 179     0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 180     0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 181     0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 182     0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 183     0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 184 };
 185
 186 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
 187
 188 static void idct_mmx_init(void)
 189 {
 190     int i;
 191
 192     /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
 193     for (i = 0; i < 64; i++) {
 194         idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 195     }
 196 }
 197
/* Scratch coefficient blocks shared by dct_error() and idct248_error():
 * block is the buffer handed to the transform under test, block1 holds the
 * generated input (and, in dct_error(), the reference result). */
DECLARE_ALIGNED(16, static int16_t, block)[64];
DECLARE_ALIGNED(8,  static int16_t, block1)[64];
 200
 201 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
 202 {
 203     int i, j;
 204
 205     memset(block, 0, 64 * sizeof(*block));
 206
 207     switch (test) {
 208     case 0:
 209         for (i = 0; i < 64; i++)
 210             block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
 211         if (is_idct) {
 212             ff_ref_fdct(block);
 213             for (i = 0; i < 64; i++)
 214                 block[i] >>= 3;
 215         }
 216         break;
 217     case 1:
 218         j = av_lfg_get(prng) % 10 + 1;
 219         for (i = 0; i < j; i++) {
 220             int idx = av_lfg_get(prng) % 64;
 221             block[idx] = av_lfg_get(prng) % (2*vals) -vals;
 222         }
 223         break;
 224     case 2:
 225         block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
 226         block[63] = (block[0] & 1) ^ 1;
 227         break;
 228     }
 229 }
 230
 231 static void permute(int16_t dst[64], const int16_t src[64], int perm)
 232 {
 233     int i;
 234
 235     if (perm == MMX_PERM) {
 236         for (i = 0; i < 64; i++)
 237             dst[idct_mmx_perm[i]] = src[i];
 238     } else if (perm == MMX_SIMPLE_PERM) {
 239         for (i = 0; i < 64; i++)
 240             dst[idct_simple_mmx_perm[i]] = src[i];
 241     } else if (perm == SSE2_PERM) {
 242         for (i = 0; i < 64; i++)
 243             dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
 244     } else if (perm == PARTTRANS_PERM) {
 245         for (i = 0; i < 64; i++)
 246             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
 247     } else if (perm == TRANSPOSE_PERM) {
 248         for (i = 0; i < 64; i++)
 249             dst[(i>>3) | ((i<<3)&0x38)] = src[i];
 250     } else {
 251         for (i = 0; i < 64; i++)
 252             dst[i] = src[i];
 253     }
 254 }
 255
 256 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
 257 {
 258     void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
 259     int it, i, scale;
 260     int err_inf, v;
 261     int64_t err2, ti, ti1, it1, err_sum = 0;
 262     int64_t sysErr[64], sysErrMax = 0;
 263     int maxout = 0;
 264     int blockSumErrMax = 0, blockSumErr;
 265     AVLFG prng;
 266     const int vals=1<<bits;
 267     double omse, ome;
 268     int spec_err;
 269
 270     av_lfg_init(&prng, 1);
 271
 272     err_inf = 0;
 273     err2 = 0;
 274     for (i = 0; i < 64; i++)
 275         sysErr[i] = 0;
 276     for (it = 0; it < NB_ITS; it++) {
 277         init_block(block1, test, is_idct, &prng, vals);
 278         permute(block, block1, dct->format);
 279
 280         dct->func(block);
 281         emms_c();
 282
 283         if (dct->format == SCALE_PERM) {
 284             for (i = 0; i < 64; i++) {
 285                 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
 286                 block[i] = (block[i] * scale) >> AANSCALE_BITS;
 287             }
 288         }
 289
 290         ref(block1);
 291
 292         blockSumErr = 0;
 293         for (i = 0; i < 64; i++) {
 294             int err = block[i] - block1[i];
 295             err_sum += err;
 296             v = abs(err);
 297             if (v > err_inf)
 298                 err_inf = v;
 299             err2 += v * v;
 300             sysErr[i] += block[i] - block1[i];
 301             blockSumErr += v;
 302             if (abs(block[i]) > maxout)
 303                 maxout = abs(block[i]);
 304         }
 305         if (blockSumErrMax < blockSumErr)
 306             blockSumErrMax = blockSumErr;
 307     }
 308     for (i = 0; i < 64; i++)
 309         sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
 310
 311     for (i = 0; i < 64; i++) {
 312         if (i % 8 == 0)
 313             printf("\n");
 314         printf("%7d ", (int) sysErr[i]);
 315     }
 316     printf("\n");
 317
 318     omse = (double) err2 / NB_ITS / 64;
 319     ome  = (double) err_sum / NB_ITS / 64;
 320
 321     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
 322
 323     printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
 324            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
 325            omse, ome, (double) sysErrMax / NB_ITS,
 326            maxout, blockSumErrMax);
 327
 328     if (spec_err && !dct->nonspec)
 329         return 1;
 330
 331     if (!speed)
 332         return 0;
 333
 334     /* speed test */
 335
 336     init_block(block, test, is_idct, &prng, vals);
 337     permute(block1, block, dct->format);
 338
 339     ti = av_gettime();
 340     it1 = 0;
 341     do {
 342         for (it = 0; it < NB_ITS_SPEED; it++) {
 343             memcpy(block, block1, sizeof(block));
 344             dct->func(block);
 345         }
 346         emms_c();
 347         it1 += NB_ITS_SPEED;
 348         ti1 = av_gettime() - ti;
 349     } while (ti1 < 1000000);
 350
 351     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
 352            (double) it1 * 1000.0 / (double) ti1);
 353
 354     return 0;
 355 }
 356
/* 8x8 output pictures used by idct248_error(): img_dest receives the output
 * of the implementation under test, img_dest1 that of idct248_ref(). */
DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
 359
 360 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
 361 {
 362     static int init;
 363     static double c8[8][8];
 364     static double c4[4][4];
 365     double block1[64], block2[64], block3[64];
 366     double s, sum, v;
 367     int i, j, k;
 368
 369     if (!init) {
 370         init = 1;
 371
 372         for (i = 0; i < 8; i++) {
 373             sum = 0;
 374             for (j = 0; j < 8; j++) {
 375                 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
 376                 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
 377                 sum += c8[i][j] * c8[i][j];
 378             }
 379         }
 380
 381         for (i = 0; i < 4; i++) {
 382             sum = 0;
 383             for (j = 0; j < 4; j++) {
 384                 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
 385                 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
 386                 sum += c4[i][j] * c4[i][j];
 387             }
 388         }
 389     }
 390
 391     /* butterfly */
 392     s = 0.5 * sqrt(2.0);
 393     for (i = 0; i < 4; i++) {
 394         for (j = 0; j < 8; j++) {
 395             block1[8 * (2 * i) + j] =
 396                 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
 397             block1[8 * (2 * i + 1) + j] =
 398                 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
 399         }
 400     }
 401
 402     /* idct8 on lines */
 403     for (i = 0; i < 8; i++) {
 404         for (j = 0; j < 8; j++) {
 405             sum = 0;
 406             for (k = 0; k < 8; k++)
 407                 sum += c8[k][j] * block1[8 * i + k];
 408             block2[8 * i + j] = sum;
 409         }
 410     }
 411
 412     /* idct4 */
 413     for (i = 0; i < 8; i++) {
 414         for (j = 0; j < 4; j++) {
 415             /* top */
 416             sum = 0;
 417             for (k = 0; k < 4; k++)
 418                 sum += c4[k][j] * block2[8 * (2 * k) + i];
 419             block3[8 * (2 * j) + i] = sum;
 420
 421             /* bottom */
 422             sum = 0;
 423             for (k = 0; k < 4; k++)
 424                 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
 425             block3[8 * (2 * j + 1) + i] = sum;
 426         }
 427     }
 428
 429     /* clamp and store the result */
 430     for (i = 0; i < 8; i++) {
 431         for (j = 0; j < 8; j++) {
 432             v = block3[8 * i + j];
 433             if      (v < 0)   v = 0;
 434             else if (v > 255) v = 255;
 435             dest[i * linesize + j] = (int) rint(v);
 436         }
 437     }
 438 }
 439
 440 static void idct248_error(const char *name,
 441                           void (*idct248_put)(uint8_t *dest, int line_size,
 442                                               int16_t *block),
 443                           int speed)
 444 {
 445     int it, i, it1, ti, ti1, err_max, v;
 446     AVLFG prng;
 447
 448     av_lfg_init(&prng, 1);
 449
 450     /* just one test to see if code is correct (precision is less
 451        important here) */
 452     err_max = 0;
 453     for (it = 0; it < NB_ITS; it++) {
 454         /* XXX: use forward transform to generate values */
 455         for (i = 0; i < 64; i++)
 456             block1[i] = av_lfg_get(&prng) % 256 - 128;
 457         block1[0] += 1024;
 458
 459         for (i = 0; i < 64; i++)
 460             block[i] = block1[i];
 461         idct248_ref(img_dest1, 8, block);
 462
 463         for (i = 0; i < 64; i++)
 464             block[i] = block1[i];
 465         idct248_put(img_dest, 8, block);
 466
 467         for (i = 0; i < 64; i++) {
 468             v = abs((int) img_dest[i] - (int) img_dest1[i]);
 469             if (v == 255)
 470                 printf("%d %d\n", img_dest[i], img_dest1[i]);
 471             if (v > err_max)
 472                 err_max = v;
 473         }
 474 #if 0
 475         printf("ref=\n");
 476         for(i=0;i<8;i++) {
 477             int j;
 478             for(j=0;j<8;j++) {
 479                 printf(" %3d", img_dest1[i*8+j]);
 480             }
 481             printf("\n");
 482         }
 483
 484         printf("out=\n");
 485         for(i=0;i<8;i++) {
 486             int j;
 487             for(j=0;j<8;j++) {
 488                 printf(" %3d", img_dest[i*8+j]);
 489             }
 490             printf("\n");
 491         }
 492 #endif
 493     }
 494     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 495
 496     if (!speed)
 497         return;
 498
 499     ti = av_gettime();
 500     it1 = 0;
 501     do {
 502         for (it = 0; it < NB_ITS_SPEED; it++) {
 503             for (i = 0; i < 64; i++)
 504                 block[i] = block1[i];
 505             idct248_put(img_dest, 8, block);
 506         }
 507         emms_c();
 508         it1 += NB_ITS_SPEED;
 509         ti1 = av_gettime() - ti;
 510     } while (ti1 < 1000000);
 511
 512     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
 513            (double) it1 * 1000.0 / (double) ti1);
 514 }
 515
 516 static void help(void)
 517 {
 518     printf("dct-test [-i] [<test-number>] [<bits>]\n"
 519            "test-number 0 -> test with random matrixes\n"
 520            "            1 -> test with random sparse matrixes\n"
 521            "            2 -> do 3. test from mpeg4 std\n"
 522            "bits        Number of time domain bits to use, 8 is default\n"
 523            "-i          test IDCT implementations\n"
 524            "-4          test IDCT248 implementations\n"
 525            "-t          speed test\n");
 526 }
 527
 528 #if !HAVE_GETOPT
 529 #include "compat/getopt.c"
 530 #endif
 531
 532 int main(int argc, char **argv)
 533 {
 534     int test_idct = 0, test_248_dct = 0;
 535     int c, i;
 536     int test = 1;
 537     int speed = 0;
 538     int err = 0;
 539     int bits=8;
 540
 541     cpu_flags = av_get_cpu_flags();
 542
 543     ff_ref_dct_init();
 544     idct_mmx_init();
 545
 546     for (;;) {
 547         c = getopt(argc, argv, "ih4t");
 548         if (c == -1)
 549             break;
 550         switch (c) {
 551         case 'i':
 552             test_idct = 1;
 553             break;
 554         case '4':
 555             test_248_dct = 1;
 556             break;
 557         case 't':
 558             speed = 1;
 559             break;
 560         default:
 561         case 'h':
 562             help();
 563             return 0;
 564         }
 565     }
 566
 567     if (optind < argc)
 568         test = atoi(argv[optind]);
 569     if(optind+1 < argc) bits= atoi(argv[optind+1]);
 570
 571     printf("ffmpeg DCT/IDCT test\n");
 572
 573     if (test_248_dct) {
 574         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
 575     } else {
 576         const struct algo *algos = test_idct ? idct_tab : fdct_tab;
 577         for (i = 0; algos[i].name; i++)
 578             if (!(~cpu_flags & algos[i].mm_support)) {
 579                 err |= dct_error(&algos[i], test, test_idct, speed, bits);
 580             }
 581     }
 582
 583     if (err)
 584         printf("Error: %d.\n", err);
 585
 586     return !!err;
 587 }