git.sesse.net Git - ffmpeg/blob - libavcodec/dct-test.c

   1 /*
   2  * (c) 2001 Fabrice Bellard
   3  *     2007 Marc Hoffman <marc.hoffman@analog.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * DCT test (c) 2001 Fabrice Bellard
  25  * Started from sample code by Juan J. Sierralta P.
  26  */
  27
  28 #include "config.h"
  29 #include <stdlib.h>
  30 #include <stdio.h>
  31 #include <string.h>
  32 #if HAVE_UNISTD_H
  33 #include <unistd.h>
  34 #endif
  35 #include <math.h>
  36
  37 #include "libavutil/cpu.h"
  38 #include "libavutil/common.h"
  39 #include "libavutil/lfg.h"
  40 #include "libavutil/time.h"
  41
  42 #include "simple_idct.h"
  43 #include "aandcttab.h"
  44 #include "faandct.h"
  45 #include "faanidct.h"
  46 #include "x86/idct_xvid.h"
  47 #include "dctref.h"
  48
  49 #undef printf
  50
  51 void ff_mmx_idct(DCTELEM *data);
  52 void ff_mmxext_idct(DCTELEM *data);
  53
  54 // BFIN
  55 void ff_bfin_idct(DCTELEM *block);
  56 void ff_bfin_fdct(DCTELEM *block);
  57
  58 // ALTIVEC
  59 void ff_fdct_altivec(DCTELEM *block);
  60 //void ff_idct_altivec(DCTELEM *block);?? no routine
  61
  62 // ARM
  63 void ff_j_rev_dct_arm(DCTELEM *data);
  64 void ff_simple_idct_arm(DCTELEM *data);
  65 void ff_simple_idct_armv5te(DCTELEM *data);
  66 void ff_simple_idct_armv6(DCTELEM *data);
  67 void ff_simple_idct_neon(DCTELEM *data);
  68
  69 void ff_simple_idct_axp(DCTELEM *data);
  70
  71 struct algo {
  72     const char *name;
  73     void (*func)(DCTELEM *block);
  74     enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
  75                      SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
  76     int mm_support;
  77     int nonspec;
  78 };
  79
  80 static int cpu_flags;
  81
  82 static const struct algo fdct_tab[] = {
  83     { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
  84     { "FAAN",           ff_faandct,            NO_PERM    },
  85     { "IJG-AAN-INT",    ff_fdct_ifast,         SCALE_PERM },
  86     { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },
  87
  88 #if HAVE_MMX
  89     { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
  90     { "MMX2",           ff_fdct_mmx2,          NO_PERM,   AV_CPU_FLAG_MMX2    },
  91     { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
  92 #endif
  93
  94 #if HAVE_ALTIVEC
  95     { "altivecfdct",    ff_fdct_altivec,       NO_PERM,   AV_CPU_FLAG_ALTIVEC },
  96 #endif
  97
  98 #if ARCH_BFIN
  99     { "BFINfdct",       ff_bfin_fdct,          NO_PERM  },
 100 #endif
 101
 102     { 0 }
 103 };
 104
 105 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
 106 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
 107                                 DCTELEM *block, int16_t *qmat);
 108
 109 static void ff_prores_idct_put_10_sse2_wrap(DCTELEM *dst){
 110     DECLARE_ALIGNED(16, static int16_t, qmat)[64];
 111     DECLARE_ALIGNED(16, static int16_t, tmp)[64];
 112     int i;
 113
 114     for(i=0; i<64; i++){
 115         qmat[i]=4;
 116         tmp[i]= dst[i];
 117     }
 118     ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
 119 }
 120 #endif
 121
 122 static const struct algo idct_tab[] = {
 123     { "FAANI",          ff_faanidct,           NO_PERM  },
 124     { "REF-DBL",        ff_ref_idct,           NO_PERM  },
 125     { "INT",            ff_j_rev_dct,          MMX_PERM },
 126     { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },
 127
 128 #if HAVE_MMX
 129 #if CONFIG_GPL
 130     { "LIBMPEG2-MMX",   ff_mmx_idct,           MMX_PERM,  AV_CPU_FLAG_MMX,  1 },
 131     { "LIBMPEG2-MMX2",  ff_mmxext_idct,        MMX_PERM,  AV_CPU_FLAG_MMX2, 1 },
 132 #endif
 133     { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
 134     { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
 135     { "XVID-MMX2",      ff_idct_xvid_mmx2,     NO_PERM,   AV_CPU_FLAG_MMX2, 1 },
 136     { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
 137 #if ARCH_X86_64 && HAVE_YASM
 138     { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap,     TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
 139 #endif
 140 #endif
 141
 142 #if ARCH_BFIN
 143     { "BFINidct",       ff_bfin_idct,          NO_PERM  },
 144 #endif
 145
 146 #if ARCH_ARM
 147     { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
 148     { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
 149 #endif
 150 #if HAVE_ARMV5TE
 151     { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM  },
 152 #endif
 153 #if HAVE_ARMV6
 154     { "SIMPLE-ARMV6",   ff_simple_idct_armv6,  MMX_PERM },
 155 #endif
 156 #if HAVE_NEON
 157     { "SIMPLE-NEON",    ff_simple_idct_neon,   PARTTRANS_PERM },
 158 #endif
 159
 160 #if ARCH_ALPHA
 161     { "SIMPLE-ALPHA",   ff_simple_idct_axp,    NO_PERM },
 162 #endif
 163
 164     { 0 }
 165 };
 166
 167 #define AANSCALE_BITS 12
 168
 169 #define NB_ITS 20000
 170 #define NB_ITS_SPEED 50000
 171
 172 static short idct_mmx_perm[64];
 173
 174 static short idct_simple_mmx_perm[64] = {
 175     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 176     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 177     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 178     0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 179     0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 180     0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 181     0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 182     0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 183 };
 184
 185 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
 186
 187 static void idct_mmx_init(void)
 188 {
 189     int i;
 190
 191     /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
 192     for (i = 0; i < 64; i++) {
 193         idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 194     }
 195 }
 196
 197 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
 198 DECLARE_ALIGNED(8,  static DCTELEM, block1)[64];
 199
 200 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
 201 {
 202     int i, j;
 203
 204     memset(block, 0, 64 * sizeof(*block));
 205
 206     switch (test) {
 207     case 0:
 208         for (i = 0; i < 64; i++)
 209             block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
 210         if (is_idct) {
 211             ff_ref_fdct(block);
 212             for (i = 0; i < 64; i++)
 213                 block[i] >>= 3;
 214         }
 215         break;
 216     case 1:
 217         j = av_lfg_get(prng) % 10 + 1;
 218         for (i = 0; i < j; i++)
 219             block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
 220         break;
 221     case 2:
 222         block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
 223         block[63] = (block[0] & 1) ^ 1;
 224         break;
 225     }
 226 }
 227
 228 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
 229 {
 230     int i;
 231
 232     if (perm == MMX_PERM) {
 233         for (i = 0; i < 64; i++)
 234             dst[idct_mmx_perm[i]] = src[i];
 235     } else if (perm == MMX_SIMPLE_PERM) {
 236         for (i = 0; i < 64; i++)
 237             dst[idct_simple_mmx_perm[i]] = src[i];
 238     } else if (perm == SSE2_PERM) {
 239         for (i = 0; i < 64; i++)
 240             dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
 241     } else if (perm == PARTTRANS_PERM) {
 242         for (i = 0; i < 64; i++)
 243             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
 244     } else if (perm == TRANSPOSE_PERM) {
 245         for (i = 0; i < 64; i++)
 246             dst[(i>>3) | ((i<<3)&0x38)] = src[i];
 247     } else {
 248         for (i = 0; i < 64; i++)
 249             dst[i] = src[i];
 250     }
 251 }
 252
 253 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
 254 {
 255     void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
 256     int it, i, scale;
 257     int err_inf, v;
 258     int64_t err2, ti, ti1, it1, err_sum = 0;
 259     int64_t sysErr[64], sysErrMax = 0;
 260     int maxout = 0;
 261     int blockSumErrMax = 0, blockSumErr;
 262     AVLFG prng;
 263     const int vals=1<<bits;
 264     double omse, ome;
 265     int spec_err;
 266
 267     av_lfg_init(&prng, 1);
 268
 269     err_inf = 0;
 270     err2 = 0;
 271     for (i = 0; i < 64; i++)
 272         sysErr[i] = 0;
 273     for (it = 0; it < NB_ITS; it++) {
 274         init_block(block1, test, is_idct, &prng, vals);
 275         permute(block, block1, dct->format);
 276
 277         dct->func(block);
 278         emms_c();
 279
 280         if (dct->format == SCALE_PERM) {
 281             for (i = 0; i < 64; i++) {
 282                 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
 283                 block[i] = (block[i] * scale) >> AANSCALE_BITS;
 284             }
 285         }
 286
 287         ref(block1);
 288
 289         blockSumErr = 0;
 290         for (i = 0; i < 64; i++) {
 291             int err = block[i] - block1[i];
 292             err_sum += err;
 293             v = abs(err);
 294             if (v > err_inf)
 295                 err_inf = v;
 296             err2 += v * v;
 297             sysErr[i] += block[i] - block1[i];
 298             blockSumErr += v;
 299             if (abs(block[i]) > maxout)
 300                 maxout = abs(block[i]);
 301         }
 302         if (blockSumErrMax < blockSumErr)
 303             blockSumErrMax = blockSumErr;
 304     }
 305     for (i = 0; i < 64; i++)
 306         sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
 307
 308     for (i = 0; i < 64; i++) {
 309         if (i % 8 == 0)
 310             printf("\n");
 311         printf("%7d ", (int) sysErr[i]);
 312     }
 313     printf("\n");
 314
 315     omse = (double) err2 / NB_ITS / 64;
 316     ome  = (double) err_sum / NB_ITS / 64;
 317
 318     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
 319
 320     printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
 321            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
 322            omse, ome, (double) sysErrMax / NB_ITS,
 323            maxout, blockSumErrMax);
 324
 325     if (spec_err && !dct->nonspec)
 326         return 1;
 327
 328     if (!speed)
 329         return 0;
 330
 331     /* speed test */
 332
 333     init_block(block, test, is_idct, &prng, vals);
 334     permute(block1, block, dct->format);
 335
 336     ti = av_gettime();
 337     it1 = 0;
 338     do {
 339         for (it = 0; it < NB_ITS_SPEED; it++) {
 340             memcpy(block, block1, sizeof(block));
 341             dct->func(block);
 342         }
 343         emms_c();
 344         it1 += NB_ITS_SPEED;
 345         ti1 = av_gettime() - ti;
 346     } while (ti1 < 1000000);
 347
 348     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
 349            (double) it1 * 1000.0 / (double) ti1);
 350
 351     return 0;
 352 }
 353
 354 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
 355 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
 356
 357 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
 358 {
 359     static int init;
 360     static double c8[8][8];
 361     static double c4[4][4];
 362     double block1[64], block2[64], block3[64];
 363     double s, sum, v;
 364     int i, j, k;
 365
 366     if (!init) {
 367         init = 1;
 368
 369         for (i = 0; i < 8; i++) {
 370             sum = 0;
 371             for (j = 0; j < 8; j++) {
 372                 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
 373                 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
 374                 sum += c8[i][j] * c8[i][j];
 375             }
 376         }
 377
 378         for (i = 0; i < 4; i++) {
 379             sum = 0;
 380             for (j = 0; j < 4; j++) {
 381                 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
 382                 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
 383                 sum += c4[i][j] * c4[i][j];
 384             }
 385         }
 386     }
 387
 388     /* butterfly */
 389     s = 0.5 * sqrt(2.0);
 390     for (i = 0; i < 4; i++) {
 391         for (j = 0; j < 8; j++) {
 392             block1[8 * (2 * i) + j] =
 393                 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
 394             block1[8 * (2 * i + 1) + j] =
 395                 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
 396         }
 397     }
 398
 399     /* idct8 on lines */
 400     for (i = 0; i < 8; i++) {
 401         for (j = 0; j < 8; j++) {
 402             sum = 0;
 403             for (k = 0; k < 8; k++)
 404                 sum += c8[k][j] * block1[8 * i + k];
 405             block2[8 * i + j] = sum;
 406         }
 407     }
 408
 409     /* idct4 */
 410     for (i = 0; i < 8; i++) {
 411         for (j = 0; j < 4; j++) {
 412             /* top */
 413             sum = 0;
 414             for (k = 0; k < 4; k++)
 415                 sum += c4[k][j] * block2[8 * (2 * k) + i];
 416             block3[8 * (2 * j) + i] = sum;
 417
 418             /* bottom */
 419             sum = 0;
 420             for (k = 0; k < 4; k++)
 421                 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
 422             block3[8 * (2 * j + 1) + i] = sum;
 423         }
 424     }
 425
 426     /* clamp and store the result */
 427     for (i = 0; i < 8; i++) {
 428         for (j = 0; j < 8; j++) {
 429             v = block3[8 * i + j];
 430             if      (v < 0)   v = 0;
 431             else if (v > 255) v = 255;
 432             dest[i * linesize + j] = (int) rint(v);
 433         }
 434     }
 435 }
 436
 437 static void idct248_error(const char *name,
 438                           void (*idct248_put)(uint8_t *dest, int line_size,
 439                                               int16_t *block),
 440                           int speed)
 441 {
 442     int it, i, it1, ti, ti1, err_max, v;
 443     AVLFG prng;
 444
 445     av_lfg_init(&prng, 1);
 446
 447     /* just one test to see if code is correct (precision is less
 448        important here) */
 449     err_max = 0;
 450     for (it = 0; it < NB_ITS; it++) {
 451         /* XXX: use forward transform to generate values */
 452         for (i = 0; i < 64; i++)
 453             block1[i] = av_lfg_get(&prng) % 256 - 128;
 454         block1[0] += 1024;
 455
 456         for (i = 0; i < 64; i++)
 457             block[i] = block1[i];
 458         idct248_ref(img_dest1, 8, block);
 459
 460         for (i = 0; i < 64; i++)
 461             block[i] = block1[i];
 462         idct248_put(img_dest, 8, block);
 463
 464         for (i = 0; i < 64; i++) {
 465             v = abs((int) img_dest[i] - (int) img_dest1[i]);
 466             if (v == 255)
 467                 printf("%d %d\n", img_dest[i], img_dest1[i]);
 468             if (v > err_max)
 469                 err_max = v;
 470         }
 471 #if 0
 472         printf("ref=\n");
 473         for(i=0;i<8;i++) {
 474             int j;
 475             for(j=0;j<8;j++) {
 476                 printf(" %3d", img_dest1[i*8+j]);
 477             }
 478             printf("\n");
 479         }
 480
 481         printf("out=\n");
 482         for(i=0;i<8;i++) {
 483             int j;
 484             for(j=0;j<8;j++) {
 485                 printf(" %3d", img_dest[i*8+j]);
 486             }
 487             printf("\n");
 488         }
 489 #endif
 490     }
 491     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 492
 493     if (!speed)
 494         return;
 495
 496     ti = av_gettime();
 497     it1 = 0;
 498     do {
 499         for (it = 0; it < NB_ITS_SPEED; it++) {
 500             for (i = 0; i < 64; i++)
 501                 block[i] = block1[i];
 502             idct248_put(img_dest, 8, block);
 503         }
 504         emms_c();
 505         it1 += NB_ITS_SPEED;
 506         ti1 = av_gettime() - ti;
 507     } while (ti1 < 1000000);
 508
 509     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
 510            (double) it1 * 1000.0 / (double) ti1);
 511 }
 512
 513 static void help(void)
 514 {
 515     printf("dct-test [-i] [<test-number>] [<bits>]\n"
 516            "test-number 0 -> test with random matrixes\n"
 517            "            1 -> test with random sparse matrixes\n"
 518            "            2 -> do 3. test from mpeg4 std\n"
 519            "bits        Number of time domain bits to use, 8 is default\n"
 520            "-i          test IDCT implementations\n"
 521            "-4          test IDCT248 implementations\n"
 522            "-t          speed test\n");
 523 }
 524
 525 #if !HAVE_GETOPT
 526 #include "compat/getopt.c"
 527 #endif
 528
 529 int main(int argc, char **argv)
 530 {
 531     int test_idct = 0, test_248_dct = 0;
 532     int c, i;
 533     int test = 1;
 534     int speed = 0;
 535     int err = 0;
 536     int bits=8;
 537
 538     cpu_flags = av_get_cpu_flags();
 539
 540     ff_ref_dct_init();
 541     idct_mmx_init();
 542
 543     for (;;) {
 544         c = getopt(argc, argv, "ih4t");
 545         if (c == -1)
 546             break;
 547         switch (c) {
 548         case 'i':
 549             test_idct = 1;
 550             break;
 551         case '4':
 552             test_248_dct = 1;
 553             break;
 554         case 't':
 555             speed = 1;
 556             break;
 557         default:
 558         case 'h':
 559             help();
 560             return 0;
 561         }
 562     }
 563
 564     if (optind < argc)
 565         test = atoi(argv[optind]);
 566     if(optind+1 < argc) bits= atoi(argv[optind+1]);
 567
 568     printf("ffmpeg DCT/IDCT test\n");
 569
 570     if (test_248_dct) {
 571         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
 572     } else {
 573         const struct algo *algos = test_idct ? idct_tab : fdct_tab;
 574         for (i = 0; algos[i].name; i++)
 575             if (!(~cpu_flags & algos[i].mm_support)) {
 576                 err |= dct_error(&algos[i], test, test_idct, speed, bits);
 577             }
 578     }
 579
 580     return err;
 581 }