git.sesse.net Git - ffmpeg/blob - libavcodec/dct-test.c

   1 /*
   2  * (c) 2001 Fabrice Bellard
   3  *     2007 Marc Hoffman <marc.hoffman@analog.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * DCT test (c) 2001 Fabrice Bellard
  25  * Started from sample code by Juan J. Sierralta P.
  26  */
  27
  28 #include "config.h"
  29 #include <stdlib.h>
  30 #include <stdio.h>
  31 #include <string.h>
  32 #if HAVE_UNISTD_H
  33 #include <unistd.h>
  34 #endif
  35 #include <math.h>
  36
  37 #include "libavutil/cpu.h"
  38 #include "libavutil/common.h"
  39 #include "libavutil/lfg.h"
  40 #include "libavutil/time.h"
  41
  42 #include "dct.h"
  43 #include "idctdsp.h"
  44 #include "simple_idct.h"
  45 #include "aandcttab.h"
  46 #include "faandct.h"
  47 #include "faanidct.h"
  48 #include "arm/idct.h"
  49 #include "ppc/fdct.h"
  50 #include "x86/fdct.h"
  51 #include "x86/idct_xvid.h"
  52 #include "x86/simple_idct.h"
  53 #include "dctref.h"
  54
/**
 * Description of one (I)DCT implementation under test.
 */
struct algo {
    const char *name;                     ///< label printed in reports; a NULL name ends a table
    void (*func)(int16_t *block);         ///< transform, run in place on a 64-coefficient block
    enum idct_permutation_type perm_type; ///< coefficient order applied to the input by permute()
    int cpu_flag;                         ///< CPU flags that must all be reported by av_get_cpu_flags() for the entry to run
    int nonspec;                          ///< when non-zero, exceeding the accuracy limits does not fail the test
};
  62
  63 static const struct algo fdct_tab[] = {
  64     { "REF-DBL",     ff_ref_fdct,          FF_IDCT_PERM_NONE },
  65     { "FAAN",        ff_faandct,           FF_IDCT_PERM_NONE },
  66     { "IJG-AAN-INT", ff_fdct_ifast,        FF_IDCT_PERM_NONE },
  67     { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, FF_IDCT_PERM_NONE },
  68
  69 #if HAVE_MMX_INLINE
  70     { "MMX",         ff_fdct_mmx,          FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX },
  71 #endif
  72 #if HAVE_MMXEXT_INLINE
  73     { "MMXEXT",      ff_fdct_mmxext,       FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMXEXT },
  74 #endif
  75 #if HAVE_SSE2_INLINE
  76     { "SSE2",        ff_fdct_sse2,         FF_IDCT_PERM_NONE, AV_CPU_FLAG_SSE2 },
  77 #endif
  78
  79 #if HAVE_ALTIVEC
  80     { "altivecfdct", ff_fdct_altivec,      FF_IDCT_PERM_NONE, AV_CPU_FLAG_ALTIVEC },
  81 #endif
  82
  83     { 0 }
  84 };
  85
  86 static void ff_prores_idct_wrap(int16_t *dst){
  87     DECLARE_ALIGNED(16, static int16_t, qmat)[64];
  88     int i;
  89
  90     for(i=0; i<64; i++){
  91         qmat[i]=4;
  92     }
  93     ff_prores_idct(dst, qmat);
  94     for(i=0; i<64; i++) {
  95          dst[i] -= 512;
  96     }
  97 }
  98 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
  99 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
 100                                 int16_t *block, int16_t *qmat);
 101
 102 static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst){
 103     DECLARE_ALIGNED(16, static int16_t, qmat)[64];
 104     DECLARE_ALIGNED(16, static int16_t, tmp)[64];
 105     int i;
 106
 107     for(i=0; i<64; i++){
 108         qmat[i]=4;
 109         tmp[i]= dst[i];
 110     }
 111     ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
 112
 113     for(i=0; i<64; i++) {
 114          dst[i] -= 512;
 115     }
 116 }
 117 #endif
 118
 119 static const struct algo idct_tab[] = {
 120     { "FAANI",       ff_faanidct,          FF_IDCT_PERM_NONE },
 121     { "REF-DBL",     ff_ref_idct,          FF_IDCT_PERM_NONE },
 122     { "INT",         ff_j_rev_dct,         FF_IDCT_PERM_LIBMPEG2 },
 123     { "SIMPLE-C",    ff_simple_idct_8,     FF_IDCT_PERM_NONE },
 124     { "PR-C",        ff_prores_idct_wrap,  FF_IDCT_PERM_NONE, 0, 1 },
 125
 126 #if HAVE_MMX_INLINE
 127     { "SIMPLE-MMX",     ff_simple_idct_mmx,     FF_IDCT_PERM_SIMPLE,    AV_CPU_FLAG_MMX },
 128     { "XVID-MMX",       ff_idct_xvid_mmx,       FF_IDCT_PERM_NONE,      AV_CPU_FLAG_MMX,    1 },
 129 #endif
 130 #if HAVE_MMXEXT_INLINE
 131     { "XVID-MMXEXT",    ff_idct_xvid_mmxext,    FF_IDCT_PERM_NONE,      AV_CPU_FLAG_MMXEXT, 1 },
 132 #endif
 133 #if HAVE_SSE2_INLINE
 134     { "XVID-SSE2",      ff_idct_xvid_sse2,      FF_IDCT_PERM_SSE2,      AV_CPU_FLAG_SSE2,   1 },
 135 #if ARCH_X86_64 && HAVE_YASM
 136     { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
 137 #endif
 138 #endif
 139
 140 #if ARCH_ARM
 141     { "SIMPLE-ARM",     ff_simple_idct_arm,     FF_IDCT_PERM_NONE },
 142     { "INT-ARM",        ff_j_rev_dct_arm,       FF_IDCT_PERM_LIBMPEG2 },
 143 #endif
 144 #if HAVE_ARMV5TE
 145     { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te, FF_IDCT_PERM_NONE,      AV_CPU_FLAG_ARMV5TE },
 146 #endif
 147 #if HAVE_ARMV6
 148     { "SIMPLE-ARMV6",   ff_simple_idct_armv6,   FF_IDCT_PERM_LIBMPEG2,  AV_CPU_FLAG_ARMV6 },
 149 #endif
 150 #if HAVE_NEON && ARCH_ARM
 151     { "SIMPLE-NEON",    ff_simple_idct_neon,    FF_IDCT_PERM_PARTTRANS, AV_CPU_FLAG_NEON },
 152 #endif
 153
 154     { 0 }
 155 };
 156
/** Fractional bits of the fixed-point factor dct_error() uses to rescale "IJG-AAN-INT" output. */
#define AANSCALE_BITS 12

/** Number of generated blocks processed by each accuracy test. */
#define NB_ITS 20000
/** Number of transform calls per timing batch in the speed tests. */
#define NB_ITS_SPEED 50000
 161
 162 static short idct_simple_mmx_perm[64] = {
 163     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 164     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 165     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 166     0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 167     0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 168     0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 169     0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 170     0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 171 };
 172
/** Column order that permute() applies within each row of eight for FF_IDCT_PERM_SSE2. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
 174
/* Scratch coefficient buffers shared by dct_error() and idct248_error();
 * the transform under test is always run on block. */
DECLARE_ALIGNED(16, static int16_t, block)[64];
DECLARE_ALIGNED(8,  static int16_t, block1)[64];
 177
 178 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
 179 {
 180     int i, j;
 181
 182     memset(block, 0, 64 * sizeof(*block));
 183
 184     switch (test) {
 185     case 0:
 186         for (i = 0; i < 64; i++)
 187             block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
 188         if (is_idct) {
 189             ff_ref_fdct(block);
 190             for (i = 0; i < 64; i++)
 191                 block[i] >>= 3;
 192         }
 193         break;
 194     case 1:
 195         j = av_lfg_get(prng) % 10 + 1;
 196         for (i = 0; i < j; i++) {
 197             int idx = av_lfg_get(prng) % 64;
 198             block[idx] = av_lfg_get(prng) % (2*vals) -vals;
 199         }
 200         break;
 201     case 2:
 202         block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
 203         block[63] = (block[0] & 1) ^ 1;
 204         break;
 205     }
 206 }
 207
 208 static void permute(int16_t dst[64], const int16_t src[64],
 209                     enum idct_permutation_type perm_type)
 210 {
 211     int i;
 212
 213     switch (perm_type) {
 214     case FF_IDCT_PERM_LIBMPEG2:
 215         for (i = 0; i < 64; i++)
 216             dst[(i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2)] = src[i];
 217         break;
 218     case FF_IDCT_PERM_SIMPLE:
 219         for (i = 0; i < 64; i++)
 220             dst[idct_simple_mmx_perm[i]] = src[i];
 221         break;
 222     case FF_IDCT_PERM_SSE2:
 223         for (i = 0; i < 64; i++)
 224             dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
 225         break;
 226     case FF_IDCT_PERM_PARTTRANS:
 227         for (i = 0; i < 64; i++)
 228             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
 229         break;
 230     case FF_IDCT_PERM_TRANSPOSE:
 231         for (i = 0; i < 64; i++)
 232             dst[(i>>3) | ((i<<3)&0x38)] = src[i];
 233         break;
 234     default:
 235         for (i = 0; i < 64; i++)
 236             dst[i] = src[i];
 237         break;
 238     }
 239 }
 240
 241 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
 242 {
 243     void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
 244     int it, i, scale;
 245     int err_inf, v;
 246     int64_t err2, ti, ti1, it1, err_sum = 0;
 247     int64_t sysErr[64], sysErrMax = 0;
 248     int maxout = 0;
 249     int blockSumErrMax = 0, blockSumErr;
 250     AVLFG prng;
 251     const int vals=1<<bits;
 252     double omse, ome;
 253     int spec_err;
 254
 255     av_lfg_init(&prng, 1);
 256
 257     err_inf = 0;
 258     err2 = 0;
 259     for (i = 0; i < 64; i++)
 260         sysErr[i] = 0;
 261     for (it = 0; it < NB_ITS; it++) {
 262         init_block(block1, test, is_idct, &prng, vals);
 263         permute(block, block1, dct->perm_type);
 264
 265         dct->func(block);
 266         emms_c();
 267
 268         if (!strcmp(dct->name, "IJG-AAN-INT")) {
 269             for (i = 0; i < 64; i++) {
 270                 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
 271                 block[i] = (block[i] * scale) >> AANSCALE_BITS;
 272             }
 273         }
 274
 275         ref(block1);
 276         if (!strcmp(dct->name, "PR-SSE2"))
 277             for (i = 0; i < 64; i++)
 278                 block1[i] = av_clip(block1[i], 4-512, 1019-512);
 279
 280         blockSumErr = 0;
 281         for (i = 0; i < 64; i++) {
 282             int err = block[i] - block1[i];
 283             err_sum += err;
 284             v = abs(err);
 285             if (v > err_inf)
 286                 err_inf = v;
 287             err2 += v * v;
 288             sysErr[i] += block[i] - block1[i];
 289             blockSumErr += v;
 290             if (abs(block[i]) > maxout)
 291                 maxout = abs(block[i]);
 292         }
 293         if (blockSumErrMax < blockSumErr)
 294             blockSumErrMax = blockSumErr;
 295     }
 296     for (i = 0; i < 64; i++)
 297         sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
 298
 299     for (i = 0; i < 64; i++) {
 300         if (i % 8 == 0)
 301             printf("\n");
 302         printf("%7d ", (int) sysErr[i]);
 303     }
 304     printf("\n");
 305
 306     omse = (double) err2 / NB_ITS / 64;
 307     ome  = (double) err_sum / NB_ITS / 64;
 308
 309     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
 310
 311     printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
 312            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
 313            omse, ome, (double) sysErrMax / NB_ITS,
 314            maxout, blockSumErrMax);
 315
 316     if (spec_err && !dct->nonspec)
 317         return 1;
 318
 319     if (!speed)
 320         return 0;
 321
 322     /* speed test */
 323
 324     init_block(block, test, is_idct, &prng, vals);
 325     permute(block1, block, dct->perm_type);
 326
 327     ti = av_gettime_relative();
 328     it1 = 0;
 329     do {
 330         for (it = 0; it < NB_ITS_SPEED; it++) {
 331             memcpy(block, block1, sizeof(block));
 332             dct->func(block);
 333         }
 334         emms_c();
 335         it1 += NB_ITS_SPEED;
 336         ti1 = av_gettime_relative() - ti;
 337     } while (ti1 < 1000000);
 338
 339     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
 340            (double) it1 * 1000.0 / (double) ti1);
 341
 342     return 0;
 343 }
 344
/* 8x8 output buffers of idct248_error(): img_dest receives the output of
 * the implementation under test, img_dest1 the output of idct248_ref(). */
DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
 347
 348 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
 349 {
 350     static int init;
 351     static double c8[8][8];
 352     static double c4[4][4];
 353     double block1[64], block2[64], block3[64];
 354     double s, sum, v;
 355     int i, j, k;
 356
 357     if (!init) {
 358         init = 1;
 359
 360         for (i = 0; i < 8; i++) {
 361             sum = 0;
 362             for (j = 0; j < 8; j++) {
 363                 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
 364                 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
 365                 sum += c8[i][j] * c8[i][j];
 366             }
 367         }
 368
 369         for (i = 0; i < 4; i++) {
 370             sum = 0;
 371             for (j = 0; j < 4; j++) {
 372                 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
 373                 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
 374                 sum += c4[i][j] * c4[i][j];
 375             }
 376         }
 377     }
 378
 379     /* butterfly */
 380     s = 0.5 * sqrt(2.0);
 381     for (i = 0; i < 4; i++) {
 382         for (j = 0; j < 8; j++) {
 383             block1[8 * (2 * i) + j] =
 384                 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
 385             block1[8 * (2 * i + 1) + j] =
 386                 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
 387         }
 388     }
 389
 390     /* idct8 on lines */
 391     for (i = 0; i < 8; i++) {
 392         for (j = 0; j < 8; j++) {
 393             sum = 0;
 394             for (k = 0; k < 8; k++)
 395                 sum += c8[k][j] * block1[8 * i + k];
 396             block2[8 * i + j] = sum;
 397         }
 398     }
 399
 400     /* idct4 */
 401     for (i = 0; i < 8; i++) {
 402         for (j = 0; j < 4; j++) {
 403             /* top */
 404             sum = 0;
 405             for (k = 0; k < 4; k++)
 406                 sum += c4[k][j] * block2[8 * (2 * k) + i];
 407             block3[8 * (2 * j) + i] = sum;
 408
 409             /* bottom */
 410             sum = 0;
 411             for (k = 0; k < 4; k++)
 412                 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
 413             block3[8 * (2 * j + 1) + i] = sum;
 414         }
 415     }
 416
 417     /* clamp and store the result */
 418     for (i = 0; i < 8; i++) {
 419         for (j = 0; j < 8; j++) {
 420             v = block3[8 * i + j];
 421             if      (v < 0)   v = 0;
 422             else if (v > 255) v = 255;
 423             dest[i * linesize + j] = (int) rint(v);
 424         }
 425     }
 426 }
 427
 428 static void idct248_error(const char *name,
 429                           void (*idct248_put)(uint8_t *dest, int line_size,
 430                                               int16_t *block),
 431                           int speed)
 432 {
 433     int it, i, it1, ti, ti1, err_max, v;
 434     AVLFG prng;
 435
 436     av_lfg_init(&prng, 1);
 437
 438     /* just one test to see if code is correct (precision is less
 439        important here) */
 440     err_max = 0;
 441     for (it = 0; it < NB_ITS; it++) {
 442         /* XXX: use forward transform to generate values */
 443         for (i = 0; i < 64; i++)
 444             block1[i] = av_lfg_get(&prng) % 256 - 128;
 445         block1[0] += 1024;
 446
 447         for (i = 0; i < 64; i++)
 448             block[i] = block1[i];
 449         idct248_ref(img_dest1, 8, block);
 450
 451         for (i = 0; i < 64; i++)
 452             block[i] = block1[i];
 453         idct248_put(img_dest, 8, block);
 454
 455         for (i = 0; i < 64; i++) {
 456             v = abs((int) img_dest[i] - (int) img_dest1[i]);
 457             if (v == 255)
 458                 printf("%d %d\n", img_dest[i], img_dest1[i]);
 459             if (v > err_max)
 460                 err_max = v;
 461         }
 462 #if 0
 463         printf("ref=\n");
 464         for(i=0;i<8;i++) {
 465             int j;
 466             for(j=0;j<8;j++) {
 467                 printf(" %3d", img_dest1[i*8+j]);
 468             }
 469             printf("\n");
 470         }
 471
 472         printf("out=\n");
 473         for(i=0;i<8;i++) {
 474             int j;
 475             for(j=0;j<8;j++) {
 476                 printf(" %3d", img_dest[i*8+j]);
 477             }
 478             printf("\n");
 479         }
 480 #endif
 481     }
 482     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 483
 484     if (!speed)
 485         return;
 486
 487     ti = av_gettime_relative();
 488     it1 = 0;
 489     do {
 490         for (it = 0; it < NB_ITS_SPEED; it++) {
 491             for (i = 0; i < 64; i++)
 492                 block[i] = block1[i];
 493             idct248_put(img_dest, 8, block);
 494         }
 495         emms_c();
 496         it1 += NB_ITS_SPEED;
 497         ti1 = av_gettime_relative() - ti;
 498     } while (ti1 < 1000000);
 499
 500     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
 501            (double) it1 * 1000.0 / (double) ti1);
 502 }
 503
 504 static void help(void)
 505 {
 506     printf("dct-test [-i] [<test-number>] [<bits>]\n"
 507            "test-number 0 -> test with random matrixes\n"
 508            "            1 -> test with random sparse matrixes\n"
 509            "            2 -> do 3. test from mpeg4 std\n"
 510            "bits        Number of time domain bits to use, 8 is default\n"
 511            "-i          test IDCT implementations\n"
 512            "-4          test IDCT248 implementations\n"
 513            "-t          speed test\n");
 514 }
 515
 516 #if !HAVE_GETOPT
 517 #include "compat/getopt.c"
 518 #endif
 519
 520 int main(int argc, char **argv)
 521 {
 522     int test_idct = 0, test_248_dct = 0;
 523     int c, i;
 524     int test = 1;
 525     int speed = 0;
 526     int err = 0;
 527     int bits=8;
 528
 529     ff_ref_dct_init();
 530
 531     for (;;) {
 532         c = getopt(argc, argv, "ih4t");
 533         if (c == -1)
 534             break;
 535         switch (c) {
 536         case 'i':
 537             test_idct = 1;
 538             break;
 539         case '4':
 540             test_248_dct = 1;
 541             break;
 542         case 't':
 543             speed = 1;
 544             break;
 545         default:
 546         case 'h':
 547             help();
 548             return 0;
 549         }
 550     }
 551
 552     if (optind < argc)
 553         test = atoi(argv[optind]);
 554     if(optind+1 < argc) bits= atoi(argv[optind+1]);
 555
 556     printf("ffmpeg DCT/IDCT test\n");
 557
 558     if (test_248_dct) {
 559         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
 560     } else {
 561         const int cpu_flags = av_get_cpu_flags();
 562         const struct algo *algos = test_idct ? idct_tab : fdct_tab;
 563         for (i = 0; algos[i].name; i++)
 564             if (!(~cpu_flags & algos[i].cpu_flag)) {
 565                 err |= dct_error(&algos[i], test, test_idct, speed, bits);
 566             }
 567     }
 568
 569     if (err)
 570         printf("Error: %d.\n", err);
 571
 572     return !!err;
 573 }