git.sesse.net Git - ffmpeg/blob - libavcodec/dct-test.c

   1 /*
   2  * (c) 2001 Fabrice Bellard
   3  *     2007 Marc Hoffman <marc.hoffman@analog.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * DCT test (c) 2001 Fabrice Bellard
  25  * Started from sample code by Juan J. Sierralta P.
  26  */
  27
  28 #include "config.h"
  29 #include <stdlib.h>
  30 #include <stdio.h>
  31 #include <string.h>
  32 #if HAVE_UNISTD_H
  33 #include <unistd.h>
  34 #endif
  35 #include <math.h>
  36
  37 #include "libavutil/cpu.h"
  38 #include "libavutil/common.h"
  39 #include "libavutil/lfg.h"
  40 #include "libavutil/time.h"
  41
  42 #include "dct.h"
  43 #include "simple_idct.h"
  44 #include "aandcttab.h"
  45 #include "faandct.h"
  46 #include "faanidct.h"
  47 #include "x86/idct_xvid.h"
  48 #include "dctref.h"
  49
  50 // ALTIVEC
  51 void ff_fdct_altivec(int16_t *block);
  52
  53 // ARM
  54 void ff_j_rev_dct_arm(int16_t *data);
  55 void ff_simple_idct_arm(int16_t *data);
  56 void ff_simple_idct_armv5te(int16_t *data);
  57 void ff_simple_idct_armv6(int16_t *data);
  58 void ff_simple_idct_neon(int16_t *data);
  59
  60 struct algo {
  61     const char *name;
  62     void (*func)(int16_t *block);
  63     enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
  64                      SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
  65     int mm_support;
  66     int nonspec;
  67 };
  68
  69 static int cpu_flags;
  70
  71 static const struct algo fdct_tab[] = {
  72     { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
  73     { "FAAN",           ff_faandct,            NO_PERM    },
  74     { "IJG-AAN-INT",    ff_fdct_ifast,         SCALE_PERM },
  75     { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },
  76
  77 #if HAVE_MMX_INLINE
  78     { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
  79 #endif
  80 #if HAVE_MMXEXT_INLINE
  81     { "MMXEXT",         ff_fdct_mmxext,        NO_PERM,   AV_CPU_FLAG_MMXEXT  },
  82 #endif
  83 #if HAVE_SSE2_INLINE
  84     { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
  85 #endif
  86
  87 #if HAVE_ALTIVEC
  88     { "altivecfdct",    ff_fdct_altivec,       NO_PERM,   AV_CPU_FLAG_ALTIVEC },
  89 #endif
  90
  91     { 0 }
  92 };
  93
  94 static void ff_prores_idct_wrap(int16_t *dst){
  95     DECLARE_ALIGNED(16, static int16_t, qmat)[64];
  96     int i;
  97
  98     for(i=0; i<64; i++){
  99         qmat[i]=4;
 100     }
 101     ff_prores_idct(dst, qmat);
 102     for(i=0; i<64; i++) {
 103          dst[i] -= 512;
 104     }
 105 }
 106 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
 107 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
 108                                 int16_t *block, int16_t *qmat);
 109
 110 static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst){
 111     DECLARE_ALIGNED(16, static int16_t, qmat)[64];
 112     DECLARE_ALIGNED(16, static int16_t, tmp)[64];
 113     int i;
 114
 115     for(i=0; i<64; i++){
 116         qmat[i]=4;
 117         tmp[i]= dst[i];
 118     }
 119     ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
 120
 121     for(i=0; i<64; i++) {
 122          dst[i] -= 512;
 123     }
 124 }
 125 #endif
 126
 127 static const struct algo idct_tab[] = {
 128     { "FAANI",          ff_faanidct,           NO_PERM  },
 129     { "REF-DBL",        ff_ref_idct,           NO_PERM  },
 130     { "INT",            ff_j_rev_dct,          MMX_PERM },
 131     { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },
 132     { "PR-C",           ff_prores_idct_wrap,   NO_PERM, 0, 1 },
 133
 134 #if HAVE_MMX_INLINE
 135     { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
 136     { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
 137 #endif
 138 #if HAVE_MMXEXT_INLINE
 139     { "XVID-MMXEXT",    ff_idct_xvid_mmxext,   NO_PERM,   AV_CPU_FLAG_MMXEXT, 1 },
 140 #endif
 141 #if HAVE_SSE2_INLINE
 142     { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
 143 #if ARCH_X86_64 && HAVE_YASM
 144     { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap,     TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
 145 #endif
 146 #endif
 147
 148 #if ARCH_ARM
 149     { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
 150     { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
 151 #endif
 152 #if HAVE_ARMV5TE
 153     { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM,   AV_CPU_FLAG_ARMV5TE },
 154 #endif
 155 #if HAVE_ARMV6
 156     { "SIMPLE-ARMV6",   ff_simple_idct_armv6,  MMX_PERM,  AV_CPU_FLAG_ARMV6   },
 157 #endif
 158 #if HAVE_NEON && ARCH_ARM
 159     { "SIMPLE-NEON",    ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
 160 #endif
 161
 162     { 0 }
 163 };
 164
 165 #define AANSCALE_BITS 12
 166
 167 #define NB_ITS 20000
 168 #define NB_ITS_SPEED 50000
 169
 170 static short idct_mmx_perm[64];
 171
 172 static short idct_simple_mmx_perm[64] = {
 173     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 174     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 175     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 176     0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 177     0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 178     0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 179     0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 180     0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 181 };
 182
 183 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
 184
 185 static void idct_mmx_init(void)
 186 {
 187     int i;
 188
 189     /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
 190     for (i = 0; i < 64; i++) {
 191         idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 192     }
 193 }
 194
 195 DECLARE_ALIGNED(16, static int16_t, block)[64];
 196 DECLARE_ALIGNED(8,  static int16_t, block1)[64];
 197
 198 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
 199 {
 200     int i, j;
 201
 202     memset(block, 0, 64 * sizeof(*block));
 203
 204     switch (test) {
 205     case 0:
 206         for (i = 0; i < 64; i++)
 207             block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
 208         if (is_idct) {
 209             ff_ref_fdct(block);
 210             for (i = 0; i < 64; i++)
 211                 block[i] >>= 3;
 212         }
 213         break;
 214     case 1:
 215         j = av_lfg_get(prng) % 10 + 1;
 216         for (i = 0; i < j; i++) {
 217             int idx = av_lfg_get(prng) % 64;
 218             block[idx] = av_lfg_get(prng) % (2*vals) -vals;
 219         }
 220         break;
 221     case 2:
 222         block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
 223         block[63] = (block[0] & 1) ^ 1;
 224         break;
 225     }
 226 }
 227
 228 static void permute(int16_t dst[64], const int16_t src[64], int perm)
 229 {
 230     int i;
 231
 232     if (perm == MMX_PERM) {
 233         for (i = 0; i < 64; i++)
 234             dst[idct_mmx_perm[i]] = src[i];
 235     } else if (perm == MMX_SIMPLE_PERM) {
 236         for (i = 0; i < 64; i++)
 237             dst[idct_simple_mmx_perm[i]] = src[i];
 238     } else if (perm == SSE2_PERM) {
 239         for (i = 0; i < 64; i++)
 240             dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
 241     } else if (perm == PARTTRANS_PERM) {
 242         for (i = 0; i < 64; i++)
 243             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
 244     } else if (perm == TRANSPOSE_PERM) {
 245         for (i = 0; i < 64; i++)
 246             dst[(i>>3) | ((i<<3)&0x38)] = src[i];
 247     } else {
 248         for (i = 0; i < 64; i++)
 249             dst[i] = src[i];
 250     }
 251 }
 252
 253 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
 254 {
 255     void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
 256     int it, i, scale;
 257     int err_inf, v;
 258     int64_t err2, ti, ti1, it1, err_sum = 0;
 259     int64_t sysErr[64], sysErrMax = 0;
 260     int maxout = 0;
 261     int blockSumErrMax = 0, blockSumErr;
 262     AVLFG prng;
 263     const int vals=1<<bits;
 264     double omse, ome;
 265     int spec_err;
 266
 267     av_lfg_init(&prng, 1);
 268
 269     err_inf = 0;
 270     err2 = 0;
 271     for (i = 0; i < 64; i++)
 272         sysErr[i] = 0;
 273     for (it = 0; it < NB_ITS; it++) {
 274         init_block(block1, test, is_idct, &prng, vals);
 275         permute(block, block1, dct->format);
 276
 277         dct->func(block);
 278         emms_c();
 279
 280         if (dct->format == SCALE_PERM) {
 281             for (i = 0; i < 64; i++) {
 282                 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
 283                 block[i] = (block[i] * scale) >> AANSCALE_BITS;
 284             }
 285         }
 286
 287         ref(block1);
 288         if (!strcmp(dct->name, "PR-SSE2"))
 289             for (i = 0; i < 64; i++)
 290                 block1[i] = av_clip(block1[i], 4-512, 1019-512);
 291
 292         blockSumErr = 0;
 293         for (i = 0; i < 64; i++) {
 294             int err = block[i] - block1[i];
 295             err_sum += err;
 296             v = abs(err);
 297             if (v > err_inf)
 298                 err_inf = v;
 299             err2 += v * v;
 300             sysErr[i] += block[i] - block1[i];
 301             blockSumErr += v;
 302             if (abs(block[i]) > maxout)
 303                 maxout = abs(block[i]);
 304         }
 305         if (blockSumErrMax < blockSumErr)
 306             blockSumErrMax = blockSumErr;
 307     }
 308     for (i = 0; i < 64; i++)
 309         sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
 310
 311     for (i = 0; i < 64; i++) {
 312         if (i % 8 == 0)
 313             printf("\n");
 314         printf("%7d ", (int) sysErr[i]);
 315     }
 316     printf("\n");
 317
 318     omse = (double) err2 / NB_ITS / 64;
 319     ome  = (double) err_sum / NB_ITS / 64;
 320
 321     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
 322
 323     printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
 324            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
 325            omse, ome, (double) sysErrMax / NB_ITS,
 326            maxout, blockSumErrMax);
 327
 328     if (spec_err && !dct->nonspec)
 329         return 1;
 330
 331     if (!speed)
 332         return 0;
 333
 334     /* speed test */
 335
 336     init_block(block, test, is_idct, &prng, vals);
 337     permute(block1, block, dct->format);
 338
 339     ti = av_gettime_relative();
 340     it1 = 0;
 341     do {
 342         for (it = 0; it < NB_ITS_SPEED; it++) {
 343             memcpy(block, block1, sizeof(block));
 344             dct->func(block);
 345         }
 346         emms_c();
 347         it1 += NB_ITS_SPEED;
 348         ti1 = av_gettime_relative() - ti;
 349     } while (ti1 < 1000000);
 350
 351     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
 352            (double) it1 * 1000.0 / (double) ti1);
 353
 354     return 0;
 355 }
 356
 357 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
 358 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
 359
 360 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
 361 {
 362     static int init;
 363     static double c8[8][8];
 364     static double c4[4][4];
 365     double block1[64], block2[64], block3[64];
 366     double s, sum, v;
 367     int i, j, k;
 368
 369     if (!init) {
 370         init = 1;
 371
 372         for (i = 0; i < 8; i++) {
 373             sum = 0;
 374             for (j = 0; j < 8; j++) {
 375                 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
 376                 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
 377                 sum += c8[i][j] * c8[i][j];
 378             }
 379         }
 380
 381         for (i = 0; i < 4; i++) {
 382             sum = 0;
 383             for (j = 0; j < 4; j++) {
 384                 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
 385                 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
 386                 sum += c4[i][j] * c4[i][j];
 387             }
 388         }
 389     }
 390
 391     /* butterfly */
 392     s = 0.5 * sqrt(2.0);
 393     for (i = 0; i < 4; i++) {
 394         for (j = 0; j < 8; j++) {
 395             block1[8 * (2 * i) + j] =
 396                 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
 397             block1[8 * (2 * i + 1) + j] =
 398                 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
 399         }
 400     }
 401
 402     /* idct8 on lines */
 403     for (i = 0; i < 8; i++) {
 404         for (j = 0; j < 8; j++) {
 405             sum = 0;
 406             for (k = 0; k < 8; k++)
 407                 sum += c8[k][j] * block1[8 * i + k];
 408             block2[8 * i + j] = sum;
 409         }
 410     }
 411
 412     /* idct4 */
 413     for (i = 0; i < 8; i++) {
 414         for (j = 0; j < 4; j++) {
 415             /* top */
 416             sum = 0;
 417             for (k = 0; k < 4; k++)
 418                 sum += c4[k][j] * block2[8 * (2 * k) + i];
 419             block3[8 * (2 * j) + i] = sum;
 420
 421             /* bottom */
 422             sum = 0;
 423             for (k = 0; k < 4; k++)
 424                 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
 425             block3[8 * (2 * j + 1) + i] = sum;
 426         }
 427     }
 428
 429     /* clamp and store the result */
 430     for (i = 0; i < 8; i++) {
 431         for (j = 0; j < 8; j++) {
 432             v = block3[8 * i + j];
 433             if      (v < 0)   v = 0;
 434             else if (v > 255) v = 255;
 435             dest[i * linesize + j] = (int) rint(v);
 436         }
 437     }
 438 }
 439
 440 static void idct248_error(const char *name,
 441                           void (*idct248_put)(uint8_t *dest, int line_size,
 442                                               int16_t *block),
 443                           int speed)
 444 {
 445     int it, i, it1, ti, ti1, err_max, v;
 446     AVLFG prng;
 447
 448     av_lfg_init(&prng, 1);
 449
 450     /* just one test to see if code is correct (precision is less
 451        important here) */
 452     err_max = 0;
 453     for (it = 0; it < NB_ITS; it++) {
 454         /* XXX: use forward transform to generate values */
 455         for (i = 0; i < 64; i++)
 456             block1[i] = av_lfg_get(&prng) % 256 - 128;
 457         block1[0] += 1024;
 458
 459         for (i = 0; i < 64; i++)
 460             block[i] = block1[i];
 461         idct248_ref(img_dest1, 8, block);
 462
 463         for (i = 0; i < 64; i++)
 464             block[i] = block1[i];
 465         idct248_put(img_dest, 8, block);
 466
 467         for (i = 0; i < 64; i++) {
 468             v = abs((int) img_dest[i] - (int) img_dest1[i]);
 469             if (v == 255)
 470                 printf("%d %d\n", img_dest[i], img_dest1[i]);
 471             if (v > err_max)
 472                 err_max = v;
 473         }
 474 #if 0
 475         printf("ref=\n");
 476         for(i=0;i<8;i++) {
 477             int j;
 478             for(j=0;j<8;j++) {
 479                 printf(" %3d", img_dest1[i*8+j]);
 480             }
 481             printf("\n");
 482         }
 483
 484         printf("out=\n");
 485         for(i=0;i<8;i++) {
 486             int j;
 487             for(j=0;j<8;j++) {
 488                 printf(" %3d", img_dest[i*8+j]);
 489             }
 490             printf("\n");
 491         }
 492 #endif
 493     }
 494     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 495
 496     if (!speed)
 497         return;
 498
 499     ti = av_gettime_relative();
 500     it1 = 0;
 501     do {
 502         for (it = 0; it < NB_ITS_SPEED; it++) {
 503             for (i = 0; i < 64; i++)
 504                 block[i] = block1[i];
 505             idct248_put(img_dest, 8, block);
 506         }
 507         emms_c();
 508         it1 += NB_ITS_SPEED;
 509         ti1 = av_gettime_relative() - ti;
 510     } while (ti1 < 1000000);
 511
 512     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
 513            (double) it1 * 1000.0 / (double) ti1);
 514 }
 515
 516 static void help(void)
 517 {
 518     printf("dct-test [-i] [<test-number>] [<bits>]\n"
 519            "test-number 0 -> test with random matrixes\n"
 520            "            1 -> test with random sparse matrixes\n"
 521            "            2 -> do 3. test from mpeg4 std\n"
 522            "bits        Number of time domain bits to use, 8 is default\n"
 523            "-i          test IDCT implementations\n"
 524            "-4          test IDCT248 implementations\n"
 525            "-t          speed test\n");
 526 }
 527
 528 #if !HAVE_GETOPT
 529 #include "compat/getopt.c"
 530 #endif
 531
 532 int main(int argc, char **argv)
 533 {
 534     int test_idct = 0, test_248_dct = 0;
 535     int c, i;
 536     int test = 1;
 537     int speed = 0;
 538     int err = 0;
 539     int bits=8;
 540
 541     cpu_flags = av_get_cpu_flags();
 542
 543     ff_ref_dct_init();
 544     idct_mmx_init();
 545
 546     for (;;) {
 547         c = getopt(argc, argv, "ih4t");
 548         if (c == -1)
 549             break;
 550         switch (c) {
 551         case 'i':
 552             test_idct = 1;
 553             break;
 554         case '4':
 555             test_248_dct = 1;
 556             break;
 557         case 't':
 558             speed = 1;
 559             break;
 560         default:
 561         case 'h':
 562             help();
 563             return 0;
 564         }
 565     }
 566
 567     if (optind < argc)
 568         test = atoi(argv[optind]);
 569     if(optind+1 < argc) bits= atoi(argv[optind+1]);
 570
 571     printf("ffmpeg DCT/IDCT test\n");
 572
 573     if (test_248_dct) {
 574         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
 575     } else {
 576         const struct algo *algos = test_idct ? idct_tab : fdct_tab;
 577         for (i = 0; algos[i].name; i++)
 578             if (!(~cpu_flags & algos[i].mm_support)) {
 579                 err |= dct_error(&algos[i], test, test_idct, speed, bits);
 580             }
 581     }
 582
 583     if (err)
 584         printf("Error: %d.\n", err);
 585
 586     return !!err;
 587 }