git.sesse.net Git - ffmpeg/blob - libavcodec/dct-test.c

   1 /*
   2  * (c) 2001 Fabrice Bellard
   3  *     2007 Marc Hoffman <marc.hoffman@analog.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * DCT test (c) 2001 Fabrice Bellard
  25  * Started from sample code by Juan J. Sierralta P.
  26  */
  27
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30 #include <string.h>
  31 #include <sys/time.h>
  32 #include <unistd.h>
  33 #include <math.h>
  34
  35 #include "libavutil/cpu.h"
  36 #include "libavutil/common.h"
  37 #include "libavutil/lfg.h"
  38
  39 #include "simple_idct.h"
  40 #include "aandcttab.h"
  41 #include "faandct.h"
  42 #include "faanidct.h"
  43 #include "x86/idct_xvid.h"
  44 #include "dctref.h"
  45
  46 #undef printf
  47
     /*
      * Hand-written prototypes for transform routines. Apart from
      * odivx_idct_c(), every function declared here is referenced from
      * fdct_tab[] or idct_tab[] below, inside the matching
      * architecture/feature #if block.
      */
  48 void ff_mmx_idct(DCTELEM *data);
  49 void ff_mmxext_idct(DCTELEM *data);
  50
     /* NOTE(review): not referenced anywhere in the visible part of this file. */
  51 void odivx_idct_c(short *block);
  52
  53 // BFIN (entries guarded by ARCH_BFIN)
  54 void ff_bfin_idct(DCTELEM *block);
  55 void ff_bfin_fdct(DCTELEM *block);
  56
  57 // ALTIVEC (entry guarded by HAVE_ALTIVEC)
  58 void fdct_altivec(DCTELEM *block);
  59 //void idct_altivec(DCTELEM *block);?? no routine
  60
  61 // ARM (entries guarded by ARCH_ARM / HAVE_ARMV5TE / HAVE_ARMV6 / HAVE_NEON)
  62 void ff_j_rev_dct_arm(DCTELEM *data);
  63 void ff_simple_idct_arm(DCTELEM *data);
  64 void ff_simple_idct_armv5te(DCTELEM *data);
  65 void ff_simple_idct_armv6(DCTELEM *data);
  66 void ff_simple_idct_neon(DCTELEM *data);
  67
     /* Alpha (entry guarded by ARCH_ALPHA) */
  68 void ff_simple_idct_axp(DCTELEM *data);
  69
  70 struct algo {
  71     const char *name;
  72     void (*func)(DCTELEM *block);
  73     enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
  74                      SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
  75     int mm_support;
  76     int nonspec;
  77 };
  78
     /*
      * Format tag used for the "FAAN" entry of fdct_tab[]: SCALE_PERM
      * (dct_error() rescales the output with ff_aanscales[]) unless
      * FAAN_POSTSCALE is defined, in which case NO_PERM is used and no
      * rescaling is applied by this test.
      */
  79 #ifndef FAAN_POSTSCALE
  80 #define FAAN_SCALE SCALE_PERM
  81 #else
  82 #define FAAN_SCALE NO_PERM
  83 #endif
  84
     /*
      * CPU feature flags; set once in main() from av_get_cpu_flags() and
      * read by mmx_emms() and by the table filtering loop in main().
      */
  85 static int cpu_flags;
  86
  87 static const struct algo fdct_tab[] = {
  88     { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
  89     { "FAAN",           ff_faandct,            FAAN_SCALE },
  90     { "IJG-AAN-INT",    fdct_ifast,            SCALE_PERM },
  91     { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },
  92
  93 #if HAVE_MMX
  94     { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
  95     { "MMX2",           ff_fdct_mmx2,          NO_PERM,   AV_CPU_FLAG_MMX2    },
  96     { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
  97 #endif
  98
  99 #if HAVE_ALTIVEC
 100     { "altivecfdct",    fdct_altivec,          NO_PERM,   AV_CPU_FLAG_ALTIVEC },
 101 #endif
 102
 103 #if ARCH_BFIN
 104     { "BFINfdct",       ff_bfin_fdct,          NO_PERM  },
 105 #endif
 106
 107     { 0 }
 108 };
 109
 110 #if HAVE_MMX
 111 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
 112                                 DCTELEM *block, int16_t *qmat);
 113
 114 static void ff_prores_idct_put_10_sse2_wrap(uint16_t *dst){
 115     int16_t qmat[64]; int i;
 116     int16_t tmp[64];
 117
 118     for(i=0; i<64; i++){
 119         qmat[i]=4;
 120         tmp[i]= dst[i];
 121     }
 122     ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
 123 }
 124 #endif
 125
 126 static const struct algo idct_tab[] = {
 127     { "FAANI",          ff_faanidct,           NO_PERM  },
 128     { "REF-DBL",        ff_ref_idct,           NO_PERM  },
 129     { "INT",            j_rev_dct,             MMX_PERM },
 130     { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },
 131
 132 #if HAVE_MMX
 133 #if CONFIG_GPL
 134     { "LIBMPEG2-MMX",   ff_mmx_idct,           MMX_PERM,  AV_CPU_FLAG_MMX,  1 },
 135     { "LIBMPEG2-MMX2",  ff_mmxext_idct,        MMX_PERM,  AV_CPU_FLAG_MMX2, 1 },
 136 #endif
 137     { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
 138     { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
 139     { "XVID-MMX2",      ff_idct_xvid_mmx2,     NO_PERM,   AV_CPU_FLAG_MMX2, 1 },
 140     { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
 141 #if ARCH_X86_64
 142     { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap,     TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
 143 #endif
 144 #endif
 145
 146 #if ARCH_BFIN
 147     { "BFINidct",       ff_bfin_idct,          NO_PERM  },
 148 #endif
 149
 150 #if ARCH_ARM
 151     { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
 152     { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
 153 #endif
 154 #if HAVE_ARMV5TE
 155     { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM  },
 156 #endif
 157 #if HAVE_ARMV6
 158     { "SIMPLE-ARMV6",   ff_simple_idct_armv6,  MMX_PERM },
 159 #endif
 160 #if HAVE_NEON
 161     { "SIMPLE-NEON",    ff_simple_idct_neon,   PARTTRANS_PERM },
 162 #endif
 163
 164 #if ARCH_ALPHA
 165     { "SIMPLE-ALPHA",   ff_simple_idct_axp,    NO_PERM },
 166 #endif
 167
 168     { 0 }
 169 };
 170
     /*
      * Fixed-point shift used by dct_error() when it rescales the output of
      * SCALE_PERM transforms with ff_aanscales[].
      */
 171 #define AANSCALE_BITS 12
 172
 173 static int64_t gettime(void)
 174 {
 175     struct timeval tv;
 176     gettimeofday(&tv, NULL);
 177     return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
 178 }
 179
     /* Number of test blocks generated per accuracy run. */
 180 #define NB_ITS 20000
     /* Number of transform calls between two clock reads in the speed test. */
 181 #define NB_ITS_SPEED 50000
 182
     /*
      * Destination index for each source coefficient under MMX_PERM;
      * filled in by idct_mmx_init() and used by permute() as
      * dst[idct_mmx_perm[i]] = src[i].
      */
 183 static short idct_mmx_perm[64];
 184
     /*
      * Destination index for each source coefficient under MMX_SIMPLE_PERM;
      * used by permute() as dst[idct_simple_mmx_perm[i]] = src[i].
      */
 185 static short idct_simple_mmx_perm[64] = {
 186     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 187     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 188     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 189     0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 190     0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 191     0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 192     0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 193     0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 194 };
 195
     /*
      * Column reordering applied within every row under SSE2_PERM: permute()
      * keeps the row bits (i & 0x38) and replaces the column (i & 7) by this
      * table's entry.
      */
 196 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
 197
 198 static void idct_mmx_init(void)
 199 {
 200     int i;
 201
 202     /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
 203     for (i = 0; i < 64; i++) {
 204         idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 205     }
 206 }
 207
     /*
      * Work buffers shared by dct_error() and idct248_error().
      * In the accuracy loops, block is the buffer handed to the transform
      * under test, while block1 holds the generated input (and, in
      * dct_error(), afterwards the reference transform's output).
      */
 208 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
 209 DECLARE_ALIGNED(8,  static DCTELEM, block1)[64];
 210
 211 static inline void mmx_emms(void)
 212 {
 213 #if HAVE_MMX
 214     if (cpu_flags & AV_CPU_FLAG_MMX)
 215         __asm__ volatile ("emms\n\t");
 216 #endif
 217 }
 218
 219 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
 220 {
 221     int i, j;
 222
 223     memset(block, 0, 64 * sizeof(*block));
 224
 225     switch (test) {
 226     case 0:
 227         for (i = 0; i < 64; i++)
 228             block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
 229         if (is_idct) {
 230             ff_ref_fdct(block);
 231             for (i = 0; i < 64; i++)
 232                 block[i] >>= 3;
 233         }
 234         break;
 235     case 1:
 236         j = av_lfg_get(prng) % 10 + 1;
 237         for (i = 0; i < j; i++)
 238             block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
 239         break;
 240     case 2:
 241         block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
 242         block[63] = (block[0] & 1) ^ 1;
 243         break;
 244     }
 245 }
 246
 247 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
 248 {
 249     int i;
 250
 251     if (perm == MMX_PERM) {
 252         for (i = 0; i < 64; i++)
 253             dst[idct_mmx_perm[i]] = src[i];
 254     } else if (perm == MMX_SIMPLE_PERM) {
 255         for (i = 0; i < 64; i++)
 256             dst[idct_simple_mmx_perm[i]] = src[i];
 257     } else if (perm == SSE2_PERM) {
 258         for (i = 0; i < 64; i++)
 259             dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
 260     } else if (perm == PARTTRANS_PERM) {
 261         for (i = 0; i < 64; i++)
 262             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
 263     } else if (perm == TRANSPOSE_PERM) {
 264         for (i = 0; i < 64; i++)
 265             dst[(i>>3) | ((i<<3)&0x38)] = src[i];
 266     } else {
 267         for (i = 0; i < 64; i++)
 268             dst[i] = src[i];
 269     }
 270 }
 271
 272 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
 273 {
 274     void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
 275     int it, i, scale;
 276     int err_inf, v;
 277     int64_t err2, ti, ti1, it1, err_sum = 0;
 278     int64_t sysErr[64], sysErrMax = 0;
 279     int maxout = 0;
 280     int blockSumErrMax = 0, blockSumErr;
 281     AVLFG prng;
 282     const int vals=1<<bits;
 283     double omse, ome;
 284     int spec_err;
 285
 286     av_lfg_init(&prng, 1);
 287
 288     err_inf = 0;
 289     err2 = 0;
 290     for (i = 0; i < 64; i++)
 291         sysErr[i] = 0;
 292     for (it = 0; it < NB_ITS; it++) {
 293         init_block(block1, test, is_idct, &prng, vals);
 294         permute(block, block1, dct->format);
 295
 296         dct->func(block);
 297         mmx_emms();
 298
 299         if (dct->format == SCALE_PERM) {
 300             for (i = 0; i < 64; i++) {
 301                 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
 302                 block[i] = (block[i] * scale) >> AANSCALE_BITS;
 303             }
 304         }
 305
 306         ref(block1);
 307
 308         blockSumErr = 0;
 309         for (i = 0; i < 64; i++) {
 310             int err = block[i] - block1[i];
 311             err_sum += err;
 312             v = abs(err);
 313             if (v > err_inf)
 314                 err_inf = v;
 315             err2 += v * v;
 316             sysErr[i] += block[i] - block1[i];
 317             blockSumErr += v;
 318             if (abs(block[i]) > maxout)
 319                 maxout = abs(block[i]);
 320         }
 321         if (blockSumErrMax < blockSumErr)
 322             blockSumErrMax = blockSumErr;
 323     }
 324     for (i = 0; i < 64; i++)
 325         sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
 326
 327     for (i = 0; i < 64; i++) {
 328         if (i % 8 == 0)
 329             printf("\n");
 330         printf("%7d ", (int) sysErr[i]);
 331     }
 332     printf("\n");
 333
 334     omse = (double) err2 / NB_ITS / 64;
 335     ome  = (double) err_sum / NB_ITS / 64;
 336
 337     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
 338
 339     printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
 340            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
 341            omse, ome, (double) sysErrMax / NB_ITS,
 342            maxout, blockSumErrMax);
 343
 344     if (spec_err && !dct->nonspec)
 345         return 1;
 346
 347     if (!speed)
 348         return 0;
 349
 350     /* speed test */
 351
 352     init_block(block, test, is_idct, &prng, vals);
 353     permute(block1, block, dct->format);
 354
 355     ti = gettime();
 356     it1 = 0;
 357     do {
 358         for (it = 0; it < NB_ITS_SPEED; it++) {
 359             memcpy(block, block1, sizeof(block));
 360             dct->func(block);
 361         }
 362         it1 += NB_ITS_SPEED;
 363         ti1 = gettime() - ti;
 364     } while (ti1 < 1000000);
 365     mmx_emms();
 366
 367     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
 368            (double) it1 * 1000.0 / (double) ti1);
 369
 370     return 0;
 371 }
 372
     /*
      * 8x8 pixel outputs used by idct248_error(): img_dest receives the
      * result of the function under test, img_dest1 the result of
      * idct248_ref().
      */
 373 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
 374 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
 375
 376 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
 377 {
 378     static int init;
 379     static double c8[8][8];
 380     static double c4[4][4];
 381     double block1[64], block2[64], block3[64];
 382     double s, sum, v;
 383     int i, j, k;
 384
 385     if (!init) {
 386         init = 1;
 387
 388         for (i = 0; i < 8; i++) {
 389             sum = 0;
 390             for (j = 0; j < 8; j++) {
 391                 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
 392                 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
 393                 sum += c8[i][j] * c8[i][j];
 394             }
 395         }
 396
 397         for (i = 0; i < 4; i++) {
 398             sum = 0;
 399             for (j = 0; j < 4; j++) {
 400                 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
 401                 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
 402                 sum += c4[i][j] * c4[i][j];
 403             }
 404         }
 405     }
 406
 407     /* butterfly */
 408     s = 0.5 * sqrt(2.0);
 409     for (i = 0; i < 4; i++) {
 410         for (j = 0; j < 8; j++) {
 411             block1[8 * (2 * i) + j] =
 412                 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
 413             block1[8 * (2 * i + 1) + j] =
 414                 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
 415         }
 416     }
 417
 418     /* idct8 on lines */
 419     for (i = 0; i < 8; i++) {
 420         for (j = 0; j < 8; j++) {
 421             sum = 0;
 422             for (k = 0; k < 8; k++)
 423                 sum += c8[k][j] * block1[8 * i + k];
 424             block2[8 * i + j] = sum;
 425         }
 426     }
 427
 428     /* idct4 */
 429     for (i = 0; i < 8; i++) {
 430         for (j = 0; j < 4; j++) {
 431             /* top */
 432             sum = 0;
 433             for (k = 0; k < 4; k++)
 434                 sum += c4[k][j] * block2[8 * (2 * k) + i];
 435             block3[8 * (2 * j) + i] = sum;
 436
 437             /* bottom */
 438             sum = 0;
 439             for (k = 0; k < 4; k++)
 440                 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
 441             block3[8 * (2 * j + 1) + i] = sum;
 442         }
 443     }
 444
 445     /* clamp and store the result */
 446     for (i = 0; i < 8; i++) {
 447         for (j = 0; j < 8; j++) {
 448             v = block3[8 * i + j];
 449             if      (v < 0)   v = 0;
 450             else if (v > 255) v = 255;
 451             dest[i * linesize + j] = (int) rint(v);
 452         }
 453     }
 454 }
 455
 456 static void idct248_error(const char *name,
 457                           void (*idct248_put)(uint8_t *dest, int line_size,
 458                                               int16_t *block),
 459                           int speed)
 460 {
 461     int it, i, it1, ti, ti1, err_max, v;
 462     AVLFG prng;
 463
 464     av_lfg_init(&prng, 1);
 465
 466     /* just one test to see if code is correct (precision is less
 467        important here) */
 468     err_max = 0;
 469     for (it = 0; it < NB_ITS; it++) {
 470         /* XXX: use forward transform to generate values */
 471         for (i = 0; i < 64; i++)
 472             block1[i] = av_lfg_get(&prng) % 256 - 128;
 473         block1[0] += 1024;
 474
 475         for (i = 0; i < 64; i++)
 476             block[i] = block1[i];
 477         idct248_ref(img_dest1, 8, block);
 478
 479         for (i = 0; i < 64; i++)
 480             block[i] = block1[i];
 481         idct248_put(img_dest, 8, block);
 482
 483         for (i = 0; i < 64; i++) {
 484             v = abs((int) img_dest[i] - (int) img_dest1[i]);
 485             if (v == 255)
 486                 printf("%d %d\n", img_dest[i], img_dest1[i]);
 487             if (v > err_max)
 488                 err_max = v;
 489         }
 490 #if 0
 491         printf("ref=\n");
 492         for(i=0;i<8;i++) {
 493             int j;
 494             for(j=0;j<8;j++) {
 495                 printf(" %3d", img_dest1[i*8+j]);
 496             }
 497             printf("\n");
 498         }
 499
 500         printf("out=\n");
 501         for(i=0;i<8;i++) {
 502             int j;
 503             for(j=0;j<8;j++) {
 504                 printf(" %3d", img_dest[i*8+j]);
 505             }
 506             printf("\n");
 507         }
 508 #endif
 509     }
 510     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 511
 512     if (!speed)
 513         return;
 514
 515     ti = gettime();
 516     it1 = 0;
 517     do {
 518         for (it = 0; it < NB_ITS_SPEED; it++) {
 519             for (i = 0; i < 64; i++)
 520                 block[i] = block1[i];
 521             idct248_put(img_dest, 8, block);
 522         }
 523         it1 += NB_ITS_SPEED;
 524         ti1 = gettime() - ti;
 525     } while (ti1 < 1000000);
 526     mmx_emms();
 527
 528     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
 529            (double) it1 * 1000.0 / (double) ti1);
 530 }
 531
 532 static void help(void)
 533 {
 534     printf("dct-test [-i] [<test-number>] [<bits>]\n"
 535            "test-number 0 -> test with random matrixes\n"
 536            "            1 -> test with random sparse matrixes\n"
 537            "            2 -> do 3. test from mpeg4 std\n"
 538            "bits        Number of time domain bits to use, 8 is default\n"
 539            "-i          test IDCT implementations\n"
 540            "-4          test IDCT248 implementations\n"
 541            "-t          speed test\n");
 542 }
 543
 544 int main(int argc, char **argv)
 545 {
 546     int test_idct = 0, test_248_dct = 0;
 547     int c, i;
 548     int test = 1;
 549     int speed = 0;
 550     int err = 0;
 551     int bits=8;
 552
 553     cpu_flags = av_get_cpu_flags();
 554
 555     ff_ref_dct_init();
 556     idct_mmx_init();
 557
 558     for (;;) {
 559         c = getopt(argc, argv, "ih4t");
 560         if (c == -1)
 561             break;
 562         switch (c) {
 563         case 'i':
 564             test_idct = 1;
 565             break;
 566         case '4':
 567             test_248_dct = 1;
 568             break;
 569         case 't':
 570             speed = 1;
 571             break;
 572         default:
 573         case 'h':
 574             help();
 575             return 0;
 576         }
 577     }
 578
 579     if (optind < argc)
 580         test = atoi(argv[optind]);
 581     if(optind+1 < argc) bits= atoi(argv[optind+1]);
 582
 583     printf("ffmpeg DCT/IDCT test\n");
 584
 585     if (test_248_dct) {
 586         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
 587     } else {
 588         const struct algo *algos = test_idct ? idct_tab : fdct_tab;
 589         for (i = 0; algos[i].name; i++)
 590             if (!(~cpu_flags & algos[i].mm_support)) {
 591                 err |= dct_error(&algos[i], test, test_idct, speed, bits);
 592             }
 593     }
 594
 595     return err;
 596 }