git.sesse.net Git - ffmpeg/blob - libavcodec/dct-test.c

   1 /*
   2  * (c) 2001 Fabrice Bellard
   3  *     2007 Marc Hoffman <marc.hoffman@analog.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * DCT test (c) 2001 Fabrice Bellard
  25  * Started from sample code by Juan J. Sierralta P.
  26  */
  27
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30 #include <string.h>
  31 #include <sys/time.h>
  32 #include <unistd.h>
  33 #include <math.h>
  34
  35 #include "libavutil/cpu.h"
  36 #include "libavutil/common.h"
  37 #include "libavutil/lfg.h"
  38
  39 #include "simple_idct.h"
  40 #include "aandcttab.h"
  41 #include "faandct.h"
  42 #include "faanidct.h"
  43 #include "x86/idct_xvid.h"
  44 #include "dctref.h"
  45
/* Remove any macro definition of printf so that the calls in this test
 * program reach the C library function.
 * NOTE(review): the macro being undone is not visible in this file; it
 * presumably comes from one of the project headers included above. */
#undef printf

/* Prototypes of architecture-specific transforms that the tables below refer
 * to. They are declared here by hand; each one transforms a 64-coefficient
 * block in place. */

// x86 MMX / MMXEXT (entries "LIBMPEG2-MMX" and "LIBMPEG2-MMX2")
void ff_mmx_idct(DCTELEM *data);
void ff_mmxext_idct(DCTELEM *data);

// BFIN
void ff_bfin_idct(DCTELEM *block);
void ff_bfin_fdct(DCTELEM *block);

// ALTIVEC
void ff_fdct_altivec(DCTELEM *block);
//void ff_idct_altivec(DCTELEM *block);?? no routine

// ARM
void ff_j_rev_dct_arm(DCTELEM *data);
void ff_simple_idct_arm(DCTELEM *data);
void ff_simple_idct_armv5te(DCTELEM *data);
void ff_simple_idct_armv6(DCTELEM *data);
void ff_simple_idct_neon(DCTELEM *data);

// Alpha (entry "SIMPLE-ALPHA")
void ff_simple_idct_axp(DCTELEM *data);
  67
  68 struct algo {
  69     const char *name;
  70     void (*func)(DCTELEM *block);
  71     enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
  72                      SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
  73     int mm_support;
  74     int nonspec;
  75 };
  76
/* Layout tag used for the "FAAN" forward DCT entry: SCALE_PERM (output is
 * rescaled in dct_error()) unless FAAN_POSTSCALE is defined, in which case
 * the output is compared as-is (NO_PERM). */
#ifndef FAAN_POSTSCALE
#define FAAN_SCALE SCALE_PERM
#else
#define FAAN_SCALE NO_PERM
#endif

/* CPU feature flags, filled in by main() from av_get_cpu_flags(); used to
 * skip table entries the host cannot run and to guard the emms instruction. */
static int cpu_flags;
  84
/* Forward DCT implementations to test, in the order they are run.
 * Columns: name, function, input/output layout, required CPU flags.
 * Omitted trailing fields are zero, i.e. no CPU requirement and the
 * accuracy limits apply. The all-zero entry terminates the table. */
static const struct algo fdct_tab[] = {
    { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
    { "FAAN",           ff_faandct,            FAAN_SCALE },
    { "IJG-AAN-INT",    ff_fdct_ifast,         SCALE_PERM },
    { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },

#if HAVE_MMX
    { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
    { "MMX2",           ff_fdct_mmx2,          NO_PERM,   AV_CPU_FLAG_MMX2    },
    { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
#endif

#if HAVE_ALTIVEC
    { "altivecfdct",    ff_fdct_altivec,       NO_PERM,   AV_CPU_FLAG_ALTIVEC },
#endif

#if ARCH_BFIN
    { "BFINfdct",       ff_bfin_fdct,          NO_PERM  },
#endif

    { 0 }
};
 107
 108 #if HAVE_MMX
 109 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
 110                                 DCTELEM *block, int16_t *qmat);
 111
 112 static void ff_prores_idct_put_10_sse2_wrap(uint16_t *dst){
 113     int16_t qmat[64]; int i;
 114     int16_t tmp[64];
 115
 116     for(i=0; i<64; i++){
 117         qmat[i]=4;
 118         tmp[i]= dst[i];
 119     }
 120     ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
 121 }
 122 #endif
 123
/* Inverse DCT implementations to test, in the order they are run.
 * Columns: name, function, input layout, required CPU flags, nonspec.
 * Entries with nonspec = 1 are still measured and reported, but exceeding
 * the accuracy limits in dct_error() does not make the program fail.
 * The all-zero entry terminates the table. */
static const struct algo idct_tab[] = {
    { "FAANI",          ff_faanidct,           NO_PERM  },
    { "REF-DBL",        ff_ref_idct,           NO_PERM  },
    { "INT",            ff_j_rev_dct,          MMX_PERM },
    { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },

#if HAVE_MMX
#if CONFIG_GPL
    { "LIBMPEG2-MMX",   ff_mmx_idct,           MMX_PERM,  AV_CPU_FLAG_MMX,  1 },
    { "LIBMPEG2-MMX2",  ff_mmxext_idct,        MMX_PERM,  AV_CPU_FLAG_MMX2, 1 },
#endif
    { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
    { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
    { "XVID-MMX2",      ff_idct_xvid_mmx2,     NO_PERM,   AV_CPU_FLAG_MMX2, 1 },
    { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
#if ARCH_X86_64
    { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap,     TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
#endif
#endif

#if ARCH_BFIN
    { "BFINidct",       ff_bfin_idct,          NO_PERM  },
#endif

#if ARCH_ARM
    { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
    { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
#endif
#if HAVE_ARMV5TE
    { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM  },
#endif
#if HAVE_ARMV6
    { "SIMPLE-ARMV6",   ff_simple_idct_armv6,  MMX_PERM },
#endif
#if HAVE_NEON
    { "SIMPLE-NEON",    ff_simple_idct_neon,   PARTTRANS_PERM },
#endif

#if ARCH_ALPHA
    { "SIMPLE-ALPHA",   ff_simple_idct_axp,    NO_PERM },
#endif

    { 0 }
};
 168
/* Number of fractional bits in the fixed-point factor dct_error() uses to
 * rescale the output of SCALE_PERM transforms. */
#define AANSCALE_BITS 12
 170
 171 static int64_t gettime(void)
 172 {
 173     struct timeval tv;
 174     gettimeofday(&tv, NULL);
 175     return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
 176 }
 177
/* Number of random blocks evaluated by each accuracy test. */
#define NB_ITS 20000
/* Number of transform calls between two clock reads in the speed tests. */
#define NB_ITS_SPEED 50000

/* Destination index for each source coefficient under MMX_PERM;
 * computed at start-up by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Destination index for each source coefficient under MMX_SIMPLE_PERM. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Destination column for each source column under SSE2_PERM; the row
 * (bits 3-5 of the index) is kept unchanged by permute(). */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
 195
 196 static void idct_mmx_init(void)
 197 {
 198     int i;
 199
 200     /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
 201     for (i = 0; i < 64; i++) {
 202         idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 203     }
 204 }
 205
/* Working buffers shared by the tests: `block` (16-byte aligned) is the one
 * handed to the transform under test; `block1` (8-byte aligned) holds the
 * generated input and, in dct_error(), the reference result. */
DECLARE_ALIGNED(16, static DCTELEM, block)[64];
DECLARE_ALIGNED(8,  static DCTELEM, block1)[64];
 208
 209 static inline void mmx_emms(void)
 210 {
 211 #if HAVE_MMX
 212     if (cpu_flags & AV_CPU_FLAG_MMX)
 213         __asm__ volatile ("emms\n\t");
 214 #endif
 215 }
 216
 217 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
 218 {
 219     int i, j;
 220
 221     memset(block, 0, 64 * sizeof(*block));
 222
 223     switch (test) {
 224     case 0:
 225         for (i = 0; i < 64; i++)
 226             block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
 227         if (is_idct) {
 228             ff_ref_fdct(block);
 229             for (i = 0; i < 64; i++)
 230                 block[i] >>= 3;
 231         }
 232         break;
 233     case 1:
 234         j = av_lfg_get(prng) % 10 + 1;
 235         for (i = 0; i < j; i++)
 236             block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
 237         break;
 238     case 2:
 239         block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
 240         block[63] = (block[0] & 1) ^ 1;
 241         break;
 242     }
 243 }
 244
 245 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
 246 {
 247     int i;
 248
 249     if (perm == MMX_PERM) {
 250         for (i = 0; i < 64; i++)
 251             dst[idct_mmx_perm[i]] = src[i];
 252     } else if (perm == MMX_SIMPLE_PERM) {
 253         for (i = 0; i < 64; i++)
 254             dst[idct_simple_mmx_perm[i]] = src[i];
 255     } else if (perm == SSE2_PERM) {
 256         for (i = 0; i < 64; i++)
 257             dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
 258     } else if (perm == PARTTRANS_PERM) {
 259         for (i = 0; i < 64; i++)
 260             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
 261     } else if (perm == TRANSPOSE_PERM) {
 262         for (i = 0; i < 64; i++)
 263             dst[(i>>3) | ((i<<3)&0x38)] = src[i];
 264     } else {
 265         for (i = 0; i < 64; i++)
 266             dst[i] = src[i];
 267     }
 268 }
 269
/**
 * Measure the accuracy, and optionally the speed, of one transform against
 * the double-precision reference (ff_ref_idct or ff_ref_fdct).
 *
 * NB_ITS blocks are generated with init_block() from a generator seeded
 * with 1, run through both the implementation and the reference, and the
 * differences are accumulated. A per-coefficient error table and a summary
 * line are printed.
 *
 * @param dct     table entry describing the implementation
 * @param test    data pattern passed on to init_block()
 * @param is_idct non-zero for an inverse transform
 * @param speed   non-zero to also run and report the timing loop
 * @param bits    sample magnitude is 1 << bits
 * @return 1 if an IDCT exceeds the accuracy limits and the entry is not
 *         marked nonspec, otherwise 0
 */
static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
{
    void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
    int it, i, scale;
    int err_inf, v;
    int64_t err2, ti, ti1, it1, err_sum = 0;
    int64_t sysErr[64], sysErrMax = 0;
    int maxout = 0;
    int blockSumErrMax = 0, blockSumErr;
    AVLFG prng;
    const int vals=1<<bits;
    double omse, ome;
    int spec_err;

    /* fixed seed: every implementation sees the same sequence of blocks */
    av_lfg_init(&prng, 1);

    err_inf = 0;
    err2 = 0;
    for (i = 0; i < 64; i++)
        sysErr[i] = 0;
    for (it = 0; it < NB_ITS; it++) {
        /* block1 keeps the natural-order input for the reference;
         * block receives it in the layout the implementation wants */
        init_block(block1, test, is_idct, &prng, vals);
        permute(block, block1, dct->format);

        dct->func(block);
        mmx_emms();

        /* SCALE_PERM transforms leave their output scaled per coefficient;
         * undo that with a fixed-point factor derived from ff_aanscales */
        if (dct->format == SCALE_PERM) {
            for (i = 0; i < 64; i++) {
                scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
                block[i] = (block[i] * scale) >> AANSCALE_BITS;
            }
        }

        ref(block1);

        blockSumErr = 0;
        for (i = 0; i < 64; i++) {
            int err = block[i] - block1[i];
            err_sum += err;
            v = abs(err);
            /* largest absolute error seen on any coefficient */
            if (v > err_inf)
                err_inf = v;
            /* NOTE(review): v * v is evaluated in int before being widened;
             * fine for the small errors expected here, but it could
             * overflow for very large differences. */
            err2 += v * v;
            /* signed error per coefficient position (systematic bias) */
            sysErr[i] += block[i] - block1[i];
            blockSumErr += v;
            if (abs(block[i]) > maxout)
                maxout = abs(block[i]);
        }
        if (blockSumErrMax < blockSumErr)
            blockSumErrMax = blockSumErr;
    }
    for (i = 0; i < 64; i++)
        sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));

    /* 8x8 table of the accumulated signed error per coefficient */
    for (i = 0; i < 64; i++) {
        if (i % 8 == 0)
            printf("\n");
        printf("%7d ", (int) sysErr[i]);
    }
    printf("\n");

    /* overall mean square error and overall mean error per coefficient */
    omse = (double) err2 / NB_ITS / 64;
    ome  = (double) err_sum / NB_ITS / 64;

    /* accuracy limits are only enforced for inverse transforms */
    spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);

    printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
           is_idct ? "IDCT" : "DCT", dct->name, err_inf,
           omse, ome, (double) sysErrMax / NB_ITS,
           maxout, blockSumErrMax);

    if (spec_err && !dct->nonspec)
        return 1;

    if (!speed)
        return 0;

    /* speed test */

    /* one fixed input block, permuted once and copied back before each call */
    init_block(block, test, is_idct, &prng, vals);
    permute(block1, block, dct->format);

    ti = gettime();
    it1 = 0;
    /* run batches of NB_ITS_SPEED calls until at least one second elapsed */
    do {
        for (it = 0; it < NB_ITS_SPEED; it++) {
            memcpy(block, block1, sizeof(block));
            dct->func(block);
        }
        it1 += NB_ITS_SPEED;
        ti1 = gettime() - ti;
    } while (ti1 < 1000000);
    mmx_emms();

    /* calls per microsecond * 1000 = thousands of calls per second */
    printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
           (double) it1 * 1000.0 / (double) ti1);

    return 0;
}
 370
/* 8x8 pixel outputs compared by idct248_error(): img_dest receives the
 * result of the implementation under test, img_dest1 that of idct248_ref(). */
DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
 373
 374 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
 375 {
 376     static int init;
 377     static double c8[8][8];
 378     static double c4[4][4];
 379     double block1[64], block2[64], block3[64];
 380     double s, sum, v;
 381     int i, j, k;
 382
 383     if (!init) {
 384         init = 1;
 385
 386         for (i = 0; i < 8; i++) {
 387             sum = 0;
 388             for (j = 0; j < 8; j++) {
 389                 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
 390                 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
 391                 sum += c8[i][j] * c8[i][j];
 392             }
 393         }
 394
 395         for (i = 0; i < 4; i++) {
 396             sum = 0;
 397             for (j = 0; j < 4; j++) {
 398                 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
 399                 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
 400                 sum += c4[i][j] * c4[i][j];
 401             }
 402         }
 403     }
 404
 405     /* butterfly */
 406     s = 0.5 * sqrt(2.0);
 407     for (i = 0; i < 4; i++) {
 408         for (j = 0; j < 8; j++) {
 409             block1[8 * (2 * i) + j] =
 410                 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
 411             block1[8 * (2 * i + 1) + j] =
 412                 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
 413         }
 414     }
 415
 416     /* idct8 on lines */
 417     for (i = 0; i < 8; i++) {
 418         for (j = 0; j < 8; j++) {
 419             sum = 0;
 420             for (k = 0; k < 8; k++)
 421                 sum += c8[k][j] * block1[8 * i + k];
 422             block2[8 * i + j] = sum;
 423         }
 424     }
 425
 426     /* idct4 */
 427     for (i = 0; i < 8; i++) {
 428         for (j = 0; j < 4; j++) {
 429             /* top */
 430             sum = 0;
 431             for (k = 0; k < 4; k++)
 432                 sum += c4[k][j] * block2[8 * (2 * k) + i];
 433             block3[8 * (2 * j) + i] = sum;
 434
 435             /* bottom */
 436             sum = 0;
 437             for (k = 0; k < 4; k++)
 438                 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
 439             block3[8 * (2 * j + 1) + i] = sum;
 440         }
 441     }
 442
 443     /* clamp and store the result */
 444     for (i = 0; i < 8; i++) {
 445         for (j = 0; j < 8; j++) {
 446             v = block3[8 * i + j];
 447             if      (v < 0)   v = 0;
 448             else if (v > 255) v = 255;
 449             dest[i * linesize + j] = (int) rint(v);
 450         }
 451     }
 452 }
 453
 454 static void idct248_error(const char *name,
 455                           void (*idct248_put)(uint8_t *dest, int line_size,
 456                                               int16_t *block),
 457                           int speed)
 458 {
 459     int it, i, it1, ti, ti1, err_max, v;
 460     AVLFG prng;
 461
 462     av_lfg_init(&prng, 1);
 463
 464     /* just one test to see if code is correct (precision is less
 465        important here) */
 466     err_max = 0;
 467     for (it = 0; it < NB_ITS; it++) {
 468         /* XXX: use forward transform to generate values */
 469         for (i = 0; i < 64; i++)
 470             block1[i] = av_lfg_get(&prng) % 256 - 128;
 471         block1[0] += 1024;
 472
 473         for (i = 0; i < 64; i++)
 474             block[i] = block1[i];
 475         idct248_ref(img_dest1, 8, block);
 476
 477         for (i = 0; i < 64; i++)
 478             block[i] = block1[i];
 479         idct248_put(img_dest, 8, block);
 480
 481         for (i = 0; i < 64; i++) {
 482             v = abs((int) img_dest[i] - (int) img_dest1[i]);
 483             if (v == 255)
 484                 printf("%d %d\n", img_dest[i], img_dest1[i]);
 485             if (v > err_max)
 486                 err_max = v;
 487         }
 488 #if 0
 489         printf("ref=\n");
 490         for(i=0;i<8;i++) {
 491             int j;
 492             for(j=0;j<8;j++) {
 493                 printf(" %3d", img_dest1[i*8+j]);
 494             }
 495             printf("\n");
 496         }
 497
 498         printf("out=\n");
 499         for(i=0;i<8;i++) {
 500             int j;
 501             for(j=0;j<8;j++) {
 502                 printf(" %3d", img_dest[i*8+j]);
 503             }
 504             printf("\n");
 505         }
 506 #endif
 507     }
 508     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 509
 510     if (!speed)
 511         return;
 512
 513     ti = gettime();
 514     it1 = 0;
 515     do {
 516         for (it = 0; it < NB_ITS_SPEED; it++) {
 517             for (i = 0; i < 64; i++)
 518                 block[i] = block1[i];
 519             idct248_put(img_dest, 8, block);
 520         }
 521         it1 += NB_ITS_SPEED;
 522         ti1 = gettime() - ti;
 523     } while (ti1 < 1000000);
 524     mmx_emms();
 525
 526     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
 527            (double) it1 * 1000.0 / (double) ti1);
 528 }
 529
 530 static void help(void)
 531 {
 532     printf("dct-test [-i] [<test-number>] [<bits>]\n"
 533            "test-number 0 -> test with random matrixes\n"
 534            "            1 -> test with random sparse matrixes\n"
 535            "            2 -> do 3. test from mpeg4 std\n"
 536            "bits        Number of time domain bits to use, 8 is default\n"
 537            "-i          test IDCT implementations\n"
 538            "-4          test IDCT248 implementations\n"
 539            "-t          speed test\n");
 540 }
 541
 542 int main(int argc, char **argv)
 543 {
 544     int test_idct = 0, test_248_dct = 0;
 545     int c, i;
 546     int test = 1;
 547     int speed = 0;
 548     int err = 0;
 549     int bits=8;
 550
 551     cpu_flags = av_get_cpu_flags();
 552
 553     ff_ref_dct_init();
 554     idct_mmx_init();
 555
 556     for (;;) {
 557         c = getopt(argc, argv, "ih4t");
 558         if (c == -1)
 559             break;
 560         switch (c) {
 561         case 'i':
 562             test_idct = 1;
 563             break;
 564         case '4':
 565             test_248_dct = 1;
 566             break;
 567         case 't':
 568             speed = 1;
 569             break;
 570         default:
 571         case 'h':
 572             help();
 573             return 0;
 574         }
 575     }
 576
 577     if (optind < argc)
 578         test = atoi(argv[optind]);
 579     if(optind+1 < argc) bits= atoi(argv[optind+1]);
 580
 581     printf("ffmpeg DCT/IDCT test\n");
 582
 583     if (test_248_dct) {
 584         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
 585     } else {
 586         const struct algo *algos = test_idct ? idct_tab : fdct_tab;
 587         for (i = 0; algos[i].name; i++)
 588             if (!(~cpu_flags & algos[i].mm_support)) {
 589                 err |= dct_error(&algos[i], test, test_idct, speed, bits);
 590             }
 591     }
 592
 593     return err;
 594 }