git.sesse.net Git - ffmpeg/blob - libavcodec/dct-test.c

   1 /*
   2  * (c) 2001 Fabrice Bellard
   3  *     2007 Marc Hoffman <marc.hoffman@analog.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * DCT test (c) 2001 Fabrice Bellard
  25  * Started from sample code by Juan J. Sierralta P.
  26  */
  27
  28 #include <stdlib.h>
  29 #include <stdio.h>
  30 #include <string.h>
  31 #include <sys/time.h>
  32 #include <unistd.h>
  33 #include <math.h>
  34
  35 #include "libavutil/cpu.h"
  36 #include "libavutil/common.h"
  37 #include "libavutil/lfg.h"
  38
  39 #include "simple_idct.h"
  40 #include "aandcttab.h"
  41 #include "faandct.h"
  42 #include "faanidct.h"
  43 #include "x86/idct_xvid.h"
  44 #include "dctref.h"
  45
  46 #undef printf
  47
/* Prototypes for platform-specific transforms that are referenced by the
 * algorithm tables below. Each one transforms a 64-coefficient block in place
 * through the DCTELEM pointer it receives. */

// x86 MMX / MMXEXT (used by the LIBMPEG2-* table entries)
void ff_mmx_idct(DCTELEM *data);
void ff_mmxext_idct(DCTELEM *data);

// BFIN
void ff_bfin_idct(DCTELEM *block);
void ff_bfin_fdct(DCTELEM *block);

// ALTIVEC
void ff_fdct_altivec(DCTELEM *block);
// NOTE: there is no AltiVec IDCT routine to declare here (ff_idct_altivec).

// ARM
void ff_j_rev_dct_arm(DCTELEM *data);
void ff_simple_idct_arm(DCTELEM *data);
void ff_simple_idct_armv5te(DCTELEM *data);
void ff_simple_idct_armv6(DCTELEM *data);
void ff_simple_idct_neon(DCTELEM *data);

// ALPHA
void ff_simple_idct_axp(DCTELEM *data);
  67
  68 struct algo {
  69     const char *name;
  70     void (*func)(DCTELEM *block);
  71     enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
  72                      SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
  73     int mm_support;
  74     int nonspec;
  75 };
  76
  77 static int cpu_flags;
  78
  79 static const struct algo fdct_tab[] = {
  80     { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
  81     { "FAAN",           ff_faandct,            NO_PERM    },
  82     { "IJG-AAN-INT",    ff_fdct_ifast,         SCALE_PERM },
  83     { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },
  84
  85 #if HAVE_MMX
  86     { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
  87     { "MMX2",           ff_fdct_mmx2,          NO_PERM,   AV_CPU_FLAG_MMX2    },
  88     { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
  89 #endif
  90
  91 #if HAVE_ALTIVEC
  92     { "altivecfdct",    ff_fdct_altivec,       NO_PERM,   AV_CPU_FLAG_ALTIVEC },
  93 #endif
  94
  95 #if ARCH_BFIN
  96     { "BFINfdct",       ff_bfin_fdct,          NO_PERM  },
  97 #endif
  98
  99     { 0 }
 100 };
 101
 102 #if HAVE_MMX && HAVE_YASM
 103 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
 104                                 DCTELEM *block, int16_t *qmat);
 105
 106 static void ff_prores_idct_put_10_sse2_wrap(DCTELEM *dst){
 107     int16_t qmat[64]; int i;
 108     int16_t tmp[64];
 109
 110     for(i=0; i<64; i++){
 111         qmat[i]=4;
 112         tmp[i]= dst[i];
 113     }
 114     ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
 115 }
 116 #endif
 117
 118 static const struct algo idct_tab[] = {
 119     { "FAANI",          ff_faanidct,           NO_PERM  },
 120     { "REF-DBL",        ff_ref_idct,           NO_PERM  },
 121     { "INT",            ff_j_rev_dct,          MMX_PERM },
 122     { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },
 123
 124 #if HAVE_MMX
 125 #if CONFIG_GPL
 126     { "LIBMPEG2-MMX",   ff_mmx_idct,           MMX_PERM,  AV_CPU_FLAG_MMX,  1 },
 127     { "LIBMPEG2-MMX2",  ff_mmxext_idct,        MMX_PERM,  AV_CPU_FLAG_MMX2, 1 },
 128 #endif
 129     { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
 130     { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
 131     { "XVID-MMX2",      ff_idct_xvid_mmx2,     NO_PERM,   AV_CPU_FLAG_MMX2, 1 },
 132     { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
 133 #if ARCH_X86_64 && HAVE_YASM
 134     { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap,     TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
 135 #endif
 136 #endif
 137
 138 #if ARCH_BFIN
 139     { "BFINidct",       ff_bfin_idct,          NO_PERM  },
 140 #endif
 141
 142 #if ARCH_ARM
 143     { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
 144     { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
 145 #endif
 146 #if HAVE_ARMV5TE
 147     { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM  },
 148 #endif
 149 #if HAVE_ARMV6
 150     { "SIMPLE-ARMV6",   ff_simple_idct_armv6,  MMX_PERM },
 151 #endif
 152 #if HAVE_NEON
 153     { "SIMPLE-NEON",    ff_simple_idct_neon,   PARTTRANS_PERM },
 154 #endif
 155
 156 #if ARCH_ALPHA
 157     { "SIMPLE-ALPHA",   ff_simple_idct_axp,    NO_PERM },
 158 #endif
 159
 160     { 0 }
 161 };
 162
 163 #define AANSCALE_BITS 12
 164
 165 static int64_t gettime(void)
 166 {
 167     struct timeval tv;
 168     gettimeofday(&tv, NULL);
 169     return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
 170 }
 171
 172 #define NB_ITS 20000
 173 #define NB_ITS_SPEED 50000
 174
 175 static short idct_mmx_perm[64];
 176
 177 static short idct_simple_mmx_perm[64] = {
 178     0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
 179     0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
 180     0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
 181     0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
 182     0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
 183     0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
 184     0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
 185     0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
 186 };
 187
 188 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
 189
 190 static void idct_mmx_init(void)
 191 {
 192     int i;
 193
 194     /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
 195     for (i = 0; i < 64; i++) {
 196         idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
 197     }
 198 }
 199
 200 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
 201 DECLARE_ALIGNED(8,  static DCTELEM, block1)[64];
 202
 203 static inline void mmx_emms(void)
 204 {
 205 #if HAVE_MMX
 206     if (cpu_flags & AV_CPU_FLAG_MMX)
 207         __asm__ volatile ("emms\n\t");
 208 #endif
 209 }
 210
 211 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
 212 {
 213     int i, j;
 214
 215     memset(block, 0, 64 * sizeof(*block));
 216
 217     switch (test) {
 218     case 0:
 219         for (i = 0; i < 64; i++)
 220             block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
 221         if (is_idct) {
 222             ff_ref_fdct(block);
 223             for (i = 0; i < 64; i++)
 224                 block[i] >>= 3;
 225         }
 226         break;
 227     case 1:
 228         j = av_lfg_get(prng) % 10 + 1;
 229         for (i = 0; i < j; i++)
 230             block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
 231         break;
 232     case 2:
 233         block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
 234         block[63] = (block[0] & 1) ^ 1;
 235         break;
 236     }
 237 }
 238
 239 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
 240 {
 241     int i;
 242
 243     if (perm == MMX_PERM) {
 244         for (i = 0; i < 64; i++)
 245             dst[idct_mmx_perm[i]] = src[i];
 246     } else if (perm == MMX_SIMPLE_PERM) {
 247         for (i = 0; i < 64; i++)
 248             dst[idct_simple_mmx_perm[i]] = src[i];
 249     } else if (perm == SSE2_PERM) {
 250         for (i = 0; i < 64; i++)
 251             dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
 252     } else if (perm == PARTTRANS_PERM) {
 253         for (i = 0; i < 64; i++)
 254             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
 255     } else if (perm == TRANSPOSE_PERM) {
 256         for (i = 0; i < 64; i++)
 257             dst[(i>>3) | ((i<<3)&0x38)] = src[i];
 258     } else {
 259         for (i = 0; i < 64; i++)
 260             dst[i] = src[i];
 261     }
 262 }
 263
 264 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
 265 {
 266     void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
 267     int it, i, scale;
 268     int err_inf, v;
 269     int64_t err2, ti, ti1, it1, err_sum = 0;
 270     int64_t sysErr[64], sysErrMax = 0;
 271     int maxout = 0;
 272     int blockSumErrMax = 0, blockSumErr;
 273     AVLFG prng;
 274     const int vals=1<<bits;
 275     double omse, ome;
 276     int spec_err;
 277
 278     av_lfg_init(&prng, 1);
 279
 280     err_inf = 0;
 281     err2 = 0;
 282     for (i = 0; i < 64; i++)
 283         sysErr[i] = 0;
 284     for (it = 0; it < NB_ITS; it++) {
 285         init_block(block1, test, is_idct, &prng, vals);
 286         permute(block, block1, dct->format);
 287
 288         dct->func(block);
 289         mmx_emms();
 290
 291         if (dct->format == SCALE_PERM) {
 292             for (i = 0; i < 64; i++) {
 293                 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
 294                 block[i] = (block[i] * scale) >> AANSCALE_BITS;
 295             }
 296         }
 297
 298         ref(block1);
 299
 300         blockSumErr = 0;
 301         for (i = 0; i < 64; i++) {
 302             int err = block[i] - block1[i];
 303             err_sum += err;
 304             v = abs(err);
 305             if (v > err_inf)
 306                 err_inf = v;
 307             err2 += v * v;
 308             sysErr[i] += block[i] - block1[i];
 309             blockSumErr += v;
 310             if (abs(block[i]) > maxout)
 311                 maxout = abs(block[i]);
 312         }
 313         if (blockSumErrMax < blockSumErr)
 314             blockSumErrMax = blockSumErr;
 315     }
 316     for (i = 0; i < 64; i++)
 317         sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
 318
 319     for (i = 0; i < 64; i++) {
 320         if (i % 8 == 0)
 321             printf("\n");
 322         printf("%7d ", (int) sysErr[i]);
 323     }
 324     printf("\n");
 325
 326     omse = (double) err2 / NB_ITS / 64;
 327     ome  = (double) err_sum / NB_ITS / 64;
 328
 329     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
 330
 331     printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
 332            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
 333            omse, ome, (double) sysErrMax / NB_ITS,
 334            maxout, blockSumErrMax);
 335
 336     if (spec_err && !dct->nonspec)
 337         return 1;
 338
 339     if (!speed)
 340         return 0;
 341
 342     /* speed test */
 343
 344     init_block(block, test, is_idct, &prng, vals);
 345     permute(block1, block, dct->format);
 346
 347     ti = gettime();
 348     it1 = 0;
 349     do {
 350         for (it = 0; it < NB_ITS_SPEED; it++) {
 351             memcpy(block, block1, sizeof(block));
 352             dct->func(block);
 353         }
 354         it1 += NB_ITS_SPEED;
 355         ti1 = gettime() - ti;
 356     } while (ti1 < 1000000);
 357     mmx_emms();
 358
 359     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
 360            (double) it1 * 1000.0 / (double) ti1);
 361
 362     return 0;
 363 }
 364
/* 8x8 pixel outputs of the IDCT248 test: img_dest receives the output of the
 * implementation under test, img_dest1 the output of idct248_ref(). */
DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
 367
 368 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
 369 {
 370     static int init;
 371     static double c8[8][8];
 372     static double c4[4][4];
 373     double block1[64], block2[64], block3[64];
 374     double s, sum, v;
 375     int i, j, k;
 376
 377     if (!init) {
 378         init = 1;
 379
 380         for (i = 0; i < 8; i++) {
 381             sum = 0;
 382             for (j = 0; j < 8; j++) {
 383                 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
 384                 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
 385                 sum += c8[i][j] * c8[i][j];
 386             }
 387         }
 388
 389         for (i = 0; i < 4; i++) {
 390             sum = 0;
 391             for (j = 0; j < 4; j++) {
 392                 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
 393                 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
 394                 sum += c4[i][j] * c4[i][j];
 395             }
 396         }
 397     }
 398
 399     /* butterfly */
 400     s = 0.5 * sqrt(2.0);
 401     for (i = 0; i < 4; i++) {
 402         for (j = 0; j < 8; j++) {
 403             block1[8 * (2 * i) + j] =
 404                 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
 405             block1[8 * (2 * i + 1) + j] =
 406                 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
 407         }
 408     }
 409
 410     /* idct8 on lines */
 411     for (i = 0; i < 8; i++) {
 412         for (j = 0; j < 8; j++) {
 413             sum = 0;
 414             for (k = 0; k < 8; k++)
 415                 sum += c8[k][j] * block1[8 * i + k];
 416             block2[8 * i + j] = sum;
 417         }
 418     }
 419
 420     /* idct4 */
 421     for (i = 0; i < 8; i++) {
 422         for (j = 0; j < 4; j++) {
 423             /* top */
 424             sum = 0;
 425             for (k = 0; k < 4; k++)
 426                 sum += c4[k][j] * block2[8 * (2 * k) + i];
 427             block3[8 * (2 * j) + i] = sum;
 428
 429             /* bottom */
 430             sum = 0;
 431             for (k = 0; k < 4; k++)
 432                 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
 433             block3[8 * (2 * j + 1) + i] = sum;
 434         }
 435     }
 436
 437     /* clamp and store the result */
 438     for (i = 0; i < 8; i++) {
 439         for (j = 0; j < 8; j++) {
 440             v = block3[8 * i + j];
 441             if      (v < 0)   v = 0;
 442             else if (v > 255) v = 255;
 443             dest[i * linesize + j] = (int) rint(v);
 444         }
 445     }
 446 }
 447
 448 static void idct248_error(const char *name,
 449                           void (*idct248_put)(uint8_t *dest, int line_size,
 450                                               int16_t *block),
 451                           int speed)
 452 {
 453     int it, i, it1, ti, ti1, err_max, v;
 454     AVLFG prng;
 455
 456     av_lfg_init(&prng, 1);
 457
 458     /* just one test to see if code is correct (precision is less
 459        important here) */
 460     err_max = 0;
 461     for (it = 0; it < NB_ITS; it++) {
 462         /* XXX: use forward transform to generate values */
 463         for (i = 0; i < 64; i++)
 464             block1[i] = av_lfg_get(&prng) % 256 - 128;
 465         block1[0] += 1024;
 466
 467         for (i = 0; i < 64; i++)
 468             block[i] = block1[i];
 469         idct248_ref(img_dest1, 8, block);
 470
 471         for (i = 0; i < 64; i++)
 472             block[i] = block1[i];
 473         idct248_put(img_dest, 8, block);
 474
 475         for (i = 0; i < 64; i++) {
 476             v = abs((int) img_dest[i] - (int) img_dest1[i]);
 477             if (v == 255)
 478                 printf("%d %d\n", img_dest[i], img_dest1[i]);
 479             if (v > err_max)
 480                 err_max = v;
 481         }
 482 #if 0
 483         printf("ref=\n");
 484         for(i=0;i<8;i++) {
 485             int j;
 486             for(j=0;j<8;j++) {
 487                 printf(" %3d", img_dest1[i*8+j]);
 488             }
 489             printf("\n");
 490         }
 491
 492         printf("out=\n");
 493         for(i=0;i<8;i++) {
 494             int j;
 495             for(j=0;j<8;j++) {
 496                 printf(" %3d", img_dest[i*8+j]);
 497             }
 498             printf("\n");
 499         }
 500 #endif
 501     }
 502     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
 503
 504     if (!speed)
 505         return;
 506
 507     ti = gettime();
 508     it1 = 0;
 509     do {
 510         for (it = 0; it < NB_ITS_SPEED; it++) {
 511             for (i = 0; i < 64; i++)
 512                 block[i] = block1[i];
 513             idct248_put(img_dest, 8, block);
 514         }
 515         it1 += NB_ITS_SPEED;
 516         ti1 = gettime() - ti;
 517     } while (ti1 < 1000000);
 518     mmx_emms();
 519
 520     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
 521            (double) it1 * 1000.0 / (double) ti1);
 522 }
 523
 524 static void help(void)
 525 {
 526     printf("dct-test [-i] [<test-number>] [<bits>]\n"
 527            "test-number 0 -> test with random matrixes\n"
 528            "            1 -> test with random sparse matrixes\n"
 529            "            2 -> do 3. test from mpeg4 std\n"
 530            "bits        Number of time domain bits to use, 8 is default\n"
 531            "-i          test IDCT implementations\n"
 532            "-4          test IDCT248 implementations\n"
 533            "-t          speed test\n");
 534 }
 535
 536 int main(int argc, char **argv)
 537 {
 538     int test_idct = 0, test_248_dct = 0;
 539     int c, i;
 540     int test = 1;
 541     int speed = 0;
 542     int err = 0;
 543     int bits=8;
 544
 545     cpu_flags = av_get_cpu_flags();
 546
 547     ff_ref_dct_init();
 548     idct_mmx_init();
 549
 550     for (;;) {
 551         c = getopt(argc, argv, "ih4t");
 552         if (c == -1)
 553             break;
 554         switch (c) {
 555         case 'i':
 556             test_idct = 1;
 557             break;
 558         case '4':
 559             test_248_dct = 1;
 560             break;
 561         case 't':
 562             speed = 1;
 563             break;
 564         default:
 565         case 'h':
 566             help();
 567             return 0;
 568         }
 569     }
 570
 571     if (optind < argc)
 572         test = atoi(argv[optind]);
 573     if(optind+1 < argc) bits= atoi(argv[optind+1]);
 574
 575     printf("ffmpeg DCT/IDCT test\n");
 576
 577     if (test_248_dct) {
 578         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
 579     } else {
 580         const struct algo *algos = test_idct ? idct_tab : fdct_tab;
 581         for (i = 0; algos[i].name; i++)
 582             if (!(~cpu_flags & algos[i].mm_support)) {
 583                 err |= dct_error(&algos[i], test, test_idct, speed, bits);
 584             }
 585     }
 586
 587     return err;
 588 }