2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/*
 * Forward declarations of platform-specific DCT/IDCT routines. Apart from
 * odivx_idct_c, each of them appears as the function under test in one entry
 * of the algos[] table below; odivx_idct_c is not referenced anywhere in the
 * visible part of this file.
 * NOTE(review): these groups are presumably wrapped in per-architecture
 * preprocessor conditionals that are not visible in this excerpt -- confirm
 * before assuming any of these symbols is available at link time.
 */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
51 void odivx_idct_c(short *block);
54 void ff_bfin_idct(DCTELEM *block);
55 void ff_bfin_fdct(DCTELEM *block);
58 void fdct_altivec(DCTELEM *block);
59 //void idct_altivec(DCTELEM *block);?? no routine
62 void ff_j_rev_dct_arm(DCTELEM *data);
63 void ff_simple_idct_arm(DCTELEM *data);
64 void ff_simple_idct_armv5te(DCTELEM *data);
65 void ff_simple_idct_armv6(DCTELEM *data);
66 void ff_simple_idct_neon(DCTELEM *data);
68 void ff_simple_idct_axp(DCTELEM *data);
/*
 * Members of the per-algorithm descriptor used by the algos[] table.
 * NOTE(review): the enclosing struct header and the name / mm_support
 * members (both read in main()) are not visible in this excerpt.
 */
/* Transform direction: FDCT is 0 and IDCT is 1, matching the 0/1 literals used in algos[]. */
72 enum { FDCT, IDCT } is_idct;
/* Implementation under test; receives a pointer to the coefficient block. */
73 void (* func) (DCTELEM *block);
/* Reference implementation handed to dct_error() together with func. */
74 void (* ref) (DCTELEM *block);
/* Input layout expected by func; dct_error() permutes or rescales the block according to this tag. */
75 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
/*
 * FAAN_SCALE is the format tag used by the "FAAN" entry of algos[].
 * NOTE(review): the two definitions appear back to back here because the
 * directive separating the branches, and the one closing the conditional,
 * are not visible in this excerpt. Presumably SCALE_PERM applies when
 * FAAN_POSTSCALE is undefined and NO_PERM otherwise -- confirm.
 */
79 #ifndef FAAN_POSTSCALE
80 #define FAAN_SCALE SCALE_PERM
82 #define FAAN_SCALE NO_PERM
/*
 * Table of transforms to exercise. Each initializer lists, in order:
 * display name, direction (0 = FDCT, 1 = IDCT), function under test,
 * reference function, input format tag and, for some entries, a CPU
 * feature flag that main() requires to be present in cpu_flags.
 * main() walks the table until it meets an entry whose name is null.
 * NOTE(review): that terminating entry, the closing of the initializer and
 * the per-architecture conditionals around the platform-specific entries
 * (only the "#endif" for ARCH_ARM survives) are not visible in this
 * excerpt.
 */
87 struct algo algos[] = {
88 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM},
89 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE},
90 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM},
91 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM},
92 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
93 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM},
94 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM},
95 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM},
98 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX},
100 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX2},
101 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_SSE2},
105 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX},
106 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX2},
108 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX},
109 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX},
110 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX2},
111 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, AV_CPU_FLAG_SSE2},
115 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_ALTIVEC},
119 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM},
120 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM},
124 {"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
125 {"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM },
127 {"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
130 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
133 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
135 #endif /* ARCH_ARM */
138 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM },
/* Shift width used by dct_error() when it rescales the output of SCALE_PERM transforms. */
144 #define AANSCALE_BITS 12
/* Byte table of 256 + 2 * MAX_NEG_CROP entries; main() fills it before any test runs. */
146 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/*
 * Return a timestamp in microseconds, assembled from the seconds and
 * microseconds fields that gettimeofday() stores in tv.
 * NOTE(review): the declaration of tv is not visible in this excerpt, and
 * the return value of gettimeofday() is not checked.
 */
148 static int64_t gettime(void)
151 gettimeofday(&tv,NULL);
/* The cast comes first so the multiplication by one million is done in 64 bits. */
152 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
/* Number of transform calls per inner timing batch in the speed loops of dct_error() and idct248_error(). */
156 #define NB_ITS_SPEED 50000
/* Index permutation for MMX_PERM inputs; left zeroed here and filled in by idct_mmx_init(). */
158 static short idct_mmx_perm[64];
/*
 * Fixed index permutation for MMX_SIMPLE_PERM inputs: source coefficient i
 * is stored at block[idct_simple_mmx_perm[i]] by dct_error().
 * NOTE(review): the line closing this initializer is not visible in this
 * excerpt.
 */
160 static short idct_simple_mmx_perm[64]={
161 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
162 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
163 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
164 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
165 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
166 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
167 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
168 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Permutation of the low three index bits for SSE2_PERM inputs; the upper three bits are kept by the caller. */
171 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/*
 * Fill idct_mmx_perm[] for all 64 indices. Bits 3..5 of the index are kept
 * as they are, while the low three bits are rotated right by one position
 * (bits 1..2 move down to bits 0..1 and bit 0 moves up to bit 2).
 * NOTE(review): the declaration of i and the braces of the function and
 * loop are not visible in this excerpt.
 */
173 static void idct_mmx_init(void)
177 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
178 for (i = 0; i < 64; i++) {
179 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/*
 * Shared 64-coefficient work buffers for the tests.
 * block     - 16-byte aligned; loaded from block1 (optionally permuted) and
 *             later compared against block1 in dct_error().
 * block1    - 8-byte aligned; holds the generated input coefficients.
 * block_org - 8-byte aligned; receives a copy of block1 in dct_error().
 */
183 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
184 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
185 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
/*
 * Issue the x86 "emms" instruction when cpu_flags reports MMX support;
 * otherwise do nothing.
 * NOTE(review): the inline assembly is presumably compiled only on x86
 * builds via a conditional that is not visible in this excerpt -- confirm.
 */
187 static inline void mmx_emms(void)
190 if (cpu_flags & AV_CPU_FLAG_MMX)
191 __asm__ volatile ("emms\n\t");
/*
 * Accuracy and speed test for one transform.
 *
 * name      - label printed in both report lines.
 * is_idct   - selects the "IDCT" or "DCT" prefix in the report lines.
 * fdct_func - implementation under test.
 * fdct_ref  - reference implementation.
 * form      - enum formattag value describing the input layout fdct_func
 *             expects; decides how block1 is permuted into block and
 *             whether the output is rescaled.
 * test      - NOTE(review): presumably selects which of the input
 *             generators below is used; the dispatch on it is not visible
 *             in this excerpt.
 *
 * The first part runs NB_ITS random blocks and prints error statistics;
 * the second part times repeated calls for at least one second and prints
 * the throughput. The pseudo-random generator is seeded with the constant
 * 1, so the generated inputs are the same on every run.
 * NOTE(review): several declarations, loop headers, braces and the calls
 * to fdct_func / fdct_ref are not visible in this excerpt.
 */
195 static void dct_error(const char *name, int is_idct,
196 void (*fdct_func)(DCTELEM *block),
197 void (*fdct_ref)(DCTELEM *block), int form, int test)
201 int64_t err2, ti, ti1, it1;
202 int64_t sysErr[64], sysErrMax=0;
204 int blockSumErrMax=0, blockSumErr;
207 av_lfg_init(&prng, 1);
/* Per-coefficient signed error accumulators start at zero. */
211 for(i=0; i<64; i++) sysErr[i]=0;
212 for(it=0;it<NB_ITS;it++) {
/* Input generator: a coefficient drawn from [-256, 255]. */
218 block1[i] = (av_lfg_get(&prng) % 512) -256;
/* Input generator: between 1 and 10 coefficients at random positions. */
227 int num = av_lfg_get(&prng) % 10 + 1;
229 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % 512 -256;
/* Input generator: element 0 drawn from [-2048, 2047], element 63 set to the inverse of its lowest bit. */
232 block1[0] = av_lfg_get(&prng) % 4096 - 2048;
233 block1[63]= (block1[0]&1)^1;
238 block_org[i]= block1[i];
/* Copy block1 into block using the index layout required by form. */
240 if (form == MMX_PERM) {
242 block[idct_mmx_perm[i]] = block1[i];
243 } else if (form == MMX_SIMPLE_PERM) {
245 block[idct_simple_mmx_perm[i]] = block1[i];
247 } else if (form == SSE2_PERM) {
249 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
250 } else if (form == PARTTRANS_PERM) {
252 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
/* SCALE_PERM outputs are rescaled per coefficient with ff_aanscales[] before being compared. */
261 if (form == SCALE_PERM) {
262 for(i=0; i<64; i++) {
263 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
264 block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS;
/* Absolute difference between the two buffers for coefficient i. */
272 v = abs(block[i] - block1[i]);
/* Signed error is accumulated per position so a consistent bias does not cancel out across positions. */
276 sysErr[i] += block[i] - block1[i];
278 if( abs(block[i])>maxout) maxout=abs(block[i]);
280 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
/* Largest magnitude among the accumulated per-position errors. */
282 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
/* Dump the accumulated errors, starting a new output line every eight values. */
285 if(i%8==0) printf("\n");
286 printf("%7d ", (int)sysErr[i]);
290 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
291 is_idct ? "IDCT" : "DCT",
292 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
/* Speed test input: same kinds of generators as above. */
300 block1[i] = av_lfg_get(&prng) % 512 -256;
310 block1[0] = av_lfg_get(&prng) % 512 -256;
311 block1[1] = av_lfg_get(&prng) % 512 -256;
312 block1[2] = av_lfg_get(&prng) % 512 -256;
313 block1[3] = av_lfg_get(&prng) % 512 -256;
/*
 * NOTE(review): the visible speed-test setup special-cases only MMX_PERM
 * and MMX_SIMPLE_PERM, whereas the accuracy pass above also handles
 * SSE2_PERM and PARTTRANS_PERM -- confirm whether that is intended.
 */
317 if (form == MMX_PERM) {
319 block[idct_mmx_perm[i]] = block1[i];
320 } else if(form == MMX_SIMPLE_PERM) {
322 block[idct_simple_mmx_perm[i]] = block1[i];
331 for(it=0;it<NB_ITS_SPEED;it++) {
/* gettime() is in microseconds, so batches repeat until at least one second has elapsed. */
337 ti1 = gettime() - ti;
338 } while (ti1 < 1000000);
/* Calls per microsecond times 1000 gives thousands of calls per second; it1 presumably counts the calls made. */
341 printf("%s %s: %0.1f kdct/s\n",
342 is_idct ? "IDCT" : "DCT",
343 name, (double)it1 * 1000.0 / (double)ti1);
/* 8-byte aligned 64-byte output buffer written by the idct248_put routine under test. */
346 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
/* 8-byte aligned 64-byte output buffer written by idct248_ref() for comparison. */
347 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/*
 * Double-precision reference used to check idct248_put implementations.
 *
 * dest     - output bytes; element (i, j) is written to dest[i * linesize + j].
 * linesize - distance in bytes between consecutive output rows.
 * block    - 64 input coefficients, read as an 8x8 row-major array.
 *
 * Visible steps: build 8-point and 4-point cosine tables, combine each
 * pair of adjacent rows into a scaled sum and difference, apply the
 * 8-point table along each row, apply the 4-point table separately to the
 * even and odd rows of each column, then round and store.
 * NOTE(review): loop headers, the one-time initialisation guard for the
 * static tables, the value of s used for the row butterfly and the clamp
 * itself are not visible in this excerpt.
 */
349 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine tables are static, so they persist between calls. */
352 static double c8[8][8];
353 static double c4[4][4];
/* These locals shadow the file-scope buffer named block1 inside this function. */
354 double block1[64], block2[64], block3[64];
/* 8-point table: row 0 is scaled by sqrt(1/8), every other row by sqrt(1/4). */
364 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
365 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
366 sum += c8[i][j] * c8[i][j];
/* 4-point table: row 0 is scaled by sqrt(1/4), every other row by sqrt(1/2). */
373 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
374 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
375 sum += c4[i][j] * c4[i][j];
/* Butterfly: rows 2i and 2i+1 become their scaled sum and scaled difference. */
384 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
385 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
/* 8-point pass along row i of the butterfly output. */
394 sum += c8[k][j] * block1[8*i+k];
/* 4-point pass down column i, even rows only. */
405 sum += c4[k][j] * block2[8*(2*k)+i];
406 block3[8*(2*j)+i] = sum;
/* 4-point pass down column i, odd rows only. */
411 sum += c4[k][j] * block2[8*(2*k+1)+i];
412 block3[8*(2*j+1)+i] = sum;
416 /* clamp and store the result */
/* rint() rounds to an integral value before the conversion to int. */
424 dest[i * linesize + j] = (int)rint(v);
/*
 * Accuracy and speed test for one idct248 "put" routine.
 *
 * name        - label printed in both report lines.
 * idct248_put - routine under test; called with an output buffer, a line
 *               size of 8 and the coefficient block.
 *
 * For NB_ITS random inputs the output of idct248_put (img_dest) is compared
 * byte by byte with that of idct248_ref() (img_dest1), and the result is
 * reported as err_inf. The routine is then called in batches of
 * NB_ITS_SPEED until at least one second has elapsed and the throughput is
 * printed. The pseudo-random generator is seeded with the constant 1.
 * NOTE(review): several loop headers, braces, the copies from block1 into
 * block and the error bookkeeping are not visible in this excerpt; the
 * "just one test" comment below has also lost its closing line here.
 */
429 static void idct248_error(const char *name,
430 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
/*
 * NOTE(review): ti and ti1 are plain int although gettime() returns
 * int64_t microseconds, and dct_error() keeps the same quantities in
 * int64_t. Storing the timestamp in an int narrows it -- consider
 * int64_t here as well.
 */
432 int it, i, it1, ti, ti1, err_max, v;
436 av_lfg_init(&prng, 1);
438 /* just one test to see if code is correct (precision is less
441 for(it=0;it<NB_ITS;it++) {
443 /* XXX: use forward transform to generate values */
/* Random coefficient in [-128, 127]. */
445 block1[i] = av_lfg_get(&prng) % 256 - 128;
450 idct248_ref(img_dest1, 8, block);
454 idct248_put(img_dest, 8, block);
/* Absolute difference between tested and reference output bytes. */
457 v = abs((int)img_dest[i] - (int)img_dest1[i]);
459 printf("%d %d\n", img_dest[i], img_dest1[i]);
464 printf("%s %s: err_inf=%d\n",
/* The condition is the constant 1, so "IDCT248" is always the string printed. */
465 1 ? "IDCT248" : "DCT248",
471 for(it=0;it<NB_ITS_SPEED;it++) {
474 idct248_put(img_dest, 8, block);
/* gettime() is in microseconds, so batches repeat until at least one second has elapsed. */
477 ti1 = gettime() - ti;
478 } while (ti1 < 1000000);
481 printf("%s %s: %0.1f kdct/s\n",
482 1 ? "IDCT248" : "DCT248",
483 name, (double)it1 * 1000.0 / (double)ti1);
/*
 * Print the command-line usage summary to standard output: the optional
 * test number (0, 1 or 2) and the -i and -4 switches.
 */
486 static void help(void)
488 printf("dct-test [-i] [<test-number>]\n"
489 "test-number 0 -> test with random matrixes\n"
490 " 1 -> test with random sparse matrixes\n"
491 " 2 -> do 3. test from mpeg4 std\n"
492 "-i test IDCT implementations\n"
493 "-4 test IDCT248 implementations\n");
/*
 * Program entry point: reads the CPU feature flags, fills cropTbl, parses
 * the command line, prints a banner and runs the selected tests.
 * NOTE(review): the option-handling switch, the condition choosing between
 * the idct248 test and the algos[] loop, several declarations and the end
 * of the function are not visible in this excerpt.
 */
496 int main(int argc, char **argv)
498 int test_idct = 0, test_248_dct = 0;
501 cpu_flags = av_get_cpu_flags();
/* Middle 256 entries of cropTbl map each value to itself. */
506 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
507 for(i=0;i<MAX_NEG_CROP;i++) {
/* Entries above the middle range are all 255. */
509 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* Accepted option letters are i, h and 4, none of which takes an argument. */
513 c = getopt(argc, argv, "ih4");
/*
 * First non-option argument, if any, is the test number.
 * NOTE(review): atoi() cannot report malformed input; strtol() with an end
 * pointer check would allow rejecting it.
 */
530 if(optind <argc) test= atoi(argv[optind]);
532 printf("ffmpeg DCT/IDCT test\n");
535 idct248_error("SIMPLE-C", ff_simple_idct248_put);
/* Walk algos[] until the entry whose name is null. */
537 for (i=0;algos[i].name;i++)
/* Run an entry only if its direction matches and every CPU flag bit it requires is set in cpu_flags. */
538 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
539 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test);