2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/common.h"
36 #include "libavutil/lfg.h"
38 #include "simple_idct.h"
39 #include "aandcttab.h"
42 #include "x86/idct_xvid.h"
/* Prototypes for platform-specific DCT/IDCT routines, most of which are
 * referenced from the algos[] table further down.  Every routine takes a
 * pointer to a coefficient block (DCTELEM, or short for odivx_idct_c).
 * NOTE(review): the conditional-compilation lines that presumably guard each
 * group are not visible in this excerpt. */
47 void ff_mmx_idct(DCTELEM *data);
48 void ff_mmxext_idct(DCTELEM *data);
50 void odivx_idct_c(short *block);
53 void ff_bfin_idct(DCTELEM *block);
54 void ff_bfin_fdct(DCTELEM *block);
57 void fdct_altivec(DCTELEM *block);
58 //void idct_altivec(DCTELEM *block);?? no routine
61 void ff_j_rev_dct_arm(DCTELEM *data);
62 void ff_simple_idct_arm(DCTELEM *data);
63 void ff_simple_idct_armv5te(DCTELEM *data);
64 void ff_simple_idct_armv6(DCTELEM *data);
65 void ff_simple_idct_neon(DCTELEM *data);
67 void ff_simple_idct_axp(DCTELEM *data);
/* Members of one algos[] entry that are visible in this excerpt.
 * NOTE(review): main() also reads .name and .mm_support; their declarations
 * and the struct header/footer are not visible here. */
/* transform direction: FDCT (0) or IDCT (1) */
71 enum { FDCT, IDCT } is_idct;
/* routine under test; works in place on the block it is given */
72 void (* func) (DCTELEM *block);
/* reference routine handed to dct_error() together with func */
73 void (* ref) (DCTELEM *block);
/* coefficient layout used by func; dct_error() permutes or rescales accordingly */
74 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
/* FAAN_SCALE is the layout used by the "FAAN" row of algos[]: SCALE_PERM when
 * FAAN_POSTSCALE is not defined, NO_PERM in the other branch.
 * NOTE(review): the #else and #endif lines are not visible in this excerpt. */
78 #ifndef FAAN_POSTSCALE
79 #define FAAN_SCALE SCALE_PERM
81 #define FAAN_SCALE NO_PERM
/* Table of transforms to exercise.  Columns, in order: display name,
 * direction (0 = forward DCT, 1 = inverse DCT), routine under test,
 * reference routine, coefficient layout, and (optionally) the CPU-feature
 * flag the routine needs.  main() walks the table until it reaches an entry
 * whose name is null and skips rows whose feature flag is not present in
 * cpu_flags.
 * NOTE(review): the preprocessor guards around the architecture-specific rows
 * and the terminating entry are not visible in this excerpt. */
86 struct algo algos[] = {
87 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM},
88 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE},
89 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM},
90 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM},
91 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
92 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM},
93 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM},
94 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM},
/* x86 forward DCTs */
97 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, FF_MM_MMX},
99 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, FF_MM_MMX2},
100 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, FF_MM_SSE2},
/* x86 inverse DCTs */
104 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, FF_MM_MMX},
105 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, FF_MM_MMX2},
107 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, FF_MM_MMX},
108 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, FF_MM_MMX},
109 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, FF_MM_MMX2},
110 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, FF_MM_SSE2},
/* AltiVec */
114 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, FF_MM_ALTIVEC},
/* Blackfin */
118 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM},
119 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM},
/* ARM variants */
123 {"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
124 {"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM },
126 {"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
129 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
132 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
134 #endif /* ARCH_ARM */
/* Alpha */
137 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM },
/* Shift count used by dct_error() when it rescales the output of a
 * SCALE_PERM routine before comparing it with the reference. */
143 #define AANSCALE_BITS 12
/* Clamp lookup table initialised by main(): the 256 entries starting at
 * offset MAX_NEG_CROP hold 0..255 and the MAX_NEG_CROP entries above them
 * hold 255. */
145 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/**
 * Return the current wall-clock time in microseconds.
 *
 * The speed loops only use the difference between two calls.
 *
 * @return seconds * 1000000 + microseconds as reported by gettimeofday()
 */
static int64_t gettime(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    /* widen before multiplying so the product is computed in 64 bits */
    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
}
/* Number of transform calls per timing batch in the speed loops of
 * dct_error() and idct248_error(). */
155 #define NB_ITS_SPEED 50000
/* Input permutation for MMX_PERM routines; filled in by idct_mmx_init().
 * Coefficient i of the natural-order block is stored at index
 * idct_mmx_perm[i]. */
157 static short idct_mmx_perm[64];
/* Fixed input permutation for MMX_SIMPLE_PERM routines, used the same way:
 * natural-order coefficient i goes to index idct_simple_mmx_perm[i].
 * NOTE(review): the closing line of this initializer is not visible in this
 * excerpt. */
159 static short idct_simple_mmx_perm[64]={
160 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
161 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
162 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
163 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
164 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
165 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
166 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
167 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Column order applied within each row of 8 for SSE2_PERM routines; the row
 * bits of the index are left unchanged by dct_error(). */
170 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
172 static void idct_mmx_init(void)
176 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
177 for (i = 0; i < 64; i++) {
178 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
179 // idct_simple_mmx_perm[i] = simple_block_permute_op(i);
/* File-scope work buffers for the tests below, 64 coefficients each.
 * `block` (16-byte aligned) receives the permuted input and is compared
 * against `block1` after the transforms; `block1` is where the random input
 * is generated; `block_org` keeps a copy of that generated input. */
183 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
184 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
185 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
/* Issue the x86 `emms` instruction when cpu_flags reports MMX support.
 * NOTE(review): the lines around the body (braces and, presumably, a
 * build-time architecture guard) are not visible in this excerpt. */
187 static inline void mmx_emms(void)
190 if (cpu_flags & FF_MM_MMX)
191 __asm__ volatile ("emms\n\t");
/* Accuracy and speed test for one forward or inverse DCT routine.
 *
 *   name       label printed in the report lines
 *   is_idct    non-zero prints "IDCT", zero prints "DCT"
 *   fdct_func  routine under test
 *   fdct_ref   reference routine
 *   form       coefficient layout of fdct_func (one of the *_PERM values)
 *   test       test selector taken from the command line
 *
 * Prints the per-coefficient systematic error, a summary line with the error
 * statistics, and a throughput line in kdct/s.
 *
 * NOTE(review): many lines of this function (braces, loop headers, the calls
 * to fdct_func and fdct_ref) are not visible in this excerpt; the comments
 * below describe only what the visible lines show. */
195 static void dct_error(const char *name, int is_idct,
196 void (*fdct_func)(DCTELEM *block),
197 void (*fdct_ref)(DCTELEM *block), int form, int test)
201 int64_t err2, ti, ti1, it1;
202 int64_t sysErr[64], sysErrMax=0;
204 int blockSumErrMax=0, blockSumErr;
/* fixed seed: every run sees the same pseudo-random input */
207 av_lfg_init(&prng, 1);
/* clear the per-coefficient signed error accumulators */
211 for(i=0; i<64; i++) sysErr[i]=0;
212 for(it=0;it<NB_ITS;it++) {
/* dense input: a value in -256..255 */
218 block1[i] = (av_lfg_get(&prng) % 512) -256;
/* sparse input: between 1 and 10 randomly placed coefficients */
227 int num = av_lfg_get(&prng) % 10 + 1;
229 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % 512 -256;
/* DC value in -2048..2047; the last coefficient gets the inverse of its low bit */
232 block1[0] = av_lfg_get(&prng) % 4096 - 2048;
233 block1[63]= (block1[0]&1)^1;
237 #if 0 // simulate mismatch control
242 if((sum&1)==0) block1[63]^=1;
/* keep a copy of the generated input */
247 block_org[i]= block1[i];
/* copy the input into `block` in the layout selected by `form` */
249 if (form == MMX_PERM) {
251 block[idct_mmx_perm[i]] = block1[i];
252 } else if (form == MMX_SIMPLE_PERM) {
254 block[idct_simple_mmx_perm[i]] = block1[i];
256 } else if (form == SSE2_PERM) {
258 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
259 } else if (form == PARTTRANS_PERM) {
261 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
266 #if 0 // simulate mismatch control for tested IDCT but not the ref
271 if((sum&1)==0) block[63]^=1;
/* SCALE_PERM output is rescaled with a factor derived from ff_aanscales[] */
278 if (form == SCALE_PERM) {
279 for(i=0; i<64; i++) {
280 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
281 block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS;
/* error statistics: absolute difference, signed per-coefficient sum, largest output magnitude */
289 v = abs(block[i] - block1[i]);
293 sysErr[i] += block[i] - block1[i];
295 if( abs(block[i])>maxout) maxout=abs(block[i]);
297 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
298 #if 0 // print different matrix pairs
302 if((i&7)==0) printf("\n");
303 printf("%4d ", block_org[i]);
306 if((i&7)==0) printf("\n");
307 printf("%4d ", block[i] - block1[i]);
/* largest magnitude among the accumulated per-coefficient errors */
312 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
314 #if 1 // dump systematic errors
316 if(i%8==0) printf("\n");
317 printf("%7d ", (int)sysErr[i]);
/* summary line; err2 and sysErrMax are normalised by the iteration count */
322 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
323 is_idct ? "IDCT" : "DCT",
324 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
/* input for the speed test */
332 block1[i] = av_lfg_get(&prng) % 512 -256;
342 block1[0] = av_lfg_get(&prng) % 512 -256;
343 block1[1] = av_lfg_get(&prng) % 512 -256;
344 block1[2] = av_lfg_get(&prng) % 512 -256;
345 block1[3] = av_lfg_get(&prng) % 512 -256;
349 if (form == MMX_PERM) {
351 block[idct_mmx_perm[i]] = block1[i];
352 } else if(form == MMX_SIMPLE_PERM) {
354 block[idct_simple_mmx_perm[i]] = block1[i];
/* run batches of NB_ITS_SPEED calls until at least one second (1000000 us) has elapsed */
363 for(it=0;it<NB_ITS_SPEED;it++) {
366 // memcpy(block, block1, sizeof(DCTELEM) * 64);
367 // do not memcpy especially not fastmemcpy because it does movntq !!!
371 ti1 = gettime() - ti;
372 } while (ti1 < 1000000);
/* it1 calls in ti1 microseconds -> thousands of transforms per second */
375 printf("%s %s: %0.1f kdct/s\n",
376 is_idct ? "IDCT" : "DCT",
377 name, (double)it1 * 1000.0 / (double)ti1);
/* 8x8 pixel outputs for the IDCT248 test: img_dest is written by the routine
 * under test, img_dest1 by idct248_ref(). */
381 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
382 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Double-precision reference used to check the IDCT248 "put" routine.
 *
 *   dest      output pixels, written at dest[i * linesize + j]
 *   linesize  distance in bytes between output rows
 *   block     64 input coefficients
 *
 * NOTE(review): loop headers, braces and the clamp code are not visible in
 * this excerpt; the comments below describe only what the visible lines
 * show. */
384 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* cosine basis tables; static, so they persist across calls */
387 static double c8[8][8];
388 static double c4[4][4];
389 double block1[64], block2[64], block3[64];
/* 8-point basis: row 0 is scaled by sqrt(1/8), the other rows by sqrt(1/4) */
399 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
400 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
401 sum += c8[i][j] * c8[i][j];
/* 4-point basis: row 0 is scaled by sqrt(1/4), the other rows by sqrt(1/2) */
408 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
409 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
410 sum += c4[i][j] * c4[i][j];
/* butterfly on adjacent row pairs: even row gets the scaled sum, odd row the scaled difference */
419 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
420 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
/* 8-point transform along each row */
429 sum += c8[k][j] * block1[8*i+k];
/* 4-point transform down each column, over the even rows ... */
440 sum += c4[k][j] * block2[8*(2*k)+i];
441 block3[8*(2*j)+i] = sum;
/* ... and separately over the odd rows */
446 sum += c4[k][j] * block2[8*(2*k+1)+i];
447 block3[8*(2*j+1)+i] = sum;
451 /* clamp and store the result */
/* rint() rounds the value to an integer before it is stored as a pixel */
459 dest[i * linesize + j] = (int)rint(v);
/* Accuracy and speed test for an IDCT248 "put" routine.
 *
 *   name         label printed in the report lines
 *   idct248_put  routine under test; writes 8x8 pixels to dest
 *
 * The visible lines generate random coefficients in -128..127, run both
 * idct248_ref() and idct248_put(), compare the two pixel outputs, print an
 * err_inf summary line, then time idct248_put() in batches of NB_ITS_SPEED
 * calls until at least one second has elapsed and print kdct/s.
 *
 * NOTE(review): ti and ti1 are declared int although gettime() returns
 * int64_t microseconds - confirm the narrowing is intended.
 * NOTE(review): several lines of this function are not visible in this
 * excerpt. */
464 static void idct248_error(const char *name,
465 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
467 int it, i, it1, ti, ti1, err_max, v;
/* fixed seed: every run sees the same pseudo-random input */
471 av_lfg_init(&prng, 1);
473 /* just one test to see if code is correct (precision is less
476 for(it=0;it<NB_ITS;it++) {
478 /* XXX: use forward transform to generate values */
480 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* reference output goes to img_dest1, tested output to img_dest */
485 idct248_ref(img_dest1, 8, block);
489 idct248_put(img_dest, 8, block);
/* absolute per-pixel difference between tested and reference output */
492 v = abs((int)img_dest[i] - (int)img_dest1[i]);
494 printf("%d %d\n", img_dest[i], img_dest1[i]);
503 printf(" %3d", img_dest1[i*8+j]);
512 printf(" %3d", img_dest[i*8+j]);
518 printf("%s %s: err_inf=%d\n",
519 1 ? "IDCT248" : "DCT248",
/* speed test: batches of NB_ITS_SPEED calls, repeated until ti1 reaches 1000000 us */
525 for(it=0;it<NB_ITS_SPEED;it++) {
528 // memcpy(block, block1, sizeof(DCTELEM) * 64);
529 // do not memcpy especially not fastmemcpy because it does movntq !!!
530 idct248_put(img_dest, 8, block);
533 ti1 = gettime() - ti;
534 } while (ti1 < 1000000);
/* it1 calls in ti1 microseconds -> thousands of transforms per second */
537 printf("%s %s: %0.1f kdct/s\n",
538 1 ? "IDCT248" : "DCT248",
539 name, (double)it1 * 1000.0 / (double)ti1);
/**
 * Print the command-line usage summary to stdout.
 *
 * The synopsis lists -4 alongside -i, since both are described below and
 * accepted by the option parser in main().
 */
static void help(void)
{
    printf("dct-test [-i] [-4] [<test-number>]\n"
           "test-number 0 -> test with random matrixes\n"
           "            1 -> test with random sparse matrixes\n"
           "            2 -> do 3. test from mpeg4 std\n"
           "-i          test IDCT implementations\n"
           "-4          test IDCT248 implementations\n");
}
/* Program entry point: initialises the lookup tables, parses the command
 * line and runs the selected tests.
 * NOTE(review): the option-handling switch, braces and return statement are
 * not visible in this excerpt. */
552 int main(int argc, char **argv)
554 int test_idct = 0, test_248_dct = 0;
/* CPU feature bits; used below to skip routines the host cannot run */
557 cpu_flags = mm_support();
/* clamp table: identity for 0..255, 255 for the entries above that range */
562 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
563 for(i=0;i<MAX_NEG_CROP;i++) {
565 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* accepted options: -i, -h and -4 */
569 c = getopt(argc, argv, "ih4");
/* an optional first non-option argument selects the test number */
586 if(optind <argc) test= atoi(argv[optind]);
588 printf("ffmpeg DCT/IDCT test\n");
591 idct248_error("SIMPLE-C", ff_simple_idct248_put);
/* run every table entry whose direction matches and whose required CPU
 * feature bits are all present in cpu_flags */
593 for (i=0;algos[i].name;i++)
594 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
595 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test);