2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/* Prototypes of transform routines that are defined outside this file.
 * Each one takes a pointer to the coefficient block it operates on. */
/* Used by the "LIBMPEG2-MMX" and "LIBMPEG2-MMX2" entries of algos[]. */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
/* NOTE(review): odivx_idct_c is not referenced in the visible lines. */
51 void odivx_idct_c(short *block);
/* Used by the "BFINidct" and "BFINfdct" entries of algos[]. */
54 void ff_bfin_idct(DCTELEM *block);
55 void ff_bfin_fdct(DCTELEM *block);
/* Used by the "altivecfdct" entry of algos[]. */
58 void fdct_altivec(DCTELEM *block);
59 //void idct_altivec(DCTELEM *block);?? no routine
/* Used by the "INT-ARM", "SIMPLE-ARM", "SIMPLE-ARMV5TE", "SIMPLE-ARMV6"
 * and "SIMPLE-NEON" entries of algos[]. */
62 void ff_j_rev_dct_arm(DCTELEM *data);
63 void ff_simple_idct_arm(DCTELEM *data);
64 void ff_simple_idct_armv5te(DCTELEM *data);
65 void ff_simple_idct_armv6(DCTELEM *data);
66 void ff_simple_idct_neon(DCTELEM *data);
/* Used by the "SIMPLE-ALPHA" entry of algos[]. */
68 void ff_simple_idct_axp(DCTELEM *data);
/* Members of one test-table entry. The enclosing struct declaration and
 * some of its members (e.g. the name and mm_support fields that main()
 * reads) are on lines not visible here. */
/* Transform direction; the algos[] initializers use 0 (FDCT) and 1 (IDCT). */
72 enum { FDCT, IDCT } is_idct;
/* Routine under test; main() passes it to dct_error() as fdct_func. */
73 void (* func) (DCTELEM *block);
/* Reference routine; main() passes it to dct_error() as fdct_ref. */
74 void (* ref) (DCTELEM *block);
/* Input permutation or output scaling the routine expects; dct_error()
 * rearranges or rescales the coefficients according to this tag. */
75 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
/* FAAN_SCALE is the format tag given to the "FAAN" entry of algos[]:
 * SCALE_PERM when FAAN_POSTSCALE is not defined. The NO_PERM definition
 * presumably sits in an else-branch whose directive lines are not visible. */
79 #ifndef FAAN_POSTSCALE
80 #define FAAN_SCALE SCALE_PERM
82 #define FAAN_SCALE NO_PERM
/* Table of the implementations to test. Initializer order per entry:
 * name, is_idct (0 = forward DCT, 1 = inverse DCT), routine under test,
 * reference routine, format tag, and optionally a CPU-flag mask.
 * main() walks the table until it reaches an entry whose name is null and
 * runs an entry only if every flag in its mask is set in cpu_flags; the
 * mask is presumably the mm_support member, so entries that omit it are
 * zero-initialized and always run.
 * NOTE(review): the terminating entry and the conditional-compilation
 * guards around the architecture-specific groups are not visible here. */
87 struct algo algos[] = {
88 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM},
89 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE},
90 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM},
91 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM},
92 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
93 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM},
94 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM},
95 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM},
/* Entries gated on x86 CPU flags (MMX / MMX2 / SSE2). */
98 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX},
100 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX2},
101 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_SSE2},
105 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX},
106 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX2},
108 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX},
109 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX},
110 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX2},
111 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, AV_CPU_FLAG_SSE2},
/* Entry gated on the AltiVec CPU flag. */
115 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_ALTIVEC},
/* The remaining entries carry no CPU-flag mask. */
119 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM},
120 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM},
124 {"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
125 {"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM },
127 {"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
130 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
133 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
135 #endif /* ARCH_ARM */
138 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM },
/* Right-shift applied after dct_error() multiplies SCALE_PERM output by
 * its per-coefficient scale factor. */
144 #define AANSCALE_BITS 12
/* Lookup table with MAX_NEG_CROP guard entries on each side of a 256-entry
 * core. main() fills the core with 0..255 and the upper guard entries with
 * 255 (the initialization of the lower guard entries is not visible). */
146 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Return the current time from gettimeofday() as a single microsecond count. */
148 static int64_t gettime(void)
/* NOTE(review): the return value of gettimeofday() is ignored. */
151 gettimeofday(&tv,NULL);
/* tv_sec is widened to int64_t before the multiply. */
152 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
/* Number of calls made per batch in the speed loops; batches repeat until
 * at least 1000000 microseconds have elapsed. */
156 #define NB_ITS_SPEED 50000
/* Index permutation applied for MMX_PERM; filled in by idct_mmx_init(). */
158 static short idct_mmx_perm[64];
/* Index permutation applied for MMX_SIMPLE_PERM: coefficient i of the
 * source block is stored at position idct_simple_mmx_perm[i]. */
160 static short idct_simple_mmx_perm[64]={
161 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
162 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
163 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
164 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
165 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
166 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
167 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
168 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Remapping of the low three index bits used for SSE2_PERM; bits 3-5 of
 * the index are kept unchanged by the caller. */
171 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Fill idct_mmx_perm[] with the index permutation used for MMX_PERM. */
173 static void idct_mmx_init(void)
177 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
178 for (i = 0; i < 64; i++) {
/* Bits 3-5 are kept; the low three bits are rotated right by one
 * (bits 1-2 move down to 0-1, bit 0 moves up to bit 2). */
179 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
180 // idct_simple_mmx_perm[i] = simple_block_permute_op(i);
/* Working buffers shared by the test routines; DECLARE_ALIGNED is defined
 * elsewhere and its first argument is presumably the alignment in bytes. */
/* Receives the (possibly permuted) input and is compared with block1. */
184 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
/* Holds the generated input values in natural order. */
185 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/* Copy of block1 taken before the comparison in dct_error(). */
186 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
/* Issue the x86 emms instruction, but only when cpu_flags reports MMX.
 * NOTE(review): any architecture guard around the asm is not visible here. */
188 static inline void mmx_emms(void)
191 if (cpu_flags & AV_CPU_FLAG_MMX)
192 __asm__ volatile ("emms\n\t");
/* Compare one DCT/IDCT implementation with a reference over NB_ITS
 * generated blocks, print error statistics, then time the implementation
 * and print its throughput.
 *
 * name      label printed with the results
 * is_idct   selects the "IDCT" / "DCT" label in the printed lines
 * fdct_func routine under test
 * fdct_ref  reference routine
 * form      enum formattag value: how the input is permuted or the output
 *           rescaled for the routine under test
 * test      test number; presumably selects among the input generators
 *           below (the selecting statement is not visible)
 * bits      generated values are drawn relative to vals = 1 << bits
 *
 * NOTE(review): many statements of this function, including the calls to
 * fdct_func and fdct_ref, are on lines not visible here. */
196 static void dct_error(const char *name, int is_idct,
197 void (*fdct_func)(DCTELEM *block),
198 void (*fdct_ref)(DCTELEM *block), int form, int test, const int bits)
202 int64_t err2, ti, ti1, it1;
203 int64_t sysErr[64], sysErrMax=0;
205 int blockSumErrMax=0, blockSumErr;
207 const int vals=1<<bits;
/* Constant seed; presumably makes the input sequence repeatable. */
209 av_lfg_init(&prng, 1);
/* sysErr[] accumulates the signed error per coefficient position. */
213 for(i=0; i<64; i++) sysErr[i]=0;
214 for(it=0;it<NB_ITS;it++) {
/* Input generator: every coefficient random, offset by -vals. */
220 block1[i] = (av_lfg_get(&prng) % (2*vals)) -vals;
/* Input generator: 1 to 10 random values at random positions. */
229 int num = av_lfg_get(&prng) % 10 + 1;
231 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % (2*vals) -vals;
/* Input generator: random block1[0]; block1[63] is the inverted low bit
 * of block1[0]. */
234 block1[0] = av_lfg_get(&prng) % (16*vals) - (8*vals);
235 block1[63]= (block1[0]&1)^1;
239 #if 0 // simulate mismatch control
244 if((sum&1)==0) block1[63]^=1;
/* Keep a copy of the input. */
249 block_org[i]= block1[i];
/* Store the input into block[] in the order the routine under test
 * expects. */
251 if (form == MMX_PERM) {
253 block[idct_mmx_perm[i]] = block1[i];
254 } else if (form == MMX_SIMPLE_PERM) {
256 block[idct_simple_mmx_perm[i]] = block1[i];
258 } else if (form == SSE2_PERM) {
/* Bits 3-5 kept, low three bits remapped through the table. */
260 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
261 } else if (form == PARTTRANS_PERM) {
/* Bits 2 and 5 kept; bits 0-1 are exchanged with bits 3-4. */
263 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
268 #if 0 // simulate mismatch control for tested IDCT but not the ref
273 if((sum&1)==0) block[63]^=1;
/* SCALE_PERM: multiply each coefficient by a factor derived from
 * ff_aanscales[] and shift right by AANSCALE_BITS (no rounding term; it
 * is commented out). */
280 if (form == SCALE_PERM) {
281 for(i=0; i<64; i++) {
282 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
283 block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS;
/* Per-coefficient statistics: absolute difference, signed error sum and
 * largest output magnitude. */
291 v = abs(block[i] - block1[i]);
295 sysErr[i] += block[i] - block1[i];
297 if( abs(block[i])>maxout) maxout=abs(block[i]);
/* Track the largest per-block error sum seen so far. */
299 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
300 #if 0 // print different matrix pairs
304 if((i&7)==0) printf("\n");
305 printf("%4d ", block_org[i]);
308 if((i&7)==0) printf("\n");
309 printf("%4d ", block[i] - block1[i]);
/* Largest absolute accumulated error over the 64 positions. */
314 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
/* Print the accumulated error per position, starting a new line every
 * eight values. */
317 if(i%8==0) printf("\n");
318 printf("%7d ", (int)sysErr[i]);
/* Summary line: err2 is divided by NB_ITS and by 64, sysErrMax by NB_ITS. */
322 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
323 is_idct ? "IDCT" : "DCT",
324 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
/* Speed test: generate the input for the timing runs. */
332 block1[i] = av_lfg_get(&prng) % (2*vals) -vals;
342 block1[0] = av_lfg_get(&prng) % (2*vals) -vals;
343 block1[1] = av_lfg_get(&prng) % (2*vals) -vals;
344 block1[2] = av_lfg_get(&prng) % (2*vals) -vals;
345 block1[3] = av_lfg_get(&prng) % (2*vals) -vals;
/* NOTE(review): the visible lines permute only for MMX_PERM and
 * MMX_SIMPLE_PERM here; other format tags presumably fall through to a
 * branch that is not visible. */
349 if (form == MMX_PERM) {
351 block[idct_mmx_perm[i]] = block1[i];
352 } else if(form == MMX_SIMPLE_PERM) {
354 block[idct_simple_mmx_perm[i]] = block1[i];
/* Run batches of NB_ITS_SPEED iterations until at least 1000000
 * microseconds have elapsed. */
363 for(it=0;it<NB_ITS_SPEED;it++) {
366 // memcpy(block, block1, sizeof(DCTELEM) * 64);
367 // do not memcpy especially not fastmemcpy because it does movntq !!!
371 ti1 = gettime() - ti;
372 } while (ti1 < 1000000);
/* it1 * 1000 / ti1, with ti1 in microseconds, gives thousands of calls
 * per second. */
375 printf("%s %s: %0.1f kdct/s\n",
376 is_idct ? "IDCT" : "DCT",
377 name, (double)it1 * 1000.0 / (double)ti1);
/* 8x8 output buffers for the IDCT248 test: img_dest receives the output of
 * the routine under test, img_dest1 the output of idct248_ref(). */
380 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
381 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Double-precision reference for the 2-4-8 inverse transform.
 *
 * dest     output pixels, written at dest[i * linesize + j]
 * linesize distance in bytes between successive output rows
 * block    64 input coefficients
 *
 * NOTE(review): loop headers and several statements are on lines not
 * visible here; the comments below describe only the visible expressions. */
383 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine tables for the 8-point and 4-point transforms; static, so they
 * persist between calls. */
386 static double c8[8][8];
387 static double c4[4][4];
388 double block1[64], block2[64], block3[64];
/* 8-point table: row 0 is scaled by sqrt(1/8), the other rows by sqrt(1/4). */
398 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
399 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
400 sum += c8[i][j] * c8[i][j];
/* 4-point table: row 0 is scaled by sqrt(1/4), the other rows by sqrt(1/2). */
407 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
408 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
409 sum += c4[i][j] * c4[i][j];
/* Replace each pair of rows (2i, 2i+1) by their scaled sum and difference. */
418 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
419 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
/* 8-point transform along each row of block1. */
428 sum += c8[k][j] * block1[8*i+k];
/* 4-point transform down column i over the even rows ... */
439 sum += c4[k][j] * block2[8*(2*k)+i];
440 block3[8*(2*j)+i] = sum;
/* ... and over the odd rows. */
445 sum += c4[k][j] * block2[8*(2*k+1)+i];
446 block3[8*(2*j+1)+i] = sum;
450 /* clamp and store the result */
/* v is rounded to the nearest integer before it is stored. */
458 dest[i * linesize + j] = (int)rint(v);
/* Compare an IDCT248 "put" routine with idct248_ref() over NB_ITS generated
 * blocks, print the error result, then time the routine and print its
 * throughput.
 *
 * name        label printed with the results
 * idct248_put routine under test; writes pixels to dest with the given
 *             line size from the coefficients in block */
463 static void idct248_error(const char *name,
464 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
/* NOTE(review): ti and ti1 are int here, while gettime() returns int64_t
 * and dct_error() declares the same variables as int64_t; the assignments
 * from gettime() are narrowing conversions. Consider int64_t. */
466 int it, i, it1, ti, ti1, err_max, v;
/* Constant seed; presumably makes the input sequence repeatable. */
470 av_lfg_init(&prng, 1);
472 /* just one test to see if code is correct (precision is less
475 for(it=0;it<NB_ITS;it++) {
477 /* XXX: use forward transform to generate values */
/* Random coefficient, offset by -128. */
479 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* Reference output goes to img_dest1; the statements that load block[]
 * before each call are not visible. */
484 idct248_ref(img_dest1, 8, block);
/* Output of the routine under test goes to img_dest. */
488 idct248_put(img_dest, 8, block);
/* Absolute per-pixel difference between the two outputs. */
491 v = abs((int)img_dest[i] - (int)img_dest1[i]);
493 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* Dump of the reference output. */
502 printf(" %3d", img_dest1[i*8+j]);
/* Dump of the tested output. */
511 printf(" %3d", img_dest[i*8+j]);
517 printf("%s %s: err_inf=%d\n",
/* The condition is the constant 1, so the label is always "IDCT248". */
518 1 ? "IDCT248" : "DCT248",
/* Speed test: batches of NB_ITS_SPEED calls until at least 1000000
 * microseconds have elapsed. */
524 for(it=0;it<NB_ITS_SPEED;it++) {
527 // memcpy(block, block1, sizeof(DCTELEM) * 64);
528 // do not memcpy especially not fastmemcpy because it does movntq !!!
529 idct248_put(img_dest, 8, block);
532 ti1 = gettime() - ti;
533 } while (ti1 < 1000000);
/* it1 * 1000 / ti1, with ti1 in microseconds, gives thousands of calls
 * per second. */
536 printf("%s %s: %0.1f kdct/s\n",
537 1 ? "IDCT248" : "DCT248",
538 name, (double)it1 * 1000.0 / (double)ti1);
/* Print the command-line usage text to stdout.
 * NOTE(review): the synopsis line lists only -i and the test number, while
 * main() also accepts -4 and -h (getopt string "ih4") and reads a second
 * positional argument into bits. */
541 static void help(void)
543 printf("dct-test [-i] [<test-number>]\n"
544 "test-number 0 -> test with random matrixes\n"
545 " 1 -> test with random sparse matrixes\n"
546 " 2 -> do 3. test from mpeg4 std\n"
547 "-i test IDCT implementations\n"
548 "-4 test IDCT248 implementations\n");
/* Entry point: read CPU flags, set up cropTbl, parse the options and the
 * positional arguments, then run the selected tests.
 * NOTE(review): several statements (option handling, the lower part of the
 * cropTbl setup, the condition choosing between the two test kinds) are on
 * lines not visible here. */
551 int main(int argc, char **argv)
553 int test_idct = 0, test_248_dct = 0;
557 cpu_flags = av_get_cpu_flags();
/* Middle 256 entries of cropTbl map each value to itself. */
562 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
563 for(i=0;i<MAX_NEG_CROP;i++) {
/* Entries above the middle range saturate to 255. */
565 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* Accepted option letters: i, h and 4. */
569 c = getopt(argc, argv, "ih4");
/* First positional argument: test number; second: bits.
 * NOTE(review): atoi() reports no errors, so non-numeric input is
 * accepted silently. */
586 if(optind <argc) test= atoi(argv[optind]);
587 if(optind+1 < argc) bits= atoi(argv[optind+1]);
589 printf("ffmpeg DCT/IDCT test\n");
/* IDCT248 test; presumably run only when test_248_dct is set. */
592 idct248_error("SIMPLE-C", ff_simple_idct248_put);
/* Run every table entry whose direction matches test_idct and whose
 * required CPU flags are all present in cpu_flags. */
594 for (i=0;algos[i].name;i++)
595 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
596 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test, bits);