2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test. (c) 2001 Fabrice Bellard.
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/common.h"
37 #include "simple_idct.h"
38 #include "aandcttab.h"
41 #include "x86/idct_xvid.h"
/**
 * Plain-memcpy implementation of the fast_memcpy() symbol.
 *
 * Copies c bytes from b to a by delegating to the standard memcpy() and
 * returns memcpy()'s result, i.e. the destination pointer a.  The same
 * restrictions as memcpy() apply: the two regions must not overlap.
 *
 * @param a destination buffer, at least c bytes large
 * @param b source buffer, at least c bytes large
 * @param c number of bytes to copy
 * @return  a
 */
void *fast_memcpy(void *a, const void *b, size_t c)
{
    return memcpy(a, b, c);
}
48 /* reference fdct/idct */
49 void fdct(DCTELEM *block);
50 void idct(DCTELEM *block);
53 void ff_mmx_idct(DCTELEM *data);
54 void ff_mmxext_idct(DCTELEM *data);
56 void odivx_idct_c(short *block);
59 void ff_bfin_idct(DCTELEM *block);
60 void ff_bfin_fdct(DCTELEM *block);
63 void fdct_altivec(DCTELEM *block);
64 //void idct_altivec(DCTELEM *block);?? no routine
67 void j_rev_dct_ARM(DCTELEM *data);
68 void simple_idct_ARM(DCTELEM *data);
69 void simple_idct_armv5te(DCTELEM *data);
70 void ff_simple_idct_armv6(DCTELEM *data);
71 void ff_simple_idct_neon(DCTELEM *data);
75 enum { FDCT, IDCT } is_idct;
76 void (* func) (DCTELEM *block);
77 void (* ref) (DCTELEM *block);
78 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
82 #ifndef FAAN_POSTSCALE
83 #define FAAN_SCALE SCALE_PERM
85 #define FAAN_SCALE NO_PERM
/*
 * Table of the transforms that can be tested.
 *
 * Each initializer supplies, in order:
 *   - the name printed in the reports,
 *   - the is_idct flag (0 = FDCT, 1 = IDCT, matching the enum order),
 *   - func, the implementation under test,
 *   - ref, the reference it is compared against,
 *   - format, the input permutation / output scaling the implementation
 *     expects,
 *   - optionally a set of FF_MM_* CPU feature flags.
 *
 * main() reads the CPU flags through the mm_support member and skips an
 * entry unless all of its flags are present in cpu_flags.
 * NOTE(review): the member declaration is not visible in this excerpt;
 * presumably it is the sixth member, so entries that omit it are
 * zero-initialized and never skipped -- confirm against struct algo.
 *
 * The two "REF-DBL" entries run the reference against itself.
 */
90 struct algo algos[] = {
91 {"REF-DBL", 0, fdct, fdct, NO_PERM},
92 {"FAAN", 0, ff_faandct, fdct, FAAN_SCALE},
93 {"FAANI", 1, ff_faanidct, idct, NO_PERM},
94 {"IJG-AAN-INT", 0, fdct_ifast, fdct, SCALE_PERM},
95 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, fdct, NO_PERM},
96 {"REF-DBL", 1, idct, idct, NO_PERM},
97 {"INT", 1, j_rev_dct, idct, MMX_PERM},
98 {"SIMPLE-C", 1, ff_simple_idct, idct, NO_PERM},
101 {"MMX", 0, ff_fdct_mmx, fdct, NO_PERM, FF_MM_MMX},
103 {"MMX2", 0, ff_fdct_mmx2, fdct, NO_PERM, FF_MM_MMXEXT},
104 {"SSE2", 0, ff_fdct_sse2, fdct, NO_PERM, FF_MM_SSE2},
108 {"LIBMPEG2-MMX", 1, ff_mmx_idct, idct, MMX_PERM, FF_MM_MMX},
109 {"LIBMPEG2-MMXEXT", 1, ff_mmxext_idct, idct, MMX_PERM, FF_MM_MMXEXT},
111 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, idct, MMX_SIMPLE_PERM, FF_MM_MMX},
112 {"XVID-MMX", 1, ff_idct_xvid_mmx, idct, NO_PERM, FF_MM_MMX},
113 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, idct, NO_PERM, FF_MM_MMXEXT},
114 {"XVID-SSE2", 1, ff_idct_xvid_sse2, idct, SSE2_PERM, FF_MM_SSE2},
118 {"altivecfdct", 0, fdct_altivec, fdct, NO_PERM, FF_MM_ALTIVEC},
122 {"BFINfdct", 0, ff_bfin_fdct, fdct, NO_PERM},
123 {"BFINidct", 1, ff_bfin_idct, idct, NO_PERM},
127 {"SIMPLE-ARM", 1, simple_idct_ARM, idct, NO_PERM },
128 {"INT-ARM", 1, j_rev_dct_ARM, idct, MMX_PERM },
130 {"SIMPLE-ARMV5TE", 1, simple_idct_armv5te, idct, NO_PERM },
133 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, idct, MMX_PERM },
136 {"SIMPLE-NEON", 1, ff_simple_idct_neon, idct, PARTTRANS_PERM },
138 #endif /* ARCH_ARM */
143 #define AANSCALE_BITS 12
145 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/*
 * Return the current time of day, as reported by gettimeofday(),
 * expressed as a single 64-bit count of microseconds.
 */
147 int64_t gettime(void)
150 gettimeofday(&tv,NULL);
/* tv_sec is widened to 64 bits before the multiplication so that
 * seconds * 1000000 is computed in int64_t, then the microsecond
 * remainder is added. */
151 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
155 #define NB_ITS_SPEED 50000
157 static short idct_mmx_perm[64];
159 static short idct_simple_mmx_perm[64]={
160 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
161 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
162 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
163 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
164 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
165 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
166 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
167 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
170 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/*
 * Fill idct_mmx_perm[], the coefficient permutation applied to the input
 * of implementations tagged MMX_PERM.
 */
172 void idct_mmx_init(void)
176 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
177 for (i = 0; i < 64; i++) {
/* Bits 3-5 of the index (i & 0x38) are kept unchanged; the low three
 * bits are rotated right by one: bits 1-2 move down to bits 0-1 and
 * bit 0 moves up to bit 2. */
178 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
179 // idct_simple_mmx_perm[i] = simple_block_permute_op(i);
183 static DCTELEM block[64] __attribute__ ((aligned (16)));
184 static DCTELEM block1[64] __attribute__ ((aligned (8)));
185 static DCTELEM block_org[64] __attribute__ ((aligned (8)));
/*
 * Execute the x86 "emms" instruction, but only when the detected CPU
 * flags include FF_MM_MMX; otherwise do nothing.
 * NOTE(review): any preprocessor guard around the body is not visible in
 * this excerpt.
 */
187 static inline void mmx_emms(void)
190 if (cpu_flags & FF_MM_MMX)
191 __asm__ volatile ("emms\n\t");
/*
 * Accuracy and speed report for one 8x8 transform implementation.
 *
 * Over NB_ITS randomly generated blocks the function gathers error
 * statistics between the static buffers block[] and block1[] and prints
 * them, then times batches of NB_ITS_SPEED iterations and prints the
 * throughput.
 * NOTE(review): the calls to fdct_func / fdct_ref are not visible in this
 * excerpt; presumably block[] receives the output of fdct_func and
 * block1[] the output of fdct_ref -- confirm against the full file.
 *
 * name       label printed in both report lines
 * is_idct    non-zero prints "IDCT", zero prints "DCT"
 * fdct_func  implementation under test
 * fdct_ref   reference implementation
 * form       one of the formattag values; selects the input permutation
 *            and, for SCALE_PERM, the output rescaling
 * test       test number; the code that branches on it is not visible
 *            in this excerpt
 */
195 void dct_error(const char *name, int is_idct,
196 void (*fdct_func)(DCTELEM *block),
197 void (*fdct_ref)(DCTELEM *block), int form, int test)
201 int64_t err2, ti, ti1, it1;
202 int64_t sysErr[64], sysErrMax=0;
204 int blockSumErrMax=0, blockSumErr;
/* sysErr[i] accumulates the signed error of coefficient i over all
 * iterations; start from zero. */
210 for(i=0; i<64; i++) sysErr[i]=0;
211 for(it=0;it<NB_ITS;it++) {
/* Input generation.  Three variants are visible: a coefficient set to a
 * value in [-256, 255]; between 1 and 10 randomly placed coefficients in
 * that range; and a block1[0] in [-2048, 2047] with block1[63] set to
 * the complement of block1[0]'s low bit.  Which variant runs is decided
 * by code not visible here. */
217 block1[i] = (random() % 512) -256;
226 int num= (random()%10)+1;
228 block1[random()%64] = (random() % 512) -256;
231 block1[0]= (random()%4096)-2048;
232 block1[63]= (block1[0]&1)^1;
236 #if 0 // simulate mismatch control
241 if((sum&1)==0) block1[63]^=1;
/* Keep an untouched copy of the generated input. */
246 block_org[i]= block1[i];
/* Copy block1[] into block[], reordering the coefficients the way the
 * implementation selected by `form` expects them. */
248 if (form == MMX_PERM) {
250 block[idct_mmx_perm[i]] = block1[i];
251 } else if (form == MMX_SIMPLE_PERM) {
253 block[idct_simple_mmx_perm[i]] = block1[i];
255 } else if (form == SSE2_PERM) {
/* Row bits (i & 0x38) are kept, the column is remapped through
 * idct_sse2_row_perm[]. */
257 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
258 } else if (form == PARTTRANS_PERM) {
/* Bits 2 and 5 are kept; the low two column bits and the low two row
 * bits trade places. */
260 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
265 #if 0 // simulate mismatch control for tested IDCT but not the ref
270 if((sum&1)==0) block[63]^=1;
/* SCALE_PERM: multiply every coefficient of block[] by a per-position
 * factor derived from ff_aanscales[] and shift the product right by
 * AANSCALE_BITS (the rounding term is commented out). */
277 if (form == SCALE_PERM) {
278 for(i=0; i<64; i++) {
279 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
280 block[i] = (block[i] * scale /*+ (1<<(AANSCALE_BITS-1))*/) >> AANSCALE_BITS;
/* Per-coefficient statistics: v is the absolute difference, sysErr[]
 * collects the signed difference, maxout tracks the largest absolute
 * value seen in block[]. */
288 v = abs(block[i] - block1[i]);
292 sysErr[i] += block[i] - block1[i];
294 if( abs(block[i])>maxout) maxout=abs(block[i]);
/* Remember the largest per-block error sum seen so far. */
296 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
297 #if 0 // print different matrix pairs
301 if((i&7)==0) printf("\n");
302 printf("%4d ", block_org[i]);
305 if((i&7)==0) printf("\n");
306 printf("%4d ", block[i] - block1[i]);
/* Largest magnitude among the 64 accumulated signed errors. */
311 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
313 #if 1 // dump systematic errors
315 if(i%8==0) printf("\n");
316 printf("%5d ", (int)sysErr[i]);
/* Summary line: err2 is averaged over NB_ITS blocks of 64 coefficients,
 * sysErrMax over NB_ITS blocks. */
321 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
322 is_idct ? "IDCT" : "DCT",
323 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
/* Speed test input: values in [-256, 255]; two fill variants are
 * visible, and the condition choosing between them is not visible here. */
331 block1[i] = (random() % 512) -256;
341 block1[0] = (random() % 512) -256;
342 block1[1] = (random() % 512) -256;
343 block1[2] = (random() % 512) -256;
344 block1[3] = (random() % 512) -256;
348 if (form == MMX_PERM) {
350 block[idct_mmx_perm[i]] = block1[i];
351 } else if(form == MMX_SIMPLE_PERM) {
353 block[idct_simple_mmx_perm[i]] = block1[i];
/* Timing: run batches of NB_ITS_SPEED iterations and repeat until the
 * elapsed time measured with gettime() reaches 1000000 (one second when
 * gettime() counts microseconds). */
362 for(it=0;it<NB_ITS_SPEED;it++) {
365 // memcpy(block, block1, sizeof(DCTELEM) * 64);
366 // do not memcpy especially not fastmemcpy because it does movntq !!!
370 ti1 = gettime() - ti;
371 } while (ti1 < 1000000);
/* it1 * 1000 / ti1 is thousands of iterations per second when ti1 is in
 * microseconds. */
374 printf("%s %s: %0.1f kdct/s\n",
375 is_idct ? "IDCT" : "DCT",
376 name, (double)it1 * 1000.0 / (double)ti1);
380 static uint8_t img_dest[64] __attribute__ ((aligned (8)));
381 static uint8_t img_dest1[64] __attribute__ ((aligned (8)));
/*
 * Double-precision reference used to check the "248" inverse transform.
 *
 * Reads 64 int16_t coefficients from block and writes the result as
 * 8-bit samples to dest, where consecutive rows are linesize bytes apart.
 *
 * dest      output samples
 * linesize  distance in bytes between two output rows
 * block     64 input coefficients, addressed as block[8*row + column]
 */
383 void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine tables for the 8-point and 4-point passes; static, so they
 * persist across calls. */
386 static double c8[8][8];
387 static double c4[4][4];
388 double block1[64], block2[64], block3[64];
/* 8-point table: c8[i][j] = s * cos(pi * i * (j + 0.5) / 8), with
 * s = sqrt(1/8) for i == 0 and sqrt(1/4) otherwise.  The squares of the
 * entries are accumulated in sum. */
398 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
399 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
400 sum += c8[i][j] * c8[i][j];
/* 4-point table: same construction with 4 in place of 8, s = sqrt(1/4)
 * for i == 0 and sqrt(1/2) otherwise. */
407 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
408 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
409 sum += c4[i][j] * c4[i][j];
/* Butterfly on each pair of rows (2i, 2i+1): the even row receives the
 * scaled sum, the odd row the scaled difference.  The value of s used
 * here is assigned on a line not visible in this excerpt. */
418 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
419 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
/* 8-point pass: weighted sum over the eight entries of row i. */
428 sum += c8[k][j] * block1[8*i+k];
/* 4-point pass down column i, over the even rows only; results go to
 * the even rows of block3. */
439 sum += c4[k][j] * block2[8*(2*k)+i];
440 block3[8*(2*j)+i] = sum;
/* Same 4-point pass over the odd rows; results go to the odd rows. */
445 sum += c4[k][j] * block2[8*(2*k+1)+i];
446 block3[8*(2*j+1)+i] = sum;
450 /* clamp and store the result */
/* v is rounded to the nearest integer with rint() before the store. */
458 dest[i * linesize + j] = (int)rint(v);
/*
 * Accuracy and speed report for one "248" inverse-transform
 * implementation that writes 8-bit samples.
 *
 * For NB_ITS random blocks the output of idct248_put (in img_dest) is
 * compared sample by sample with the output of idct248_ref() (in
 * img_dest1), both written with a line size of 8; the result is printed
 * as err_inf.  Afterwards idct248_put is timed in batches of
 * NB_ITS_SPEED calls and the throughput is printed.
 *
 * name         label printed in both report lines
 * idct248_put  implementation under test
 */
463 void idct248_error(const char *name,
464 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
/* NOTE(review): ti and ti1 are plain int, while gettime() returns
 * int64_t (dct_error() declares its timers as int64_t).  Storing the
 * microsecond timestamp or the difference below in an int relies on
 * implementation-defined narrowing -- consider int64_t here as well. */
466 int it, i, it1, ti, ti1, err_max, v;
470 /* just one test to see if code is correct (precision is less
473 for(it=0;it<NB_ITS;it++) {
475 /* XXX: use forward transform to generate values */
/* Random input coefficient in [-128, 127]. */
477 block1[i] = (random() % 256) - 128;
/* Reference result goes to img_dest1, tested result to img_dest. */
482 idct248_ref(img_dest1, 8, block);
486 idct248_put(img_dest, 8, block);
/* Absolute per-sample difference between tested and reference output. */
489 v = abs((int)img_dest[i] - (int)img_dest1[i]);
491 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* Dumps of the reference block and of the tested block, row by row. */
500 printf(" %3d", img_dest1[i*8+j]);
509 printf(" %3d", img_dest[i*8+j]);
/* The constant condition always selects the "IDCT248" label. */
515 printf("%s %s: err_inf=%d\n",
516 1 ? "IDCT248" : "DCT248",
/* Timing: batches of NB_ITS_SPEED calls, repeated until the elapsed time
 * measured with gettime() reaches 1000000. */
522 for(it=0;it<NB_ITS_SPEED;it++) {
525 // memcpy(block, block1, sizeof(DCTELEM) * 64);
526 // do not memcpy especially not fastmemcpy because it does movntq !!!
527 idct248_put(img_dest, 8, block);
530 ti1 = gettime() - ti;
531 } while (ti1 < 1000000);
/* it1 * 1000 / ti1 is thousands of calls per second when ti1 is in
 * microseconds. */
534 printf("%s %s: %0.1f kdct/s\n",
535 1 ? "IDCT248" : "DCT248",
536 name, (double)it1 * 1000.0 / (double)ti1);
541 printf("dct-test [-i] [<test-number>]\n"
542 "test-number 0 -> test with random matrixes\n"
543 " 1 -> test with random sparse matrixes\n"
544 " 2 -> do 3. test from mpeg4 std\n"
545 "-i test IDCT implementations\n"
546 "-4 test IDCT248 implementations\n");
549 int main(int argc, char **argv)
551 int test_idct = 0, test_248_dct = 0;
554 cpu_flags = mm_support();
559 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
560 for(i=0;i<MAX_NEG_CROP;i++) {
562 cropTbl[i + MAX_NEG_CROP + 256] = 255;
566 c = getopt(argc, argv, "ih4");
583 if(optind <argc) test= atoi(argv[optind]);
585 printf("ffmpeg DCT/IDCT test\n");
588 idct248_error("SIMPLE-C", ff_simple_idct248_put);
590 for (i=0;algos[i].name;i++)
591 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
592 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test);