2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/* Forward declarations of externally defined DCT/IDCT routines. Each takes
 * a pointer to a coefficient block and returns nothing; most of them are
 * listed in the fdct_tab / idct_tab tables further down.
 * NOTE(review): the number at the start of every line and the gaps between
 * those numbers suggest this text is a lossy extract. Any preprocessor
 * guards that separated these groups are not visible here -- confirm
 * against the pristine file before building. */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
51 void odivx_idct_c(short *block);
54 void ff_bfin_idct(DCTELEM *block);
55 void ff_bfin_fdct(DCTELEM *block);
58 void fdct_altivec(DCTELEM *block);
59 //void idct_altivec(DCTELEM *block);?? no routine
62 void ff_j_rev_dct_arm(DCTELEM *data);
63 void ff_simple_idct_arm(DCTELEM *data);
64 void ff_simple_idct_armv5te(DCTELEM *data);
65 void ff_simple_idct_armv6(DCTELEM *data);
66 void ff_simple_idct_neon(DCTELEM *data);
68 void ff_simple_idct_axp(DCTELEM *data);
/* Two members of the per-algorithm descriptor: the transform entry point
 * and a tag naming the coefficient layout that entry point expects
 * (permute() and dct_error() branch on this tag).
 * NOTE(review): presumably these belong to `struct algo`; the opening line
 * of the struct and the other members used elsewhere in this file (name,
 * mm_support, nonspec) are not visible in this extract. */
72 void (*func)(DCTELEM *block);
73 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
74 SSE2_PERM, PARTTRANS_PERM } format;
/* FAAN_SCALE is the layout tag used for the "FAAN" entry of fdct_tab.
 * It is SCALE_PERM when FAAN_POSTSCALE is not defined.
 * NOTE(review): the second definition (NO_PERM) presumably sits in an
 * #else branch; the #else / #endif lines are not visible here. */
79 #ifndef FAAN_POSTSCALE
80 #define FAAN_SCALE SCALE_PERM
82 #define FAAN_SCALE NO_PERM
/* Table of forward DCT implementations under test. Each entry gives a
 * display name, the function to call, the layout tag of its output and,
 * for CPU-specific versions, the AV_CPU_FLAG_* bit it requires.
 * NOTE(review): main() walks this table until it finds an entry whose name
 * is null, so a terminating entry is required; neither it, the closing
 * brace, nor any architecture guards around the entries are visible in
 * this extract. */
87 static const struct algo fdct_tab[] = {
88 { "REF-DBL", ff_ref_fdct, NO_PERM },
89 { "FAAN", ff_faandct, FAAN_SCALE },
90 { "IJG-AAN-INT", fdct_ifast, SCALE_PERM },
91 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
94 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
95 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
96 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
100 { "altivecfdct", fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
104 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* Table of inverse DCT implementations under test. Each entry gives a
 * display name, the function to call, the input layout it expects (see
 * permute()) and, where present, the required AV_CPU_FLAG_* bit.
 * Some entries carry a trailing 1.
 * NOTE(review): the trailing 1 presumably sets the `nonspec` member that
 * dct_error() reads to tolerate out-of-spec precision -- confirm against
 * the struct definition, which is not visible here. The terminating
 * entry, closing brace and any architecture guards are also not visible. */
110 static const struct algo idct_tab[] = {
111 { "FAANI", ff_faanidct, NO_PERM },
112 { "REF-DBL", ff_ref_idct, NO_PERM },
113 { "INT", j_rev_dct, MMX_PERM },
114 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
118 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
119 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
121 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
122 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
123 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
124 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
128 { "BFINidct", ff_bfin_idct, NO_PERM },
132 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
133 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
136 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
139 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
142 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
146 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* AANSCALE_BITS is the fixed-point shift used by dct_error() when it
 * rescales the output of SCALE_PERM transforms. */
152 #define AANSCALE_BITS 12
/* Clamping lookup table with MAX_NEG_CROP guard entries on each side of
 * the 256 identity entries; it is filled in by main(). */
154 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Return the current time reported by gettimeofday() in microseconds
 * (seconds * 1000000 + microseconds), widened to int64_t before the
 * multiplication so the product cannot overflow a 32-bit tv_sec.
 * NOTE(review): the function braces and the declaration of `tv` are not
 * visible in this extract. */
156 static int64_t gettime(void)
159 gettimeofday(&tv, NULL);
160 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
/* Number of transform calls per timing batch in the speed loops of
 * dct_error() and idct248_error(). */
164 #define NB_ITS_SPEED 50000
/* Destination index for each source coefficient under MMX_PERM; filled at
 * run time by idct_mmx_init(). */
166 static short idct_mmx_perm[64];
/* Destination index for each source coefficient under MMX_SIMPLE_PERM
 * (64 entries, used by permute()).
 * NOTE(review): the closing brace of this initializer is not visible in
 * this extract. */
168 static short idct_simple_mmx_perm[64] = {
169 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
170 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
171 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
172 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
173 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
174 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
175 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
176 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Column remapping applied within every row under SSE2_PERM; indexed by
 * the low three bits of the source position in permute(). */
179 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[]: for each of the 64 positions the row bits (0x38)
 * are kept and the three column bits are rotated right by one (bits 1-2
 * move down one place, bit 0 moves up to bit 2).
 * NOTE(review): the braces and the declaration of `i` are not visible in
 * this extract. */
181 static void idct_mmx_init(void)
185 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
186 for (i = 0; i < 64; i++) {
187 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* Shared 64-coefficient work buffers used by dct_error() and
 * idct248_error(). `block` is declared with 16-byte alignment and
 * `block1` with 8-byte alignment. */
191 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
192 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/* Execute the `emms` instruction when cpu_flags reports
 * AV_CPU_FLAG_MMX; otherwise do nothing.
 * NOTE(review): the inline asm is x86-only. Any preprocessor guard around
 * it, and the function braces, are not visible in this extract -- confirm
 * before building for other targets. */
194 static inline void mmx_emms(void)
197 if (cpu_flags & AV_CPU_FLAG_MMX)
198 __asm__ volatile ("emms\n\t");
/* Fill a 64-coefficient block with test data drawn from `prng`.
 *
 * The block is zeroed first. The visible fill patterns are:
 *   - all 64 entries set to a random value reduced modulo 2*vals and
 *     shifted down by vals;
 *   - between 1 and 10 randomly chosen entries set the same way, leaving
 *     the rest zero;
 *   - entry 0 set to a random value reduced modulo 16*vals and shifted
 *     down by 8*vals, with entry 63 set to the inverse of entry 0's low
 *     bit.
 *
 * NOTE(review): the lines that choose between these patterns (presumably
 * keyed on `test`), any use of `is_idct`, the local declarations and the
 * body of the second 64-entry loop are not visible in this extract. */
202 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
206 memset(block, 0, 64 * sizeof(*block));
210 for (i = 0; i < 64; i++)
211 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
214 for (i = 0; i < 64; i++)
219 j = av_lfg_get(prng) % 10 + 1;
220 for (i = 0; i < j; i++)
221 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
224 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
225 block[63] = (block[0] & 1) ^ 1;
/* Copy the 64 coefficients of `src` into `dst`, reordered into the layout
 * named by `perm` (one of the enum formattag values):
 *   MMX_PERM        - destination index taken from idct_mmx_perm[];
 *   MMX_SIMPLE_PERM - destination index taken from idct_simple_mmx_perm[];
 *   SSE2_PERM       - row kept, column remapped via idct_sse2_row_perm[];
 *   PARTTRANS_PERM  - bits 2 and 5 of the index kept, the low two column
 *                     bits exchanged with the low two row bits.
 * NOTE(review): the body of the final loop (the branch for all other
 * tags, presumably a straight copy) and the function braces are not
 * visible in this extract. */
230 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
234 if (perm == MMX_PERM) {
235 for (i = 0; i < 64; i++)
236 dst[idct_mmx_perm[i]] = src[i];
237 } else if (perm == MMX_SIMPLE_PERM) {
238 for (i = 0; i < 64; i++)
239 dst[idct_simple_mmx_perm[i]] = src[i];
240 } else if (perm == SSE2_PERM) {
241 for (i = 0; i < 64; i++)
242 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
243 } else if (perm == PARTTRANS_PERM) {
244 for (i = 0; i < 64; i++)
245 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
247 for (i = 0; i < 64; i++)
/* Measure the accuracy, and optionally the speed, of one transform.
 *
 * dct     - table entry describing the implementation under test
 * test    - input pattern selector, forwarded to init_block()
 * is_idct - non-zero to compare against ff_ref_idct, zero for ff_ref_fdct
 * speed   - NOTE(review): presumably enables the timing section; the line
 *           that tests it is not visible
 * bits    - input range exponent; init_block() receives 1 << bits
 *
 * Visible flow: NB_ITS random blocks are generated with a PRNG seeded
 * with 1, permuted into the layout the implementation expects, and the
 * result is compared element by element with `block1`. Per-position signed
 * error (sysErr), the largest absolute output (maxout) and the largest
 * per-block error sum (blockSumErrMax) are tracked and printed.
 *
 * NOTE(review): this extract is missing many lines, including the calls
 * to dct->func and to the reference transform, the accumulation of
 * err_inf / err2 / err_sum / blockSumErr, several declarations and the
 * return statements. The comments below describe only what is visible. */
252 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
/* Reference implementation matching the direction under test. */
254 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
257 int64_t err2, ti, ti1, it1, err_sum = 0;
258 int64_t sysErr[64], sysErrMax = 0;
260 int blockSumErrMax = 0, blockSumErr;
262 const int vals=1<<bits;
/* Fixed seed so that every run sees the same input sequence. */
266 av_lfg_init(&prng, 1);
/* NOTE(review): body of this loop is not visible (presumably it clears
 * sysErr[], which is accumulated with += below). */
270 for (i = 0; i < 64; i++)
272 for (it = 0; it < NB_ITS; it++) {
273 init_block(block1, test, is_idct, &prng, vals);
274 permute(block, block1, dct->format);
/* SCALE_PERM implementations leave their output scaled; divide that
 * factor back out using ff_aanscales[] in AANSCALE_BITS fixed point. */
279 if (dct->format == SCALE_PERM) {
280 for (i = 0; i < 64; i++) {
281 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
282 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Compare the tested output (block) with block1 position by position. */
289 for (i = 0; i < 64; i++) {
290 int err = block[i] - block1[i];
296 sysErr[i] += block[i] - block1[i];
298 if (abs(block[i]) > maxout)
299 maxout = abs(block[i]);
301 if (blockSumErrMax < blockSumErr)
302 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated error over the 64 positions. */
304 for (i = 0; i < 64; i++)
305 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
307 for (i = 0; i < 64; i++) {
310 printf("%7d ", (int) sysErr[i]);
/* Mean squared error and mean error per coefficient over all blocks. */
314 omse = (double) err2 / NB_ITS / 64;
315 ome = (double) err_sum / NB_ITS / 64;
/* Precision limits are enforced for inverse transforms only.
 * NOTE(review): the thresholds look like the IEEE 1180-1990 limits --
 * confirm. */
317 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
319 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
320 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
321 omse, ome, (double) sysErrMax / NB_ITS,
322 maxout, blockSumErrMax);
/* An out-of-spec result counts only when the table entry is not flagged
 * nonspec. NOTE(review): the statement guarded by this test is not
 * visible (presumably it returns a failure code). */
324 if (spec_err && !dct->nonspec)
/* Timing section: prepare one permuted input in block1, then repeatedly
 * restore it into block in batches of NB_ITS_SPEED until at least
 * 1000000 microseconds have elapsed. */
332 init_block(block, test, is_idct, &prng, vals);
333 permute(block1, block, dct->format);
338 for (it = 0; it < NB_ITS_SPEED; it++) {
339 memcpy(block, block1, sizeof(block));
343 ti1 = gettime() - ti;
344 } while (ti1 < 1000000);
/* it1 calls in ti1 microseconds, reported as thousands of calls per
 * second. */
347 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
348 (double) it1 * 1000.0 / (double) ti1);
/* 8x8 pixel output buffers for the IDCT248 test: img_dest receives the
 * output of the implementation under test, img_dest1 the output of
 * idct248_ref(). Both are declared with 8-byte alignment. */
353 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
354 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Double-precision reference for the 2-4-8 inverse DCT.
 *
 * dest     - output pixels, written as dest[i * linesize + j]
 * linesize - distance in bytes between output rows
 * block    - 64 input coefficients, read only
 *
 * Visible stages: build the 8-point and 4-point cosine tables; combine
 * each pair of adjacent input rows into a scaled sum and difference;
 * apply the 8-point transform along every row; apply the 4-point
 * transform down every column separately for even and odd rows; clamp,
 * round and store.
 *
 * NOTE(review): this extract is missing lines, including local
 * declarations, resets of `sum`, the value assigned to `s` before the
 * butterfly, any guard that makes the table setup run once, and the
 * lower-bound half of the clamp. */
356 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine basis tables; static so their contents persist between calls. */
359 static double c8[8][8];
360 static double c4[4][4];
361 double block1[64], block2[64], block3[64];
/* 8-point DCT basis: row 0 is scaled by sqrt(1/8), the rest by
 * sqrt(1/4). */
368 for (i = 0; i < 8; i++) {
370 for (j = 0; j < 8; j++) {
371 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
372 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
373 sum += c8[i][j] * c8[i][j];
/* 4-point DCT basis: row 0 is scaled by sqrt(1/4), the rest by
 * sqrt(1/2). */
377 for (i = 0; i < 4; i++) {
379 for (j = 0; j < 4; j++) {
380 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
381 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
382 sum += c4[i][j] * c4[i][j];
/* Butterfly: rows 2i and 2i+1 become their scaled sum and difference. */
389 for (i = 0; i < 4; i++) {
390 for (j = 0; j < 8; j++) {
391 block1[8 * (2 * i) + j] =
392 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
393 block1[8 * (2 * i + 1) + j] =
394 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point inverse transform along each row. */
399 for (i = 0; i < 8; i++) {
400 for (j = 0; j < 8; j++) {
402 for (k = 0; k < 8; k++)
403 sum += c8[k][j] * block1[8 * i + k];
404 block2[8 * i + j] = sum;
/* 4-point inverse transform down each column: first over the even rows,
 * then over the odd rows. */
409 for (i = 0; i < 8; i++) {
410 for (j = 0; j < 4; j++) {
413 for (k = 0; k < 4; k++)
414 sum += c4[k][j] * block2[8 * (2 * k) + i];
415 block3[8 * (2 * j) + i] = sum;
419 for (k = 0; k < 4; k++)
420 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
421 block3[8 * (2 * j + 1) + i] = sum;
425 /* clamp and store the result */
426 for (i = 0; i < 8; i++) {
427 for (j = 0; j < 8; j++) {
428 v = block3[8 * i + j];
430 else if (v > 255) v = 255;
431 dest[i * linesize + j] = (int) rint(v);
/* Compare a 2-4-8 IDCT "put" implementation with idct248_ref() and
 * report its speed.
 *
 * name        - label printed in the result lines
 * idct248_put - implementation under test; called as
 *               idct248_put(img_dest, 8, block)
 *
 * Visible flow: with a PRNG seeded with 1, NB_ITS blocks of values in
 * [-128, 127] are generated; each is run through the reference (into
 * img_dest1) and through the implementation under test (into img_dest),
 * and the absolute pixel differences are examined. The largest is printed
 * as err_inf. The timing section then calls the implementation in batches
 * of NB_ITS_SPEED until at least 1000000 microseconds have elapsed.
 *
 * NOTE(review): this extract is missing lines, including the rest of the
 * parameter list (main() passes a third `speed` argument), the tracking
 * of err_max, the loops around the two matrix dumps and the start of the
 * timing loop. The comment opened on the "just one test" line has lost
 * its closing line, so as written it swallows the following `for`
 * statement -- restore it from the pristine file. */
436 static void idct248_error(const char *name,
437 void (*idct248_put)(uint8_t *dest, int line_size,
441 int it, i, it1, ti, ti1, err_max, v;
444 av_lfg_init(&prng, 1);
446 /* just one test to see if code is correct (precision is less
449 for (it = 0; it < NB_ITS; it++) {
450 /* XXX: use forward transform to generate values */
451 for (i = 0; i < 64; i++)
452 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* The input is copied into `block` before each call so that block1 keeps
 * the original coefficients for the second run. */
455 for (i = 0; i < 64; i++)
456 block[i] = block1[i];
457 idct248_ref(img_dest1, 8, block);
459 for (i = 0; i < 64; i++)
460 block[i] = block1[i];
461 idct248_put(img_dest, 8, block);
463 for (i = 0; i < 64; i++) {
464 v = abs((int) img_dest[i] - (int) img_dest1[i]);
466 printf("%d %d\n", img_dest[i], img_dest1[i]);
475 printf(" %3d", img_dest1[i*8+j]);
484 printf(" %3d", img_dest[i*8+j]);
490 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
498 for (it = 0; it < NB_ITS_SPEED; it++) {
499 for (i = 0; i < 64; i++)
500 block[i] = block1[i];
501 idct248_put(img_dest, 8, block);
504 ti1 = gettime() - ti;
505 } while (ti1 < 1000000);
/* it1 calls in ti1 microseconds, reported as thousands of calls per
 * second. */
508 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
509 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage summary to standard output.
 * NOTE(review): main() passes the option string "ih4t" to getopt(), but
 * no description of -t or -h is visible in this text. The end of the
 * printf call and the function braces are not visible in this extract. */
512 static void help(void)
514 printf("dct-test [-i] [<test-number>] [<bits>]\n"
515 "test-number 0 -> test with random matrixes\n"
516 " 1 -> test with random sparse matrixes\n"
517 " 2 -> do 3. test from mpeg4 std\n"
518 "bits Number of time domain bits to use, 8 is default\n"
519 "-i test IDCT implementations\n"
520 "-4 test IDCT248 implementations\n"
/* Program entry point.
 *
 * Visible flow: read the CPU capability flags, fill cropTbl[], parse the
 * options accepted by getopt() with the string "ih4t", read the optional
 * test number and bit depth from the remaining arguments, then either run
 * the IDCT248 check or run dct_error() over every table entry the CPU
 * supports, OR-ing the results into `err`.
 *
 * NOTE(review): this extract is missing lines, including the option
 * switch, the checks that guard the positional arguments, several
 * declarations and the return statement; the function is also cut off at
 * the end of the visible text. */
524 int main(int argc, char **argv)
526 int test_idct = 0, test_248_dct = 0;
533 cpu_flags = av_get_cpu_flags();
/* Middle of cropTbl: identity mapping for values 0..255. */
538 for (i = 0; i < 256; i++)
539 cropTbl[i + MAX_NEG_CROP] = i;
/* Guard region above 255 saturates to 255. NOTE(review): the matching
 * store for the region below 0 is not visible. */
540 for (i = 0; i < MAX_NEG_CROP; i++) {
542 cropTbl[i + MAX_NEG_CROP + 256] = 255;
546 c = getopt(argc, argv, "ih4t");
/* Positional arguments: test number, then optionally the bit depth. */
567 test = atoi(argv[optind]);
568 if(optind+1 < argc) bits= atoi(argv[optind+1]);
570 printf("ffmpeg DCT/IDCT test\n");
573 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
575 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
/* Walk the table until an entry with a null name. An entry is run only
 * when every bit of its mm_support is also set in cpu_flags. */
576 for (i = 0; algos[i].name; i++)
577 if (!(~cpu_flags & algos[i].mm_support)) {
578 err |= dct_error(&algos[i], test, test_idct, speed, bits);