2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/* External (I)DCT entry points declared locally instead of via a header.
 * Most of them are referenced from the algos[] table below; every one takes
 * a pointer to a coefficient block and transforms it in place as far as the
 * signatures show (no separate output argument).
 * NOTE(review): the preprocessor guards that normally surround these groups
 * are not visible in this excerpt. */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
/* takes plain short rather than DCTELEM; not referenced in the visible code */
51 void odivx_idct_c(short *block);
/* bfin routines -- presumably the Blackfin port; confirm against the build guards */
54 void ff_bfin_idct(DCTELEM *block);
55 void ff_bfin_fdct(DCTELEM *block);
/* AltiVec: only a forward DCT is declared */
58 void fdct_altivec(DCTELEM *block);
59 //void idct_altivec(DCTELEM *block);?? no routine
/* ARM variants */
62 void ff_j_rev_dct_arm(DCTELEM *data);
63 void ff_simple_idct_arm(DCTELEM *data);
64 void ff_simple_idct_armv5te(DCTELEM *data);
65 void ff_simple_idct_armv6(DCTELEM *data);
66 void ff_simple_idct_neon(DCTELEM *data);
/* axp routine -- used by the "SIMPLE-ALPHA" table entry */
68 void ff_simple_idct_axp(DCTELEM *data);
/* Fields describing one algorithm under test.
 * NOTE(review): the enclosing struct header, the name field and the
 * mm_support field used by main() are not visible in this excerpt. */
/* FDCT (0) for a forward transform, IDCT (1) for an inverse transform */
72 enum { FDCT, IDCT } is_idct;
/* implementation under test */
73 void (*func)(DCTELEM *block);
/* reference implementation the result is compared against */
74 void (*ref) (DCTELEM *block);
/* coefficient layout / scaling convention of func; dct_error() uses it to
 * permute the input and, for SCALE_PERM, to rescale the output */
75 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
76 SSE2_PERM, PARTTRANS_PERM } format;
/* FAAN_SCALE is the format tag used by the "FAAN" entry of algos[]:
 * SCALE_PERM when FAAN_POSTSCALE is not defined, and presumably NO_PERM
 * otherwise (the #else / #endif lines are not visible in this excerpt). */
80 #ifndef FAAN_POSTSCALE
81 #define FAAN_SCALE SCALE_PERM
83 #define FAAN_SCALE NO_PERM
/* Table of implementations exercised by main().
 * Initializer order: name, is_idct (0 = forward DCT, 1 = IDCT), function
 * under test, reference function, format tag, and optionally an
 * AV_CPU_FLAG_* value -- this appears to be the mm_support field that main()
 * checks against the detected CPU flags (field order not visible here).
 * main() walks the table until it reaches an entry whose name is NULL.
 * NOTE(review): the per-architecture preprocessor guards and the table
 * terminator are not visible in this excerpt. */
88 struct algo algos[] = {
/* portable C implementations */
89 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM},
90 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE},
91 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM},
92 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM},
93 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
94 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM},
95 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM},
96 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM},
/* x86 forward DCTs, each gated on a CPU flag */
99 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX},
101 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_MMX2},
102 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_SSE2},
/* x86 inverse DCTs, each gated on a CPU flag */
106 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX},
107 {"LIBMPEG2-MMX2", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, AV_CPU_FLAG_MMX2},
109 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX},
110 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX},
111 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, AV_CPU_FLAG_MMX2},
112 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, AV_CPU_FLAG_SSE2},
/* AltiVec */
116 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, AV_CPU_FLAG_ALTIVEC},
/* bfin */
120 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM},
121 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM},
/* ARM */
125 {"SIMPLE-ARM", 1, ff_simple_idct_arm, ff_ref_idct, NO_PERM },
126 {"INT-ARM", 1, ff_j_rev_dct_arm, ff_ref_idct, MMX_PERM },
128 {"SIMPLE-ARMV5TE", 1, ff_simple_idct_armv5te, ff_ref_idct, NO_PERM },
131 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
134 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
136 #endif /* ARCH_ARM */
/* Alpha */
139 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM },
/* Fixed-point precision, in bits, used by dct_error() when it rescales the
 * output of SCALE_PERM implementations with ff_aanscales[]. */
145 #define AANSCALE_BITS 12
/* Clamp table: 256 identity entries with MAX_NEG_CROP extra entries on each
 * side; main() fills it (the upper extra entries are set to 255). */
147 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/**
 * Return the current wall-clock time in microseconds.
 *
 * The value is taken from gettimeofday(); callers in this file only use the
 * difference between two calls to time their benchmark loops.
 */
static int64_t gettime(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    /* widen the seconds count before scaling it to microseconds */
    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
}
/* Number of transform calls in one timed batch of the speed loops. */
157 #define NB_ITS_SPEED 50000
/* Input permutation for MMX_PERM implementations; filled at run time by
 * idct_mmx_init(). Entry i is the position in `block` that receives
 * coefficient i. */
159 static short idct_mmx_perm[64];
/* Fixed input permutation for MMX_SIMPLE_PERM implementations, used the same
 * way: coefficient i is stored at block[idct_simple_mmx_perm[i]]. */
161 static short idct_simple_mmx_perm[64] = {
162 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
163 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
164 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
165 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
166 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
167 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
168 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
169 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Column order within each row for SSE2_PERM implementations: dct_error()
 * keeps the row bits (i & 0x38) and maps the column (i & 7) through this. */
172 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
174 static void idct_mmx_init(void)
178 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
179 for (i = 0; i < 64; i++) {
180 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* Work buffers shared by the test routines (64 coefficients each):
 *   block     - receives the (possibly permuted) input and is the buffer
 *               passed to idct248_ref()/idct248_put() in idct248_error()
 *   block1    - holds the generated input; dct_error() also compares block
 *               against it when accumulating errors
 *   block_org - copy of the generated input made by dct_error()
 * block is declared with 16-byte alignment, the other two with 8. */
184 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
185 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
186 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
/* Execute the x86 `emms` instruction when the detected CPU flags include
 * AV_CPU_FLAG_MMX (emms leaves MMX state so that x87 floating point can be
 * used again).
 * NOTE(review): the braces and any preprocessor guard around the body are
 * not visible in this excerpt. */
188 static inline void mmx_emms(void)
191 if (cpu_flags & AV_CPU_FLAG_MMX)
192 __asm__ volatile ("emms\n\t");
/*
 * Compare one DCT/IDCT implementation against a reference and report on
 * stdout: first an accuracy line (err_inf, err2, syserr, maxout,
 * blockSumErr), then a throughput line in kdct/s.
 *
 *   name       label printed in both report lines
 *   is_idct    non-zero prints "IDCT", zero prints "DCT"
 *   fdct_func  implementation under test
 *   fdct_ref   reference implementation
 *   form       a formattag value; selects how the input is permuted into
 *              `block` and whether the output is rescaled
 *
 * NOTE(review): several lines of this function are not visible in this
 * excerpt (the last parameter, local declarations, the calls to
 * fdct_func / fdct_ref, braces and branch headers). The comments below
 * describe only the statements that are shown.
 */
196 static void dct_error(const char *name, int is_idct,
197 void (*fdct_func)(DCTELEM *block),
198 void (*fdct_ref)(DCTELEM *block), int form,
203 int64_t err2, ti, ti1, it1;
204 int64_t sysErr[64], sysErrMax = 0;
206 int blockSumErrMax = 0, blockSumErr;
/* constant seed, so the generated input sequence is the same on every run */
209 av_lfg_init(&prng, 1);
213 for (i = 0; i < 64; i++)
/* accuracy pass: NB_ITS generated blocks */
215 for (it = 0; it < NB_ITS; it++) {
216 for (i = 0; i < 64; i++)
/* input variant: all 64 coefficients generated (value mod 512, minus 256) */
220 for (i = 0; i < 64; i++)
221 block1[i] = (av_lfg_get(&prng) % 512) - 256;
224 for (i = 0; i < 64; i++)
/* input variant: 1..10 coefficients written at generated positions
 * (positions may repeat, so fewer distinct entries can result) */
229 int num = av_lfg_get(&prng) % 10 + 1;
230 for (i = 0; i < num; i++)
231 block1[av_lfg_get(&prng) % 64] =
232 av_lfg_get(&prng) % 512 - 256;
/* input variant: block1[0] generated, block1[63] set to the inverted low
 * bit of block1[0] */
236 block1[0] = av_lfg_get(&prng) % 4096 - 2048;
237 block1[63] = (block1[0] & 1) ^ 1;
/* keep a copy of the generated input */
241 for (i = 0; i < 64; i++)
242 block_org[i] = block1[i];
/* store the input into `block` in the order selected by `form` */
244 if (form == MMX_PERM) {
245 for (i = 0; i < 64; i++)
246 block[idct_mmx_perm[i]] = block1[i];
247 } else if (form == MMX_SIMPLE_PERM) {
248 for (i = 0; i < 64; i++)
249 block[idct_simple_mmx_perm[i]] = block1[i];
250 } else if (form == SSE2_PERM) {
251 for (i = 0; i < 64; i++)
252 block[(i & 0x38) | idct_sse2_row_perm[i & 7]] = block1[i];
253 } else if (form == PARTTRANS_PERM) {
254 for (i = 0; i < 64; i++)
255 block[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = block1[i];
257 for (i = 0; i < 64; i++)
258 block[i] = block1[i];
/* SCALE_PERM: rescale every coefficient of `block` with a fixed-point factor
 * derived from ff_aanscales[] */
264 if (form == SCALE_PERM) {
265 for (i = 0; i < 64; i++) {
266 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
267 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* per-coefficient comparison of `block` against `block1` */
274 for (i = 0; i < 64; i++) {
275 v = abs(block[i] - block1[i]);
/* signed error accumulated per coefficient position across all blocks */
279 sysErr[i] += block[i] - block1[i];
/* largest output magnitude seen */
281 if (abs(block[i]) > maxout)
282 maxout = abs(block[i]);
/* largest per-block error sum seen */
284 if (blockSumErrMax < blockSumErr)
285 blockSumErrMax = blockSumErr;
/* largest absolute accumulated error over the 64 positions */
287 for (i = 0; i < 64; i++)
288 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
/* dump the accumulated error of every position */
290 for (i = 0; i < 64; i++) {
293 printf("%7d ", (int) sysErr[i]);
/* accuracy summary; err2 and syserr are averaged over NB_ITS blocks */
297 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
298 is_idct ? "IDCT" : "DCT", name, err_inf,
299 (double) err2 / NB_ITS / 64.0, (double) sysErrMax / NB_ITS,
300 maxout, blockSumErrMax);
/* speed pass: build one input block for timing */
303 for (i = 0; i < 64; i++)
308 for (i = 0; i < 64; i++)
309 block1[i] = av_lfg_get(&prng) % 512 - 256;
312 for (i = 0; i < 64; i++)
318 block1[0] = av_lfg_get(&prng) % 512 - 256;
319 block1[1] = av_lfg_get(&prng) % 512 - 256;
320 block1[2] = av_lfg_get(&prng) % 512 - 256;
321 block1[3] = av_lfg_get(&prng) % 512 - 256;
/* NOTE(review): only MMX_PERM and MMX_SIMPLE_PERM are special-cased here;
 * SSE2_PERM and PARTTRANS_PERM, which the accuracy pass permutes, appear to
 * fall through to the plain copy -- confirm whether that is intended */
325 if (form == MMX_PERM) {
326 for (i = 0; i < 64; i++)
327 block[idct_mmx_perm[i]] = block1[i];
328 } else if (form == MMX_SIMPLE_PERM) {
329 for (i = 0; i < 64; i++)
330 block[idct_simple_mmx_perm[i]] = block1[i];
332 for (i = 0; i < 64; i++)
333 block[i] = block1[i];
/* timed batches of NB_ITS_SPEED iterations; the input is copied into `block`
 * on every iteration */
339 for (it = 0; it < NB_ITS_SPEED; it++) {
340 for (i = 0; i < 64; i++)
341 block[i] = block1[i];
/* repeat batches until at least one second (1000000 us) has elapsed */
345 ti1 = gettime() - ti;
346 } while (ti1 < 1000000);
/* it1 iterations in ti1 microseconds, reported in thousands per second */
349 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", name,
350 (double) it1 * 1000.0 / (double) ti1);
/* 8x8 byte output images for idct248_error(): img_dest receives the output
 * of the implementation under test, img_dest1 the output of idct248_ref(). */
353 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
354 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/*
 * Double-precision reference used by idct248_error() to check idct248 "put"
 * implementations.
 *
 *   dest      output image; an 8x8 area is written with row stride linesize
 *   linesize  distance in bytes between output rows
 *   block     64 input coefficients, read as 8 rows of 8
 *
 * Visible steps: build cosine tables, combine each pair of adjacent rows into
 * a scaled sum and difference, apply an 8-point transform along every row,
 * apply 4-point transforms down every column (even and odd rows separately),
 * then clamp, round and store.
 *
 * NOTE(review): local declarations, the initial value of `s` used by the
 * row-pair step, the table-initialisation guard and the lower clamp are not
 * visible in this excerpt.
 */
356 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* cosine tables; static, so they persist between calls */
359 static double c8[8][8];
360 static double c4[4][4];
361 double block1[64], block2[64], block3[64];
/* 8-point table: c8[i][j] = s * cos(pi * i * (j + 0.5) / 8),
 * with s = sqrt(1/8) for i == 0 and sqrt(1/4) otherwise */
368 for (i = 0; i < 8; i++) {
370 for (j = 0; j < 8; j++) {
371 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
372 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
373 sum += c8[i][j] * c8[i][j];
/* 4-point table: c4[i][j] = s * cos(pi * i * (j + 0.5) / 4),
 * with s = sqrt(1/4) for i == 0 and sqrt(1/2) otherwise */
377 for (i = 0; i < 4; i++) {
379 for (j = 0; j < 4; j++) {
380 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
381 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
382 sum += c4[i][j] * c4[i][j];
/* rows 2i and 2i+1 become their scaled sum (row 2i) and difference (row 2i+1) */
389 for (i = 0; i < 4; i++) {
390 for (j = 0; j < 8; j++) {
391 block1[8 * (2 * i) + j] =
392 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
393 block1[8 * (2 * i + 1) + j] =
394 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point transform along each row: block1 -> block2 */
399 for (i = 0; i < 8; i++) {
400 for (j = 0; j < 8; j++) {
402 for (k = 0; k < 8; k++)
403 sum += c8[k][j] * block1[8 * i + k];
404 block2[8 * i + j] = sum;
/* 4-point transform down each column i: block2 -> block3 */
409 for (i = 0; i < 8; i++) {
410 for (j = 0; j < 4; j++) {
/* even rows of block2 produce even rows of block3 */
413 for (k = 0; k < 4; k++)
414 sum += c4[k][j] * block2[8 * (2 * k) + i];
415 block3[8 * (2 * j) + i] = sum;
/* odd rows of block2 produce odd rows of block3 */
419 for (k = 0; k < 4; k++)
420 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
421 block3[8 * (2 * j + 1) + i] = sum;
425 /* clamp and store the result */
426 for (i = 0; i < 8; i++) {
427 for (j = 0; j < 8; j++) {
428 v = block3[8 * i + j];
430 else if (v > 255) v = 255;
/* round to nearest integer before storing as a byte */
431 dest[i * linesize + j] = (int) rint(v);
/*
 * Check an idct248 "put" implementation against idct248_ref() and report on
 * stdout: first the largest absolute per-pixel difference (err_inf), then a
 * throughput line in kdct/s.
 *
 *   name         label printed in both report lines
 *   idct248_put  implementation under test; called as (dest, line_size, block)
 *
 * NOTE(review): part of the parameter list, some local declarations, braces
 * and the err_max update are not visible in this excerpt.
 */
436 static void idct248_error(const char *name,
437 void (*idct248_put)(uint8_t *dest, int line_size,
440 int it, i, it1, ti, ti1, err_max, v;
/* constant seed, so the generated input sequence is the same on every run */
443 av_lfg_init(&prng, 1);
445 /* just one test to see if code is correct (precision is less
448 for (it = 0; it < NB_ITS; it++) {
449 /* XXX: use forward transform to generate values */
450 for (i = 0; i < 64; i++)
451 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* reference output goes to img_dest1; `block` is reloaded first because it
 * is the buffer handed to the transform */
454 for (i = 0; i < 64; i++)
455 block[i] = block1[i];
456 idct248_ref(img_dest1, 8, block);
/* output of the implementation under test goes to img_dest, same input */
458 for (i = 0; i < 64; i++)
459 block[i] = block1[i];
460 idct248_put(img_dest, 8, block);
/* compare the two 8x8 outputs pixel by pixel */
462 for (i = 0; i < 64; i++) {
463 v = abs((int) img_dest[i] - (int) img_dest1[i]);
465 printf("%d %d\n", img_dest[i], img_dest1[i]);
470 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* timed batches of NB_ITS_SPEED iterations; the input is copied into `block`
 * before every call */
475 for (it = 0; it < NB_ITS_SPEED; it++) {
476 for (i = 0; i < 64; i++)
477 block[i] = block1[i];
478 idct248_put(img_dest, 8, block);
/* repeat batches until the elapsed time reaches 1000000 us (one second) */
481 ti1 = gettime() - ti;
482 } while (ti1 < 1000000);
/* it1 iterations in ti1 microseconds, reported in thousands per second */
485 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
486 (double) it1 * 1000.0 / (double) ti1);
/**
 * Print the command-line usage summary to stdout.
 *
 * The synopsis line lists -4 alongside -i so that it agrees with the option
 * descriptions printed below it.
 */
static void help(void)
{
    printf("dct-test [-i] [-4] [<test-number>]\n"
           "test-number 0 -> test with random matrixes\n"
           "            1 -> test with random sparse matrixes\n"
           "            2 -> do 3. test from mpeg4 std\n"
           "-i          test IDCT implementations\n"
           "-4          test IDCT248 implementations\n");
}
499 int main(int argc, char **argv)
501 int test_idct = 0, test_248_dct = 0;
505 cpu_flags = av_get_cpu_flags();
510 for (i = 0; i < 256; i++)
511 cropTbl[i + MAX_NEG_CROP] = i;
512 for (i = 0; i < MAX_NEG_CROP; i++) {
514 cropTbl[i + MAX_NEG_CROP + 256] = 255;
518 c = getopt(argc, argv, "ih4");
536 test = atoi(argv[optind]);
538 printf("ffmpeg DCT/IDCT test\n");
541 idct248_error("SIMPLE-C", ff_simple_idct248_put);
543 for (i = 0; algos[i].name; i++)
544 if (algos[i].is_idct == test_idct &&
545 !(~cpu_flags & algos[i].mm_support)) {
546 dct_error(algos[i].name, algos[i].is_idct, algos[i].func,
547 algos[i].ref, algos[i].format, test);