2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/*
 * Forward declarations of transform routines that take a single pointer to
 * a coefficient block.  All of them except odivx_idct_c() are referenced by
 * fdct_tab[] / idct_tab[] below; odivx_idct_c() is declared but not used in
 * the code visible here.
 * NOTE(review): the name suffixes (mmx, bfin, altivec, arm, neon, axp)
 * suggest per-architecture implementations -- presumably each group is only
 * available on the matching build target; confirm against the build system.
 */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
51 void odivx_idct_c(short *block);
54 void ff_bfin_idct(DCTELEM *block);
55 void ff_bfin_fdct(DCTELEM *block);
58 void fdct_altivec(DCTELEM *block);
59 //void idct_altivec(DCTELEM *block);?? no routine
62 void ff_j_rev_dct_arm(DCTELEM *data);
63 void ff_simple_idct_arm(DCTELEM *data);
64 void ff_simple_idct_armv5te(DCTELEM *data);
65 void ff_simple_idct_armv6(DCTELEM *data);
66 void ff_simple_idct_neon(DCTELEM *data);
68 void ff_simple_idct_axp(DCTELEM *data);
/*
 * NOTE(review): these look like members of struct algo (the type of the
 * fdct_tab[] / idct_tab[] entries); the struct header and its remaining
 * members (name, mm_support, nonspec are used elsewhere) are not visible.
 */
/* Routine being exercised; it receives a pointer to a DCTELEM block. */
72 void (*func)(DCTELEM *block);
/*
 * Coefficient layout tag.  permute() reorders the input according to this
 * value, and dct_error() additionally rescales block[] when it is SCALE_PERM.
 */
73 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
74 SSE2_PERM, PARTTRANS_PERM } format;
/*
 * FAAN_SCALE is the format tag given to the "FAAN" entry of fdct_tab[].
 * It is SCALE_PERM when FAAN_POSTSCALE is not defined; the second definition
 * (NO_PERM) presumably belongs to an #else branch that is not visible here.
 */
79 #ifndef FAAN_POSTSCALE
80 #define FAAN_SCALE SCALE_PERM
82 #define FAAN_SCALE NO_PERM
/*
 * Forward-DCT implementations; main() walks this table when test_idct is 0.
 * Initializers appear to be, in order: display name, function, format tag
 * and -- where given -- a mask of AV_CPU_FLAG_* bits (mm_support).  main()
 * only runs an entry when every bit of mm_support is set in cpu_flags, so
 * entries that omit the mask are presumably zero-initialized and always run.
 */
87 static const struct algo fdct_tab[] = {
88 { "REF-DBL", ff_ref_fdct, NO_PERM },
89 { "FAAN", ff_faandct, FAAN_SCALE },
90 { "IJG-AAN-INT", fdct_ifast, SCALE_PERM },
91 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
94 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
95 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
96 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
100 { "altivecfdct", fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
104 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/*
 * Inverse-DCT implementations; main() walks this table when test_idct is
 * non-zero.  The initializer layout matches fdct_tab[].  A trailing 1
 * presumably sets the nonspec member: dct_error() tests
 * "spec_err && !dct->nonspec", so such entries are exempt from whatever
 * that condition triggers.
 */
110 static const struct algo idct_tab[] = {
111 { "FAANI", ff_faanidct, NO_PERM },
112 { "REF-DBL", ff_ref_idct, NO_PERM },
113 { "INT", j_rev_dct, MMX_PERM },
114 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
118 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
119 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
121 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
122 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
123 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
124 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
128 { "BFINidct", ff_bfin_idct, NO_PERM },
132 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
133 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
136 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
139 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
142 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
146 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/*
 * Fixed-point shift used by dct_error() when it rescales the output of a
 * SCALE_PERM transform with ff_aanscales[].
 */
152 #define AANSCALE_BITS 12
/*
 * Byte table with MAX_NEG_CROP extra entries on each side of a 256-entry
 * core.  main() fills the core with the identity (index i + MAX_NEG_CROP
 * holds i) and the upper margin with 255.
 */
154 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/*
 * Return the time reported by gettimeofday() as a single count of
 * microseconds (seconds * 1000000 + microseconds).
 */
156 static int64_t gettime(void)
159 gettimeofday(&tv, NULL);
/* tv_sec is widened to int64_t first so the multiplication is 64-bit. */
160 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
/*
 * Number of transform calls in one timed batch of the speed loops in
 * dct_error() and idct248_error().
 */
164 #define NB_ITS_SPEED 50000
/*
 * Destination index for each source coefficient when the format is
 * MMX_PERM; filled in by idct_mmx_init() and read by permute().
 */
166 static short idct_mmx_perm[64];
/*
 * Destination index for each source coefficient when the format is
 * MMX_SIMPLE_PERM: permute() stores src[i] at dst[idct_simple_mmx_perm[i]].
 */
168 static short idct_simple_mmx_perm[64] = {
169 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
170 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
171 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
172 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
173 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
174 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
175 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
176 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/*
 * Reordering of the low three index bits used for SSE2_PERM: permute()
 * keeps bits 3-5 of the source index (i & 0x38) and replaces the low three
 * bits (i & 7) with the value looked up here.
 */
179 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[] with the index mapping used for MMX_PERM. */
181 static void idct_mmx_init(void)
185 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
186 for (i = 0; i < 64; i++) {
/*
 * Bits 3-5 of i are kept; the low three bits are rotated right by one:
 * bits 2..1 move down to 1..0 and bit 0 moves up to bit 2.
 */
187 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/*
 * Two static 64-coefficient work buffers shared by dct_error() and
 * idct248_error().
 * NOTE(review): the first DECLARE_ALIGNED argument is presumably the
 * alignment in bytes (16 for block, 8 for block1) -- confirm in the macro's
 * definition.
 */
191 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
192 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/*
 * Execute the x86 "emms" instruction, but only when the runtime cpu_flags
 * report AV_CPU_FLAG_MMX.
 */
194 static inline void mmx_emms(void)
197 if (cpu_flags & AV_CPU_FLAG_MMX)
198 __asm__ volatile ("emms\n\t");
/*
 * Fill a 64-coefficient block with pseudo-random test data drawn from prng.
 * NOTE(review): the statements that choose between the fill patterns below
 * are not visible here; presumably `test` (and possibly `is_idct`) select
 * one of them, matching the test numbers listed by help().
 */
202 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
/* Start from an all-zero block. */
206 memset(block, 0, 64 * sizeof(*block));
/* Dense pattern: every coefficient gets (random % 512) - 256. */
210 for (i = 0; i < 64; i++)
211 block[i] = (av_lfg_get(prng) % 512) - 256;
214 for (i = 0; i < 64; i++)
/*
 * Sparse pattern: pick a count j in 1..10, then write j values of
 * (random % 512) - 256 at randomly chosen positions (positions may repeat).
 */
219 j = av_lfg_get(prng) % 10 + 1;
220 for (i = 0; i < j; i++)
221 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
/*
 * Two-coefficient pattern: block[0] is (random % 4096) - 2048 and block[63]
 * is set to the inverse of block[0]'s lowest bit (1 when block[0] is even,
 * 0 when it is odd).
 */
224 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
225 block[63] = (block[0] & 1) ^ 1;
/*
 * Copy the 64 coefficients of src into dst, placing each src[i] at the
 * index required by the layout `perm` (one of the enum formattag values).
 */
230 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
/* MMX_PERM: table computed by idct_mmx_init(). */
234 if (perm == MMX_PERM) {
235 for (i = 0; i < 64; i++)
236 dst[idct_mmx_perm[i]] = src[i];
/* MMX_SIMPLE_PERM: fixed lookup table. */
237 } else if (perm == MMX_SIMPLE_PERM) {
238 for (i = 0; i < 64; i++)
239 dst[idct_simple_mmx_perm[i]] = src[i];
/* SSE2_PERM: bits 3-5 kept, low three bits remapped by idct_sse2_row_perm. */
240 } else if (perm == SSE2_PERM) {
241 for (i = 0; i < 64; i++)
242 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/*
 * PARTTRANS_PERM: bits 2 and 5 are kept (i & 0x24); bits 0-1 and bits 3-4
 * trade places.
 */
243 } else if (perm == PARTTRANS_PERM) {
244 for (i = 0; i < 64; i++)
245 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/*
 * NOTE(review): the body of this last loop is not visible here; presumably
 * it is the fallback for the remaining formats -- confirm.
 */
247 for (i = 0; i < 64; i++)
/*
 * Measure the accuracy of one table entry over NB_ITS generated blocks,
 * print the statistics and then time the transform.
 *
 * dct     - table entry being tested (name, format and nonspec are read here)
 * test    - pattern selector forwarded to init_block()
 * is_idct - non-zero selects ff_ref_idct as reference and enables spec_err
 * speed   - NOTE(review): not used on any visible line; presumably gates the
 *           timing section
 *
 * The result is OR-ed into main()'s err.
 * NOTE(review): the calls to the reference and to dct->func, and the return
 * statements, are on lines not visible here.
 */
252 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
254 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
257 int64_t err2, ti, ti1, it1, err_sum = 0;
258 int64_t sysErr[64], sysErrMax = 0;
260 int blockSumErrMax = 0, blockSumErr;
/* Fixed seed, so every run and every algorithm sees the same sequence. */
265 av_lfg_init(&prng, 1);
269 for (i = 0; i < 64; i++)
271 for (it = 0; it < NB_ITS; it++) {
/* Generate the input in block1 and its reordered copy in block. */
272 init_block(block1, test, is_idct, &prng);
273 permute(block, block1, dct->format);
/*
 * SCALE_PERM transforms get each coefficient multiplied by
 * 8 * 2^(AANSCALE_BITS + 11) / ff_aanscales[i] and shifted right by
 * AANSCALE_BITS.
 */
278 if (dct->format == SCALE_PERM) {
279 for (i = 0; i < 64; i++) {
280 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
281 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/*
 * Compare the two buffers coefficient by coefficient -- presumably block
 * holds the tested output and block1 the reference output at this point.
 */
288 for (i = 0; i < 64; i++) {
289 int err = block[i] - block1[i];
/* Signed error accumulated per coefficient position over all iterations. */
295 sysErr[i] += block[i] - block1[i];
/* Largest output magnitude seen so far. */
297 if (abs(block[i]) > maxout)
298 maxout = abs(block[i]);
/* Largest per-block error sum seen so far. */
300 if (blockSumErrMax < blockSumErr)
301 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated error over the 64 positions. */
303 for (i = 0; i < 64; i++)
304 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
306 for (i = 0; i < 64; i++) {
309 printf("%7d ", (int) sysErr[i]);
/* Averages over all NB_ITS * 64 coefficients. */
313 omse = (double) err2 / NB_ITS / 64;
314 ome = (double) err_sum / NB_ITS / 64;
/*
 * Only inverse transforms are checked against the limits: peak error above
 * 1, omse above 0.02 or |ome| above 0.0015.
 * NOTE(review): these limits look like the IEEE 1180 IDCT accuracy
 * criteria -- confirm.
 */
316 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
318 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
319 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
320 omse, ome, (double) sysErrMax / NB_ITS,
321 maxout, blockSumErrMax);
/* Entries flagged nonspec are exempt from the limit check. */
323 if (spec_err && !dct->nonspec)
/* Timing section: one generated input, kept in permuted form in block1. */
330 init_block(block, test, is_idct, &prng);
331 permute(block1, block, dct->format);
/*
 * block is reloaded from block1 before each timed call; sizeof(block) is
 * the size of the whole 64-element static array.
 */
336 for (it = 0; it < NB_ITS_SPEED; it++) {
337 memcpy(block, block1, sizeof(block));
/* Keep running batches until at least 1000000 us have elapsed. */
341 ti1 = gettime() - ti;
342 } while (ti1 < 1000000);
/*
 * Calls per microsecond times 1000 gives thousands of calls per second
 * (it1 is presumably the total number of timed calls).
 */
345 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
346 (double) it1 * 1000.0 / (double) ti1);
/*
 * 64-byte pixel buffers used by idct248_error(): img_dest receives the
 * output of the function under test, img_dest1 the output of idct248_ref().
 */
351 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
352 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/*
 * Double-precision reference for the "248" inverse transform.
 *
 * dest     - output pixels; row i, column j is written to
 *            dest[i * linesize + j]
 * linesize - distance between output rows, in bytes
 * block    - 64 input coefficients, read as 8 rows of 8
 */
354 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine coefficient tables; static, so they persist between calls. */
357 static double c8[8][8];
358 static double c4[4][4];
359 double block1[64], block2[64], block3[64];
/*
 * 8-point table: c8[i][j] = s * cos(pi * i * (j + 0.5) / 8) with
 * s = sqrt(1/8) for i == 0 and sqrt(1/4) otherwise.  sum accumulates the
 * squared coefficients; its use is not visible here.
 */
366 for (i = 0; i < 8; i++) {
368 for (j = 0; j < 8; j++) {
369 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
370 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
371 sum += c8[i][j] * c8[i][j];
/*
 * 4-point table: c4[i][j] = s * cos(pi * i * (j + 0.5) / 4) with
 * s = sqrt(1/4) for i == 0 and sqrt(1/2) otherwise.
 */
375 for (i = 0; i < 4; i++) {
377 for (j = 0; j < 4; j++) {
378 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
379 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
380 sum += c4[i][j] * c4[i][j];
/*
 * Combine each pair of adjacent input rows (2i, 2i+1): the even row of
 * block1 gets their sum and the odd row their difference, both scaled by s
 * (the value given to s for this step is set on a line not visible here).
 */
387 for (i = 0; i < 4; i++) {
388 for (j = 0; j < 8; j++) {
389 block1[8 * (2 * i) + j] =
390 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
391 block1[8 * (2 * i + 1) + j] =
392 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point pass along each row: block2[i][j] = sum_k c8[k][j] * block1[i][k]. */
397 for (i = 0; i < 8; i++) {
398 for (j = 0; j < 8; j++) {
400 for (k = 0; k < 8; k++)
401 sum += c8[k][j] * block1[8 * i + k];
402 block2[8 * i + j] = sum;
/*
 * 4-point pass along each column i, applied separately to the even rows
 * (2k -> 2j) and to the odd rows (2k+1 -> 2j+1).
 */
407 for (i = 0; i < 8; i++) {
408 for (j = 0; j < 4; j++) {
411 for (k = 0; k < 4; k++)
412 sum += c4[k][j] * block2[8 * (2 * k) + i];
413 block3[8 * (2 * j) + i] = sum;
417 for (k = 0; k < 4; k++)
418 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
419 block3[8 * (2 * j + 1) + i] = sum;
423 /* clamp and store the result */
424 for (i = 0; i < 8; i++) {
425 for (j = 0; j < 8; j++) {
426 v = block3[8 * i + j];
/* Upper clamp at 255; the lower clamp is on a line not visible here. */
428 else if (v > 255) v = 255;
/* Round to an integer with rint() before storing the byte. */
429 dest[i * linesize + j] = (int) rint(v);
/*
 * Compare a "248" IDCT put function against idct248_ref() over NB_ITS
 * random blocks, print the largest pixel difference, then time the function.
 *
 * name        - label printed in the result lines
 * idct248_put - function under test; called as (dest, line_size, block)
 * main() passes a third argument (speed); that parameter and the end of the
 * prototype are on lines not visible here.
 */
434 static void idct248_error(const char *name,
435 void (*idct248_put)(uint8_t *dest, int line_size,
/*
 * NOTE(review): ti and ti1 are plain int here but int64_t in dct_error(),
 * while gettime() returns an int64_t microsecond count; storing it in an int
 * narrows the value -- consider int64_t for consistency.
 * NOTE(review): in this view the comment opened two lines below is only
 * closed at the end of the "XXX" comment, so the NB_ITS loop header between
 * them is lexically commented out; a line closing the first comment is
 * presumably missing.
 */
439 int it, i, it1, ti, ti1, err_max, v;
442 av_lfg_init(&prng, 1);
444 /* just one test to see if code is correct (precision is less
447 for (it = 0; it < NB_ITS; it++) {
448 /* XXX: use forward transform to generate values */
/* Random input: (random % 256) - 128 for every coefficient. */
449 for (i = 0; i < 64; i++)
450 block1[i] = av_lfg_get(&prng) % 256 - 128;
/*
 * block is reloaded from block1 before each of the two calls, so both see
 * the same input.
 */
453 for (i = 0; i < 64; i++)
454 block[i] = block1[i];
455 idct248_ref(img_dest1, 8, block);
457 for (i = 0; i < 64; i++)
458 block[i] = block1[i];
459 idct248_put(img_dest, 8, block);
/* Absolute per-pixel difference between tested and reference output. */
461 for (i = 0; i < 64; i++) {
462 v = abs((int) img_dest[i] - (int) img_dest1[i]);
464 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* The constant condition 1 always selects the "IDCT248" label. */
469 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing: batches of NB_ITS_SPEED calls, each on a fresh copy of block1. */
477 for (it = 0; it < NB_ITS_SPEED; it++) {
478 for (i = 0; i < 64; i++)
479 block[i] = block1[i];
480 idct248_put(img_dest, 8, block);
/* Keep running batches until at least 1000000 us have elapsed. */
483 ti1 = gettime() - ti;
484 } while (ti1 < 1000000);
487 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
488 (double) it1 * 1000.0 / (double) ti1);
/*
 * Print the command-line usage text to stdout: the optional test number
 * (0, 1 or 2) and the -i and -4 switches.
 * NOTE(review): main() passes "ih4t" to getopt(); no description of -t or
 * -h is visible in the text below.
 */
491 static void help(void)
493 printf("dct-test [-i] [<test-number>]\n"
494 "test-number 0 -> test with random matrixes\n"
495 " 1 -> test with random sparse matrixes\n"
496 " 2 -> do 3. test from mpeg4 std\n"
497 "-i test IDCT implementations\n"
498 "-4 test IDCT248 implementations\n"
/*
 * Program entry point: initialise the shared tables, parse the command
 * line, then run either the IDCT248 check or every applicable entry of the
 * selected algorithm table.
 * NOTE(review): the option-handling loop and the end of the function are
 * not visible here.
 */
502 int main(int argc, char **argv)
504 int test_idct = 0, test_248_dct = 0;
/* Runtime CPU capabilities; used below to filter table entries. */
510 cpu_flags = av_get_cpu_flags();
/* Core of cropTbl is the identity mapping 0..255. */
515 for (i = 0; i < 256; i++)
516 cropTbl[i + MAX_NEG_CROP] = i;
/* Upper margin saturates to 255. */
517 for (i = 0; i < MAX_NEG_CROP; i++) {
519 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* Accepted switches: -i, -h, -4 and -t, none taking an argument. */
523 c = getopt(argc, argv, "ih4t");
/*
 * First non-option argument is the test number.
 * NOTE(review): atoi() reports no conversion errors.
 */
544 test = atoi(argv[optind]);
546 printf("Libav DCT/IDCT test\n");
549 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
/* Table choice follows test_idct; iteration stops at an entry whose name is null. */
551 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
552 for (i = 0; algos[i].name; i++)
/* Run the entry only if no bit required by mm_support is missing from cpu_flags. */
553 if (!(~cpu_flags & algos[i].mm_support)) {
/* Results are OR-ed together, so any non-zero result is kept in err. */
554 err |= dct_error(&algos[i], test, test_idct, speed);