2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/* Prototypes for architecture-specific transform routines that are listed in
 * the fdct_tab / idct_tab tables of this file.  Each one takes a pointer to
 * a DCTELEM block and returns nothing.
 * NOTE(review): presumably each group is wrapped in a per-architecture
 * preprocessor guard that is not visible in this excerpt -- confirm. */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
52 void ff_bfin_idct(DCTELEM *block);
53 void ff_bfin_fdct(DCTELEM *block);
56 void ff_fdct_altivec(DCTELEM *block);
57 //void ff_idct_altivec(DCTELEM *block);?? no routine
60 void ff_j_rev_dct_arm(DCTELEM *data);
61 void ff_simple_idct_arm(DCTELEM *data);
62 void ff_simple_idct_armv5te(DCTELEM *data);
63 void ff_simple_idct_armv6(DCTELEM *data);
64 void ff_simple_idct_neon(DCTELEM *data);
66 void ff_simple_idct_axp(DCTELEM *data);
/* Members of the per-implementation descriptor (struct algo).
 * func:   the transform under test; receives a pointer to a DCTELEM block.
 * format: coefficient layout expected by the transform.  permute() reorders
 *         the input according to this value, and dct_error() rescales the
 *         output when it is SCALE_PERM.
 * NOTE(review): other members used elsewhere in this file (name, mm_support,
 * nonspec) are not visible in this excerpt. */
70 void (*func)(DCTELEM *block);
71 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
72 SSE2_PERM, PARTTRANS_PERM } format;
/* FAAN_SCALE is the format value used for the "FAAN" entry of fdct_tab:
 * SCALE_PERM when FAAN_POSTSCALE is not defined.
 * NOTE(review): the second definition (NO_PERM) presumably sits in an #else
 * branch that is not visible in this excerpt -- confirm. */
77 #ifndef FAAN_POSTSCALE
78 #define FAAN_SCALE SCALE_PERM
80 #define FAAN_SCALE NO_PERM
/* Table of forward DCT implementations that main() runs when the IDCT test
 * is not selected.  Each initializer starts with the name printed in the
 * results, the function to call and the format (see permute()).
 * NOTE(review): the optional fourth value looks like the CPU flag checked
 * against mm_support in main(); the field order is not visible here. */
85 static const struct algo fdct_tab[] = {
86 { "REF-DBL", ff_ref_fdct, NO_PERM },
87 { "FAAN", ff_faandct, FAAN_SCALE },
88 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
89 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
92 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
93 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
94 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
98 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
102 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* Table of inverse DCT implementations that main() runs when the IDCT test
 * is selected.  Each initializer starts with the name printed in the
 * results, the function to call and the input permutation it expects
 * (applied by permute()).
 * NOTE(review): the optional fourth and fifth values look like the required
 * CPU flag (mm_support) and the nonspec marker that suppresses the spec
 * failure in dct_error(); the field order is not visible here -- confirm. */
108 static const struct algo idct_tab[] = {
109 { "FAANI", ff_faanidct, NO_PERM },
110 { "REF-DBL", ff_ref_idct, NO_PERM },
111 { "INT", ff_j_rev_dct, MMX_PERM },
112 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
116 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
117 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
119 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
120 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
121 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
122 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
126 { "BFINidct", ff_bfin_idct, NO_PERM },
130 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
131 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
134 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
137 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
140 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
144 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* Fixed-point precision (in bits) used by dct_error() when it rescales the
 * output of SCALE_PERM transforms with the ff_aanscales[] factors. */
150 #define AANSCALE_BITS 12
/* Return the current time of day in microseconds, built from the seconds
 * and microseconds reported by gettimeofday().  The cast to int64_t makes
 * the multiplication by 1000000 happen in 64-bit arithmetic. */
152 static int64_t gettime(void)
155 gettimeofday(&tv, NULL);
156 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
/* Number of transform calls in one batch of the speed-measurement loops of
 * dct_error() and idct248_error(). */
160 #define NB_ITS_SPEED 50000
/* Destination index for each source coefficient under MMX_PERM; filled at
 * run time by idct_mmx_init(). */
162 static short idct_mmx_perm[64];
/* Destination index for each source coefficient under MMX_SIMPLE_PERM
 * (see permute()). */
164 static short idct_simple_mmx_perm[64] = {
165 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
166 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
167 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
168 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
169 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
170 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
171 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
172 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Column remapping used by permute() for SSE2_PERM: the low three index
 * bits are replaced by this table's entry, the row bits are kept. */
175 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[] with the index mapping used for MMX_PERM inputs.
 * For each index i the row bits (0x38) are kept, column bits 1-2 move down
 * to bits 0-1 and column bit 0 moves up to bit 2. */
177 static void idct_mmx_init(void)
181 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
182 for (i = 0; i < 64; i++) {
183 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* Static 64-coefficient work buffers shared by dct_error() and
 * idct248_error(); block is declared with 16-byte alignment, block1 with
 * 8-byte alignment. */
187 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
188 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/* Issue the x86 `emms` instruction when the detected CPU flags include MMX;
 * does nothing otherwise.
 * NOTE(review): presumably the body is compiled only on x86 builds; the
 * guard is not visible in this excerpt. */
190 static inline void mmx_emms(void)
193 if (cpu_flags & AV_CPU_FLAG_MMX)
194 __asm__ volatile ("emms\n\t");
/* Fill a 64-coefficient block with test input drawn from prng.
 *
 * block:   destination, cleared to zero first.
 * test:    test-pattern selector (see help()).
 * is_idct: non-zero when the block feeds an inverse transform.
 * prng:    lagged-Fibonacci generator state supplying the random values.
 *
 * NOTE(review): the statements that choose between the fill patterns below
 * are not visible in this excerpt, so which pattern belongs to which test
 * number cannot be confirmed from here. */
198 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
202 memset(block, 0, 64 * sizeof(*block));
/* dense pattern: every coefficient gets a value derived from prng % 512,
 * offset by -256 */
206 for (i = 0; i < 64; i++)
207 block[i] = (av_lfg_get(prng) % 512) - 256;
210 for (i = 0; i < 64; i++)
/* sparse pattern: 1..10 writes to randomly chosen positions */
215 j = av_lfg_get(prng) % 10 + 1;
216 for (i = 0; i < j; i++)
217 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
/* pattern touching only the first and last coefficient: block[63] is set
 * to the inverse of the lowest bit of block[0] */
220 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
221 block[63] = (block[0] & 1) ^ 1;
/* Copy src into dst, reordering the 64 coefficients into the layout that
 * the transform under test expects.
 *
 * dst:  destination block.
 * src:  source block in natural order.
 * perm: one of the enum formattag values; selects the mapping below.
 *
 * NOTE(review): the body of the final fallback loop is not visible in this
 * excerpt; presumably it is a straight copy -- confirm. */
226 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
230 if (perm == MMX_PERM) {
231 for (i = 0; i < 64; i++)
232 dst[idct_mmx_perm[i]] = src[i];
233 } else if (perm == MMX_SIMPLE_PERM) {
234 for (i = 0; i < 64; i++)
235 dst[idct_simple_mmx_perm[i]] = src[i];
236 } else if (perm == SSE2_PERM) {
/* keep the row bits, remap the column through idct_sse2_row_perm[] */
237 for (i = 0; i < 64; i++)
238 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
239 } else if (perm == PARTTRANS_PERM) {
/* keep index bits 2 and 5, swap bits 0-1 with bits 3-4 */
240 for (i = 0; i < 64; i++)
241 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
243 for (i = 0; i < 64; i++)
/* Measure the accuracy, and optionally the speed, of one transform
 * implementation against the double-precision reference.
 *
 * dct:     descriptor of the implementation under test.
 * test:    test-pattern selector passed on to init_block().
 * is_idct: non-zero to compare against ff_ref_idct, zero for ff_ref_fdct.
 * speed:   NOTE(review): presumably enables the timing loop at the end; the
 *          statement that tests it is not visible in this excerpt.
 *
 * Prints per-coefficient systematic errors and a summary line, then a
 * throughput line from the timing loop.
 * NOTE(review): the calls to dct->func / ref, several accumulations and the
 * return statements are not visible in this excerpt; main() ORs the return
 * value into its error status. */
248 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
250 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
253 int64_t err2, ti, ti1, it1, err_sum = 0;
254 int64_t sysErr[64], sysErrMax = 0;
256 int blockSumErrMax = 0, blockSumErr;
/* fixed seed, so every run sees the same input sequence */
261 av_lfg_init(&prng, 1);
265 for (i = 0; i < 64; i++)
267 for (it = 0; it < NB_ITS; it++) {
/* natural-order input in block1, permuted copy for the tested code in block */
268 init_block(block1, test, is_idct, &prng);
269 permute(block, block1, dct->format);
/* SCALE_PERM transforms leave their output scaled; rescale it with the
 * ff_aanscales[] factors before comparing */
274 if (dct->format == SCALE_PERM) {
275 for (i = 0; i < 64; i++) {
276 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
277 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* compare tested output (block) with the other buffer (block1) per
 * coefficient */
284 for (i = 0; i < 64; i++) {
285 int err = block[i] - block1[i];
291 sysErr[i] += block[i] - block1[i];
293 if (abs(block[i]) > maxout)
294 maxout = abs(block[i]);
296 if (blockSumErrMax < blockSumErr)
297 blockSumErrMax = blockSumErr;
/* largest absolute accumulated per-coefficient error */
299 for (i = 0; i < 64; i++)
300 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
302 for (i = 0; i < 64; i++) {
305 printf("%7d ", (int) sysErr[i]);
/* averages over all iterations and all 64 coefficients */
309 omse = (double) err2 / NB_ITS / 64;
310 ome = (double) err_sum / NB_ITS / 64;
/* only inverse transforms are checked against accuracy limits.
 * NOTE(review): the thresholds look like the IEEE 1180 IDCT limits --
 * confirm. */
312 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
314 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
315 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
316 omse, ome, (double) sysErrMax / NB_ITS,
317 maxout, blockSumErrMax);
/* implementations flagged nonspec are exempt from the limit check */
319 if (spec_err && !dct->nonspec)
/* speed test: one fixed input, restored from block1 before every call */
326 init_block(block, test, is_idct, &prng);
327 permute(block1, block, dct->format);
332 for (it = 0; it < NB_ITS_SPEED; it++) {
333 memcpy(block, block1, sizeof(block));
/* keep running batches until at least one second (1000000 us) has elapsed */
337 ti1 = gettime() - ti;
338 } while (ti1 < 1000000);
/* it1 calls in ti1 microseconds -> thousands of calls per second */
341 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
342 (double) it1 * 1000.0 / (double) ti1);
/* 8x8 pixel output buffers for the IDCT248 test: img_dest receives the
 * output of the implementation under test, img_dest1 the output of
 * idct248_ref() (see idct248_error()). */
347 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
348 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Double-precision reference for the "248" inverse transform: an 8-point
 * inverse DCT along each row followed by two 4-point inverse DCTs along
 * each column (one over the even rows, one over the odd rows).
 *
 * dest:     output pixels, written as bytes.
 * linesize: distance in bytes between two output rows.
 * block:    64 input coefficients, row-major.
 *
 * NOTE(review): the one-time guard around the table setup, the value of the
 * butterfly scale `s` and the lower clamp are not visible in this excerpt. */
350 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
353 static double c8[8][8];
354 static double c4[4][4];
355 double block1[64], block2[64], block3[64];
/* 8-point DCT basis: row 0 scaled by sqrt(1/8), the others by sqrt(1/4) */
362 for (i = 0; i < 8; i++) {
364 for (j = 0; j < 8; j++) {
365 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
366 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
367 sum += c8[i][j] * c8[i][j];
/* 4-point DCT basis: row 0 scaled by sqrt(1/4), the others by sqrt(1/2) */
371 for (i = 0; i < 4; i++) {
373 for (j = 0; j < 4; j++) {
374 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
375 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
376 sum += c4[i][j] * c4[i][j];
/* butterfly on adjacent row pairs: even row <- scaled sum, odd row <-
 * scaled difference */
383 for (i = 0; i < 4; i++) {
384 for (j = 0; j < 8; j++) {
385 block1[8 * (2 * i) + j] =
386 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
387 block1[8 * (2 * i + 1) + j] =
388 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point inverse transform along every row */
393 for (i = 0; i < 8; i++) {
394 for (j = 0; j < 8; j++) {
396 for (k = 0; k < 8; k++)
397 sum += c8[k][j] * block1[8 * i + k];
398 block2[8 * i + j] = sum;
/* 4-point inverse transform along every column, separately for the even
 * rows and for the odd rows */
403 for (i = 0; i < 8; i++) {
404 for (j = 0; j < 4; j++) {
407 for (k = 0; k < 4; k++)
408 sum += c4[k][j] * block2[8 * (2 * k) + i];
409 block3[8 * (2 * j) + i] = sum;
413 for (k = 0; k < 4; k++)
414 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
415 block3[8 * (2 * j + 1) + i] = sum;
419 /* clamp and store the result */
420 for (i = 0; i < 8; i++) {
421 for (j = 0; j < 8; j++) {
422 v = block3[8 * i + j];
424 else if (v > 255) v = 255;
/* round to nearest integer before storing as a byte */
425 dest[i * linesize + j] = (int) rint(v);
/* Compare an IDCT248 "put" implementation against idct248_ref() on random
 * input and print the result, followed by a throughput measurement.
 *
 * name:        label printed in the result lines.
 * idct248_put: implementation under test; writes pixels to dest with the
 *              given line size from a coefficient block.
 *
 * NOTE(review): the remaining parameters and the statement that gates the
 * timing loop are not visible in this excerpt.
 * NOTE(review): ti and ti1 are declared int below, while gettime() returns
 * int64_t microseconds; storing it in an int is a narrowing conversion.
 * dct_error() uses int64_t for the same variables -- consider matching it. */
430 static void idct248_error(const char *name,
431 void (*idct248_put)(uint8_t *dest, int line_size,
435 int it, i, it1, ti, ti1, err_max, v;
/* fixed seed, so every run sees the same input sequence */
438 av_lfg_init(&prng, 1);
440 /* just one test to see if code is correct (precision is less
443 for (it = 0; it < NB_ITS; it++) {
444 /* XXX: use forward transform to generate values */
445 for (i = 0; i < 64; i++)
446 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* the input is copied into block again before each call, so both the
 * reference and the tested code start from the same coefficients */
449 for (i = 0; i < 64; i++)
450 block[i] = block1[i];
451 idct248_ref(img_dest1, 8, block);
453 for (i = 0; i < 64; i++)
454 block[i] = block1[i];
455 idct248_put(img_dest, 8, block);
/* per-pixel absolute difference between tested and reference output */
457 for (i = 0; i < 64; i++) {
458 v = abs((int) img_dest[i] - (int) img_dest1[i]);
460 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* the constant condition always selects "IDCT248" */
465 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
473 for (it = 0; it < NB_ITS_SPEED; it++) {
474 for (i = 0; i < 64; i++)
475 block[i] = block1[i];
476 idct248_put(img_dest, 8, block);
/* keep running batches until at least one second (1000000 us) has elapsed */
479 ti1 = gettime() - ti;
480 } while (ti1 < 1000000);
483 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
484 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage text to standard output.
 * NOTE(review): the end of the printf call is not visible in this excerpt;
 * main() passes "ih4t" to getopt, so further option lines may follow. */
487 static void help(void)
489 printf("dct-test [-i] [<test-number>]\n"
490 "test-number 0 -> test with random matrixes\n"
491 " 1 -> test with random sparse matrixes\n"
492 " 2 -> do 3. test from mpeg4 std\n"
493 "-i test IDCT implementations\n"
494 "-4 test IDCT248 implementations\n"
/* Program entry point: reads the CPU flags, parses the command line, prints
 * a banner and runs either the IDCT248 check or every applicable entry of
 * the selected transform table.
 *
 * NOTE(review): option handling, the declarations of test / speed / err and
 * the return statement are not visible in this excerpt. */
498 int main(int argc, char **argv)
500 int test_idct = 0, test_248_dct = 0;
506 cpu_flags = av_get_cpu_flags();
/* accepted option letters: i, h, 4 and t */
512 c = getopt(argc, argv, "ih4t");
/* first non-option argument is the test number */
533 test = atoi(argv[optind]);
535 printf("Libav DCT/IDCT test\n");
538 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
540 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
/* walk the table until an entry without a name; run an entry only when
 * every bit of its mm_support mask is present in cpu_flags */
541 for (i = 0; algos[i].name; i++)
542 if (!(~cpu_flags & algos[i].mm_support)) {
543 err |= dct_error(&algos[i], test, test_idct, speed);