2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/* Prototypes for transform routines, most of which are referenced by
 * fdct_tab / idct_tab further down. Each takes a pointer to the coefficient
 * block it works on and returns nothing.
 * NOTE(review): judging by the names these are per-architecture
 * implementations (MMX, Blackfin, AltiVec, ARM, Alpha); any preprocessor
 * guards around the groups are not visible in this excerpt -- confirm. */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
51 void odivx_idct_c(short *block);
54 void ff_bfin_idct(DCTELEM *block);
55 void ff_bfin_fdct(DCTELEM *block);
58 void fdct_altivec(DCTELEM *block);
59 //void idct_altivec(DCTELEM *block);?? no routine
62 void ff_j_rev_dct_arm(DCTELEM *data);
63 void ff_simple_idct_arm(DCTELEM *data);
64 void ff_simple_idct_armv5te(DCTELEM *data);
65 void ff_simple_idct_armv6(DCTELEM *data);
66 void ff_simple_idct_neon(DCTELEM *data);
68 void ff_simple_idct_axp(DCTELEM *data);
/* Members of the per-algorithm descriptor used by fdct_tab and idct_tab.
 * NOTE(review): the line that opens the struct and its other members are not
 * visible in this excerpt; name, mm_support and nonspec are read elsewhere
 * in this file. */
/* the transform under test */
72 void (*func)(DCTELEM *block);
/* coefficient layout / scaling convention of the transform: permute()
 * reorders the input according to this tag and dct_error() rescales the
 * output when it is SCALE_PERM */
73 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
74 SSE2_PERM, PARTTRANS_PERM } format;
/* Format tag used by the "FAAN" entry of fdct_tab: SCALE_PERM when
 * FAAN_POSTSCALE is not defined.
 * NOTE(review): the directives separating the two definitions are not
 * visible in this excerpt; presumably the second definition (NO_PERM) is the
 * alternative branch -- confirm. */
79 #ifndef FAAN_POSTSCALE
80 #define FAAN_SCALE SCALE_PERM
82 #define FAAN_SCALE NO_PERM
/* Forward-DCT implementations offered to the test driver.
 * Each entry gives, in order: the name printed in reports, the function to
 * call, the format tag handed to permute(), and optionally the CPU flag(s)
 * the implementation needs. main() walks the table until it meets an entry
 * with a null name and skips entries whose flags are not all present in
 * cpu_flags.
 * NOTE(review): the terminating entry and any preprocessor guards around the
 * architecture-specific entries are not visible in this excerpt. */
87 static const struct algo fdct_tab[] = {
88 { "REF-DBL", ff_ref_fdct, NO_PERM },
89 { "FAAN", ff_faandct, FAAN_SCALE },
90 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
91 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
/* entries below carry a CPU-flag requirement as their fourth field */
94 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
95 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
96 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
100 { "altivecfdct", fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
104 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* Inverse-DCT implementations offered to the test driver; same entry layout
 * as fdct_tab (name, function, format tag, optional CPU flags).
 * NOTE(review): some entries carry a fifth field set to 1; presumably this
 * is the nonspec member that dct_error() consults before reporting a
 * failure -- confirm against the struct definition.
 * NOTE(review): the terminating entry and any preprocessor guards around the
 * architecture-specific entries are not visible in this excerpt. */
110 static const struct algo idct_tab[] = {
111 { "FAANI", ff_faanidct, NO_PERM },
112 { "REF-DBL", ff_ref_idct, NO_PERM },
113 { "INT", j_rev_dct, MMX_PERM },
114 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
118 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
119 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
121 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
122 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
123 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
124 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
128 { "BFINidct", ff_bfin_idct, NO_PERM },
132 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
133 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
136 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
139 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
142 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
146 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* Number of fractional bits in the fixed-point factor dct_error() applies
 * when it rescales the output of a SCALE_PERM transform. */
152 #define AANSCALE_BITS 12
/**
 * Return the current wall-clock time in microseconds.
 *
 * The value comes from gettimeofday(). The seconds field is widened to
 * 64 bits before it is scaled so that the multiplication cannot overflow a
 * narrower time type. The return value of gettimeofday() is not checked;
 * callers only use differences between two readings for benchmarking.
 */
static int64_t gettime(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
}
/* Number of loop iterations in one timing batch of the speed tests in
 * dct_error() and idct248_error(). */
162 #define NB_ITS_SPEED 50000
/* Destination index for each natural-order coefficient when a transform uses
 * MMX_PERM; filled in at run time by idct_mmx_init() and read by permute(). */
static short idct_mmx_perm[64];

/* Destination index for each natural-order coefficient when a transform uses
 * MMX_SIMPLE_PERM: permute() stores src[i] at dst[idct_simple_mmx_perm[i]]. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Column remapping applied inside every row of 8 for SSE2_PERM: permute()
 * keeps the row bits of the index and replaces the column by this table. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

/**
 * Fill idct_mmx_perm.
 *
 * For every index the row bits (0x38) are kept and the three column bits are
 * rotated: bits 2..1 move down by one position and bit 0 moves up to bit 2.
 * Must run before permute() is used with MMX_PERM, since the table is
 * otherwise all zeros.
 */
static void idct_mmx_init(void)
{
    int i;

    /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
    for (i = 0; i < 64; i++) {
        idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
    }
}
/* Two 64-coefficient work buffers shared by dct_error() and idct248_error():
 * one holds the data handed to the transform under test, the other the data
 * it is compared against.
 * NOTE(review): the first DECLARE_ALIGNED argument is presumably the
 * requested byte alignment (16 and 8) -- confirm against the macro. */
189 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
190 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/* Execute the x86 "emms" instruction when the detected CPU flags include
 * AV_CPU_FLAG_MMX; does nothing otherwise.
 * NOTE(review): presumably called after an MMX transform so that later
 * floating-point code sees a clean FPU state; the call sites and any
 * architecture guard around the inline assembly are not visible in this
 * excerpt -- confirm. */
192 static inline void mmx_emms(void)
195 if (cpu_flags & AV_CPU_FLAG_MMX)
196 __asm__ volatile ("emms\n\t");
/* Fill a 64-coefficient block with test input drawn from prng.
 *
 * block   - destination; cleared to zero first
 * test    - selects the input pattern; the usage text in help() lists
 *           0 = random, 1 = random sparse, 2 = "3. test from mpeg4 std"
 * is_idct - non-zero when the block will feed an inverse transform
 * prng    - random generator state, advanced by every draw
 *
 * NOTE(review): the statements that dispatch on test and is_idct, and the
 * body of the second 64-element loop, are not visible in this excerpt; the
 * comments below describe only the statements that are shown. */
200 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
204 memset(block, 0, 64 * sizeof(*block));
/* dense pattern: every coefficient is a random draw reduced modulo 512 and
 * offset by -256 */
208 for (i = 0; i < 64; i++)
209 block[i] = (av_lfg_get(prng) % 512) - 256;
212 for (i = 0; i < 64; i++)
/* sparse pattern: between 1 and 10 writes at random positions (positions may
 * repeat).
 * NOTE(review): the index and the value come from two av_lfg_get() calls in
 * one assignment expression, so the order of the two draws is unspecified
 * and may differ between compilers. */
217 j = av_lfg_get(prng) % 10 + 1;
218 for (i = 0; i < j; i++)
219 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
/* two-coefficient pattern: a random first coefficient (draw modulo 4096,
 * offset by -2048) and a last coefficient equal to the inverse of its lowest
 * bit */
222 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
223 block[63] = (block[0] & 1) ^ 1;
/* Copy the 64 coefficients of src into dst, reordered into the layout named
 * by perm (one of the formattag values).
 *
 * dst  - receives the reordered coefficients; must not alias src, because
 *        elements are scattered to their new positions one by one
 * src  - coefficients in natural order
 * perm - layout tag of the transform that will consume dst
 *
 * NOTE(review): the body of the final loop is not visible in this excerpt;
 * presumably it is a plain copy for the remaining tags -- confirm. */
228 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
232 if (perm == MMX_PERM) {
/* table built by idct_mmx_init() */
233 for (i = 0; i < 64; i++)
234 dst[idct_mmx_perm[i]] = src[i];
235 } else if (perm == MMX_SIMPLE_PERM) {
236 for (i = 0; i < 64; i++)
237 dst[idct_simple_mmx_perm[i]] = src[i];
238 } else if (perm == SSE2_PERM) {
/* keep the row (bits 3..5), remap the column through the row table */
239 for (i = 0; i < 64; i++)
240 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
241 } else if (perm == PARTTRANS_PERM) {
/* keep bits 2 and 5, exchange the low two column bits with the low two row
 * bits: each 4x4 quadrant is transposed in place */
242 for (i = 0; i < 64; i++)
243 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
245 for (i = 0; i < 64; i++)
/* Measure the accuracy, and report the speed, of one transform from
 * fdct_tab / idct_tab, using the double-precision reference transform
 * selected below as the yardstick.
 *
 * dct     - table entry under test (name, format and nonspec are read here)
 * test    - forwarded to init_block() to select the input pattern
 * is_idct - non-zero when an inverse transform is being tested
 * speed   - NOTE(review): presumably enables the timing section at the end;
 *           the statement that tests it is not visible in this excerpt
 *
 * NOTE(review): many lines of this function are not visible in this excerpt
 * (local declarations, the calls to the transform and to the reference,
 * several loop bodies, the return statements). The comments below describe
 * only the statements that are shown. */
250 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
/* reference implementation matching the direction under test */
252 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
255 int64_t err2, ti, ti1, it1, err_sum = 0;
256 int64_t sysErr[64], sysErrMax = 0;
258 int blockSumErrMax = 0, blockSumErr;
/* constant seed, so every run draws the same sequence of test blocks */
263 av_lfg_init(&prng, 1);
267 for (i = 0; i < 64; i++)
269 for (it = 0; it < NB_ITS; it++) {
/* block1 receives the input in natural order, block the same data in the
 * layout the transform under test expects */
270 init_block(block1, test, is_idct, &prng);
271 permute(block, block1, dct->format);
/* SCALE_PERM output is rescaled per coefficient: multiply by
 * 2^26 / ff_aanscales[i], then drop AANSCALE_BITS fractional bits */
276 if (dct->format == SCALE_PERM) {
277 for (i = 0; i < 64; i++) {
278 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
279 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* per-coefficient difference between the two buffers; NOTE(review): by this
 * point block1 presumably holds the reference output -- the call is not
 * visible */
286 for (i = 0; i < 64; i++) {
287 int err = block[i] - block1[i];
/* signed error accumulated per coefficient position across all blocks */
293 sysErr[i] += block[i] - block1[i];
/* largest output magnitude seen */
295 if (abs(block[i]) > maxout)
296 maxout = abs(block[i]);
/* largest per-block error sum seen */
298 if (blockSumErrMax < blockSumErr)
299 blockSumErrMax = blockSumErr;
/* worst accumulated error over the 64 positions */
301 for (i = 0; i < 64; i++)
302 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
304 for (i = 0; i < 64; i++) {
307 printf("%7d ", (int) sysErr[i]);
/* err2 and err_sum averaged over all NB_ITS * 64 coefficients;
 * NOTE(review): presumably the mean squared error and the mean signed
 * error -- their accumulation is not visible */
311 omse = (double) err2 / NB_ITS / 64;
312 ome = (double) err_sum / NB_ITS / 64;
/* accuracy limits are enforced for inverse transforms only;
 * NOTE(review): the thresholds look like IEEE 1180-style limits -- confirm */
314 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
316 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
317 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
318 omse, ome, (double) sysErrMax / NB_ITS,
319 maxout, blockSumErrMax);
/* entries flagged nonspec are not held to the limits above */
321 if (spec_err && !dct->nonspec)
/* timing section: prepare one permuted input block in block1 */
328 init_block(block, test, is_idct, &prng);
329 permute(block1, block, dct->format);
/* one batch: restore the input before each of NB_ITS_SPEED iterations */
334 for (it = 0; it < NB_ITS_SPEED; it++) {
335 memcpy(block, block1, sizeof(block));
/* repeat batches until at least one second (gettime() counts microseconds)
 * has elapsed */
339 ti1 = gettime() - ti;
340 } while (ti1 < 1000000);
/* it1 calls in ti1 microseconds, expressed as thousands of calls per second */
343 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
344 (double) it1 * 1000.0 / (double) ti1);
/* 8x8 pixel output buffers for the IDCT248 test: idct248_error() writes the
 * result of the implementation under test to img_dest and the result of
 * idct248_ref() to img_dest1, then compares them byte by byte. */
349 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
350 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Double-precision reference for the 2-4-8 inverse DCT: transforms the 64
 * coefficients in block and stores the clamped, rounded result as bytes.
 *
 * dest     - output pixels; element (i, j) is written at dest[i * linesize + j]
 * linesize - distance in bytes between two output rows
 * block    - 64 input coefficients, read only
 *
 * NOTE(review): several lines are not visible in this excerpt (the
 * declarations of i, j, k, s, sum and v, the resets of sum, any one-time
 * guard around the table setup, the value of s used by the butterfly, and
 * the lower clamp). The comments below describe only what is shown. */
352 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* cosine basis tables for the 8-point and 4-point transforms; static, so
 * they keep their contents between calls */
355 static double c8[8][8];
356 static double c4[4][4];
357 double block1[64], block2[64], block3[64];
/* 8-point basis: c8[i][j] = s * cos(pi * i * (j + 0.5) / 8), with
 * s = sqrt(1/8) for i == 0 and sqrt(1/4) otherwise */
364 for (i = 0; i < 8; i++) {
366 for (j = 0; j < 8; j++) {
367 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
368 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
369 sum += c8[i][j] * c8[i][j];
/* 4-point basis: c4[i][j] = s * cos(pi * i * (j + 0.5) / 4), with
 * s = sqrt(1/4) for i == 0 and sqrt(1/2) otherwise */
373 for (i = 0; i < 4; i++) {
375 for (j = 0; j < 4; j++) {
376 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
377 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
378 sum += c4[i][j] * c4[i][j];
/* butterfly: each pair of adjacent input rows (2i, 2i+1) is replaced by its
 * scaled sum and its scaled difference */
385 for (i = 0; i < 4; i++) {
386 for (j = 0; j < 8; j++) {
387 block1[8 * (2 * i) + j] =
388 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
389 block1[8 * (2 * i + 1) + j] =
390 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point inverse transform along every row */
395 for (i = 0; i < 8; i++) {
396 for (j = 0; j < 8; j++) {
398 for (k = 0; k < 8; k++)
399 sum += c8[k][j] * block1[8 * i + k];
400 block2[8 * i + j] = sum;
/* 4-point inverse transform along every column, done separately for the
 * even rows and for the odd rows */
405 for (i = 0; i < 8; i++) {
406 for (j = 0; j < 4; j++) {
409 for (k = 0; k < 4; k++)
410 sum += c4[k][j] * block2[8 * (2 * k) + i];
411 block3[8 * (2 * j) + i] = sum;
415 for (k = 0; k < 4; k++)
416 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
417 block3[8 * (2 * j + 1) + i] = sum;
421 /* clamp and store the result */
422 for (i = 0; i < 8; i++) {
423 for (j = 0; j < 8; j++) {
424 v = block3[8 * i + j];
426 else if (v > 255) v = 255;
427 dest[i * linesize + j] = (int) rint(v);
/* Compare one IDCT248 "put" implementation with idct248_ref() on random
 * input, print the largest pixel difference, and report its speed.
 *
 * name        - label printed in the reports
 * idct248_put - implementation under test; called with an 8x8 destination,
 *               a line size of 8 and the coefficient block
 *
 * NOTE(review): the end of the parameter list and several body lines are
 * not visible in this excerpt.
 * NOTE(review): ti and ti1 are declared int although gettime() returns a
 * 64-bit microsecond count (dct_error() uses int64_t for the same
 * variables); the elapsed-time arithmetic therefore relies on integer
 * truncation -- worth widening.
 * NOTE(review): the comment opened just before the NB_ITS loop has no
 * terminator in this excerpt; its closing line appears to be missing. */
432 static void idct248_error(const char *name,
433 void (*idct248_put)(uint8_t *dest, int line_size,
437 int it, i, it1, ti, ti1, err_max, v;
/* constant seed, so every run draws the same sequence of test blocks */
440 av_lfg_init(&prng, 1);
442 /* just one test to see if code is correct (precision is less
445 for (it = 0; it < NB_ITS; it++) {
446 /* XXX: use forward transform to generate values */
/* random coefficients: draw modulo 256, offset by -128 */
447 for (i = 0; i < 64; i++)
448 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* each transform gets a fresh copy of the same input */
451 for (i = 0; i < 64; i++)
452 block[i] = block1[i];
453 idct248_ref(img_dest1, 8, block);
455 for (i = 0; i < 64; i++)
456 block[i] = block1[i];
457 idct248_put(img_dest, 8, block);
/* absolute pixel difference between implementation and reference */
459 for (i = 0; i < 64; i++) {
460 v = abs((int) img_dest[i] - (int) img_dest1[i]);
462 printf("%d %d\n", img_dest[i], img_dest1[i]);
467 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* timing section: each batch restores the input and runs the transform
 * NB_ITS_SPEED times */
475 for (it = 0; it < NB_ITS_SPEED; it++) {
476 for (i = 0; i < 64; i++)
477 block[i] = block1[i];
478 idct248_put(img_dest, 8, block);
/* repeat batches until at least one second (gettime() counts microseconds)
 * has elapsed */
481 ti1 = gettime() - ti;
482 } while (ti1 < 1000000);
/* it1 calls in ti1 microseconds, expressed as thousands of calls per second */
485 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
486 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage text to standard output.
 * NOTE(review): the synopsis line mentions only -i, while the text below
 * also documents -4 and main() passes "ih4t" to getopt(); the end of the
 * printf call is not visible in this excerpt. */
489 static void help(void)
491 printf("dct-test [-i] [<test-number>]\n"
492 "test-number 0 -> test with random matrixes\n"
493 " 1 -> test with random sparse matrixes\n"
494 " 2 -> do 3. test from mpeg4 std\n"
495 "-i test IDCT implementations\n"
496 "-4 test IDCT248 implementations\n"
500 int main(int argc, char **argv)
502 int test_idct = 0, test_248_dct = 0;
508 cpu_flags = av_get_cpu_flags();
514 c = getopt(argc, argv, "ih4t");
535 test = atoi(argv[optind]);
537 printf("Libav DCT/IDCT test\n");
540 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
542 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
543 for (i = 0; algos[i].name; i++)
544 if (!(~cpu_flags & algos[i].mm_support)) {
545 err |= dct_error(&algos[i], test, test_idct, speed);