2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/* Prototypes for platform-specific DCT/IDCT entry points that the
 * fdct_tab[] and idct_tab[] tables refer to. Their definitions are not part
 * of the visible section of this file. Each one takes a pointer to a
 * DCTELEM block and returns nothing. */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
52 void ff_bfin_idct(DCTELEM *block);
53 void ff_bfin_fdct(DCTELEM *block);
56 void ff_fdct_altivec(DCTELEM *block);
57 //void ff_idct_altivec(DCTELEM *block);?? no routine
60 void ff_j_rev_dct_arm(DCTELEM *data);
61 void ff_simple_idct_arm(DCTELEM *data);
62 void ff_simple_idct_armv5te(DCTELEM *data);
63 void ff_simple_idct_armv6(DCTELEM *data);
64 void ff_simple_idct_neon(DCTELEM *data);
66 void ff_simple_idct_axp(DCTELEM *data);
/* NOTE(review): these are members of struct algo; the opening of the struct
 * and its other members (name, mm_support and nonspec are read elsewhere in
 * this file) are outside the visible lines. */
/* Transform under test; receives a pointer to a DCTELEM block. */
70 void (*func)(DCTELEM *block);
/* Input layout mode of the entry. dct_error() passes it to permute() and
 * applies an extra per-coefficient rescale when it equals SCALE_PERM. */
71 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
72 SSE2_PERM, PARTTRANS_PERM } format;
/* Forward DCT implementations selectable by main() when IDCT testing is not
 * requested. Each initializer lists a label, the transform function and a
 * formattag value; some entries add an AV_CPU_FLAG_* constant.
 * NOTE(review): the fourth initializer is presumably the mm_support field
 * that main() checks against cpu_flags before running an entry -- confirm
 * against the struct algo definition. */
79 static const struct algo fdct_tab[] = {
80 { "REF-DBL", ff_ref_fdct, NO_PERM },
81 { "FAAN", ff_faandct, NO_PERM },
82 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
83 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
86 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
87 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
88 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
92 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
96 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* Inverse DCT implementations selectable by main() when IDCT testing is
 * requested. Each initializer lists a label, the transform function and the
 * formattag value that tells permute() how to reorder the input; some
 * entries add an AV_CPU_FLAG_* constant and a trailing 1.
 * NOTE(review): the fourth and fifth initializers are presumably the
 * mm_support and nonspec fields read by main() and dct_error() -- confirm
 * against the struct algo definition. */
102 static const struct algo idct_tab[] = {
103 { "FAANI", ff_faanidct, NO_PERM },
104 { "REF-DBL", ff_ref_idct, NO_PERM },
105 { "INT", ff_j_rev_dct, MMX_PERM },
106 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
110 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
111 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
113 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
114 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
115 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
116 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
120 { "BFINidct", ff_bfin_idct, NO_PERM },
124 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
125 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
128 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
131 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
134 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
138 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* Shift count used by dct_error() when it rescales the output of a
 * SCALE_PERM transform with the ff_aanscales[] factors. */
144 #define AANSCALE_BITS 12
/**
 * Return the current time of day in microseconds.
 *
 * The value is assembled from the result of gettimeofday(): the seconds
 * field scaled by one million plus the microseconds field. The seconds
 * field is converted to int64_t before the multiplication so the product is
 * computed in 64 bits regardless of the width of time_t.
 *
 * @return microseconds as reported by gettimeofday()
 */
static int64_t gettime(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
}
/* Number of transform calls per timing batch in the speed loops of
 * dct_error() and idct248_error(); batches repeat until the elapsed time
 * reaches the threshold tested there. */
154 #define NB_ITS_SPEED 50000
/* Destination index for each source coefficient when permute() is called
 * with MMX_PERM. Filled in by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Destination index for each source coefficient when permute() is called
 * with MMX_SIMPLE_PERM. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Column remapping applied within each row of 8 when permute() is called
 * with SSE2_PERM; the row bits of the index are left unchanged there. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

/**
 * Build idct_mmx_perm[].
 *
 * For every index i in 0..63 the row bits (i & 0x38) are kept, while the
 * three column bits are rotated right by one: bits 1-2 move down to bits
 * 0-1 and bit 0 moves up to bit 2. The result is a permutation of 0..63
 * that stays inside each row of 8.
 */
static void idct_mmx_init(void)
{
    int i;

    /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
    for (i = 0; i < 64; i++) {
        idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
    }
}
/* File-scope 64-entry DCTELEM work buffers used by dct_error() and
 * idct248_error().
 * NOTE(review): the first DECLARE_ALIGNED argument is presumably the
 * required alignment in bytes (16 for block, 8 for block1) -- confirm
 * against the macro definition. */
181 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
182 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/* Execute the emms instruction when cpu_flags has AV_CPU_FLAG_MMX set;
 * otherwise do nothing.
 * NOTE(review): any preprocessor guard around the inline asm is outside the
 * visible lines -- confirm before building for a non-x86 target. */
184 static inline void mmx_emms(void)
187 if (cpu_flags & AV_CPU_FLAG_MMX)
188 __asm__ volatile ("emms\n\t");
/* Fill a 64-coefficient block with pseudo-random test input.
 * block   - buffer to fill; it is cleared first
 * test    - NOTE(review): presumably selects one of the fill patterns
 *           below (the help text lists tests 0, 1 and 2); the selection
 *           logic is outside the visible lines
 * is_idct - NOTE(review): not referenced in the visible lines
 * prng    - generator state passed to av_lfg_get()
 */
192 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
/* Start from all zeros so the sparse patterns only need to set a few
 * entries. */
196 memset(block, 0, 64 * sizeof(*block));
/* Dense pattern: every coefficient becomes av_lfg_get() % 512 - 256. */
200 for (i = 0; i < 64; i++)
201 block[i] = (av_lfg_get(prng) % 512) - 256;
/* NOTE(review): the body of the next loop is outside the visible lines. */
204 for (i = 0; i < 64; i++)
/* Sparse pattern: j is between 1 and 10; that many writes go to random
 * positions, which can coincide, each with av_lfg_get() % 512 - 256. */
209 j = av_lfg_get(prng) % 10 + 1;
210 for (i = 0; i < j; i++)
211 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
/* Two-coefficient pattern: block[0] is av_lfg_get() % 4096 - 2048 and
 * block[63] is the inverted low bit of block[0]. */
214 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
215 block[63] = (block[0] & 1) ^ 1;
/* Copy src into dst, storing source coefficient i at the index that the
 * selected permutation maps it to.
 * dst  - receives the reordered coefficients
 * src  - coefficients in their original order
 * perm - one of the enum formattag values
 */
220 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
/* MMX_PERM: indices come from idct_mmx_perm[]. */
224 if (perm == MMX_PERM) {
225 for (i = 0; i < 64; i++)
226 dst[idct_mmx_perm[i]] = src[i];
/* MMX_SIMPLE_PERM: indices come from the constant table
 * idct_simple_mmx_perm[]. */
227 } else if (perm == MMX_SIMPLE_PERM) {
228 for (i = 0; i < 64; i++)
229 dst[idct_simple_mmx_perm[i]] = src[i];
/* SSE2_PERM: the row bits (i & 0x38) are kept and the column is remapped
 * through idct_sse2_row_perm[]. */
230 } else if (perm == SSE2_PERM) {
231 for (i = 0; i < 64; i++)
232 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/* PARTTRANS_PERM: bits 2 and 5 of the index stay in place, index bits 0-1
 * move to bits 3-4 and index bits 3-4 move to bits 0-1. */
233 } else if (perm == PARTTRANS_PERM) {
234 for (i = 0; i < 64; i++)
235 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* NOTE(review): loop for the remaining formats; its body is outside the
 * visible lines. */
237 for (i = 0; i < 64; i++)
/* Measure the accuracy of one table entry against the double-precision
 * reference transform, print the statistics, and run a timing loop.
 * dct     - table entry describing the transform under test
 * test    - forwarded to init_block()
 * is_idct - non-zero selects ff_ref_idct as reference and the IDCT label,
 *           zero selects ff_ref_fdct and the DCT label
 * speed   - NOTE(review): not referenced in the visible lines; presumably
 *           gates the timing section
 * NOTE(review): the return statement is outside the visible lines; main()
 * ORs the result into its error status.
 */
242 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
/* Reference transform for the direction under test. */
244 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
247 int64_t err2, ti, ti1, it1, err_sum = 0;
/* sysErr accumulates the signed error separately for each of the 64
 * coefficient positions. */
248 int64_t sysErr[64], sysErrMax = 0;
250 int blockSumErrMax = 0, blockSumErr;
/* The generator is initialised with the constant 1 on every call. */
255 av_lfg_init(&prng, 1);
259 for (i = 0; i < 64; i++)
/* Accuracy pass over NB_ITS generated blocks. */
261 for (it = 0; it < NB_ITS; it++) {
/* block1 receives the generated input; block receives the same data
 * reordered for the entry under test. */
262 init_block(block1, test, is_idct, &prng);
263 permute(block, block1, dct->format);
/* SCALE_PERM entries get a per-coefficient rescale derived from
 * ff_aanscales[] before the comparison. */
268 if (dct->format == SCALE_PERM) {
269 for (i = 0; i < 64; i++) {
270 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
271 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Per-coefficient comparison of block against block1.
 * NOTE(review): block1 presumably holds the reference output by now; the
 * calls that run the two transforms are outside the visible lines. */
278 for (i = 0; i < 64; i++) {
279 int err = block[i] - block1[i];
285 sysErr[i] += block[i] - block1[i];
/* Track the largest absolute output value seen. */
287 if (abs(block[i]) > maxout)
288 maxout = abs(block[i]);
/* Track the largest per-block error sum. */
290 if (blockSumErrMax < blockSumErr)
291 blockSumErrMax = blockSumErr;
/* Largest magnitude among the accumulated per-position errors. */
293 for (i = 0; i < 64; i++)
294 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
296 for (i = 0; i < 64; i++) {
299 printf("%7d ", (int) sysErr[i]);
/* Averages per coefficient over all iterations.
 * NOTE(review): err2 is presumably the sum of squared errors and err_sum
 * the sum of signed errors; their accumulation is outside the visible
 * lines. */
303 omse = (double) err2 / NB_ITS / 64;
304 ome = (double) err_sum / NB_ITS / 64;
/* Only IDCTs are checked against the limits: peak error above 1, omse
 * above 0.02 or absolute ome above 0.0015 sets spec_err. */
306 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
308 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
309 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
310 omse, ome, (double) sysErrMax / NB_ITS,
311 maxout, blockSumErrMax);
/* Entries flagged nonspec are exempt from the limit check. */
313 if (spec_err && !dct->nonspec)
/* Timing section: one generated block is reordered into block1 and copied
 * back into block before each transform call. */
320 init_block(block, test, is_idct, &prng);
321 permute(block1, block, dct->format);
326 for (it = 0; it < NB_ITS_SPEED; it++) {
327 memcpy(block, block1, sizeof(block));
/* Batches repeat until at least 1000000 gettime() units have elapsed. */
331 ti1 = gettime() - ti;
332 } while (ti1 < 1000000);
/* NOTE(review): it1 is presumably the total number of calls made; its
 * update is outside the visible lines. */
335 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
336 (double) it1 * 1000.0 / (double) ti1);
/* 64-byte pixel buffers used by idct248_error(): img_dest receives the
 * output of the routine under test, img_dest1 the output of idct248_ref(). */
341 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
342 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Double-precision reference for the 2-4-8 IDCT: transforms block and
 * stores the rounded result as 8-bit values in dest.
 * dest     - output buffer; row i starts at dest + i * linesize
 * linesize - distance between consecutive output rows, in bytes
 * block    - 64 input coefficients, read as 8 rows of 8
 */
344 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine basis tables with static storage.
 * NOTE(review): the guard that presumably makes the table setup run only
 * once is outside the visible lines. */
347 static double c8[8][8];
348 static double c4[4][4];
349 double block1[64], block2[64], block3[64];
/* 8-point basis: c8[i][j] = s * cos(pi * i * (j + 0.5) / 8) with
 * s = sqrt(1/8) for i == 0 and sqrt(1/4) otherwise. sum collects the
 * squared entries of row i. */
356 for (i = 0; i < 8; i++) {
358 for (j = 0; j < 8; j++) {
359 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
360 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
361 sum += c8[i][j] * c8[i][j];
/* 4-point basis: c4[i][j] = s * cos(pi * i * (j + 0.5) / 4) with
 * s = sqrt(1/4) for i == 0 and sqrt(1/2) otherwise. */
365 for (i = 0; i < 4; i++) {
367 for (j = 0; j < 4; j++) {
368 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
369 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
370 sum += c4[i][j] * c4[i][j];
/* For each pair of input rows (2i, 2i+1): the even row of block1 gets the
 * scaled sum and the odd row the scaled difference.
 * NOTE(review): the value assigned to s for this step is outside the
 * visible lines. */
377 for (i = 0; i < 4; i++) {
378 for (j = 0; j < 8; j++) {
379 block1[8 * (2 * i) + j] =
380 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
381 block1[8 * (2 * i + 1) + j] =
382 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point transform along each row: block2[8i+j] is the sum over k of
 * c8[k][j] * block1[8i+k]. */
387 for (i = 0; i < 8; i++) {
388 for (j = 0; j < 8; j++) {
390 for (k = 0; k < 8; k++)
391 sum += c8[k][j] * block1[8 * i + k];
392 block2[8 * i + j] = sum;
/* 4-point transform down each column i, done separately for the even rows
 * (2k) and the odd rows (2k+1). */
397 for (i = 0; i < 8; i++) {
398 for (j = 0; j < 4; j++) {
401 for (k = 0; k < 4; k++)
402 sum += c4[k][j] * block2[8 * (2 * k) + i];
403 block3[8 * (2 * j) + i] = sum;
407 for (k = 0; k < 4; k++)
408 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
409 block3[8 * (2 * j + 1) + i] = sum;
413 /* clamp and store the result */
414 for (i = 0; i < 8; i++) {
415 for (j = 0; j < 8; j++) {
416 v = block3[8 * i + j];
/* Upper bound of the clamp; NOTE(review): the matching lower-bound test is
 * outside the visible lines. The value is rounded with rint() on store. */
418 else if (v > 255) v = 255;
419 dest[i * linesize + j] = (int) rint(v);
/* Compare a 2-4-8 IDCT put routine against idct248_ref(), print the result
 * line, and run a timing loop.
 * name        - label printed in the result lines
 * idct248_put - routine under test; called here with a line size of 8
 * NOTE(review): the rest of the parameter list is outside the visible
 * lines; main() passes a speed value as the third argument.
 * NOTE(review): ti and ti1 are declared int while gettime() returns
 * int64_t, so the elapsed-time computation depends on the narrowing
 * conversion wrapping around -- consider int64_t as used in dct_error().
 */
424 static void idct248_error(const char *name,
425 void (*idct248_put)(uint8_t *dest, int line_size,
429 int it, i, it1, ti, ti1, err_max, v;
432 av_lfg_init(&prng, 1);
434 /* just one test to see if code is correct (precision is less
437 for (it = 0; it < NB_ITS; it++) {
438 /* XXX: use forward transform to generate values */
/* Input coefficients: each entry is av_lfg_get() % 256 - 128. */
439 for (i = 0; i < 64; i++)
440 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* Reference output goes to img_dest1, using a fresh copy of the input. */
443 for (i = 0; i < 64; i++)
444 block[i] = block1[i];
445 idct248_ref(img_dest1, 8, block);
/* Output of the routine under test goes to img_dest, again from a fresh
 * copy of the input. */
447 for (i = 0; i < 64; i++)
448 block[i] = block1[i];
449 idct248_put(img_dest, 8, block);
/* Absolute pixel difference between the two outputs. */
451 for (i = 0; i < 64; i++) {
452 v = abs((int) img_dest[i] - (int) img_dest1[i]);
454 printf("%d %d\n", img_dest[i], img_dest1[i]);
459 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing section: each call works on a fresh copy of the last input. */
467 for (it = 0; it < NB_ITS_SPEED; it++) {
468 for (i = 0; i < 64; i++)
469 block[i] = block1[i];
470 idct248_put(img_dest, 8, block);
/* Batches repeat until at least 1000000 gettime() units have elapsed. */
473 ti1 = gettime() - ti;
474 } while (ti1 < 1000000);
477 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
478 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage summary to standard output.
 * NOTE(review): the end of the printf call is outside the visible lines, so
 * the option list shown here may be incomplete. */
481 static void help(void)
483 printf("dct-test [-i] [<test-number>]\n"
484 "test-number 0 -> test with random matrixes\n"
485 " 1 -> test with random sparse matrixes\n"
486 " 2 -> do 3. test from mpeg4 std\n"
487 "-i test IDCT implementations\n"
488 "-4 test IDCT248 implementations\n"
492 int main(int argc, char **argv)
494 int test_idct = 0, test_248_dct = 0;
500 cpu_flags = av_get_cpu_flags();
506 c = getopt(argc, argv, "ih4t");
527 test = atoi(argv[optind]);
529 printf("Libav DCT/IDCT test\n");
532 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
534 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
535 for (i = 0; algos[i].name; i++)
536 if (!(~cpu_flags & algos[i].mm_support)) {
537 err |= dct_error(&algos[i], test, test_idct, speed);