2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
42 #include "simple_idct.h"
43 #include "aandcttab.h"
46 #include "x86/idct_xvid.h"
/* Forward declarations of architecture-specific DCT/IDCT entry points.
 * Each one takes a pointer to a DCTELEM block and returns nothing.  Several
 * of them (bfin, altivec, arm, neon, axp) are referenced from the
 * fdct_tab / idct_tab tables in this file.
 * NOTE(review): the preprocessor guards that presumably surround these
 * groups are not visible in this view. */
51 void ff_mmx_idct(DCTELEM *data);
52 void ff_mmxext_idct(DCTELEM *data);
55 void ff_bfin_idct(DCTELEM *block);
56 void ff_bfin_fdct(DCTELEM *block);
59 void ff_fdct_altivec(DCTELEM *block);
60 //void ff_idct_altivec(DCTELEM *block);?? no routine
63 void ff_j_rev_dct_arm(DCTELEM *data);
64 void ff_simple_idct_arm(DCTELEM *data);
65 void ff_simple_idct_armv5te(DCTELEM *data);
66 void ff_simple_idct_armv6(DCTELEM *data);
67 void ff_simple_idct_neon(DCTELEM *data);
69 void ff_simple_idct_axp(DCTELEM *data);
/* Members of the per-algorithm descriptor (the struct header and the other
 * members such as name, mm_support and nonspec, which are used elsewhere in
 * this file, are not visible in this view). */
/* Function pointer taking a DCTELEM block; presumably the transform entry
 * listed in the algorithm tables -- confirm against the full definition. */
73 void (*func)(DCTELEM *block);
/* Coefficient layout the routine expects.  permute() reorders the input
 * block according to this value, and dct_error() applies an extra
 * rescaling step when it is SCALE_PERM. */
74 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
75 SSE2_PERM, PARTTRANS_PERM } format;
/* Table of forward DCT implementations; main() selects it when IDCT
 * testing was not requested and walks it until an entry with a null name.
 * Each initializer lists a display name, the routine, its input layout and,
 * for some entries, an AV_CPU_FLAG_* value (presumably the mm_support field
 * that main() checks against the detected CPU flags -- confirm against the
 * struct definition).
 * NOTE(review): the terminating entry and any #if guards around the
 * architecture-specific rows are not visible in this view. */
82 static const struct algo fdct_tab[] = {
83 { "REF-DBL", ff_ref_fdct, NO_PERM },
84 { "FAAN", ff_faandct, NO_PERM },
85 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
86 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
89 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
90 { "MMXEXT", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMXEXT },
91 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
95 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
99 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* Table of inverse DCT implementations; main() selects it when IDCT testing
 * was requested and walks it until an entry with a null name.
 * Each initializer lists a display name, the routine and the coefficient
 * layout handed to permute().  Some rows add an AV_CPU_FLAG_* value and the
 * XVID rows add a trailing 1 (presumably the nonspec field, which
 * dct_error() consults before reporting a spec violation -- confirm against
 * the struct definition).
 * NOTE(review): the terminating entry and any #if guards around the
 * architecture-specific rows are not visible in this view. */
105 static const struct algo idct_tab[] = {
106 { "FAANI", ff_faanidct, NO_PERM },
107 { "REF-DBL", ff_ref_idct, NO_PERM },
108 { "INT", ff_j_rev_dct, MMX_PERM },
109 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
112 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
113 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
114 { "XVID-MMXEXT", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
115 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
119 { "BFINidct", ff_bfin_idct, NO_PERM },
123 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
124 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
127 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
130 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
133 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
137 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* Fixed-point shift used by dct_error() when it rescales the output of a
 * SCALE_PERM transform with the ff_aanscales[] factors. */
143 #define AANSCALE_BITS 12
/* Number of transform calls per batch in the speed-measurement loops of
 * dct_error() and idct248_error(). */
146 #define NB_ITS_SPEED 50000
/* Destination index for each source coefficient under MMX_PERM; filled at
 * run time by idct_mmx_init() and read by permute(). */
148 static short idct_mmx_perm[64];
/* Destination index for each source coefficient under MMX_SIMPLE_PERM;
 * permute() stores src[i] at dst[idct_simple_mmx_perm[i]]. */
150 static short idct_simple_mmx_perm[64] = {
151 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
152 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
153 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
154 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
155 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
156 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
157 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
158 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Within-row destination position for SSE2_PERM: permute() keeps the row
 * bits (i & 0x38) and replaces the low three bits with this lookup. */
161 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[] with the MMX_PERM index mapping used by permute(). */
163 static void idct_mmx_init(void)
167 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
168 for (i = 0; i < 64; i++) {
/* Keep bits 3-5 of the index unchanged and rotate the low three bits:
 * bits 1-2 move down to bits 0-1, and bit 0 moves up to bit 2. */
169 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* File-scope 64-coefficient work buffers shared by dct_error() and
 * idct248_error().  The first macro argument differs (16 vs. 8);
 * presumably it is the requested alignment in bytes -- the macro itself is
 * defined outside this file. */
173 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
174 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/* Fill a 64-coefficient block with pseudo-random test data drawn from prng.
 * The pattern depends on `test`; the help text describes tests 0, 1 and 2
 * as random, random sparse, and an MPEG-4 standard test.
 * NOTE(review): the switch/case lines that tie each fill pattern below to a
 * test number, and any use of is_idct, are not visible in this view. */
176 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
/* Start from an all-zero block. */
180 memset(block, 0, 64 * sizeof(*block));
/* Dense pattern: every coefficient gets a value in [-256, 255] (assuming
 * av_lfg_get() yields a non-negative value). */
184 for (i = 0; i < 64; i++)
185 block[i] = (av_lfg_get(prng) % 512) - 256;
/* NOTE(review): the body of the next loop is not visible here. */
188 for (i = 0; i < 64; i++)
/* Sparse pattern: pick 1..10 random positions (repeats possible) and store
 * a value in [-256, 255] at each. */
193 j = av_lfg_get(prng) % 10 + 1;
194 for (i = 0; i < j; i++)
195 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
/* Two-coefficient pattern: element 0 in [-2048, 2047]; the last
 * coefficient is 1 when element 0 is even and 0 when it is odd. */
198 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
199 block[63] = (block[0] & 1) ^ 1;
/* Copy the 64 coefficients of src into dst, reordered according to `perm`
 * (one of the enum formattag values) so that the transform under test sees
 * its expected input layout. */
204 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
/* MMX_PERM: destination index comes from the table built by
 * idct_mmx_init(). */
208 if (perm == MMX_PERM) {
209 for (i = 0; i < 64; i++)
210 dst[idct_mmx_perm[i]] = src[i];
/* MMX_SIMPLE_PERM: destination index comes from the static table. */
211 } else if (perm == MMX_SIMPLE_PERM) {
212 for (i = 0; i < 64; i++)
213 dst[idct_simple_mmx_perm[i]] = src[i];
/* SSE2_PERM: keep bits 3-5 of the index and remap the low three bits
 * through idct_sse2_row_perm[]. */
214 } else if (perm == SSE2_PERM) {
215 for (i = 0; i < 64; i++)
216 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/* PARTTRANS_PERM: keep index bits 2 and 5, and swap bits 0-1 with
 * bits 3-4. */
217 } else if (perm == PARTTRANS_PERM) {
218 for (i = 0; i < 64; i++)
219 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* Remaining case; presumably a straight copy, but the loop body is not
 * visible in this view. */
221 for (i = 0; i < 64; i++)
/* Measure the accuracy (and optionally the speed) of one DCT/IDCT
 * implementation against the double-precision reference and print the
 * statistics.
 *   dct     descriptor of the implementation (name, format, nonspec used)
 *   test    test pattern number passed to init_block()
 *   is_idct non-zero selects the IDCT reference and enables the spec check
 *   speed   presumably enables the timing section -- the test on it is not
 *           visible in this view
 * main() ORs the return value into its error status.
 * NOTE(review): several statements (the calls to the transform and the
 * reference, the err2/err_sum/err_inf updates, the return) are on lines not
 * visible here. */
226 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
/* Reference transform matching the direction under test. */
228 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
231 int64_t err2, ti, ti1, it1, err_sum = 0;
232 int64_t sysErr[64], sysErrMax = 0;
234 int blockSumErrMax = 0, blockSumErr;
/* Fixed seed, so every run sees the same sequence of test blocks. */
239 av_lfg_init(&prng, 1);
243 for (i = 0; i < 64; i++)
/* Accuracy pass: NB_ITS random blocks. */
245 for (it = 0; it < NB_ITS; it++) {
/* block1 holds the plain input; block holds it in the layout the
 * implementation expects. */
246 init_block(block1, test, is_idct, &prng);
247 permute(block, block1, dct->format);
/* SCALE_PERM output is rescaled per coefficient with the ff_aanscales[]
 * factors before it is compared. */
252 if (dct->format == SCALE_PERM) {
253 for (i = 0; i < 64; i++) {
254 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
255 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Per-coefficient comparison of block against block1. */
262 for (i = 0; i < 64; i++) {
263 int err = block[i] - block1[i];
/* Signed error accumulated per coefficient position across all blocks. */
269 sysErr[i] += block[i] - block1[i];
/* Track the largest output magnitude seen. */
271 if (abs(block[i]) > maxout)
272 maxout = abs(block[i]);
/* Track the largest per-block error sum. */
274 if (blockSumErrMax < blockSumErr)
275 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated error over the 64 positions. */
277 for (i = 0; i < 64; i++)
278 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
/* Print the accumulated error of every position. */
280 for (i = 0; i < 64; i++) {
283 printf("%7d ", (int) sysErr[i]);
/* Averages over all blocks and all 64 coefficients. */
287 omse = (double) err2 / NB_ITS / 64;
288 ome = (double) err_sum / NB_ITS / 64;
/* Only IDCTs are checked against the limits.
 * NOTE(review): the limits look like the IEEE 1180-1990 IDCT accuracy
 * criteria -- confirm. */
290 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
292 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
293 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
294 omse, ome, (double) sysErrMax / NB_ITS,
295 maxout, blockSumErrMax);
/* A violation matters only when the entry's nonspec flag is clear; the
 * action taken is not visible in this view. */
297 if (spec_err && !dct->nonspec)
/* Speed pass: one fixed input block, kept in block1 in permuted form. */
304 init_block(block, test, is_idct, &prng);
305 permute(block1, block, dct->format);
/* Batches of NB_ITS_SPEED iterations, each restoring the input first, are
 * repeated until at least 1000000 av_gettime() units have elapsed. */
310 for (it = 0; it < NB_ITS_SPEED; it++) {
311 memcpy(block, block1, sizeof(block));
315 ti1 = av_gettime() - ti;
316 } while (ti1 < 1000000);
/* The kdct/s formula is consistent with av_gettime() counting
 * microseconds. */
319 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
320 (double) it1 * 1000.0 / (double) ti1);
/* 8x8 pixel output buffers used by idct248_error(): img_dest receives the
 * output of the implementation under test, img_dest1 the output of
 * idct248_ref(). */
325 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
326 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Double-precision reference for the 2-4-8 IDCT.
 *   dest     output pixels, written at dest[i * linesize + j]
 *   linesize distance in bytes between consecutive output rows
 *   block    64 input coefficients in row-major 8x8 order
 * NOTE(review): some statements (one-time init guard, resets of `sum`, the
 * value given to `s` before the butterfly, the lower clamp) are on lines
 * not visible in this view. */
328 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine basis tables; static, so they persist across calls. */
331 static double c8[8][8];
332 static double c4[4][4];
333 double block1[64], block2[64], block3[64];
/* Build the 8-point basis: row 0 is scaled by sqrt(1/8), the others by
 * sqrt(1/4). */
340 for (i = 0; i < 8; i++) {
342 for (j = 0; j < 8; j++) {
343 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
344 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
345 sum += c8[i][j] * c8[i][j];
/* Build the 4-point basis: row 0 is scaled by sqrt(1/4), the others by
 * sqrt(1/2). */
349 for (i = 0; i < 4; i++) {
351 for (j = 0; j < 4; j++) {
352 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
353 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
354 sum += c4[i][j] * c4[i][j];
/* Butterfly: each pair of rows (2i, 2i+1) is replaced by its scaled sum and
 * scaled difference. */
361 for (i = 0; i < 4; i++) {
362 for (j = 0; j < 8; j++) {
363 block1[8 * (2 * i) + j] =
364 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
365 block1[8 * (2 * i + 1) + j] =
366 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point inverse transform along each row. */
371 for (i = 0; i < 8; i++) {
372 for (j = 0; j < 8; j++) {
374 for (k = 0; k < 8; k++)
375 sum += c8[k][j] * block1[8 * i + k];
376 block2[8 * i + j] = sum;
/* 4-point inverse transform down each column, applied separately to the
 * even rows and to the odd rows. */
381 for (i = 0; i < 8; i++) {
382 for (j = 0; j < 4; j++) {
385 for (k = 0; k < 4; k++)
386 sum += c4[k][j] * block2[8 * (2 * k) + i];
387 block3[8 * (2 * j) + i] = sum;
391 for (k = 0; k < 4; k++)
392 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
393 block3[8 * (2 * j + 1) + i] = sum;
397 /* clamp and store the result */
398 for (i = 0; i < 8; i++) {
399 for (j = 0; j < 8; j++) {
400 v = block3[8 * i + j];
402 else if (v > 255) v = 255;
/* Round to nearest with rint() before narrowing to a pixel. */
403 dest[i * linesize + j] = (int) rint(v);
/* Compare a 2-4-8 IDCT "put" routine against idct248_ref() on random
 * blocks, print the error, then time the routine and print its
 * throughput.
 *   name        label used in the printed report
 *   idct248_put routine under test; called as (dest, line_size, block)
 * NOTE(review): the rest of the parameter list (main() passes a third
 * `speed` argument) and several statements are on lines not visible in
 * this view.
 * NOTE(review): ti and ti1 are plain int here, whereas dct_error() keeps
 * the same av_gettime() values in int64_t; if av_gettime() returns a 64-bit
 * timestamp the assignments narrow it -- consider int64_t for
 * consistency. */
408 static void idct248_error(const char *name,
409 void (*idct248_put)(uint8_t *dest, int line_size,
413 int it, i, it1, ti, ti1, err_max, v;
/* Fixed seed, so every run sees the same inputs. */
416 av_lfg_init(&prng, 1);
418 /* just one test to see if code is correct (precision is less
421 for (it = 0; it < NB_ITS; it++) {
422 /* XXX: use forward transform to generate values */
/* Random input in [-128, 127] (assuming av_lfg_get() is non-negative). */
423 for (i = 0; i < 64; i++)
424 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* Reference output goes to img_dest1; block is reloaded from block1 before
 * each transform call. */
427 for (i = 0; i < 64; i++)
428 block[i] = block1[i];
429 idct248_ref(img_dest1, 8, block);
/* Output of the routine under test goes to img_dest. */
431 for (i = 0; i < 64; i++)
432 block[i] = block1[i];
433 idct248_put(img_dest, 8, block);
/* Absolute per-pixel difference between the two outputs. */
435 for (i = 0; i < 64; i++) {
436 v = abs((int) img_dest[i] - (int) img_dest1[i]);
438 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* The constant condition always selects "IDCT248". */
443 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Speed pass: batches of NB_ITS_SPEED calls, repeated until at least
 * 1000000 av_gettime() units have elapsed. */
451 for (it = 0; it < NB_ITS_SPEED; it++) {
452 for (i = 0; i < 64; i++)
453 block[i] = block1[i];
454 idct248_put(img_dest, 8, block);
457 ti1 = av_gettime() - ti;
458 } while (ti1 < 1000000);
461 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
462 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage text to stdout.
 * NOTE(review): main() passes "ih4t" to getopt(), but the text visible
 * here does not describe -t or -h; the end of the string is not visible in
 * this view, so confirm that all options are listed. */
465 static void help(void)
467 printf("dct-test [-i] [<test-number>]\n"
468 "test-number 0 -> test with random matrixes\n"
469 " 1 -> test with random sparse matrixes\n"
470 " 2 -> do 3. test from mpeg4 std\n"
471 "-i test IDCT implementations\n"
472 "-4 test IDCT248 implementations\n"
477 #include "compat/getopt.c"
/* Program entry point: parse the options, then run either the IDCT248 test
 * or every applicable entry of the forward/inverse DCT table.
 * NOTE(review): the option-handling loop body, several declarations and the
 * return statement are on lines not visible in this view. */
480 int main(int argc, char **argv)
482 int test_idct = 0, test_248_dct = 0;
/* CPU feature flags detected at run time; used below to skip
 * implementations the machine cannot run. */
488 cpu_flags = av_get_cpu_flags();
/* Accepted option letters: i, h, 4 and t (none takes an argument). */
494 c = getopt(argc, argv, "ih4t");
/* First non-option argument is the test number.
 * NOTE(review): atoi() reports no error for non-numeric input; strtol()
 * with an end-pointer check would allow validation. */
515 test = atoi(argv[optind]);
517 printf("Libav DCT/IDCT test\n");
/* IDCT248 mode tests only the C implementation. */
520 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
/* Otherwise choose the table matching the requested direction and stop at
 * the first entry whose name is null. */
522 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
523 for (i = 0; algos[i].name; i++)
/* Run an entry only when every flag in its mm_support mask is present in
 * cpu_flags. */
524 if (!(~cpu_flags & algos[i].mm_support)) {
/* Accumulate failures from all tested implementations. */
525 err |= dct_error(&algos[i], test, test_idct, speed);