2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
44 #include "simple_idct.h"
45 #include "aandcttab.h"
49 #include "x86/idct_xvid.h"
/*
 * Prototypes declared directly in this file for the AltiVec forward DCT and
 * the ARM inverse DCT variants that the algorithm tables refer to.  Each
 * takes a pointer to int16_t data and returns nothing.
 */
53 void ff_fdct_altivec(int16_t *block);
56 void ff_j_rev_dct_arm(int16_t *data);
57 void ff_simple_idct_arm(int16_t *data);
58 void ff_simple_idct_armv5te(int16_t *data);
59 void ff_simple_idct_armv6(int16_t *data);
60 void ff_simple_idct_neon(int16_t *data);
/* Transform under test; takes a pointer to int16_t data. */
64 void (*func)(int16_t *block);
/* Coefficient ordering for this entry; dct_error() passes it to permute(). */
65 enum idct_permutation_type perm_type;
/*
 * Forward DCT implementations to test.  Each initializer gives the display
 * name, the function and the permutation type; CPU-specific entries add the
 * AV_CPU_FLAG_* value that main() checks against av_get_cpu_flags() before
 * running the entry.
 */
70 static const struct algo fdct_tab[] = {
71 { "REF-DBL", ff_ref_fdct, FF_IDCT_PERM_NONE },
72 { "FAAN", ff_faandct, FF_IDCT_PERM_NONE },
73 { "IJG-AAN-INT", ff_fdct_ifast, FF_IDCT_PERM_NONE },
74 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, FF_IDCT_PERM_NONE },
77 { "MMX", ff_fdct_mmx, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX },
79 #if HAVE_MMXEXT_INLINE
80 { "MMXEXT", ff_fdct_mmxext, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMXEXT },
83 { "SSE2", ff_fdct_sse2, FF_IDCT_PERM_NONE, AV_CPU_FLAG_SSE2 },
87 { "altivecfdct", ff_fdct_altivec, FF_IDCT_PERM_NONE, AV_CPU_FLAG_ALTIVEC },
/*
 * Inverse DCT implementations to test.  Each initializer gives the display
 * name, the function and the permutation type that permute() applies to the
 * input; CPU-specific entries add the AV_CPU_FLAG_* value that main() checks
 * against av_get_cpu_flags().
 * NOTE(review): the trailing 1 on the XVID entries presumably initializes
 * the `nonspec` member that dct_error() consults when a spec limit is
 * exceeded -- confirm against the struct algo definition.
 */
93 static const struct algo idct_tab[] = {
94 { "FAANI", ff_faanidct, FF_IDCT_PERM_NONE },
95 { "REF-DBL", ff_ref_idct, FF_IDCT_PERM_NONE },
96 { "INT", ff_j_rev_dct, FF_IDCT_PERM_LIBMPEG2 },
97 { "SIMPLE-C", ff_simple_idct_8, FF_IDCT_PERM_NONE },
100 { "SIMPLE-MMX", ff_simple_idct_mmx, FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_MMX },
101 { "XVID-MMX", ff_idct_xvid_mmx, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX, 1 },
103 #if HAVE_MMXEXT_INLINE
104 { "XVID-MMXEXT", ff_idct_xvid_mmxext, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMXEXT, 1 },
107 { "XVID-SSE2", ff_idct_xvid_sse2, FF_IDCT_PERM_SSE2, AV_CPU_FLAG_SSE2, 1 },
111 { "SIMPLE-ARM", ff_simple_idct_arm, FF_IDCT_PERM_NONE },
112 { "INT-ARM", ff_j_rev_dct_arm, FF_IDCT_PERM_LIBMPEG2 },
115 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te, FF_IDCT_PERM_NONE, AV_CPU_FLAG_ARMV5TE },
118 { "SIMPLE-ARMV6", ff_simple_idct_armv6, FF_IDCT_PERM_LIBMPEG2, AV_CPU_FLAG_ARMV6 },
120 #if HAVE_NEON && ARCH_ARM
121 { "SIMPLE-NEON", ff_simple_idct_neon, FF_IDCT_PERM_PARTTRANS, AV_CPU_FLAG_NEON },
/* Fixed-point shift used by dct_error() when rescaling IJG-AAN-INT output. */
127 #define AANSCALE_BITS 12
/* Number of transform calls in one pass of the timing loops. */
130 #define NB_ITS_SPEED 50000
/*
 * Destination index for each source coefficient under FF_IDCT_PERM_SIMPLE:
 * permute() stores src[i] at dst[idct_simple_mmx_perm[i]].
 */
132 static short idct_simple_mmx_perm[64] = {
133 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
134 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
135 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
136 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
137 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
138 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
139 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
140 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/*
 * Position within a group of eight for FF_IDCT_PERM_SSE2: permute() writes
 * src[i] to dst[(i & 0x38) | idct_sse2_row_perm[i & 7]].
 */
143 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/*
 * File-scope work buffers of 64 int16_t each, shared by dct_error() and
 * idct248_error().  They are declared through DECLARE_ALIGNED with 16 and 8
 * as the first argument respectively.
 */
145 DECLARE_ALIGNED(16, static int16_t, block)[64];
146 DECLARE_ALIGNED(8, static int16_t, block1)[64];
/*
 * Fill a 64-entry block with input data for one test iteration, drawing
 * pseudo-random numbers from prng.  The block is zeroed first.
 * NOTE(review): the branching on `test` / `is_idct` that selects among the
 * fills below is not visible here -- confirm which test number maps to
 * which fill.
 */
148 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
152 memset(block, 0, 64 * sizeof(*block));
/* Dense fill: every entry receives a value in [-256, 255]. */
156 for (i = 0; i < 64; i++)
157 block[i] = (av_lfg_get(prng) % 512) - 256;
160 for (i = 0; i < 64; i++)
/* Sparse fill: between 1 and 10 writes to random positions (repeats possible). */
165 j = av_lfg_get(prng) % 10 + 1;
166 for (i = 0; i < j; i++)
/*
 * NOTE(review): the two av_lfg_get() calls on the next line are not ordered
 * relative to each other by the C standard, so which draw becomes the index
 * and which the value may differ between compilers.
 */
167 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
/* First entry in [-2048, 2047]; the last entry is the inverse of its low bit. */
170 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
171 block[63] = (block[0] & 1) ^ 1;
/*
 * Copy the 64 values of src into dst, placing src[i] at the index required
 * by perm_type.
 */
176 static void permute(int16_t dst[64], const int16_t src[64],
177 enum idct_permutation_type perm_type)
/* Bits 3-5 are kept; within the low three bits, bit 0 moves to bit 2 and
 * bits 1-2 move down to bits 0-1. */
182 case FF_IDCT_PERM_LIBMPEG2:
183 for (i = 0; i < 64; i++)
184 dst[(i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2)] = src[i];
/* Table-driven placement via idct_simple_mmx_perm. */
186 case FF_IDCT_PERM_SIMPLE:
187 for (i = 0; i < 64; i++)
188 dst[idct_simple_mmx_perm[i]] = src[i];
/* Bits 3-5 are kept; the low three bits are remapped by idct_sse2_row_perm. */
190 case FF_IDCT_PERM_SSE2:
191 for (i = 0; i < 64; i++)
192 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/* Bits 2 and 5 are kept; bits 0-1 are exchanged with bits 3-4. */
194 case FF_IDCT_PERM_PARTTRANS:
195 for (i = 0; i < 64; i++)
196 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* NOTE(review): presumably the default case (plain copy); its body is not
 * visible here -- confirm. */
199 for (i = 0; i < 64; i++)
/*
 * Check one table entry against the double-precision reference transform
 * (ff_ref_idct or ff_ref_fdct, chosen by is_idct), print the accumulated
 * error statistics, and print a throughput figure from the timing loop.
 * main() ORs the int result into its overall error status.
 */
205 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
207 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
210 int64_t err2, ti, ti1, it1, err_sum = 0;
211 int64_t sysErr[64], sysErrMax = 0;
213 int blockSumErrMax = 0, blockSumErr;
/* The generator is always seeded with the constant 1. */
218 av_lfg_init(&prng, 1);
222 for (i = 0; i < 64; i++)
224 for (it = 0; it < NB_ITS; it++) {
/* Fresh input goes into block1; block receives it reordered for
 * dct->perm_type. */
225 init_block(block1, test, is_idct, &prng);
226 permute(block, block1, dct->perm_type);
/* Rescaling applied only to the entry named "IJG-AAN-INT", using the
 * ff_aanscales table. */
231 if (!strcmp(dct->name, "IJG-AAN-INT")) {
232 for (i = 0; i < 64; i++) {
233 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
234 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Element-wise comparison of block against block1. */
241 for (i = 0; i < 64; i++) {
242 int err = block[i] - block1[i];
/* Signed error accumulated per coefficient position. */
248 sysErr[i] += block[i] - block1[i];
/* Largest output magnitude seen so far. */
250 if (abs(block[i]) > maxout)
251 maxout = abs(block[i]);
253 if (blockSumErrMax < blockSumErr)
254 blockSumErrMax = blockSumErr;
/* Largest magnitude among the per-position accumulated errors. */
256 for (i = 0; i < 64; i++)
257 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
259 for (i = 0; i < 64; i++) {
262 printf("%7d ", (int) sysErr[i]);
/* Averages over all NB_ITS * 64 compared values. */
266 omse = (double) err2 / NB_ITS / 64;
267 ome = (double) err_sum / NB_ITS / 64;
/* Limits are only checked for inverse transforms: err_inf above 1,
 * omse above 0.02 or |ome| above 0.0015 sets spec_err. */
269 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
271 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
272 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
273 omse, ome, (double) sysErrMax / NB_ITS,
274 maxout, blockSumErrMax);
/* Entries flagged nonspec do not take this branch. */
276 if (spec_err && !dct->nonspec)
/* Timing: one input block is prepared in block1 and copied back into block
 * on every iteration. */
283 init_block(block, test, is_idct, &prng);
284 permute(block1, block, dct->perm_type);
289 for (it = 0; it < NB_ITS_SPEED; it++) {
290 memcpy(block, block1, sizeof(block));
/* Keep looping until the av_gettime() difference reaches 1000000
 * (one second if av_gettime() counts microseconds -- TODO confirm). */
294 ti1 = av_gettime() - ti;
295 } while (ti1 < 1000000);
298 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
299 (double) it1 * 1000.0 / (double) ti1);
/*
 * 64-byte output buffers used by idct248_error(): img_dest receives the
 * output of the implementation under test, img_dest1 that of idct248_ref().
 */
304 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
305 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/*
 * Double-precision reference for the 2-4-8 inverse DCT.  Reads 64
 * coefficients from block and writes an 8x8 array of bytes to dest, with
 * linesize as the distance between output rows.
 */
307 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine tables; static, so they persist across calls. */
310 static double c8[8][8];
311 static double c4[4][4];
312 double block1[64], block2[64], block3[64];
/* 8-point basis: c8[i][j] = s * cos(pi * i * (j + 0.5) / 8), where s is
 * sqrt(1/8) for i == 0 and sqrt(1/4) otherwise. */
319 for (i = 0; i < 8; i++) {
321 for (j = 0; j < 8; j++) {
322 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
323 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
324 sum += c8[i][j] * c8[i][j];
/* 4-point basis, built the same way with sqrt(1/4) and sqrt(1/2). */
328 for (i = 0; i < 4; i++) {
330 for (j = 0; j < 4; j++) {
331 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
332 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
333 sum += c4[i][j] * c4[i][j];
/* Butterfly: each pair of adjacent rows is replaced by its scaled sum
 * (even row) and scaled difference (odd row). */
340 for (i = 0; i < 4; i++) {
341 for (j = 0; j < 8; j++) {
342 block1[8 * (2 * i) + j] =
343 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
344 block1[8 * (2 * i + 1) + j] =
345 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point transform along every row, using c8. */
350 for (i = 0; i < 8; i++) {
351 for (j = 0; j < 8; j++) {
353 for (k = 0; k < 8; k++)
354 sum += c8[k][j] * block1[8 * i + k];
355 block2[8 * i + j] = sum;
/* 4-point transform down every column, applied to the even rows and the
 * odd rows separately, using c4. */
360 for (i = 0; i < 8; i++) {
361 for (j = 0; j < 4; j++) {
364 for (k = 0; k < 4; k++)
365 sum += c4[k][j] * block2[8 * (2 * k) + i];
366 block3[8 * (2 * j) + i] = sum;
370 for (k = 0; k < 4; k++)
371 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
372 block3[8 * (2 * j + 1) + i] = sum;
376 /* clamp and store the result */
377 for (i = 0; i < 8; i++) {
378 for (j = 0; j < 8; j++) {
379 v = block3[8 * i + j];
381 else if (v > 255) v = 255;
/* Round to the nearest integer before storing the byte. */
382 dest[i * linesize + j] = (int) rint(v);
/*
 * Compare a 2-4-8 IDCT "put" implementation against idct248_ref() on random
 * input, print the result under the label `name`, and print a throughput
 * figure from the timing loop.
 */
387 static void idct248_error(const char *name,
388 void (*idct248_put)(uint8_t *dest, int line_size,
/*
 * NOTE(review): ti and ti1 are plain int here although they hold
 * av_gettime() differences; dct_error() declares the same variables as
 * int64_t.
 */
392 int it, i, it1, ti, ti1, err_max, v;
/* The generator is always seeded with the constant 1. */
395 av_lfg_init(&prng, 1);
397 /* just one test to see if code is correct (precision is less
400 for (it = 0; it < NB_ITS; it++) {
401 /* XXX: use forward transform to generate values */
/* Random input values in [-128, 127]. */
402 for (i = 0; i < 64; i++)
403 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* The reference and the implementation under test each get a fresh copy of
 * block1 in block. */
406 for (i = 0; i < 64; i++)
407 block[i] = block1[i];
408 idct248_ref(img_dest1, 8, block);
410 for (i = 0; i < 64; i++)
411 block[i] = block1[i];
412 idct248_put(img_dest, 8, block);
/* Absolute difference between the two outputs, per output byte. */
414 for (i = 0; i < 64; i++) {
415 v = abs((int) img_dest[i] - (int) img_dest1[i]);
417 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* The constant condition always selects "IDCT248". */
422 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing: the input is restored from block1 before every call. */
430 for (it = 0; it < NB_ITS_SPEED; it++) {
431 for (i = 0; i < 64; i++)
432 block[i] = block1[i];
433 idct248_put(img_dest, 8, block);
/* Keep looping until the av_gettime() difference reaches 1000000. */
436 ti1 = av_gettime() - ti;
437 } while (ti1 < 1000000);
440 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
441 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage summary to standard output. */
444 static void help(void)
446 printf("dct-test [-i] [<test-number>]\n"
447 "test-number 0 -> test with random matrixes\n"
448 " 1 -> test with random sparse matrixes\n"
449 " 2 -> do 3. test from mpeg4 std\n"
450 "-i test IDCT implementations\n"
451 "-4 test IDCT248 implementations\n"
456 #include "compat/getopt.c"
/*
 * Entry point: parse the command line, then run either the IDCT248 check or
 * every applicable entry of the selected algorithm table, and report the
 * combined error status.
 */
459 int main(int argc, char **argv)
461 int test_idct = 0, test_248_dct = 0;
/* Accepted option letters: i, h, 4 and t. */
470 c = getopt(argc, argv, "ih4t");
/* First non-option argument is the test number; atoi() yields 0 for
 * non-numeric input without reporting an error. */
491 test = atoi(argv[optind]);
493 printf("Libav DCT/IDCT test\n");
496 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
498 const int cpu_flags = av_get_cpu_flags();
499 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
/* Walk the table until an entry with a null name is reached. */
500 for (i = 0; algos[i].name; i++)
/* Run the entry only when every bit of its cpu_flag is present in
 * cpu_flags; entries with cpu_flag == 0 always run. */
501 if (!(~cpu_flags & algos[i].cpu_flag)) {
502 err |= dct_error(&algos[i], test, test_idct, speed);
507 printf("Error: %d.\n", err);