2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
44 #include "simple_idct.h"
45 #include "aandcttab.h"
50 #include "x86/idct_xvid.h"
51 #include "x86/simple_idct.h"
/*
 * Local prototypes for the ARM IDCT entry points that idct_tab refers to.
 * Each one takes a single pointer to an int16_t coefficient block.
 * NOTE(review): neighbouring lines are missing from this listing, so any
 * preprocessor guard around these declarations is not visible here.
 */
55 void ff_j_rev_dct_arm(int16_t *data);
56 void ff_simple_idct_arm(int16_t *data);
57 void ff_simple_idct_armv5te(int16_t *data);
58 void ff_simple_idct_armv6(int16_t *data);
59 void ff_simple_idct_neon(int16_t *data);
/*
 * NOTE(review): the enclosing declaration header is not visible in this
 * listing; these look like members of struct algo, whose initializers
 * appear in fdct_tab and idct_tab and which is also accessed through
 * name, cpu_flag and nonspec elsewhere in the file.
 */
/* Transform under test; called with a pointer to a coefficient block. */
63 void (*func)(int16_t *block);
/* Coefficient ordering handed to permute() before the transform is run. */
64 enum idct_permutation_type perm_type;
/*
 * Forward DCT implementations selectable by main() when -i is not given.
 * Each entry lists a display name, the transform function, the input
 * permutation and, for CPU-specific versions, the AV_CPU_FLAG_xxx bit that
 * main() checks against av_get_cpu_flags() before running the entry.
 * NOTE(review): several lines are missing from this listing (the remaining
 * preprocessor guards, the terminating entry and the closing brace).
 * main() stops at the first entry whose name is NULL, so a terminator
 * presumably exists.
 */
69 static const struct algo fdct_tab[] = {
/* Portable C implementations: no CPU flag, so they always run. */
70 { "REF-DBL", ff_ref_fdct, FF_IDCT_PERM_NONE },
71 { "FAAN", ff_faandct, FF_IDCT_PERM_NONE },
/* dct_error() rescales the output of this entry by name before comparing. */
72 { "IJG-AAN-INT", ff_fdct_ifast, FF_IDCT_PERM_NONE },
73 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, FF_IDCT_PERM_NONE },
/* CPU-specific implementations, gated on the flag in the fourth field. */
76 { "MMX", ff_fdct_mmx, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX },
78 #if HAVE_MMXEXT_INLINE
79 { "MMXEXT", ff_fdct_mmxext, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMXEXT },
82 { "SSE2", ff_fdct_sse2, FF_IDCT_PERM_NONE, AV_CPU_FLAG_SSE2 },
86 { "altivecfdct", ff_fdct_altivec, FF_IDCT_PERM_NONE, AV_CPU_FLAG_ALTIVEC },
/*
 * Inverse DCT implementations selectable by main() when -i is given.
 * Fields are: display name, transform function, the coefficient permutation
 * the function expects (applied by permute()), the required AV_CPU_FLAG_xxx
 * bit, and an optional fifth value.
 * NOTE(review): the fifth value (1 on the XVID entries) presumably
 * initializes the nonspec member that dct_error() tests before reporting a
 * spec failure - confirm against the struct definition, which is not
 * visible in this listing. Preprocessor guards, the terminating entry and
 * the closing brace are also missing here.
 */
92 static const struct algo idct_tab[] = {
/* Portable C implementations: no CPU flag, so they always run. */
93 { "FAANI", ff_faanidct, FF_IDCT_PERM_NONE },
94 { "REF-DBL", ff_ref_idct, FF_IDCT_PERM_NONE },
95 { "INT", ff_j_rev_dct, FF_IDCT_PERM_LIBMPEG2 },
96 { "SIMPLE-C", ff_simple_idct_8, FF_IDCT_PERM_NONE },
/* x86 implementations. */
99 { "SIMPLE-MMX", ff_simple_idct_mmx, FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_MMX },
100 { "XVID-MMX", ff_idct_xvid_mmx, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX, 1 },
102 #if HAVE_MMXEXT_INLINE
103 { "XVID-MMXEXT", ff_idct_xvid_mmxext, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMXEXT, 1 },
106 { "XVID-SSE2", ff_idct_xvid_sse2, FF_IDCT_PERM_SSE2, AV_CPU_FLAG_SSE2, 1 },
/* ARM implementations; the first two carry no CPU flag. */
110 { "SIMPLE-ARM", ff_simple_idct_arm, FF_IDCT_PERM_NONE },
111 { "INT-ARM", ff_j_rev_dct_arm, FF_IDCT_PERM_LIBMPEG2 },
114 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te, FF_IDCT_PERM_NONE, AV_CPU_FLAG_ARMV5TE },
117 { "SIMPLE-ARMV6", ff_simple_idct_armv6, FF_IDCT_PERM_LIBMPEG2, AV_CPU_FLAG_ARMV6 },
119 #if HAVE_NEON && ARCH_ARM
120 { "SIMPLE-NEON", ff_simple_idct_neon, FF_IDCT_PERM_PARTTRANS, AV_CPU_FLAG_NEON },
/* Shift used by dct_error() when it rescales the "IJG-AAN-INT" output with
 * the ff_aanscales table before comparing it with the reference. */
126 #define AANSCALE_BITS 12
/* Number of transform calls per timing batch in the speed loops of
 * dct_error() and idct248_error(). */
129 #define NB_ITS_SPEED 50000
/*
 * Destination index for each of the 64 source coefficients when permute()
 * is asked for FF_IDCT_PERM_SIMPLE: dst[idct_simple_mmx_perm[i]] = src[i].
 * NOTE(review): the closing brace of the initializer is not visible in
 * this listing.
 */
131 static short idct_simple_mmx_perm[64] = {
132 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
133 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
134 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
135 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
136 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
137 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
138 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
139 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Column reordering used by permute() for FF_IDCT_PERM_SSE2: the low three
 * bits of the source index are replaced by this table's entry while the row
 * bits (i & 0x38) are kept. */
142 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/*
 * File-scope 64-coefficient work buffers shared by dct_error() and
 * idct248_error(). They are declared through DECLARE_ALIGNED with first
 * arguments 16 and 8 (presumably the requested byte alignment - confirm
 * against the macro definition).
 */
144 DECLARE_ALIGNED(16, static int16_t, block)[64];
145 DECLARE_ALIGNED(8, static int16_t, block1)[64];
/*
 * Fill a 64-coefficient block with test input drawn from prng.
 *
 * block   - destination, cleared to zero before anything else is written
 * test    - selects the input pattern
 * is_idct - passed by the callers; its use is not visible in this listing
 * prng    - generator state passed to av_lfg_get()
 *
 * NOTE(review): the switch/case lines, the local declarations and at least
 * one loop body are missing from this listing. The three visible fill
 * patterns presumably correspond to test numbers 0, 1 and 2 as described
 * in help() - confirm against the complete source.
 */
147 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
151 memset(block, 0, 64 * sizeof(*block));
/* Dense pattern: every coefficient gets av_lfg_get() % 512, minus 256. */
155 for (i = 0; i < 64; i++)
156 block[i] = (av_lfg_get(prng) % 512) - 256;
/* NOTE(review): the body of the next loop is not visible in this listing. */
159 for (i = 0; i < 64; i++)
/* Sparse pattern: between 1 and 10 writes to randomly chosen positions
 * (positions may repeat, so fewer distinct coefficients can end up set). */
164 j = av_lfg_get(prng) % 10 + 1;
165 for (i = 0; i < j; i++)
166 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
/* Two-coefficient pattern: a random first coefficient and a last
 * coefficient equal to the inverted low bit of the first. */
169 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
170 block[63] = (block[0] & 1) ^ 1;
/*
 * Copy the 64 coefficients of src into dst, reordered the way the
 * implementation identified by perm_type expects its input.
 *
 * In every visible case the source is read in natural order and the
 * destination index is computed from the source index i.
 *
 * NOTE(review): the switch statement, the break statements and the body of
 * the final loop are missing from this listing; the last loop is presumably
 * the unpermuted copy used for the remaining permutation types.
 */
175 static void permute(int16_t dst[64], const int16_t src[64],
176 enum idct_permutation_type perm_type)
181 case FF_IDCT_PERM_LIBMPEG2:
/* Row bits (3-5) are kept; within a row, bits 1-2 move down one position
 * and bit 0 becomes bit 2. */
182 for (i = 0; i < 64; i++)
183 dst[(i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2)] = src[i];
185 case FF_IDCT_PERM_SIMPLE:
/* Fully table-driven reordering. */
186 for (i = 0; i < 64; i++)
187 dst[idct_simple_mmx_perm[i]] = src[i];
189 case FF_IDCT_PERM_SSE2:
/* Row bits are kept; the column is looked up in idct_sse2_row_perm. */
190 for (i = 0; i < 64; i++)
191 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
193 case FF_IDCT_PERM_PARTTRANS:
/* Bits 2 and 5 are kept; bits 0-1 are exchanged with bits 3-4. */
194 for (i = 0; i < 64; i++)
195 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
198 for (i = 0; i < 64; i++)
/*
 * Measure the accuracy, and optionally the speed, of one transform from
 * fdct_tab or idct_tab against the reference implementation.
 *
 * dct     - table entry describing the implementation under test
 * test    - input pattern number forwarded to init_block()
 * is_idct - non-zero to compare against ff_ref_idct, zero for ff_ref_fdct
 * speed   - passed by main(); the condition that uses it is not visible
 *
 * Prints per-coefficient systematic errors and a one-line summary, and in
 * the timing part a throughput figure. main() ORs the return value into
 * its error flag.
 *
 * NOTE(review): many lines are missing from this listing, including the
 * calls to dct->func and ref, the updates of err_inf, err2, err_sum and
 * blockSumErr, the return statements and all closing braces. Comments
 * below describe only what the visible lines show.
 */
204 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
/* Reference transform for the direction being tested. */
206 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
209 int64_t err2, ti, ti1, it1, err_sum = 0;
/* Signed error accumulated per coefficient position over all iterations. */
210 int64_t sysErr[64], sysErrMax = 0;
212 int blockSumErrMax = 0, blockSumErr;
/* Fixed seed, so every run sees the same input sequence. */
217 av_lfg_init(&prng, 1);
221 for (i = 0; i < 64; i++)
223 for (it = 0; it < NB_ITS; it++) {
/* Generate the input in natural order, then reorder a copy for the
 * implementation under test. */
224 init_block(block1, test, is_idct, &prng);
225 permute(block, block1, dct->perm_type);
/* This entry's output is multiplied by a per-coefficient factor derived
 * from ff_aanscales before it is compared (presumably because the routine
 * leaves its result scaled - confirm). */
230 if (!strcmp(dct->name, "IJG-AAN-INT")) {
231 for (i = 0; i < 64; i++) {
232 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
233 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Per-coefficient difference between block and block1 (presumably tested
 * output versus reference output; the transform calls are not visible). */
240 for (i = 0; i < 64; i++) {
241 int err = block[i] - block1[i];
247 sysErr[i] += block[i] - block1[i];
/* Track the largest output magnitude seen. */
249 if (abs(block[i]) > maxout)
250 maxout = abs(block[i]);
252 if (blockSumErrMax < blockSumErr)
253 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated error over all coefficient positions. */
255 for (i = 0; i < 64; i++)
256 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
258 for (i = 0; i < 64; i++) {
261 printf("%7d ", (int) sysErr[i]);
/* Averages over NB_ITS blocks of 64 coefficients each. */
265 omse = (double) err2 / NB_ITS / 64;
266 ome = (double) err_sum / NB_ITS / 64;
/* Accuracy limits are only applied to inverse transforms.
 * NOTE(review): these thresholds look like IEEE 1180-style limits -
 * confirm. */
268 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
270 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
271 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
272 omse, ome, (double) sysErrMax / NB_ITS,
273 maxout, blockSumErrMax);
/* Entries marked nonspec are exempt from the accuracy limits; the
 * statement controlled by this condition is not visible here. */
275 if (spec_err && !dct->nonspec)
/* Timing part: note the argument order is the reverse of the accuracy
 * loop, so block1 holds the permuted input and block is the scratch copy
 * refreshed before every call. */
282 init_block(block, test, is_idct, &prng);
283 permute(block1, block, dct->perm_type);
288 for (it = 0; it < NB_ITS_SPEED; it++) {
289 memcpy(block, block1, sizeof(block));
/* Keep running batches until at least 1000000 time units have elapsed
 * (the kdct/s formula below is consistent with av_gettime() counting
 * microseconds - confirm). */
293 ti1 = av_gettime() - ti;
294 } while (ti1 < 1000000);
297 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
298 (double) it1 * 1000.0 / (double) ti1);
/* 8x8 pixel output buffers for idct248_error(): img_dest receives the
 * output of the implementation under test, img_dest1 the output of
 * idct248_ref(). */
303 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
304 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/*
 * Double-precision reference used by idct248_error() to check an
 * idct248_put implementation.
 *
 * dest     - output pixels, written as dest[row * linesize + column]
 * linesize - distance in bytes between successive output rows
 * block    - 64 input coefficients, read only
 *
 * Visible stages: build the 8-point and 4-point cosine tables, combine
 * adjacent row pairs into scaled sums and differences, apply the 8-point
 * table along each row, apply the 4-point table down each column
 * separately for even and odd rows, then clamp, round and store.
 *
 * NOTE(review): the guard around table initialisation, the assignment of
 * the scale factor s used in the row-pair stage, the resets of sum and the
 * lower clamp are missing from this listing.
 */
306 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine tables are static, so they persist between calls. */
309 static double c8[8][8];
310 static double c4[4][4];
311 double block1[64], block2[64], block3[64];
/* 8-point cosine basis: row 0 is scaled by sqrt(1/8), the others by
 * sqrt(1/4). sum accumulates the squared entries; its use is not visible. */
318 for (i = 0; i < 8; i++) {
320 for (j = 0; j < 8; j++) {
321 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
322 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
323 sum += c8[i][j] * c8[i][j];
/* 4-point cosine basis: row 0 is scaled by sqrt(1/4), the others by
 * sqrt(1/2). */
327 for (i = 0; i < 4; i++) {
329 for (j = 0; j < 4; j++) {
330 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
331 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
332 sum += c4[i][j] * c4[i][j];
/* For each pair of adjacent input rows (2i, 2i+1): the even row receives
 * the scaled sum and the odd row the scaled difference. */
339 for (i = 0; i < 4; i++) {
340 for (j = 0; j < 8; j++) {
341 block1[8 * (2 * i) + j] =
342 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
343 block1[8 * (2 * i + 1) + j] =
344 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point pass along each row i. */
349 for (i = 0; i < 8; i++) {
350 for (j = 0; j < 8; j++) {
352 for (k = 0; k < 8; k++)
353 sum += c8[k][j] * block1[8 * i + k];
354 block2[8 * i + j] = sum;
/* 4-point pass down each column i: first over the even rows, writing the
 * even output rows, then over the odd rows, writing the odd output rows. */
359 for (i = 0; i < 8; i++) {
360 for (j = 0; j < 4; j++) {
363 for (k = 0; k < 4; k++)
364 sum += c4[k][j] * block2[8 * (2 * k) + i];
365 block3[8 * (2 * j) + i] = sum;
369 for (k = 0; k < 4; k++)
370 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
371 block3[8 * (2 * j + 1) + i] = sum;
375 /* clamp and store the result */
376 for (i = 0; i < 8; i++) {
377 for (j = 0; j < 8; j++) {
378 v = block3[8 * i + j];
/* Only the upper clamp is visible; the preceding branch is missing. */
380 else if (v > 255) v = 255;
/* Round to nearest integer before storing the pixel. */
381 dest[i * linesize + j] = (int) rint(v);
/*
 * Compare an idct248_put implementation with idct248_ref() on random
 * input, print the result, and time the implementation.
 *
 * name        - label printed in the result lines
 * idct248_put - implementation under test; called as
 *               idct248_put(img_dest, 8, block)
 *
 * NOTE(review): the end of the parameter list is missing from this
 * listing; main() passes a third argument (speed). The err_max update, the
 * condition guarding the per-pixel printf, the timing setup and all
 * closing braces are missing as well.
 */
386 static void idct248_error(const char *name,
387 void (*idct248_put)(uint8_t *dest, int line_size,
391 int it, i, it1, ti, ti1, err_max, v;
/* Fixed seed, so every run sees the same input sequence. */
394 av_lfg_init(&prng, 1);
/* NOTE(review): the comment that starts on the next line is truncated in
 * this listing; its closing line is missing. */
396 /* just one test to see if code is correct (precision is less
399 for (it = 0; it < NB_ITS; it++) {
400 /* XXX: use forward transform to generate values */
401 for (i = 0; i < 64; i++)
402 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* The input is copied from block1 into block before each transform call. */
405 for (i = 0; i < 64; i++)
406 block[i] = block1[i];
407 idct248_ref(img_dest1, 8, block);
409 for (i = 0; i < 64; i++)
410 block[i] = block1[i];
411 idct248_put(img_dest, 8, block);
/* Absolute per-pixel difference between tested and reference output. */
413 for (i = 0; i < 64; i++) {
414 v = abs((int) img_dest[i] - (int) img_dest1[i]);
416 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* The constant condition always selects the "IDCT248" label. */
421 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing part: batches of NB_ITS_SPEED calls, each on a fresh copy of the
 * last generated input, repeated until at least 1000000 time units have
 * elapsed (consistent with av_gettime() counting microseconds - confirm). */
429 for (it = 0; it < NB_ITS_SPEED; it++) {
430 for (i = 0; i < 64; i++)
431 block[i] = block1[i];
432 idct248_put(img_dest, 8, block);
435 ti1 = av_gettime() - ti;
436 } while (ti1 < 1000000);
439 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
440 (double) it1 * 1000.0 / (double) ti1);
/*
 * Print the command-line usage summary: the three test numbers and the
 * -i and -4 switches.
 * NOTE(review): the end of the printf call is missing from this listing,
 * so further option lines may exist; main() also accepts 'h' and 't' in
 * its getopt string.
 */
443 static void help(void)
445 printf("dct-test [-i] [<test-number>]\n"
446 "test-number 0 -> test with random matrixes\n"
447 " 1 -> test with random sparse matrixes\n"
448 " 2 -> do 3. test from mpeg4 std\n"
449 "-i test IDCT implementations\n"
450 "-4 test IDCT248 implementations\n"
455 #include "compat/getopt.c"
/*
 * Program entry point: parse the options, then run either the IDCT248
 * check or every applicable entry of the forward or inverse table.
 *
 * NOTE(review): the option switch, the declarations of test, speed, err, c
 * and i, the branch conditions and the return statement are missing from
 * this listing. Which option sets which variable is therefore not visible
 * here.
 */
458 int main(int argc, char **argv)
460 int test_idct = 0, test_248_dct = 0;
/* Accepted option letters: i, h, 4 and t. */
469 c = getopt(argc, argv, "ih4t");
/* The first non-option argument is taken as the test number; atoi()
 * performs no error checking. */
490 test = atoi(argv[optind]);
492 printf("Libav DCT/IDCT test\n");
495 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
497 const int cpu_flags = av_get_cpu_flags();
498 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
/* Walk the table until the entry with a NULL name. An entry runs only
 * when every bit of its cpu_flag is present in cpu_flags; entries with a
 * zero flag always run. */
499 for (i = 0; algos[i].name; i++)
500 if (!(~cpu_flags & algos[i].cpu_flag)) {
/* Accumulate failures from all implementations. */
501 err |= dct_error(&algos[i], test, test_idct, speed);
506 printf("Error: %d.\n", err);