2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
43 #include "simple_idct.h"
44 #include "aandcttab.h"
47 #include "x86/idct_xvid.h"
/* Prototypes for architecture-specific transforms that the fdct_tab and
 * idct_tab entries in this file refer to. Each takes a pointer to int16_t
 * data and returns nothing.
 * NOTE(review): the preprocessor guards that normally surround these
 * declarations are not visible in this excerpt. */
51 void ff_fdct_altivec(int16_t *block);
54 void ff_j_rev_dct_arm(int16_t *data);
55 void ff_simple_idct_arm(int16_t *data);
56 void ff_simple_idct_armv5te(int16_t *data);
57 void ff_simple_idct_armv6(int16_t *data);
58 void ff_simple_idct_neon(int16_t *data);
/* Entry point of the transform under test; it is handed a pointer to an
 * int16_t block. */
62 void (*func)(int16_t *block);
/* Coefficient layout tag of the implementation. permute() reorders the test
 * block according to this tag, and dct_error() rescales the output when the
 * tag is SCALE_PERM. */
63 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
64 SSE2_PERM, PARTTRANS_PERM } format;
/* Forward DCT implementations under test. Each entry lists the name printed
 * in the report, the function, its format tag and, where given, the CPU flag
 * that main() requires to be present before it runs the entry.
 * NOTE(review): only some of the #if guards and no terminating entry are
 * visible in this excerpt; main() stops at the first entry with a null name. */
69 static const struct algo fdct_tab[] = {
70 { "REF-DBL", ff_ref_fdct, NO_PERM },
71 { "FAAN", ff_faandct, NO_PERM },
72 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
73 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
76 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
78 #if HAVE_MMXEXT_INLINE
79 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT },
82 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
86 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
/* Inverse DCT implementations under test, in the same entry layout as
 * fdct_tab: report name, function, format tag, optional CPU flag.
 * NOTE(review): the trailing 1 on the XVID entries presumably sets the
 * nonspec field that dct_error() checks before reporting a spec failure;
 * confirm against the struct algo declaration, which is not fully visible. */
92 static const struct algo idct_tab[] = {
93 { "FAANI", ff_faanidct, NO_PERM },
94 { "REF-DBL", ff_ref_idct, NO_PERM },
95 { "INT", ff_j_rev_dct, MMX_PERM },
96 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
99 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
100 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
102 #if HAVE_MMXEXT_INLINE
103 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
106 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
110 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
111 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
114 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
117 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
119 #if HAVE_NEON && ARCH_ARM
120 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
/* Fixed-point shift used by dct_error() when it rescales the output of a
 * SCALE_PERM transform with the ff_aanscales table. */
126 #define AANSCALE_BITS 12
/* Number of transform calls per timing batch in the speed loops of
 * dct_error() and idct248_error(). */
129 #define NB_ITS_SPEED 50000
/* Destination index for each source coefficient when the format is MMX_PERM;
 * filled at run time by idct_mmx_init(). */
131 static short idct_mmx_perm[64];
/* Destination index for each source coefficient when the format is
 * MMX_SIMPLE_PERM (permute() stores src[i] at dst[idct_simple_mmx_perm[i]]). */
133 static short idct_simple_mmx_perm[64] = {
134 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
135 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
136 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
137 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
138 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
139 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
140 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
141 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Reordering applied by permute() to the low three bits of the index when
 * the format is SSE2_PERM; bits 3-5 of the index are kept as they are. */
144 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[]. Entry i keeps bits 3-5 of i, moves bits 1-2 down by
 * one position and moves bit 0 up to bit 2. */
146 static void idct_mmx_init(void)
150 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
151 for (i = 0; i < 64; i++) {
152 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* Two 64-entry int16_t work buffers shared by the test routines. In the
 * accuracy loop of dct_error(), init_block() fills block1 and permute()
 * writes the reordered copy into block. */
156 DECLARE_ALIGNED(16, static int16_t, block)[64];
157 DECLARE_ALIGNED(8, static int16_t, block1)[64];
/* Fill a 64-coefficient block with pseudo-random test data drawn from prng.
 * The block is zeroed first; the statements below then write one of several
 * patterns.
 * NOTE(review): the lines that choose a pattern from `test` and `is_idct`
 * are not visible in this excerpt. */
159 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
163 memset(block, 0, 64 * sizeof(*block));
/* Dense pattern: every coefficient is set to av_lfg_get() % 512 - 256. */
167 for (i = 0; i < 64; i++)
168 block[i] = (av_lfg_get(prng) % 512) - 256;
171 for (i = 0; i < 64; i++)
/* Sparse pattern: between 1 and 10 writes to randomly chosen positions,
 * each with a value of av_lfg_get() % 512 - 256. */
176 j = av_lfg_get(prng) % 10 + 1;
177 for (i = 0; i < j; i++)
178 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
/* Two-coefficient pattern: block[0] is av_lfg_get() % 4096 - 2048 and
 * block[63] is the inverted low bit of block[0]. */
181 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
182 block[63] = (block[0] & 1) ^ 1;
/* Copy the 64 coefficients of src into dst, reordered for the layout named
 * by perm (one of the formattag values). For every handled layout src[i] is
 * stored at a computed destination index in dst. */
187 static void permute(int16_t dst[64], const int16_t src[64], int perm)
191 if (perm == MMX_PERM) {
192 for (i = 0; i < 64; i++)
193 dst[idct_mmx_perm[i]] = src[i];
194 } else if (perm == MMX_SIMPLE_PERM) {
195 for (i = 0; i < 64; i++)
196 dst[idct_simple_mmx_perm[i]] = src[i];
197 } else if (perm == SSE2_PERM) {
/* Bits 3-5 of the index are kept; the low three bits go through
 * idct_sse2_row_perm. */
198 for (i = 0; i < 64; i++)
199 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
200 } else if (perm == PARTTRANS_PERM) {
/* Bits 2 and 5 of the index stay in place; bits 0-1 and bits 3-4 swap
 * positions. */
201 for (i = 0; i < 64; i++)
202 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* NOTE(review): fallback for the remaining formats; its loop body is not
 * visible in this excerpt. */
204 for (i = 0; i < 64; i++)
/* Measure one transform implementation against the reference transform
 * (ff_ref_idct when is_idct is set, ff_ref_fdct otherwise), print the error
 * statistics, and run a timing loop that prints the throughput.
 *
 * dct      table entry describing the implementation under test
 * test     test pattern number, passed on to init_block()
 * is_idct  non-zero when an inverse transform is being tested
 * speed    NOTE(review): presumably enables the timing loop; the check is
 *          not visible in this excerpt
 *
 * main() ORs the return value into its error status.
 * NOTE(review): the calls to dct->func and ref, the accumulation of
 * err_inf/err2/err_sum/blockSumErr and the return statements are not visible
 * in this excerpt. */
209 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
211 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
214 int64_t err2, ti, ti1, it1, err_sum = 0;
215 int64_t sysErr[64], sysErrMax = 0;
217 int blockSumErrMax = 0, blockSumErr;
/* The generator is seeded with the constant 1. */
222 av_lfg_init(&prng, 1);
226 for (i = 0; i < 64; i++)
228 for (it = 0; it < NB_ITS; it++) {
/* block1 receives the test input in natural order; block receives the same
 * input reordered for the implementation's format. */
229 init_block(block1, test, is_idct, &prng);
230 permute(block, block1, dct->format);
/* SCALE_PERM output is rescaled per coefficient with ff_aanscales before it
 * is compared. */
235 if (dct->format == SCALE_PERM) {
236 for (i = 0; i < 64; i++) {
237 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
238 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Per-coefficient comparison of block against block1. */
245 for (i = 0; i < 64; i++) {
246 int err = block[i] - block1[i];
/* sysErr[i] accumulates the signed difference at position i over all
 * iterations. */
252 sysErr[i] += block[i] - block1[i];
/* maxout tracks the largest absolute value seen in block. */
254 if (abs(block[i]) > maxout)
255 maxout = abs(block[i]);
257 if (blockSumErrMax < blockSumErr)
258 blockSumErrMax = blockSumErr;
/* sysErrMax is the largest absolute accumulated difference over the 64
 * positions. */
260 for (i = 0; i < 64; i++)
261 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
263 for (i = 0; i < 64; i++) {
266 printf("%7d ", (int) sysErr[i]);
/* omse and ome are err2 and err_sum divided by NB_ITS * 64. */
270 omse = (double) err2 / NB_ITS / 64;
271 ome = (double) err_sum / NB_ITS / 64;
/* Accuracy limits are applied to inverse transforms only: err_inf above 1,
 * omse above 0.02 or |ome| above 0.0015 sets spec_err. */
273 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
275 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
276 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
277 omse, ome, (double) sysErrMax / NB_ITS,
278 maxout, blockSumErrMax);
/* Entries with nonspec set are exempt from this check. */
280 if (spec_err && !dct->nonspec)
/* Timing: a fresh input is generated into block and its reordered copy is
 * kept in block1. */
287 init_block(block, test, is_idct, &prng);
288 permute(block1, block, dct->format);
/* Each batch restores block from block1 NB_ITS_SPEED times; batches repeat
 * until the av_gettime() difference ti1 reaches 1000000. */
293 for (it = 0; it < NB_ITS_SPEED; it++) {
294 memcpy(block, block1, sizeof(block));
298 ti1 = av_gettime() - ti;
299 } while (ti1 < 1000000);
/* The reported figure is it1 * 1000 / ti1. */
302 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
303 (double) it1 * 1000.0 / (double) ti1);
/* 64-byte output buffers used by idct248_error(): img_dest receives the
 * output of the function under test, img_dest1 the output of idct248_ref(). */
308 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
309 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Double-precision reference for the 2-4-8 inverse transform. Reads the 64
 * coefficients in block and stores 8x8 bytes into dest, with rows linesize
 * bytes apart.
 * NOTE(review): the guard around table construction, the value of the
 * butterfly scale factor s and the lower clamp bound are not visible in this
 * excerpt. */
311 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
314 static double c8[8][8];
315 static double c4[4][4];
316 double block1[64], block2[64], block3[64];
/* Build the 8-point cosine table c8; row 0 is scaled by sqrt(1/8), the other
 * rows by sqrt(1/4). */
323 for (i = 0; i < 8; i++) {
325 for (j = 0; j < 8; j++) {
326 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
327 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
328 sum += c8[i][j] * c8[i][j];
/* Build the 4-point cosine table c4; row 0 is scaled by sqrt(1/4), the other
 * rows by sqrt(1/2). */
332 for (i = 0; i < 4; i++) {
334 for (j = 0; j < 4; j++) {
335 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
336 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
337 sum += c4[i][j] * c4[i][j];
/* Butterfly: each pair of adjacent input rows (2i, 2i+1) is replaced by its
 * scaled sum and scaled difference. */
344 for (i = 0; i < 4; i++) {
345 for (j = 0; j < 8; j++) {
346 block1[8 * (2 * i) + j] =
347 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
348 block1[8 * (2 * i + 1) + j] =
349 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point pass along each row of block1 using c8; result goes to block2. */
354 for (i = 0; i < 8; i++) {
355 for (j = 0; j < 8; j++) {
357 for (k = 0; k < 8; k++)
358 sum += c8[k][j] * block1[8 * i + k];
359 block2[8 * i + j] = sum;
/* 4-point passes down each column of block2 using c4, run separately over
 * the even rows and over the odd rows; result goes to block3. */
364 for (i = 0; i < 8; i++) {
365 for (j = 0; j < 4; j++) {
368 for (k = 0; k < 4; k++)
369 sum += c4[k][j] * block2[8 * (2 * k) + i];
370 block3[8 * (2 * j) + i] = sum;
374 for (k = 0; k < 4; k++)
375 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
376 block3[8 * (2 * j + 1) + i] = sum;
380 /* clamp and store the result */
381 for (i = 0; i < 8; i++) {
382 for (j = 0; j < 8; j++) {
383 v = block3[8 * i + j];
385 else if (v > 255) v = 255;
/* The stored value is rint(v) converted to int. */
386 dest[i * linesize + j] = (int) rint(v);
/* Compare a 2-4-8 IDCT "put" function against idct248_ref() on random
 * blocks, print the result as err_inf, and run a timing loop that prints the
 * throughput.
 *
 * name         label printed in the report lines
 * idct248_put  function under test; called as idct248_put(img_dest, 8, block)
 *
 * NOTE(review): the remaining parameters, the update of err_max and the
 * condition guarding the timing loop are not visible in this excerpt. */
391 static void idct248_error(const char *name,
392 void (*idct248_put)(uint8_t *dest, int line_size,
396 int it, i, it1, ti, ti1, err_max, v;
399 av_lfg_init(&prng, 1);
401 /* just one test to see if code is correct (precision is less
404 for (it = 0; it < NB_ITS; it++) {
405 /* XXX: use forward transform to generate values */
406 for (i = 0; i < 64; i++)
407 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* block is reloaded from block1 before each of the two transform calls. */
410 for (i = 0; i < 64; i++)
411 block[i] = block1[i];
412 idct248_ref(img_dest1, 8, block);
414 for (i = 0; i < 64; i++)
415 block[i] = block1[i];
416 idct248_put(img_dest, 8, block);
/* v is the absolute difference between the two outputs at position i. */
418 for (i = 0; i < 64; i++) {
419 v = abs((int) img_dest[i] - (int) img_dest1[i]);
421 printf("%d %d\n", img_dest[i], img_dest1[i]);
426 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing: batches of NB_ITS_SPEED calls repeat until the av_gettime()
 * difference ti1 reaches 1000000. */
434 for (it = 0; it < NB_ITS_SPEED; it++) {
435 for (i = 0; i < 64; i++)
436 block[i] = block1[i];
437 idct248_put(img_dest, 8, block);
440 ti1 = av_gettime() - ti;
441 } while (ti1 < 1000000);
/* The reported figure is it1 * 1000 / ti1. */
444 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
445 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage summary to standard output.
 * NOTE(review): the end of the usage string is not visible in this excerpt. */
448 static void help(void)
450 printf("dct-test [-i] [<test-number>]\n"
451 "test-number 0 -> test with random matrixes\n"
452 " 1 -> test with random sparse matrixes\n"
453 " 2 -> do 3. test from mpeg4 std\n"
454 "-i test IDCT implementations\n"
455 "-4 test IDCT248 implementations\n"
460 #include "compat/getopt.c"
/* Program entry point: parse the command line, then run either the 2-4-8
 * IDCT check or every applicable entry of the selected transform table.
 * NOTE(review): the option-handling switch, the conditions selecting between
 * the two modes and the return statement are not visible in this excerpt. */
463 int main(int argc, char **argv)
465 int test_idct = 0, test_248_dct = 0;
/* The option letters accepted are i, h, 4 and t. */
475 c = getopt(argc, argv, "ih4t");
/* The first non-option argument is read as the test number. */
496 test = atoi(argv[optind]);
498 printf("Libav DCT/IDCT test\n");
501 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
503 const int cpu_flags = av_get_cpu_flags();
/* idct_tab is walked when test_idct is set, fdct_tab otherwise; the walk
 * stops at the first entry with a null name. */
504 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
505 for (i = 0; algos[i].name; i++)
/* An entry runs only when every bit of its cpu_flag is set in cpu_flags. */
506 if (!(~cpu_flags & algos[i].cpu_flag)) {
507 err |= dct_error(&algos[i], test, test_idct, speed);
512 printf("Error: %d.\n", err);