2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
43 #include "simple_idct.h"
44 #include "aandcttab.h"
47 #include "x86/idct_xvid.h"
/* Prototypes for architecture-specific transforms that are referenced by the
 * fdct_tab and idct_tab tables in this file. Each takes a pointer to an
 * int16_t coefficient buffer. */
51 void ff_fdct_altivec(int16_t *block);
/* ARM variants; all of them appear in idct_tab. */
54 void ff_j_rev_dct_arm(int16_t *data);
55 void ff_simple_idct_arm(int16_t *data);
56 void ff_simple_idct_armv5te(int16_t *data);
57 void ff_simple_idct_armv6(int16_t *data);
58 void ff_simple_idct_neon(int16_t *data);
/* Transform under test; it is handed a pointer to the coefficient block. */
62 void (*func)(int16_t *block);
/* Coefficient layout the transform expects. permute() reorders the input
 * according to this value, and dct_error() rescales the block when it is
 * SCALE_PERM. */
63 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
64 SSE2_PERM, PARTTRANS_PERM } format;
/* Forward DCT implementations exercised by this test. Each entry gives a
 * display name, the transform function and the permutation format it expects;
 * some entries add a CPU-flag value as a fourth initializer.
 * NOTE(review): the fourth initializer presumably fills the mm_support field
 * that main() compares against av_get_cpu_flags() -- the struct layout is not
 * fully visible here, confirm. */
71 static const struct algo fdct_tab[] = {
72 { "REF-DBL", ff_ref_fdct, NO_PERM },
73 { "FAAN", ff_faandct, NO_PERM },
74 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
75 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
78 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
80 #if HAVE_MMXEXT_INLINE
81 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT },
84 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
88 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
/* Inverse DCT implementations exercised by this test. Each entry gives a
 * display name, the transform function and the permutation format that
 * permute() must apply to the input before the function is called; some
 * entries add a CPU-flag value and a trailing 1.
 * NOTE(review): the fourth initializer presumably fills mm_support (checked
 * in main()) and the trailing 1 presumably sets the nonspec field that
 * dct_error() consults -- the struct layout is not fully visible, confirm. */
94 static const struct algo idct_tab[] = {
95 { "FAANI", ff_faanidct, NO_PERM },
96 { "REF-DBL", ff_ref_idct, NO_PERM },
97 { "INT", ff_j_rev_dct, MMX_PERM },
98 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
101 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
102 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
104 #if HAVE_MMXEXT_INLINE
105 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
108 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
112 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
113 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
116 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
119 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
121 #if HAVE_NEON && ARCH_ARM
122 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
/* Fixed-point shift used by dct_error() when it rescales SCALE_PERM blocks
 * with the ff_aanscales table. */
128 #define AANSCALE_BITS 12
/* Number of transform calls per timing round in the speed loops of
 * dct_error() and idct248_error(). */
131 #define NB_ITS_SPEED 50000
/* Destination index for each source coefficient under MMX_PERM; filled at
 * run time by idct_mmx_init(). */
133 static short idct_mmx_perm[64];
/* Destination index for each source coefficient under MMX_SIMPLE_PERM;
 * permute() writes src[i] to dst[idct_simple_mmx_perm[i]]. */
135 static short idct_simple_mmx_perm[64] = {
136 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
137 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
138 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
139 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
140 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
141 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
142 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
143 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Column remapping for SSE2_PERM: permute() keeps the row bits (i & 0x38)
 * and sends column (i & 7) to the column listed here. */
146 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[] with the index mapping that permute() applies for
 * MMX_PERM. Takes no arguments and writes only that table. */
148 static void idct_mmx_init(void)
152 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
153 for (i = 0; i < 64; i++) {
/* keep the row bits (0x38); inside the row, bits 1-2 move down to bits 0-1
 * and bit 0 moves up to bit 2 */
154 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* Work buffers of 64 coefficients shared by dct_error() and idct248_error().
 * They are declared through DECLARE_ALIGNED with first arguments 16 and 8
 * (presumably the requested alignment in bytes -- confirm against the macro). */
158 DECLARE_ALIGNED(16, static int16_t, block)[64];
159 DECLARE_ALIGNED(8, static int16_t, block1)[64];
/* Fill a 64-coefficient block with test input drawn from the given PRNG.
 * The block is zeroed first; what is written afterwards depends on the
 * arguments.
 * NOTE(review): the statements that choose between the fill patterns below
 * are not visible in this excerpt, so the mapping from test / is_idct to each
 * pattern must be confirmed against the full source. */
161 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
165 memset(block, 0, 64 * sizeof(*block));
/* dense pattern: every coefficient is set to (random % 512) - 256 */
169 for (i = 0; i < 64; i++)
170 block[i] = (av_lfg_get(prng) % 512) - 256;
173 for (i = 0; i < 64; i++)
/* sparse pattern: (random % 10) + 1 writes of (random % 512) - 256, each to
 * a random position (positions may repeat) */
178 j = av_lfg_get(prng) % 10 + 1;
179 for (i = 0; i < j; i++)
180 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
/* two-coefficient pattern: block[0] is (random % 4096) - 2048 and block[63]
 * is the inverse of block[0]'s lowest bit */
183 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
184 block[63] = (block[0] & 1) ^ 1;
/* Copy src into dst, reordering the 64 coefficients for the layout named by
 * perm (a formattag value). For every source index i the destination index is:
 *   MMX_PERM         idct_mmx_perm[i]
 *   MMX_SIMPLE_PERM  idct_simple_mmx_perm[i]
 *   SSE2_PERM        same row (i & 0x38), column idct_sse2_row_perm[i & 7]
 *   PARTTRANS_PERM   (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)
 * NOTE(review): the body of the last loop, which handles the remaining
 * formats, is not visible here; presumably a straight copy -- confirm. */
189 static void permute(int16_t dst[64], const int16_t src[64], int perm)
193 if (perm == MMX_PERM) {
194 for (i = 0; i < 64; i++)
195 dst[idct_mmx_perm[i]] = src[i];
196 } else if (perm == MMX_SIMPLE_PERM) {
197 for (i = 0; i < 64; i++)
198 dst[idct_simple_mmx_perm[i]] = src[i];
199 } else if (perm == SSE2_PERM) {
200 for (i = 0; i < 64; i++)
201 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
202 } else if (perm == PARTTRANS_PERM) {
203 for (i = 0; i < 64; i++)
204 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
206 for (i = 0; i < 64; i++)
/* Measure the accuracy, and optionally the speed, of one table entry.
 *   dct     entry from fdct_tab or idct_tab
 *   test    input pattern selector, forwarded to init_block()
 *   is_idct non-zero when the entry is an inverse transform
 *   speed   speed-test switch (the line that tests it is not visible here)
 * Returns an int that main() ORs into its overall error status.
 * NOTE(review): several statements of this function (the calls to the
 * transform and the reference, accumulator updates, return statements) are
 * missing from this excerpt; comments describe only the visible lines. */
211 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
/* reference transform is chosen by direction */
213 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
216 int64_t err2, ti, ti1, it1, err_sum = 0;
217 int64_t sysErr[64], sysErrMax = 0;
219 int blockSumErrMax = 0, blockSumErr;
/* fixed seed, so every run sees the same input sequence */
224 av_lfg_init(&prng, 1);
228 for (i = 0; i < 64; i++)
230 for (it = 0; it < NB_ITS; it++) {
/* block1 holds the generated input, block the copy reordered for this entry */
231 init_block(block1, test, is_idct, &prng);
232 permute(block, block1, dct->format);
/* SCALE_PERM entries: rescale each coefficient of block by a factor derived
 * from ff_aanscales[i], in AANSCALE_BITS fixed point */
237 if (dct->format == SCALE_PERM) {
238 for (i = 0; i < 64; i++) {
239 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
240 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* per-coefficient difference between block and block1 */
247 for (i = 0; i < 64; i++) {
248 int err = block[i] - block1[i];
/* signed error accumulated per coefficient position across iterations */
254 sysErr[i] += block[i] - block1[i];
/* largest output magnitude seen */
256 if (abs(block[i]) > maxout)
257 maxout = abs(block[i]);
259 if (blockSumErrMax < blockSumErr)
260 blockSumErrMax = blockSumErr;
/* largest absolute accumulated error over the 64 positions */
262 for (i = 0; i < 64; i++)
263 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
265 for (i = 0; i < 64; i++) {
268 printf("%7d ", (int) sysErr[i]);
/* err2 and err_sum averaged over all 64 coefficients of all NB_ITS blocks */
272 omse = (double) err2 / NB_ITS / 64;
273 ome = (double) err_sum / NB_ITS / 64;
/* accuracy limits are applied to inverse transforms only
 * NOTE(review): thresholds look like the IEEE 1180 IDCT limits -- confirm */
275 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
277 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
278 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
279 omse, ome, (double) sysErrMax / NB_ITS,
280 maxout, blockSumErrMax);
/* entries flagged nonspec are exempt from the limits above */
282 if (spec_err && !dct->nonspec)
/* speed test input: generated once into block, permuted copy kept in block1 */
289 init_block(block, test, is_idct, &prng);
290 permute(block1, block, dct->format);
/* rounds of NB_ITS_SPEED iterations, each restoring block from block1, are
 * repeated until the elapsed av_gettime() difference reaches 1000000
 * (presumably microseconds, i.e. one second -- confirm) */
295 for (it = 0; it < NB_ITS_SPEED; it++) {
296 memcpy(block, block1, sizeof(block));
300 ti1 = av_gettime() - ti;
301 } while (ti1 < 1000000);
304 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
305 (double) it1 * 1000.0 / (double) ti1);
/* 64-byte output images used by idct248_error() with a line size of 8:
 * img_dest receives the output of the implementation under test and
 * img_dest1 the output of idct248_ref(). */
310 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
311 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Double-precision reference for the 2-4-8 IDCT.
 *   dest     output pixels, written at dest[i * linesize + j] for i, j in 0..7
 *   linesize distance in bytes between output rows
 *   block    64 input coefficients
 * NOTE(review): several lines (local declarations, the assignment of s before
 * the butterfly, the lower clamp, any one-time-init guard for the static
 * tables) are missing from this excerpt. */
313 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* cosine tables for the 8-point and 4-point transforms; static, so they
 * persist between calls */
316 static double c8[8][8];
317 static double c4[4][4];
318 double block1[64], block2[64], block3[64];
/* c8[i][j] = s * cos(pi * i * (j + 0.5) / 8), s = sqrt(1/8) for i == 0 and
 * sqrt(1/4) otherwise */
325 for (i = 0; i < 8; i++) {
327 for (j = 0; j < 8; j++) {
328 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
329 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
330 sum += c8[i][j] * c8[i][j];
/* c4[i][j] = s * cos(pi * i * (j + 0.5) / 4), s = sqrt(1/4) for i == 0 and
 * sqrt(1/2) otherwise */
334 for (i = 0; i < 4; i++) {
336 for (j = 0; j < 4; j++) {
337 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
338 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
339 sum += c4[i][j] * c4[i][j];
/* butterfly on adjacent row pairs: even output row = (row 2i + row 2i+1) * s,
 * odd output row = (row 2i - row 2i+1) * s */
346 for (i = 0; i < 4; i++) {
347 for (j = 0; j < 8; j++) {
348 block1[8 * (2 * i) + j] =
349 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
350 block1[8 * (2 * i + 1) + j] =
351 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point transform along every row using c8 */
356 for (i = 0; i < 8; i++) {
357 for (j = 0; j < 8; j++) {
359 for (k = 0; k < 8; k++)
360 sum += c8[k][j] * block1[8 * i + k];
361 block2[8 * i + j] = sum;
/* 4-point transform down every column using c4, applied separately to the
 * even rows and to the odd rows */
366 for (i = 0; i < 8; i++) {
367 for (j = 0; j < 4; j++) {
370 for (k = 0; k < 4; k++)
371 sum += c4[k][j] * block2[8 * (2 * k) + i];
372 block3[8 * (2 * j) + i] = sum;
376 for (k = 0; k < 4; k++)
377 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
378 block3[8 * (2 * j + 1) + i] = sum;
382 /* clamp and store the result */
383 for (i = 0; i < 8; i++) {
384 for (j = 0; j < 8; j++) {
385 v = block3[8 * i + j];
387 else if (v > 255) v = 255;
/* round to nearest integer with rint() before storing the pixel */
388 dest[i * linesize + j] = (int) rint(v);
/* Compare a 2-4-8 IDCT "put" implementation against idct248_ref() and
 * optionally time it.
 *   name        label printed in the report lines
 *   idct248_put implementation under test; called as
 *               idct248_put(img_dest, 8, block)
 * main() passes a third argument (speed); its declaration line is not visible
 * in this excerpt. Results are printed to stdout; nothing is returned. */
393 static void idct248_error(const char *name,
394 void (*idct248_put)(uint8_t *dest, int line_size,
398 int it, i, it1, ti, ti1, err_max, v;
/* fixed seed, so every run sees the same input sequence */
401 av_lfg_init(&prng, 1);
403 /* just one test to see if code is correct (precision is less
406 for (it = 0; it < NB_ITS; it++) {
407 /* XXX: use forward transform to generate values */
/* random coefficients computed as (random % 256) - 128, kept in block1 */
408 for (i = 0; i < 64; i++)
409 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* reference output into img_dest1; block is reloaded from block1 before each
 * transform call */
412 for (i = 0; i < 64; i++)
413 block[i] = block1[i];
414 idct248_ref(img_dest1, 8, block);
/* output of the implementation under test into img_dest */
416 for (i = 0; i < 64; i++)
417 block[i] = block1[i];
418 idct248_put(img_dest, 8, block);
/* per-pixel absolute difference between the two outputs */
420 for (i = 0; i < 64; i++) {
421 v = abs((int) img_dest[i] - (int) img_dest1[i]);
423 printf("%d %d\n", img_dest[i], img_dest1[i]);
428 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* speed test: rounds of NB_ITS_SPEED calls, each on a freshly reloaded block,
 * repeated until the elapsed av_gettime() difference reaches 1000000
 * (presumably microseconds -- confirm) */
436 for (it = 0; it < NB_ITS_SPEED; it++) {
437 for (i = 0; i < 64; i++)
438 block[i] = block1[i];
439 idct248_put(img_dest, 8, block);
442 ti1 = av_gettime() - ti;
443 } while (ti1 < 1000000);
446 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
447 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage text to stdout. */
450 static void help(void)
452 printf("dct-test [-i] [<test-number>]\n"
453 "test-number 0 -> test with random matrixes\n"
454 " 1 -> test with random sparse matrixes\n"
455 " 2 -> do 3. test from mpeg4 std\n"
456 "-i test IDCT implementations\n"
457 "-4 test IDCT248 implementations\n"
462 #include "compat/getopt.c"
/* Entry point: parse the command line, then run either the 2-4-8 IDCT check
 * or every applicable entry of the selected transform table.
 * NOTE(review): the option-handling switch, the branch conditions and the
 * return statement are missing from this excerpt. */
465 int main(int argc, char **argv)
467 int test_idct = 0, test_248_dct = 0;
/* capabilities of the running CPU; used below to skip unsupported entries */
473 cpu_flags = av_get_cpu_flags();
/* accepted option letters: i, h, 4 and t, none of which takes an argument */
479 c = getopt(argc, argv, "ih4t");
/* first non-option argument is the test number, converted with atoi() */
500 test = atoi(argv[optind]);
502 printf("Libav DCT/IDCT test\n");
505 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
/* table is chosen by direction and walked until an entry with a null name */
507 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
508 for (i = 0; algos[i].name; i++)
/* run the entry only when every bit of its mm_support is set in cpu_flags */
509 if (!(~cpu_flags & algos[i].mm_support)) {
510 err |= dct_error(&algos[i], test, test_idct, speed);
/* report the OR of all dct_error() results */
515 printf("Error: %d.\n", err);