2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
43 #include "simple_idct.h"
44 #include "aandcttab.h"
47 #include "x86/idct_xvid.h"
/* Prototypes for block transforms that the algorithm tables in this file
 * reference but that are not defined in the visible part of it. Every one
 * takes a pointer to int16_t coefficient data and returns nothing.
 * NOTE(review): presumably each works in place on a 64-entry block, like the
 * other table entries -- confirm against the definitions. */
53 void ff_bfin_idct(int16_t *block);
54 void ff_bfin_fdct(int16_t *block);
57 void ff_fdct_altivec(int16_t *block);
60 void ff_j_rev_dct_arm(int16_t *data);
61 void ff_simple_idct_arm(int16_t *data);
62 void ff_simple_idct_armv5te(int16_t *data);
63 void ff_simple_idct_armv6(int16_t *data);
64 void ff_simple_idct_neon(int16_t *data);
66 void ff_simple_idct_axp(int16_t *data);
/* Part of the per-algorithm descriptor used by the tables below: the
 * transform entry point and the coefficient ordering it works with.
 * Other members read elsewhere in this file (name, mm_support, nonspec) are
 * declared on lines that are not visible in this excerpt. */
70 void (*func)(int16_t *block);
/* permute() reorders input for MMX_PERM, MMX_SIMPLE_PERM, SSE2_PERM and
 * PARTTRANS_PERM; dct_error() rescales the output when format is
 * SCALE_PERM. */
71 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
72 SSE2_PERM, PARTTRANS_PERM } format;
/* Forward-DCT implementations under test. Each entry gives a display name,
 * the function to call and its coefficient layout (enum formattag); some
 * entries add a CPU flag. main() walks the table until it meets an entry
 * whose name is null and runs an entry only when the detected CPU flags
 * include every bit of its mm_support member.
 * NOTE(review): the fourth initializer is presumably mm_support -- the
 * struct declaration is not fully visible here, confirm. */
79 static const struct algo fdct_tab[] = {
80 { "REF-DBL", ff_ref_fdct, NO_PERM },
81 { "FAAN", ff_faandct, NO_PERM },
82 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
83 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
86 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
87 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT },
88 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
92 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
96 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* Inverse-DCT implementations under test, in the same entry layout as
 * fdct_tab: display name, function, coefficient layout, optional CPU flag.
 * The XVID entries carry an extra trailing 1.
 * NOTE(review): that fifth initializer is presumably the nonspec member that
 * dct_error() tests together with spec_err -- confirm against the struct
 * declaration, which is not fully visible here. */
102 static const struct algo idct_tab[] = {
103 { "FAANI", ff_faanidct, NO_PERM },
104 { "REF-DBL", ff_ref_idct, NO_PERM },
105 { "INT", ff_j_rev_dct, MMX_PERM },
106 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
109 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
110 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
111 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
112 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
116 { "BFINidct", ff_bfin_idct, NO_PERM },
120 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
121 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
124 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
127 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
130 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
134 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* Shift width used by dct_error() when it rescales SCALE_PERM output with
 * the ff_aanscales table. */
140 #define AANSCALE_BITS 12
/* Number of transform calls per timing batch in the speed-test loops of
 * dct_error() and idct248_error(). */
143 #define NB_ITS_SPEED 50000
/* Destination index for each source coefficient under MMX_PERM; filled at
 * run time by idct_mmx_init() and read by permute(). */
145 static short idct_mmx_perm[64];
/* Destination index for each source coefficient under MMX_SIMPLE_PERM;
 * permute() stores src[i] at dst[idct_simple_mmx_perm[i]]. */
147 static short idct_simple_mmx_perm[64] = {
148 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
149 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
150 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
151 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
152 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
153 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
154 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
155 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Replacement for the low three index bits under SSE2_PERM; permute() keeps
 * bits 3-5 of the index and looks up the low bits here. */
158 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[]. For every index i the entry keeps bits 3-5 of i
 * (mask 0x38) and rearranges the low three bits: bits 1-2 move down one
 * position and bit 0 moves up to bit 2. */
160 static void idct_mmx_init(void)
164 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
165 for (i = 0; i < 64; i++) {
166 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* Shared 64-coefficient work buffers used by dct_error() and
 * idct248_error(). They are declared through DECLARE_ALIGNED with 16 and 8
 * as the first argument respectively. */
170 DECLARE_ALIGNED(16, static int16_t, block)[64];
171 DECLARE_ALIGNED(8, static int16_t, block1)[64];
/* Fill a 64-coefficient block with pseudo-random test data drawn from prng.
 * The block is cleared first; the visible code then contains three fill
 * patterns. NOTE(review): the statements that choose between them (and any
 * use of is_idct) are not visible in this excerpt -- presumably the choice
 * follows `test`, confirm. */
173 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng)
177 memset(block, 0, 64 * sizeof(*block));
/* Dense pattern: every coefficient gets av_lfg_get() % 512 - 256. */
181 for (i = 0; i < 64; i++)
182 block[i] = (av_lfg_get(prng) % 512) - 256;
185 for (i = 0; i < 64; i++)
/* Sparse pattern: 1 to 10 writes at random positions; positions may
 * repeat, so fewer distinct coefficients can end up set. */
190 j = av_lfg_get(prng) % 10 + 1;
191 for (i = 0; i < j; i++)
192 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
/* Two-coefficient pattern: a random first coefficient, and a last
 * coefficient set to the inverse of the first one's lowest bit. */
195 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
196 block[63] = (block[0] & 1) ^ 1;
/* Copy the 64 coefficients of src into dst, placing src[i] at the position
 * required by the layout `perm` (an enum formattag value).
 * NOTE(review): the body of the last loop is not visible in this excerpt;
 * presumably it is a straight copy for the remaining layouts -- confirm. */
201 static void permute(int16_t dst[64], const int16_t src[64], int perm)
205 if (perm == MMX_PERM) {
206 for (i = 0; i < 64; i++)
207 dst[idct_mmx_perm[i]] = src[i];
208 } else if (perm == MMX_SIMPLE_PERM) {
209 for (i = 0; i < 64; i++)
210 dst[idct_simple_mmx_perm[i]] = src[i];
211 } else if (perm == SSE2_PERM) {
212 for (i = 0; i < 64; i++)
/* Keep bits 3-5 of the index, remap the low three bits via the table. */
213 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
214 } else if (perm == PARTTRANS_PERM) {
215 for (i = 0; i < 64; i++)
/* Keep bits 2 and 5 of the index; exchange bits 0-1 with bits 3-4. */
216 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
218 for (i = 0; i < 64; i++)
/* Accuracy (and optionally speed) test for one table entry.
 *
 * dct     - algorithm descriptor (name, function, coefficient layout)
 * test    - test-pattern selector, forwarded to init_block()
 * is_idct - non-zero to compare against ff_ref_idct, zero for ff_ref_fdct
 * speed   - NOTE(review): presumably enables the timing loop at the end;
 *           the statement that tests it is not visible in this excerpt
 *
 * Prints per-coefficient accumulated errors and a one-line summary
 * (ppe, omse, ome, syserr, maxout, blockSumErr). main() ORs the returned
 * int into its error status; the return statements themselves are not
 * visible here. */
223 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
/* Reference transform matching the direction under test. */
225 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
228 int64_t err2, ti, ti1, it1, err_sum = 0;
229 int64_t sysErr[64], sysErrMax = 0;
231 int blockSumErrMax = 0, blockSumErr;
/* The generator is seeded with the constant 1. */
236 av_lfg_init(&prng, 1);
240 for (i = 0; i < 64; i++)
242 for (it = 0; it < NB_ITS; it++) {
/* Generate a test block in block1 and hand a reordered copy to the
 * implementation under test in block. */
243 init_block(block1, test, is_idct, &prng);
244 permute(block, block1, dct->format);
/* SCALE_PERM output is rescaled per coefficient with ff_aanscales
 * before it is compared. */
249 if (dct->format == SCALE_PERM) {
250 for (i = 0; i < 64; i++) {
251 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
252 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Per-coefficient comparison of block against block1.
 * NOTE(review): presumably block holds the tested output and block1 the
 * reference output by now; the transform calls are not visible here. */
259 for (i = 0; i < 64; i++) {
260 int err = block[i] - block1[i];
/* Signed error accumulated per coefficient position across blocks. */
266 sysErr[i] += block[i] - block1[i];
268 if (abs(block[i]) > maxout)
269 maxout = abs(block[i]);
271 if (blockSumErrMax < blockSumErr)
272 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated error over the 64 positions. */
274 for (i = 0; i < 64; i++)
275 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
277 for (i = 0; i < 64; i++) {
280 printf("%7d ", (int) sysErr[i]);
/* Average err2 and err_sum over all NB_ITS * 64 coefficients. */
284 omse = (double) err2 / NB_ITS / 64;
285 ome = (double) err_sum / NB_ITS / 64;
/* The accuracy limits are only applied to inverse transforms. */
287 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
289 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
290 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
291 omse, ome, (double) sysErrMax / NB_ITS,
292 maxout, blockSumErrMax);
/* Entries flagged nonspec are exempt from the limit check above. */
294 if (spec_err && !dct->nonspec)
/* Timing section: one fixed input, restored before every call. */
301 init_block(block, test, is_idct, &prng);
302 permute(block1, block, dct->format);
307 for (it = 0; it < NB_ITS_SPEED; it++) {
308 memcpy(block, block1, sizeof(block));
/* Keep running batches until at least 1000000 av_gettime() units have
 * elapsed (NOTE(review): microseconds per libavutil -- confirm). */
312 ti1 = av_gettime() - ti;
313 } while (ti1 < 1000000);
316 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
317 (double) it1 * 1000.0 / (double) ti1);
/* 64-byte output buffers for the IDCT248 test: idct248_error() has the
 * implementation under test write to img_dest and idct248_ref() write to
 * img_dest1, then compares them byte by byte. */
322 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
323 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Double-precision reference for the 2-4-8 inverse transform.
 *
 * dest     - output bytes; element (i, j) is stored at dest[i * linesize + j]
 * linesize - distance in bytes between output rows
 * block    - 64 input coefficients, read as block[8 * row + column]
 *
 * The visible steps are: build the cosine tables, a row-pair butterfly, an
 * 8-point pass along each row, a 4-point pass down each column done
 * separately for even and odd rows, then clamp and store. */
325 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
328 static double c8[8][8];
329 static double c4[4][4];
330 double block1[64], block2[64], block3[64];
/* 8-point cosine table: row 0 is scaled by sqrt(1/8), the other rows by
 * sqrt(1/4). `sum` collects the squared entries of a row. */
337 for (i = 0; i < 8; i++) {
339 for (j = 0; j < 8; j++) {
340 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
341 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
342 sum += c8[i][j] * c8[i][j];
/* 4-point cosine table, built the same way with sqrt(1/4) and sqrt(1/2). */
346 for (i = 0; i < 4; i++) {
348 for (j = 0; j < 4; j++) {
349 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
350 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
351 sum += c4[i][j] * c4[i][j];
/* Butterfly: each pair of adjacent rows (2i, 2i+1) is replaced by its
 * scaled sum and scaled difference. NOTE(review): the value assigned to s
 * for this step is not visible in this excerpt. */
358 for (i = 0; i < 4; i++) {
359 for (j = 0; j < 8; j++) {
360 block1[8 * (2 * i) + j] =
361 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
362 block1[8 * (2 * i + 1) + j] =
363 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point pass along every row i using the c8 table. */
368 for (i = 0; i < 8; i++) {
369 for (j = 0; j < 8; j++) {
371 for (k = 0; k < 8; k++)
372 sum += c8[k][j] * block1[8 * i + k];
373 block2[8 * i + j] = sum;
/* 4-point pass down every column i using the c4 table: even input rows
 * produce the even output rows, odd input rows the odd output rows. */
378 for (i = 0; i < 8; i++) {
379 for (j = 0; j < 4; j++) {
382 for (k = 0; k < 4; k++)
383 sum += c4[k][j] * block2[8 * (2 * k) + i];
384 block3[8 * (2 * j) + i] = sum;
388 for (k = 0; k < 4; k++)
389 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
390 block3[8 * (2 * j + 1) + i] = sum;
394 /* clamp and store the result */
395 for (i = 0; i < 8; i++) {
396 for (j = 0; j < 8; j++) {
397 v = block3[8 * i + j];
/* Upper clamp at 255; the value is rounded with rint() when stored. */
399 else if (v > 255) v = 255;
400 dest[i * linesize + j] = (int) rint(v);
/* Compare an IDCT248 "put" implementation against idct248_ref() and
 * optionally time it.
 *
 * name        - label printed in the result lines
 * idct248_put - implementation under test; called with an output buffer,
 *               a line size of 8 and the coefficient block
 *
 * Over NB_ITS random blocks (values av_lfg_get() % 256 - 128, generator
 * seeded with 1) both transforms run on identical copies of the input and
 * the outputs are compared byte by byte; the result is printed as err_inf.
 * The timing loop repeats batches of NB_ITS_SPEED calls until 1000000
 * av_gettime() units have elapsed.
 *
 * NOTE(review): ti and ti1 are plain int here, whereas dct_error() uses
 * int64_t for the same av_gettime() arithmetic -- possible truncation,
 * confirm and align the types. */
405 static void idct248_error(const char *name,
406 void (*idct248_put)(uint8_t *dest, int line_size,
410 int it, i, it1, ti, ti1, err_max, v;
413 av_lfg_init(&prng, 1);
415 /* just one test to see if code is correct (precision is less
418 for (it = 0; it < NB_ITS; it++) {
419 /* XXX: use forward transform to generate values */
420 for (i = 0; i < 64; i++)
421 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* Reference result: a fresh copy of the input goes through idct248_ref()
 * into img_dest1. */
424 for (i = 0; i < 64; i++)
425 block[i] = block1[i];
426 idct248_ref(img_dest1, 8, block);
/* Result under test: another fresh copy goes through idct248_put() into
 * img_dest. */
428 for (i = 0; i < 64; i++)
429 block[i] = block1[i];
430 idct248_put(img_dest, 8, block);
/* Absolute per-byte difference between the two outputs. */
432 for (i = 0; i < 64; i++) {
433 v = abs((int) img_dest[i] - (int) img_dest1[i]);
435 printf("%d %d\n", img_dest[i], img_dest1[i]);
440 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing batch: the input is restored before every call. */
448 for (it = 0; it < NB_ITS_SPEED; it++) {
449 for (i = 0; i < 64; i++)
450 block[i] = block1[i];
451 idct248_put(img_dest, 8, block);
454 ti1 = av_gettime() - ti;
455 } while (ti1 < 1000000);
458 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
459 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage text to standard output. */
462 static void help(void)
464 printf("dct-test [-i] [<test-number>]\n"
465 "test-number 0 -> test with random matrixes\n"
466 " 1 -> test with random sparse matrixes\n"
467 " 2 -> do 3. test from mpeg4 std\n"
468 "-i test IDCT implementations\n"
469 "-4 test IDCT248 implementations\n"
474 #include "compat/getopt.c"
/* Program entry point. Reads the CPU flags, parses the options accepted by
 * getopt ("ih4t") plus an optional test number, prints a banner, then runs
 * either the IDCT248 check on ff_simple_idct248_put or dct_error() on every
 * eligible entry of idct_tab / fdct_tab.
 * NOTE(review): the option-handling statements and the final return are not
 * visible in this excerpt. */
477 int main(int argc, char **argv)
479 int test_idct = 0, test_248_dct = 0;
485 cpu_flags = av_get_cpu_flags();
491 c = getopt(argc, argv, "ih4t");
/* The first non-option argument is taken as the test number. */
512 test = atoi(argv[optind]);
514 printf("Libav DCT/IDCT test\n");
517 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
519 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
/* Walk the table up to the entry with a null name; run an entry only
 * when every bit in its mm_support is present in cpu_flags. */
520 for (i = 0; algos[i].name; i++)
521 if (!(~cpu_flags & algos[i].mm_support)) {
522 err |= dct_error(&algos[i], test, test_idct, speed);