2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
42 #include "simple_idct.h"
43 #include "aandcttab.h"
46 #include "x86/idct_xvid.h"
/* Prototypes for transform routines that are not defined in the visible part
 * of this file.  Each takes a pointer to a block of DCTELEM values and
 * returns nothing, and each is referenced by an entry of fdct_tab[] or
 * idct_tab[] below.
 * NOTE(review): the names suggest per-platform implementations; any
 * conditional-compilation guards around them are not visible here. */
52 void ff_bfin_idct(DCTELEM *block);
53 void ff_bfin_fdct(DCTELEM *block);
56 void ff_fdct_altivec(DCTELEM *block);
59 void ff_j_rev_dct_arm(DCTELEM *data);
60 void ff_simple_idct_arm(DCTELEM *data);
61 void ff_simple_idct_armv5te(DCTELEM *data);
62 void ff_simple_idct_armv6(DCTELEM *data);
63 void ff_simple_idct_neon(DCTELEM *data);
65 void ff_simple_idct_axp(DCTELEM *data);
/* NOTE(review): these are members of a struct whose opening line is not
 * visible in this view; presumably struct algo, the element type of
 * fdct_tab[] and idct_tab[]. */
/* Transform routine described by the entry; it receives a DCTELEM block. */
69 void (*func)(DCTELEM *block);
/* Coefficient layout tag for the entry.  permute() selects its reordering
 * from this value and dct_error() rescales the output when it is
 * SCALE_PERM. */
70 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
71 SSE2_PERM, PARTTRANS_PERM } format;
/* Forward-DCT implementations exercised when main() is not testing IDCTs.
 * Each initializer gives a display name, the transform function and a
 * formattag value; some add an AV_CPU_FLAG_* value (presumably the
 * mm_support field that main() checks against the detected CPU flags).
 * NOTE(review): main() walks the table until an entry has a null name; the
 * terminating entry and any per-platform guards are not visible here. */
78 static const struct algo fdct_tab[] = {
79 { "REF-DBL", ff_ref_fdct, NO_PERM },
80 { "FAAN", ff_faandct, NO_PERM },
/* SCALE_PERM: dct_error() rescales this output with ff_aanscales[]. */
81 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
82 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
85 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
86 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT },
87 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
91 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
95 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* Inverse-DCT implementations exercised when main() is testing IDCTs.
 * Each initializer gives a display name, the transform function and the
 * formattag value that tells permute() how to lay out the input.  Some
 * entries add an AV_CPU_FLAG_* value and a trailing 1.
 * NOTE(review): the trailing 1 presumably sets the nonspec field that
 * dct_error() consults when a result is out of tolerance; the struct layout
 * is not visible here, so confirm.  The terminating entry and any
 * per-platform guards are not visible either. */
101 static const struct algo idct_tab[] = {
102 { "FAANI", ff_faanidct, NO_PERM },
103 { "REF-DBL", ff_ref_idct, NO_PERM },
104 { "INT", ff_j_rev_dct, MMX_PERM },
105 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
108 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
109 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
110 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
111 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
115 { "BFINidct", ff_bfin_idct, NO_PERM },
119 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
120 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
123 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
126 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
129 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
133 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* Shift used by dct_error() when it rescales SCALE_PERM output: it appears
 * both in the computed scale factor and in the final right shift. */
139 #define AANSCALE_BITS 12
/* Number of transform calls per batch in the timing loops of dct_error()
 * and idct248_error(). */
142 #define NB_ITS_SPEED 50000
/* Destination index for each source coefficient under MMX_PERM; filled in
 * by idct_mmx_init() and read by permute(). */
144 static short idct_mmx_perm[64];
/* Destination index for each source coefficient under MMX_SIMPLE_PERM;
 * read by permute(). */
146 static short idct_simple_mmx_perm[64] = {
147 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
148 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
149 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
150 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
151 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
152 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
153 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
154 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Under SSE2_PERM, permute() maps the low three bits of an index through
 * this table and leaves the upper bits (i & 0x38) unchanged. */
157 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[] with the index mapping that permute() applies for
 * MMX_PERM entries. */
159 static void idct_mmx_init(void)
163 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
164 for (i = 0; i < 64; i++) {
/* Bits 3-5 of the index are kept, bits 1-2 move down to bits 0-1 and
 * bit 0 moves up to bit 2. */
165 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* File-level work buffers of 64 DCTELEM values each, shared by dct_error()
 * and idct248_error().  They are declared through DECLARE_ALIGNED with
 * first arguments 16 and 8 (presumably the alignment in bytes; confirm
 * against the macro definition). */
169 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
170 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/*
 * Fill block[] with test input drawn from the generator prng.
 *
 * The block is zeroed first.  Three fill patterns are visible below: a
 * dense one, a sparse one, and one that sets only the first and last
 * coefficients.
 * NOTE(review): the statements that pick a pattern from test and is_idct
 * are not visible in this view.
 */
172 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
176 memset(block, 0, 64 * sizeof(*block));
/* Dense pattern: each coefficient is a generator output reduced modulo 512
 * and shifted down by 256. */
180 for (i = 0; i < 64; i++)
181 block[i] = (av_lfg_get(prng) % 512) - 256;
/* NOTE(review): the body of this loop is not visible in this view. */
184 for (i = 0; i < 64; i++)
/* Sparse pattern: j is between 1 and 10; the target positions are drawn
 * independently, so fewer than j distinct entries may be written. */
189 j = av_lfg_get(prng) % 10 + 1;
190 for (i = 0; i < j; i++)
191 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
/* First/last pattern: block[0] is a generator output modulo 4096 shifted
 * down by 2048; block[63] becomes 1 when block[0] is even and 0 when odd. */
194 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
195 block[63] = (block[0] & 1) ^ 1;
/*
 * Copy the 64 coefficients of src[] into dst[] using the layout selected by
 * perm (a formattag value).  For each handled layout, source element i is
 * stored at a computed destination index.
 */
200 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
/* MMX_PERM: destination index comes from idct_mmx_perm[]. */
204 if (perm == MMX_PERM) {
205 for (i = 0; i < 64; i++)
206 dst[idct_mmx_perm[i]] = src[i];
/* MMX_SIMPLE_PERM: destination index comes from idct_simple_mmx_perm[]. */
207 } else if (perm == MMX_SIMPLE_PERM) {
208 for (i = 0; i < 64; i++)
209 dst[idct_simple_mmx_perm[i]] = src[i];
/* SSE2_PERM: bits 3-5 of the index are kept and the low three bits are
 * mapped through idct_sse2_row_perm[]. */
210 } else if (perm == SSE2_PERM) {
211 for (i = 0; i < 64; i++)
212 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/* PARTTRANS_PERM: bits 2 and 5 of the index are kept, while bits 0-1 and
 * bits 3-4 trade places. */
213 } else if (perm == PARTTRANS_PERM) {
214 for (i = 0; i < 64; i++)
215 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* NOTE(review): the start of the remaining branch and the body of this loop
 * are not visible in this view; presumably a straight copy. */
217 for (i = 0; i < 64; i++)
/*
 * Accuracy test, followed by a timing loop, for one algorithm table entry.
 *
 * dct      entry under test (name, format and nonspec are read here)
 * test     passed through to init_block()
 * is_idct  non-zero selects ff_ref_idct as reference, otherwise ff_ref_fdct
 * speed    NOTE(review): its use is not visible in this view
 *
 * NOTE(review): the calls to the transform under test and to the reference,
 * the accumulation of err_inf, err2, err_sum and blockSumErr, and every
 * return statement are missing from this view.  main() ORs the return value
 * into its error status.
 */
222 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
/* Reference transform matching the requested direction. */
224 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
227 int64_t err2, ti, ti1, it1, err_sum = 0;
/* Signed error accumulated per coefficient position, and the largest
 * magnitude among those sums. */
228 int64_t sysErr[64], sysErrMax = 0;
230 int blockSumErrMax = 0, blockSumErr;
/* The generator is always seeded with the constant 1. */
235 av_lfg_init(&prng, 1);
/* NOTE(review): the body of this loop is not visible in this view. */
239 for (i = 0; i < 64; i++)
/* Accuracy loop: NB_ITS generated blocks. */
241 for (it = 0; it < NB_ITS; it++) {
/* block1 holds the generated input; block receives it in the layout that
 * the entry expects. */
242 init_block(block1, test, is_idct, &prng);
243 permute(block, block1, dct->format);
/* SCALE_PERM entries: each coefficient is multiplied by a factor derived
 * from ff_aanscales[] and shifted right by AANSCALE_BITS. */
248 if (dct->format == SCALE_PERM) {
249 for (i = 0; i < 64; i++) {
250 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
251 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Compare block against block1 coefficient by coefficient. */
258 for (i = 0; i < 64; i++) {
259 int err = block[i] - block1[i];
265 sysErr[i] += block[i] - block1[i];
/* Track the largest absolute value seen in block. */
267 if (abs(block[i]) > maxout)
268 maxout = abs(block[i]);
/* Keep the largest blockSumErr seen over all blocks. */
270 if (blockSumErrMax < blockSumErr)
271 blockSumErrMax = blockSumErr;
273 for (i = 0; i < 64; i++)
274 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
/* Print each per-position error sum, cast to int. */
276 for (i = 0; i < 64; i++) {
279 printf("%7d ", (int) sysErr[i]);
/* err2 and err_sum divided by the number of coefficients processed
 * (NB_ITS blocks of 64). */
283 omse = (double) err2 / NB_ITS / 64;
284 ome = (double) err_sum / NB_ITS / 64;
/* Tolerance check, applied to IDCTs only: err_inf above 1, omse above 0.02,
 * or the magnitude of ome above 0.0015 sets spec_err. */
286 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
288 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
289 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
290 omse, ome, (double) sysErrMax / NB_ITS,
291 maxout, blockSumErrMax);
/* Only entries whose nonspec field is zero take this branch when out of
 * tolerance; the guarded statement is not visible in this view. */
293 if (spec_err && !dct->nonspec)
/* Timing: here block gets the generated input and block1 the permuted copy,
 * the reverse of the accuracy loop. */
300 init_block(block, test, is_idct, &prng);
301 permute(block1, block, dct->format);
/* Each iteration of the batch restores block from block1 first. */
306 for (it = 0; it < NB_ITS_SPEED; it++) {
307 memcpy(block, block1, sizeof(block));
/* Batches repeat until av_gettime() has advanced by at least 1000000 since
 * ti (presumably microseconds; confirm against av_gettime). */
311 ti1 = av_gettime() - ti;
312 } while (ti1 < 1000000);
/* Printed rate is it1 * 1000 / ti1, labelled kdct/s. */
315 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
316 (double) it1 * 1000.0 / (double) ti1);
/* 64-byte output buffers compared by idct248_error(): img_dest receives the
 * output of the routine under test, img_dest1 the output of idct248_ref(). */
321 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
322 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/*
 * Double-precision reference that idct248_error() compares against.
 *
 * dest      output samples, written at dest[i * linesize + j] for an 8x8 grid
 * linesize  distance in bytes between consecutive output rows
 * block     64 input coefficients, read at block[8 * row + column]
 *
 * NOTE(review): the guard that decides when the cosine tables are built,
 * the value of s used by the butterfly, and the lower clamp are not visible
 * in this view.
 */
324 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine tables for the 8-point and 4-point transforms; static, so they
 * persist between calls. */
327 static double c8[8][8];
328 static double c4[4][4];
/* Local double buffers; this block1 hides the file-level block1 array. */
329 double block1[64], block2[64], block3[64];
/* 8-point table: c8[i][j] = s * cos(pi * i * (j + 0.5) / 8), with s equal
 * to sqrt(1/8) for i == 0 and sqrt(1/4) otherwise. */
336 for (i = 0; i < 8; i++) {
338 for (j = 0; j < 8; j++) {
339 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
340 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
341 sum += c8[i][j] * c8[i][j];
/* 4-point table: same construction with sqrt(1/4) and sqrt(1/2). */
345 for (i = 0; i < 4; i++) {
347 for (j = 0; j < 4; j++) {
348 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
349 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
350 sum += c4[i][j] * c4[i][j];
/* Butterfly on row pairs: row 2i becomes the scaled sum and row 2i + 1 the
 * scaled difference of input rows 2i and 2i + 1. */
357 for (i = 0; i < 4; i++) {
358 for (j = 0; j < 8; j++) {
359 block1[8 * (2 * i) + j] =
360 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
361 block1[8 * (2 * i + 1) + j] =
362 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point transform along each row, using c8. */
367 for (i = 0; i < 8; i++) {
368 for (j = 0; j < 8; j++) {
370 for (k = 0; k < 8; k++)
371 sum += c8[k][j] * block1[8 * i + k];
372 block2[8 * i + j] = sum;
/* 4-point transforms down each column, using c4: even rows of block2 feed
 * even rows of block3, odd rows feed odd rows. */
377 for (i = 0; i < 8; i++) {
378 for (j = 0; j < 4; j++) {
381 for (k = 0; k < 4; k++)
382 sum += c4[k][j] * block2[8 * (2 * k) + i];
383 block3[8 * (2 * j) + i] = sum;
387 for (k = 0; k < 4; k++)
388 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
389 block3[8 * (2 * j + 1) + i] = sum;
393 /* clamp and store the result */
394 for (i = 0; i < 8; i++) {
395 for (j = 0; j < 8; j++) {
396 v = block3[8 * i + j];
398 else if (v > 255) v = 255;
/* Round with rint() before the value is stored as a byte. */
399 dest[i * linesize + j] = (int) rint(v);
/*
 * Compare one idct248_put implementation against idct248_ref() and time it.
 *
 * name         label used in the printed report
 * idct248_put  routine under test; called as idct248_put(img_dest, 8, block)
 *
 * NOTE(review): the end of the parameter list, the update of err_max and
 * the condition guarding the timing section are not visible in this view.
 */
404 static void idct248_error(const char *name,
405 void (*idct248_put)(uint8_t *dest, int line_size,
409 int it, i, it1, ti, ti1, err_max, v;
/* The generator is always seeded with the constant 1. */
412 av_lfg_init(&prng, 1);
/* NOTE(review): the comment that follows is cut off in this view; its
 * closing text is not visible. */
414 /* just one test to see if code is correct (precision is less
417 for (it = 0; it < NB_ITS; it++) {
418 /* XXX: use forward transform to generate values */
/* Input: each of the 64 values is a generator output modulo 256, shifted
 * down by 128. */
419 for (i = 0; i < 64; i++)
420 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* block is reloaded from block1 before each of the two calls, so both see
 * the same input. */
423 for (i = 0; i < 64; i++)
424 block[i] = block1[i];
425 idct248_ref(img_dest1, 8, block);
427 for (i = 0; i < 64; i++)
428 block[i] = block1[i];
429 idct248_put(img_dest, 8, block);
/* v is the absolute difference between the two outputs at each position. */
431 for (i = 0; i < 64; i++) {
432 v = abs((int) img_dest[i] - (int) img_dest1[i]);
434 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* The condition is the constant 1, so the label is always "IDCT248". */
439 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing: batches of NB_ITS_SPEED calls, each preceded by a reload of
 * block from block1. */
447 for (it = 0; it < NB_ITS_SPEED; it++) {
448 for (i = 0; i < 64; i++)
449 block[i] = block1[i];
450 idct248_put(img_dest, 8, block);
/* Batches repeat until av_gettime() has advanced by at least 1000000 since
 * ti (presumably microseconds; confirm against av_gettime). */
453 ti1 = av_gettime() - ti;
454 } while (ti1 < 1000000);
/* Printed rate is it1 * 1000 / ti1, labelled kdct/s. */
457 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
458 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage text to standard output.
 * NOTE(review): the end of the printf call is not visible in this view. */
461 static void help(void)
463 printf("dct-test [-i] [<test-number>]\n"
464 "test-number 0 -> test with random matrixes\n"
465 " 1 -> test with random sparse matrixes\n"
466 " 2 -> do 3. test from mpeg4 std\n"
467 "-i test IDCT implementations\n"
468 "-4 test IDCT248 implementations\n"
473 #include "compat/getopt.c"
/*
 * Program entry point: parses the command line, then runs either the
 * IDCT248 check or every usable entry of the selected algorithm table.
 *
 * NOTE(review): option handling, the conditions that choose between the two
 * paths, and the end of the function are not visible in this view.
 */
476 int main(int argc, char **argv)
478 int test_idct = 0, test_248_dct = 0;
/* Detected CPU flags, used below to filter table entries. */
484 cpu_flags = av_get_cpu_flags();
/* Accepted option letters: i, h, 4 and t. */
490 c = getopt(argc, argv, "ih4t");
/* The test number is read from argv[optind] with atoi(). */
511 test = atoi(argv[optind]);
513 printf("Libav DCT/IDCT test\n");
516 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
/* Table selection: idct_tab when test_idct is set, otherwise fdct_tab. */
518 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
/* The walk stops at the first entry whose name is null. */
519 for (i = 0; algos[i].name; i++)
/* An entry runs only if every bit set in its mm_support field is also set
 * in cpu_flags. */
520 if (!(~cpu_flags & algos[i].mm_support)) {
/* Results of all entries are ORed into err. */
521 err |= dct_error(&algos[i], test, test_idct, speed);