2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of Libav.
7 * Libav is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * Libav is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with Libav; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
34 #include "libavutil/cpu.h"
35 #include "libavutil/common.h"
36 #include "libavutil/lfg.h"
37 #include "libavutil/time.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/* Prototypes for transform routines that are not defined in the visible part
 * of this file. Each one takes a pointer to a DCTELEM block and returns
 * nothing; they are referenced from the fdct_tab[] / idct_tab[] entries.
 * NOTE(review): the names suggest per-architecture implementations (x86,
 * Blackfin, AltiVec, ARM, Alpha); any conditional-compilation guards around
 * them are not among the visible lines. */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
52 void ff_bfin_idct(DCTELEM *block);
53 void ff_bfin_fdct(DCTELEM *block);
56 void ff_fdct_altivec(DCTELEM *block);
57 //void ff_idct_altivec(DCTELEM *block);?? no routine
60 void ff_j_rev_dct_arm(DCTELEM *data);
61 void ff_simple_idct_arm(DCTELEM *data);
62 void ff_simple_idct_armv5te(DCTELEM *data);
63 void ff_simple_idct_armv6(DCTELEM *data);
64 void ff_simple_idct_neon(DCTELEM *data);
66 void ff_simple_idct_axp(DCTELEM *data);
/* Members of the per-implementation descriptor (used below as
 * "struct algo"). Other members referenced in this file -- name, mm_support
 * and nonspec -- are declared on lines that are not visible here. */
/* Pointer to the transform routine of this table entry. */
70 void (*func)(DCTELEM *block);
/* Coefficient ordering/scaling tag: permute() reorders the input block
 * according to this value, and dct_error() rescales the output when it is
 * SCALE_PERM. */
71 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
72 SSE2_PERM, PARTTRANS_PERM } format;
/* Table of forward-DCT implementations; main() iterates over it when
 * test_idct is zero and stops at the first entry whose name is NULL.
 * Initializers appear to be: display name, routine, format tag and, for some
 * entries, a required AV_CPU_FLAG_* mask -- TODO confirm against the full
 * struct algo declaration. The terminating entry and any guards around the
 * platform-specific rows are not among the visible lines. */
79 static const struct algo fdct_tab[] = {
80 { "REF-DBL", ff_ref_fdct, NO_PERM },
81 { "FAAN", ff_faandct, NO_PERM },
82 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
83 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
86 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
87 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
88 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
92 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
96 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* Table of inverse-DCT implementations; main() iterates over it when
 * test_idct is non-zero and stops at the first entry whose name is NULL.
 * Initializers appear to be: display name, routine, input permutation tag,
 * optional AV_CPU_FLAG_* mask and an optional trailing 1 -- presumably the
 * nonspec flag that dct_error() consults before treating a precision
 * failure as an error; TODO confirm against the struct algo declaration.
 * The terminating entry and any guards around the platform-specific rows are
 * not among the visible lines. */
102 static const struct algo idct_tab[] = {
103 { "FAANI", ff_faanidct, NO_PERM },
104 { "REF-DBL", ff_ref_idct, NO_PERM },
105 { "INT", ff_j_rev_dct, MMX_PERM },
106 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
110 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
111 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
113 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
114 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
115 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
116 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
120 { "BFINidct", ff_bfin_idct, NO_PERM },
124 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
125 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
128 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
131 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
134 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
138 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* Shift count used by dct_error() when it rescales the output of a
 * SCALE_PERM transform with the ff_aanscales[] factors. */
144 #define AANSCALE_BITS 12
/* Number of transform calls in one batch of the speed-measurement loops in
 * dct_error() and idct248_error(). */
147 #define NB_ITS_SPEED 50000
/* Destination indices for MMX_PERM; computed at run time by
 * idct_mmx_init(). */
149 static short idct_mmx_perm[64];
/* Destination indices for MMX_SIMPLE_PERM: permute() stores src[i] at
 * dst[idct_simple_mmx_perm[i]]. */
151 static short idct_simple_mmx_perm[64] = {
152 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
153 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
154 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
155 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
156 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
157 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
158 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
159 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* For SSE2_PERM, permute() keeps bits 3-5 of the index and replaces the low
 * three bits by idct_sse2_row_perm[i & 7]. */
162 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[] for all 64 indices. For each index i, bits 3-5 are
 * kept, bits 1-2 are moved down by one position and bit 0 is moved up to
 * bit 2. */
164 static void idct_mmx_init(void)
168 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
169 for (i = 0; i < 64; i++) {
170 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* File-scope work buffers of 64 DCTELEM values, shared by dct_error() and
 * idct248_error(). They are declared through DECLARE_ALIGNED with 16 and 8
 * as the first argument (presumably the requested alignment in bytes --
 * confirm against the macro definition). */
174 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
175 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/* Fill a 64-entry coefficient block with test input drawn from prng.
 * NOTE(review): the statements that choose between the fill patterns below
 * (presumably keyed on `test` and `is_idct`) are not among the visible
 * lines; the comments describe each visible statement only. */
177 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng)
/* Start from an all-zero block. */
181 memset(block, 0, 64 * sizeof(*block));
/* Dense pattern: every entry gets (random % 512) - 256. */
185 for (i = 0; i < 64; i++)
186 block[i] = (av_lfg_get(prng) % 512) - 256;
189 for (i = 0; i < 64; i++)
/* Sparse pattern: between 1 and 10 writes of (random % 512) - 256 to randomly
 * chosen positions (positions may repeat). */
194 j = av_lfg_get(prng) % 10 + 1;
195 for (i = 0; i < j; i++)
196 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % 512 - 256;
/* Two-entry pattern: entry 0 gets (random % 4096) - 2048 and entry 63 gets
 * the inverse of entry 0's lowest bit. */
199 block[ 0] = av_lfg_get(prng) % 4096 - 2048;
200 block[63] = (block[0] & 1) ^ 1;
/* Copy the 64 entries of src into dst, placing src[i] at the destination
 * index required by the format tag `perm` (one of the enum formattag
 * values). */
205 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
/* MMX_PERM: destination taken from the table built by idct_mmx_init(). */
209 if (perm == MMX_PERM) {
210 for (i = 0; i < 64; i++)
211 dst[idct_mmx_perm[i]] = src[i];
/* MMX_SIMPLE_PERM: destination taken from the constant table. */
212 } else if (perm == MMX_SIMPLE_PERM) {
213 for (i = 0; i < 64; i++)
214 dst[idct_simple_mmx_perm[i]] = src[i];
/* SSE2_PERM: bits 3-5 of the index are kept, the low three bits are
 * remapped through idct_sse2_row_perm[]. */
215 } else if (perm == SSE2_PERM) {
216 for (i = 0; i < 64; i++)
217 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/* PARTTRANS_PERM: bits 2 and 5 stay in place, bits 0-1 move up to bits 3-4
 * and bits 3-4 move down to bits 0-1. */
218 } else if (perm == PARTTRANS_PERM) {
219 for (i = 0; i < 64; i++)
220 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* Remaining tags: the loop body is not among the visible lines (presumably a
 * plain copy -- confirm). */
222 for (i = 0; i < 64; i++)
/* Compare one table entry against the reference routine (ff_ref_idct when
 * is_idct is non-zero, ff_ref_fdct otherwise) over NB_ITS generated blocks
 * and print accuracy statistics; the tail of the function contains a timing
 * loop that prints a throughput figure.
 *
 * dct     - table entry to test (name, format and nonspec are read here)
 * test    - input-pattern selector forwarded to init_block()
 * is_idct - non-zero when testing an inverse transform
 * speed   - not used on any visible line; presumably enables the timing part
 *
 * NOTE(review): several statements of this function (including the call of
 * the transform under test and the return statement) are not among the
 * visible lines; the comments describe only what is shown. */
227 static int dct_error(const struct algo *dct, int test, int is_idct, int speed)
229 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
232 int64_t err2, ti, ti1, it1, err_sum = 0;
/* sysErr[i] accumulates the signed error of coefficient i over all
 * iterations. */
233 int64_t sysErr[64], sysErrMax = 0;
235 int blockSumErrMax = 0, blockSumErr;
/* The generator is initialised with the constant 1 (presumably the seed, so
 * the input sequence would be the same on every run -- confirm). */
240 av_lfg_init(&prng, 1);
244 for (i = 0; i < 64; i++)
246 for (it = 0; it < NB_ITS; it++) {
/* block1 receives the generated input; block receives the same data in the
 * order given by dct->format. */
247 init_block(block1, test, is_idct, &prng);
248 permute(block, block1, dct->format);
/* SCALE_PERM output is rescaled per coefficient with a factor derived from
 * ff_aanscales[] before it is compared. */
253 if (dct->format == SCALE_PERM) {
254 for (i = 0; i < 64; i++) {
255 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
256 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Per-coefficient comparison of block against block1 (presumably holding the
 * reference output at this point; the call that produces it is not
 * visible). */
263 for (i = 0; i < 64; i++) {
264 int err = block[i] - block1[i];
270 sysErr[i] += block[i] - block1[i];
/* Track the largest output magnitude seen. */
272 if (abs(block[i]) > maxout)
273 maxout = abs(block[i]);
/* Track the largest per-block error sum seen. */
275 if (blockSumErrMax < blockSumErr)
276 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated error over the 64 coefficient positions. */
278 for (i = 0; i < 64; i++)
279 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
/* Dump the accumulated per-coefficient errors. */
281 for (i = 0; i < 64; i++) {
284 printf("%7d ", (int) sysErr[i]);
/* Averages over all NB_ITS * 64 coefficients: err2 / count and
 * err_sum / count. */
288 omse = (double) err2 / NB_ITS / 64;
289 ome = (double) err_sum / NB_ITS / 64;
/* Precision check, applied to inverse transforms only.
 * NOTE(review): the thresholds resemble the IEEE 1180 IDCT accuracy limits --
 * confirm. */
291 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
293 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
294 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
295 omse, ome, (double) sysErrMax / NB_ITS,
296 maxout, blockSumErrMax);
/* A failed precision check matters only for entries whose nonspec member is
 * zero; the action taken is not among the visible lines. */
298 if (spec_err && !dct->nonspec)
/* Timing part: a fresh input is generated into block, and its permuted form
 * is kept in block1. */
305 init_block(block, test, is_idct, &prng);
306 permute(block1, block, dct->format);
/* Each batch restores block from block1 NB_ITS_SPEED times (the transform
 * call itself is not visible); batches repeat until ti1 reaches 1000000. */
311 for (it = 0; it < NB_ITS_SPEED; it++) {
312 memcpy(block, block1, sizeof(block));
316 ti1 = av_gettime() - ti;
317 } while (ti1 < 1000000);
/* Prints it1 * 1000 / ti1; this is thousands of transforms per second if
 * av_gettime() counts microseconds -- confirm. */
320 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
321 (double) it1 * 1000.0 / (double) ti1);
/* 64-byte output buffers used by idct248_error(): img_dest receives the
 * output of the routine under test, img_dest1 the output of idct248_ref(). */
326 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
327 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Floating-point reference used by idct248_error(): transforms the 64
 * coefficients in `block` and stores the rounded result as bytes at
 * dest[i * linesize + j] for i, j in 0..7.
 * NOTE(review): several statements (initialisation guard, the value of `s`
 * used for the butterfly, resets of `sum`, the lower clamp) are not among the
 * visible lines; the comments describe only what is shown. */
329 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine tables with static storage duration, so they persist across
 * calls. */
332 static double c8[8][8];
333 static double c4[4][4];
334 double block1[64], block2[64], block3[64];
/* 8-point table: c8[i][j] = s * cos(pi * i * (j + 0.5) / 8) with
 * s = sqrt(1/8) for i == 0 and sqrt(1/4) otherwise. */
341 for (i = 0; i < 8; i++) {
343 for (j = 0; j < 8; j++) {
344 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
345 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
346 sum += c8[i][j] * c8[i][j];
/* 4-point table: c4[i][j] = s * cos(pi * i * (j + 0.5) / 4) with
 * s = sqrt(1/4) for i == 0 and sqrt(1/2) otherwise. */
350 for (i = 0; i < 4; i++) {
352 for (j = 0; j < 4; j++) {
353 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
354 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
355 sum += c4[i][j] * c4[i][j];
/* Pairwise stage: rows 2i and 2i+1 of the input are replaced by their scaled
 * sum and difference. */
362 for (i = 0; i < 4; i++) {
363 for (j = 0; j < 8; j++) {
364 block1[8 * (2 * i) + j] =
365 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
366 block1[8 * (2 * i + 1) + j] =
367 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point transform along each row i of block1, using c8. */
372 for (i = 0; i < 8; i++) {
373 for (j = 0; j < 8; j++) {
375 for (k = 0; k < 8; k++)
376 sum += c8[k][j] * block1[8 * i + k];
377 block2[8 * i + j] = sum;
/* 4-point transforms along each column i, using c4: one over the
 * even-numbered rows and one over the odd-numbered rows. */
382 for (i = 0; i < 8; i++) {
383 for (j = 0; j < 4; j++) {
386 for (k = 0; k < 4; k++)
387 sum += c4[k][j] * block2[8 * (2 * k) + i];
388 block3[8 * (2 * j) + i] = sum;
392 for (k = 0; k < 4; k++)
393 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
394 block3[8 * (2 * j + 1) + i] = sum;
398 /* clamp and store the result */
399 for (i = 0; i < 8; i++) {
400 for (j = 0; j < 8; j++) {
401 v = block3[8 * i + j];
403 else if (v > 255) v = 255;
/* Round to the nearest integer and store as a byte. */
404 dest[i * linesize + j] = (int) rint(v);
/* Compare idct248_put against idct248_ref() over NB_ITS generated blocks,
 * print the result under the label `name`, and run a timing loop that prints
 * a throughput figure.
 * NOTE(review): the rest of the parameter list and several statements are
 * not among the visible lines; the call in main() passes `speed` as a third
 * argument. */
409 static void idct248_error(const char *name,
410 void (*idct248_put)(uint8_t *dest, int line_size,
/* NOTE(review): ti and ti1 are plain int here but int64_t in dct_error(); if
 * av_gettime() returns a 64-bit value the assignments below truncate it --
 * confirm and consider int64_t. */
414 int it, i, it1, ti, ti1, err_max, v;
417 av_lfg_init(&prng, 1);
419 /* just one test to see if code is correct (precision is less
422 for (it = 0; it < NB_ITS; it++) {
423 /* XXX: use forward transform to generate values */
/* Input values are (random % 256) - 128. */
424 for (i = 0; i < 64; i++)
425 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* Reference output into img_dest1, from a fresh copy of the input. */
428 for (i = 0; i < 64; i++)
429 block[i] = block1[i];
430 idct248_ref(img_dest1, 8, block);
/* Output of the routine under test into img_dest, again from a fresh copy
 * of the input. */
432 for (i = 0; i < 64; i++)
433 block[i] = block1[i];
434 idct248_put(img_dest, 8, block);
/* Absolute per-pixel difference between the two outputs. */
436 for (i = 0; i < 64; i++) {
437 v = abs((int) img_dest[i] - (int) img_dest1[i]);
439 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* The condition is the constant 1, so the label printed is always
 * "IDCT248". */
444 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing: each batch runs idct248_put NB_ITS_SPEED times on a restored
 * input; batches repeat until ti1 reaches 1000000. */
452 for (it = 0; it < NB_ITS_SPEED; it++) {
453 for (i = 0; i < 64; i++)
454 block[i] = block1[i];
455 idct248_put(img_dest, 8, block);
458 ti1 = av_gettime() - ti;
459 } while (ti1 < 1000000);
/* Prints it1 * 1000 / ti1; this is thousands of transforms per second if
 * av_gettime() counts microseconds -- confirm. */
462 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
463 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage text to standard output.
 * NOTE(review): the first usage line lists only "-i", while "-4" is
 * described further down and main() passes "ih4t" to getopt(); the end of
 * the string is not among the visible lines. */
466 static void help(void)
468 printf("dct-test [-i] [<test-number>]\n"
469 "test-number 0 -> test with random matrixes\n"
470 " 1 -> test with random sparse matrixes\n"
471 " 2 -> do 3. test from mpeg4 std\n"
472 "-i test IDCT implementations\n"
473 "-4 test IDCT248 implementations\n"
/* Program entry point: parses the options, prints a banner, then runs either
 * the IDCT248 check or every applicable entry of the selected table.
 * NOTE(review): the option-handling switch, the initialisation of `speed`,
 * `test` and `err`, and the return statement are not among the visible
 * lines. */
477 int main(int argc, char **argv)
479 int test_idct = 0, test_248_dct = 0;
485 cpu_flags = av_get_cpu_flags();
/* Accepted option letters: i, h, 4 and t, none taking an argument. */
491 c = getopt(argc, argv, "ih4t");
/* First non-option argument is the test number.
 * NOTE(review): atoi() gives no way to detect a non-numeric argument;
 * strtol() with end-pointer checking would. */
512 test = atoi(argv[optind]);
514 printf("Libav DCT/IDCT test\n");
517 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
/* Pick the table, then run each entry all of whose mm_support bits are also
 * set in cpu_flags; the results are OR-ed into err. */
519 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
520 for (i = 0; algos[i].name; i++)
521 if (!(~cpu_flags & algos[i].mm_support)) {
522 err |= dct_error(&algos[i], test, test_idct, speed);