2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/*
 * Prototypes of platform-specific transform routines that the test tables
 * in this file refer to. Every routine takes a pointer to a block of
 * DCTELEM values and returns nothing.
 * NOTE(review): the embedded listing numbers skip between the groups
 * below, so the conditional-compilation guards that presumably surround
 * each group are not visible here -- confirm against the full file.
 */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
52 void ff_bfin_idct(DCTELEM *block);
53 void ff_bfin_fdct(DCTELEM *block);
56 void ff_fdct_altivec(DCTELEM *block);
57 //void ff_idct_altivec(DCTELEM *block);?? no routine
60 void ff_j_rev_dct_arm(DCTELEM *data);
61 void ff_simple_idct_arm(DCTELEM *data);
62 void ff_simple_idct_armv5te(DCTELEM *data);
63 void ff_simple_idct_armv6(DCTELEM *data);
64 void ff_simple_idct_neon(DCTELEM *data);
66 void ff_simple_idct_axp(DCTELEM *data);
/*
 * Members of the per-algorithm descriptor that the tables below and
 * dct_error() use as "struct algo". Only func and format are visible in
 * this listing; the name, mm_support and nonspec members that other code
 * in this file reads are declared on lines that are missing here.
 */
/* Transform under test; it receives a single block pointer. */
70 void (*func)(DCTELEM *block);
/* Coefficient ordering tag; permute() reorders the test block according to it. */
71 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
72 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
/*
 * Forward DCT implementations to test. Each entry lists the printed name,
 * the function, the input ordering tag and, for some entries, a CPU flag.
 * main() walks the table until it reaches an entry with a null name and
 * only tests entries whose mm_support bits are all set in cpu_flags.
 * NOTE(review): the fourth initializer is presumably mm_support; the
 * member declarations and the table terminator are not visible here.
 */
79 static const struct algo fdct_tab[] = {
80 { "REF-DBL", ff_ref_fdct, NO_PERM },
81 { "FAAN", ff_faandct, NO_PERM },
82 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
83 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
86 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
87 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
88 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
92 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
96 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/*
 * Adapter that gives the ProRes 10-bit SSE2 IDCT the one-argument
 * void (DCTELEM *) shape used by the entries of idct_tab, where it appears
 * as "PR-SSE2". The wrapped routine is called with dst as the output
 * pointer, a line size of 16, tmp as the coefficient block and a local
 * qmat[64].
 * NOTE(review): the declaration of tmp and the code that fills tmp and
 * qmat are not visible in this listing.
 * NOTE(review): this section is guarded by HAVE_MMX && HAVE_YASM, while the
 * table entry that uses the wrapper is guarded by ARCH_X86_64 && HAVE_YASM;
 * confirm that the two conditions are meant to differ.
 */
102 #if HAVE_MMX && HAVE_YASM
103 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
104 DCTELEM *block, int16_t *qmat);
106 static void ff_prores_idct_put_10_sse2_wrap(DCTELEM *dst){
107 int16_t qmat[64]; int i;
114 ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
/*
 * Inverse DCT implementations to test; selected by main() when test_idct
 * is set. Each entry lists the printed name, the function, the input
 * ordering tag that permute() applies, and optionally a CPU flag and a
 * trailing 1.
 * NOTE(review): the trailing 1 presumably initialises nonspec, which
 * dct_error() checks before acting on an out-of-spec result; the member
 * declarations are not visible in this listing -- confirm.
 * NOTE(review): the embedded listing numbers skip between groups, so
 * per-architecture guards and the table terminator are presumably on lines
 * missing from this view.
 */
118 static const struct algo idct_tab[] = {
119 { "FAANI", ff_faanidct, NO_PERM },
120 { "REF-DBL", ff_ref_idct, NO_PERM },
121 { "INT", ff_j_rev_dct, MMX_PERM },
122 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
126 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
127 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
129 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
130 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
131 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
132 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
133 #if ARCH_X86_64 && HAVE_YASM
134 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
139 { "BFINidct", ff_bfin_idct, NO_PERM },
143 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
144 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
147 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
150 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
153 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
157 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/*
 * Right-shift applied in dct_error() after a SCALE_PERM coefficient has
 * been multiplied by its per-position scale factor; it also enters the
 * computation of that factor.
 */
163 #define AANSCALE_BITS 12
/*
 * Return the current time of day in microseconds, built from the seconds
 * and microseconds fields filled in by gettimeofday().
 * NOTE(review): the return value of gettimeofday() is not checked, and the
 * declaration of tv is not visible in this listing.
 */
165 static int64_t gettime(void)
168 gettimeofday(&tv, NULL);
/* Widen the seconds before multiplying so the product is computed in 64 bits. */
169 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
/* Number of transform calls in one batch of the timing loops. */
173 #define NB_ITS_SPEED 50000
/*
 * Index permutation used by permute() for MMX_PERM; the entries are
 * computed at run time by idct_mmx_init().
 */
175 static short idct_mmx_perm[64];
/*
 * Fixed index permutation used by permute() for MMX_SIMPLE_PERM: source
 * coefficient i is stored at destination index idct_simple_mmx_perm[i].
 */
177 static short idct_simple_mmx_perm[64] = {
178 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
179 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
180 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
181 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
182 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
183 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
184 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
185 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/*
 * Permutation of the low three index bits used by permute() for
 * SSE2_PERM; the upper three index bits are left unchanged there.
 */
188 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/*
 * Fill idct_mmx_perm[], the index mapping permute() applies for MMX_PERM.
 * NOTE(review): the declaration of i and the closing lines are not visible
 * in this listing.
 */
190 static void idct_mmx_init(void)
194 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
195 for (i = 0; i < 64; i++) {
/*
 * Bits 3-5 of the index are kept; within the low three bits, bits 1-2
 * move down by one position and bit 0 moves up to bit 2.
 */
196 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/*
 * Static 64-element DCTELEM work buffers shared by the test routines.
 * dct_error() and idct248_error() use block1 for generated input and
 * block as the buffer handed to the transforms (dct_error() swaps the two
 * roles in its timing section).
 * NOTE(review): the first DECLARE_ALIGNED argument (16 and 8) is presumably
 * the alignment in bytes -- the macro definition is not in this file.
 */
200 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
201 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/*
 * Execute the x86 "emms" instruction when cpu_flags reports MMX support;
 * otherwise do nothing.
 * NOTE(review): presumably called after running MMX code so that later
 * floating-point code sees a clean x87 state; the call sites and any
 * architecture guard around the asm are not visible in this listing.
 */
203 static inline void mmx_emms(void)
206 if (cpu_flags & AV_CPU_FLAG_MMX)
207 __asm__ volatile ("emms\n\t");
/*
 * Fill a 64-coefficient block with pseudo-random test input.
 *
 * block    buffer to fill; it is zeroed first
 * test     input pattern selector
 * is_idct  NOTE(review): not used on any visible line
 * prng     pseudo-random generator state passed to av_lfg_get()
 * vals     magnitude bound for the generated values
 *
 * NOTE(review): the lines that choose between the three patterns below
 * (presumably a switch on test) and several loop bodies are missing from
 * this listing. The help text describes the tests as random matrices,
 * random sparse matrices and "3. test from mpeg4 std".
 */
211 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
215 memset(block, 0, 64 * sizeof(*block));
/*
 * Dense pattern: every coefficient gets a random value reduced modulo
 * 2*vals and offset by -vals (range [-vals, vals), assuming av_lfg_get()
 * yields a non-negative value).
 */
219 for (i = 0; i < 64; i++)
220 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
/* NOTE(review): the body of the following loop is not visible. */
223 for (i = 0; i < 64; i++)
/*
 * Sparse pattern: between 1 and 10 draws, each writing a random value to a
 * random position; positions may repeat.
 */
228 j = av_lfg_get(prng) % 10 + 1;
229 for (i = 0; i < j; i++)
230 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
/*
 * Third pattern: a random first coefficient with bound 8*vals, and a last
 * coefficient of 1 when the first is even, 0 when it is odd.
 */
233 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
234 block[63] = (block[0] & 1) ^ 1;
/*
 * Copy the 64 coefficients of src into dst, reordered for the input layout
 * named by perm (one of the formattag values).
 *
 * dst   destination block
 * src   source block
 * perm  ordering tag; in every visible branch source index i is written to
 *       a destination index derived from i
 *
 * NOTE(review): the declaration of i, the final else line and the body of
 * the last loop (presumably a plain copy) are not visible in this listing.
 */
239 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
/* MMX_PERM: destination index comes from the table built by idct_mmx_init(). */
243 if (perm == MMX_PERM) {
244 for (i = 0; i < 64; i++)
245 dst[idct_mmx_perm[i]] = src[i];
/* MMX_SIMPLE_PERM: destination index comes from the fixed table. */
246 } else if (perm == MMX_SIMPLE_PERM) {
247 for (i = 0; i < 64; i++)
248 dst[idct_simple_mmx_perm[i]] = src[i];
/* SSE2_PERM: upper three index bits kept, lower three remapped by table. */
249 } else if (perm == SSE2_PERM) {
250 for (i = 0; i < 64; i++)
251 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/* PARTTRANS_PERM: index bits 2 and 5 kept, bits 0-1 swapped with bits 3-4. */
252 } else if (perm == PARTTRANS_PERM) {
253 for (i = 0; i < 64; i++)
254 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* TRANSPOSE_PERM: upper and lower three index bits exchanged. */
255 } else if (perm == TRANSPOSE_PERM) {
256 for (i = 0; i < 64; i++)
257 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
/* Remaining tags: loop body not visible in this listing. */
259 for (i = 0; i < 64; i++)
/*
 * Compare one transform implementation with the reference transform over
 * NB_ITS generated blocks, print error statistics, and print a throughput
 * figure from the timing section at the end.
 *
 * dct      descriptor of the implementation under test
 * test     input pattern selector, forwarded to init_block()
 * is_idct  non-zero selects ff_ref_idct as reference, zero ff_ref_fdct
 * speed    NOTE(review): not used on any visible line; presumably gates
 *          the timing section -- confirm
 * bits     exponent of the value bound passed to init_block()
 *
 * NOTE(review): many lines of this function are missing from the listing,
 * including the calls to the transform and the reference, the error
 * accumulation and the return statements. main() ORs the return value
 * into its err variable.
 */
264 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
/* Reference transform for the direction being tested. */
266 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
269 int64_t err2, ti, ti1, it1, err_sum = 0;
270 int64_t sysErr[64], sysErrMax = 0;
272 int blockSumErrMax = 0, blockSumErr;
274 const int vals=1<<bits;
/* Constant seed; presumably so that every run sees the same input. */
278 av_lfg_init(&prng, 1);
/* NOTE(review): loop body not visible; presumably clears sysErr[]. */
282 for (i = 0; i < 64; i++)
284 for (it = 0; it < NB_ITS; it++) {
/* Generate input in block1 and store its reordered copy in block. */
285 init_block(block1, test, is_idct, &prng, vals);
286 permute(block, block1, dct->format);
/*
 * For SCALE_PERM, each coefficient is multiplied by a factor derived from
 * ff_aanscales[i] and shifted right by AANSCALE_BITS.
 */
291 if (dct->format == SCALE_PERM) {
292 for (i = 0; i < 64; i++) {
293 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
294 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Per-coefficient comparison of block against block1. */
301 for (i = 0; i < 64; i++) {
302 int err = block[i] - block1[i];
/* Signed error accumulated per coefficient position across all blocks. */
308 sysErr[i] += block[i] - block1[i];
/* Track the largest output magnitude seen. */
310 if (abs(block[i]) > maxout)
311 maxout = abs(block[i]);
/* Track the largest per-block error sum. */
313 if (blockSumErrMax < blockSumErr)
314 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated error over the 64 positions. */
316 for (i = 0; i < 64; i++)
317 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
319 for (i = 0; i < 64; i++) {
322 printf("%7d ", (int) sysErr[i]);
/*
 * Averages over all coefficients of all blocks.
 * NOTE(review): the accumulation of err2 and err_sum is not visible;
 * presumably squared and signed error sums.
 */
326 omse = (double) err2 / NB_ITS / 64;
327 ome = (double) err_sum / NB_ITS / 64;
/*
 * Only IDCTs are checked against limits: peak error above 1, mean squared
 * error above 0.02 or absolute mean error above 0.0015 marks the result
 * as out of spec. NOTE(review): limits presumably from IEEE 1180.
 */
329 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
331 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
332 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
333 omse, ome, (double) sysErrMax / NB_ITS,
334 maxout, blockSumErrMax);
/* Entries flagged nonspec are exempt; the action taken is not visible. */
336 if (spec_err && !dct->nonspec)
/*
 * Timing section: the buffer roles are swapped relative to the loop above;
 * block receives fresh input and block1 its reordered copy.
 */
344 init_block(block, test, is_idct, &prng, vals);
345 permute(block1, block, dct->format);
/* Each iteration restores block from block1 first. */
350 for (it = 0; it < NB_ITS_SPEED; it++) {
351 memcpy(block, block1, sizeof(block));
/* Repeat batches until at least 1000000 microseconds have elapsed. */
355 ti1 = gettime() - ti;
356 } while (ti1 < 1000000);
/* Calls per microsecond times 1000, printed as kdct/s. */
359 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
360 (double) it1 * 1000.0 / (double) ti1);
/*
 * 64-byte output buffers for the IDCT248 test: idct248_error() has the
 * implementation under test write to img_dest and the reference write to
 * img_dest1, both with a line size of 8.
 */
365 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
366 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/*
 * Floating-point reference for the IDCT248 "put" operation.
 *
 * dest      output bytes, written at dest[i * linesize + j]
 * linesize  distance in bytes between output rows
 * block     64 input coefficients, read at block[8 * row + column]
 *
 * NOTE(review): several lines are missing from this listing, among them
 * the guard that presumably makes the table setup run only once, the
 * assignment of s before the butterfly stage, the resets of sum and the
 * lower clamp.
 */
368 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine tables kept across calls, and intermediate buffers in double. */
371 static double c8[8][8];
372 static double c4[4][4];
373 double block1[64], block2[64], block3[64];
/*
 * 8-point table: c8[i][j] = s * cos(pi * i * (j + 0.5) / 8) with
 * s = sqrt(1/8) for i == 0 and sqrt(1/4) otherwise. sum collects the
 * squares of each row; its use is not visible here.
 */
380 for (i = 0; i < 8; i++) {
382 for (j = 0; j < 8; j++) {
383 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
384 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
385 sum += c8[i][j] * c8[i][j];
/* 4-point table built the same way, with sqrt(1/4) and sqrt(1/2). */
389 for (i = 0; i < 4; i++) {
391 for (j = 0; j < 4; j++) {
392 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
393 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
394 sum += c4[i][j] * c4[i][j];
/*
 * Butterfly on adjacent row pairs: the even row receives the scaled sum
 * and the odd row the scaled difference of rows 2i and 2i+1.
 */
401 for (i = 0; i < 4; i++) {
402 for (j = 0; j < 8; j++) {
403 block1[8 * (2 * i) + j] =
404 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
405 block1[8 * (2 * i + 1) + j] =
406 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point transform along each row using c8. */
411 for (i = 0; i < 8; i++) {
412 for (j = 0; j < 8; j++) {
414 for (k = 0; k < 8; k++)
415 sum += c8[k][j] * block1[8 * i + k];
416 block2[8 * i + j] = sum;
/*
 * 4-point transform along each column using c4, applied separately to the
 * even rows (0, 2, 4, 6) and to the odd rows (1, 3, 5, 7).
 */
421 for (i = 0; i < 8; i++) {
422 for (j = 0; j < 4; j++) {
425 for (k = 0; k < 4; k++)
426 sum += c4[k][j] * block2[8 * (2 * k) + i];
427 block3[8 * (2 * j) + i] = sum;
431 for (k = 0; k < 4; k++)
432 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
433 block3[8 * (2 * j + 1) + i] = sum;
437 /* clamp and store the result */
438 for (i = 0; i < 8; i++) {
439 for (j = 0; j < 8; j++) {
440 v = block3[8 * i + j];
/* Upper clamp; the matching lower clamp is not visible in this listing. */
442 else if (v > 255) v = 255;
/* Round to an integer with rint() before storing the byte. */
443 dest[i * linesize + j] = (int) rint(v);
/*
 * Compare an IDCT248 "put" implementation with idct248_ref() over NB_ITS
 * random blocks, print the result, and print a throughput figure from the
 * timing section at the end.
 *
 * name         label printed with the results
 * idct248_put  implementation under test; called with the output buffer,
 *              a line size of 8 and the coefficient block
 *
 * NOTE(review): the remaining parameters and a number of statements are on
 * lines missing from this listing. main() passes a third argument, speed.
 * NOTE(review): ti and ti1 are declared int below, while gettime() returns
 * int64_t, so the elapsed-time value is narrowed on assignment -- consider
 * int64_t as in dct_error().
 */
448 static void idct248_error(const char *name,
449 void (*idct248_put)(uint8_t *dest, int line_size,
453 int it, i, it1, ti, ti1, err_max, v;
/* Constant seed for the pseudo-random generator. */
456 av_lfg_init(&prng, 1);
458 /* just one test to see if code is correct (precision is less
461 for (it = 0; it < NB_ITS; it++) {
462 /* XXX: use forward transform to generate values */
/* Random input: each value is a draw modulo 256, offset by -128. */
463 for (i = 0; i < 64; i++)
464 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* Run the reference on a copy of the input; result goes to img_dest1. */
467 for (i = 0; i < 64; i++)
468 block[i] = block1[i];
469 idct248_ref(img_dest1, 8, block);
/* Run the implementation under test on a fresh copy; result in img_dest. */
471 for (i = 0; i < 64; i++)
472 block[i] = block1[i];
473 idct248_put(img_dest, 8, block);
/* Absolute difference per output byte. */
475 for (i = 0; i < 64; i++) {
476 v = abs((int) img_dest[i] - (int) img_dest1[i]);
478 printf("%d %d\n", img_dest[i], img_dest1[i]);
487 printf(" %3d", img_dest1[i*8+j]);
496 printf(" %3d", img_dest[i*8+j]);
/* The constant condition always selects the "IDCT248" label. */
502 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing section: each call gets a fresh copy of the input. */
510 for (it = 0; it < NB_ITS_SPEED; it++) {
511 for (i = 0; i < 64; i++)
512 block[i] = block1[i];
513 idct248_put(img_dest, 8, block);
/* Repeat batches until at least 1000000 microseconds have elapsed. */
516 ti1 = gettime() - ti;
517 } while (ti1 < 1000000);
520 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
521 (double) it1 * 1000.0 / (double) ti1);
/*
 * Print the command-line usage summary with printf(): the optional test
 * number and bit depth arguments and the -i and -4 switches.
 * NOTE(review): the end of the printf() call is not visible in this
 * listing, so further option lines may follow.
 */
524 static void help(void)
526 printf("dct-test [-i] [<test-number>] [<bits>]\n"
527 "test-number 0 -> test with random matrixes\n"
528 " 1 -> test with random sparse matrixes\n"
529 " 2 -> do 3. test from mpeg4 std\n"
530 "bits Number of time domain bits to use, 8 is default\n"
531 "-i test IDCT implementations\n"
532 "-4 test IDCT248 implementations\n"
536 int main(int argc, char **argv)
538 int test_idct = 0, test_248_dct = 0;
545 cpu_flags = av_get_cpu_flags();
551 c = getopt(argc, argv, "ih4t");
572 test = atoi(argv[optind]);
573 if(optind+1 < argc) bits= atoi(argv[optind+1]);
575 printf("ffmpeg DCT/IDCT test\n");
578 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
580 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
581 for (i = 0; algos[i].name; i++)
582 if (!(~cpu_flags & algos[i].mm_support)) {
583 err |= dct_error(&algos[i], test, test_idct, speed, bits);