2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/* Prototypes for DCT/IDCT implementations, declared directly in this file.
 * Apart from odivx_idct_c, each of them is referenced by name from fdct_tab
 * or idct_tab below. */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
/* NOTE(review): odivx_idct_c is not referenced anywhere else in the visible
 * part of this file, and it is declared with short* rather than DCTELEM*. */
51 void odivx_idct_c(short *block);
54 void ff_bfin_idct(DCTELEM *block);
55 void ff_bfin_fdct(DCTELEM *block);
58 void fdct_altivec(DCTELEM *block);
59 //void idct_altivec(DCTELEM *block);?? no routine
62 void ff_j_rev_dct_arm(DCTELEM *data);
63 void ff_simple_idct_arm(DCTELEM *data);
64 void ff_simple_idct_armv5te(DCTELEM *data);
65 void ff_simple_idct_armv6(DCTELEM *data);
66 void ff_simple_idct_neon(DCTELEM *data);
68 void ff_simple_idct_axp(DCTELEM *data);
/* Members of the per-algorithm descriptor used by fdct_tab and idct_tab
 * (the opening of the struct is not visible in this view). */
/* Transform under test: takes a pointer to a block of DCTELEM values. */
72 void (*func)(DCTELEM *block);
/* Layout tag for the transform's input. permute() uses it to choose how the
 * 64 coefficients are reordered, and dct_error() applies the ff_aanscales
 * rescaling when it equals SCALE_PERM. */
73 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
74 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
/* FAAN_SCALE is the format tag given to the "FAAN" entry of fdct_tab.
 * It is SCALE_PERM when FAAN_POSTSCALE is not defined; the second definition
 * (NO_PERM) is the alternative. The directives that separate the two
 * definitions are not visible in this view. */
79 #ifndef FAAN_POSTSCALE
80 #define FAAN_SCALE SCALE_PERM
82 #define FAAN_SCALE NO_PERM
/* Forward-DCT implementations; main() walks this table when IDCT testing is
 * not selected. Each entry gives a display name, the function to call and the
 * input format tag. Where present, the fourth initializer is an AV_CPU_FLAG_*
 * value (presumably the mm_support field that main() checks against
 * cpu_flags; the struct definition is not fully visible here). */
87 static const struct algo fdct_tab[] = {
88 { "REF-DBL", ff_ref_fdct, NO_PERM },
89 { "FAAN", ff_faandct, FAAN_SCALE },
90 { "IJG-AAN-INT", fdct_ifast, SCALE_PERM },
91 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
94 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
95 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
96 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
100 { "altivecfdct", fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
104 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* Four-argument ProRes IDCT routine, declared here so the wrapper below can
 * call it. */
111 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
112 DCTELEM *block, int16_t *qmat);
/* Single-argument adapter that lets the routine above be listed in idct_tab
 * (entry "PR-SSE2"). It calls the routine with a line size of 16, a local
 * qmat array and a buffer named tmp.
 * NOTE(review): the declaration of tmp and the code that fills qmat/tmp are
 * not visible in this view.
 * NOTE(review): the parameter is uint16_t* while the table's func pointer
 * takes DCTELEM*; confirm the two types are layout-compatible. */
114 static void ff_prores_idct_put_10_sse2_wrap(uint16_t *dst){
115 int16_t qmat[64]; int i;
122 ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
/* Inverse-DCT implementations; main() walks this table when IDCT testing is
 * selected. Each entry gives a display name, the function to call and the
 * input format tag passed to permute(). Where present, the fourth initializer
 * is an AV_CPU_FLAG_* value and the fifth is 1 (presumably the nonspec field
 * that dct_error() reads; the struct definition is not fully visible here). */
126 static const struct algo idct_tab[] = {
127 { "FAANI", ff_faanidct, NO_PERM },
128 { "REF-DBL", ff_ref_idct, NO_PERM },
129 { "INT", j_rev_dct, MMX_PERM },
130 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
134 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
135 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
137 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
138 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
139 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
140 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
142 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
147 { "BFINidct", ff_bfin_idct, NO_PERM },
151 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
152 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
155 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
158 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
161 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
165 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* Number of fractional bits used by dct_error() when it rescales the block
 * for SCALE_PERM algorithms: the scale factor is built with
 * 1 << (AANSCALE_BITS + 11) and the product is shifted right by
 * AANSCALE_BITS. */
171 #define AANSCALE_BITS 12
/**
 * Return the current time of day in microseconds.
 *
 * The time is read with gettimeofday() and combined as
 * seconds * 1000000 + microseconds. The seconds field is widened to 64 bits
 * before the multiplication so the product does not overflow a narrower
 * time type.
 *
 * @return microseconds as reported by gettimeofday()
 */
static int64_t gettime(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
}
/* Number of transform calls in one inner batch of the speed loops in
 * dct_error() and idct248_error(). */
181 #define NB_ITS_SPEED 50000
/* Destination index for each source index under MMX_PERM. The table is
 * filled at run time by idct_mmx_init() and read by permute(). */
183 static short idct_mmx_perm[64];
/* Destination index for each source index under MMX_SIMPLE_PERM: permute()
 * stores src[i] at dst[idct_simple_mmx_perm[i]]. */
185 static short idct_simple_mmx_perm[64] = {
186 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
187 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
188 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
189 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
190 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
191 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
192 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
193 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Remapping of the low three index bits under SSE2_PERM: permute() keeps
 * bits 3-5 of the index and replaces bits 0-2 with idct_sse2_row_perm[i & 7]. */
196 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[] for all 64 indices. For each index i the entry keeps
 * bits 3-5 unchanged, moves bits 1-2 down to bits 0-1, and moves bit 0 up to
 * bit 2. */
198 static void idct_mmx_init(void)
202 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
203 for (i = 0; i < 64; i++) {
204 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* Two file-level buffers of 64 DCTELEM each, shared by dct_error() and
 * idct248_error() as working and saved-input blocks. They are declared
 * through DECLARE_ALIGNED with alignment arguments 16 and 8 respectively. */
208 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
209 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/* Execute the `emms` instruction via inline assembly, but only when
 * cpu_flags reports AV_CPU_FLAG_MMX.
 * NOTE(review): any preprocessor guard around the body is not visible in
 * this view; confirm the asm is compiled only for targets that support it. */
211 static inline void mmx_emms(void)
214 if (cpu_flags & AV_CPU_FLAG_MMX)
215 __asm__ volatile ("emms\n\t");
/* Fill a 64-entry block with pseudo-random test data.
 *
 * block    destination, cleared first
 * test     selects the fill pattern; the branch selectors are not visible in
 *          this view (help() lists 0 = random, 1 = random sparse,
 *          2 = "3. test from mpeg4 std")
 * is_idct  not used on any visible line
 * prng     generator the values are drawn from with av_lfg_get()
 * vals     magnitude bound for the generated values */
219 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
223 memset(block, 0, 64 * sizeof(*block));
/* Dense fill: every entry gets (random % (2*vals)) - vals, i.e. a value in
 * [-vals, vals) assuming av_lfg_get() yields a non-negative value. */
227 for (i = 0; i < 64; i++)
228 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
/* The body of the following loop is not visible in this view. */
231 for (i = 0; i < 64; i++)
/* Sparse fill: choose a count j in 1..10, then write j random values at
 * random positions (positions may repeat, so fewer than j entries may end up
 * non-zero). */
236 j = av_lfg_get(prng) % 10 + 1;
237 for (i = 0; i < j; i++)
238 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
/* Only the first and last entries are set: block[0] is drawn from
 * [-8*vals, 8*vals) under the same assumption, and block[63] is the inverse
 * of block[0]'s lowest bit, which makes block[0] + block[63] odd. */
241 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
242 block[63] = (block[0] & 1) ^ 1;
/* Copy the 64 coefficients of src into dst, reordered according to perm
 * (one of the formattag values). Every branch scatters: src[i] is written to
 * a computed destination index. */
247 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
/* MMX_PERM: destination index comes from the table built by idct_mmx_init(). */
251 if (perm == MMX_PERM) {
252 for (i = 0; i < 64; i++)
253 dst[idct_mmx_perm[i]] = src[i];
/* MMX_SIMPLE_PERM: destination index comes from the constant table. */
254 } else if (perm == MMX_SIMPLE_PERM) {
255 for (i = 0; i < 64; i++)
256 dst[idct_simple_mmx_perm[i]] = src[i];
/* SSE2_PERM: bits 3-5 of the index are kept, bits 0-2 are remapped. */
257 } else if (perm == SSE2_PERM) {
258 for (i = 0; i < 64; i++)
259 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/* PARTTRANS_PERM: bits 2 and 5 are kept; bits 0-1 move to bits 3-4 and
 * bits 3-4 move to bits 0-1. */
260 } else if (perm == PARTTRANS_PERM) {
261 for (i = 0; i < 64; i++)
262 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* TRANSPOSE_PERM: the upper and lower 3-bit halves of the index are swapped,
 * which transposes the block when it is read as 8x8 in row-major order. */
263 } else if (perm == TRANSPOSE_PERM) {
264 for (i = 0; i < 64; i++)
265 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
/* Remaining formats: the body of this loop is not visible in this view. */
267 for (i = 0; i < 64; i++)
/* Measure the accuracy of one algorithm over NB_ITS pseudo-random blocks and
 * print a summary line; when the speed part runs, also print a throughput
 * line.
 *
 * dct      table entry describing the algorithm under test
 * test     input pattern selector, forwarded to init_block()
 * is_idct  non-zero to compare against ff_ref_idct, zero for ff_ref_fdct
 * speed    speed-test switch (the line that tests it is not visible here)
 * bits     input range exponent: values are generated with vals = 1 << bits
 *
 * main() ORs the return value into its error flag. Several statements of this
 * function, including the calls to the transforms and the return statements,
 * are not visible in this view; the comments below describe only the visible
 * lines. */
272 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
/* Reference transform matching the direction being tested. */
274 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
277 int64_t err2, ti, ti1, it1, err_sum = 0;
278 int64_t sysErr[64], sysErrMax = 0;
280 int blockSumErrMax = 0, blockSumErr;
282 const int vals=1<<bits;
/* The generator is initialised with the constant 1 on every call. */
286 av_lfg_init(&prng, 1);
290 for (i = 0; i < 64; i++)
292 for (it = 0; it < NB_ITS; it++) {
/* block1 receives the generated input; block receives the same data in the
 * order required by the algorithm's format. */
293 init_block(block1, test, is_idct, &prng, vals);
294 permute(block, block1, dct->format);
/* SCALE_PERM algorithms: multiply each coefficient by
 * 8 * 2^(AANSCALE_BITS + 11) / ff_aanscales[i], then shift right by
 * AANSCALE_BITS. */
299 if (dct->format == SCALE_PERM) {
300 for (i = 0; i < 64; i++) {
301 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
302 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Per-coefficient comparison of block against block1 (presumably the output
 * of the algorithm under test against the reference output; the calls
 * themselves are not visible here). */
309 for (i = 0; i < 64; i++) {
310 int err = block[i] - block1[i];
/* Signed difference accumulated per coefficient position over all blocks. */
316 sysErr[i] += block[i] - block1[i];
/* Track the largest absolute value seen in block. */
318 if (abs(block[i]) > maxout)
319 maxout = abs(block[i]);
/* Keep the largest blockSumErr seen over all iterations. */
321 if (blockSumErrMax < blockSumErr)
322 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated per-position difference. */
324 for (i = 0; i < 64; i++)
325 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
327 for (i = 0; i < 64; i++) {
330 printf("%7d ", (int) sysErr[i]);
/* err2 and err_sum are accumulated on lines not visible here; both are
 * averaged over NB_ITS blocks of 64 coefficients. */
334 omse = (double) err2 / NB_ITS / 64;
335 ome = (double) err_sum / NB_ITS / 64;
/* Only IDCTs are checked: flagged when err_inf exceeds 1, omse exceeds 0.02
 * or the magnitude of ome exceeds 0.0015. */
337 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
339 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
340 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
341 omse, ome, (double) sysErrMax / NB_ITS,
342 maxout, blockSumErrMax);
/* Taken only for algorithms whose nonspec field is zero; the statement it
 * guards is not visible here. */
344 if (spec_err && !dct->nonspec)
/* Speed part: generate one input, keep its permuted form in block1, and
 * restore block from block1 before every timed iteration. */
352 init_block(block, test, is_idct, &prng, vals);
353 permute(block1, block, dct->format);
358 for (it = 0; it < NB_ITS_SPEED; it++) {
359 memcpy(block, block1, sizeof(block));
/* Batches repeat until at least 1000000 microseconds (gettime() units) have
 * elapsed. */
363 ti1 = gettime() - ti;
364 } while (ti1 < 1000000);
/* it1 * 1000 / ti1, with ti1 in microseconds, is thousands of units of it1
 * per second (it1 is presumably the number of calls made; its update is not
 * visible here). */
367 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
368 (double) it1 * 1000.0 / (double) ti1);
/* 64-byte output images used by idct248_error(): img_dest receives the
 * output of the routine under test, img_dest1 the output of idct248_ref().
 * Both are declared through DECLARE_ALIGNED with alignment argument 8. */
373 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
374 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Floating-point reference used by idct248_error() to check an
 * idct248_put-style routine.
 *
 * dest      output image; entry (i, j) is written at dest[i * linesize + j]
 * linesize  distance between output rows, in bytes
 * block     64 input coefficients, read as 8 rows of 8
 *
 * Several statements (local declarations, any one-time initialisation guard,
 * the value assigned to s before the butterfly, resets of sum, and the lower
 * clamp) are not visible in this view. */
376 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine tables kept across calls (static storage). */
379 static double c8[8][8];
380 static double c4[4][4];
381 double block1[64], block2[64], block3[64];
/* 8-point table: c8[i][j] = s * cos(pi * i * (j + 0.5) / 8), with
 * s = sqrt(1/8) for i == 0 and sqrt(1/4) otherwise. The sum of squares of
 * each row is accumulated in sum; no use of it is visible here. */
388 for (i = 0; i < 8; i++) {
390 for (j = 0; j < 8; j++) {
391 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
392 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
393 sum += c8[i][j] * c8[i][j];
/* 4-point table: c4[i][j] = s * cos(pi * i * (j + 0.5) / 4), with
 * s = sqrt(1/4) for i == 0 and sqrt(1/2) otherwise. */
397 for (i = 0; i < 4; i++) {
399 for (j = 0; j < 4; j++) {
400 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
401 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
402 sum += c4[i][j] * c4[i][j];
/* For each pair of adjacent input rows (2i, 2i+1): the even row of block1
 * gets the scaled sum and the odd row gets the scaled difference. */
409 for (i = 0; i < 4; i++) {
410 for (j = 0; j < 8; j++) {
411 block1[8 * (2 * i) + j] =
412 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
413 block1[8 * (2 * i + 1) + j] =
414 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* Row pass: each row of block1 is combined with the 8-point table. */
419 for (i = 0; i < 8; i++) {
420 for (j = 0; j < 8; j++) {
422 for (k = 0; k < 8; k++)
423 sum += c8[k][j] * block1[8 * i + k];
424 block2[8 * i + j] = sum;
/* Column pass: within each column i, the four even rows and the four odd
 * rows of block2 are each combined with the 4-point table, producing the
 * even and odd rows of block3 respectively. */
429 for (i = 0; i < 8; i++) {
430 for (j = 0; j < 4; j++) {
433 for (k = 0; k < 4; k++)
434 sum += c4[k][j] * block2[8 * (2 * k) + i];
435 block3[8 * (2 * j) + i] = sum;
439 for (k = 0; k < 4; k++)
440 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
441 block3[8 * (2 * j + 1) + i] = sum;
445 /* clamp and store the result */
446 for (i = 0; i < 8; i++) {
447 for (j = 0; j < 8; j++) {
448 v = block3[8 * i + j];
/* Upper bound is 255; the value is rounded with rint() before the store. */
450 else if (v > 255) v = 255;
451 dest[i * linesize + j] = (int) rint(v);
/* Compare an idct248_put-style routine against idct248_ref() over NB_ITS
 * pseudo-random blocks, print the result, and run a timing loop that prints
 * a throughput line.
 *
 * name         label printed in the result lines
 * idct248_put  routine under test; called as idct248_put(img_dest, 8, block)
 *
 * The remaining parameters and several statements are not visible in this
 * view; main() passes a third argument named speed. */
456 static void idct248_error(const char *name,
457 void (*idct248_put)(uint8_t *dest, int line_size,
/* NOTE(review): ti and ti1 are int here, but gettime() returns int64_t and
 * dct_error() declares the same variables as int64_t; the microsecond value
 * does not fit in a 32-bit int. Consider int64_t for ti, ti1 and it1. */
461 int it, i, it1, ti, ti1, err_max, v;
464 av_lfg_init(&prng, 1);
/* NOTE(review): the comment opened on the next line has no terminator in
 * this view; its closing line appears to be missing. */
466 /* just one test to see if code is correct (precision is less
469 for (it = 0; it < NB_ITS; it++) {
470 /* XXX: use forward transform to generate values */
471 for (i = 0; i < 64; i++)
472 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* block is reloaded from block1 before each call so that the reference and
 * the routine under test are given the same input. */
475 for (i = 0; i < 64; i++)
476 block[i] = block1[i];
477 idct248_ref(img_dest1, 8, block);
479 for (i = 0; i < 64; i++)
480 block[i] = block1[i];
481 idct248_put(img_dest, 8, block);
/* Absolute per-pixel difference between the two outputs. */
483 for (i = 0; i < 64; i++) {
484 v = abs((int) img_dest[i] - (int) img_dest1[i]);
486 printf("%d %d\n", img_dest[i], img_dest1[i]);
495 printf(" %3d", img_dest1[i*8+j]);
504 printf(" %3d", img_dest[i*8+j]);
/* The label is always "IDCT248": the condition is the constant 1. */
510 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing loop: batches of NB_ITS_SPEED calls repeat until at least 1000000
 * microseconds (gettime() units) have elapsed. */
518 for (it = 0; it < NB_ITS_SPEED; it++) {
519 for (i = 0; i < 64; i++)
520 block[i] = block1[i];
521 idct248_put(img_dest, 8, block);
524 ti1 = gettime() - ti;
525 } while (ti1 < 1000000);
528 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
529 (double) it1 * 1000.0 / (double) ti1);
/* Print the usage text to stdout as one printf of concatenated string
 * literals (the end of the literal sequence is not visible in this view).
 * NOTE(review): the synopsis line shows only [-i], while main() passes
 * "ih4t" to getopt(); consider listing the other options in the synopsis. */
532 static void help(void)
534 printf("dct-test [-i] [<test-number>] [<bits>]\n"
535 "test-number 0 -> test with random matrixes\n"
536 " 1 -> test with random sparse matrixes\n"
537 " 2 -> do 3. test from mpeg4 std\n"
538 "bits Number of time domain bits to use, 8 is default\n"
539 "-i test IDCT implementations\n"
540 "-4 test IDCT248 implementations\n"
/* Program entry point: read the CPU flags, parse options and positional
 * arguments, then run either the IDCT248 check or every applicable entry of
 * the selected algorithm table. Option handling and the return statement are
 * not visible in this view. */
544 int main(int argc, char **argv)
546 int test_idct = 0, test_248_dct = 0;
553 cpu_flags = av_get_cpu_flags();
/* Accepted option letters: i, h, 4 and t (none takes an argument). */
559 c = getopt(argc, argv, "ih4t");
/* First positional argument selects the test pattern, the optional second
 * one the number of bits.
 * NOTE(review): atoi() reports no conversion errors; non-numeric input
 * silently becomes 0. */
580 test = atoi(argv[optind]);
581 if(optind+1 < argc) bits= atoi(argv[optind+1]);
583 printf("ffmpeg DCT/IDCT test\n");
586 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
/* Walk the selected table until an entry with a null name is reached. */
588 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
589 for (i = 0; algos[i].name; i++)
/* Run an entry only when every flag set in its mm_support is also set in
 * cpu_flags; results are OR-ed into err. */
590 if (!(~cpu_flags & algos[i].mm_support)) {
591 err |= dct_error(&algos[i], test, test_idct, speed, bits);