/*
 * (c) 2001 Fabrice Bellard
 *     2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/**
 * @file
 * DCT test (c) 2001 Fabrice Bellard
 * Started from sample code by Juan J. Sierralta P.
 */
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
51 void odivx_idct_c(short *block);
54 void ff_bfin_idct(DCTELEM *block);
55 void ff_bfin_fdct(DCTELEM *block);
58 void fdct_altivec(DCTELEM *block);
59 //void idct_altivec(DCTELEM *block);?? no routine
62 void ff_j_rev_dct_arm(DCTELEM *data);
63 void ff_simple_idct_arm(DCTELEM *data);
64 void ff_simple_idct_armv5te(DCTELEM *data);
65 void ff_simple_idct_armv6(DCTELEM *data);
66 void ff_simple_idct_neon(DCTELEM *data);
68 void ff_simple_idct_axp(DCTELEM *data);
72 void (*func)(DCTELEM *block);
73 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
74 SSE2_PERM, PARTTRANS_PERM } format;
/*
 * Format used for the "FAAN" forward DCT table entry: SCALE_PERM (dct_error()
 * then rescales the output with ff_aanscales[]) unless FAAN_POSTSCALE is
 * defined, in which case the output is taken as-is (NO_PERM).
 */
#ifndef FAAN_POSTSCALE
#define FAAN_SCALE SCALE_PERM
#else
#define FAAN_SCALE NO_PERM
#endif
/*
 * Forward DCT implementations known to the test; main() walks this table
 * when IDCT testing was not requested. Each initializer is positional:
 * label, function, coefficient format and, where given, a fourth value that
 * is an AV_CPU_FLAG_* constant (presumably the mm_support requirement that
 * main() checks against the detected CPU flags).
 *
 * NOTE(review): this listing shows neither preprocessor guards around the
 * architecture-specific entries nor the closing entry with a null name that
 * the loop in main() stops on -- confirm both against the complete file.
 */
87 static const struct algo fdct_tab[] = {
/* portable C implementations */
88 { "REF-DBL", ff_ref_fdct, NO_PERM },
89 { "FAAN", ff_faandct, FAAN_SCALE },
90 { "IJG-AAN-INT", fdct_ifast, SCALE_PERM },
91 { "IJG-LLM-INT", ff_jpeg_fdct_islow, NO_PERM },
/* x86 SIMD implementations */
94 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
95 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
96 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
/* AltiVec implementation */
100 { "altivecfdct", fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
/* Blackfin implementation */
104 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/*
 * Inverse DCT implementations known to the test; main() walks this table
 * when IDCT testing was requested. Each initializer is positional: label,
 * function, coefficient format and, where given, an AV_CPU_FLAG_* constant
 * followed by a trailing 1 (presumably the mm_support and nonspec members
 * that main() and dct_error() read).
 *
 * NOTE(review): this listing shows neither preprocessor guards around the
 * architecture-specific entries nor the closing entry with a null name that
 * the loop in main() stops on -- confirm both against the complete file.
 */
110 static const struct algo idct_tab[] = {
/* portable C implementations */
111 { "FAANI", ff_faanidct, NO_PERM },
112 { "REF-DBL", ff_ref_idct, NO_PERM },
113 { "INT", j_rev_dct, MMX_PERM },
114 { "SIMPLE-C", ff_simple_idct, NO_PERM },
/* x86 SIMD implementations */
118 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
119 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
121 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
122 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
123 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
124 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
/* Blackfin implementation */
128 { "BFINidct", ff_bfin_idct, NO_PERM },
/* ARM implementations */
132 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
133 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
136 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
139 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
142 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
/* Alpha implementation */
146 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
152 #define AANSCALE_BITS 12
154 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/**
 * Read the current time of day.
 *
 * @return time since the Epoch in microseconds
 */
static int64_t gettime(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    /* widen before multiplying so the seconds part cannot overflow */
    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
}
/* Number of transform calls per timing batch in the speed measurements */
#define NB_ITS_SPEED 50000

/* Input permutation for MMX_PERM transforms; filled in by idct_mmx_init() */
static short idct_mmx_perm[64];

/* Input permutation for MMX_SIMPLE_PERM transforms: coefficient i goes to
 * position idct_simple_mmx_perm[i] */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Column permutation applied within every row for SSE2_PERM transforms */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

/**
 * Fill idct_mmx_perm[].
 *
 * The row bits (0x38) of each index are kept; within a row the three column
 * bits are rotated: bits 2..1 move down to bits 1..0 and bit 0 moves up to
 * bit 2.
 */
static void idct_mmx_init(void)
{
    int i;

    /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
    for (i = 0; i < 64; i++) {
        idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
    }
}
191 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
192 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
193 DECLARE_ALIGNED(8, static DCTELEM, block_org)[64];
/*
 * Execute the x86 "emms" instruction when the detected CPU flags include
 * AV_CPU_FLAG_MMX; otherwise do nothing.
 *
 * NOTE(review): the function braces, and any compile-time guard one would
 * expect around the inline assembly, are not visible in this listing --
 * confirm against the complete file.
 */
195 static inline void mmx_emms(void)
198 if (cpu_flags & AV_CPU_FLAG_MMX)
199 __asm__ volatile ("emms\n\t");
/*
 * Measure the accuracy of one DCT/IDCT implementation against the
 * double-precision reference transform and print the statistics; the tail
 * of the function measures throughput.
 *
 * dct      table entry describing the implementation under test
 * test     input generator selector (help() lists 0 = random matrices,
 *          1 = sparse random matrices, 2 = MPEG-4 test pattern)
 * is_idct  nonzero: reference is ff_ref_idct, otherwise ff_ref_fdct
 * speed    NOTE(review): presumably enables the timing section -- the test
 *          on it is not visible in this listing
 * bits     random coefficients are drawn from [-2^bits, 2^bits)
 *
 * The caller ORs the return value into its error status; the return
 * statements themselves are not visible here.
 *
 * NOTE(review): this listing omits many lines of the function (braces,
 * several declarations, loop bodies, the calls through ref and dct->func);
 * the comments below describe only what is visible.
 */
204 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
/* reference transform matching the direction under test */
206 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
209 int64_t err2, ti, ti1, it1, err_sum = 0;
/* per-coefficient signed error accumulated over all iterations */
210 int64_t sysErr[64], sysErrMax = 0;
212 int blockSumErrMax = 0, blockSumErr;
/* half-width of the random coefficient range */
214 const int vals=1<<bits;
/* fixed seed: every run sees the same pseudo-random input sequence */
218 av_lfg_init(&prng, 1);
222 for (i = 0; i < 64; i++)
/* ---- accuracy measurement over NB_ITS generated blocks ---- */
224 for (it = 0; it < NB_ITS; it++) {
225 for (i = 0; i < 64; i++)
/* dense input: every coefficient uniformly in [-vals, vals) */
229 for (i = 0; i < 64; i++)
230 block1[i] = (av_lfg_get(&prng) % (2*vals)) -vals;
233 for (i = 0; i < 64; i++)
/* sparse input: 1 to 10 randomly chosen positions get a random value */
238 int num = av_lfg_get(&prng) % 10 + 1;
239 for (i = 0; i < num; i++)
240 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % (2*vals) -vals;
/* DC in [-8*vals, 8*vals); the last coefficient is 1 when DC is even, 0 when odd */
244 block1[0] = av_lfg_get(&prng) % (16*vals) - (8*vals);
245 block1[63] = (block1[0] & 1) ^ 1;
/* keep a copy of the generated input */
249 for (i = 0; i < 64; i++)
250 block_org[i] = block1[i];
/* copy the input into block[] in the layout the implementation expects */
252 if (dct->format == MMX_PERM) {
253 for (i = 0; i < 64; i++)
254 block[idct_mmx_perm[i]] = block1[i];
255 } else if (dct->format == MMX_SIMPLE_PERM) {
256 for (i = 0; i < 64; i++)
257 block[idct_simple_mmx_perm[i]] = block1[i];
258 } else if (dct->format == SSE2_PERM) {
/* rows stay in place, columns are permuted within each row */
259 for (i = 0; i < 64; i++)
260 block[(i & 0x38) | idct_sse2_row_perm[i & 7]] = block1[i];
261 } else if (dct->format == PARTTRANS_PERM) {
/* bit 2 of row and column is kept, the low two row bits and low two column
   bits are exchanged: each 4x4 quadrant is transposed in place */
262 for (i = 0; i < 64; i++)
263 block[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = block1[i];
265 for (i = 0; i < 64; i++)
266 block[i] = block1[i];
/* SCALE_PERM implementations leave the AAN scale factors in their output;
   divide them out in fixed point before comparing */
272 if (dct->format == SCALE_PERM) {
273 for (i = 0; i < 64; i++) {
274 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
275 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* compare block[] with block1[] (presumably the reference result at this
   point -- the call through ref is not visible in this listing) */
282 for (i = 0; i < 64; i++) {
283 int err = block[i] - block1[i];
289 sysErr[i] += block[i] - block1[i];
/* largest output magnitude seen */
291 if (abs(block[i]) > maxout)
292 maxout = abs(block[i]);
/* worst per-block error sum seen so far */
294 if (blockSumErrMax < blockSumErr)
295 blockSumErrMax = blockSumErr;
/* largest absolute accumulated per-coefficient error */
297 for (i = 0; i < 64; i++)
298 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
/* dump the accumulated per-coefficient errors */
300 for (i = 0; i < 64; i++) {
303 printf("%7d ", (int) sysErr[i]);
/* err2 and err_sum averaged over every coefficient of every block */
307 omse = (double) err2 / NB_ITS / 64;
308 ome = (double) err_sum / NB_ITS / 64;
/* accuracy limits, applied to IDCTs only: err_inf at most 1, omse at most
   0.02, |ome| at most 0.0015 */
310 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
312 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
313 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
314 omse, ome, (double) sysErrMax / NB_ITS,
315 maxout, blockSumErrMax);
/* entries flagged nonspec are exempt from whatever follows this test */
317 if (spec_err && !dct->nonspec)
/* ---- throughput measurement ---- */
324 for (i = 0; i < 64; i++)
329 for (i = 0; i < 64; i++)
330 block1[i] = av_lfg_get(&prng) % (2*vals) -vals;
333 for (i = 0; i < 64; i++)
/* only the first four coefficients get random values here */
339 block1[0] = av_lfg_get(&prng) % (2*vals) -vals;
340 block1[1] = av_lfg_get(&prng) % (2*vals) -vals;
341 block1[2] = av_lfg_get(&prng) % (2*vals) -vals;
342 block1[3] = av_lfg_get(&prng) % (2*vals) -vals;
/* NOTE(review): as far as this listing shows, only MMX_PERM and
   MMX_SIMPLE_PERM inputs are permuted for timing; all other formats are
   copied unchanged */
346 if (dct->format == MMX_PERM) {
347 for (i = 0; i < 64; i++)
348 block[idct_mmx_perm[i]] = block1[i];
349 } else if (dct->format == MMX_SIMPLE_PERM) {
350 for (i = 0; i < 64; i++)
351 block[idct_simple_mmx_perm[i]] = block1[i];
353 for (i = 0; i < 64; i++)
354 block[i] = block1[i];
/* one batch: NB_ITS_SPEED iterations, each reloading block[] from block1[] */
360 for (it = 0; it < NB_ITS_SPEED; it++) {
361 for (i = 0; i < 64; i++)
362 block[i] = block1[i];
/* keep running batches until at least one second (gettime() counts
   microseconds) has elapsed */
366 ti1 = gettime() - ti;
367 } while (ti1 < 1000000);
/* it1 * 1000 / microseconds = thousands per second (it1 presumably counts
   transform calls) */
370 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
371 (double) it1 * 1000.0 / (double) ti1);
376 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
377 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
379 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
382 static double c8[8][8];
383 static double c4[4][4];
384 double block1[64], block2[64], block3[64];
391 for (i = 0; i < 8; i++) {
393 for (j = 0; j < 8; j++) {
394 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
395 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
396 sum += c8[i][j] * c8[i][j];
400 for (i = 0; i < 4; i++) {
402 for (j = 0; j < 4; j++) {
403 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
404 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
405 sum += c4[i][j] * c4[i][j];
412 for (i = 0; i < 4; i++) {
413 for (j = 0; j < 8; j++) {
414 block1[8 * (2 * i) + j] =
415 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
416 block1[8 * (2 * i + 1) + j] =
417 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
422 for (i = 0; i < 8; i++) {
423 for (j = 0; j < 8; j++) {
425 for (k = 0; k < 8; k++)
426 sum += c8[k][j] * block1[8 * i + k];
427 block2[8 * i + j] = sum;
432 for (i = 0; i < 8; i++) {
433 for (j = 0; j < 4; j++) {
436 for (k = 0; k < 4; k++)
437 sum += c4[k][j] * block2[8 * (2 * k) + i];
438 block3[8 * (2 * j) + i] = sum;
442 for (k = 0; k < 4; k++)
443 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
444 block3[8 * (2 * j + 1) + i] = sum;
448 /* clamp and store the result */
449 for (i = 0; i < 8; i++) {
450 for (j = 0; j < 8; j++) {
451 v = block3[8 * i + j];
453 else if (v > 255) v = 255;
454 dest[i * linesize + j] = (int) rint(v);
/*
 * Compare a 2-4-8 IDCT "put" implementation with idct248_ref() on random
 * input, print the largest absolute pixel difference, then measure its
 * throughput.
 *
 * name         label printed in the report lines
 * idct248_put  implementation under test; called with an 8-byte line size
 *
 * NOTE(review): this listing omits part of the parameter list (main() passes
 * a third argument, speed) as well as braces, declarations and the
 * difference-dump loops; the comments describe only what is visible.
 */
459 static void idct248_error(const char *name,
460 void (*idct248_put)(uint8_t *dest, int line_size,
464 int it, i, it1, ti, ti1, err_max, v;
/* fixed seed: every run sees the same pseudo-random input sequence.
   NOTE(review): the comment opened on the line after av_lfg_init() is not
   closed in this listing, so it lexically runs on until the end of the
   "XXX" comment -- restore its closing line from the complete file. */
467 av_lfg_init(&prng, 1);
469 /* just one test to see if code is correct (precision is less
472 for (it = 0; it < NB_ITS; it++) {
473 /* XXX: use forward transform to generate values */
/* random coefficients in [-128, 127] */
474 for (i = 0; i < 64; i++)
475 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* reference result goes to img_dest1 */
478 for (i = 0; i < 64; i++)
479 block[i] = block1[i];
480 idct248_ref(img_dest1, 8, block);
/* reload the same input and run the implementation under test into img_dest */
482 for (i = 0; i < 64; i++)
483 block[i] = block1[i];
484 idct248_put(img_dest, 8, block);
/* absolute per-pixel difference between the two outputs */
486 for (i = 0; i < 64; i++) {
487 v = abs((int) img_dest[i] - (int) img_dest1[i]);
489 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* dumps of the reference output and of the tested output */
498 printf(" %3d", img_dest1[i*8+j]);
507 printf(" %3d", img_dest[i*8+j]);
513 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* ---- throughput measurement ---- */
/* one batch: NB_ITS_SPEED calls, each on a freshly reloaded block[] */
521 for (it = 0; it < NB_ITS_SPEED; it++) {
522 for (i = 0; i < 64; i++)
523 block[i] = block1[i];
524 idct248_put(img_dest, 8, block);
/* keep running batches until at least one second (gettime() counts
   microseconds) has elapsed; note ti and ti1 are plain int here */
527 ti1 = gettime() - ti;
528 } while (ti1 < 1000000);
531 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
532 (double) it1 * 1000.0 / (double) ti1);
/**
 * Print the command-line usage summary to stdout.
 *
 * Lists the test numbers and every mode switch that main() hands to getopt().
 */
static void help(void)
{
    printf("dct-test [-i] [<test-number>]\n"
           "test-number 0 -> test with random matrixes\n"
           " 1 -> test with random sparse matrixes\n"
           " 2 -> do 3. test from mpeg4 std\n"
           "-i test IDCT implementations\n"
           "-4 test IDCT248 implementations\n"
           "-t speed test\n");
}
546 int main(int argc, char **argv)
548 int test_idct = 0, test_248_dct = 0;
555 cpu_flags = av_get_cpu_flags();
560 for (i = 0; i < 256; i++)
561 cropTbl[i + MAX_NEG_CROP] = i;
562 for (i = 0; i < MAX_NEG_CROP; i++) {
564 cropTbl[i + MAX_NEG_CROP + 256] = 255;
568 c = getopt(argc, argv, "ih4t");
589 test = atoi(argv[optind]);
590 if(optind+1 < argc) bits= atoi(argv[optind+1]);
592 printf("ffmpeg DCT/IDCT test\n");
595 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
597 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
598 for (i = 0; algos[i].name; i++)
599 if (!(~cpu_flags & algos[i].mm_support)) {
600 err |= dct_error(&algos[i], test, test_idct, speed, bits);