/*
 * (c) 2001 Fabrice Bellard
 *     2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/**
 * @file
 * DCT test (c) 2001 Fabrice Bellard
 * Started from sample code by Juan J. Sierralta P.
 */
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/*
 * Hand-written prototypes for (I)DCT routines defined outside this file.
 * Each takes a pointer to a block of coefficients (DCTELEM *, or short *
 * for odivx_idct_c) and returns nothing. All except odivx_idct_c are
 * referenced by fdct_tab[] or idct_tab[] below.
 * NOTE(review): the name suffixes (mmx, bfin, altivec, arm, neon, axp)
 * suggest per-architecture implementations that would normally sit behind
 * build-time guards; no such guards are visible in this listing -- confirm.
 */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
51 void odivx_idct_c(short *block);
54 void ff_bfin_idct(DCTELEM *block);
55 void ff_bfin_fdct(DCTELEM *block);
58 void fdct_altivec(DCTELEM *block);
59 //void idct_altivec(DCTELEM *block);?? no routine
62 void ff_j_rev_dct_arm(DCTELEM *data);
63 void ff_simple_idct_arm(DCTELEM *data);
64 void ff_simple_idct_armv5te(DCTELEM *data);
65 void ff_simple_idct_armv6(DCTELEM *data);
66 void ff_simple_idct_neon(DCTELEM *data);
68 void ff_simple_idct_axp(DCTELEM *data);
/*
 * NOTE(review): these look like members of struct algo (used below as
 * dct->format, dct->name, dct->nonspec and algos[i].mm_support); the struct
 * header and its remaining members are not visible in this listing.
 */
/* Pointer to the transform routine; it takes a block of DCTELEM. */
72 void (*func)(DCTELEM *block);
/*
 * Coefficient layout the routine expects. dct_error() uses it to choose how
 * block1[] is copied into block[]:
 *   NO_PERM          straight copy
 *   MMX_PERM         through idct_mmx_perm[]
 *   MMX_SIMPLE_PERM  through idct_simple_mmx_perm[]
 *   SSE2_PERM        columns of each row reordered by idct_sse2_row_perm[]
 *   PARTTRANS_PERM   low two row bits swapped with low two column bits
 *   SCALE_PERM       straight copy, and the output is rescaled with
 *                    ff_aanscales[] afterwards
 */
73 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
74 SSE2_PERM, PARTTRANS_PERM } format;
/*
 * Layout tag used by the FAAN entry of fdct_tab[]: SCALE_PERM when
 * FAAN_POSTSCALE is not defined.
 * NOTE(review): the second definition (NO_PERM) is presumably the #else
 * branch; the #else and #endif lines are not visible in this listing.
 */
79 #ifndef FAAN_POSTSCALE
80 #define FAAN_SCALE SCALE_PERM
82 #define FAAN_SCALE NO_PERM
/*
 * Forward DCT implementations to test. Each initializer gives a display
 * name, the transform function and an enum formattag layout tag; some also
 * give a CPU-flag mask. main() walks this table until it reaches an entry
 * whose name is null and runs an entry only when every bit of its
 * mm_support mask is set in cpu_flags.
 * NOTE(review): the fourth initializer value is presumably that mm_support
 * mask; the struct definition, any build guards around the entries and the
 * terminating entry are not visible in this listing -- confirm.
 */
87 static const struct algo fdct_tab[] = {
88 { "REF-DBL", ff_ref_fdct, NO_PERM },
89 { "FAAN", ff_faandct, FAAN_SCALE },
90 { "IJG-AAN-INT", fdct_ifast, SCALE_PERM },
91 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
94 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
95 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
96 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
100 { "altivecfdct", fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
104 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/*
 * Inverse DCT implementations to test, in the same initializer form as
 * fdct_tab[]: display name, transform function, layout tag, optional
 * CPU-flag mask and, on some entries, a trailing 1.
 * NOTE(review): the trailing 1 is presumably the nonspec member, which
 * dct_error() consults when its accuracy limits are exceeded; the struct
 * definition, any build guards around the entries and the terminating entry
 * are not visible in this listing -- confirm.
 */
110 static const struct algo idct_tab[] = {
111 { "FAANI", ff_faanidct, NO_PERM },
112 { "REF-DBL", ff_ref_idct, NO_PERM },
113 { "INT", j_rev_dct, MMX_PERM },
114 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
118 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
119 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
121 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
122 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
123 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
124 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
128 { "BFINidct", ff_bfin_idct, NO_PERM },
132 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
133 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
136 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
139 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
142 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
146 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* Fixed-point shift used by dct_error() when it rescales SCALE_PERM output. */
152 #define AANSCALE_BITS 12
/*
 * Lookup table with external linkage, filled in by main(): entry
 * i + MAX_NEG_CROP is i for i in 0..255 and the MAX_NEG_CROP entries above
 * that range are 255.
 * NOTE(review): presumably a clamp-to-0..255 table read by code outside this
 * file; the initialisation of the lowest MAX_NEG_CROP entries is not visible
 * in this listing -- confirm.
 */
154 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/**
 * Return the current wall-clock time in microseconds.
 *
 * The value is tv_sec * 1000000 + tv_usec as reported by gettimeofday().
 * The timing loops in this file subtract an earlier reading from it to get
 * an elapsed time, so only differences between two calls are meaningful.
 *
 * @return microseconds since the gettimeofday() epoch
 */
static int64_t gettime(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    /* widen before multiplying so the product cannot overflow a 32-bit time_t */
    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
}
/*
 * Number of transform calls in one timing batch; the speed loops in
 * dct_error() and idct248_error() repeat such batches until at least
 * 1000000 microseconds have elapsed.
 */
164 #define NB_ITS_SPEED 50000
/*
 * Destination index for each source coefficient when feeding a transform
 * tagged MMX_PERM. Zero until idct_mmx_init() has run.
 */
static short idct_mmx_perm[64];

/*
 * Destination index for each source coefficient when feeding a transform
 * tagged MMX_SIMPLE_PERM. The table is a fixed permutation of 0..63.
 */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/*
 * Destination column for each source column when feeding a transform tagged
 * SSE2_PERM; the row part of the index is left unchanged by the caller.
 */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

/**
 * Fill idct_mmx_perm[].
 *
 * For every index i the row bits (i & 0x38) are kept, column bits 1-2 move
 * down to bits 0-1 and column bit 0 moves up to bit 2, so the columns
 * 0,1,2,3,4,5,6,7 of each row map to 0,4,1,5,2,6,3,7.
 */
static void idct_mmx_init(void)
{
    int i;

    /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
    for (i = 0; i < 64; i++) {
        idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
    }
}
/*
 * Scratch coefficient blocks shared by dct_error() and idct248_error():
 * block1[] receives the generated input and block[] is the copy (permuted
 * where the layout tag asks for it) that is handed to the code under test.
 * NOTE(review): the first DECLARE_ALIGNED argument is presumably the
 * alignment in bytes; the macro is defined outside this file -- confirm.
 */
191 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
192 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/*
 * Issue the x86 emms instruction when cpu_flags reports AV_CPU_FLAG_MMX.
 * NOTE(review): emms is normally used to leave MMX state so that x87
 * floating point can be used again; the call sites and any build guard
 * around the body are not visible in this listing -- confirm.
 */
194 static inline void mmx_emms(void)
197 if (cpu_flags & AV_CPU_FLAG_MMX)
198 __asm__ volatile ("emms\n\t");
/*
 * Measure the accuracy of one transform against the reference transform,
 * print a summary line, and time the transform.
 *
 * dct      table entry (name, layout tag, nonspec flag) of the transform
 * test     selects the input generator; see the three generators below
 * is_idct  non-zero: compare against ff_ref_idct and label output IDCT;
 *          zero: compare against ff_ref_fdct and label output DCT
 * speed    NOTE(review): its use is not visible in this listing; presumably
 *          it gates the timing section -- confirm
 * bits     random inputs are reduced modulo 2 << bits and offset by
 *          -(1 << bits)
 *
 * The return value is OR-ed into the exit status by main().
 * NOTE(review): the return statements, the switch on test and the calls
 * through ref and dct->func are not visible in this listing.
 */
203 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
205 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
208 int64_t err2, ti, ti1, it1, err_sum = 0;
209 int64_t sysErr[64], sysErrMax = 0;
211 int blockSumErrMax = 0, blockSumErr;
213 const int vals=1<<bits;
/* Fixed seed, so every run draws the same input sequence. */
217 av_lfg_init(&prng, 1);
221 for (i = 0; i < 64; i++)
/* Accuracy pass: NB_ITS generated blocks. */
223 for (it = 0; it < NB_ITS; it++) {
224 for (i = 0; i < 64; i++)
/*
 * Generator: every coefficient random.
 * NOTE(review): presumably test 0 of the usage text -- confirm.
 */
228 for (i = 0; i < 64; i++)
229 block1[i] = (av_lfg_get(&prng) % (2*vals)) -vals;
232 for (i = 0; i < 64; i++)
/*
 * Generator: 1 to 10 random values written at random positions.
 * NOTE(review): presumably test 1 (sparse) of the usage text -- confirm.
 */
237 int num = av_lfg_get(&prng) % 10 + 1;
238 for (i = 0; i < num; i++)
239 block1[av_lfg_get(&prng) % 64] = av_lfg_get(&prng) % (2*vals) -vals;
/*
 * Generator: random first coefficient over a range 8 times wider, and the
 * last coefficient set to the inverse of its lowest bit.
 * NOTE(review): presumably test 2 of the usage text -- confirm.
 */
243 block1[0] = av_lfg_get(&prng) % (16*vals) - (8*vals);
244 block1[63] = (block1[0] & 1) ^ 1;
/* Copy block1[] into block[] in the layout the transform expects. */
248 if (dct->format == MMX_PERM) {
249 for (i = 0; i < 64; i++)
250 block[idct_mmx_perm[i]] = block1[i];
251 } else if (dct->format == MMX_SIMPLE_PERM) {
252 for (i = 0; i < 64; i++)
253 block[idct_simple_mmx_perm[i]] = block1[i];
254 } else if (dct->format == SSE2_PERM) {
/* Row bits kept, column reordered through idct_sse2_row_perm[]. */
255 for (i = 0; i < 64; i++)
256 block[(i & 0x38) | idct_sse2_row_perm[i & 7]] = block1[i];
257 } else if (dct->format == PARTTRANS_PERM) {
/* Bits 2 and 5 kept; bits 0-1 and bits 3-4 of the index trade places. */
258 for (i = 0; i < 64; i++)
259 block[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = block1[i];
261 for (i = 0; i < 64; i++)
262 block[i] = block1[i];
/*
 * SCALE_PERM output is rescaled per coefficient in fixed point using
 * ff_aanscales[] before it is compared.
 */
268 if (dct->format == SCALE_PERM) {
269 for (i = 0; i < 64; i++) {
270 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
271 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/*
 * Per-coefficient difference between block[] and block1[].
 * NOTE(review): block1[] presumably holds the reference output by now; the
 * call through ref is not visible in this listing -- confirm.
 */
278 for (i = 0; i < 64; i++) {
279 int err = block[i] - block1[i];
/* Signed error accumulated per coefficient position across all blocks. */
285 sysErr[i] += block[i] - block1[i];
/* Largest output magnitude seen so far. */
287 if (abs(block[i]) > maxout)
288 maxout = abs(block[i]);
/* Largest per-block error sum seen so far. */
290 if (blockSumErrMax < blockSumErr)
291 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated error over the 64 positions. */
293 for (i = 0; i < 64; i++)
294 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
/* Dump the accumulated error of every position. */
296 for (i = 0; i < 64; i++) {
299 printf("%7d ", (int) sysErr[i]);
/* Averages of err2 and err_sum over NB_ITS blocks of 64 coefficients. */
303 omse = (double) err2 / NB_ITS / 64;
304 ome = (double) err_sum / NB_ITS / 64;
/*
 * Accuracy limits are only applied to inverse transforms.
 * NOTE(review): the thresholds resemble the IEEE 1180 IDCT limits -- confirm.
 */
306 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
308 printf("%s %s: ppe=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
309 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
310 omse, ome, (double) sysErrMax / NB_ITS,
311 maxout, blockSumErrMax);
/* Entries flagged nonspec are exempt from the limit check above. */
313 if (spec_err && !dct->nonspec)
/* Timing section: build one input block with the same kinds of generator. */
320 for (i = 0; i < 64; i++)
325 for (i = 0; i < 64; i++)
326 block1[i] = av_lfg_get(&prng) % (2*vals) -vals;
329 for (i = 0; i < 64; i++)
335 block1[0] = av_lfg_get(&prng) % (2*vals) -vals;
336 block1[1] = av_lfg_get(&prng) % (2*vals) -vals;
337 block1[2] = av_lfg_get(&prng) % (2*vals) -vals;
338 block1[3] = av_lfg_get(&prng) % (2*vals) -vals;
/*
 * NOTE(review): unlike the accuracy pass, only MMX_PERM and MMX_SIMPLE_PERM
 * are handled here, and the timing loop below overwrites block[] with an
 * unpermuted copy of block1[] on every iteration anyway.
 */
342 if (dct->format == MMX_PERM) {
343 for (i = 0; i < 64; i++)
344 block[idct_mmx_perm[i]] = block1[i];
345 } else if (dct->format == MMX_SIMPLE_PERM) {
346 for (i = 0; i < 64; i++)
347 block[idct_simple_mmx_perm[i]] = block1[i];
349 for (i = 0; i < 64; i++)
350 block[i] = block1[i];
/* One batch of NB_ITS_SPEED iterations, each starting from a fresh copy. */
356 for (it = 0; it < NB_ITS_SPEED; it++) {
357 for (i = 0; i < 64; i++)
358 block[i] = block1[i];
/* Repeat batches until at least one second (in microseconds) has elapsed. */
362 ti1 = gettime() - ti;
363 } while (ti1 < 1000000);
/* it1 calls in ti1 microseconds, reported as thousands of calls per second. */
366 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
367 (double) it1 * 1000.0 / (double) ti1);
/*
 * 64-byte output pictures used by idct248_error() with a line size of 8:
 * img_dest receives the output of the implementation under test and
 * img_dest1 the output of idct248_ref().
 */
372 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
373 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/*
 * Double-precision reference used by idct248_error().
 *
 * dest      output picture; pixel (i, j) is written to dest[i * linesize + j]
 * linesize  distance in bytes between output rows
 * block     64 input coefficients, read as 8 rows of 8
 *
 * Steps visible here: sum/difference of each pair of adjacent rows, an
 * 8-point inverse transform along every row, a 4-point inverse transform
 * down every column applied separately to the even and the odd rows, then
 * clamping, rounding and storing.
 * NOTE(review): the one-time guard around the table setup, the value given
 * to s before the row butterfly and the lower clamp are not visible in this
 * listing.
 */
375 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine tables kept across calls. */
378 static double c8[8][8];
379 static double c4[4][4];
380 double block1[64], block2[64], block3[64];
/* 8-point DCT-II basis: scale sqrt(1/8) for row 0 and sqrt(1/4) otherwise. */
387 for (i = 0; i < 8; i++) {
389 for (j = 0; j < 8; j++) {
390 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
391 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
/* Running sum of squares of the row; no visible line reads it. */
392 sum += c8[i][j] * c8[i][j];
/* 4-point DCT-II basis: scale sqrt(1/4) for row 0 and sqrt(1/2) otherwise. */
396 for (i = 0; i < 4; i++) {
398 for (j = 0; j < 4; j++) {
399 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
400 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
401 sum += c4[i][j] * c4[i][j];
/* Butterfly: rows 2i and 2i+1 become their scaled sum and difference. */
408 for (i = 0; i < 4; i++) {
409 for (j = 0; j < 8; j++) {
410 block1[8 * (2 * i) + j] =
411 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
412 block1[8 * (2 * i + 1) + j] =
413 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point inverse transform along each row. */
418 for (i = 0; i < 8; i++) {
419 for (j = 0; j < 8; j++) {
421 for (k = 0; k < 8; k++)
422 sum += c8[k][j] * block1[8 * i + k];
423 block2[8 * i + j] = sum;
/* 4-point inverse transform down column i: even rows first, then odd rows. */
428 for (i = 0; i < 8; i++) {
429 for (j = 0; j < 4; j++) {
432 for (k = 0; k < 4; k++)
433 sum += c4[k][j] * block2[8 * (2 * k) + i];
434 block3[8 * (2 * j) + i] = sum;
438 for (k = 0; k < 4; k++)
439 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
440 block3[8 * (2 * j + 1) + i] = sum;
444 /* clamp and store the result */
445 for (i = 0; i < 8; i++) {
446 for (j = 0; j < 8; j++) {
447 v = block3[8 * i + j];
449 else if (v > 255) v = 255;
/* Round to nearest with rint() before narrowing to a byte. */
450 dest[i * linesize + j] = (int) rint(v);
/*
 * Compare one idct248 put routine against idct248_ref() and time it.
 *
 * name         label printed in the result lines
 * idct248_put  routine under test; called as idct248_put(dest, 8, block)
 *
 * NOTE(review): the tail of the parameter list is not visible in this
 * listing; main() passes a third argument named speed.
 */
455 static void idct248_error(const char *name,
456 void (*idct248_put)(uint8_t *dest, int line_size,
/*
 * NOTE(review): ti and ti1 are plain int here although gettime() returns
 * int64_t; dct_error() declares the same timing variables as int64_t.
 */
460 int it, i, it1, ti, ti1, err_max, v;
/* Fixed seed, so every run draws the same input sequence. */
463 av_lfg_init(&prng, 1);
465 /* just one test to see if code is correct (precision is less
468 for (it = 0; it < NB_ITS; it++) {
469 /* XXX: use forward transform to generate values */
470 for (i = 0; i < 64; i++)
471 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* Reference output goes to img_dest1 from a fresh copy of the input. */
474 for (i = 0; i < 64; i++)
475 block[i] = block1[i];
476 idct248_ref(img_dest1, 8, block);
/* Output of the routine under test goes to img_dest from another copy. */
478 for (i = 0; i < 64; i++)
479 block[i] = block1[i];
480 idct248_put(img_dest, 8, block);
/* Absolute per-pixel difference between the two pictures. */
482 for (i = 0; i < 64; i++) {
483 v = abs((int) img_dest[i] - (int) img_dest1[i]);
485 printf("%d %d\n", img_dest[i], img_dest1[i]);
494 printf(" %3d", img_dest1[i*8+j]);
503 printf(" %3d", img_dest[i*8+j]);
/* The condition is the constant 1, so the label is always IDCT248. */
509 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing: batches of NB_ITS_SPEED calls, each on a fresh copy of block1. */
517 for (it = 0; it < NB_ITS_SPEED; it++) {
518 for (i = 0; i < 64; i++)
519 block[i] = block1[i];
520 idct248_put(img_dest, 8, block);
/* Repeat batches until at least one second (in microseconds) has elapsed. */
523 ti1 = gettime() - ti;
524 } while (ti1 < 1000000);
/* it1 calls in ti1 microseconds, reported as thousands of calls per second. */
527 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
528 (double) it1 * 1000.0 / (double) ti1);
/*
 * Print the command-line usage summary to stdout.
 * NOTE(review): main() passes the option string ih4t to getopt(), but only
 * -i and -4 are described in the text visible here; the end of the message
 * is not visible in this listing -- confirm that -t and -h are covered.
 */
531 static void help(void)
533 printf("dct-test [-i] [<test-number>]\n"
534 "test-number 0 -> test with random matrixes\n"
535 " 1 -> test with random sparse matrixes\n"
536 " 2 -> do 3. test from mpeg4 std\n"
537 "-i test IDCT implementations\n"
538 "-4 test IDCT248 implementations\n"
/*
 * Program entry point: builds cropTbl[], parses the options and positional
 * arguments, then either runs the idct248 test or runs dct_error() on every
 * usable entry of the selected table.
 * NOTE(review): the option switch, the declarations of most locals and the
 * final return are not visible in this listing.
 */
542 int main(int argc, char **argv)
544 int test_idct = 0, test_248_dct = 0;
/* CPU capabilities, used below to skip entries the host cannot run. */
551 cpu_flags = av_get_cpu_flags();
/* Middle 256 entries of cropTbl[] map to themselves. */
556 for (i = 0; i < 256; i++)
557 cropTbl[i + MAX_NEG_CROP] = i;
/* The MAX_NEG_CROP entries above the middle range are set to 255. */
558 for (i = 0; i < MAX_NEG_CROP; i++) {
560 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* Accepted option letters: i, h, 4 and t, none taking an argument. */
564 c = getopt(argc, argv, "ih4t");
/*
 * First positional argument: test number; optional second: bits.
 * NOTE(review): atoi() gives no error indication, so a non-numeric argument
 * silently becomes 0.
 */
585 test = atoi(argv[optind]);
586 if(optind+1 < argc) bits= atoi(argv[optind+1]);
588 printf("ffmpeg DCT/IDCT test\n");
591 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
/* Pick the inverse or forward table and walk it up to the null name. */
593 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
594 for (i = 0; algos[i].name; i++)
/* Run an entry only if every bit of its mm_support is set in cpu_flags. */
595 if (!(~cpu_flags & algos[i].mm_support)) {
/* Any non-zero result from dct_error() is kept in err. */
596 err |= dct_error(&algos[i], test, test_idct, speed, bits);