2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/* Prototypes for platform-specific transform routines that this test reaches
 * through the fdct_tab/idct_tab tables.  They are declared here directly
 * instead of being taken from a header; each takes a pointer to the block of
 * coefficients to transform. */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
/* NOTE(review): odivx_idct_c is not referenced by any table entry visible in
 * this file -- possibly a stale declaration; confirm before relying on it. */
51 void odivx_idct_c(short *block);
/* Routines used by the "BFINidct"/"BFINfdct" table entries. */
54 void ff_bfin_idct(DCTELEM *block);
55 void ff_bfin_fdct(DCTELEM *block);
/* Routine used by the "altivecfdct" table entry; no AltiVec IDCT is declared. */
58 void fdct_altivec(DCTELEM *block);
59 //void idct_altivec(DCTELEM *block);?? no routine
/* Routines used by the ARM table entries (INT-ARM, SIMPLE-ARM, ARMV5TE,
 * ARMV6, NEON). */
62 void ff_j_rev_dct_arm(DCTELEM *data);
63 void ff_simple_idct_arm(DCTELEM *data);
64 void ff_simple_idct_armv5te(DCTELEM *data);
65 void ff_simple_idct_armv6(DCTELEM *data);
66 void ff_simple_idct_neon(DCTELEM *data);
/* Routine used by the "SIMPLE-ALPHA" table entry. */
68 void ff_simple_idct_axp(DCTELEM *data);
/* Members of the per-algorithm descriptor used by fdct_tab/idct_tab.
 * NOTE(review): the opening of the struct and its remaining members are not
 * visible in this view; other code reads dct->name, dct->format,
 * dct->nonspec and algos[i].mm_support, so those members presumably sit
 * next to the ones below -- confirm against the full file. */
/* Transform entry point: receives a pointer to the coefficient block. */
72 void (*func)(DCTELEM *block);
/* Coefficient ordering/scaling tag for the routine; permute() selects the
 * input reordering from it and dct_error() rescales the output when it is
 * SCALE_PERM. */
73 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
74 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
/* FAAN_SCALE is the format tag given to the "FAAN" entry of fdct_tab.  It is
 * SCALE_PERM when FAAN_POSTSCALE is not defined; the second definition
 * (NO_PERM) is presumably the alternative branch.
 * NOTE(review): the directives separating and closing the two definitions
 * are not visible in this view. */
79 #ifndef FAAN_POSTSCALE
80 #define FAAN_SCALE SCALE_PERM
82 #define FAAN_SCALE NO_PERM
/* Forward-DCT implementations to test.  Each row gives the name printed in
 * the report, the function to call and the format tag it expects; SIMD rows
 * add an AV_CPU_FLAG_* value, presumably the mm_support mask that main()
 * checks against the detected CPU flags before running the entry.
 * main() walks the table until it meets an entry whose name is null.
 * NOTE(review): that terminating entry and any conditional-compilation
 * guards around the platform-specific rows are not visible in this view. */
87 static const struct algo fdct_tab[] = {
88 { "REF-DBL", ff_ref_fdct, NO_PERM },
89 { "FAAN", ff_faandct, FAAN_SCALE },
90 { "IJG-AAN-INT", fdct_ifast, SCALE_PERM },
91 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
94 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
95 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
96 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
100 { "altivecfdct", fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
104 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* ProRes 10-bit IDCT: takes a destination, a line size, the coefficient
 * block and a quantisation matrix. */
111 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
112 DCTELEM *block, int16_t *qmat);
/* Adapter that lets the four-argument ProRes IDCT be driven through the
 * single-pointer interface used by idct_tab: it calls the real routine with
 * dst as the output, a line size of 16, and local tmp/qmat arrays.
 * NOTE(review): the declaration of tmp and the loop that fills tmp and qmat
 * are not visible in this view.  The parameter is uint16_t * while the
 * table's function pointer takes DCTELEM * -- confirm the two are
 * layout-compatible. */
114 static void ff_prores_idct_put_10_sse2_wrap(uint16_t *dst){
115 int16_t qmat[64]; int i;
122 ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
/* Inverse-DCT implementations to test.  Row layout: report name, function,
 * expected input format tag, then optionally an AV_CPU_FLAG_* value and a
 * trailing 1.  The CPU flag is presumably the mm_support mask tested in
 * main(); the trailing 1 is presumably the nonspec member, which makes
 * dct_error() tolerate a failed accuracy check -- confirm field order
 * against the struct definition.
 * NOTE(review): the null-name terminator and any conditional-compilation
 * guards around platform-specific rows are not visible in this view. */
126 static const struct algo idct_tab[] = {
127 { "FAANI", ff_faanidct, NO_PERM },
128 { "REF-DBL", ff_ref_idct, NO_PERM },
129 { "INT", j_rev_dct, MMX_PERM },
130 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
134 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
135 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
137 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
138 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
139 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
140 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
142 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
147 { "BFINidct", ff_bfin_idct, NO_PERM },
151 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
152 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
155 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
158 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
161 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
165 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* Number of fractional bits in the fixed-point rescaling that dct_error()
 * applies to the output of SCALE_PERM routines. */
171 #define AANSCALE_BITS 12
/* Clamping lookup table with MAX_NEG_CROP guard entries on each side of the
 * 256 in-range values; main() fills the middle with the identity mapping and
 * the upper guard region with 255. */
173 static uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/**
 * Return the current wall-clock time in microseconds.
 *
 * The value is read with gettimeofday(), so it is a count of microseconds
 * since the Unix epoch.  The callers only ever use the difference between
 * two readings to time a batch of transforms.
 *
 * The seconds field is widened to 64 bits before it is scaled so that the
 * multiplication cannot overflow a narrower time_t.
 */
static int64_t gettime(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);
    return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
}
/* Number of transform calls made per timing batch in the speed loops. */
#define NB_ITS_SPEED 50000

/* Destination index for each source coefficient under MMX_PERM.
 * Filled in by idct_mmx_init() and consumed by permute(). */
static short idct_mmx_perm[64];

/* Destination index for each source coefficient under MMX_SIMPLE_PERM
 * (consumed by permute()).  The 64 entries are a permutation of 0..63. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Column reordering applied inside each row of eight under SSE2_PERM:
 * source column c of a row is stored at column idct_sse2_row_perm[c]. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

/**
 * Build idct_mmx_perm.
 *
 * For every index i the row bits (i & 0x38) are kept and the three column
 * bits are rotated: bits 1-2 move down to bits 0-1 and bit 0 moves up to
 * bit 2.  Within a row this yields the column order 0, 4, 1, 5, 2, 6, 3, 7.
 */
static void idct_mmx_init(void)
{
    int i;

    /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
    for (i = 0; i < 64; i++) {
        idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
    }
}
/* Shared 64-coefficient work buffers.
 * In the accuracy loop of dct_error(), block1 receives the generated input
 * and block receives its permuted copy; the two are later compared element
 * by element.  In the speed loops block is refilled from block1 before each
 * call.  The first macro argument is presumably the alignment in bytes
 * (16 for block, 8 for block1) -- confirm against DECLARE_ALIGNED. */
210 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
211 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/* Execute the x86 `emms` instruction when the detected CPU flags include
 * MMX; do nothing otherwise.
 * NOTE(review): any conditional-compilation guard around the inline asm is
 * not visible in this view -- confirm the asm is not built on non-x86
 * targets. */
213 static inline void mmx_emms(void)
216 if (cpu_flags & AV_CPU_FLAG_MMX)
217 __asm__ volatile ("emms\n\t");
/* Fill a 64-coefficient block with test input.
 *
 * block    buffer to fill (cleared to zero first)
 * test     selects which generator below is used
 * is_idct  nonzero when the block will feed an inverse transform
 * prng     pseudo-random generator state
 * vals     amplitude scale; dct_error() passes 1 << bits
 *
 * NOTE(review): the statements that choose between the generators, and the
 * way is_idct is used, are not visible in this view.  The help text lists
 * test 0 as random matrices, 1 as sparse random matrices and 2 as a test
 * from the MPEG-4 standard; the three generators below presumably map to
 * those in order -- confirm. */
221 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
225 memset(block, 0, 64 * sizeof(*block));
/* Dense input: every coefficient gets a pseudo-random value reduced modulo
 * 2*vals and shifted down by vals. */
229 for (i = 0; i < 64; i++)
230 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
/* NOTE(review): the body of the next loop is not visible in this view. */
233 for (i = 0; i < 64; i++)
/* Sparse input: between 1 and 10 writes to randomly chosen positions (the
 * same position may be hit more than once); everything else stays zero. */
238 j = av_lfg_get(prng) % 10 + 1;
239 for (i = 0; i < j; i++)
240 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
/* Two-coefficient input: a random first coefficient with an eight times
 * wider range, and a last coefficient of 0 or 1 chosen as the inverse of
 * the first coefficient's low bit. */
243 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
244 block[63] = (block[0] & 1) ^ 1;
/* Copy src into dst, reordering the 64 coefficients into the layout that a
 * routine with format tag `perm` expects.  In every visible branch the
 * destination index is computed from the source index i, so dst and src must
 * be different buffers. */
249 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
/* MMX_PERM: destination index comes from the table built by idct_mmx_init(). */
253 if (perm == MMX_PERM) {
254 for (i = 0; i < 64; i++)
255 dst[idct_mmx_perm[i]] = src[i];
/* MMX_SIMPLE_PERM: destination index comes from the fixed table. */
256 } else if (perm == MMX_SIMPLE_PERM) {
257 for (i = 0; i < 64; i++)
258 dst[idct_simple_mmx_perm[i]] = src[i];
/* SSE2_PERM: the row (i & 0x38) is kept, the column is remapped. */
259 } else if (perm == SSE2_PERM) {
260 for (i = 0; i < 64; i++)
261 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/* PARTTRANS_PERM: bits 2 and 5 of the index are kept, while the low two
 * column bits and the low two row bits trade places, i.e. each 4x4 quadrant
 * is transposed in place. */
262 } else if (perm == PARTTRANS_PERM) {
263 for (i = 0; i < 64; i++)
264 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* TRANSPOSE_PERM: row and column indices are swapped (full 8x8 transpose). */
265 } else if (perm == TRANSPOSE_PERM) {
266 for (i = 0; i < 64; i++)
267 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
/* Remaining format tags.  NOTE(review): the body of this loop is not
 * visible in this view; presumably a plain copy -- confirm. */
269 for (i = 0; i < 64; i++)
/* Measure the accuracy, and optionally the speed, of one transform.
 *
 * dct      table entry describing the routine under test
 * test     input generator selector, forwarded to init_block()
 * is_idct  nonzero to compare against ff_ref_idct, zero for ff_ref_fdct
 * speed    requests the timing pass (the test guarding it is not visible)
 * bits     input amplitude is 1 << bits
 *
 * The return value is OR-ed into main()'s error flag.
 * NOTE(review): a large part of this function (local declarations, the calls
 * to the routine under test and to the reference, the error accumulation and
 * the return statements) is not visible in this view; the comments below
 * describe only what the visible lines establish. */
274 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
/* Reference transform matching the direction being tested. */
276 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
279 int64_t err2, ti, ti1, it1, err_sum = 0;
/* Per-coefficient signed error totals, and the largest absolute total. */
280 int64_t sysErr[64], sysErrMax = 0;
282 int blockSumErrMax = 0, blockSumErr;
284 const int vals=1<<bits;
/* Fixed seed, so every run sees the same input sequence. */
288 av_lfg_init(&prng, 1);
292 for (i = 0; i < 64; i++)
/* Accuracy pass: NB_ITS generated blocks. */
294 for (it = 0; it < NB_ITS; it++) {
/* block1 holds the generated input, block the copy reordered for dct->format. */
295 init_block(block1, test, is_idct, &prng, vals);
296 permute(block, block1, dct->format);
/* SCALE_PERM routines leave their output scaled; rescale each coefficient
 * with a factor derived from ff_aanscales[] in AANSCALE_BITS-bit fixed
 * point so it can be compared with the reference. */
301 if (dct->format == SCALE_PERM) {
302 for (i = 0; i < 64; i++) {
303 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
304 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Compare the two buffers coefficient by coefficient. */
311 for (i = 0; i < 64; i++) {
312 int err = block[i] - block1[i];
/* Signed accumulation exposes a systematic bias at position i. */
318 sysErr[i] += block[i] - block1[i];
/* Track the largest output magnitude seen. */
320 if (abs(block[i]) > maxout)
321 maxout = abs(block[i]);
/* Track the worst per-block error sum. */
323 if (blockSumErrMax < blockSumErr)
324 blockSumErrMax = blockSumErr;
326 for (i = 0; i < 64; i++)
327 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
/* Dump the per-coefficient error totals. */
329 for (i = 0; i < 64; i++) {
332 printf("%7d ", (int) sysErr[i]);
/* Overall mean square error and overall mean error, per coefficient. */
336 omse = (double) err2 / NB_ITS / 64;
337 ome = (double) err_sum / NB_ITS / 64;
/* Accuracy limits are enforced for inverse transforms only: peak error at
 * most 1, mean square error at most 0.02, |mean error| at most 0.0015. */
339 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
341 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
342 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
343 omse, ome, (double) sysErrMax / NB_ITS,
344 maxout, blockSumErrMax);
/* Entries flagged nonspec are exempt from the accuracy limits.
 * NOTE(review): the statement controlled by this test is not visible. */
346 if (spec_err && !dct->nonspec)
/* Timing pass: one generated block, kept in block1 in permuted form. */
354 init_block(block, test, is_idct, &prng, vals);
355 permute(block1, block, dct->format);
/* Batches of NB_ITS_SPEED calls; the input is restored before each call. */
360 for (it = 0; it < NB_ITS_SPEED; it++) {
361 memcpy(block, block1, sizeof(block));
/* Keep running batches until at least one second (gettime() counts
 * microseconds) has elapsed. */
365 ti1 = gettime() - ti;
366 } while (ti1 < 1000000);
/* it1 * 1000 / microseconds: presumably thousands of calls per second,
 * given that it1 counts the calls made. */
369 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
370 (double) it1 * 1000.0 / (double) ti1);
/* 8x8 output images for the IDCT248 test: img_dest receives the output of
 * the routine under test, img_dest1 the output of idct248_ref(); both are
 * written with a line size of 8. */
375 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
376 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Double-precision reference for the 2-4-8 inverse DCT.
 *
 * dest      output pixels, written as 8 rows of 8 bytes
 * linesize  distance in bytes between output rows
 * block     64 input coefficients, read as 8 rows of 8
 *
 * NOTE(review): several lines (local declarations, the guard around the
 * table setup, the value given to s before the butterfly, the resets of sum
 * and the lower clamp) are not visible in this view. */
378 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine basis tables; static, so they persist between calls. */
381 static double c8[8][8];
382 static double c4[4][4];
383 double block1[64], block2[64], block3[64];
/* 8-point basis: c8[i][j] = s * cos(pi * i * (j + 0.5) / 8), with
 * s = sqrt(1/8) for i == 0 and sqrt(1/4) otherwise. */
390 for (i = 0; i < 8; i++) {
392 for (j = 0; j < 8; j++) {
393 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
394 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
/* Accumulates the squared entries of basis row i. */
395 sum += c8[i][j] * c8[i][j];
/* 4-point basis, built the same way with s = sqrt(1/4) or sqrt(1/2). */
399 for (i = 0; i < 4; i++) {
401 for (j = 0; j < 4; j++) {
402 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
403 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
404 sum += c4[i][j] * c4[i][j];
/* Butterfly on each pair of adjacent rows (2i, 2i+1): the scaled sum goes to
 * the even row of block1, the scaled difference to the odd row. */
411 for (i = 0; i < 4; i++) {
412 for (j = 0; j < 8; j++) {
413 block1[8 * (2 * i) + j] =
414 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
415 block1[8 * (2 * i + 1) + j] =
416 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point inverse transform along every row: block1 -> block2. */
421 for (i = 0; i < 8; i++) {
422 for (j = 0; j < 8; j++) {
424 for (k = 0; k < 8; k++)
425 sum += c8[k][j] * block1[8 * i + k];
426 block2[8 * i + j] = sum;
/* 4-point inverse transform down every column i, done separately for the
 * even rows and the odd rows: block2 -> block3. */
431 for (i = 0; i < 8; i++) {
432 for (j = 0; j < 4; j++) {
/* Even input rows 2k produce even output row 2j. */
435 for (k = 0; k < 4; k++)
436 sum += c4[k][j] * block2[8 * (2 * k) + i];
437 block3[8 * (2 * j) + i] = sum;
/* Odd input rows 2k+1 produce odd output row 2j+1. */
441 for (k = 0; k < 4; k++)
442 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
443 block3[8 * (2 * j + 1) + i] = sum;
447 /* clamp and store the result */
448 for (i = 0; i < 8; i++) {
449 for (j = 0; j < 8; j++) {
450 v = block3[8 * i + j];
452 else if (v > 255) v = 255;
/* Round to the nearest integer before narrowing to a byte. */
453 dest[i * linesize + j] = (int) rint(v);
/* Compare an IDCT248 "put" routine with idct248_ref() on random input and
 * optionally time it.
 *
 * name         label printed in the report
 * idct248_put  routine under test; writes pixels to dest with the given
 *              line size
 *
 * NOTE(review): the end of the parameter list, some local declarations, the
 * error bookkeeping and the start of the timing loop are not visible in
 * this view.  main() passes a third argument (speed). */
458 static void idct248_error(const char *name,
459 void (*idct248_put)(uint8_t *dest, int line_size,
/* NOTE(review): ti and ti1 are declared int, but ti1 is assigned from an
 * expression involving gettime(), which returns int64_t microseconds; the
 * value is narrowed on assignment.  dct_error() declares its timers as
 * int64_t -- consider matching that here. */
463 int it, i, it1, ti, ti1, err_max, v;
466 av_lfg_init(&prng, 1);
468 /* just one test to see if code is correct (precision is less
471 for (it = 0; it < NB_ITS; it++) {
472 /* XXX: use forward transform to generate values */
/* Random input coefficients. */
473 for (i = 0; i < 64; i++)
474 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* Reference output; block is refilled from block1 before each call. */
477 for (i = 0; i < 64; i++)
478 block[i] = block1[i];
479 idct248_ref(img_dest1, 8, block);
/* Output of the routine under test, from the same input. */
481 for (i = 0; i < 64; i++)
482 block[i] = block1[i];
483 idct248_put(img_dest, 8, block);
/* Per-pixel absolute difference between the two outputs. */
485 for (i = 0; i < 64; i++) {
486 v = abs((int) img_dest[i] - (int) img_dest1[i]);
488 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* Dumps of the reference image and of the image under test. */
497 printf(" %3d", img_dest1[i*8+j]);
506 printf(" %3d", img_dest[i*8+j]);
512 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing: batches of NB_ITS_SPEED calls, input restored before each call. */
520 for (it = 0; it < NB_ITS_SPEED; it++) {
521 for (i = 0; i < 64; i++)
522 block[i] = block1[i];
523 idct248_put(img_dest, 8, block);
/* Repeat batches until at least 1000000 microseconds have elapsed. */
526 ti1 = gettime() - ti;
527 } while (ti1 < 1000000);
530 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
531 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage text.
 * NOTE(review): the end of the message is not visible in this view.
 * main() passes "ih4t" to getopt, so -h and -t are also accepted even
 * though the visible text does not describe them. */
534 static void help(void)
536 printf("dct-test [-i] [<test-number>] [<bits>]\n"
537 "test-number 0 -> test with random matrixes\n"
538 " 1 -> test with random sparse matrixes\n"
539 " 2 -> do 3. test from mpeg4 std\n"
540 "bits Number of time domain bits to use, 8 is default\n"
541 "-i test IDCT implementations\n"
542 "-4 test IDCT248 implementations\n"
/* Entry point: parse the options, then run either the IDCT248 test or every
 * applicable entry of the forward/inverse table.
 * NOTE(review): the option-handling loop, several local declarations and
 * the final return are not visible in this view. */
546 int main(int argc, char **argv)
548 int test_idct = 0, test_248_dct = 0;
/* CPU capabilities; used to skip entries the machine cannot run and by
 * mmx_emms(). */
555 cpu_flags = av_get_cpu_flags();
/* Clamp table: identity mapping for 0..255, offset by MAX_NEG_CROP... */
560 for (i = 0; i < 256; i++)
561 cropTbl[i + MAX_NEG_CROP] = i;
/* ...and 255 for every entry above that range. */
562 for (i = 0; i < MAX_NEG_CROP; i++) {
564 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* Accepted option letters: i, h, 4 and t, none taking an argument. */
568 c = getopt(argc, argv, "ih4t");
/* Positional arguments: test number, then optionally the bit depth. */
589 test = atoi(argv[optind]);
590 if(optind+1 < argc) bits= atoi(argv[optind+1]);
592 printf("ffmpeg DCT/IDCT test\n");
/* IDCT248 mode tests the C implementation only. */
595 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
/* Otherwise walk the selected table up to its null-name terminator. */
597 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
598 for (i = 0; algos[i].name; i++)
/* Run an entry only when every flag it requires is set in cpu_flags. */
599 if (!(~cpu_flags & algos[i].mm_support)) {
/* Accumulate failures across all entries. */
600 err |= dct_error(&algos[i], test, test_idct, speed, bits);