2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
43 #include "simple_idct.h"
44 #include "aandcttab.h"
47 #include "x86/idct_xvid.h"
/*
 * Local prototypes for platform-specific transform routines. Each one is
 * referenced by an entry of fdct_tab or idct_tab further down, and each takes
 * a single pointer to int16_t coefficients, matching the func member used by
 * those tables.
 * NOTE(review): the per-architecture #if guards that presumably separate
 * these groups are not visible in this excerpt; confirm against the full file.
 */
52 void ff_mmx_idct(int16_t *data);
53 void ff_mmxext_idct(int16_t *data);
56 void ff_bfin_idct(int16_t *block);
57 void ff_bfin_fdct(int16_t *block);
60 void ff_fdct_altivec(int16_t *block);
63 void ff_j_rev_dct_arm(int16_t *data);
64 void ff_simple_idct_arm(int16_t *data);
65 void ff_simple_idct_armv5te(int16_t *data);
66 void ff_simple_idct_armv6(int16_t *data);
67 void ff_simple_idct_neon(int16_t *data);
69 void ff_simple_idct_axp(int16_t *data);
/* Transform under test; it is handed a pointer to int16_t coefficients. */
73 void (*func)(int16_t *block);
/*
 * Coefficient layout tag. dct_error() passes it to permute() to reorder the
 * block, and when it equals SCALE_PERM dct_error() additionally rescales the
 * block using ff_aanscales.
 * NOTE(review): the struct header and its other members (name, mm_support
 * and nonspec are used elsewhere in this file) are not visible here.
 */
74 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
75 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
/*
 * Forward DCT implementations to test. main() selects this table when the
 * IDCT test flag is not set and walks it until it reaches an entry whose
 * name is null/zero.
 * Each initializer lists a display name, the function and its format tag;
 * the optional fourth value is presumably the mm_support CPU-flag mask that
 * main() compares with av_get_cpu_flags() (member order not visible here;
 * confirm against the struct definition).
 * NOTE(review): the terminating entry and any #if guards around the
 * architecture-specific rows are not visible in this excerpt.
 */
82 static const struct algo fdct_tab[] = {
83 { "REF-DBL", ff_ref_fdct, NO_PERM },
84 { "FAAN", ff_faandct, NO_PERM },
/* The only row tagged SCALE_PERM, i.e. the one dct_error() rescales. */
85 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
86 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
89 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
90 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT },
91 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
95 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
99 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/*
 * Adapter that lets the four-argument ProRes SSE2 IDCT be stored in
 * idct_tab, whose entries take only a single int16_t pointer.
 * NOTE(review): this guard also requires HAVE_MMX, whereas the idct_tab row
 * that uses the wrapper is guarded by ARCH_X86_64 && HAVE_YASM only; an
 * enclosing guard may exist outside this excerpt. Confirm they agree.
 */
105 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
106 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
107 int16_t *block, int16_t *qmat);
/*
 * dst serves as the output buffer of the wrapped call.
 * NOTE(review): dst is int16_t * here but the prototype above expects
 * uint16_t *; confirm the signedness mismatch is intentional.
 */
109 static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst){
/* 16-byte aligned static scratch buffers: quantisation matrix and input copy. */
110 DECLARE_ALIGNED(16, static int16_t, qmat)[64];
111 DECLARE_ALIGNED(16, static int16_t, tmp)[64];
/*
 * NOTE(review): the statements that fill qmat and tmp are not visible in
 * this excerpt. The linesize of 16 is presumably 8 samples of 2 bytes each;
 * confirm.
 */
118 ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
/*
 * Inverse DCT implementations to test. main() selects this table when the
 * IDCT test flag is set and walks it until an entry with a null/zero name.
 * Each initializer lists a display name, the function, the format tag passed
 * to permute(), and optionally a CPU-flag mask. A trailing 1 is presumably
 * the nonspec member, which dct_error() consults when the accuracy limits
 * are exceeded (member order not visible here; confirm).
 * NOTE(review): the terminating entry and most #if guards around the
 * architecture-specific rows are not visible in this excerpt.
 */
122 static const struct algo idct_tab[] = {
123 { "FAANI", ff_faanidct, NO_PERM },
124 { "REF-DBL", ff_ref_idct, NO_PERM },
125 { "INT", ff_j_rev_dct, MMX_PERM },
126 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
130 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
/*
 * NOTE(review): this row spells the flag AV_CPU_FLAG_MMX2 while the other
 * MMXEXT rows in this file use AV_CPU_FLAG_MMXEXT; consider unifying after
 * checking libavutil/cpu.h.
 */
131 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
133 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
134 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
135 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
136 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
/* ProRes row goes through the local one-argument wrapper defined above. */
137 #if ARCH_X86_64 && HAVE_YASM
138 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
143 { "BFINidct", ff_bfin_idct, NO_PERM },
147 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
148 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
151 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
154 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
157 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
161 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/*
 * Fixed-point shift used by dct_error() when it rescales the output of a
 * SCALE_PERM transform with ff_aanscales.
 */
167 #define AANSCALE_BITS 12
/* Number of transform calls in one timing batch of the speed loops. */
170 #define NB_ITS_SPEED 50000
/*
 * Destination index for each source coefficient under MMX_PERM.
 * Filled at run time by idct_mmx_init() and read by permute().
 */
172 static short idct_mmx_perm[64];

/*
 * Destination index for each source coefficient under MMX_SIMPLE_PERM:
 * permute() stores src[i] at dst[idct_simple_mmx_perm[i]].
 */
174 static short idct_simple_mmx_perm[64] = {
175 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
176 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
177 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
178 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
179 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
180 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
181 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
182 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/*
 * Reordering of the low three index bits used by permute() for SSE2_PERM;
 * the upper three index bits are kept unchanged there.
 */
185 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/*
 * Fill idct_mmx_perm for all 64 coefficient indices.
 * Bits 3-5 of the index are kept; within the low three bits, bits 1-2 move
 * down by one position and bit 0 moves up to bit 2 (a rotate-right by one
 * of the low three bits).
 * NOTE(review): the declaration of i and the function braces are not
 * visible in this excerpt.
 */
187 static void idct_mmx_init(void)
191 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
192 for (i = 0; i < 64; i++) {
193 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/*
 * Shared 64-coefficient work buffers used by dct_error() and idct248_error().
 * block is declared with 16-byte alignment, block1 with 8-byte alignment.
 */
197 DECLARE_ALIGNED(16, static int16_t, block)[64];
198 DECLARE_ALIGNED(8, static int16_t, block1)[64];
/*
 * Fill a 64-coefficient block with test input drawn from prng.
 *
 * block    buffer to fill; it is zeroed first
 * test     selects the fill pattern (the selecting statements are not
 *          visible in this excerpt; presumably the test numbers listed
 *          by help(), to be confirmed)
 * is_idct  nonzero when the block will feed an inverse transform (its use
 *          is on lines not visible here)
 * prng     random generator state, advanced by every av_lfg_get() call
 * vals     magnitude bound; random values are reduced modulo a multiple
 *          of vals and then shifted down to be centred on zero
 */
200 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
204 memset(block, 0, 64 * sizeof(*block));
/* Dense pattern: every coefficient is (random % (2*vals)) - vals. */
208 for (i = 0; i < 64; i++)
209 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
/* NOTE(review): the body of the next loop is not visible in this excerpt. */
212 for (i = 0; i < 64; i++)
/* Sparse pattern: between 1 and 10 writes to random positions (positions may repeat). */
217 j = av_lfg_get(prng) % 10 + 1;
218 for (i = 0; i < j; i++) {
219 int idx = av_lfg_get(prng) % 64;
220 block[idx] = av_lfg_get(prng) % (2*vals) -vals;
/*
 * Two-coefficient pattern: a random first coefficient with an 8x wider
 * range, and the last coefficient set to the inverse of its lowest bit.
 */
224 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
225 block[63] = (block[0] & 1) ^ 1;
/*
 * Copy the 64 coefficients of src into dst, reordered for the layout that
 * the transform under test expects.
 *
 * dst   output block; each branch writes dst[f(i)] = src[i] for i = 0..63
 * src   input block
 * perm  one of the formattag values; it selects the index mapping f
 */
230 static void permute(int16_t dst[64], const int16_t src[64], int perm)
/* Mapping taken from the table built by idct_mmx_init(). */
234 if (perm == MMX_PERM) {
235 for (i = 0; i < 64; i++)
236 dst[idct_mmx_perm[i]] = src[i];
/* Mapping taken from the constant idct_simple_mmx_perm table. */
237 } else if (perm == MMX_SIMPLE_PERM) {
238 for (i = 0; i < 64; i++)
239 dst[idct_simple_mmx_perm[i]] = src[i];
/* Keep bits 3-5 of the index, remap the low three bits via idct_sse2_row_perm. */
240 } else if (perm == SSE2_PERM) {
241 for (i = 0; i < 64; i++)
242 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/* Keep bits 2 and 5; exchange bits 0-1 with bits 3-4 of the index. */
243 } else if (perm == PARTTRANS_PERM) {
244 for (i = 0; i < 64; i++)
245 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* Exchange the low three index bits with the high three (8x8 transpose). */
246 } else if (perm == TRANSPOSE_PERM) {
247 for (i = 0; i < 64; i++)
248 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
/*
 * Remaining formats.
 * NOTE(review): the loop body is not visible in this excerpt; presumably a
 * plain copy. Confirm against the full file.
 */
250 for (i = 0; i < 64; i++)
/*
 * Measure the accuracy, and optionally the speed, of one table entry.
 *
 * dct      entry to test (its name, format and nonspec members are read here)
 * test     input pattern selector forwarded to init_block()
 * is_idct  nonzero selects ff_ref_idct as reference and "IDCT" in the
 *          output; zero selects ff_ref_fdct and "DCT"
 * speed    speed-test switch (its use is on lines not visible here)
 * bits     input magnitude: values are generated with vals = 1 << bits
 *
 * Returns an int that main() ORs into its error status.
 * NOTE(review): many statements of this function (the calls to the
 * transform and to ref, the accumulator resets, the return statements) are
 * not visible in this excerpt, so the notes below cover visible lines only.
 */
255 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
257 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
260 int64_t err2, ti, ti1, it1, err_sum = 0;
/* Per-coefficient signed error sums and the largest absolute one. */
261 int64_t sysErr[64], sysErrMax = 0;
263 int blockSumErrMax = 0, blockSumErr;
265 const int vals=1<<bits;
/* Fixed seed, so every run sees the same input sequence. */
269 av_lfg_init(&prng, 1);
273 for (i = 0; i < 64; i++)
/* Accuracy pass: NB_ITS random blocks. */
275 for (it = 0; it < NB_ITS; it++) {
/* block1 holds the input in natural order, block the reordered copy. */
276 init_block(block1, test, is_idct, &prng, vals);
277 permute(block, block1, dct->format);
/* Rescale SCALE_PERM output with ff_aanscales in AANSCALE_BITS fixed point. */
282 if (dct->format == SCALE_PERM) {
283 for (i = 0; i < 64; i++) {
284 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
285 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/*
 * Per-coefficient difference between block and block1.
 * NOTE(review): by this point block1 presumably holds the reference
 * output; the ref() call is on a line not visible here.
 */
292 for (i = 0; i < 64; i++) {
293 int err = block[i] - block1[i];
299 sysErr[i] += block[i] - block1[i];
/* Track the largest output magnitude seen. */
301 if (abs(block[i]) > maxout)
302 maxout = abs(block[i]);
/* Track the largest per-block error sum. */
304 if (blockSumErrMax < blockSumErr)
305 blockSumErrMax = blockSumErr;
307 for (i = 0; i < 64; i++)
308 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
/* Print the 64 per-coefficient error sums. */
310 for (i = 0; i < 64; i++) {
313 printf("%7d ", (int) sysErr[i]);
/* Averages over all NB_ITS * 64 coefficients. */
317 omse = (double) err2 / NB_ITS / 64;
318 ome = (double) err_sum / NB_ITS / 64;
/* Accuracy limits are enforced for inverse transforms only. */
320 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
322 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
323 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
324 omse, ome, (double) sysErrMax / NB_ITS,
325 maxout, blockSumErrMax);
/* Entries with nonspec set are exempt from the limits (the action taken is not visible here). */
327 if (spec_err && !dct->nonspec)
/* Speed pass: here block is the natural-order input and block1 the reordered copy. */
335 init_block(block, test, is_idct, &prng, vals);
336 permute(block1, block, dct->format);
/* Each batch restores the input before every call. */
341 for (it = 0; it < NB_ITS_SPEED; it++) {
342 memcpy(block, block1, sizeof(block));
/*
 * Repeat batches until the elapsed time reaches 1000000 av_gettime() units
 * (presumably microseconds, i.e. about one second; confirm in libavutil).
 */
347 ti1 = av_gettime() - ti;
348 } while (ti1 < 1000000);
350 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
351 (double) it1 * 1000.0 / (double) ti1);
/*
 * 8x8 output pictures used by idct248_error(): img_dest receives the output
 * of the implementation under test, img_dest1 the output of idct248_ref().
 */
356 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
357 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/*
 * Double-precision reference for the 2-4-8 inverse transform.
 *
 * dest      output picture; sample (i, j) is written to dest[i * linesize + j]
 * linesize  distance in bytes between consecutive output rows
 * block     64 input coefficients, read as 8 rows of 8
 *
 * NOTE(review): several lines (local declarations, the one-time init guard
 * if any, the assignment of s before the butterfly, the sum resets) are not
 * visible in this excerpt.
 */
359 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine basis tables kept in static storage across calls. */
362 static double c8[8][8];
363 static double c4[4][4];
364 double block1[64], block2[64], block3[64];
/* 8-point cosine basis: row 0 scaled by sqrt(1/8), the other rows by sqrt(1/4). */
371 for (i = 0; i < 8; i++) {
373 for (j = 0; j < 8; j++) {
374 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
375 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
376 sum += c8[i][j] * c8[i][j];
/* 4-point cosine basis: row 0 scaled by sqrt(1/4), the other rows by sqrt(1/2). */
380 for (i = 0; i < 4; i++) {
382 for (j = 0; j < 4; j++) {
383 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
384 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
385 sum += c4[i][j] * c4[i][j];
/*
 * Butterfly on row pairs: row 2i becomes the scaled sum of rows 2i and
 * 2i+1, row 2i+1 their scaled difference.
 */
392 for (i = 0; i < 4; i++) {
393 for (j = 0; j < 8; j++) {
394 block1[8 * (2 * i) + j] =
395 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
396 block1[8 * (2 * i + 1) + j] =
397 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point inverse transform along each row. */
402 for (i = 0; i < 8; i++) {
403 for (j = 0; j < 8; j++) {
405 for (k = 0; k < 8; k++)
406 sum += c8[k][j] * block1[8 * i + k];
407 block2[8 * i + j] = sum;
/*
 * 4-point inverse transform along each column, done separately for the
 * even rows and for the odd rows.
 */
412 for (i = 0; i < 8; i++) {
413 for (j = 0; j < 4; j++) {
/* Even rows 0, 2, 4, 6. */
416 for (k = 0; k < 4; k++)
417 sum += c4[k][j] * block2[8 * (2 * k) + i];
418 block3[8 * (2 * j) + i] = sum;
/* Odd rows 1, 3, 5, 7. */
422 for (k = 0; k < 4; k++)
423 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
424 block3[8 * (2 * j + 1) + i] = sum;
428 /* clamp and store the result */
429 for (i = 0; i < 8; i++) {
430 for (j = 0; j < 8; j++) {
431 v = block3[8 * i + j];
433 else if (v > 255) v = 255;
/* Round to nearest with rint() before storing as a byte. */
434 dest[i * linesize + j] = (int) rint(v);
/*
 * Compare an IDCT248 implementation with idct248_ref() and time it.
 *
 * name         label printed in the result lines
 * idct248_put  implementation under test; called with an output buffer, a
 *              line size of 8 and the coefficient block
 *
 * The labels printed by this function are always "IDCT248" because the
 * selecting condition is the constant 1.
 * NOTE(review): ti and ti1 are plain int here, while dct_error() uses
 * int64_t for the same timing values; av_gettime() presumably returns a
 * 64-bit time, so these assignments may truncate. Confirm in libavutil and
 * consider widening.
 * NOTE(review): the rest of the parameter list and several statements are
 * not visible in this excerpt.
 */
439 static void idct248_error(const char *name,
440 void (*idct248_put)(uint8_t *dest, int line_size,
444 int it, i, it1, ti, ti1, err_max, v;
447 av_lfg_init(&prng, 1);
449 /* just one test to see if code is correct (precision is less
452 for (it = 0; it < NB_ITS; it++) {
453 /* XXX: use forward transform to generate values */
/* Random input coefficients in the range -128..127. */
454 for (i = 0; i < 64; i++)
455 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* Reference output; block is refilled from block1 before each transform. */
458 for (i = 0; i < 64; i++)
459 block[i] = block1[i];
460 idct248_ref(img_dest1, 8, block);
/* Output of the implementation under test, from the same input. */
462 for (i = 0; i < 64; i++)
463 block[i] = block1[i];
464 idct248_put(img_dest, 8, block);
/* Absolute per-sample difference between the two outputs. */
466 for (i = 0; i < 64; i++) {
467 v = abs((int) img_dest[i] - (int) img_dest1[i]);
469 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* Dumps of the reference picture and of the tested picture. */
478 printf(" %3d", img_dest1[i*8+j]);
487 printf(" %3d", img_dest[i*8+j]);
493 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing batches: restore the input, then call the implementation. */
501 for (it = 0; it < NB_ITS_SPEED; it++) {
502 for (i = 0; i < 64; i++)
503 block[i] = block1[i];
504 idct248_put(img_dest, 8, block);
/* Repeat batches until the elapsed time reaches 1000000 av_gettime() units. */
508 ti1 = av_gettime() - ti;
509 } while (ti1 < 1000000);
511 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
512 (double) it1 * 1000.0 / (double) ti1);
/*
 * Print the command-line usage text to standard output.
 * NOTE(review): main() passes "ih4t" to getopt(), but the visible part of
 * this text describes only -i and -4; the end of the printf call is not
 * visible in this excerpt, so confirm that -t and -h are documented there.
 */
515 static void help(void)
517 printf("dct-test [-i] [<test-number>] [<bits>]\n"
518 "test-number 0 -> test with random matrixes\n"
519 " 1 -> test with random sparse matrixes\n"
520 " 2 -> do 3. test from mpeg4 std\n"
521 "bits Number of time domain bits to use, 8 is default\n"
522 "-i test IDCT implementations\n"
523 "-4 test IDCT248 implementations\n"
528 #include "compat/getopt.c"
531 int main(int argc, char **argv)
533 int test_idct = 0, test_248_dct = 0;
540 cpu_flags = av_get_cpu_flags();
546 c = getopt(argc, argv, "ih4t");
567 test = atoi(argv[optind]);
568 if(optind+1 < argc) bits= atoi(argv[optind+1]);
570 printf("ffmpeg DCT/IDCT test\n");
573 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
575 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
576 for (i = 0; algos[i].name; i++)
577 if (!(~cpu_flags & algos[i].mm_support)) {
578 err |= dct_error(&algos[i], test, test_idct, speed, bits);