2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
43 #include "simple_idct.h"
44 #include "aandcttab.h"
47 #include "x86/idct_xvid.h"
/* Prototypes for transform routines whose definitions are not in this file.
 * Each takes a pointer to a block of int16_t values and is referenced from
 * the fdct_tab / idct_tab entries (bfin, altivec, arm*, neon, axp variants).
 * NOTE(review): the preprocessor guards that normally surround these groups
 * are not visible in this listing. */
53 void ff_bfin_idct(int16_t *block);
54 void ff_bfin_fdct(int16_t *block);
57 void ff_fdct_altivec(int16_t *block);
60 void ff_j_rev_dct_arm(int16_t *data);
61 void ff_simple_idct_arm(int16_t *data);
62 void ff_simple_idct_armv5te(int16_t *data);
63 void ff_simple_idct_armv6(int16_t *data);
64 void ff_simple_idct_neon(int16_t *data);
66 void ff_simple_idct_axp(int16_t *data);
/* Members of one transform-table entry.
 * NOTE(review): the line opening the declaration and the remaining members
 * (name, mm_support, nonspec are used elsewhere in the file) are not visible
 * in this listing. */
/* transform to test; receives the coefficient block */
70 void (*func)(int16_t *block);
/* coefficient layout the transform expects; consumed by permute(), and
 * SCALE_PERM additionally triggers output rescaling in dct_error() */
71 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
72 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
/* Forward-DCT implementations that main() runs when IDCT testing is not
 * requested. Each initializer lists the printed name, the transform function
 * and its input layout; SIMD entries add an AV_CPU_FLAG_* value (presumably
 * the mm_support member that main() checks against the detected CPU flags).
 * NOTE(review): most of the #if/#endif guards around the platform-specific
 * entries and the table terminator are not visible in this listing. */
79 static const struct algo fdct_tab[] = {
80 { "REF-DBL", ff_ref_fdct, NO_PERM },
81 { "FAAN", ff_faandct, NO_PERM },
/* the only entry whose output dct_error() rescales (SCALE_PERM) */
82 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
83 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
86 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
88 #if HAVE_MMXEXT_INLINE
89 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT },
92 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
96 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
100 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* Adapter that lets ff_prores_idct_put_10_sse2 (four arguments) be listed in
 * idct_tab, whose entries take a single int16_t block pointer. The caller's
 * block is passed as the destination with a line size of 16, while the
 * static tmp and qmat buffers supply the coefficients and the quantization
 * matrix.
 * NOTE(review): the statements that fill qmat and tmp before the call, and
 * the closing brace / #endif, are not visible in this listing. */
106 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
107 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
108 int16_t *block, int16_t *qmat);
110 static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst){
111 DECLARE_ALIGNED(16, static int16_t, qmat)[64];
112 DECLARE_ALIGNED(16, static int16_t, tmp)[64];
/* NOTE(review): dst is int16_t * here but the prototype takes uint16_t * */
119 ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
/* Inverse-DCT implementations that main() runs when IDCT testing is
 * requested. Each initializer lists the printed name, the transform function
 * and the coefficient layout that permute() must produce for it; SIMD
 * entries add an AV_CPU_FLAG_* requirement. A trailing 1 presumably sets the
 * nonspec member, which dct_error() consults before acting on a precision
 * failure.
 * NOTE(review): most of the #if/#endif guards around the platform-specific
 * entries and the table terminator are not visible in this listing. */
123 static const struct algo idct_tab[] = {
124 { "FAANI", ff_faanidct, NO_PERM },
125 { "REF-DBL", ff_ref_idct, NO_PERM },
126 { "INT", ff_j_rev_dct, MMX_PERM },
127 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
130 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
131 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
133 #if HAVE_MMXEXT_INLINE
134 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
137 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
138 #if ARCH_X86_64 && HAVE_YASM
139 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
144 { "BFINidct", ff_bfin_idct, NO_PERM },
148 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
149 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
152 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
155 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
158 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
162 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* fixed-point fraction bits used when dct_error() rescales the output of a
 * SCALE_PERM transform with ff_aanscales */
168 #define AANSCALE_BITS 12
/* number of transform calls per timing batch in the speed measurements */
171 #define NB_ITS_SPEED 50000
/*
 * Coefficient permutations expected by some of the SIMD IDCT variants.
 * permute() scatters its input through these tables, i.e. the coefficient at
 * natural index i is stored at the permuted index.
 */

/* MMX_PERM layout; filled at run time by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* MMX_SIMPLE_PERM layout. */
static const short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* SSE2_PERM layout: new column position for each column within a row. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

/**
 * Fill idct_mmx_perm.
 *
 * The mmx/mmxext idct uses a reordered input, so the scan table is patched:
 * the row bits (0x38) of each index are kept, while inside a row bit 0 moves
 * up to bit 2 and bits 1-2 move down by one position.
 */
static void idct_mmx_init(void)
{
    int i;

    for (i = 0; i < 64; i++)
        idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
}
/* Work buffers shared by dct_error() and idct248_error(): one holds the
 * data handed to the transform under test, the other the generated input /
 * reference data. */
198 DECLARE_ALIGNED(16, static int16_t, block)[64];
199 DECLARE_ALIGNED(8, static int16_t, block1)[64];
/* Fill block with test input drawn from prng.
 *
 * The block is zeroed first. The visible fragments then produce one of:
 *   - 64 random values derived from av_lfg_get() % (2*vals) - vals,
 *   - between 1 and 10 random values written at random positions,
 *   - a random block[0] with block[63] set to the inverted low bit of
 *     block[0].
 * NOTE(review): the statements that select between these fragments (by
 * `test`, see the usage text in help()) and the use of is_idct are not
 * visible in this listing. */
201 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
205 memset(block, 0, 64 * sizeof(*block));
/* dense random block */
209 for (i = 0; i < 64; i++)
210 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
213 for (i = 0; i < 64; i++)
/* sparse block: j entries (1..10); positions may repeat */
218 j = av_lfg_get(prng) % 10 + 1;
219 for (i = 0; i < j; i++) {
220 int idx = av_lfg_get(prng) % 64;
221 block[idx] = av_lfg_get(prng) % (2*vals) -vals;
/* only the first and last coefficients are set; block[63] is 1 when
 * block[0] is even and 0 when it is odd */
225 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
226 block[63] = (block[0] & 1) ^ 1;
231 static void permute(int16_t dst[64], const int16_t src[64], int perm)
235 if (perm == MMX_PERM) {
236 for (i = 0; i < 64; i++)
237 dst[idct_mmx_perm[i]] = src[i];
238 } else if (perm == MMX_SIMPLE_PERM) {
239 for (i = 0; i < 64; i++)
240 dst[idct_simple_mmx_perm[i]] = src[i];
241 } else if (perm == SSE2_PERM) {
242 for (i = 0; i < 64; i++)
243 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
244 } else if (perm == PARTTRANS_PERM) {
245 for (i = 0; i < 64; i++)
246 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
247 } else if (perm == TRANSPOSE_PERM) {
248 for (i = 0; i < 64; i++)
249 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
251 for (i = 0; i < 64; i++)
/* Measure one transform against the reference transform and print the
 * resulting error statistics; optionally time it as well.
 *
 * dct      table entry to test (name, format and nonspec are read here)
 * test     input pattern selector, forwarded to init_block()
 * is_idct  non-zero when an inverse transform is tested; selects the
 *          reference and enables the precision thresholds
 * speed    NOTE(review): presumably enables the timing part; the statement
 *          that tests it is not visible in this listing
 * bits     input magnitude: init_block() is called with vals = 1 << bits
 *
 * NOTE(review): many statements are missing from this listing, including
 * the calls to the tested and reference transforms, the per-block error
 * accumulation and the return statements. */
256 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
/* reference matching the direction under test */
258 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
261 int64_t err2, ti, ti1, it1, err_sum = 0;
262 int64_t sysErr[64], sysErrMax = 0;
264 int blockSumErrMax = 0, blockSumErr;
266 const int vals=1<<bits;
/* fixed seed: the generated input sequence is the same on every run */
270 av_lfg_init(&prng, 1);
274 for (i = 0; i < 64; i++)
276 for (it = 0; it < NB_ITS; it++) {
/* block1 keeps the natural-order input, block gets the layout the tested
 * transform expects */
277 init_block(block1, test, is_idct, &prng, vals);
278 permute(block, block1, dct->format);
/* SCALE_PERM results are rescaled per coefficient with ff_aanscales before
 * they are compared */
283 if (dct->format == SCALE_PERM) {
284 for (i = 0; i < 64; i++) {
285 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
286 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* per-coefficient difference between block and block1 */
293 for (i = 0; i < 64; i++) {
294 int err = block[i] - block1[i];
/* signed error accumulated per coefficient position across iterations */
300 sysErr[i] += block[i] - block1[i];
/* largest output magnitude seen */
302 if (abs(block[i]) > maxout)
303 maxout = abs(block[i]);
305 if (blockSumErrMax < blockSumErr)
306 blockSumErrMax = blockSumErr;
/* largest absolute accumulated error over all 64 positions */
308 for (i = 0; i < 64; i++)
309 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
311 for (i = 0; i < 64; i++) {
314 printf("%7d ", (int) sysErr[i]);
/* averages over all NB_ITS * 64 coefficients */
318 omse = (double) err2 / NB_ITS / 64;
319 ome = (double) err_sum / NB_ITS / 64;
/* precision limits are applied to inverse transforms only */
321 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
323 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
324 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
325 omse, ome, (double) sysErrMax / NB_ITS,
326 maxout, blockSumErrMax);
/* entries flagged nonspec are exempt from the precision limits */
328 if (spec_err && !dct->nonspec)
/* timing part: block1 holds the permuted input that is copied back into
 * block before every call */
336 init_block(block, test, is_idct, &prng, vals);
337 permute(block1, block, dct->format);
342 for (it = 0; it < NB_ITS_SPEED; it++) {
343 memcpy(block, block1, sizeof(block));
/* av_gettime() reports microseconds: keep running batches until at least
 * one second has elapsed */
348 ti1 = av_gettime() - ti;
349 } while (ti1 < 1000000);
/* iterations per microsecond * 1000 = thousands of transforms per second */
351 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
352 (double) it1 * 1000.0 / (double) ti1);
/* 8x8 output images compared in idct248_error(): img_dest is written by the
 * implementation under test, img_dest1 by idct248_ref(). */
357 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
358 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
360 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
363 static double c8[8][8];
364 static double c4[4][4];
365 double block1[64], block2[64], block3[64];
372 for (i = 0; i < 8; i++) {
374 for (j = 0; j < 8; j++) {
375 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
376 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
377 sum += c8[i][j] * c8[i][j];
381 for (i = 0; i < 4; i++) {
383 for (j = 0; j < 4; j++) {
384 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
385 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
386 sum += c4[i][j] * c4[i][j];
393 for (i = 0; i < 4; i++) {
394 for (j = 0; j < 8; j++) {
395 block1[8 * (2 * i) + j] =
396 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
397 block1[8 * (2 * i + 1) + j] =
398 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
403 for (i = 0; i < 8; i++) {
404 for (j = 0; j < 8; j++) {
406 for (k = 0; k < 8; k++)
407 sum += c8[k][j] * block1[8 * i + k];
408 block2[8 * i + j] = sum;
413 for (i = 0; i < 8; i++) {
414 for (j = 0; j < 4; j++) {
417 for (k = 0; k < 4; k++)
418 sum += c4[k][j] * block2[8 * (2 * k) + i];
419 block3[8 * (2 * j) + i] = sum;
423 for (k = 0; k < 4; k++)
424 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
425 block3[8 * (2 * j + 1) + i] = sum;
429 /* clamp and store the result */
430 for (i = 0; i < 8; i++) {
431 for (j = 0; j < 8; j++) {
432 v = block3[8 * i + j];
434 else if (v > 255) v = 255;
435 dest[i * linesize + j] = (int) rint(v);
/* Compare a 2-4-8 IDCT implementation against idct248_ref() on random
 * input, print the largest per-sample difference, and time the
 * implementation.
 *
 * name         label printed with the results
 * idct248_put  implementation under test; it is called with a destination
 *              image, a line size of 8 and the coefficient block
 *
 * NOTE(review): the rest of the parameter list, several declarations, the
 * err_max update and the statement guarding the timing part are not visible
 * in this listing. */
440 static void idct248_error(const char *name,
441 void (*idct248_put)(uint8_t *dest, int line_size,
445 int it, i, it1, ti, ti1, err_max, v;
448 av_lfg_init(&prng, 1);
450 /* just one test to see if code is correct (precision is less
453 for (it = 0; it < NB_ITS; it++) {
454 /* XXX: use forward transform to generate values */
455 for (i = 0; i < 64; i++)
456 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* both transforms get a fresh copy of the same input in block */
459 for (i = 0; i < 64; i++)
460 block[i] = block1[i];
461 idct248_ref(img_dest1, 8, block);
463 for (i = 0; i < 64; i++)
464 block[i] = block1[i];
465 idct248_put(img_dest, 8, block);
/* absolute per-sample difference between tested and reference output */
467 for (i = 0; i < 64; i++) {
468 v = abs((int) img_dest[i] - (int) img_dest1[i]);
470 printf("%d %d\n", img_dest[i], img_dest1[i]);
479 printf(" %3d", img_dest1[i*8+j]);
488 printf(" %3d", img_dest[i*8+j]);
494 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* timing: the input is restored before every call */
502 for (it = 0; it < NB_ITS_SPEED; it++) {
503 for (i = 0; i < 64; i++)
504 block[i] = block1[i];
505 idct248_put(img_dest, 8, block);
/* av_gettime() reports microseconds: keep running batches until at least
 * one second has elapsed */
509 ti1 = av_gettime() - ti;
510 } while (ti1 < 1000000);
512 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
513 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage summary to stdout.
 * NOTE(review): main() passes "ih4t" to getopt(), but no description of -t
 * and no end of the printf() call are visible in this listing. */
516 static void help(void)
518 printf("dct-test [-i] [<test-number>] [<bits>]\n"
519 "test-number 0 -> test with random matrixes\n"
520 " 1 -> test with random sparse matrixes\n"
521 " 2 -> do 3. test from mpeg4 std\n"
522 "bits Number of time domain bits to use, 8 is default\n"
523 "-i test IDCT implementations\n"
524 "-4 test IDCT248 implementations\n"
529 #include "compat/getopt.c"
/* Entry point: parse the options, then run either the 2-4-8 IDCT check or
 * every applicable entry of the selected transform table, and print the
 * combined error status.
 * NOTE(review): the option-handling loop body, several declarations, the
 * branch choosing between the two modes and the return statement are not
 * visible in this listing. */
532 int main(int argc, char **argv)
534 int test_idct = 0, test_248_dct = 0;
541 cpu_flags = av_get_cpu_flags();
547 c = getopt(argc, argv, "ih4t");
/* positional arguments: test number, then the optional bit count; atoi()
 * reports no conversion errors */
568 test = atoi(argv[optind]);
569 if(optind+1 < argc) bits= atoi(argv[optind+1]);
571 printf("ffmpeg DCT/IDCT test\n");
574 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
576 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
/* the table ends at the first entry without a name */
577 for (i = 0; algos[i].name; i++)
/* run an entry only when every CPU flag it requires is available */
578 if (!(~cpu_flags & algos[i].mm_support)) {
579 err |= dct_error(&algos[i], test, test_idct, speed, bits);
584 printf("Error: %d.\n", err);