2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
43 #include "simple_idct.h"
44 #include "aandcttab.h"
47 #include "x86/idct_xvid.h"
/* Prototypes for architecture-specific transforms that the algorithm tables
 * in this file reference by name. Each takes a single pointer to int16_t data.
 * NOTE(review): the preprocessor guards around these declarations are not
 * visible in this excerpt. */
51 void ff_fdct_altivec(int16_t *block);
54 void ff_j_rev_dct_arm(int16_t *data);
55 void ff_simple_idct_arm(int16_t *data);
56 void ff_simple_idct_armv5te(int16_t *data);
57 void ff_simple_idct_armv6(int16_t *data);
58 void ff_simple_idct_neon(int16_t *data);
/* Members of the per-algorithm descriptor used by fdct_tab and idct_tab.
 * NOTE(review): the enclosing struct header and the remaining members
 * (a name, a CPU-flag mask and a nonspec flag are read elsewhere in this
 * file) are not visible in this excerpt. */
/* Entry point of the transform under test. */
62 void (*func)(int16_t *block);
/* Input layout tag: permute() picks its reordering from this value, and
 * dct_error() applies an extra rescale when it is SCALE_PERM. */
63 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
64 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
/* Forward-DCT implementations under test. Each entry lists a display name,
 * the function to call and its input layout; CPU-specific entries add the
 * AV_CPU_FLAG_* bit that main() requires before running them.
 * main() walks the table until it meets an entry whose name is null.
 * NOTE(review): that terminator and several #if / #endif guards are not
 * visible in this excerpt. */
71 static const struct algo fdct_tab[] = {
72 { "REF-DBL", ff_ref_fdct, NO_PERM },
73 { "FAAN", ff_faandct, NO_PERM },
/* SCALE_PERM: dct_error() rescales block[] with ff_aanscales[] for this one. */
74 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
75 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
78 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
80 #if HAVE_MMXEXT_INLINE
81 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT },
84 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
88 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
/* Adapter that gives ff_prores_idct() the one-argument signature stored in
 * the algorithm tables: it calls ff_prores_idct() on dst with a static
 * 64-entry matrix qmat.
 * NOTE(review): the code that fills qmat and the body of the trailing
 * 64-element loop are not visible in this excerpt. */
94 static void ff_prores_idct_wrap(int16_t *dst){
95 DECLARE_ALIGNED(16, static int16_t, qmat)[64];
101 ff_prores_idct(dst, qmat);
102 for(i=0; i<64; i++) {
/* x86-64 only: prototype of the SSE2 ProRes IDCT and an adapter that gives it
 * the one-argument signature stored in the algorithm tables. */
106 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
107 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
108 int16_t *block, int16_t *qmat);
/* Calls the SSE2 routine with dst as destination, a line size of 16, and the
 * static buffers tmp (coefficients) and qmat.
 * NOTE(review): dst is int16_t* here while the prototype takes uint16_t*;
 * the code that fills qmat and tmp and the body of the trailing loop are
 * not visible in this excerpt. */
110 static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst){
111 DECLARE_ALIGNED(16, static int16_t, qmat)[64];
112 DECLARE_ALIGNED(16, static int16_t, tmp)[64];
119 ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
121 for(i=0; i<64; i++) {
/* Inverse-DCT implementations under test. Each entry lists a display name,
 * the function to call, the input layout handed to permute() and, where
 * present, the AV_CPU_FLAG_* bit that main() requires.
 * Some entries carry a fifth initializer of 1; presumably this is the nonspec
 * flag that dct_error() reads before treating a precision failure as an
 * error (member order is not visible here, confirm against the struct).
 * NOTE(review): the table terminator and several #if / #endif guards are not
 * visible in this excerpt. */
127 static const struct algo idct_tab[] = {
128 { "FAANI", ff_faanidct, NO_PERM },
129 { "REF-DBL", ff_ref_idct, NO_PERM },
130 { "INT", ff_j_rev_dct, MMX_PERM },
131 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
132 { "PR-C", ff_prores_idct_wrap, NO_PERM, 0, 1 },
135 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
136 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
138 #if HAVE_MMXEXT_INLINE
139 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
142 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
/* NOTE(review): this guard omits HAVE_MMX, which the guard around the
 * definition of ff_prores_idct_put_10_sse2_wrap includes. Confirm that an
 * enclosing #if (not visible here) keeps the two conditions in step. */
143 #if ARCH_X86_64 && HAVE_YASM
/* dct_error() special-cases this entry by its name string. */
144 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
149 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
150 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
153 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
156 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
158 #if HAVE_NEON && ARCH_ARM
159 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
/* Fixed-point precision used by dct_error() when it rescales SCALE_PERM
 * results with ff_aanscales[]. */
165 #define AANSCALE_BITS 12
/* Number of transform calls per pass of the timing loops in dct_error() and
 * idct248_error(); passes repeat until the elapsed-time threshold is met. */
168 #define NB_ITS_SPEED 50000
/* Destination index for each source coefficient under MMX_PERM; filled at
 * run time by idct_mmx_init() and read by permute(). */
170 static short idct_mmx_perm[64];
/* Destination index for each source coefficient under MMX_SIMPLE_PERM;
 * read by permute(). Entry i is where src[i] is stored. */
172 static short idct_simple_mmx_perm[64] = {
173 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
174 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
175 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
176 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
177 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
178 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
179 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
180 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Under SSE2_PERM, permute() keeps the row bits of the index and maps the
 * low three bits (position within the row) through this table. */
183 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[] with the index mapping used for MMX_PERM.
 * For each index the row bits (mask 0x38) are kept, bits 1-2 move down to
 * bits 0-1 and bit 0 moves up to bit 2, i.e. the three low bits are rotated. */
185 static void idct_mmx_init(void)
189 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
190 for (i = 0; i < 64; i++) {
191 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* File-scope 64-coefficient work buffers shared by dct_error() and
 * idct248_error(); declared through DECLARE_ALIGNED with arguments 16 and 8
 * respectively. */
195 DECLARE_ALIGNED(16, static int16_t, block)[64];
196 DECLARE_ALIGNED(8, static int16_t, block1)[64];
/* Fill a 64-coefficient block with pseudo-random test data.
 *   block   - destination, cleared to zero first
 *   test    - selects the data pattern
 *   is_idct - whether the block will feed an inverse transform
 *   prng    - generator state passed to av_lfg_get()
 *   vals    - magnitude scale of the generated values
 * NOTE(review): the statements that choose between the patterns below (and
 * any use of is_idct) are not visible in this excerpt; presumably they follow
 * the test numbers listed in help(). */
198 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
202 memset(block, 0, 64 * sizeof(*block));
/* Dense pattern: every coefficient gets a random value reduced modulo
 * 2*vals and then offset by -vals. */
206 for (i = 0; i < 64; i++)
207 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
/* NOTE(review): the body of this loop is not visible in this excerpt. */
210 for (i = 0; i < 64; i++)
/* Sparse pattern: between 1 and 10 randomly chosen positions are set; a
 * position may be drawn more than once, the last write wins. */
215 j = av_lfg_get(prng) % 10 + 1;
216 for (i = 0; i < j; i++) {
217 int idx = av_lfg_get(prng) % 64;
218 block[idx] = av_lfg_get(prng) % (2*vals) -vals;
/* Two-coefficient pattern: a random first coefficient (reduced modulo
 * 16*vals, offset by -8*vals) and a last coefficient equal to the inverted
 * low bit of the first. */
222 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
223 block[63] = (block[0] & 1) ^ 1;
/* Copy src into dst, reordering coefficients into the layout selected by
 * perm (one of the formattag values). In every visible branch element i of
 * src is written to a computed position of dst.
 * NOTE(review): the body of the final fallback loop is not visible in this
 * excerpt; presumably it is a straight copy. */
228 static void permute(int16_t dst[64], const int16_t src[64], int perm)
232 if (perm == MMX_PERM) {
/* Table built by idct_mmx_init(). */
233 for (i = 0; i < 64; i++)
234 dst[idct_mmx_perm[i]] = src[i];
235 } else if (perm == MMX_SIMPLE_PERM) {
/* Static lookup table. */
236 for (i = 0; i < 64; i++)
237 dst[idct_simple_mmx_perm[i]] = src[i];
238 } else if (perm == SSE2_PERM) {
/* Row bits kept; position within the row remapped through the row table. */
239 for (i = 0; i < 64; i++)
240 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
241 } else if (perm == PARTTRANS_PERM) {
/* Bits 2 and 5 kept; bits 0-1 exchanged with bits 3-4. */
242 for (i = 0; i < 64; i++)
243 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
244 } else if (perm == TRANSPOSE_PERM) {
/* Low three bits and high three bits exchanged: an 8x8 transpose. */
245 for (i = 0; i < 64; i++)
246 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
248 for (i = 0; i < 64; i++)
/* Measure the accuracy, and optionally the speed, of one transform.
 *   dct     - table entry describing the implementation under test
 *   test    - data pattern, forwarded to init_block()
 *   is_idct - nonzero to compare against ff_ref_idct, zero for ff_ref_fdct
 *   speed   - presumably enables the timing section (its test is not visible)
 *   bits    - the generated values are scaled by 1 << bits
 * Prints per-coefficient systematic errors and a summary line, and a
 * throughput line from the timing section.
 * NOTE(review): the calls to the tested and reference transforms, the error
 * accumulation statements and the return statement are not visible in this
 * excerpt, so the exact return value cannot be stated from here. */
253 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
255 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
258 int64_t err2, ti, ti1, it1, err_sum = 0;
259 int64_t sysErr[64], sysErrMax = 0;
261 int blockSumErrMax = 0, blockSumErr;
263 const int vals=1<<bits;
/* Fixed seed, so every run sees the same input sequence. */
267 av_lfg_init(&prng, 1);
271 for (i = 0; i < 64; i++)
273 for (it = 0; it < NB_ITS; it++) {
/* block1 holds the natural-order input; block receives the same data in
 * the layout this implementation expects. */
274 init_block(block1, test, is_idct, &prng, vals);
275 permute(block, block1, dct->format);
/* SCALE_PERM results carry a per-coefficient factor; rescale block[] with
 * ff_aanscales[] in fixed point before comparing. */
280 if (dct->format == SCALE_PERM) {
281 for (i = 0; i < 64; i++) {
282 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
283 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* The entry named PR-SSE2 is matched by name: block1 is clipped to
 * [4-512, 1019-512] before the comparison. */
288 if (!strcmp(dct->name, "PR-SSE2"))
289 for (i = 0; i < 64; i++)
290 block1[i] = av_clip(block1[i], 4-512, 1019-512);
/* Per-coefficient comparison of block against block1. */
293 for (i = 0; i < 64; i++) {
294 int err = block[i] - block1[i];
/* Signed error summed per position across iterations. */
300 sysErr[i] += block[i] - block1[i];
/* Track the largest absolute output value. */
302 if (abs(block[i]) > maxout)
303 maxout = abs(block[i]);
305 if (blockSumErrMax < blockSumErr)
306 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated per-position error. */
308 for (i = 0; i < 64; i++)
309 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
311 for (i = 0; i < 64; i++) {
314 printf("%7d ", (int) sysErr[i]);
/* Averages over all NB_ITS * 64 coefficients. */
318 omse = (double) err2 / NB_ITS / 64;
319 ome = (double) err_sum / NB_ITS / 64;
/* Precision limits are applied to inverse transforms only. */
321 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
323 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
324 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
325 omse, ome, (double) sysErrMax / NB_ITS,
326 maxout, blockSumErrMax);
/* Entries flagged nonspec are exempt from the precision limits.
 * NOTE(review): the statement controlled by this test is not visible. */
328 if (spec_err && !dct->nonspec)
/* Timing section: here block is the natural-order input and block1 keeps
 * the permuted copy that is restored before each call. */
336 init_block(block, test, is_idct, &prng, vals);
337 permute(block1, block, dct->format);
339 ti = av_gettime_relative();
342 for (it = 0; it < NB_ITS_SPEED; it++) {
343 memcpy(block, block1, sizeof(block));
/* Repeat passes until at least 1000000 time units have elapsed
 * (presumably microseconds; confirm against av_gettime_relative). */
348 ti1 = av_gettime_relative() - ti;
349 } while (ti1 < 1000000);
351 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
352 (double) it1 * 1000.0 / (double) ti1);
/* 8x8 pixel outputs compared by idct248_error(): img_dest receives the result
 * of the implementation under test, img_dest1 the result of idct248_ref(). */
357 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
358 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Floating-point reference for the 2-4-8 inverse transform.
 *   dest     - output pixels, written as dest[i * linesize + j]
 *   linesize - distance between output rows, in elements of dest
 *   block    - 64 input coefficients, read only in the visible code
 * NOTE(review): several statements (the one-time initialisation guard, the
 * value of s used by the butterfly, resets of sum, and the lower clamp) are
 * not visible in this excerpt. */
360 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine basis tables, static so they persist between calls. */
363 static double c8[8][8];
364 static double c4[4][4];
365 double block1[64], block2[64], block3[64];
/* 8-point basis: row 0 is scaled by sqrt(1/8), other rows by sqrt(1/4). */
372 for (i = 0; i < 8; i++) {
374 for (j = 0; j < 8; j++) {
375 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
376 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
377 sum += c8[i][j] * c8[i][j];
/* 4-point basis: row 0 is scaled by sqrt(1/4), other rows by sqrt(1/2). */
381 for (i = 0; i < 4; i++) {
383 for (j = 0; j < 4; j++) {
384 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
385 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
386 sum += c4[i][j] * c4[i][j];
/* Butterfly on each pair of adjacent rows: the even row gets the scaled
 * sum, the odd row the scaled difference. */
393 for (i = 0; i < 4; i++) {
394 for (j = 0; j < 8; j++) {
395 block1[8 * (2 * i) + j] =
396 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
397 block1[8 * (2 * i + 1) + j] =
398 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point pass along every row using c8. */
403 for (i = 0; i < 8; i++) {
404 for (j = 0; j < 8; j++) {
406 for (k = 0; k < 8; k++)
407 sum += c8[k][j] * block1[8 * i + k];
408 block2[8 * i + j] = sum;
/* 4-point passes down every column using c4: once over the even rows,
 * once over the odd rows. */
413 for (i = 0; i < 8; i++) {
414 for (j = 0; j < 4; j++) {
417 for (k = 0; k < 4; k++)
418 sum += c4[k][j] * block2[8 * (2 * k) + i];
419 block3[8 * (2 * j) + i] = sum;
423 for (k = 0; k < 4; k++)
424 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
425 block3[8 * (2 * j + 1) + i] = sum;
429 /* clamp and store the result */
430 for (i = 0; i < 8; i++) {
431 for (j = 0; j < 8; j++) {
432 v = block3[8 * i + j];
434 else if (v > 255) v = 255;
/* Round to nearest before narrowing to a pixel value. */
435 dest[i * linesize + j] = (int) rint(v);
/* Compare a 2-4-8 IDCT implementation against idct248_ref() and time it.
 *   name        - label printed in the report lines
 *   idct248_put - implementation under test; called as (dest, 8, block)
 * main() passes a third argument (speed); the rest of the parameter list is
 * not visible in this excerpt.
 * NOTE(review): the error-maximum update, the dump loops around the pixel
 * printfs and the condition guarding the timing section are also not
 * visible here. */
440 static void idct248_error(const char *name,
441 void (*idct248_put)(uint8_t *dest, int line_size,
/* NOTE(review): ti and ti1 are plain int here but int64_t in dct_error();
 * the return type of av_gettime_relative() is not visible in this excerpt,
 * so confirm that no truncation occurs. */
445 int it, i, it1, ti, ti1, err_max, v;
/* Fixed seed, so every run sees the same input sequence. */
448 av_lfg_init(&prng, 1);
450 /* just one test to see if code is correct (precision is less
453 for (it = 0; it < NB_ITS; it++) {
454 /* XXX: use forward transform to generate values */
/* Random coefficients reduced modulo 256 and offset by -128, kept in block1
 * as the master copy. */
455 for (i = 0; i < 64; i++)
456 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* Reference result from a fresh copy of the input. */
459 for (i = 0; i < 64; i++)
460 block[i] = block1[i];
461 idct248_ref(img_dest1, 8, block);
/* Result under test from another fresh copy of the input. */
463 for (i = 0; i < 64; i++)
464 block[i] = block1[i];
465 idct248_put(img_dest, 8, block);
/* Absolute per-pixel difference between the two outputs. */
467 for (i = 0; i < 64; i++) {
468 v = abs((int) img_dest[i] - (int) img_dest1[i]);
470 printf("%d %d\n", img_dest[i], img_dest1[i]);
479 printf(" %3d", img_dest1[i*8+j]);
488 printf(" %3d", img_dest[i*8+j]);
/* The conditional is constant, so the label printed is always IDCT248. */
494 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
499 ti = av_gettime_relative();
/* Each timed call starts from a restored copy of the last input. */
502 for (it = 0; it < NB_ITS_SPEED; it++) {
503 for (i = 0; i < 64; i++)
504 block[i] = block1[i];
505 idct248_put(img_dest, 8, block);
/* Repeat passes until at least 1000000 time units have elapsed
 * (presumably microseconds; confirm against av_gettime_relative). */
509 ti1 = av_gettime_relative() - ti;
510 } while (ti1 < 1000000);
512 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
513 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage text to standard output.
 * NOTE(review): main() passes the option string ih4t to getopt(), but the
 * visible usage text describes only -i and -4; the end of the string is not
 * visible in this excerpt, so confirm whether the other options are listed. */
516 static void help(void)
518 printf("dct-test [-i] [<test-number>] [<bits>]\n"
519 "test-number 0 -> test with random matrixes\n"
520 " 1 -> test with random sparse matrixes\n"
521 " 2 -> do 3. test from mpeg4 std\n"
522 "bits Number of time domain bits to use, 8 is default\n"
523 "-i test IDCT implementations\n"
524 "-4 test IDCT248 implementations\n"
529 #include "compat/getopt.c"
/* Program entry point: parse options, then run either the 2-4-8 IDCT check
 * or every applicable entry of the forward or inverse table.
 * NOTE(review): the option-handling statements, the branch that chooses
 * between the 2-4-8 test and the table walk, and the return statement are
 * not visible in this excerpt. */
532 int main(int argc, char **argv)
534 int test_idct = 0, test_248_dct = 0;
/* CPU capabilities, compared below with the flags each entry requires. */
541 cpu_flags = av_get_cpu_flags();
/* Accepted option letters: i, h, 4 and t. */
547 c = getopt(argc, argv, "ih4t");
/* Positional arguments: the test number, optionally followed by the bit
 * count. atoi() does not report malformed input. */
568 test = atoi(argv[optind]);
569 if(optind+1 < argc) bits= atoi(argv[optind+1]);
571 printf("ffmpeg DCT/IDCT test\n");
574 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
576 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
/* Walk the table until an entry with a null name. */
577 for (i = 0; algos[i].name; i++)
/* Run an entry only when every flag it requires is present in cpu_flags. */
578 if (!(~cpu_flags & algos[i].mm_support)) {
/* Results are OR-ed together, so any nonzero result is retained. */
579 err |= dct_error(&algos[i], test, test_idct, speed, bits);
584 printf("Error: %d.\n", err);