2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
43 #include "simple_idct.h"
44 #include "aandcttab.h"
47 #include "x86/idct_xvid.h"
/* Prototypes of the platform-specific (I)DCT routines that the algorithm
 * tables below refer to.  Every one of them takes a single pointer to
 * int16_t coefficients and returns nothing.
 * NOTE(review): any preprocessor guards around these declarations are not
 * visible in this excerpt. */
53 void ff_bfin_idct(int16_t *block);
54 void ff_bfin_fdct(int16_t *block);
57 void ff_fdct_altivec(int16_t *block);
60 void ff_j_rev_dct_arm(int16_t *data);
61 void ff_simple_idct_arm(int16_t *data);
62 void ff_simple_idct_armv5te(int16_t *data);
63 void ff_simple_idct_armv6(int16_t *data);
64 void ff_simple_idct_neon(int16_t *data);
66 void ff_simple_idct_axp(int16_t *data);
/* Routine under test; it is handed a pointer to a block of int16_t values. */
70 void (*func)(int16_t *block);
/* Coefficient layout the routine works with.  permute() reorders the input
 * according to this value, and dct_error() applies an extra rescale to the
 * block when it is SCALE_PERM. */
71 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
72 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
/* Forward DCT implementations that main() runs when the IDCT test is not
 * selected.  Each entry lists a display name, the routine and its
 * coefficient layout; the CPU-specific entries add an AV_CPU_FLAG_* value
 * (presumably the mm_support field that main() checks against the runtime
 * CPU flags -- the struct declaration is not fully visible here). */
79 static const struct algo fdct_tab[] = {
80 { "REF-DBL", ff_ref_fdct, NO_PERM },
81 { "FAAN", ff_faandct, NO_PERM },
82 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
83 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
86 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
87 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT },
88 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
92 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
96 { "BFINfdct", ff_bfin_fdct, NO_PERM },
102 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
/* ProRes 10-bit IDCT-put routine: destination, line size, coefficient
 * block and a quantisation matrix (per the parameter names). */
103 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
104 int16_t *block, int16_t *qmat);
/* Adapter giving the four-argument routine above the one-argument
 * void (*)(int16_t *) shape used by the entries of idct_tab.
 * Only the final call is visible here: it passes dst as the destination
 * with a line size of 16, and the two static 16-byte-aligned buffers tmp
 * and qmat as block and matrix.
 * NOTE(review): how tmp and qmat are filled is on lines not visible in this
 * excerpt.  Also, dst is int16_t * while the prototype takes uint16_t *, so
 * the call relies on an implicit pointer conversion -- confirm intended. */
106 static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst){
107 DECLARE_ALIGNED(16, static int16_t, qmat)[64];
108 DECLARE_ALIGNED(16, static int16_t, tmp)[64];
115 ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
/* Inverse DCT implementations that main() runs when the IDCT test is
 * selected.  Entries follow the same shape as fdct_tab: display name,
 * routine, coefficient layout and, where given, a required AV_CPU_FLAG_*
 * value.  Some entries carry a trailing 1; presumably this is the nonspec
 * field that dct_error() reads when deciding whether a precision miss
 * counts -- TODO confirm against the struct declaration. */
119 static const struct algo idct_tab[] = {
120 { "FAANI", ff_faanidct, NO_PERM },
121 { "REF-DBL", ff_ref_idct, NO_PERM },
122 { "INT", ff_j_rev_dct, MMX_PERM },
123 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
126 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
127 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
128 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
129 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
/* The wrapper used by the next entry is declared under
 * ARCH_X86_64 && HAVE_MMX && HAVE_YASM; the condition here omits HAVE_MMX.
 * NOTE(review): an enclosing guard may supply it -- not visible here. */
130 #if ARCH_X86_64 && HAVE_YASM
131 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
136 { "BFINidct", ff_bfin_idct, NO_PERM },
140 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
141 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
144 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
147 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
150 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
154 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* Fixed-point shift used by dct_error() when it rescales the output of a
 * SCALE_PERM routine with the ff_aanscales table. */
160 #define AANSCALE_BITS 12
/* Number of transform calls per pass of the timing loops in dct_error()
 * and idct248_error(). */
163 #define NB_ITS_SPEED 50000
/* Destination index for each source coefficient under MMX_PERM; filled at
 * run time by idct_mmx_init(). */
165 static short idct_mmx_perm[64];
/* Destination index for each source coefficient under MMX_SIMPLE_PERM:
 * permute() stores src[i] at dst[idct_simple_mmx_perm[i]]. */
167 static short idct_simple_mmx_perm[64] = {
168 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
169 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
170 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
171 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
172 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
173 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
174 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
175 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Within-row reordering used for SSE2_PERM: permute() keeps the row bits
 * (i & 0x38) and replaces the column i & 7 with this table's entry. */
178 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[] with the index mapping used for MMX_PERM.
 * For every index the row bits (i & 0x38) are kept, bits 1-2 move down to
 * bits 0-1 and bit 0 moves up to bit 2, i.e. the three column bits are
 * rotated right by one position. */
180 static void idct_mmx_init(void)
184 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
185 for (i = 0; i < 64; i++) {
186 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* Shared 64-coefficient work buffers used by dct_error() and
 * idct248_error(); block is 16-byte aligned, block1 is 8-byte aligned. */
190 DECLARE_ALIGNED(16, static int16_t, block)[64];
191 DECLARE_ALIGNED(8, static int16_t, block1)[64];
/* Generate one 64-coefficient test block.
 *   block    buffer to fill (cleared first)
 *   test     selects the fill pattern
 *   is_idct  non-zero when the block will feed an inverse transform
 *   prng     pseudo-random generator state, advanced by every draw
 *   vals     magnitude bound used for the random values
 * NOTE(review): the statements that dispatch on test and is_idct are not
 * visible in this excerpt, so which fill below belongs to which test number
 * is not stated here. */
193 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
197 memset(block, 0, 64 * sizeof(*block));
/* Dense fill: every coefficient gets a draw reduced modulo 2*vals and
 * offset by -vals. */
201 for (i = 0; i < 64; i++)
202 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
205 for (i = 0; i < 64; i++)
/* Sparse fill: between 1 and 10 draws, each writing one randomly chosen
 * position (the same position may be hit more than once). */
210 j = av_lfg_get(prng) % 10 + 1;
211 for (i = 0; i < j; i++) {
212 int idx = av_lfg_get(prng) % 64;
213 block[idx] = av_lfg_get(prng) % (2*vals) -vals;
/* First coefficient drawn modulo 16*vals and offset by -8*vals; the last
 * coefficient is set to 1 when block[0] is even and to 0 when it is odd. */
217 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
218 block[63] = (block[0] & 1) ^ 1;
/* Copy src into dst, placing each coefficient where a routine with the
 * given layout expects it.
 *   dst   output block
 *   src   input block in natural order
 *   perm  one of the formattag values */
223 static void permute(int16_t dst[64], const int16_t src[64], int perm)
227 if (perm == MMX_PERM) {
/* table computed by idct_mmx_init() */
228 for (i = 0; i < 64; i++)
229 dst[idct_mmx_perm[i]] = src[i];
230 } else if (perm == MMX_SIMPLE_PERM) {
231 for (i = 0; i < 64; i++)
232 dst[idct_simple_mmx_perm[i]] = src[i];
233 } else if (perm == SSE2_PERM) {
/* rows stay in place, columns are reordered inside each row */
234 for (i = 0; i < 64; i++)
235 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
236 } else if (perm == PARTTRANS_PERM) {
/* bits 2 and 5 are kept while the low two column bits and the low two row
 * bits trade places: each 4x4 quadrant is transposed in place */
237 for (i = 0; i < 64; i++)
238 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
239 } else if (perm == TRANSPOSE_PERM) {
/* row and column indices are swapped: full 8x8 transpose */
240 for (i = 0; i < 64; i++)
241 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
/* Remaining layouts: the loop body is not visible in this excerpt;
 * presumably a straight copy -- TODO confirm. */
243 for (i = 0; i < 64; i++)
/* Exercise one table entry over NB_ITS generated blocks, print its error
 * statistics and, further down, time it.
 *   dct      table entry being tested
 *   test     fill pattern forwarded to init_block()
 *   is_idct  non-zero for an inverse transform
 *   speed    presumably enables the timing part -- the test on it is not
 *            visible in this excerpt
 *   bits     vals = 1 << bits is forwarded to init_block()
 * The int result is OR-ed into main()'s err.
 * NOTE(review): the calls to dct->func and to ref, several declarations and
 * the return statements fall on lines not visible here. */
248 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
/* reference routine matching the direction under test */
250 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
253 int64_t err2, ti, ti1, it1, err_sum = 0;
254 int64_t sysErr[64], sysErrMax = 0;
256 int blockSumErrMax = 0, blockSumErr;
258 const int vals=1<<bits;
/* fixed seed, so every run generates the same sequence of blocks */
262 av_lfg_init(&prng, 1);
266 for (i = 0; i < 64; i++)
268 for (it = 0; it < NB_ITS; it++) {
/* block1 holds the generated input, block its permuted copy */
269 init_block(block1, test, is_idct, &prng, vals);
270 permute(block, block1, dct->format);
/* SCALE_PERM only: rescale every value of block with the ff_aanscales
 * table in AANSCALE_BITS fixed point */
275 if (dct->format == SCALE_PERM) {
276 for (i = 0; i < 64; i++) {
277 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
278 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* per-coefficient difference between block and block1 */
285 for (i = 0; i < 64; i++) {
286 int err = block[i] - block1[i];
/* signed difference accumulated per position across all iterations */
292 sysErr[i] += block[i] - block1[i];
/* largest absolute value seen in block */
294 if (abs(block[i]) > maxout)
295 maxout = abs(block[i]);
297 if (blockSumErrMax < blockSumErr)
298 blockSumErrMax = blockSumErr;
/* largest absolute accumulated difference over the 64 positions */
300 for (i = 0; i < 64; i++)
301 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
303 for (i = 0; i < 64; i++) {
306 printf("%7d ", (int) sysErr[i]);
/* err2 and err_sum averaged over NB_ITS blocks of 64 coefficients */
310 omse = (double) err2 / NB_ITS / 64;
311 ome = (double) err_sum / NB_ITS / 64;
/* precision check applied to inverse transforms only: set when
 * err_inf > 1, omse > 0.02 or |ome| > 0.0015 */
313 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
315 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
316 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
317 omse, ome, (double) sysErrMax / NB_ITS,
318 maxout, blockSumErrMax);
/* a precision miss is acted on only for entries whose nonspec is zero;
 * the action itself is not visible in this excerpt */
320 if (spec_err && !dct->nonspec)
/* timing part: one generated block, permuted once into block1 */
328 init_block(block, test, is_idct, &prng, vals);
329 permute(block1, block, dct->format);
334 for (it = 0; it < NB_ITS_SPEED; it++) {
/* restore the input before every call */
335 memcpy(block, block1, sizeof(block));
/* keep repeating passes until at least 1000000 av_gettime() units have
 * elapsed (presumably microseconds -- TODO confirm) */
340 ti1 = av_gettime() - ti;
341 } while (ti1 < 1000000);
343 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
344 (double) it1 * 1000.0 / (double) ti1);
/* 8x8 output pictures for idct248_error(): img_dest receives the result of
 * the routine under test, img_dest1 the result of idct248_ref(). */
349 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
350 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Floating-point reference for the 2-4-8 IDCT used by idct248_error().
 *   dest      output picture, written as dest[row * linesize + column]
 *   linesize  distance in bytes between output rows
 *   block     64 input coefficients
 * NOTE(review): several declarations, the accumulator resets, the value of s
 * used by the butterfly and any guard around the table setup are on lines
 * not visible in this excerpt. */
352 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
355 static double c8[8][8];
356 static double c4[4][4];
357 double block1[64], block2[64], block3[64];
/* 8-point cosine basis: row 0 scaled by sqrt(1/8), the rest by sqrt(1/4) */
364 for (i = 0; i < 8; i++) {
366 for (j = 0; j < 8; j++) {
367 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
368 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
369 sum += c8[i][j] * c8[i][j];
/* 4-point cosine basis: row 0 scaled by sqrt(1/4), the rest by sqrt(1/2) */
373 for (i = 0; i < 4; i++) {
375 for (j = 0; j < 4; j++) {
376 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
377 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
378 sum += c4[i][j] * c4[i][j];
/* each pair of adjacent rows becomes its scaled sum (even row) and scaled
 * difference (odd row) */
385 for (i = 0; i < 4; i++) {
386 for (j = 0; j < 8; j++) {
387 block1[8 * (2 * i) + j] =
388 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
389 block1[8 * (2 * i + 1) + j] =
390 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point transform along every row using c8 */
395 for (i = 0; i < 8; i++) {
396 for (j = 0; j < 8; j++) {
398 for (k = 0; k < 8; k++)
399 sum += c8[k][j] * block1[8 * i + k];
400 block2[8 * i + j] = sum;
/* 4-point transform down every column using c4, done separately for the
 * even rows and for the odd rows */
405 for (i = 0; i < 8; i++) {
406 for (j = 0; j < 4; j++) {
409 for (k = 0; k < 4; k++)
410 sum += c4[k][j] * block2[8 * (2 * k) + i];
411 block3[8 * (2 * j) + i] = sum;
415 for (k = 0; k < 4; k++)
416 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
417 block3[8 * (2 * j + 1) + i] = sum;
421 /* clamp and store the result */
422 for (i = 0; i < 8; i++) {
423 for (j = 0; j < 8; j++) {
424 v = block3[8 * i + j];
/* upper clamp; the matching lower clamp is not visible in this excerpt */
426 else if (v > 255) v = 255;
/* round to nearest before storing as a byte */
427 dest[i * linesize + j] = (int) rint(v);
/* Compare a 2-4-8 IDCT-put routine against idct248_ref(), print the largest
 * per-pixel difference and, further down, time the routine.
 *   name         label printed in the reports
 *   idct248_put  routine under test; called as (dest, line size, block)
 * NOTE(review): the remaining parameters and the conditions guarding the
 * diagnostic dumps and the timing part are not visible in this excerpt. */
432 static void idct248_error(const char *name,
433 void (*idct248_put)(uint8_t *dest, int line_size,
437 int it, i, it1, ti, ti1, err_max, v;
440 av_lfg_init(&prng, 1);
442 /* just one test to see if code is correct (precision is less
445 for (it = 0; it < NB_ITS; it++) {
446 /* XXX: use forward transform to generate values */
447 for (i = 0; i < 64; i++)
448 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* the reference and the routine under test each get a fresh copy of the
 * same input held in block1 */
451 for (i = 0; i < 64; i++)
452 block[i] = block1[i];
453 idct248_ref(img_dest1, 8, block);
455 for (i = 0; i < 64; i++)
456 block[i] = block1[i];
457 idct248_put(img_dest, 8, block);
/* absolute per-pixel difference between the two outputs */
459 for (i = 0; i < 64; i++) {
460 v = abs((int) img_dest[i] - (int) img_dest1[i]);
462 printf("%d %d\n", img_dest[i], img_dest1[i]);
471 printf(" %3d", img_dest1[i*8+j]);
480 printf(" %3d", img_dest[i*8+j]);
486 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* timing part: NB_ITS_SPEED calls per pass, input restored before each */
494 for (it = 0; it < NB_ITS_SPEED; it++) {
495 for (i = 0; i < 64; i++)
496 block[i] = block1[i];
497 idct248_put(img_dest, 8, block);
/* keep repeating passes until at least 1000000 av_gettime() units have
 * elapsed (presumably microseconds -- TODO confirm) */
501 ti1 = av_gettime() - ti;
502 } while (ti1 < 1000000);
504 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
505 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage text to standard output.
 * NOTE(review): the end of the printf call is not visible in this excerpt,
 * so further option lines may follow the ones shown. */
508 static void help(void)
510 printf("dct-test [-i] [<test-number>] [<bits>]\n"
511 "test-number 0 -> test with random matrixes\n"
512 " 1 -> test with random sparse matrixes\n"
513 " 2 -> do 3. test from mpeg4 std\n"
514 "bits Number of time domain bits to use, 8 is default\n"
515 "-i test IDCT implementations\n"
516 "-4 test IDCT248 implementations\n"
521 #include "compat/getopt.c"
/* Program entry point: parse the options, then run either the 2-4-8 IDCT
 * check or every applicable entry of the selected algorithm table.
 * NOTE(review): the option-handling switch, several declarations and the
 * return statement are on lines not visible in this excerpt. */
524 int main(int argc, char **argv)
526 int test_idct = 0, test_248_dct = 0;
/* feature bits of the CPU the test is running on */
533 cpu_flags = av_get_cpu_flags();
/* accepted option letters: i, h, 4 and t; none takes an argument */
539 c = getopt(argc, argv, "ih4t");
/* positional arguments: test number, then optionally the bit depth.
 * NOTE(review): atoi() reports no errors, so non-numeric input reads as 0. */
560 test = atoi(argv[optind]);
561 if(optind+1 < argc) bits= atoi(argv[optind+1]);
563 printf("ffmpeg DCT/IDCT test\n");
566 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
/* the table is walked until an entry whose name is null */
568 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
569 for (i = 0; algos[i].name; i++)
/* run an entry only when every bit in its mm_support is also set in
 * cpu_flags; entries with mm_support == 0 always run */
570 if (!(~cpu_flags & algos[i].mm_support)) {
/* results are OR-ed together, so err is non-zero if any entry
 * returned non-zero */
571 err |= dct_error(&algos[i], test, test_idct, speed, bits);