2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
42 #include "simple_idct.h"
43 #include "aandcttab.h"
46 #include "x86/idct_xvid.h"
/*
 * Prototypes for platform-specific DCT/IDCT routines referenced by the
 * algorithm tables in this file. Each takes a pointer to a DCTELEM block
 * and returns nothing; the definitions are not part of this file.
 */
51 void ff_mmx_idct(DCTELEM *data);
52 void ff_mmxext_idct(DCTELEM *data);
/* Referenced by the BFINidct and BFINfdct table entries. */
55 void ff_bfin_idct(DCTELEM *block);
56 void ff_bfin_fdct(DCTELEM *block);
/* Referenced by the altivecfdct table entry. */
59 void ff_fdct_altivec(DCTELEM *block);
/* Referenced by the ARM and NEON entries of idct_tab. */
62 void ff_j_rev_dct_arm(DCTELEM *data);
63 void ff_simple_idct_arm(DCTELEM *data);
64 void ff_simple_idct_armv5te(DCTELEM *data);
65 void ff_simple_idct_armv6(DCTELEM *data);
66 void ff_simple_idct_neon(DCTELEM *data);
/* Referenced by the SIMPLE-ALPHA entry of idct_tab. */
68 void ff_simple_idct_axp(DCTELEM *data);
/* Pointer to the transform routine of this table entry. */
72 void (*func)(DCTELEM *block);
/*
 * Coefficient layout the routine expects. dct_error() hands this tag to
 * permute(), which reorders the block for the MMX, MMX_SIMPLE, SSE2,
 * PARTTRANS and TRANSPOSE variants, and dct_error() additionally rescales
 * the block when the tag is SCALE_PERM.
 * NOTE(review): the remaining members used elsewhere in this file (name,
 * mm_support, nonspec) are declared on lines not visible here.
 */
73 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
74 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
/*
 * Forward-DCT implementations that main() iterates over when the IDCT
 * mode is not selected. Each initializer lists, in order: the name printed
 * in the report, the transform function, the coefficient format tag and,
 * for some entries, an AV_CPU_FLAG_* value.
 * NOTE(review): the fourth value presumably initializes mm_support, which
 * main() checks against av_get_cpu_flags() before running an entry;
 * confirm against the struct declaration. Any preprocessor guards around
 * the platform-specific rows are not visible here.
 */
81 static const struct algo fdct_tab[] = {
82 { "REF-DBL", ff_ref_fdct, NO_PERM },
83 { "FAAN", ff_faandct, NO_PERM },
/* SCALE_PERM: dct_error() rescales this output with ff_aanscales. */
84 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
85 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
88 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
89 { "MMXEXT", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMXEXT },
90 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
94 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
98 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/*
 * Adapter for ff_prores_idct_put_10_sse2(), which takes a destination,
 * a line size, a coefficient block and a quantisation matrix, so that it
 * can be stored in the single-argument function pointer of struct algo.
 * Only compiled when ARCH_X86_64, HAVE_MMX and HAVE_YASM are all set.
 */
104 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
105 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
106 DCTELEM *block, int16_t *qmat);
/* dst: coefficient block under test; it is also passed as the output. */
108 static void ff_prores_idct_put_10_sse2_wrap(DCTELEM *dst){
/* 16-byte aligned static scratch buffers handed to the SSE2 routine. */
109 DECLARE_ALIGNED(16, static int16_t, qmat)[64];
110 DECLARE_ALIGNED(16, static int16_t, tmp)[64];
/*
 * NOTE(review): the statements that fill qmat and tmp are not visible
 * here; presumably tmp receives a copy of dst before the call - confirm.
 * The result is written back into dst with a line size of 16.
 */
117 ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
/*
 * Inverse-DCT implementations that main() iterates over when the IDCT
 * mode is selected. Each initializer lists, in order: the name printed in
 * the report, the transform function, the coefficient format tag that
 * permute() applies before the call and, for some entries, an
 * AV_CPU_FLAG_* value followed by a trailing 1.
 * NOTE(review): the fourth and fifth values presumably initialize
 * mm_support and nonspec (an entry with nonspec set is exempt from the
 * accuracy verdict in dct_error()); confirm against the struct
 * declaration. Preprocessor guards around the platform-specific rows are
 * only partly visible here.
 */
121 static const struct algo idct_tab[] = {
122 { "FAANI", ff_faanidct, NO_PERM },
123 { "REF-DBL", ff_ref_idct, NO_PERM },
124 { "INT", ff_j_rev_dct, MMX_PERM },
125 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
129 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
/* Same flag spelling as the other MMXEXT rows in this file. */
130 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMXEXT, 1 },
132 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
133 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
134 { "XVID-MMXEXT", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
135 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
/*
 * NOTE(review): the wrapper used below is defined under
 * ARCH_X86_64 && HAVE_MMX && HAVE_YASM, while this guard omits HAVE_MMX;
 * confirm that an enclosing guard not visible here supplies it.
 */
136 #if ARCH_X86_64 && HAVE_YASM
137 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
142 { "BFINidct", ff_bfin_idct, NO_PERM },
146 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
147 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
150 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
153 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
156 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
160 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/*
 * Fixed-point precision, in bits, of the per-coefficient scale factor that
 * dct_error() applies to the output of SCALE_PERM transforms.
 */
166 #define AANSCALE_BITS 12
/*
 * Number of transforms run per timing batch in the speed loops; batches
 * repeat until the elapsed av_gettime() difference reaches 1000000.
 */
169 #define NB_ITS_SPEED 50000
/*
 * Destination index for each source coefficient under MMX_PERM.
 * Filled at run time by idct_mmx_init() and read by permute().
 */
171 static short idct_mmx_perm[64];
/*
 * Destination index for each source coefficient under MMX_SIMPLE_PERM;
 * permute() stores src[i] at dst[idct_simple_mmx_perm[i]].
 */
173 static short idct_simple_mmx_perm[64] = {
174 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
175 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
176 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
177 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
178 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
179 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
180 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
181 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/*
 * Column reordering used for SSE2_PERM: permute() keeps the row bits of
 * the index (i & 0x38) and replaces the low three bits with this lookup.
 */
184 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/*
 * Fill idct_mmx_perm[] with the index mapping that permute() uses for
 * MMX_PERM. Takes no arguments and returns nothing.
 */
186 static void idct_mmx_init(void)
190 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
191 for (i = 0; i < 64; i++) {
/*
 * Bits 3-5 of the index are kept as they are; within the low three bits,
 * bits 1-2 move down by one position and bit 0 moves up to bit 2.
 */
192 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/*
 * Shared 64-coefficient work buffers used by dct_error() and
 * idct248_error(): block is the buffer those functions copy or permute
 * input into before a transform call, block1 holds the other copy used
 * for comparison or as the saved input. block is declared with 16-byte
 * alignment and block1 with 8-byte alignment.
 */
196 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
197 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/*
 * Fill a 64-coefficient block with pseudo-random test input.
 *
 * block:   buffer to fill; it is zeroed first.
 * test:    selects which pattern is generated.
 * is_idct: direction of the transform under test.
 * prng:    lagged Fibonacci generator state used for every random draw.
 * vals:    amplitude range; dct_error() passes 1 << bits.
 *
 * NOTE(review): the control flow that maps test (and is_idct) onto the
 * three patterns below is on lines not visible here; the help text lists
 * 0 as random, 1 as random sparse and 2 as the MPEG-4 test - confirm the
 * mapping before relying on it.
 */
199 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
203 memset(block, 0, 64 * sizeof(*block));
/* Dense pattern: every coefficient drawn from -vals up to vals - 1. */
207 for (i = 0; i < 64; i++)
208 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
/* NOTE(review): the body of the next loop is not visible here. */
211 for (i = 0; i < 64; i++)
/* Sparse pattern: between 1 and 10 draws at random positions. */
216 j = av_lfg_get(prng) % 10 + 1;
217 for (i = 0; i < j; i++) {
218 int idx = av_lfg_get(prng) % 64;
219 block[idx] = av_lfg_get(prng) % (2*vals) -vals;
/*
 * Two-coefficient pattern: a first coefficient drawn from -8*vals up to
 * 8*vals - 1, and a last coefficient holding the inverse of its low bit.
 */
223 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
224 block[63] = (block[0] & 1) ^ 1;
/*
 * Copy src into dst, reordering the 64 coefficients into the layout named
 * by perm (one of the enum formattag values).
 *
 * dst:  output block; written at the permuted index.
 * src:  input block; read in natural order.
 * perm: layout tag, normally the format member of the algorithm entry.
 */
229 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
/* Table-driven mapping built by idct_mmx_init(). */
233 if (perm == MMX_PERM) {
234 for (i = 0; i < 64; i++)
235 dst[idct_mmx_perm[i]] = src[i];
/* Table-driven mapping from the constant idct_simple_mmx_perm[]. */
236 } else if (perm == MMX_SIMPLE_PERM) {
237 for (i = 0; i < 64; i++)
238 dst[idct_simple_mmx_perm[i]] = src[i];
/* Row is kept; the column is remapped through idct_sse2_row_perm[]. */
239 } else if (perm == SSE2_PERM) {
240 for (i = 0; i < 64; i++)
241 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/*
 * Bits 2 and 5 of the index are kept; the low two column bits and the low
 * two row bits trade places.
 */
242 } else if (perm == PARTTRANS_PERM) {
243 for (i = 0; i < 64; i++)
244 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* Full transpose: the row and column fields of the index are swapped. */
245 } else if (perm == TRANSPOSE_PERM) {
246 for (i = 0; i < 64; i++)
247 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
/*
 * NOTE(review): the body of this last loop is not visible here; it is
 * presumably the plain copy used for the remaining tags - confirm.
 */
249 for (i = 0; i < 64; i++)
/*
 * Measure the accuracy of one DCT or IDCT implementation over NB_ITS
 * pseudo-random blocks, print the statistics, and run a timing loop that
 * reports throughput in kdct/s.
 *
 * dct:     table entry describing the implementation under test.
 * test:    input pattern selector forwarded to init_block().
 * is_idct: non-zero when an inverse transform is being tested.
 * speed:   speed-test request from the command line.
 * bits:    number of bits of input amplitude; the range is 1 << bits.
 *
 * NOTE(review): the return statements and the use of speed are on lines
 * not visible here; main() ORs the return value into its error status.
 */
254 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
/* Reference transform matching the direction under test. */
256 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
259 int64_t err2, ti, ti1, it1, err_sum = 0;
/* Per-coefficient signed error accumulated over all iterations. */
260 int64_t sysErr[64], sysErrMax = 0;
262 int blockSumErrMax = 0, blockSumErr;
264 const int vals=1<<bits;
/* Fixed seed, so every run sees the same sequence of test blocks. */
268 av_lfg_init(&prng, 1);
272 for (i = 0; i < 64; i++)
/* Accuracy pass. */
274 for (it = 0; it < NB_ITS; it++) {
275 init_block(block1, test, is_idct, &prng, vals);
276 permute(block, block1, dct->format);
/*
 * SCALE_PERM output is rescaled per coefficient using ff_aanscales[] in
 * AANSCALE_BITS fixed-point precision before it is compared.
 */
281 if (dct->format == SCALE_PERM) {
282 for (i = 0; i < 64; i++) {
283 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
284 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/*
 * Compare block against block1 coefficient by coefficient.
 * NOTE(review): the calls that run the implementation and the reference
 * are not visible here; presumably block1 holds the reference output by
 * this point - confirm.
 */
291 for (i = 0; i < 64; i++) {
292 int err = block[i] - block1[i];
298 sysErr[i] += block[i] - block1[i];
/* Track the largest output magnitude seen. */
300 if (abs(block[i]) > maxout)
301 maxout = abs(block[i]);
303 if (blockSumErrMax < blockSumErr)
304 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated error over the 64 positions. */
306 for (i = 0; i < 64; i++)
307 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
309 for (i = 0; i < 64; i++) {
312 printf("%7d ", (int) sysErr[i]);
/* Averages over NB_ITS blocks of 64 coefficients each. */
316 omse = (double) err2 / NB_ITS / 64;
317 ome = (double) err_sum / NB_ITS / 64;
/*
 * Only inverse transforms are judged against the limits below.
 * NOTE(review): the limits look like an IDCT accuracy specification;
 * confirm which one.
 */
319 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
321 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
322 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
323 omse, ome, (double) sysErrMax / NB_ITS,
324 maxout, blockSumErrMax);
/* Entries flagged nonspec are exempt from the accuracy verdict. */
326 if (spec_err && !dct->nonspec)
/* Speed pass: one fixed input block, restored before each transform. */
334 init_block(block, test, is_idct, &prng, vals);
335 permute(block1, block, dct->format);
340 for (it = 0; it < NB_ITS_SPEED; it++) {
341 memcpy(block, block1, sizeof(block));
/* Keep running batches until 1000000 time units have elapsed. */
346 ti1 = av_gettime() - ti;
347 } while (ti1 < 1000000);
/* The kdct/s scaling implies ti1 is measured in microseconds. */
349 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
350 (double) it1 * 1000.0 / (double) ti1);
/*
 * 8x8 pixel outputs compared by idct248_error(): img_dest receives the
 * result of the implementation under test, img_dest1 the result of
 * idct248_ref(). Both are declared with 8-byte alignment.
 */
355 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
356 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/*
 * Double-precision reference for the 2-4-8 inverse DCT.
 *
 * dest:     output pixels; element (i, j) is written at i * linesize + j.
 * linesize: distance in bytes between output rows.
 * block:    64 input coefficients, read but not modified by visible code.
 *
 * The visible stages are: build the cosine tables, combine adjacent row
 * pairs, an 8-point pass along each row, two 4-point passes along each
 * column (even rows and odd rows separately), then clamp, round and store.
 */
358 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine tables for the 8-point and 4-point passes; static storage. */
361 static double c8[8][8];
362 static double c4[4][4];
363 double block1[64], block2[64], block3[64];
/* 8-point table: scale sqrt(1/8) for row 0 and sqrt(1/4) otherwise. */
370 for (i = 0; i < 8; i++) {
372 for (j = 0; j < 8; j++) {
373 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
374 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
/* Running sum of squares of the row; its use is not visible here. */
375 sum += c8[i][j] * c8[i][j];
/* 4-point table: scale sqrt(1/4) for row 0 and sqrt(1/2) otherwise. */
379 for (i = 0; i < 4; i++) {
381 for (j = 0; j < 4; j++) {
382 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
383 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
384 sum += c4[i][j] * c4[i][j];
/*
 * Row-pair combination: even output rows get the sum and odd output rows
 * the difference of input rows 2i and 2i + 1, both multiplied by s.
 * NOTE(review): the value assigned to s for this stage is on a line not
 * visible here.
 */
391 for (i = 0; i < 4; i++) {
392 for (j = 0; j < 8; j++) {
393 block1[8 * (2 * i) + j] =
394 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
395 block1[8 * (2 * i + 1) + j] =
396 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point pass along each row using c8. */
401 for (i = 0; i < 8; i++) {
402 for (j = 0; j < 8; j++) {
404 for (k = 0; k < 8; k++)
405 sum += c8[k][j] * block1[8 * i + k];
406 block2[8 * i + j] = sum;
/* 4-point passes along each column using c4. */
411 for (i = 0; i < 8; i++) {
412 for (j = 0; j < 4; j++) {
/* Even rows of block2 produce the even rows of block3. */
415 for (k = 0; k < 4; k++)
416 sum += c4[k][j] * block2[8 * (2 * k) + i];
417 block3[8 * (2 * j) + i] = sum;
/* Odd rows of block2 produce the odd rows of block3. */
421 for (k = 0; k < 4; k++)
422 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
423 block3[8 * (2 * j + 1) + i] = sum;
427 /* clamp and store the result */
428 for (i = 0; i < 8; i++) {
429 for (j = 0; j < 8; j++) {
430 v = block3[8 * i + j];
/* NOTE(review): the lower-bound branch of the clamp is not visible. */
432 else if (v > 255) v = 255;
/* Round to the nearest integer before narrowing to a byte. */
433 dest[i * linesize + j] = (int) rint(v);
/*
 * Compare a 2-4-8 IDCT implementation against idct248_ref() over NB_ITS
 * pseudo-random blocks, print the largest absolute pixel difference, and
 * run a timing loop that reports throughput in kdct/s.
 *
 * name:        label printed in the report lines.
 * idct248_put: implementation under test; called with the output buffer,
 *              a line size of 8 and the coefficient block.
 *
 * NOTE(review): the rest of the parameter list is on lines not visible
 * here; main() passes a third argument named speed.
 */
438 static void idct248_error(const char *name,
439 void (*idct248_put)(uint8_t *dest, int line_size,
443 int it, i, it1, ti, ti1, err_max, v;
/*
 * NOTE(review): ti and ti1 are plain int here, whereas dct_error() keeps
 * the same av_gettime() based values in int64_t; confirm the return width
 * of av_gettime() and consider using int64_t here as well.
 */
/* Fixed seed, so every run sees the same sequence of test blocks. */
446 av_lfg_init(&prng, 1);
448 /* just one test to see if code is correct (precision is less
451 for (it = 0; it < NB_ITS; it++) {
452 /* XXX: use forward transform to generate values */
453 for (i = 0; i < 64; i++)
454 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* Each transform gets its own fresh copy of the same input. */
457 for (i = 0; i < 64; i++)
458 block[i] = block1[i];
459 idct248_ref(img_dest1, 8, block);
461 for (i = 0; i < 64; i++)
462 block[i] = block1[i];
463 idct248_put(img_dest, 8, block);
/* Absolute per-pixel difference between the two outputs. */
465 for (i = 0; i < 64; i++) {
466 v = abs((int) img_dest[i] - (int) img_dest1[i]);
/* NOTE(review): the condition guarding this print is not visible here. */
468 printf("%d %d\n", img_dest[i], img_dest1[i]);
/*
 * Dumps of the reference and tested outputs; the surrounding loops and
 * any guard around them are not visible here.
 */
477 printf(" %3d", img_dest1[i*8+j]);
486 printf(" %3d", img_dest[i*8+j]);
492 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Speed pass: the last input block is restored before each transform. */
500 for (it = 0; it < NB_ITS_SPEED; it++) {
501 for (i = 0; i < 64; i++)
502 block[i] = block1[i];
503 idct248_put(img_dest, 8, block);
/* Keep running batches until 1000000 time units have elapsed. */
507 ti1 = av_gettime() - ti;
508 } while (ti1 < 1000000);
510 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
511 (double) it1 * 1000.0 / (double) ti1);
/*
 * Print the command line usage summary to standard output.
 * NOTE(review): the usage line only shows the -i switch, while main()
 * passes the option string ih4t to getopt(); the end of this string
 * literal is not visible here, so confirm that every accepted option is
 * described.
 */
514 static void help(void)
516 printf("dct-test [-i] [<test-number>] [<bits>]\n"
517 "test-number 0 -> test with random matrixes\n"
518 " 1 -> test with random sparse matrixes\n"
519 " 2 -> do 3. test from mpeg4 std\n"
520 "bits Number of time domain bits to use, 8 is default\n"
521 "-i test IDCT implementations\n"
522 "-4 test IDCT248 implementations\n"
527 #include "compat/getopt.c"
/*
 * Program entry point: parses the options and positional arguments, then
 * runs either the 2-4-8 IDCT check or every applicable entry of the
 * selected algorithm table.
 * NOTE(review): only part of this function is visible; the option
 * handling, the initialization calls and the return value are on lines
 * not shown here.
 */
530 int main(int argc, char **argv)
532 int test_idct = 0, test_248_dct = 0;
/* CPU capabilities, used below to skip unsupported implementations. */
539 cpu_flags = av_get_cpu_flags();
/* Accepted option letters: i, h, 4 and t. */
545 c = getopt(argc, argv, "ih4t");
/*
 * Positional arguments: test number, then the optional bit count.
 * NOTE(review): atoi() reports no conversion errors, and no range check
 * on bits is visible before dct_error() evaluates 1 << bits; consider
 * validating both values.
 */
566 test = atoi(argv[optind]);
567 if(optind+1 < argc) bits= atoi(argv[optind+1]);
569 printf("ffmpeg DCT/IDCT test\n");
/* 2-4-8 mode tests the single C implementation. */
572 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
/* Otherwise walk the table for the requested direction. */
574 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
/* The table ends at the first entry whose name is null. */
575 for (i = 0; algos[i].name; i++)
/* Run an entry only when every flag in mm_support is in cpu_flags. */
576 if (!(~cpu_flags & algos[i].mm_support)) {
577 err |= dct_error(&algos[i], test, test_idct, speed, bits);