2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
42 #include "simple_idct.h"
43 #include "aandcttab.h"
46 #include "x86/idct_xvid.h"
/* Prototypes for architecture-specific transform routines referenced by the
 * algorithm tables in this file. Each takes a single DCTELEM pointer and
 * returns nothing.
 * NOTE(review): no preprocessor guards are visible around these groups in
 * this view; confirm how each group is selected per architecture. */
/* Entries named "LIBMPEG2-MMX" / "LIBMPEG2-MMX2" in idct_tab. */
51 void ff_mmx_idct(DCTELEM *data);
52 void ff_mmxext_idct(DCTELEM *data);
/* Entries named "BFINidct" / "BFINfdct". */
55 void ff_bfin_idct(DCTELEM *block);
56 void ff_bfin_fdct(DCTELEM *block);
/* Entry named "altivecfdct"; only a forward transform is declared. */
59 void ff_fdct_altivec(DCTELEM *block);
60 //void ff_idct_altivec(DCTELEM *block);?? no routine
/* Entries named "INT-ARM", "SIMPLE-ARM", "SIMPLE-ARMV5TE", "SIMPLE-ARMV6"
 * and "SIMPLE-NEON". */
63 void ff_j_rev_dct_arm(DCTELEM *data);
64 void ff_simple_idct_arm(DCTELEM *data);
65 void ff_simple_idct_armv5te(DCTELEM *data);
66 void ff_simple_idct_armv6(DCTELEM *data);
67 void ff_simple_idct_neon(DCTELEM *data);
/* Entry named "SIMPLE-ALPHA". */
69 void ff_simple_idct_axp(DCTELEM *data);
/* Visible members of one algorithm table entry.
 * NOTE(review): the struct header and the remaining members (name,
 * mm_support and nonspec are read elsewhere in this file) are not visible in
 * this view. */
/* Transform under test; operates on the DCTELEM block it is handed. */
73 void (*func)(DCTELEM *block);
/* Coefficient layout tag: permute() selects its index mapping from this value
 * and dct_error() rescales the output when it equals SCALE_PERM. */
74 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
75 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
/* Table used by main() when -i is not given (forward transforms).
 * Each initializer lists a display name, the routine, its formattag and,
 * for some entries, an AV_CPU_FLAG_* value.
 * NOTE(review): the AV_CPU_FLAG_* value presumably initializes the
 * mm_support member tested in main(); confirm against the full struct
 * definition. The conditional-compilation lines and the terminating entry
 * that main()'s loop relies on are not visible in this view. */
82 static const struct algo fdct_tab[] = {
83 { "REF-DBL", ff_ref_fdct, NO_PERM },
84 { "FAAN", ff_faandct, NO_PERM },
/* SCALE_PERM: dct_error() rescales this routine's output with ff_aanscales. */
85 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
86 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
89 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
90 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
91 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
95 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
99 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* NOTE(review): this guard also requires HAVE_MMX, while the idct_tab entry
 * that references the wrapper is guarded by "ARCH_X86_64 && HAVE_YASM" only;
 * confirm that an enclosing condition makes the two equivalent. */
105 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
106 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
107 DCTELEM *block, int16_t *qmat);
/* Adapter that gives ff_prores_idct_put_10_sse2 the one-argument signature
 * stored in struct algo. */
109 static void ff_prores_idct_put_10_sse2_wrap(DCTELEM *dst){
/* Two static 64-entry int16_t arrays declared through DECLARE_ALIGNED(16, ...).
 * NOTE(review): the statements that fill qmat and tmp are not visible here. */
110 DECLARE_ALIGNED(16, static int16_t, qmat)[64];
111 DECLARE_ALIGNED(16, static int16_t, tmp)[64];
/* dst is used as the output buffer with a linesize of 16; tmp is passed as
 * the coefficient block.
 * NOTE(review): dst is a DCTELEM pointer passed where uint16_t * is declared;
 * confirm the types are compatible on the targeted build. */
118 ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
/* Table used by main() when -i is given (inverse transforms).
 * Each initializer lists a display name, the routine, the formattag that
 * permute() applies to its input and, for some entries, an AV_CPU_FLAG_*
 * value followed by a trailing 1.
 * NOTE(review): the trailing 1 presumably initializes the nonspec member,
 * which dct_error() consults before acting on a spec failure; confirm against
 * the full struct definition. Most conditional-compilation lines and the
 * terminating entry are not visible in this view. */
122 static const struct algo idct_tab[] = {
123 { "FAANI", ff_faanidct, NO_PERM },
124 { "REF-DBL", ff_ref_idct, NO_PERM },
125 { "INT", ff_j_rev_dct, MMX_PERM },
126 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
130 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
131 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
133 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
134 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
135 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
136 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
/* NOTE(review): the wrapper used below is defined under a guard that also
 * requires HAVE_MMX; confirm both conditions always agree. */
137 #if ARCH_X86_64 && HAVE_YASM
138 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
143 { "BFINidct", ff_bfin_idct, NO_PERM },
147 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
148 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
151 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
154 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
157 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
161 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* Shift count used by dct_error() when rescaling SCALE_PERM output. */
167 #define AANSCALE_BITS 12
/* Number of transform calls per batch in the timing loops. */
170 #define NB_ITS_SPEED 50000
/* Destination-index table read by permute() for MMX_PERM; it has no static
 * initializer and is filled in by idct_mmx_init(). */
172 static short idct_mmx_perm[64];
/* Destination-index table read by permute() for MMX_SIMPLE_PERM: source
 * element i is stored at dst[idct_simple_mmx_perm[i]]. The 64 entries listed
 * are the values 0x00..0x3F in a fixed reordering.
 * NOTE(review): the closing of this initializer is not visible in this view. */
174 static short idct_simple_mmx_perm[64] = {
175 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
176 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
177 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
178 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
179 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
180 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
181 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
182 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Remapping of the low three index bits used by permute() for SSE2_PERM; the
 * upper three bits of the index are kept unchanged there. */
185 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[] for all 64 indices. Bits 3-5 of the index are kept
 * (i & 0x38); within the low three bits, bits 1-2 move down to bits 0-1 and
 * bit 0 moves up to bit 2.
 * NOTE(review): the declaration of i and the closing braces are not visible
 * in this view. */
187 static void idct_mmx_init(void)
191 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
192 for (i = 0; i < 64; i++) {
193 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* File-scope work buffers of 64 DCTELEMs shared by dct_error() and
 * idct248_error(): block is the buffer handed to the routine under test,
 * block1 holds the generated input (and, in dct_error(), the values block is
 * compared against). They are declared with different DECLARE_ALIGNED
 * arguments (16 and 8). */
197 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
198 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/* Fill a 64-entry block with pseudo-random test input drawn from prng.
 * vals sets the amplitude of the generated values.
 * NOTE(review): the statements that select between the three fill patterns
 * below (and the use of is_idct) are not visible in this view; the patterns
 * presumably correspond to test numbers 0, 1 and 2 listed in help(). */
200 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
/* Start from an all-zero block so patterns that write few entries leave the
 * rest at 0. */
204 memset(block, 0, 64 * sizeof(*block));
/* Dense pattern: every entry gets (random % (2*vals)) - vals. */
208 for (i = 0; i < 64; i++)
209 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
212 for (i = 0; i < 64; i++)
/* Sparse pattern: j is in 1..10 and that many writes go to random positions
 * (positions may repeat, so fewer than j entries can end up non-zero).
 * NOTE(review): the index and the value both call av_lfg_get() inside one
 * assignment; C does not specify which call runs first, so the generated
 * block may differ between compilers. */
217 j = av_lfg_get(prng) % 10 + 1;
218 for (i = 0; i < j; i++)
219 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
/* Two-entry pattern: block[0] is (random % (16*vals)) - 8*vals and block[63]
 * is the inverted low bit of block[0]. */
222 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
223 block[63] = (block[0] & 1) ^ 1;
/* Copy the 64 entries of src into dst, placing src[i] at the destination
 * index required by the layout tag perm (an enum formattag value).
 * dst and src must be distinct buffers, since entries are scattered.
 * NOTE(review): the declaration of i, the final else branch header and the
 * body of the last loop are not visible in this view. */
228 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
/* Table-driven mapping through idct_mmx_perm[] (set up by idct_mmx_init()). */
232 if (perm == MMX_PERM) {
233 for (i = 0; i < 64; i++)
234 dst[idct_mmx_perm[i]] = src[i];
/* Table-driven mapping through the constant idct_simple_mmx_perm[]. */
235 } else if (perm == MMX_SIMPLE_PERM) {
236 for (i = 0; i < 64; i++)
237 dst[idct_simple_mmx_perm[i]] = src[i];
/* Keep index bits 3-5, remap the low three bits through idct_sse2_row_perm[]. */
238 } else if (perm == SSE2_PERM) {
239 for (i = 0; i < 64; i++)
240 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/* Keep index bits 2 and 5 (mask 0x24); exchange bits 0-1 with bits 3-4. */
241 } else if (perm == PARTTRANS_PERM) {
242 for (i = 0; i < 64; i++)
243 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* Exchange the upper three index bits with the lower three. */
244 } else if (perm == TRANSPOSE_PERM) {
245 for (i = 0; i < 64; i++)
246 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
248 for (i = 0; i < 64; i++)
/* Measure one table entry against the double-precision reference transform
 * over NB_ITS generated blocks, print the error statistics, and run a timing
 * section afterwards.
 *
 * dct      table entry to test (name, format and nonspec are read here)
 * test     input pattern selector, forwarded to init_block()
 * is_idct  non-zero selects ff_ref_idct as reference, zero ff_ref_fdct
 * speed    timing request flag (its test is not visible in this view)
 * bits     input amplitude: vals = 1 << bits is forwarded to init_block()
 *
 * NOTE(review): several statements are not visible in this view, including
 * the calls to the routine under test and to the reference, the updates of
 * err_inf/err2/err_sum/blockSumErr, and every return statement. main() ORs
 * the return value into its exit status. */
253 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
255 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
258 int64_t err2, ti, ti1, it1, err_sum = 0;
259 int64_t sysErr[64], sysErrMax = 0;
261 int blockSumErrMax = 0, blockSumErr;
263 const int vals=1<<bits;
/* Fixed seed: every run draws the same sequence of test blocks. */
267 av_lfg_init(&prng, 1);
271 for (i = 0; i < 64; i++)
273 for (it = 0; it < NB_ITS; it++) {
/* Generate the input in block1, then hand a copy in the entry's coefficient
 * layout to block. */
274 init_block(block1, test, is_idct, &prng, vals);
275 permute(block, block1, dct->format);
/* SCALE_PERM entries: multiply each coefficient by
 * 8 * 2^(AANSCALE_BITS + 11) / ff_aanscales[i] and shift the product right by
 * AANSCALE_BITS. */
280 if (dct->format == SCALE_PERM) {
281 for (i = 0; i < 64; i++) {
282 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
283 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* Per-coefficient comparison of block against block1.
 * NOTE(review): block1 presumably holds the reference output at this point;
 * the call that produces it is not visible here. */
290 for (i = 0; i < 64; i++) {
291 int err = block[i] - block1[i];
/* Signed accumulation per position, so a consistent bias at one coefficient
 * does not cancel against other positions. */
297 sysErr[i] += block[i] - block1[i];
/* Track the largest output magnitude seen. */
299 if (abs(block[i]) > maxout)
300 maxout = abs(block[i]);
/* Keep the worst per-block error sum. */
302 if (blockSumErrMax < blockSumErr)
303 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated bias over the 64 positions. */
305 for (i = 0; i < 64; i++)
306 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
/* Dump the accumulated bias of every position. */
308 for (i = 0; i < 64; i++) {
311 printf("%7d ", (int) sysErr[i]);
/* Averages over all NB_ITS * 64 compared coefficients. */
315 omse = (double) err2 / NB_ITS / 64;
316 ome = (double) err_sum / NB_ITS / 64;
/* Accuracy limits are applied to inverse transforms only: peak error above 1,
 * omse above 0.02, or |ome| above 0.0015 counts as a spec failure. */
318 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
320 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
321 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
322 omse, ome, (double) sysErrMax / NB_ITS,
323 maxout, blockSumErrMax);
/* A spec failure only matters for entries whose nonspec member is zero.
 * NOTE(review): the statement controlled by this condition is not visible. */
325 if (spec_err && !dct->nonspec)
/* Timing section: build one input and keep its permuted form in block1 so it
 * can be restored before each call. */
333 init_block(block, test, is_idct, &prng, vals);
334 permute(block1, block, dct->format);
339 for (it = 0; it < NB_ITS_SPEED; it++) {
/* block is the file-scope array, so sizeof(block) covers all 64 entries. */
340 memcpy(block, block1, sizeof(block));
/* Keep running batches until 1000000 av_gettime() units have elapsed; the
 * kdct/s formula below treats ti1 as microseconds. */
345 ti1 = av_gettime() - ti;
346 } while (ti1 < 1000000);
348 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
349 (double) it1 * 1000.0 / (double) ti1);
/* 64-byte output buffers for idct248_error(), both written with a line size
 * of 8: img_dest receives the output of the routine under test, img_dest1 the
 * output of idct248_ref(). */
354 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
355 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Double-precision reference used by idct248_error().
 * Reads 64 coefficients from block (row stride 8) and writes an 8x8 array of
 * bytes to dest with row stride linesize.
 * Visible stages: build the cosine tables, combine each pair of adjacent
 * rows into a scaled sum and difference, apply an 8-point transform along
 * every row, apply 4-point transforms down every column separately for even
 * and odd rows, then clamp, round and store.
 * NOTE(review): several lines are not visible in this view, including the
 * declarations of s, sum, v, i, j and k, any guard around the table setup,
 * the assignment of s before the row-pair stage, and the lower clamp. */
357 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine tables have static storage and therefore persist between calls. */
360 static double c8[8][8];
361 static double c4[4][4];
362 double block1[64], block2[64], block3[64];
/* c8[i][j] = s * cos(pi * i * (j + 0.5) / 8) with s = sqrt(1/8) for i == 0
 * and sqrt(1/4) otherwise. sum accumulates the squared entries of row i; no
 * use of that value is visible here. */
369 for (i = 0; i < 8; i++) {
371 for (j = 0; j < 8; j++) {
372 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
373 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
374 sum += c8[i][j] * c8[i][j];
/* Same construction for the 4-point table, with s = sqrt(1/4) for i == 0 and
 * sqrt(1/2) otherwise. */
378 for (i = 0; i < 4; i++) {
380 for (j = 0; j < 4; j++) {
381 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
382 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
383 sum += c4[i][j] * c4[i][j];
/* Row pairs (2i, 2i+1): even row receives (a + b) * s, odd row (a - b) * s. */
390 for (i = 0; i < 4; i++) {
391 for (j = 0; j < 8; j++) {
392 block1[8 * (2 * i) + j] =
393 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
394 block1[8 * (2 * i + 1) + j] =
395 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point transform along row i: block2[i][j] = sum over k of
 * c8[k][j] * block1[i][k]. */
400 for (i = 0; i < 8; i++) {
401 for (j = 0; j < 8; j++) {
403 for (k = 0; k < 8; k++)
404 sum += c8[k][j] * block1[8 * i + k];
405 block2[8 * i + j] = sum;
/* 4-point transforms down column i. The first inner loop reads even rows
 * (2k) and writes even rows (2j); the second does the same for odd rows. */
410 for (i = 0; i < 8; i++) {
411 for (j = 0; j < 4; j++) {
414 for (k = 0; k < 4; k++)
415 sum += c4[k][j] * block2[8 * (2 * k) + i];
416 block3[8 * (2 * j) + i] = sum;
420 for (k = 0; k < 4; k++)
421 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
422 block3[8 * (2 * j + 1) + i] = sum;
426 /* clamp and store the result */
427 for (i = 0; i < 8; i++) {
428 for (j = 0; j < 8; j++) {
429 v = block3[8 * i + j];
/* Upper bound of the clamp; the value is then rounded with rint() and stored
 * as a byte. */
431 else if (v > 255) v = 255;
432 dest[i * linesize + j] = (int) rint(v);
/* Compare an idct248 "put" routine against idct248_ref() over NB_ITS
 * generated blocks, print the result under the label name, and run a timing
 * section.
 * NOTE(review): the rest of the parameter list, the updates of err_max, the
 * loops around the two grid printf calls and the start of the timing loop
 * are not visible in this view. */
437 static void idct248_error(const char *name,
438 void (*idct248_put)(uint8_t *dest, int line_size,
/* NOTE(review): ti and ti1 are plain int here although they receive
 * av_gettime() values; dct_error() stores the same quantities in int64_t.
 * Confirm av_gettime()'s return type and whether truncation is acceptable. */
442 int it, i, it1, ti, ti1, err_max, v;
/* Fixed seed: every run draws the same sequence of test blocks. */
445 av_lfg_init(&prng, 1);
/* NOTE(review): the comment opener on the next line has no matching
 * terminator on that line in this view; confirm its continuation line. */
447 /* just one test to see if code is correct (precision is less
450 for (it = 0; it < NB_ITS; it++) {
451 /* XXX: use forward transform to generate values */
/* Every coefficient is (random % 256) - 128. */
452 for (i = 0; i < 64; i++)
453 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* block is reloaded from block1 before each call, so the reference and the
 * routine under test start from the same input. */
456 for (i = 0; i < 64; i++)
457 block[i] = block1[i];
458 idct248_ref(img_dest1, 8, block);
460 for (i = 0; i < 64; i++)
461 block[i] = block1[i];
462 idct248_put(img_dest, 8, block);
/* v is the absolute difference between the two outputs at position i. */
464 for (i = 0; i < 64; i++) {
465 v = abs((int) img_dest[i] - (int) img_dest1[i]);
467 printf("%d %d\n", img_dest[i], img_dest1[i]);
476 printf(" %3d", img_dest1[i*8+j]);
485 printf(" %3d", img_dest[i*8+j]);
/* The constant condition always selects "IDCT248". */
491 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing section: batches of NB_ITS_SPEED calls, each on a fresh copy of the
 * last generated input. */
499 for (it = 0; it < NB_ITS_SPEED; it++) {
500 for (i = 0; i < 64; i++)
501 block[i] = block1[i];
502 idct248_put(img_dest, 8, block);
/* Keep running batches until 1000000 av_gettime() units have elapsed; the
 * kdct/s formula below treats ti1 as microseconds. */
506 ti1 = av_gettime() - ti;
507 } while (ti1 < 1000000);
509 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
510 (double) it1 * 1000.0 / (double) ti1);
/* Print the usage text to stdout.
 * NOTE(review): main() passes "ih4t" to getopt(), but the visible part of
 * this text has no line for -t and the synopsis lists only [-i]; the end of
 * the string literal is not visible in this view, so confirm whether -t and
 * -4 are described there. */
513 static void help(void)
515 printf("dct-test [-i] [<test-number>] [<bits>]\n"
516 "test-number 0 -> test with random matrixes\n"
517 " 1 -> test with random sparse matrixes\n"
518 " 2 -> do 3. test from mpeg4 std\n"
519 "bits Number of time domain bits to use, 8 is default\n"
520 "-i test IDCT implementations\n"
521 "-4 test IDCT248 implementations\n"
526 #include "compat/getopt.c"
/* Entry point: parse options, then run either the idct248 comparison or every
 * applicable entry of the forward/inverse table.
 * NOTE(review): the option switch, the remaining local declarations, the
 * condition guarding the first atoi() call, the test_248_dct branch header
 * and the return statement are not visible in this view. */
529 int main(int argc, char **argv)
531 int test_idct = 0, test_248_dct = 0;
/* CPU capabilities are queried once and kept in the file-scope cpu_flags. */
538 cpu_flags = av_get_cpu_flags();
/* Accepted option letters: i, h, 4 and t (none takes an argument). */
544 c = getopt(argc, argv, "ih4t");
/* Positional arguments: first is the test number, second (if present) the
 * bit depth. atoi() performs no validation, so non-numeric text yields 0. */
565 test = atoi(argv[optind]);
566 if(optind+1 < argc) bits= atoi(argv[optind+1]);
568 printf("ffmpeg DCT/IDCT test\n");
571 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
/* Pick the inverse or forward table and walk it until an entry whose name is
 * null. */
573 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
574 for (i = 0; algos[i].name; i++)
/* Run an entry only when every flag in its mm_support is set in cpu_flags
 * (an mm_support of 0 always passes). */
575 if (!(~cpu_flags & algos[i].mm_support)) {
/* Any non-zero result is folded into err. */
576 err |= dct_error(&algos[i], test, test_idct, speed, bits);