2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
35 #include "libavutil/cpu.h"
36 #include "libavutil/common.h"
37 #include "libavutil/lfg.h"
39 #include "simple_idct.h"
40 #include "aandcttab.h"
43 #include "x86/idct_xvid.h"
/* Prototypes for architecture-specific transform routines. Each takes a
 * pointer to DCTELEM coefficients and is referenced from fdct_tab or
 * idct_tab below. */
48 void ff_mmx_idct(DCTELEM *data);
49 void ff_mmxext_idct(DCTELEM *data);
/* used by the "BFINidct" and "BFINfdct" table entries */
52 void ff_bfin_idct(DCTELEM *block);
53 void ff_bfin_fdct(DCTELEM *block);
/* used by the "altivecfdct" table entry */
56 void ff_fdct_altivec(DCTELEM *block);
57 //void ff_idct_altivec(DCTELEM *block);?? no routine
/* used by the "INT-ARM" and "SIMPLE-ARM*" / "SIMPLE-NEON" table entries */
60 void ff_j_rev_dct_arm(DCTELEM *data);
61 void ff_simple_idct_arm(DCTELEM *data);
62 void ff_simple_idct_armv5te(DCTELEM *data);
63 void ff_simple_idct_armv6(DCTELEM *data);
64 void ff_simple_idct_neon(DCTELEM *data);
/* used by the "SIMPLE-ALPHA" table entry */
66 void ff_simple_idct_axp(DCTELEM *data);
/* Transform entry point of a table entry; takes a pointer to DCTELEM
 * coefficients. */
70 void (*func)(DCTELEM *block);
/* Coefficient ordering / scaling convention of the routine. permute()
 * reorders its input according to this tag, and dct_error() rescales the
 * output when the tag is SCALE_PERM. */
71 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
72 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
/* FAAN_SCALE is the format tag given to the "FAAN" entry of fdct_tab.
 * It is SCALE_PERM when FAAN_POSTSCALE is not defined; the NO_PERM
 * definition presumably belongs to the alternative branch - confirm. */
77 #ifndef FAAN_POSTSCALE
78 #define FAAN_SCALE SCALE_PERM
80 #define FAAN_SCALE NO_PERM
/* Forward DCT implementations under test. main() walks this table when
 * IDCT testing is not selected. Each initializer lists a printable name,
 * the routine and its format tag; some entries add an AV_CPU_FLAG_*
 * value (presumably the mm_support member that main() checks against the
 * detected CPU flags - confirm against the struct definition). */
85 static const struct algo fdct_tab[] = {
86 { "REF-DBL", ff_ref_fdct, NO_PERM },
87 { "FAAN", ff_faandct, FAAN_SCALE },
88 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
89 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
92 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
93 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
94 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
98 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
102 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* External routine wrapped below: takes a destination, a line size, a
 * coefficient block and a quantisation matrix. */
109 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
110 DCTELEM *block, int16_t *qmat);
/* Adapter that gives ff_prores_idct_put_10_sse2() a single-pointer
 * signature so it can be listed in idct_tab as "PR-SSE2".
 * NOTE(review): the parameter is uint16_t * while the table slot is
 * void (*)(DCTELEM *); unless DCTELEM is uint16_t this is a function
 * pointer type mismatch - confirm. */
112 static void ff_prores_idct_put_10_sse2_wrap(uint16_t *dst){
113 int16_t qmat[64]; int i;
/* dst doubles as the output buffer; the line size passed is 16 and the
 * coefficients come from the separate tmp buffer */
120 ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
/* Inverse DCT implementations under test. main() walks this table when
 * IDCT testing is selected. Each initializer lists a printable name, the
 * routine and the format tag that permute() applies to its input. Some
 * entries add an AV_CPU_FLAG_* value and a trailing 1; the trailing 1 is
 * presumably the nonspec member that dct_error() consults before acting
 * on an accuracy failure - confirm against the struct definition. */
124 static const struct algo idct_tab[] = {
125 { "FAANI", ff_faanidct, NO_PERM },
126 { "REF-DBL", ff_ref_idct, NO_PERM },
127 { "INT", ff_j_rev_dct, MMX_PERM },
128 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
132 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
133 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
135 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
136 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
137 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
138 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
140 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
145 { "BFINidct", ff_bfin_idct, NO_PERM },
149 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
150 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
153 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
156 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
159 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
163 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
/* Fixed-point shift used by dct_error() when it rescales the output of
 * SCALE_PERM routines with the ff_aanscales table. */
169 #define AANSCALE_BITS 12
/* Return the time reported by gettimeofday() as one 64-bit count of
 * microseconds (seconds * 1000000 + microseconds). The return value of
 * gettimeofday() itself is not checked. */
171 static int64_t gettime(void)
174 gettimeofday(&tv, NULL);
/* widen tv_sec before multiplying so the product is computed in 64 bits */
175 return (int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
/* Number of passes of the inner loop in each timing round of dct_error()
 * and idct248_error(). */
179 #define NB_ITS_SPEED 50000
/* Destination index for each source coefficient under MMX_PERM; filled at
 * run time by idct_mmx_init(). */
181 static short idct_mmx_perm[64];
/* Destination index for each source coefficient under MMX_SIMPLE_PERM. */
183 static short idct_simple_mmx_perm[64] = {
184 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
185 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
186 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
187 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
188 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
189 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
190 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
191 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Under SSE2_PERM permute() keeps the row bits of the index and replaces
 * the low three bits (position within the row) through this table. */
194 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Fill idct_mmx_perm[], the index mapping that permute() applies for
 * MMX_PERM. */
196 static void idct_mmx_init(void)
200 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
201 for (i = 0; i < 64; i++) {
/* bits 3-5 of the index are kept; within the low three bits, bits 1-2
 * move down to bits 0-1 and bit 0 moves up to bit 2 */
202 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* Static 64-coefficient work buffers shared by dct_error() and
 * idct248_error(). The first DECLARE_ALIGNED argument is presumably the
 * requested alignment in bytes (16 for block, 8 for block1) - confirm
 * against the macro definition. */
206 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
207 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
/* Issue the x86 "emms" instruction, but only when the detected cpu_flags
 * include AV_CPU_FLAG_MMX. */
209 static inline void mmx_emms(void)
212 if (cpu_flags & AV_CPU_FLAG_MMX)
213 __asm__ volatile ("emms\n\t");
/*
 * Fill a 64-coefficient block with test input drawn from prng.
 *
 * block   - buffer to fill; it is zeroed first
 * test    - pattern selector. NOTE(review): the code that branches on it is
 *           not visible here; the usage text lists 0 = random, 1 = random
 *           sparse, 2 = third test from the MPEG-4 standard - confirm which
 *           statements below belong to which value
 * is_idct - NOTE(review): no use is visible here - confirm
 * prng    - pseudo-random generator state passed to av_lfg_get()
 * vals    - amplitude; generated values are centred on zero
 */
217 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
221 memset(block, 0, 64 * sizeof(*block));
/* dense pattern: every coefficient gets a value in -vals .. vals-1 */
225 for (i = 0; i < 64; i++)
226 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
229 for (i = 0; i < 64; i++)
/* sparse pattern: between 1 and 10 writes at random positions (positions
 * may repeat), each with a value in -vals .. vals-1 */
234 j = av_lfg_get(prng) % 10 + 1;
235 for (i = 0; i < j; i++)
236 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
/* first coefficient in -8*vals .. 8*vals-1; the last coefficient is set
 * to the inverse of the lowest bit of the first one */
239 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
240 block[63] = (block[0] & 1) ^ 1;
/*
 * Copy the 64 coefficients of src into dst, reordered for the layout that
 * a given transform routine expects.
 *
 * dst  - destination block, written at the permuted index
 * src  - source block, read in natural order
 * perm - one of the enum formattag values; values without their own branch
 *        fall through to the final loop (presumably a plain copy - confirm)
 */
245 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
/* table-driven mapping filled by idct_mmx_init() */
249 if (perm == MMX_PERM) {
250 for (i = 0; i < 64; i++)
251 dst[idct_mmx_perm[i]] = src[i];
/* table-driven mapping from the static idct_simple_mmx_perm[] table */
252 } else if (perm == MMX_SIMPLE_PERM) {
253 for (i = 0; i < 64; i++)
254 dst[idct_simple_mmx_perm[i]] = src[i];
/* row (bits 3-5) kept, position within the row remapped by table */
255 } else if (perm == SSE2_PERM) {
256 for (i = 0; i < 64; i++)
257 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/* bits 2 and 5 kept; the low two column bits and the low two row bits
 * trade places */
258 } else if (perm == PARTTRANS_PERM) {
259 for (i = 0; i < 64; i++)
260 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* row and column indexes swapped: 8x8 transpose */
261 } else if (perm == TRANSPOSE_PERM) {
262 for (i = 0; i < 64; i++)
263 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
265 for (i = 0; i < 64; i++)
/*
 * Run one transform implementation on pseudo-random blocks, compare the
 * results with the reference transform, print accuracy statistics and a
 * throughput figure.
 *
 * dct     - table entry under test; name, format and nonspec are read
 * test    - pattern selector handed to init_block()
 * is_idct - nonzero: reference is ff_ref_idct and the spec_err check is
 *           armed; zero: reference is ff_ref_fdct
 * speed   - NOTE(review): no use is visible here; presumably gates the
 *           timing loop - confirm
 * bits    - input amplitude is 1 << bits
 *
 * NOTE(review): the return statements are not visible here; main() ORs the
 * result into its error flag.
 */
270 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
272 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
275 int64_t err2, ti, ti1, it1, err_sum = 0;
276 int64_t sysErr[64], sysErrMax = 0;
278 int blockSumErrMax = 0, blockSumErr;
280 const int vals=1<<bits;
/* the generator is always seeded with the constant 1 */
284 av_lfg_init(&prng, 1);
288 for (i = 0; i < 64; i++)
290 for (it = 0; it < NB_ITS; it++) {
/* natural-order input goes to block1, the permuted copy to block */
291 init_block(block1, test, is_idct, &prng, vals);
292 permute(block, block1, dct->format);
/* SCALE_PERM output is multiplied by 8 * 2^(AANSCALE_BITS + 11) /
 * ff_aanscales[i] and shifted right by AANSCALE_BITS */
297 if (dct->format == SCALE_PERM) {
298 for (i = 0; i < 64; i++) {
299 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
300 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* per-coefficient difference between block and block1 */
307 for (i = 0; i < 64; i++) {
308 int err = block[i] - block1[i];
/* signed error accumulated separately for each coefficient position */
314 sysErr[i] += block[i] - block1[i];
/* largest absolute value seen in block */
316 if (abs(block[i]) > maxout)
317 maxout = abs(block[i]);
319 if (blockSumErrMax < blockSumErr)
320 blockSumErrMax = blockSumErr;
/* largest magnitude among the per-position accumulated errors */
322 for (i = 0; i < 64; i++)
323 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
325 for (i = 0; i < 64; i++) {
328 printf("%7d ", (int) sysErr[i]);
/* err2 and err_sum divided by the total number of coefficients
 * (NB_ITS blocks of 64) */
332 omse = (double) err2 / NB_ITS / 64;
333 ome = (double) err_sum / NB_ITS / 64;
/* accuracy limits are applied to inverse transforms only */
335 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
337 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
338 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
339 omse, ome, (double) sysErrMax / NB_ITS,
340 maxout, blockSumErrMax);
/* entries flagged nonspec are exempt from the accuracy failure */
342 if (spec_err && !dct->nonspec)
/* timing input: note the roles are swapped here, block holds the natural
 * order data and block1 the permuted copy */
350 init_block(block, test, is_idct, &prng, vals);
351 permute(block1, block, dct->format);
/* block is restored from block1 on every pass */
356 for (it = 0; it < NB_ITS_SPEED; it++) {
357 memcpy(block, block1, sizeof(block));
/* gettime() counts microseconds, so rounds repeat until at least one
 * second has elapsed */
361 ti1 = gettime() - ti;
362 } while (ti1 < 1000000);
/* it1 * 1000 / microseconds; presumably it1 counts transforms, giving
 * thousands of transforms per second - confirm */
365 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
366 (double) it1 * 1000.0 / (double) ti1);
/* 64-byte output buffers used by idct248_error(): img_dest receives the
 * output of the routine under test, img_dest1 the output of idct248_ref(). */
371 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
372 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/*
 * Floating-point reference for the 2-4-8 inverse transform, used by
 * idct248_error() as the expected result.
 *
 * dest     - output pixels, one byte each
 * linesize - distance in bytes between output rows
 * block    - 64 input coefficients, read as 8 rows of 8
 */
374 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* basis tables are static, so they persist between calls */
377 static double c8[8][8];
378 static double c4[4][4];
379 double block1[64], block2[64], block3[64];
/* 8-point cosine basis: sqrt(1/8) for row 0, sqrt(1/4) for the others */
386 for (i = 0; i < 8; i++) {
388 for (j = 0; j < 8; j++) {
389 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
390 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
391 sum += c8[i][j] * c8[i][j];
/* 4-point cosine basis: sqrt(1/4) for row 0, sqrt(1/2) for the others */
395 for (i = 0; i < 4; i++) {
397 for (j = 0; j < 4; j++) {
398 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
399 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
400 sum += c4[i][j] * c4[i][j];
/* stage 1: each pair of adjacent rows is replaced by its scaled sum
 * (even row) and scaled difference (odd row) */
407 for (i = 0; i < 4; i++) {
408 for (j = 0; j < 8; j++) {
409 block1[8 * (2 * i) + j] =
410 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
411 block1[8 * (2 * i + 1) + j] =
412 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* stage 2: 8-point transform along every row using c8 */
417 for (i = 0; i < 8; i++) {
418 for (j = 0; j < 8; j++) {
420 for (k = 0; k < 8; k++)
421 sum += c8[k][j] * block1[8 * i + k];
422 block2[8 * i + j] = sum;
/* stage 3: for every column, one 4-point transform over the even rows and
 * one over the odd rows, both using c4 */
427 for (i = 0; i < 8; i++) {
428 for (j = 0; j < 4; j++) {
431 for (k = 0; k < 4; k++)
432 sum += c4[k][j] * block2[8 * (2 * k) + i];
433 block3[8 * (2 * j) + i] = sum;
437 for (k = 0; k < 4; k++)
438 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
439 block3[8 * (2 * j + 1) + i] = sum;
443 /* clamp and store the result */
444 for (i = 0; i < 8; i++) {
445 for (j = 0; j < 8; j++) {
446 v = block3[8 * i + j];
448 else if (v > 255) v = 255;
/* rounded with rint() before the conversion to a byte */
449 dest[i * linesize + j] = (int) rint(v);
/*
 * Compare a 2-4-8 IDCT "put" routine with idct248_ref() on pseudo-random
 * blocks, print the largest pixel difference, then print a throughput
 * figure.
 *
 * name        - label printed with the results
 * idct248_put - routine under test; called with an 8-byte line size
 *
 * NOTE(review): ti and ti1 are declared int while gettime() returns
 * int64_t; a microsecond timestamp does not fit in a 32-bit int, and
 * dct_error() uses int64_t for the same variables - confirm and widen.
 */
454 static void idct248_error(const char *name,
455 void (*idct248_put)(uint8_t *dest, int line_size,
459 int it, i, it1, ti, ti1, err_max, v;
462 av_lfg_init(&prng, 1);
464 /* just one test to see if code is correct (precision is less
467 for (it = 0; it < NB_ITS; it++) {
468 /* XXX: use forward transform to generate values */
/* input coefficients in -128 .. 127 */
469 for (i = 0; i < 64; i++)
470 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* block is reloaded from block1 before each of the two transforms, so
 * both start from the same coefficients */
473 for (i = 0; i < 64; i++)
474 block[i] = block1[i];
475 idct248_ref(img_dest1, 8, block);
477 for (i = 0; i < 64; i++)
478 block[i] = block1[i];
479 idct248_put(img_dest, 8, block);
/* absolute per-pixel difference between tested and reference output */
481 for (i = 0; i < 64; i++) {
482 v = abs((int) img_dest[i] - (int) img_dest1[i]);
484 printf("%d %d\n", img_dest[i], img_dest1[i]);
493 printf(" %3d", img_dest1[i*8+j]);
502 printf(" %3d", img_dest[i*8+j]);
/* the condition is the constant 1, so "IDCT248" is always printed */
508 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* timing: block is restored from block1 before every call */
516 for (it = 0; it < NB_ITS_SPEED; it++) {
517 for (i = 0; i < 64; i++)
518 block[i] = block1[i];
519 idct248_put(img_dest, 8, block);
/* gettime() counts microseconds, so rounds repeat until at least one
 * second has elapsed */
522 ti1 = gettime() - ti;
523 } while (ti1 < 1000000);
526 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
527 (double) it1 * 1000.0 / (double) ti1);
/* Print the command line usage text to standard output.
 * NOTE(review): main() passes "ih4t" to getopt(); only -i and -4 are
 * described in the lines visible here - confirm that -t and -h are
 * documented too. */
530 static void help(void)
532 printf("dct-test [-i] [<test-number>] [<bits>]\n"
533 "test-number 0 -> test with random matrixes\n"
534 " 1 -> test with random sparse matrixes\n"
535 " 2 -> do 3. test from mpeg4 std\n"
536 "bits Number of time domain bits to use, 8 is default\n"
537 "-i test IDCT implementations\n"
538 "-4 test IDCT248 implementations\n"
/*
 * Program entry point: read the CPU feature flags, parse the command line,
 * then run either the 2-4-8 IDCT check or every applicable entry of the
 * selected transform table.
 */
542 int main(int argc, char **argv)
544 int test_idct = 0, test_248_dct = 0;
/* cpu_flags is also read by mmx_emms() */
551 cpu_flags = av_get_cpu_flags();
/* accepted option letters: i, h, 4 and t, none taking an argument */
557 c = getopt(argc, argv, "ih4t");
/* positional arguments: test number, then optionally the bit count.
 * atoi() reports no errors, so non-numeric text is read as 0 */
578 test = atoi(argv[optind]);
579 if(optind+1 < argc) bits= atoi(argv[optind+1]);
581 printf("ffmpeg DCT/IDCT test\n");
584 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
/* the table is walked until an entry with a null name is reached */
586 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
587 for (i = 0; algos[i].name; i++)
/* run an entry only when every flag in its mm_support is present in
 * cpu_flags */
588 if (!(~cpu_flags & algos[i].mm_support)) {
/* failures from individual entries are collected in err */
589 err |= dct_error(&algos[i], test, test_idct, speed, bits);