2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
43 #include "simple_idct.h"
44 #include "aandcttab.h"
47 #include "x86/idct_xvid.h"
/* Prototypes of transform routines referenced by the tables below. Only the
 * declarations appear here; each routine takes a single pointer to int16_t
 * coefficients and returns nothing, which matches the `func` pointer type
 * stored in the algorithm tables. */
51 void ff_bfin_idct(int16_t *block);
52 void ff_bfin_fdct(int16_t *block);
55 void ff_fdct_altivec(int16_t *block);
58 void ff_j_rev_dct_arm(int16_t *data);
59 void ff_simple_idct_arm(int16_t *data);
60 void ff_simple_idct_armv5te(int16_t *data);
61 void ff_simple_idct_armv6(int16_t *data);
62 void ff_simple_idct_neon(int16_t *data);
/* Fields of the algorithm descriptor used by fdct_tab and idct_tab.
 * NOTE(review): the opening of the enclosing struct is not shown here; later
 * code also reads members called name, mm_support and nonspec - confirm
 * their declarations. */
/* transform entry point, operating on a block of int16_t coefficients */
66 void (*func)(int16_t *block);
/* coefficient layout expected by func; permute() and dct_error() branch on
 * this tag */
67 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
68 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
/* Forward-DCT implementations that main() runs when IDCT testing was not
 * requested. Each initializer lists a display name, the transform function,
 * the layout tag and, for some entries, an AV_CPU_FLAG_* value (presumably
 * the mm_support member): main() skips an entry when any of its required
 * bits is absent from the detected CPU flags.
 * NOTE(review): main() stops at the first entry whose name is null; the
 * terminating entry is not shown here - confirm it exists. */
75 static const struct algo fdct_tab[] = {
76 { "REF-DBL", ff_ref_fdct, NO_PERM },
77 { "FAAN", ff_faandct, NO_PERM },
/* SCALE_PERM: dct_error() rescales this output with ff_aanscales */
78 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
79 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
82 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
84 #if HAVE_MMXEXT_INLINE
85 { "MMXEXT", ff_fdct_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT },
88 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
92 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
96 { "BFINfdct", ff_bfin_fdct, NO_PERM },
/* Adapter that gives ff_prores_idct() the one-argument signature stored in
 * the algorithm tables: dst is forwarded together with a function-local
 * static 64-entry qmat array declared through DECLARE_ALIGNED(16, ...).
 * NOTE(review): the code that fills qmat and the body of the trailing
 * 64-iteration loop are not shown here - confirm before relying on this
 * description. */
102 static void ff_prores_idct_wrap(int16_t *dst){
103 DECLARE_ALIGNED(16, static int16_t, qmat)[64];
109 ff_prores_idct(dst, qmat);
110 for(i=0; i<64; i++) {
/* Compiled only when ARCH_X86_64, HAVE_MMX and HAVE_YASM are all set.
 * NOTE(review): the idct_tab entry that refers to the wrapper is guarded by
 * ARCH_X86_64 && HAVE_YASM only; unless an enclosing guard not shown here
 * covers HAVE_MMX, the two conditions can disagree - confirm. */
114 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
115 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
116 int16_t *block, int16_t *qmat);
/* Adapter giving ff_prores_idct_put_10_sse2() the one-argument table
 * signature. It uses two function-local static 64-entry arrays (qmat and
 * tmp) and calls the routine with a linesize of 16.
 * NOTE(review): how tmp and qmat are filled, and what the trailing loop
 * does, is not shown here. */
118 static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst){
119 DECLARE_ALIGNED(16, static int16_t, qmat)[64];
120 DECLARE_ALIGNED(16, static int16_t, tmp)[64];
/* NOTE(review): dst is an int16_t pointer while the prototype above declares
 * uint16_t *dst - confirm whether a cast is intended. */
127 ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
129 for(i=0; i<64; i++) {
/* Inverse-DCT implementations that main() runs when IDCT testing was
 * requested. Initializers follow the same order as in fdct_tab: display
 * name, function, layout tag, optional AV_CPU_FLAG_* requirement, and for
 * some entries a trailing 1 (presumably the nonspec member; dct_error()
 * evaluates `spec_err && !dct->nonspec`).
 * NOTE(review): main() stops at the first entry whose name is null; the
 * terminating entry is not shown here - confirm it exists. */
135 static const struct algo idct_tab[] = {
136 { "FAANI", ff_faanidct, NO_PERM },
137 { "REF-DBL", ff_ref_idct, NO_PERM },
138 { "INT", ff_j_rev_dct, MMX_PERM },
139 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
140 { "PR-C", ff_prores_idct_wrap, NO_PERM, 0, 1 },
143 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
144 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
146 #if HAVE_MMXEXT_INLINE
147 { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
150 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
151 #if ARCH_X86_64 && HAVE_YASM
/* dct_error() recognises this entry by its name and clips the reference
 * output before comparing */
152 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
157 { "BFINidct", ff_bfin_idct, NO_PERM },
161 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
162 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
165 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
168 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
170 #if HAVE_NEON && ARCH_ARM
171 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM, AV_CPU_FLAG_NEON },
/* Shift used by dct_error() when it rescales SCALE_PERM output with
 * ff_aanscales. */
177 #define AANSCALE_BITS 12
/* Number of transform calls per timing batch in the speed loops of
 * dct_error() and idct248_error(). */
180 #define NB_ITS_SPEED 50000
/* Destination index for each source coefficient under MMX_PERM; zero until
 * idct_mmx_init() fills it. */
182 static short idct_mmx_perm[64];
/* Destination index for each source coefficient under MMX_SIMPLE_PERM
 * (permute() writes dst[idct_simple_mmx_perm[i]] = src[i]). */
184 static short idct_simple_mmx_perm[64] = {
185 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
186 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
187 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
188 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
189 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
190 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
191 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
192 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Column remapping applied inside each row under SSE2_PERM: the low three
 * index bits select the entry, the row bits are kept unchanged. */
195 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Build idct_mmx_perm[], the index mapping that permute() applies for
 * MMX_PERM. */
197 static void idct_mmx_init(void)
201 /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
202 for (i = 0; i < 64; i++) {
/* row bits (0x38) are kept; inside the row, bits 1-2 move down to bits 0-1
 * and bit 0 moves up to bit 2 */
203 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* File-scope 64-coefficient work buffers shared by dct_error() and
 * idct248_error(): `block` is the buffer handed to the transform under test,
 * `block1` holds the generated input (and, in dct_error(), the reference
 * result). They are declared with different DECLARE_ALIGNED arguments
 * (16 and 8). */
207 DECLARE_ALIGNED(16, static int16_t, block)[64];
208 DECLARE_ALIGNED(8, static int16_t, block1)[64];
/* Fill `block` with one test input pattern.
 *
 * block   - 64-coefficient destination; cleared to zero first
 * test    - pattern selector (NOTE(review): the dispatch on it is not shown
 *           here; the usage text lists tests 0, 1 and 2 - confirm which
 *           branch belongs to which value)
 * is_idct - not referenced in the lines shown here
 * prng    - generator state consumed through av_lfg_get()
 * vals    - scale of the generated values
 */
210 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
214 memset(block, 0, 64 * sizeof(*block));
/* dense pattern: every coefficient gets av_lfg_get() % (2*vals) minus vals */
218 for (i = 0; i < 64; i++)
219 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
/* second pass over all 64 entries; its body is not shown here */
222 for (i = 0; i < 64; i++)
/* sparse pattern: between 1 and 10 draws, each writing one randomly chosen
 * index (the same index may be drawn more than once) */
227 j = av_lfg_get(prng) % 10 + 1;
228 for (i = 0; i < j; i++) {
229 int idx = av_lfg_get(prng) % 64;
230 block[idx] = av_lfg_get(prng) % (2*vals) -vals;
/* two-coefficient pattern: block[0] is drawn modulo 16*vals and shifted by
 * 8*vals; block[63] is the inverted low bit of block[0] */
234 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
235 block[63] = (block[0] & 1) ^ 1;
/* Copy the 64 coefficients of src into dst, reordered into the layout that
 * the transform under test expects.
 *
 * dst  - destination block
 * src  - source block in natural order
 * perm - layout tag; compared against the formattag enumerators
 */
240 static void permute(int16_t dst[64], const int16_t src[64], int perm)
244 if (perm == MMX_PERM) {
/* mapping table filled by idct_mmx_init() */
245 for (i = 0; i < 64; i++)
246 dst[idct_mmx_perm[i]] = src[i];
247 } else if (perm == MMX_SIMPLE_PERM) {
248 for (i = 0; i < 64; i++)
249 dst[idct_simple_mmx_perm[i]] = src[i];
250 } else if (perm == SSE2_PERM) {
/* row bits kept, column remapped through idct_sse2_row_perm */
251 for (i = 0; i < 64; i++)
252 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
253 } else if (perm == PARTTRANS_PERM) {
/* bits 2 and 5 kept; the low two column bits and the low two row bits
 * trade places */
254 for (i = 0; i < 64; i++)
255 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
256 } else if (perm == TRANSPOSE_PERM) {
/* row and column indices swapped: a full 8x8 transpose */
257 for (i = 0; i < 64; i++)
258 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
/* remaining layouts: loop over all 64 entries; the body is not shown here
 * (presumably a straight copy - TODO confirm) */
260 for (i = 0; i < 64; i++)
/* Compare one transform implementation against the reference transform and
 * print error statistics; the tail of the function contains a timing loop.
 *
 * dct     - table entry describing the implementation under test
 * test    - input pattern selector forwarded to init_block()
 * is_idct - non-zero selects ff_ref_idct as reference, zero ff_ref_fdct
 * speed   - NOTE(review): not referenced in the lines shown here; presumably
 *           gates the timing loop - confirm
 * bits    - inputs are generated with vals = 1 << bits
 *
 * NOTE(review): the return statements are not shown here; main() ORs the
 * result into its error flag.
 */
265 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
267 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
270 int64_t err2, ti, ti1, it1, err_sum = 0;
/* signed error accumulated separately for each of the 64 positions */
271 int64_t sysErr[64], sysErrMax = 0;
273 int blockSumErrMax = 0, blockSumErr;
275 const int vals=1<<bits;
/* the generator is seeded with the constant 1 */
279 av_lfg_init(&prng, 1);
283 for (i = 0; i < 64; i++)
285 for (it = 0; it < NB_ITS; it++) {
/* block1 receives the generated input, block the permuted copy that is
 * handed to the implementation under test */
286 init_block(block1, test, is_idct, &prng, vals);
287 permute(block, block1, dct->format);
/* SCALE_PERM output is rescaled per coefficient with ff_aanscales before it
 * is compared */
292 if (dct->format == SCALE_PERM) {
293 for (i = 0; i < 64; i++) {
294 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
295 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* for the entry named "PR-SSE2" the values in block1 are clipped to
 * [4-512, 1019-512] before the comparison */
300 if (!strcmp(dct->name, "PR-SSE2"))
301 for (i = 0; i < 64; i++)
302 block1[i] = av_clip(block1[i], 4-512, 1019-512);
/* per-coefficient difference between tested output and block1 */
305 for (i = 0; i < 64; i++) {
306 int err = block[i] - block1[i];
312 sysErr[i] += block[i] - block1[i];
/* track the largest output magnitude seen */
314 if (abs(block[i]) > maxout)
315 maxout = abs(block[i]);
317 if (blockSumErrMax < blockSumErr)
318 blockSumErrMax = blockSumErr;
/* largest absolute accumulated error over the 64 positions */
320 for (i = 0; i < 64; i++)
321 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
323 for (i = 0; i < 64; i++) {
326 printf("%7d ", (int) sysErr[i]);
/* err2 and err_sum averaged over NB_ITS blocks of 64 coefficients */
330 omse = (double) err2 / NB_ITS / 64;
331 ome = (double) err_sum / NB_ITS / 64;
/* precision limits are evaluated for inverse transforms only */
333 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
335 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
336 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
337 omse, ome, (double) sysErrMax / NB_ITS,
338 maxout, blockSumErrMax);
/* entries flagged nonspec are exempt from the precision check; the action
 * taken is not shown here */
340 if (spec_err && !dct->nonspec)
/* timing: one input is generated and permuted once, then restored from
 * block1 before each iteration of the batch */
348 init_block(block, test, is_idct, &prng, vals);
349 permute(block1, block, dct->format);
354 for (it = 0; it < NB_ITS_SPEED; it++) {
355 memcpy(block, block1, sizeof(block));
/* batches repeat until at least 1000000 av_gettime() units have elapsed */
360 ti1 = av_gettime() - ti;
361 } while (ti1 < 1000000);
/* it1 * 1000 / ti1 is in kdct/s if av_gettime() counts microseconds -
 * TODO confirm the unit */
363 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
364 (double) it1 * 1000.0 / (double) ti1);
/* 64-byte output images used by idct248_error(): img_dest receives the
 * result of the routine under test, img_dest1 the result of idct248_ref().
 * Both are addressed with a line size of 8. */
369 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
370 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/* Floating-point reference used by idct248_error(): transforms `block` and
 * stores the rounded result as bytes.
 *
 * dest     - output image; element (i, j) is written to dest[i * linesize + j]
 * linesize - distance in bytes between output rows
 * block    - 64 input coefficients, read only in the lines shown here
 */
372 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* cosine tables; static, so they persist between calls
 * (NOTE(review): any build-once guard is not shown here) */
375 static double c8[8][8];
376 static double c4[4][4];
377 double block1[64], block2[64], block3[64];
/* 8-point basis: scale sqrt(1/8) for i == 0, sqrt(1/4) otherwise */
384 for (i = 0; i < 8; i++) {
386 for (j = 0; j < 8; j++) {
387 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
388 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
389 sum += c8[i][j] * c8[i][j];
/* 4-point basis: scale sqrt(1/4) for i == 0, sqrt(1/2) otherwise */
393 for (i = 0; i < 4; i++) {
395 for (j = 0; j < 4; j++) {
396 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
397 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
398 sum += c4[i][j] * c4[i][j];
/* each pair of adjacent rows (2i, 2i+1) is replaced by its scaled sum and
 * difference; the value given to s for this step is not shown here */
405 for (i = 0; i < 4; i++) {
406 for (j = 0; j < 8; j++) {
407 block1[8 * (2 * i) + j] =
408 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
409 block1[8 * (2 * i + 1) + j] =
410 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point transform along every row: block1 -> block2 */
415 for (i = 0; i < 8; i++) {
416 for (j = 0; j < 8; j++) {
418 for (k = 0; k < 8; k++)
419 sum += c8[k][j] * block1[8 * i + k];
420 block2[8 * i + j] = sum;
/* for every column i: one 4-point transform over the even rows and one over
 * the odd rows, written back to even and odd rows of block3 respectively */
425 for (i = 0; i < 8; i++) {
426 for (j = 0; j < 4; j++) {
429 for (k = 0; k < 4; k++)
430 sum += c4[k][j] * block2[8 * (2 * k) + i];
431 block3[8 * (2 * j) + i] = sum;
435 for (k = 0; k < 4; k++)
436 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
437 block3[8 * (2 * j + 1) + i] = sum;
441 /* clamp and store the result */
442 for (i = 0; i < 8; i++) {
443 for (j = 0; j < 8; j++) {
444 v = block3[8 * i + j];
/* upper bound 255; the matching lower-bound test is not shown here */
446 else if (v > 255) v = 255;
447 dest[i * linesize + j] = (int) rint(v);
/* Compare an idct248 "put" routine against idct248_ref() and print the
 * largest pixel difference; the tail of the function contains a timing loop.
 *
 * name        - label printed with the results
 * idct248_put - routine under test, called as idct248_put(img_dest, 8, block)
 *
 * NOTE(review): the rest of the parameter list is not shown here; main()
 * passes a third argument (speed).
 */
452 static void idct248_error(const char *name,
453 void (*idct248_put)(uint8_t *dest, int line_size,
/* NOTE(review): ti and ti1 are plain int here, whereas dct_error() keeps the
 * same av_gettime() differences in int64_t - confirm this cannot truncate. */
457 int it, i, it1, ti, ti1, err_max, v;
/* the generator is seeded with the constant 1 */
460 av_lfg_init(&prng, 1);
462 /* just one test to see if code is correct (precision is less
465 for (it = 0; it < NB_ITS; it++) {
466 /* XXX: use forward transform to generate values */
/* input coefficients: av_lfg_get() % 256 minus 128 */
467 for (i = 0; i < 64; i++)
468 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* reference result into img_dest1, working on a copy of the input */
471 for (i = 0; i < 64; i++)
472 block[i] = block1[i];
473 idct248_ref(img_dest1, 8, block);
/* result of the routine under test into img_dest, from a fresh copy */
475 for (i = 0; i < 64; i++)
476 block[i] = block1[i];
477 idct248_put(img_dest, 8, block);
/* absolute per-pixel difference between the two images */
479 for (i = 0; i < 64; i++) {
480 v = abs((int) img_dest[i] - (int) img_dest1[i]);
482 printf("%d %d\n", img_dest[i], img_dest1[i]);
/* dumps of the reference image and of the tested image; the enclosing loops
 * and their condition are not shown here */
491 printf(" %3d", img_dest1[i*8+j]);
500 printf(" %3d", img_dest[i*8+j]);
/* the constant condition always selects "IDCT248" */
506 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* timing: the input is restored from block1 before every call */
514 for (it = 0; it < NB_ITS_SPEED; it++) {
515 for (i = 0; i < 64; i++)
516 block[i] = block1[i];
517 idct248_put(img_dest, 8, block);
/* batches repeat until at least 1000000 av_gettime() units have elapsed */
521 ti1 = av_gettime() - ti;
522 } while (ti1 < 1000000);
/* it1 * 1000 / ti1 is in kdct/s if av_gettime() counts microseconds -
 * TODO confirm the unit */
524 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
525 (double) it1 * 1000.0 / (double) ti1);
/* Print the command-line usage text to standard output: the optional test
 * number (0, 1 or 2), the optional bit count and the option flags.
 * NOTE(review): main() passes "ih4t" to getopt(); confirm that every
 * accepted option is described in the part of the text not shown here. */
528 static void help(void)
530 printf("dct-test [-i] [<test-number>] [<bits>]\n"
531 "test-number 0 -> test with random matrixes\n"
532 " 1 -> test with random sparse matrixes\n"
533 " 2 -> do 3. test from mpeg4 std\n"
534 "bits Number of time domain bits to use, 8 is default\n"
535 "-i test IDCT implementations\n"
536 "-4 test IDCT248 implementations\n"
541 #include "compat/getopt.c"
/* Program entry point: read the CPU flags and the command line, then run
 * either the IDCT248 check or every entry of the selected table that the
 * CPU supports, and report the combined error flag.
 * NOTE(review): option handling, the branch choosing between the two modes
 * and the return statement are not shown here. */
544 int main(int argc, char **argv)
546 int test_idct = 0, test_248_dct = 0;
553 cpu_flags = av_get_cpu_flags();
/* accepted option letters: i, h, 4 and t; none of them takes an argument */
559 c = getopt(argc, argv, "ih4t");
/* positional arguments: the test number, then optionally the bit count */
580 test = atoi(argv[optind]);
581 if(optind+1 < argc) bits= atoi(argv[optind+1]);
583 printf("ffmpeg DCT/IDCT test\n");
586 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
/* iterate the chosen table until the entry whose name is null */
588 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
589 for (i = 0; algos[i].name; i++)
/* run an entry only when every bit of its mm_support is present in
 * cpu_flags */
590 if (!(~cpu_flags & algos[i].mm_support)) {
591 err |= dct_error(&algos[i], test, test_idct, speed, bits);
596 printf("Error: %d.\n", err);