2 * (c) 2001 Fabrice Bellard
3 * 2007 Marc Hoffman <marc.hoffman@analog.com>
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * DCT test (c) 2001 Fabrice Bellard
25 * Started from sample code by Juan J. Sierralta P.
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
44 #include "simple_idct.h"
45 #include "aandcttab.h"
51 #include "x86/idct_xvid.h"
52 #include "x86/simple_idct.h"
/* Fields of the table entry type used by fdct_tab and idct_tab below. */
/* Entry point of the transform under test; it receives a pointer to a
 * block of int16_t coefficients. */
57 void (*func)(int16_t *block);
/* Coefficient ordering the implementation expects on input; dct_error()
 * hands this value to permute() before running the transform. */
58 enum idct_permutation_type perm_type;
/*
 * Forward DCT implementations that main() runs when IDCT testing is not
 * selected. Each entry lists a display name, the function, the permutation
 * type and, for the SIMD variants, the CPU flag that main() requires before
 * running the entry. The first entry is the same ff_ref_fdct that
 * dct_error() uses as its reference.
 */
63 static const struct algo fdct_tab[] = {
64 { "REF-DBL", ff_ref_fdct, FF_IDCT_PERM_NONE },
65 { "FAAN", ff_faandct, FF_IDCT_PERM_NONE },
66 { "IJG-AAN-INT", ff_fdct_ifast, FF_IDCT_PERM_NONE },
67 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, FF_IDCT_PERM_NONE },
/* Entries below carry a CPU flag in the fourth field. */
70 { "MMX", ff_fdct_mmx, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX },
72 #if HAVE_MMXEXT_INLINE
73 { "MMXEXT", ff_fdct_mmxext, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMXEXT },
76 { "SSE2", ff_fdct_sse2, FF_IDCT_PERM_NONE, AV_CPU_FLAG_SSE2 },
80 { "altivecfdct", ff_fdct_altivec, FF_IDCT_PERM_NONE, AV_CPU_FLAG_ALTIVEC },
/*
 * Adapter that gives ff_prores_idct() the single-argument signature used by
 * the test tables. It supplies a static, 16-byte aligned, 64-entry qmat as
 * the second argument.
 * NOTE(review): how qmat is filled is on lines not shown here - confirm.
 */
86 static void ff_prores_idct_wrap(int16_t *dst){
87 DECLARE_ALIGNED(16, static int16_t, qmat)[64];
93 ff_prores_idct(dst, qmat);
/* Only built on x86-64 with MMX and yasm support available. */
98 #if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
/* Local prototype of the routine wrapped below. */
99 void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
100 int16_t *block, int16_t *qmat);
/*
 * Adapter that gives ff_prores_idct_put_10_sse2() the single-argument
 * signature used by the test tables. The coefficients go in through the
 * static tmp buffer, and dst is passed as the output with a line size of 16.
 * NOTE(review): the setup of qmat and tmp and the body of the trailing
 * 64-element loop are on lines not shown here - confirm.
 */
102 static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst){
103 DECLARE_ALIGNED(16, static int16_t, qmat)[64];
104 DECLARE_ALIGNED(16, static int16_t, tmp)[64];
111 ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);
113 for(i=0; i<64; i++) {
/*
 * Inverse DCT implementations that main() runs when IDCT testing is
 * selected. The layout matches fdct_tab: display name, function, permutation
 * type expected on input, then an optional CPU flag that main() requires.
 * NOTE(review): the trailing 1 on some entries is presumably the nonspec
 * field that dct_error() reads - confirm against the struct definition.
 */
119 static const struct algo idct_tab[] = {
120 { "FAANI", ff_faanidct, FF_IDCT_PERM_NONE },
121 { "REF-DBL", ff_ref_idct, FF_IDCT_PERM_NONE },
122 { "INT", ff_j_rev_dct, FF_IDCT_PERM_LIBMPEG2 },
123 { "SIMPLE-C", ff_simple_idct_8, FF_IDCT_PERM_NONE },
124 { "PR-C", ff_prores_idct_wrap, FF_IDCT_PERM_NONE, 0, 1 },
/* x86 SIMD variants, each tied to a CPU flag. */
127 { "SIMPLE-MMX", ff_simple_idct_mmx, FF_IDCT_PERM_SIMPLE, AV_CPU_FLAG_MMX },
128 { "XVID-MMX", ff_idct_xvid_mmx, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMX, 1 },
130 #if HAVE_MMXEXT_INLINE
131 { "XVID-MMXEXT", ff_idct_xvid_mmxext, FF_IDCT_PERM_NONE, AV_CPU_FLAG_MMXEXT, 1 },
134 { "XVID-SSE2", ff_idct_xvid_sse2, FF_IDCT_PERM_SSE2, AV_CPU_FLAG_SSE2, 1 },
135 #if ARCH_X86_64 && HAVE_YASM
/* This name is special-cased in dct_error(), which clips block1 for it. */
136 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, FF_IDCT_PERM_TRANSPOSE, AV_CPU_FLAG_SSE2, 1 },
/* ARM variants. */
141 { "SIMPLE-ARM", ff_simple_idct_arm, FF_IDCT_PERM_NONE },
142 { "INT-ARM", ff_j_rev_dct_arm, FF_IDCT_PERM_LIBMPEG2 },
145 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te, FF_IDCT_PERM_NONE, AV_CPU_FLAG_ARMV5TE },
148 { "SIMPLE-ARMV6", ff_simple_idct_armv6, FF_IDCT_PERM_LIBMPEG2, AV_CPU_FLAG_ARMV6 },
150 #if HAVE_NEON && ARCH_ARM
151 { "SIMPLE-NEON", ff_simple_idct_neon, FF_IDCT_PERM_PARTTRANS, AV_CPU_FLAG_NEON },
/* Fixed-point shift used by dct_error() when it rescales the coefficients
 * of the IJG-AAN-INT entry with ff_aanscales. */
157 #define AANSCALE_BITS 12
/* Number of transform calls per batch in the timing loops of dct_error()
 * and idct248_error(). */
160 #define NB_ITS_SPEED 50000
/*
 * Scatter table for FF_IDCT_PERM_SIMPLE: permute() stores source coefficient
 * i at destination index idct_simple_mmx_perm[i].
 */
162 static short idct_simple_mmx_perm[64] = {
163 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
164 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
165 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
166 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
167 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
168 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
169 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
170 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Column remap for FF_IDCT_PERM_SSE2: permute() keeps the row bits
 * (i & 0x38) and sends column (i & 7) through this table. */
173 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Shared 64-coefficient work buffers used by dct_error() and
 * idct248_error(). block is 16-byte aligned, block1 is 8-byte aligned. */
175 DECLARE_ALIGNED(16, static int16_t, block)[64];
176 DECLARE_ALIGNED(8, static int16_t, block1)[64];
/*
 * Fill block with one of several pseudo-random input patterns.
 *
 * block    destination, 64 coefficients
 * test     pattern selector
 * is_idct  NOTE(review): its use is on lines not shown here - confirm
 * prng     generator state read through av_lfg_get()
 * vals     magnitude scale for the generated values
 *
 * NOTE(review): the dispatch on test is on lines not shown here, so which
 * selector value picks which pattern below needs confirming.
 */
178 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
/* Start from an all-zero block. */
182 memset(block, 0, 64 * sizeof(*block));
/* Dense pattern: every coefficient is a random value reduced modulo
 * 2*vals and then shifted down by vals. */
186 for (i = 0; i < 64; i++)
187 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
190 for (i = 0; i < 64; i++)
/* Sparse pattern: between 1 and 10 randomly chosen positions receive a
 * random value; a position may be picked more than once. */
195 j = av_lfg_get(prng) % 10 + 1;
196 for (i = 0; i < j; i++) {
197 int idx = av_lfg_get(prng) % 64;
198 block[idx] = av_lfg_get(prng) % (2*vals) -vals;
/* Two-coefficient pattern: a random value in block[0], and block[63] set
 * to the inverse of the lowest bit of block[0]. */
202 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
203 block[63] = (block[0] & 1) ^ 1;
/*
 * Scatter the 64 coefficients of src into dst in the order required by the
 * given permutation type. Indices are treated as row bits (0x38) and column
 * bits (0x07) of an 8x8 block.
 * NOTE(review): the switch scaffolding and the body of the final loop are
 * on lines not shown here; the last loop is presumably a plain copy for the
 * remaining permutation types - confirm.
 */
208 static void permute(int16_t dst[64], const int16_t src[64],
209 enum idct_permutation_type perm_type)
/* Keep the row; rotate the three column bits right by one position. */
214 case FF_IDCT_PERM_LIBMPEG2:
215 for (i = 0; i < 64; i++)
216 dst[(i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2)] = src[i];
/* Full table lookup. */
218 case FF_IDCT_PERM_SIMPLE:
219 for (i = 0; i < 64; i++)
220 dst[idct_simple_mmx_perm[i]] = src[i];
/* Keep the row; remap the column through idct_sse2_row_perm. */
222 case FF_IDCT_PERM_SSE2:
223 for (i = 0; i < 64; i++)
224 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
/* Keep bits 2 and 5; exchange the low two column bits with the low two
 * row bits. */
226 case FF_IDCT_PERM_PARTTRANS:
227 for (i = 0; i < 64; i++)
228 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
/* Swap row and column: a full 8x8 transpose. */
230 case FF_IDCT_PERM_TRANSPOSE:
231 for (i = 0; i < 64; i++)
232 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
235 for (i = 0; i < 64; i++)
/*
 * Compare one table entry against the double-precision reference transform
 * over NB_ITS generated blocks, print its error statistics and time it.
 *
 * dct      table entry under test (name, perm_type and nonspec are read)
 * test     input pattern selector, forwarded to init_block()
 * is_idct  nonzero: reference is ff_ref_idct; zero: ff_ref_fdct
 * speed    NOTE(review): its use is on lines not shown here; presumably it
 *          gates the timing section - confirm
 * bits     input magnitude scale, used as vals = 1 << bits
 *
 * main() ORs the return value into its overall error flag.
 * NOTE(review): the calls to the transform and to the reference, and the
 * updates of err_inf, err2, err_sum and blockSumErr, are on lines not shown
 * here - confirm against the full function.
 */
241 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
243 void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
246 int64_t err2, ti, ti1, it1, err_sum = 0;
247 int64_t sysErr[64], sysErrMax = 0;
249 int blockSumErrMax = 0, blockSumErr;
251 const int vals=1<<bits;
/* The generator is always seeded with the constant 1. */
255 av_lfg_init(&prng, 1);
259 for (i = 0; i < 64; i++)
261 for (it = 0; it < NB_ITS; it++) {
/* Generate the input in natural order in block1, then lay it out in block
 * in the order this implementation expects. */
262 init_block(block1, test, is_idct, &prng, vals);
263 permute(block, block1, dct->perm_type);
/* The IJG-AAN-INT entry gets every coefficient of block rescaled by a
 * factor derived from ff_aanscales[i]. */
268 if (!strcmp(dct->name, "IJG-AAN-INT")) {
269 for (i = 0; i < 64; i++) {
270 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
271 block[i] = (block[i] * scale) >> AANSCALE_BITS;
/* For the PR-SSE2 entry the values in block1 are clipped to the range
 * 4-512 .. 1019-512 before the comparison. */
276 if (!strcmp(dct->name, "PR-SSE2"))
277 for (i = 0; i < 64; i++)
278 block1[i] = av_clip(block1[i], 4-512, 1019-512);
/* Per-coefficient comparison of block against block1. */
281 for (i = 0; i < 64; i++) {
282 int err = block[i] - block1[i];
/* Signed error accumulated per coefficient position across iterations. */
288 sysErr[i] += block[i] - block1[i];
/* Track the largest absolute value seen in block. */
290 if (abs(block[i]) > maxout)
291 maxout = abs(block[i]);
293 if (blockSumErrMax < blockSumErr)
294 blockSumErrMax = blockSumErr;
/* Largest absolute accumulated error over all 64 positions. */
296 for (i = 0; i < 64; i++)
297 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
299 for (i = 0; i < 64; i++) {
302 printf("%7d ", (int) sysErr[i]);
/* Averages over all NB_ITS * 64 coefficients. */
306 omse = (double) err2 / NB_ITS / 64;
307 ome = (double) err_sum / NB_ITS / 64;
/* Thresholds are applied to IDCTs only; forward DCTs never set spec_err. */
309 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
311 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
312 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
313 omse, ome, (double) sysErrMax / NB_ITS,
314 maxout, blockSumErrMax);
/* Entries marked nonspec are exempt from this check. */
316 if (spec_err && !dct->nonspec)
/* Timing section. The buffer roles are swapped here: block is generated,
 * and block1 holds the permuted copy restored before every call. */
324 init_block(block, test, is_idct, &prng, vals);
325 permute(block1, block, dct->perm_type);
327 ti = av_gettime_relative();
330 for (it = 0; it < NB_ITS_SPEED; it++) {
331 memcpy(block, block1, sizeof(block));
/* Keep running batches until at least 1000000 time units have elapsed
 * (presumably microseconds - confirm against av_gettime_relative). */
336 ti1 = av_gettime_relative() - ti;
337 } while (ti1 < 1000000);
339 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
340 (double) it1 * 1000.0 / (double) ti1);
/* 8x8 pixel outputs compared by idct248_error(): img_dest receives the
 * output of the implementation under test, img_dest1 that of idct248_ref(). */
345 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
346 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
/*
 * Floating-point reference for the 2-4-8 inverse transform, writing 8x8
 * clamped pixels.
 *
 * dest      output pixels, row i starts at dest + i * linesize
 * linesize  distance between output rows, in bytes
 * block     64 input coefficients, read only
 *
 * NOTE(review): the guard around the table setup, the value given to s
 * before the butterfly stage and the lower clamp are on lines not shown
 * here - confirm against the full function.
 */
348 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
/* Cosine basis tables for the 8-point and 4-point stages; static, so they
 * persist across calls. */
351 static double c8[8][8];
352 static double c4[4][4];
353 double block1[64], block2[64], block3[64];
/* Build the 8-point basis: row 0 is scaled by sqrt(1/8), the other rows by
 * sqrt(1/4). sum accumulates the squared entries. */
360 for (i = 0; i < 8; i++) {
362 for (j = 0; j < 8; j++) {
363 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
364 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
365 sum += c8[i][j] * c8[i][j];
/* Build the 4-point basis the same way, with sqrt(1/4) and sqrt(1/2). */
369 for (i = 0; i < 4; i++) {
371 for (j = 0; j < 4; j++) {
372 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
373 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
374 sum += c4[i][j] * c4[i][j];
/* Butterfly on each pair of adjacent rows: row 2i becomes the scaled sum,
 * row 2i+1 the scaled difference. */
381 for (i = 0; i < 4; i++) {
382 for (j = 0; j < 8; j++) {
383 block1[8 * (2 * i) + j] =
384 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
385 block1[8 * (2 * i + 1) + j] =
386 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
/* 8-point stage along every row: block2 row i is block1 row i multiplied
 * by the transpose of c8. */
391 for (i = 0; i < 8; i++) {
392 for (j = 0; j < 8; j++) {
394 for (k = 0; k < 8; k++)
395 sum += c8[k][j] * block1[8 * i + k];
396 block2[8 * i + j] = sum;
/* 4-point stage along every column i, run separately over the even rows
 * and over the odd rows. */
401 for (i = 0; i < 8; i++) {
402 for (j = 0; j < 4; j++) {
405 for (k = 0; k < 4; k++)
406 sum += c4[k][j] * block2[8 * (2 * k) + i];
407 block3[8 * (2 * j) + i] = sum;
411 for (k = 0; k < 4; k++)
412 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
413 block3[8 * (2 * j + 1) + i] = sum;
417 /* clamp and store the result */
418 for (i = 0; i < 8; i++) {
419 for (j = 0; j < 8; j++) {
420 v = block3[8 * i + j];
422 else if (v > 255) v = 255;
/* Round to the nearest integer before storing as a byte. */
423 dest[i * linesize + j] = (int) rint(v);
/*
 * Compare a 2-4-8 IDCT-put implementation against idct248_ref() on NB_ITS
 * random blocks, print the maximum pixel difference and time it.
 *
 * name         label printed in the report lines
 * idct248_put  implementation under test, called as (dest, line_size, block)
 *
 * NOTE(review): the remaining parameters, the err_max update and the
 * condition guarding the timing section are on lines not shown here.
 */
428 static void idct248_error(const char *name,
429 void (*idct248_put)(uint8_t *dest, int line_size,
/* NOTE(review): ti and ti1 are plain int here, while dct_error() keeps the
 * same av_gettime_relative() values in int64_t - confirm the narrowing is
 * harmless. */
433 int it, i, it1, ti, ti1, err_max, v;
/* The generator is always seeded with the constant 1. */
436 av_lfg_init(&prng, 1);
438 /* just one test to see if code is correct (precision is less
441 for (it = 0; it < NB_ITS; it++) {
442 /* XXX: use forward transform to generate values */
/* Random coefficients in the range -128 .. 127, kept in block1. */
443 for (i = 0; i < 64; i++)
444 block1[i] = av_lfg_get(&prng) % 256 - 128;
/* Reference output goes to img_dest1; block is refilled from block1 before
 * each of the two calls. */
447 for (i = 0; i < 64; i++)
448 block[i] = block1[i];
449 idct248_ref(img_dest1, 8, block);
/* Output of the implementation under test goes to img_dest. */
451 for (i = 0; i < 64; i++)
452 block[i] = block1[i];
453 idct248_put(img_dest, 8, block);
/* Absolute per-pixel difference between the two outputs. */
455 for (i = 0; i < 64; i++) {
456 v = abs((int) img_dest[i] - (int) img_dest1[i]);
458 printf("%d %d\n", img_dest[i], img_dest1[i]);
467 printf(" %3d", img_dest1[i*8+j]);
476 printf(" %3d", img_dest[i*8+j]);
/* The condition is the constant 1, so the label is always IDCT248. */
482 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
/* Timing section: batches of NB_ITS_SPEED calls, each on a fresh copy of
 * block1, repeated until at least 1000000 time units have elapsed. */
487 ti = av_gettime_relative();
490 for (it = 0; it < NB_ITS_SPEED; it++) {
491 for (i = 0; i < 64; i++)
492 block[i] = block1[i];
493 idct248_put(img_dest, 8, block);
497 ti1 = av_gettime_relative() - ti;
498 } while (ti1 < 1000000);
500 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
501 (double) it1 * 1000.0 / (double) ti1);
/*
 * Print the command-line usage summary to stdout: the optional test number
 * and bits arguments and the -i and -4 switches.
 * NOTE(review): the end of the format string and anything after the printf
 * call are on lines not shown here.
 */
504 static void help(void)
506 printf("dct-test [-i] [<test-number>] [<bits>]\n"
507 "test-number 0 -> test with random matrixes\n"
508 " 1 -> test with random sparse matrixes\n"
509 " 2 -> do 3. test from mpeg4 std\n"
510 "bits Number of time domain bits to use, 8 is default\n"
511 "-i test IDCT implementations\n"
512 "-4 test IDCT248 implementations\n"
517 #include "compat/getopt.c"
/*
 * Entry point: parse the options and positional arguments, then run either
 * the 2-4-8 IDCT check or every applicable entry of the selected table.
 * NOTE(review): the option handling, the guards around the argv accesses
 * and the branch choosing between the two modes are on lines not shown
 * here - confirm against the full function.
 */
520 int main(int argc, char **argv)
522 int test_idct = 0, test_248_dct = 0;
/* Accepted option letters are i, h, 4 and t. */
532 c = getopt(argc, argv, "ih4t");
/* First positional argument selects the test pattern, the second one the
 * number of bits. */
553 test = atoi(argv[optind]);
554 if(optind+1 < argc) bits= atoi(argv[optind+1]);
556 printf("ffmpeg DCT/IDCT test\n");
/* 2-4-8 mode checks the single SIMPLE-C implementation. */
559 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
561 const int cpu_flags = av_get_cpu_flags();
562 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
/* Walk the table until an entry with a null name. An entry runs only when
 * every bit of its cpu_flag is present in cpu_flags, which is always the
 * case for entries with cpu_flag 0. */
563 for (i = 0; algos[i].name; i++)
564 if (!(~cpu_flags & algos[i].cpu_flag)) {
/* Failures from all entries are ORed into err. */
565 err |= dct_error(&algos[i], test, test_idct, speed, bits);
570 printf("Error: %d.\n", err);