]> git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c
Merge commit '18e3d61e9e3b52c177aa7a1f2a054a8a753e1b09'
[ffmpeg] / libavcodec / dsputil.c
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 /**
24  * @file
25  * DSP utils
26  */
27
28 #include "libavutil/attributes.h"
29 #include "libavutil/internal.h"
30 #include "avcodec.h"
31 #include "copy_block.h"
32 #include "dsputil.h"
33 #include "simple_idct.h"
34 #include "mpegvideo.h"
35 #include "config.h"
36
/**
 * Lookup table with 512 entries, zero-initialised here.
 *
 * The SSE routines in this file read it through a pointer offset by 256, so
 * a pixel difference in the range [-255, 255] selects a valid entry.
 * NOTE(review): presumably filled with squared values during DSP
 * initialisation -- confirm against the init code, which is not visible in
 * this part of the file.
 */
uint32_t ff_square_tab[512] = { 0 };
38
39 #define BIT_DEPTH 16
40 #include "dsputilenc_template.c"
41 #undef BIT_DEPTH
42
43 #define BIT_DEPTH 8
44 #include "dsputilenc_template.c"
45
46 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
47                   int line_size, int h)
48 {
49     int s = 0, i;
50     uint32_t *sq = ff_square_tab + 256;
51
52     for (i = 0; i < h; i++) {
53         s    += sq[pix1[0] - pix2[0]];
54         s    += sq[pix1[1] - pix2[1]];
55         s    += sq[pix1[2] - pix2[2]];
56         s    += sq[pix1[3] - pix2[3]];
57         pix1 += line_size;
58         pix2 += line_size;
59     }
60     return s;
61 }
62
63 static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
64                   int line_size, int h)
65 {
66     int s = 0, i;
67     uint32_t *sq = ff_square_tab + 256;
68
69     for (i = 0; i < h; i++) {
70         s    += sq[pix1[0] - pix2[0]];
71         s    += sq[pix1[1] - pix2[1]];
72         s    += sq[pix1[2] - pix2[2]];
73         s    += sq[pix1[3] - pix2[3]];
74         s    += sq[pix1[4] - pix2[4]];
75         s    += sq[pix1[5] - pix2[5]];
76         s    += sq[pix1[6] - pix2[6]];
77         s    += sq[pix1[7] - pix2[7]];
78         pix1 += line_size;
79         pix2 += line_size;
80     }
81     return s;
82 }
83
84 static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
85                    int line_size, int h)
86 {
87     int s = 0, i;
88     uint32_t *sq = ff_square_tab + 256;
89
90     for (i = 0; i < h; i++) {
91         s += sq[pix1[0]  - pix2[0]];
92         s += sq[pix1[1]  - pix2[1]];
93         s += sq[pix1[2]  - pix2[2]];
94         s += sq[pix1[3]  - pix2[3]];
95         s += sq[pix1[4]  - pix2[4]];
96         s += sq[pix1[5]  - pix2[5]];
97         s += sq[pix1[6]  - pix2[6]];
98         s += sq[pix1[7]  - pix2[7]];
99         s += sq[pix1[8]  - pix2[8]];
100         s += sq[pix1[9]  - pix2[9]];
101         s += sq[pix1[10] - pix2[10]];
102         s += sq[pix1[11] - pix2[11]];
103         s += sq[pix1[12] - pix2[12]];
104         s += sq[pix1[13] - pix2[13]];
105         s += sq[pix1[14] - pix2[14]];
106         s += sq[pix1[15] - pix2[15]];
107
108         pix1 += line_size;
109         pix2 += line_size;
110     }
111     return s;
112 }
113
114 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
115                           const uint8_t *s2, int stride)
116 {
117     int i;
118
119     /* read the pixels */
120     for (i = 0; i < 8; i++) {
121         block[0] = s1[0] - s2[0];
122         block[1] = s1[1] - s2[1];
123         block[2] = s1[2] - s2[2];
124         block[3] = s1[3] - s2[3];
125         block[4] = s1[4] - s2[4];
126         block[5] = s1[5] - s2[5];
127         block[6] = s1[6] - s2[6];
128         block[7] = s1[7] - s2[7];
129         s1      += stride;
130         s2      += stride;
131         block   += 8;
132     }
133 }
134
/**
 * Sum the absolute values of the 64 coefficients in block.
 *
 * @param block array of 64 coefficients; it is only read
 * @return sum of |block[i]| for i in 0..63
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    const int16_t *coef      = block;
    const int16_t *const end = block + 64;
    int total = 0;

    while (coef < end) {
        total += FFABS(*coef);
        coef++;
    }
    return total;
}
143
/* Integer averages of two and of four values, rounded to nearest with
 * halves rounding up. Every argument is parenthesised so that an argument
 * containing a lower-precedence operator (|, ?:, <<, ...) still expands to
 * the intended sum; each argument is evaluated exactly once. */
#define avg2(a, b) (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
146
147 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
148                               int line_size, int h)
149 {
150     int s = 0, i;
151
152     for (i = 0; i < h; i++) {
153         s    += abs(pix1[0]  - pix2[0]);
154         s    += abs(pix1[1]  - pix2[1]);
155         s    += abs(pix1[2]  - pix2[2]);
156         s    += abs(pix1[3]  - pix2[3]);
157         s    += abs(pix1[4]  - pix2[4]);
158         s    += abs(pix1[5]  - pix2[5]);
159         s    += abs(pix1[6]  - pix2[6]);
160         s    += abs(pix1[7]  - pix2[7]);
161         s    += abs(pix1[8]  - pix2[8]);
162         s    += abs(pix1[9]  - pix2[9]);
163         s    += abs(pix1[10] - pix2[10]);
164         s    += abs(pix1[11] - pix2[11]);
165         s    += abs(pix1[12] - pix2[12]);
166         s    += abs(pix1[13] - pix2[13]);
167         s    += abs(pix1[14] - pix2[14]);
168         s    += abs(pix1[15] - pix2[15]);
169         pix1 += line_size;
170         pix2 += line_size;
171     }
172     return s;
173 }
174
175 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
176                           int line_size, int h)
177 {
178     int s = 0, i;
179
180     for (i = 0; i < h; i++) {
181         s    += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
182         s    += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
183         s    += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
184         s    += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
185         s    += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
186         s    += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
187         s    += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
188         s    += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
189         s    += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
190         s    += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
191         s    += abs(pix1[10] - avg2(pix2[10], pix2[11]));
192         s    += abs(pix1[11] - avg2(pix2[11], pix2[12]));
193         s    += abs(pix1[12] - avg2(pix2[12], pix2[13]));
194         s    += abs(pix1[13] - avg2(pix2[13], pix2[14]));
195         s    += abs(pix1[14] - avg2(pix2[14], pix2[15]));
196         s    += abs(pix1[15] - avg2(pix2[15], pix2[16]));
197         pix1 += line_size;
198         pix2 += line_size;
199     }
200     return s;
201 }
202
203 static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
204                           int line_size, int h)
205 {
206     int s = 0, i;
207     uint8_t *pix3 = pix2 + line_size;
208
209     for (i = 0; i < h; i++) {
210         s    += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
211         s    += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
212         s    += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
213         s    += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
214         s    += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
215         s    += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
216         s    += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
217         s    += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
218         s    += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
219         s    += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
220         s    += abs(pix1[10] - avg2(pix2[10], pix3[10]));
221         s    += abs(pix1[11] - avg2(pix2[11], pix3[11]));
222         s    += abs(pix1[12] - avg2(pix2[12], pix3[12]));
223         s    += abs(pix1[13] - avg2(pix2[13], pix3[13]));
224         s    += abs(pix1[14] - avg2(pix2[14], pix3[14]));
225         s    += abs(pix1[15] - avg2(pix2[15], pix3[15]));
226         pix1 += line_size;
227         pix2 += line_size;
228         pix3 += line_size;
229     }
230     return s;
231 }
232
233 static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
234                            int line_size, int h)
235 {
236     int s = 0, i;
237     uint8_t *pix3 = pix2 + line_size;
238
239     for (i = 0; i < h; i++) {
240         s    += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
241         s    += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
242         s    += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
243         s    += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
244         s    += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
245         s    += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
246         s    += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
247         s    += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
248         s    += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
249         s    += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
250         s    += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
251         s    += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
252         s    += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
253         s    += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
254         s    += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
255         s    += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
256         pix1 += line_size;
257         pix2 += line_size;
258         pix3 += line_size;
259     }
260     return s;
261 }
262
263 static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
264                              int line_size, int h)
265 {
266     int s = 0, i;
267
268     for (i = 0; i < h; i++) {
269         s    += abs(pix1[0] - pix2[0]);
270         s    += abs(pix1[1] - pix2[1]);
271         s    += abs(pix1[2] - pix2[2]);
272         s    += abs(pix1[3] - pix2[3]);
273         s    += abs(pix1[4] - pix2[4]);
274         s    += abs(pix1[5] - pix2[5]);
275         s    += abs(pix1[6] - pix2[6]);
276         s    += abs(pix1[7] - pix2[7]);
277         pix1 += line_size;
278         pix2 += line_size;
279     }
280     return s;
281 }
282
283 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
284                          int line_size, int h)
285 {
286     int s = 0, i;
287
288     for (i = 0; i < h; i++) {
289         s    += abs(pix1[0] - avg2(pix2[0], pix2[1]));
290         s    += abs(pix1[1] - avg2(pix2[1], pix2[2]));
291         s    += abs(pix1[2] - avg2(pix2[2], pix2[3]));
292         s    += abs(pix1[3] - avg2(pix2[3], pix2[4]));
293         s    += abs(pix1[4] - avg2(pix2[4], pix2[5]));
294         s    += abs(pix1[5] - avg2(pix2[5], pix2[6]));
295         s    += abs(pix1[6] - avg2(pix2[6], pix2[7]));
296         s    += abs(pix1[7] - avg2(pix2[7], pix2[8]));
297         pix1 += line_size;
298         pix2 += line_size;
299     }
300     return s;
301 }
302
303 static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
304                          int line_size, int h)
305 {
306     int s = 0, i;
307     uint8_t *pix3 = pix2 + line_size;
308
309     for (i = 0; i < h; i++) {
310         s    += abs(pix1[0] - avg2(pix2[0], pix3[0]));
311         s    += abs(pix1[1] - avg2(pix2[1], pix3[1]));
312         s    += abs(pix1[2] - avg2(pix2[2], pix3[2]));
313         s    += abs(pix1[3] - avg2(pix2[3], pix3[3]));
314         s    += abs(pix1[4] - avg2(pix2[4], pix3[4]));
315         s    += abs(pix1[5] - avg2(pix2[5], pix3[5]));
316         s    += abs(pix1[6] - avg2(pix2[6], pix3[6]));
317         s    += abs(pix1[7] - avg2(pix2[7], pix3[7]));
318         pix1 += line_size;
319         pix2 += line_size;
320         pix3 += line_size;
321     }
322     return s;
323 }
324
325 static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
326                           int line_size, int h)
327 {
328     int s = 0, i;
329     uint8_t *pix3 = pix2 + line_size;
330
331     for (i = 0; i < h; i++) {
332         s    += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
333         s    += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
334         s    += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
335         s    += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
336         s    += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
337         s    += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
338         s    += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
339         s    += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
340         pix1 += line_size;
341         pix2 += line_size;
342         pix3 += line_size;
343     }
344     return s;
345 }
346
347 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
348 {
349     int score1 = 0, score2 = 0, x, y;
350
351     for (y = 0; y < h; y++) {
352         for (x = 0; x < 16; x++)
353             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
354         if (y + 1 < h) {
355             for (x = 0; x < 15; x++)
356                 score2 += FFABS(s1[x]     - s1[x + stride] -
357                                 s1[x + 1] + s1[x + stride + 1]) -
358                           FFABS(s2[x]     - s2[x + stride] -
359                                 s2[x + 1] + s2[x + stride + 1]);
360         }
361         s1 += stride;
362         s2 += stride;
363     }
364
365     if (c)
366         return score1 + FFABS(score2) * c->avctx->nsse_weight;
367     else
368         return score1 + FFABS(score2) * 8;
369 }
370
371 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
372 {
373     int score1 = 0, score2 = 0, x, y;
374
375     for (y = 0; y < h; y++) {
376         for (x = 0; x < 8; x++)
377             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
378         if (y + 1 < h) {
379             for (x = 0; x < 7; x++)
380                 score2 += FFABS(s1[x]     - s1[x + stride] -
381                                 s1[x + 1] + s1[x + stride + 1]) -
382                           FFABS(s2[x]     - s2[x + stride] -
383                                 s2[x + 1] + s2[x + stride + 1]);
384         }
385         s1 += stride;
386         s2 += stride;
387     }
388
389     if (c)
390         return score1 + FFABS(score2) * c->avctx->nsse_weight;
391     else
392         return score1 + FFABS(score2) * 8;
393 }
394
395 static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
396                     int stride, int h)
397 {
398     return 0;
399 }
400
401 void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
402 {
403     int i;
404
405     memset(cmp, 0, sizeof(void *) * 6);
406
407     for (i = 0; i < 6; i++) {
408         switch (type & 0xFF) {
409         case FF_CMP_SAD:
410             cmp[i] = c->sad[i];
411             break;
412         case FF_CMP_SATD:
413             cmp[i] = c->hadamard8_diff[i];
414             break;
415         case FF_CMP_SSE:
416             cmp[i] = c->sse[i];
417             break;
418         case FF_CMP_DCT:
419             cmp[i] = c->dct_sad[i];
420             break;
421         case FF_CMP_DCT264:
422             cmp[i] = c->dct264_sad[i];
423             break;
424         case FF_CMP_DCTMAX:
425             cmp[i] = c->dct_max[i];
426             break;
427         case FF_CMP_PSNR:
428             cmp[i] = c->quant_psnr[i];
429             break;
430         case FF_CMP_BIT:
431             cmp[i] = c->bit[i];
432             break;
433         case FF_CMP_RD:
434             cmp[i] = c->rd[i];
435             break;
436         case FF_CMP_VSAD:
437             cmp[i] = c->vsad[i];
438             break;
439         case FF_CMP_VSSE:
440             cmp[i] = c->vsse[i];
441             break;
442         case FF_CMP_ZERO:
443             cmp[i] = zero_cmp;
444             break;
445         case FF_CMP_NSSE:
446             cmp[i] = c->nsse[i];
447             break;
448 #if CONFIG_DWT
449         case FF_CMP_W53:
450             cmp[i]= c->w53[i];
451             break;
452         case FF_CMP_W97:
453             cmp[i]= c->w97[i];
454             break;
455 #endif
456         default:
457             av_log(NULL, AV_LOG_ERROR,
458                    "internal error in cmp function selection\n");
459         }
460     }
461 }
462
/* Two-point butterfly with separate outputs: o1 = i1 + i2, o2 = i1 - i2.
 * Wrapped in do { } while (0) so that the macro behaves as one statement,
 * e.g. as the body of an unbraced if. i1 and i2 are each evaluated twice. */
#define BUTTERFLY2(o1, o2, i1, i2)              \
    do {                                        \
        (o1) = (i1) + (i2);                     \
        (o2) = (i1) - (i2);                     \
    } while (0)

/* In-place two-point butterfly: x becomes x + y and y becomes x - y, both
 * computed from the values held on entry. The temporaries carry names that
 * are unlikely to clash with the caller's variables, so arguments called
 * a or b are not shadowed. */
#define BUTTERFLY1(x, y)                        \
    do {                                        \
        const int bfly_first_  = (x);           \
        const int bfly_second_ = (y);           \
        (x) = bfly_first_ + bfly_second_;       \
        (y) = bfly_first_ - bfly_second_;       \
    } while (0)

/* Final butterfly stage folded into a sum of magnitudes: |x + y| + |x - y|.
 * Both arguments are evaluated more than once. */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
477
478 static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
479                                uint8_t *src, int stride, int h)
480 {
481     int i, temp[64], sum = 0;
482
483     av_assert2(h == 8);
484
485     for (i = 0; i < 8; i++) {
486         // FIXME: try pointer walks
487         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
488                    src[stride * i + 0] - dst[stride * i + 0],
489                    src[stride * i + 1] - dst[stride * i + 1]);
490         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
491                    src[stride * i + 2] - dst[stride * i + 2],
492                    src[stride * i + 3] - dst[stride * i + 3]);
493         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
494                    src[stride * i + 4] - dst[stride * i + 4],
495                    src[stride * i + 5] - dst[stride * i + 5]);
496         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
497                    src[stride * i + 6] - dst[stride * i + 6],
498                    src[stride * i + 7] - dst[stride * i + 7]);
499
500         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
501         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
502         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
503         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
504
505         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
506         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
507         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
508         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
509     }
510
511     for (i = 0; i < 8; i++) {
512         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
513         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
514         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
515         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
516
517         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
518         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
519         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
520         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
521
522         sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
523                BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
524                BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
525                BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
526     }
527     return sum;
528 }
529
530 static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
531                                 uint8_t *dummy, int stride, int h)
532 {
533     int i, temp[64], sum = 0;
534
535     av_assert2(h == 8);
536
537     for (i = 0; i < 8; i++) {
538         // FIXME: try pointer walks
539         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
540                    src[stride * i + 0], src[stride * i + 1]);
541         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
542                    src[stride * i + 2], src[stride * i + 3]);
543         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
544                    src[stride * i + 4], src[stride * i + 5]);
545         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
546                    src[stride * i + 6], src[stride * i + 7]);
547
548         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
549         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
550         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
551         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
552
553         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
554         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
555         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
556         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
557     }
558
559     for (i = 0; i < 8; i++) {
560         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
561         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
562         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
563         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
564
565         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
566         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
567         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
568         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
569
570         sum +=
571             BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
572             + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
573             + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
574             + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
575     }
576
577     sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
578
579     return sum;
580 }
581
582 static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
583                         uint8_t *src2, int stride, int h)
584 {
585     LOCAL_ALIGNED_16(int16_t, temp, [64]);
586
587     av_assert2(h == 8);
588
589     s->dsp.diff_pixels(temp, src1, src2, stride);
590     s->fdsp.fdct(temp);
591     return s->dsp.sum_abs_dctelem(temp);
592 }
593
#if CONFIG_GPL
/*
 * One 8-point integer transform pass built only from additions,
 * subtractions and shifts. The caller defines SRC(x) to read input x and
 * DST(x, v) to consume output x. All eight inputs are read into constants
 * before the first DST, so SRC and DST may refer to the same storage.
 */
#define DCT8_1D                                         \
    {                                                   \
        const int s07 = SRC(0) + SRC(7);                \
        const int s16 = SRC(1) + SRC(6);                \
        const int s25 = SRC(2) + SRC(5);                \
        const int s34 = SRC(3) + SRC(4);                \
        const int a0  = s07 + s34;                      \
        const int a1  = s16 + s25;                      \
        const int a2  = s07 - s34;                      \
        const int a3  = s16 - s25;                      \
        const int d07 = SRC(0) - SRC(7);                \
        const int d16 = SRC(1) - SRC(6);                \
        const int d25 = SRC(2) - SRC(5);                \
        const int d34 = SRC(3) - SRC(4);                \
        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
        DST(0, a0 + a1);                                \
        DST(1, a4 + (a7 >> 2));                         \
        DST(2, a2 + (a3 >> 1));                         \
        DST(3, a5 + (a6 >> 2));                         \
        DST(4, a0 - a1);                                \
        DST(5, a6 - (a5 >> 2));                         \
        DST(6, (a2 >> 1) - a3);                         \
        DST(7, (a4 >> 2) - a7);                         \
    }

/**
 * Sum of absolute coefficients of the DCT8_1D transform applied to the
 * difference of two 8x8 blocks.
 *
 * @param s      context providing dsp.diff_pixels
 * @param src1   first pixel of the first block
 * @param src2   first pixel of the second block
 * @param stride byte offset between consecutive rows of both blocks
 * @param h      unused
 * @return sum of the absolute values of the 64 outputs of the column pass
 */
static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    int16_t dct[8][8];
    int k, score = 0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* Row pass: transform every row of the difference block in place. */
#define SRC(x) dct[k][x]
#define DST(x, v) dct[k][x] = v
    for (k = 0; k < 8; k++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* Column pass: the outputs are not stored, only their magnitudes are
     * accumulated. */
#define SRC(x) dct[x][k]
#define DST(x, v) score += FFABS(v)
    for (k = 0; k < 8; k++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return score;
}
#endif
647
648 static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
649                         uint8_t *src2, int stride, int h)
650 {
651     LOCAL_ALIGNED_16(int16_t, temp, [64]);
652     int sum = 0, i;
653
654     av_assert2(h == 8);
655
656     s->dsp.diff_pixels(temp, src1, src2, stride);
657     s->fdsp.fdct(temp);
658
659     for (i = 0; i < 64; i++)
660         sum = FFMAX(sum, FFABS(temp[i]));
661
662     return sum;
663 }
664
665 static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
666                            uint8_t *src2, int stride, int h)
667 {
668     LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
669     int16_t *const bak = temp + 64;
670     int sum = 0, i;
671
672     av_assert2(h == 8);
673     s->mb_intra = 0;
674
675     s->dsp.diff_pixels(temp, src1, src2, stride);
676
677     memcpy(bak, temp, 64 * sizeof(int16_t));
678
679     s->block_last_index[0 /* FIXME */] =
680         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
681     s->dct_unquantize_inter(s, temp, 0, s->qscale);
682     ff_simple_idct_8(temp); // FIXME
683
684     for (i = 0; i < 64; i++)
685         sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
686
687     return sum;
688 }
689
690 static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
691                    int stride, int h)
692 {
693     const uint8_t *scantable = s->intra_scantable.permutated;
694     LOCAL_ALIGNED_16(int16_t, temp, [64]);
695     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
696     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
697     int i, last, run, bits, level, distortion, start_i;
698     const int esc_length = s->ac_esc_length;
699     uint8_t *length, *last_length;
700
701     av_assert2(h == 8);
702
703     copy_block8(lsrc1, src1, 8, stride, 8);
704     copy_block8(lsrc2, src2, 8, stride, 8);
705
706     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
707
708     s->block_last_index[0 /* FIXME */] =
709     last                               =
710         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
711
712     bits = 0;
713
714     if (s->mb_intra) {
715         start_i     = 1;
716         length      = s->intra_ac_vlc_length;
717         last_length = s->intra_ac_vlc_last_length;
718         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
719     } else {
720         start_i     = 0;
721         length      = s->inter_ac_vlc_length;
722         last_length = s->inter_ac_vlc_last_length;
723     }
724
725     if (last >= start_i) {
726         run = 0;
727         for (i = start_i; i < last; i++) {
728             int j = scantable[i];
729             level = temp[j];
730
731             if (level) {
732                 level += 64;
733                 if ((level & (~127)) == 0)
734                     bits += length[UNI_AC_ENC_INDEX(run, level)];
735                 else
736                     bits += esc_length;
737                 run = 0;
738             } else
739                 run++;
740         }
741         i = scantable[last];
742
743         level = temp[i] + 64;
744
745         av_assert2(level - 64);
746
747         if ((level & (~127)) == 0) {
748             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
749         } else
750             bits += esc_length;
751     }
752
753     if (last >= 0) {
754         if (s->mb_intra)
755             s->dct_unquantize_intra(s, temp, 0, s->qscale);
756         else
757             s->dct_unquantize_inter(s, temp, 0, s->qscale);
758     }
759
760     s->idsp.idct_add(lsrc2, 8, temp);
761
762     distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
763
764     return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
765 }
766
767 static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
768                     int stride, int h)
769 {
770     const uint8_t *scantable = s->intra_scantable.permutated;
771     LOCAL_ALIGNED_16(int16_t, temp, [64]);
772     int i, last, run, bits, level, start_i;
773     const int esc_length = s->ac_esc_length;
774     uint8_t *length, *last_length;
775
776     av_assert2(h == 8);
777
778     s->dsp.diff_pixels(temp, src1, src2, stride);
779
780     s->block_last_index[0 /* FIXME */] =
781     last                               =
782         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
783
784     bits = 0;
785
786     if (s->mb_intra) {
787         start_i     = 1;
788         length      = s->intra_ac_vlc_length;
789         last_length = s->intra_ac_vlc_last_length;
790         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
791     } else {
792         start_i     = 0;
793         length      = s->inter_ac_vlc_length;
794         last_length = s->inter_ac_vlc_last_length;
795     }
796
797     if (last >= start_i) {
798         run = 0;
799         for (i = start_i; i < last; i++) {
800             int j = scantable[i];
801             level = temp[j];
802
803             if (level) {
804                 level += 64;
805                 if ((level & (~127)) == 0)
806                     bits += length[UNI_AC_ENC_INDEX(run, level)];
807                 else
808                     bits += esc_length;
809                 run = 0;
810             } else
811                 run++;
812         }
813         i = scantable[last];
814
815         level = temp[i] + 64;
816
817         av_assert2(level - 64);
818
819         if ((level & (~127)) == 0)
820             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
821         else
822             bits += esc_length;
823     }
824
825     return bits;
826 }
827
828 #define VSAD_INTRA(size)                                                \
829 static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
830                                     uint8_t *s, uint8_t *dummy,         \
831                                     int stride, int h)                  \
832 {                                                                       \
833     int score = 0, x, y;                                                \
834                                                                         \
835     for (y = 1; y < h; y++) {                                           \
836         for (x = 0; x < size; x += 4) {                                 \
837             score += FFABS(s[x]     - s[x + stride])     +              \
838                      FFABS(s[x + 1] - s[x + stride + 1]) +              \
839                      FFABS(s[x + 2] - s[x + 2 + stride]) +              \
840                      FFABS(s[x + 3] - s[x + 3 + stride]);               \
841         }                                                               \
842         s += stride;                                                    \
843     }                                                                   \
844                                                                         \
845     return score;                                                       \
846 }
847 VSAD_INTRA(8)
848 VSAD_INTRA(16)
849
850 #define VSAD(size)                                                             \
851 static int vsad ## size ## _c(MpegEncContext *c,                               \
852                               uint8_t *s1, uint8_t *s2,                        \
853                               int stride, int h)                               \
854 {                                                                              \
855     int score = 0, x, y;                                                       \
856                                                                                \
857     for (y = 1; y < h; y++) {                                                  \
858         for (x = 0; x < size; x++)                                             \
859             score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);   \
860         s1 += stride;                                                          \
861         s2 += stride;                                                          \
862     }                                                                          \
863                                                                                \
864     return score;                                                              \
865 }
866 VSAD(8)
867 VSAD(16)
868
869 #define SQ(a) ((a) * (a))
870 #define VSSE_INTRA(size)                                                \
871 static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
872                                     uint8_t *s, uint8_t *dummy,         \
873                                     int stride, int h)                  \
874 {                                                                       \
875     int score = 0, x, y;                                                \
876                                                                         \
877     for (y = 1; y < h; y++) {                                           \
878         for (x = 0; x < size; x += 4) {                                 \
879             score += SQ(s[x]     - s[x + stride]) +                     \
880                      SQ(s[x + 1] - s[x + stride + 1]) +                 \
881                      SQ(s[x + 2] - s[x + stride + 2]) +                 \
882                      SQ(s[x + 3] - s[x + stride + 3]);                  \
883         }                                                               \
884         s += stride;                                                    \
885     }                                                                   \
886                                                                         \
887     return score;                                                       \
888 }
889 VSSE_INTRA(8)
890 VSSE_INTRA(16)
891
892 #define VSSE(size)                                                             \
893 static int vsse ## size ## _c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,     \
894                     int stride, int h)                                         \
895 {                                                                              \
896     int score = 0, x, y;                                                       \
897                                                                                \
898     for (y = 1; y < h; y++) {                                                  \
899         for (x = 0; x < size; x++)                                             \
900             score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);      \
901         s1 += stride;                                                          \
902         s2 += stride;                                                          \
903     }                                                                          \
904                                                                                \
905     return score;                                                              \
906 }
907 VSSE(8)
908 VSSE(16)
909
910 #define WRAPPER8_16_SQ(name8, name16)                                   \
911 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
912                   int stride, int h)                                    \
913 {                                                                       \
914     int score = 0;                                                      \
915                                                                         \
916     score += name8(s, dst, src, stride, 8);                             \
917     score += name8(s, dst + 8, src + 8, stride, 8);                     \
918     if (h == 16) {                                                      \
919         dst   += 8 * stride;                                            \
920         src   += 8 * stride;                                            \
921         score += name8(s, dst, src, stride, 8);                         \
922         score += name8(s, dst + 8, src + 8, stride, 8);                 \
923     }                                                                   \
924     return score;                                                       \
925 }
926
927 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
928 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
929 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
930 #if CONFIG_GPL
931 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
932 #endif
933 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
934 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
935 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
936 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
937
938 /* init static data */
939 av_cold void ff_dsputil_static_init(void)
940 {
941     int i;
942
943     for (i = 0; i < 512; i++)
944         ff_square_tab[i] = (i - 256) * (i - 256);
945 }
946
947 int ff_check_alignment(void)
948 {
949     static int did_fail = 0;
950     LOCAL_ALIGNED_16(int, aligned, [4]);
951
952     if ((intptr_t)aligned & 15) {
953         if (!did_fail) {
954 #if HAVE_MMX || HAVE_ALTIVEC
955             av_log(NULL, AV_LOG_ERROR,
956                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
957                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
958                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
959                 "Do not report crashes to FFmpeg developers.\n");
960 #endif
961             did_fail=1;
962         }
963         return -1;
964     }
965     return 0;
966 }
967
968 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
969 {
970     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
971
972     ff_check_alignment();
973
974     c->diff_pixels = diff_pixels_c;
975
976     c->sum_abs_dctelem = sum_abs_dctelem_c;
977
978     /* TODO [0] 16  [1] 8 */
979     c->pix_abs[0][0] = pix_abs16_c;
980     c->pix_abs[0][1] = pix_abs16_x2_c;
981     c->pix_abs[0][2] = pix_abs16_y2_c;
982     c->pix_abs[0][3] = pix_abs16_xy2_c;
983     c->pix_abs[1][0] = pix_abs8_c;
984     c->pix_abs[1][1] = pix_abs8_x2_c;
985     c->pix_abs[1][2] = pix_abs8_y2_c;
986     c->pix_abs[1][3] = pix_abs8_xy2_c;
987
988 #define SET_CMP_FUNC(name)                      \
989     c->name[0] = name ## 16_c;                  \
990     c->name[1] = name ## 8x8_c;
991
992     SET_CMP_FUNC(hadamard8_diff)
993     c->hadamard8_diff[4] = hadamard8_intra16_c;
994     c->hadamard8_diff[5] = hadamard8_intra8x8_c;
995     SET_CMP_FUNC(dct_sad)
996     SET_CMP_FUNC(dct_max)
997 #if CONFIG_GPL
998     SET_CMP_FUNC(dct264_sad)
999 #endif
1000     c->sad[0] = pix_abs16_c;
1001     c->sad[1] = pix_abs8_c;
1002     c->sse[0] = sse16_c;
1003     c->sse[1] = sse8_c;
1004     c->sse[2] = sse4_c;
1005     SET_CMP_FUNC(quant_psnr)
1006     SET_CMP_FUNC(rd)
1007     SET_CMP_FUNC(bit)
1008     c->vsad[0] = vsad16_c;
1009     c->vsad[1] = vsad8_c;
1010     c->vsad[4] = vsad_intra16_c;
1011     c->vsad[5] = vsad_intra8_c;
1012     c->vsse[0] = vsse16_c;
1013     c->vsse[1] = vsse8_c;
1014     c->vsse[4] = vsse_intra16_c;
1015     c->vsse[5] = vsse_intra8_c;
1016     c->nsse[0] = nsse16_c;
1017     c->nsse[1] = nsse8_c;
1018 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
1019     ff_dsputil_init_dwt(c);
1020 #endif
1021
1022     switch (avctx->bits_per_raw_sample) {
1023     case 9:
1024     case 10:
1025     case 12:
1026     case 14:
1027         c->get_pixels = get_pixels_16_c;
1028         break;
1029     default:
1030         if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
1031             c->get_pixels = get_pixels_8_c;
1032         }
1033         break;
1034     }
1035
1036
1037     if (ARCH_ALPHA)
1038         ff_dsputil_init_alpha(c, avctx);
1039     if (ARCH_ARM)
1040         ff_dsputil_init_arm(c, avctx, high_bit_depth);
1041     if (ARCH_PPC)
1042         ff_dsputil_init_ppc(c, avctx, high_bit_depth);
1043     if (ARCH_X86)
1044         ff_dsputil_init_x86(c, avctx, high_bit_depth);
1045 }
1046
1047 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
1048 {
1049     ff_dsputil_init(c, avctx);
1050 }
1051
1052 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
1053 {
1054     ff_dsputil_init(c, avctx);
1055 }