]> git.sesse.net Git - ffmpeg/blob - libavcodec/me_cmp.c
Merge commit 'be1db21ba88fe86036fea9f8d2c1a5f47c2a0a7e'
[ffmpeg] / libavcodec / me_cmp.c
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 #include "libavutil/attributes.h"
24 #include "libavutil/internal.h"
25 #include "avcodec.h"
26 #include "copy_block.h"
27 #include "simple_idct.h"
28 #include "me_cmp.h"
29 #include "mpegvideo.h"
30 #include "config.h"
31
/*
 * Lookup table read by the sse*_c comparators in this file through a pointer
 * offset by 256, so that a signed pixel difference in -256..255 can be used
 * as an index. It is defined zero-filled here; the code that populates it is
 * not part of this section.
 */
uint32_t ff_square_tab[512] = { 0 };
33
34 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
35                   ptrdiff_t stride, int h)
36 {
37     int s = 0, i;
38     uint32_t *sq = ff_square_tab + 256;
39
40     for (i = 0; i < h; i++) {
41         s    += sq[pix1[0] - pix2[0]];
42         s    += sq[pix1[1] - pix2[1]];
43         s    += sq[pix1[2] - pix2[2]];
44         s    += sq[pix1[3] - pix2[3]];
45         pix1 += stride;
46         pix2 += stride;
47     }
48     return s;
49 }
50
51 static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
52                   ptrdiff_t stride, int h)
53 {
54     int s = 0, i;
55     uint32_t *sq = ff_square_tab + 256;
56
57     for (i = 0; i < h; i++) {
58         s    += sq[pix1[0] - pix2[0]];
59         s    += sq[pix1[1] - pix2[1]];
60         s    += sq[pix1[2] - pix2[2]];
61         s    += sq[pix1[3] - pix2[3]];
62         s    += sq[pix1[4] - pix2[4]];
63         s    += sq[pix1[5] - pix2[5]];
64         s    += sq[pix1[6] - pix2[6]];
65         s    += sq[pix1[7] - pix2[7]];
66         pix1 += stride;
67         pix2 += stride;
68     }
69     return s;
70 }
71
72 static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
73                    ptrdiff_t stride, int h)
74 {
75     int s = 0, i;
76     uint32_t *sq = ff_square_tab + 256;
77
78     for (i = 0; i < h; i++) {
79         s += sq[pix1[0]  - pix2[0]];
80         s += sq[pix1[1]  - pix2[1]];
81         s += sq[pix1[2]  - pix2[2]];
82         s += sq[pix1[3]  - pix2[3]];
83         s += sq[pix1[4]  - pix2[4]];
84         s += sq[pix1[5]  - pix2[5]];
85         s += sq[pix1[6]  - pix2[6]];
86         s += sq[pix1[7]  - pix2[7]];
87         s += sq[pix1[8]  - pix2[8]];
88         s += sq[pix1[9]  - pix2[9]];
89         s += sq[pix1[10] - pix2[10]];
90         s += sq[pix1[11] - pix2[11]];
91         s += sq[pix1[12] - pix2[12]];
92         s += sq[pix1[13] - pix2[13]];
93         s += sq[pix1[14] - pix2[14]];
94         s += sq[pix1[15] - pix2[15]];
95
96         pix1 += stride;
97         pix2 += stride;
98     }
99     return s;
100 }
101
/**
 * Sum of the absolute values of the 64 coefficients in block.
 *
 * @param block array of at least 64 int16_t values
 * @return sum of |block[k]| for k in 0..63
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    const int16_t *cur       = block;
    const int16_t *const end = block + 64;
    int total = 0;

    /* FFABS is a macro, so the pointer is advanced outside of its argument. */
    for (; cur < end; cur++)
        total += FFABS(*cur);
    return total;
}
110
/* Rounded average of two values: half is added before the shift. */
#define avg2(a, b) ((1 + (a) + (b)) >> 1)
/* Rounded average of four values: half of the divisor is added before the shift. */
#define avg4(a, b, c, d) ((2 + (a) + (b) + (c) + (d)) >> 2)
113
114 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
115                               ptrdiff_t stride, int h)
116 {
117     int s = 0, i;
118
119     for (i = 0; i < h; i++) {
120         s    += abs(pix1[0]  - pix2[0]);
121         s    += abs(pix1[1]  - pix2[1]);
122         s    += abs(pix1[2]  - pix2[2]);
123         s    += abs(pix1[3]  - pix2[3]);
124         s    += abs(pix1[4]  - pix2[4]);
125         s    += abs(pix1[5]  - pix2[5]);
126         s    += abs(pix1[6]  - pix2[6]);
127         s    += abs(pix1[7]  - pix2[7]);
128         s    += abs(pix1[8]  - pix2[8]);
129         s    += abs(pix1[9]  - pix2[9]);
130         s    += abs(pix1[10] - pix2[10]);
131         s    += abs(pix1[11] - pix2[11]);
132         s    += abs(pix1[12] - pix2[12]);
133         s    += abs(pix1[13] - pix2[13]);
134         s    += abs(pix1[14] - pix2[14]);
135         s    += abs(pix1[15] - pix2[15]);
136         pix1 += stride;
137         pix2 += stride;
138     }
139     return s;
140 }
141
142 static inline int pix_median_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
143                              ptrdiff_t stride, int h)
144 {
145     int s = 0, i, j;
146
147 #define V(x) (pix1[x] - pix2[x])
148
149     s    += abs(V(0));
150     s    += abs(V(1) - V(0));
151     s    += abs(V(2) - V(1));
152     s    += abs(V(3) - V(2));
153     s    += abs(V(4) - V(3));
154     s    += abs(V(5) - V(4));
155     s    += abs(V(6) - V(5));
156     s    += abs(V(7) - V(6));
157     s    += abs(V(8) - V(7));
158     s    += abs(V(9) - V(8));
159     s    += abs(V(10) - V(9));
160     s    += abs(V(11) - V(10));
161     s    += abs(V(12) - V(11));
162     s    += abs(V(13) - V(12));
163     s    += abs(V(14) - V(13));
164     s    += abs(V(15) - V(14));
165
166     pix1 += stride;
167     pix2 += stride;
168
169     for (i = 1; i < h; i++) {
170         s    += abs(V(0) - V(-stride));
171         for (j = 1; j < 16; j++)
172             s    += abs(V(j) - mid_pred(V(j-stride), V(j-1), V(j-stride) + V(j-1) - V(j-stride-1)));
173         pix1 += stride;
174         pix2 += stride;
175
176     }
177 #undef V
178     return s;
179 }
180
181 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
182                           ptrdiff_t stride, int h)
183 {
184     int s = 0, i;
185
186     for (i = 0; i < h; i++) {
187         s    += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
188         s    += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
189         s    += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
190         s    += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
191         s    += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
192         s    += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
193         s    += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
194         s    += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
195         s    += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
196         s    += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
197         s    += abs(pix1[10] - avg2(pix2[10], pix2[11]));
198         s    += abs(pix1[11] - avg2(pix2[11], pix2[12]));
199         s    += abs(pix1[12] - avg2(pix2[12], pix2[13]));
200         s    += abs(pix1[13] - avg2(pix2[13], pix2[14]));
201         s    += abs(pix1[14] - avg2(pix2[14], pix2[15]));
202         s    += abs(pix1[15] - avg2(pix2[15], pix2[16]));
203         pix1 += stride;
204         pix2 += stride;
205     }
206     return s;
207 }
208
209 static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
210                           ptrdiff_t stride, int h)
211 {
212     int s = 0, i;
213     uint8_t *pix3 = pix2 + stride;
214
215     for (i = 0; i < h; i++) {
216         s    += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
217         s    += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
218         s    += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
219         s    += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
220         s    += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
221         s    += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
222         s    += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
223         s    += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
224         s    += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
225         s    += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
226         s    += abs(pix1[10] - avg2(pix2[10], pix3[10]));
227         s    += abs(pix1[11] - avg2(pix2[11], pix3[11]));
228         s    += abs(pix1[12] - avg2(pix2[12], pix3[12]));
229         s    += abs(pix1[13] - avg2(pix2[13], pix3[13]));
230         s    += abs(pix1[14] - avg2(pix2[14], pix3[14]));
231         s    += abs(pix1[15] - avg2(pix2[15], pix3[15]));
232         pix1 += stride;
233         pix2 += stride;
234         pix3 += stride;
235     }
236     return s;
237 }
238
239 static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
240                            ptrdiff_t stride, int h)
241 {
242     int s = 0, i;
243     uint8_t *pix3 = pix2 + stride;
244
245     for (i = 0; i < h; i++) {
246         s    += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
247         s    += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
248         s    += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
249         s    += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
250         s    += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
251         s    += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
252         s    += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
253         s    += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
254         s    += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
255         s    += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
256         s    += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
257         s    += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
258         s    += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
259         s    += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
260         s    += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
261         s    += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
262         pix1 += stride;
263         pix2 += stride;
264         pix3 += stride;
265     }
266     return s;
267 }
268
269 static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
270                              ptrdiff_t stride, int h)
271 {
272     int s = 0, i;
273
274     for (i = 0; i < h; i++) {
275         s    += abs(pix1[0] - pix2[0]);
276         s    += abs(pix1[1] - pix2[1]);
277         s    += abs(pix1[2] - pix2[2]);
278         s    += abs(pix1[3] - pix2[3]);
279         s    += abs(pix1[4] - pix2[4]);
280         s    += abs(pix1[5] - pix2[5]);
281         s    += abs(pix1[6] - pix2[6]);
282         s    += abs(pix1[7] - pix2[7]);
283         pix1 += stride;
284         pix2 += stride;
285     }
286     return s;
287 }
288
289 static inline int pix_median_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
290                              ptrdiff_t stride, int h)
291 {
292     int s = 0, i, j;
293
294 #define V(x) (pix1[x] - pix2[x])
295
296     s    += abs(V(0));
297     s    += abs(V(1) - V(0));
298     s    += abs(V(2) - V(1));
299     s    += abs(V(3) - V(2));
300     s    += abs(V(4) - V(3));
301     s    += abs(V(5) - V(4));
302     s    += abs(V(6) - V(5));
303     s    += abs(V(7) - V(6));
304
305     pix1 += stride;
306     pix2 += stride;
307
308     for (i = 1; i < h; i++) {
309         s    += abs(V(0) - V(-stride));
310         for (j = 1; j < 8; j++)
311             s    += abs(V(j) - mid_pred(V(j-stride), V(j-1), V(j-stride) + V(j-1) - V(j-stride-1)));
312         pix1 += stride;
313         pix2 += stride;
314
315     }
316 #undef V
317     return s;
318 }
319
320 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
321                          ptrdiff_t stride, int h)
322 {
323     int s = 0, i;
324
325     for (i = 0; i < h; i++) {
326         s    += abs(pix1[0] - avg2(pix2[0], pix2[1]));
327         s    += abs(pix1[1] - avg2(pix2[1], pix2[2]));
328         s    += abs(pix1[2] - avg2(pix2[2], pix2[3]));
329         s    += abs(pix1[3] - avg2(pix2[3], pix2[4]));
330         s    += abs(pix1[4] - avg2(pix2[4], pix2[5]));
331         s    += abs(pix1[5] - avg2(pix2[5], pix2[6]));
332         s    += abs(pix1[6] - avg2(pix2[6], pix2[7]));
333         s    += abs(pix1[7] - avg2(pix2[7], pix2[8]));
334         pix1 += stride;
335         pix2 += stride;
336     }
337     return s;
338 }
339
340 static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
341                          ptrdiff_t stride, int h)
342 {
343     int s = 0, i;
344     uint8_t *pix3 = pix2 + stride;
345
346     for (i = 0; i < h; i++) {
347         s    += abs(pix1[0] - avg2(pix2[0], pix3[0]));
348         s    += abs(pix1[1] - avg2(pix2[1], pix3[1]));
349         s    += abs(pix1[2] - avg2(pix2[2], pix3[2]));
350         s    += abs(pix1[3] - avg2(pix2[3], pix3[3]));
351         s    += abs(pix1[4] - avg2(pix2[4], pix3[4]));
352         s    += abs(pix1[5] - avg2(pix2[5], pix3[5]));
353         s    += abs(pix1[6] - avg2(pix2[6], pix3[6]));
354         s    += abs(pix1[7] - avg2(pix2[7], pix3[7]));
355         pix1 += stride;
356         pix2 += stride;
357         pix3 += stride;
358     }
359     return s;
360 }
361
362 static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
363                           ptrdiff_t stride, int h)
364 {
365     int s = 0, i;
366     uint8_t *pix3 = pix2 + stride;
367
368     for (i = 0; i < h; i++) {
369         s    += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
370         s    += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
371         s    += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
372         s    += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
373         s    += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
374         s    += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
375         s    += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
376         s    += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
377         pix1 += stride;
378         pix2 += stride;
379         pix3 += stride;
380     }
381     return s;
382 }
383
384 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
385                     ptrdiff_t stride, int h)
386 {
387     int score1 = 0, score2 = 0, x, y;
388
389     for (y = 0; y < h; y++) {
390         for (x = 0; x < 16; x++)
391             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
392         if (y + 1 < h) {
393             for (x = 0; x < 15; x++)
394                 score2 += FFABS(s1[x]     - s1[x + stride] -
395                                 s1[x + 1] + s1[x + stride + 1]) -
396                           FFABS(s2[x]     - s2[x + stride] -
397                                 s2[x + 1] + s2[x + stride + 1]);
398         }
399         s1 += stride;
400         s2 += stride;
401     }
402
403     if (c)
404         return score1 + FFABS(score2) * c->avctx->nsse_weight;
405     else
406         return score1 + FFABS(score2) * 8;
407 }
408
409 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
410                    ptrdiff_t stride, int h)
411 {
412     int score1 = 0, score2 = 0, x, y;
413
414     for (y = 0; y < h; y++) {
415         for (x = 0; x < 8; x++)
416             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
417         if (y + 1 < h) {
418             for (x = 0; x < 7; x++)
419                 score2 += FFABS(s1[x]     - s1[x + stride] -
420                                 s1[x + 1] + s1[x + stride + 1]) -
421                           FFABS(s2[x]     - s2[x + stride] -
422                                 s2[x + 1] + s2[x + stride + 1]);
423         }
424         s1 += stride;
425         s2 += stride;
426     }
427
428     if (c)
429         return score1 + FFABS(score2) * c->avctx->nsse_weight;
430     else
431         return score1 + FFABS(score2) * 8;
432 }
433
434 static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
435                     ptrdiff_t stride, int h)
436 {
437     return 0;
438 }
439
440 void ff_set_cmp(MECmpContext *c, me_cmp_func *cmp, int type)
441 {
442     int i;
443
444     memset(cmp, 0, sizeof(void *) * 6);
445
446     for (i = 0; i < 6; i++) {
447         switch (type & 0xFF) {
448         case FF_CMP_SAD:
449             cmp[i] = c->sad[i];
450             break;
451         case FF_CMP_MEDIAN_SAD:
452             cmp[i] = c->median_sad[i];
453             break;
454         case FF_CMP_SATD:
455             cmp[i] = c->hadamard8_diff[i];
456             break;
457         case FF_CMP_SSE:
458             cmp[i] = c->sse[i];
459             break;
460         case FF_CMP_DCT:
461             cmp[i] = c->dct_sad[i];
462             break;
463         case FF_CMP_DCT264:
464             cmp[i] = c->dct264_sad[i];
465             break;
466         case FF_CMP_DCTMAX:
467             cmp[i] = c->dct_max[i];
468             break;
469         case FF_CMP_PSNR:
470             cmp[i] = c->quant_psnr[i];
471             break;
472         case FF_CMP_BIT:
473             cmp[i] = c->bit[i];
474             break;
475         case FF_CMP_RD:
476             cmp[i] = c->rd[i];
477             break;
478         case FF_CMP_VSAD:
479             cmp[i] = c->vsad[i];
480             break;
481         case FF_CMP_VSSE:
482             cmp[i] = c->vsse[i];
483             break;
484         case FF_CMP_ZERO:
485             cmp[i] = zero_cmp;
486             break;
487         case FF_CMP_NSSE:
488             cmp[i] = c->nsse[i];
489             break;
490 #if CONFIG_DWT
491         case FF_CMP_W53:
492             cmp[i]= c->w53[i];
493             break;
494         case FF_CMP_W97:
495             cmp[i]= c->w97[i];
496             break;
497 #endif
498         default:
499             av_log(NULL, AV_LOG_ERROR,
500                    "internal error in cmp function selection\n");
501         }
502     }
503 }
504
/*
 * Butterfly helpers for the Hadamard transforms below.
 *
 * BUTTERFLY2 writes the sum and the difference of two inputs to two outputs.
 * BUTTERFLY1 replaces x and y in place with their sum and difference.
 * BUTTERFLYA yields |x + y| + |x - y| without storing anything.
 *
 * The statement macros are wrapped in do { } while (0) so that they behave
 * as a single statement (e.g. under an unbraced if), and BUTTERFLY1 uses
 * unlikely temporary names so that arguments named a or b are not captured.
 * Arguments are evaluated more than once; pass expressions without side
 * effects.
 */
#define BUTTERFLY2(o1, o2, i1, i2)              \
    do {                                        \
        (o1) = (i1) + (i2);                     \
        (o2) = (i1) - (i2);                     \
    } while (0)

#define BUTTERFLY1(x, y)                        \
    do {                                        \
        const int bfly_in0_ = (x);              \
        const int bfly_in1_ = (y);              \
        (x) = bfly_in0_ + bfly_in1_;            \
        (y) = bfly_in0_ - bfly_in1_;            \
    } while (0)

#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
519
520 static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
521                                uint8_t *src, ptrdiff_t stride, int h)
522 {
523     int i, temp[64], sum = 0;
524
525     av_assert2(h == 8);
526
527     for (i = 0; i < 8; i++) {
528         // FIXME: try pointer walks
529         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
530                    src[stride * i + 0] - dst[stride * i + 0],
531                    src[stride * i + 1] - dst[stride * i + 1]);
532         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
533                    src[stride * i + 2] - dst[stride * i + 2],
534                    src[stride * i + 3] - dst[stride * i + 3]);
535         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
536                    src[stride * i + 4] - dst[stride * i + 4],
537                    src[stride * i + 5] - dst[stride * i + 5]);
538         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
539                    src[stride * i + 6] - dst[stride * i + 6],
540                    src[stride * i + 7] - dst[stride * i + 7]);
541
542         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
543         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
544         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
545         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
546
547         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
548         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
549         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
550         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
551     }
552
553     for (i = 0; i < 8; i++) {
554         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
555         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
556         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
557         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
558
559         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
560         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
561         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
562         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
563
564         sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
565                BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
566                BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
567                BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
568     }
569     return sum;
570 }
571
572 static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
573                                 uint8_t *dummy, ptrdiff_t stride, int h)
574 {
575     int i, temp[64], sum = 0;
576
577     av_assert2(h == 8);
578
579     for (i = 0; i < 8; i++) {
580         // FIXME: try pointer walks
581         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
582                    src[stride * i + 0], src[stride * i + 1]);
583         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
584                    src[stride * i + 2], src[stride * i + 3]);
585         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
586                    src[stride * i + 4], src[stride * i + 5]);
587         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
588                    src[stride * i + 6], src[stride * i + 7]);
589
590         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
591         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
592         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
593         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
594
595         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
596         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
597         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
598         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
599     }
600
601     for (i = 0; i < 8; i++) {
602         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
603         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
604         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
605         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
606
607         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
608         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
609         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
610         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
611
612         sum +=
613             BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
614             + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
615             + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
616             + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
617     }
618
619     sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
620
621     return sum;
622 }
623
624 static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
625                         uint8_t *src2, ptrdiff_t stride, int h)
626 {
627     LOCAL_ALIGNED_16(int16_t, temp, [64]);
628
629     av_assert2(h == 8);
630
631     s->pdsp.diff_pixels(temp, src1, src2, stride);
632     s->fdsp.fdct(temp);
633     return s->mecc.sum_abs_dctelem(temp);
634 }
635
636 #if CONFIG_GPL
/*
 * One 8-point integer transform pass.
 *
 * The user must define SRC(x) to read input element x and DST(x, v) to
 * consume output element x with value v. All eight inputs are read into
 * constants before the first DST is issued, so SRC and DST may refer to the
 * same storage. The macro expands to a braced block and takes no trailing
 * semicolon.
 */
#define DCT8_1D                                                         \
    {                                                                   \
        /* sums and differences of mirrored input pairs */              \
        const int sum07  = SRC(0) + SRC(7);                             \
        const int sum16  = SRC(1) + SRC(6);                             \
        const int sum25  = SRC(2) + SRC(5);                             \
        const int sum34  = SRC(3) + SRC(4);                             \
        const int diff07 = SRC(0) - SRC(7);                             \
        const int diff16 = SRC(1) - SRC(6);                             \
        const int diff25 = SRC(2) - SRC(5);                             \
        const int diff34 = SRC(3) - SRC(4);                             \
        /* even part, feeds outputs 0, 2, 4, 6 */                       \
        const int even0  = sum07 + sum34;                               \
        const int even1  = sum16 + sum25;                               \
        const int even2  = sum07 - sum34;                               \
        const int even3  = sum16 - sum25;                               \
        /* odd part, feeds outputs 1, 3, 5, 7 */                        \
        const int odd4   = diff16 + diff25 + (diff07 + (diff07 >> 1));  \
        const int odd5   = diff07 - diff34 - (diff25 + (diff25 >> 1));  \
        const int odd6   = diff07 + diff34 - (diff16 + (diff16 >> 1));  \
        const int odd7   = diff16 - diff25 + (diff34 + (diff34 >> 1));  \
        DST(0, even0 + even1);                                          \
        DST(1, odd4 + (odd7 >> 2));                                     \
        DST(2, even2 + (even3 >> 1));                                   \
        DST(3, odd5 + (odd6 >> 2));                                     \
        DST(4, even0 - even1);                                          \
        DST(5, odd6 - (odd5 >> 2));                                     \
        DST(6, (even2 >> 1) - even3);                                   \
        DST(7, (odd4 >> 2) - odd7);                                     \
    }
664
665 static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
666                            uint8_t *src2, ptrdiff_t stride, int h)
667 {
668     int16_t dct[8][8];
669     int i, sum = 0;
670
671     s->pdsp.diff_pixels(dct[0], src1, src2, stride);
672
673 #define SRC(x) dct[i][x]
674 #define DST(x, v) dct[i][x] = v
675     for (i = 0; i < 8; i++)
676         DCT8_1D
677 #undef SRC
678 #undef DST
679
680 #define SRC(x) dct[x][i]
681 #define DST(x, v) sum += FFABS(v)
682         for (i = 0; i < 8; i++)
683             DCT8_1D
684 #undef SRC
685 #undef DST
686             return sum;
687 }
688 #endif
689
690 static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
691                         uint8_t *src2, ptrdiff_t stride, int h)
692 {
693     LOCAL_ALIGNED_16(int16_t, temp, [64]);
694     int sum = 0, i;
695
696     av_assert2(h == 8);
697
698     s->pdsp.diff_pixels(temp, src1, src2, stride);
699     s->fdsp.fdct(temp);
700
701     for (i = 0; i < 64; i++)
702         sum = FFMAX(sum, FFABS(temp[i]));
703
704     return sum;
705 }
706
707 static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
708                            uint8_t *src2, ptrdiff_t stride, int h)
709 {
710     LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
711     int16_t *const bak = temp + 64;
712     int sum = 0, i;
713
714     av_assert2(h == 8);
715     s->mb_intra = 0;
716
717     s->pdsp.diff_pixels(temp, src1, src2, stride);
718
719     memcpy(bak, temp, 64 * sizeof(int16_t));
720
721     s->block_last_index[0 /* FIXME */] =
722         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
723     s->dct_unquantize_inter(s, temp, 0, s->qscale);
724     ff_simple_idct_8(temp); // FIXME
725
726     for (i = 0; i < 64; i++)
727         sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
728
729     return sum;
730 }
731
732 static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
733                    ptrdiff_t stride, int h)
734 {
735     const uint8_t *scantable = s->intra_scantable.permutated;
736     LOCAL_ALIGNED_16(int16_t, temp, [64]);
737     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
738     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
739     int i, last, run, bits, level, distortion, start_i;
740     const int esc_length = s->ac_esc_length;
741     uint8_t *length, *last_length;
742
743     av_assert2(h == 8);
744
745     copy_block8(lsrc1, src1, 8, stride, 8);
746     copy_block8(lsrc2, src2, 8, stride, 8);
747
748     s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8);
749
750     s->block_last_index[0 /* FIXME */] =
751     last                               =
752         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
753
754     bits = 0;
755
756     if (s->mb_intra) {
757         start_i     = 1;
758         length      = s->intra_ac_vlc_length;
759         last_length = s->intra_ac_vlc_last_length;
760         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
761     } else {
762         start_i     = 0;
763         length      = s->inter_ac_vlc_length;
764         last_length = s->inter_ac_vlc_last_length;
765     }
766
767     if (last >= start_i) {
768         run = 0;
769         for (i = start_i; i < last; i++) {
770             int j = scantable[i];
771             level = temp[j];
772
773             if (level) {
774                 level += 64;
775                 if ((level & (~127)) == 0)
776                     bits += length[UNI_AC_ENC_INDEX(run, level)];
777                 else
778                     bits += esc_length;
779                 run = 0;
780             } else
781                 run++;
782         }
783         i = scantable[last];
784
785         level = temp[i] + 64;
786
787         av_assert2(level - 64);
788
789         if ((level & (~127)) == 0) {
790             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
791         } else
792             bits += esc_length;
793     }
794
795     if (last >= 0) {
796         if (s->mb_intra)
797             s->dct_unquantize_intra(s, temp, 0, s->qscale);
798         else
799             s->dct_unquantize_inter(s, temp, 0, s->qscale);
800     }
801
802     s->idsp.idct_add(lsrc2, 8, temp);
803
804     distortion = s->mecc.sse[1](NULL, lsrc2, lsrc1, 8, 8);
805
806     return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
807 }
808
809 static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
810                     ptrdiff_t stride, int h)
811 {
812     const uint8_t *scantable = s->intra_scantable.permutated;
813     LOCAL_ALIGNED_16(int16_t, temp, [64]);
814     int i, last, run, bits, level, start_i;
815     const int esc_length = s->ac_esc_length;
816     uint8_t *length, *last_length;
817
818     av_assert2(h == 8);
819
820     s->pdsp.diff_pixels(temp, src1, src2, stride);
821
822     s->block_last_index[0 /* FIXME */] =
823     last                               =
824         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
825
826     bits = 0;
827
828     if (s->mb_intra) {
829         start_i     = 1;
830         length      = s->intra_ac_vlc_length;
831         last_length = s->intra_ac_vlc_last_length;
832         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
833     } else {
834         start_i     = 0;
835         length      = s->inter_ac_vlc_length;
836         last_length = s->inter_ac_vlc_last_length;
837     }
838
839     if (last >= start_i) {
840         run = 0;
841         for (i = start_i; i < last; i++) {
842             int j = scantable[i];
843             level = temp[j];
844
845             if (level) {
846                 level += 64;
847                 if ((level & (~127)) == 0)
848                     bits += length[UNI_AC_ENC_INDEX(run, level)];
849                 else
850                     bits += esc_length;
851                 run = 0;
852             } else
853                 run++;
854         }
855         i = scantable[last];
856
857         level = temp[i] + 64;
858
859         av_assert2(level - 64);
860
861         if ((level & (~127)) == 0)
862             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
863         else
864             bits += esc_length;
865     }
866
867     return bits;
868 }
869
870 #define VSAD_INTRA(size)                                                \
871 static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
872                                     uint8_t *s, uint8_t *dummy,         \
873                                     ptrdiff_t stride, int h)            \
874 {                                                                       \
875     int score = 0, x, y;                                                \
876                                                                         \
877     for (y = 1; y < h; y++) {                                           \
878         for (x = 0; x < size; x += 4) {                                 \
879             score += FFABS(s[x]     - s[x + stride])     +              \
880                      FFABS(s[x + 1] - s[x + stride + 1]) +              \
881                      FFABS(s[x + 2] - s[x + 2 + stride]) +              \
882                      FFABS(s[x + 3] - s[x + 3 + stride]);               \
883         }                                                               \
884         s += stride;                                                    \
885     }                                                                   \
886                                                                         \
887     return score;                                                       \
888 }
889 VSAD_INTRA(8)
890 VSAD_INTRA(16)
891
892 #define VSAD(size)                                                             \
893 static int vsad ## size ## _c(MpegEncContext *c,                               \
894                               uint8_t *s1, uint8_t *s2,                        \
895                               ptrdiff_t stride, int h)                               \
896 {                                                                              \
897     int score = 0, x, y;                                                       \
898                                                                                \
899     for (y = 1; y < h; y++) {                                                  \
900         for (x = 0; x < size; x++)                                             \
901             score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);   \
902         s1 += stride;                                                          \
903         s2 += stride;                                                          \
904     }                                                                          \
905                                                                                \
906     return score;                                                              \
907 }
908 VSAD(8)
909 VSAD(16)
910
911 #define SQ(a) ((a) * (a))
912 #define VSSE_INTRA(size)                                                \
913 static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
914                                     uint8_t *s, uint8_t *dummy,         \
915                                     ptrdiff_t stride, int h)            \
916 {                                                                       \
917     int score = 0, x, y;                                                \
918                                                                         \
919     for (y = 1; y < h; y++) {                                           \
920         for (x = 0; x < size; x += 4) {                                 \
921             score += SQ(s[x]     - s[x + stride]) +                     \
922                      SQ(s[x + 1] - s[x + stride + 1]) +                 \
923                      SQ(s[x + 2] - s[x + stride + 2]) +                 \
924                      SQ(s[x + 3] - s[x + stride + 3]);                  \
925         }                                                               \
926         s += stride;                                                    \
927     }                                                                   \
928                                                                         \
929     return score;                                                       \
930 }
931 VSSE_INTRA(8)
932 VSSE_INTRA(16)
933
934 #define VSSE(size)                                                             \
935 static int vsse ## size ## _c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,     \
936                               ptrdiff_t stride, int h)                         \
937 {                                                                              \
938     int score = 0, x, y;                                                       \
939                                                                                \
940     for (y = 1; y < h; y++) {                                                  \
941         for (x = 0; x < size; x++)                                             \
942             score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);      \
943         s1 += stride;                                                          \
944         s2 += stride;                                                          \
945     }                                                                          \
946                                                                                \
947     return score;                                                              \
948 }
949 VSSE(8)
950 VSSE(16)
951
952 #define WRAPPER8_16_SQ(name8, name16)                                   \
953 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
954                   ptrdiff_t stride, int h)                              \
955 {                                                                       \
956     int score = 0;                                                      \
957                                                                         \
958     score += name8(s, dst, src, stride, 8);                             \
959     score += name8(s, dst + 8, src + 8, stride, 8);                     \
960     if (h == 16) {                                                      \
961         dst   += 8 * stride;                                            \
962         src   += 8 * stride;                                            \
963         score += name8(s, dst, src, stride, 8);                         \
964         score += name8(s, dst + 8, src + 8, stride, 8);                 \
965     }                                                                   \
966     return score;                                                       \
967 }
968
969 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
970 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
971 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
972 #if CONFIG_GPL
973 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
974 #endif
975 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
976 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
977 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
978 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
979
980 av_cold void ff_me_cmp_init_static(void)
981 {
982     int i;
983
984     for (i = 0; i < 512; i++)
985         ff_square_tab[i] = (i - 256) * (i - 256);
986 }
987
988 int ff_check_alignment(void)
989 {
990     static int did_fail = 0;
991     LOCAL_ALIGNED_16(int, aligned, [4]);
992
993     if ((intptr_t)aligned & 15) {
994         if (!did_fail) {
995 #if HAVE_MMX || HAVE_ALTIVEC
996             av_log(NULL, AV_LOG_ERROR,
997                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
998                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
999                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
1000                 "Do not report crashes to FFmpeg developers.\n");
1001 #endif
1002             did_fail=1;
1003         }
1004         return -1;
1005     }
1006     return 0;
1007 }
1008
1009 av_cold void ff_me_cmp_init(MECmpContext *c, AVCodecContext *avctx)
1010 {
1011     ff_check_alignment();
1012
1013     c->sum_abs_dctelem = sum_abs_dctelem_c;
1014
1015     /* TODO [0] 16  [1] 8 */
1016     c->pix_abs[0][0] = pix_abs16_c;
1017     c->pix_abs[0][1] = pix_abs16_x2_c;
1018     c->pix_abs[0][2] = pix_abs16_y2_c;
1019     c->pix_abs[0][3] = pix_abs16_xy2_c;
1020     c->pix_abs[1][0] = pix_abs8_c;
1021     c->pix_abs[1][1] = pix_abs8_x2_c;
1022     c->pix_abs[1][2] = pix_abs8_y2_c;
1023     c->pix_abs[1][3] = pix_abs8_xy2_c;
1024
1025 #define SET_CMP_FUNC(name)                      \
1026     c->name[0] = name ## 16_c;                  \
1027     c->name[1] = name ## 8x8_c;
1028
1029     SET_CMP_FUNC(hadamard8_diff)
1030     c->hadamard8_diff[4] = hadamard8_intra16_c;
1031     c->hadamard8_diff[5] = hadamard8_intra8x8_c;
1032     SET_CMP_FUNC(dct_sad)
1033     SET_CMP_FUNC(dct_max)
1034 #if CONFIG_GPL
1035     SET_CMP_FUNC(dct264_sad)
1036 #endif
1037     c->sad[0] = pix_abs16_c;
1038     c->sad[1] = pix_abs8_c;
1039     c->sse[0] = sse16_c;
1040     c->sse[1] = sse8_c;
1041     c->sse[2] = sse4_c;
1042     SET_CMP_FUNC(quant_psnr)
1043     SET_CMP_FUNC(rd)
1044     SET_CMP_FUNC(bit)
1045     c->vsad[0] = vsad16_c;
1046     c->vsad[1] = vsad8_c;
1047     c->vsad[4] = vsad_intra16_c;
1048     c->vsad[5] = vsad_intra8_c;
1049     c->vsse[0] = vsse16_c;
1050     c->vsse[1] = vsse8_c;
1051     c->vsse[4] = vsse_intra16_c;
1052     c->vsse[5] = vsse_intra8_c;
1053     c->nsse[0] = nsse16_c;
1054     c->nsse[1] = nsse8_c;
1055 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
1056     ff_dsputil_init_dwt(c);
1057 #endif
1058
1059     if (ARCH_ALPHA)
1060         ff_me_cmp_init_alpha(c, avctx);
1061     if (ARCH_ARM)
1062         ff_me_cmp_init_arm(c, avctx);
1063     if (ARCH_PPC)
1064         ff_me_cmp_init_ppc(c, avctx);
1065     if (ARCH_X86)
1066         ff_me_cmp_init_x86(c, avctx);
1067     if (ARCH_MIPS)
1068         ff_me_cmp_init_mips(c, avctx);
1069
1070     c->median_sad[0] = pix_median_abs16_c;
1071     c->median_sad[1] = pix_median_abs8_c;
1072 }