]> git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c
x86: dsputil_x86.h: K&R formatting cosmetics
[ffmpeg] / libavcodec / dsputil.c
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7  *
8  * This file is part of Libav.
9  *
10  * Libav is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * Libav is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with Libav; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24
25 /**
26  * @file
27  * DSP utils
28  */
29
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "avcodec.h"
33 #include "copy_block.h"
34 #include "dct.h"
35 #include "dsputil.h"
36 #include "simple_idct.h"
37 #include "faandct.h"
38 #include "faanidct.h"
39 #include "imgconvert.h"
40 #include "mathops.h"
41 #include "mpegvideo.h"
42 #include "config.h"
43
/* Shared 512-entry lookup table. Code in this file addresses it through a
 * pointer offset by 256 so that indices in [-256, 255] are valid. It is
 * zero-initialised here. */
uint32_t ff_square_tab[512] = { 0 };
45
/* Include dsputil_template.c twice: once with BIT_DEPTH defined as 16 and
 * once with BIT_DEPTH defined as 8. BIT_DEPTH is not #undef'd after the
 * second inclusion, so it stays defined as 8 for whatever follows.
 * NOTE(review): the *_8_c helpers called further down are presumably
 * generated by the second inclusion - confirm against the template. */
46 #define BIT_DEPTH 16
47 #include "dsputil_template.c"
48 #undef BIT_DEPTH
49
50 #define BIT_DEPTH 8
51 #include "dsputil_template.c"
52
/* Byte-replicated constants at the width of unsigned long: dividing the
 * all-ones word by 0xff leaves 0x01 in every byte, which is then scaled to
 * 0x7f (resp. 0x80) per byte, e.g. 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f. */
#define pb_7f ((~0UL / 0xff) * 0x7f)
#define pb_80 ((~0UL / 0xff) * 0x80)
56
/*
 * Zigzag scan order used with the 2-4-8 IDCT.
 * Unlike the specification, the two fields are interleaved here.
 * Entry n is the coefficient position visited at scan step n.
 */
const uint8_t ff_zigzag248_direct[64] = {
    /* steps  0.. 7 */  0,  8,  1,  9, 16, 24,  2, 10,
    /* steps  8..15 */ 17, 25, 32, 40, 48, 56, 33, 41,
    /* steps 16..23 */ 18, 26,  3, 11,  4, 12, 19, 27,
    /* steps 24..31 */ 34, 42, 49, 57, 50, 58, 35, 43,
    /* steps 32..39 */ 20, 28,  5, 13,  6, 14, 21, 29,
    /* steps 40..47 */ 36, 44, 51, 59, 52, 60, 37, 45,
    /* steps 48..55 */ 22, 30,  7, 15, 23, 31, 38, 46,
    /* steps 56..63 */ 53, 61, 54, 62, 39, 47, 55, 63,
};
69
/* Alternate horizontal scan order: entry n is the coefficient position
 * visited at scan step n. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    /* steps  0.. 7 */  0,  1,  2,  3,  8,  9, 16, 17,
    /* steps  8..15 */ 10, 11,  4,  5,  6,  7, 15, 14,
    /* steps 16..23 */ 13, 12, 19, 18, 24, 25, 32, 33,
    /* steps 24..31 */ 26, 27, 20, 21, 22, 23, 28, 29,
    /* steps 32..39 */ 30, 31, 34, 35, 40, 41, 48, 49,
    /* steps 40..47 */ 42, 43, 36, 37, 38, 39, 44, 45,
    /* steps 48..55 */ 46, 47, 50, 51, 56, 57, 58, 59,
    /* steps 56..63 */ 52, 53, 54, 55, 60, 61, 62, 63,
};
80
/* Alternate vertical scan order: entry n is the coefficient position
 * visited at scan step n. */
const uint8_t ff_alternate_vertical_scan[64] = {
    /* steps  0.. 7 */  0,  8, 16, 24,  1,  9,  2, 10,
    /* steps  8..15 */ 17, 25, 32, 40, 48, 56, 57, 49,
    /* steps 16..23 */ 41, 33, 26, 18,  3, 11,  4, 12,
    /* steps 24..31 */ 19, 27, 34, 42, 50, 58, 35, 43,
    /* steps 32..39 */ 51, 59, 20, 28,  5, 13,  6, 14,
    /* steps 40..47 */ 21, 29, 36, 44, 52, 60, 37, 45,
    /* steps 48..55 */ 53, 61, 22, 30,  7, 15, 23, 31,
    /* steps 56..63 */ 38, 46, 54, 62, 39, 47, 55, 63,
};
91
/* Coefficient input permutation expected by simple_idct_mmx:
 * entry i is the position that coefficient i is moved to. */
static const uint8_t simple_mmx_permutation[64] = {
    /*  0.. 7 */ 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    /*  8..15 */ 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    /* 16..23 */ 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    /* 24..31 */ 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    /* 32..39 */ 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    /* 40..47 */ 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    /* 48..55 */ 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    /* 56..63 */ 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
103
/* Column reordering applied within each row of eight coefficients when the
 * SSE2 IDCT permutation is selected. */
static const uint8_t idct_sse2_row_perm[8] = {
    0, 4, 1, 5,
    2, 6, 3, 7,
};
105
106 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
107                                const uint8_t *src_scantable)
108 {
109     int i, end;
110
111     st->scantable = src_scantable;
112
113     for (i = 0; i < 64; i++) {
114         int j = src_scantable[i];
115         st->permutated[i] = permutation[j];
116     }
117
118     end = -1;
119     for (i = 0; i < 64; i++) {
120         int j = st->permutated[i];
121         if (j > end)
122             end = j;
123         st->raster_end[i] = end;
124     }
125 }
126
127 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
128                                            int idct_permutation_type)
129 {
130     int i;
131
132     switch (idct_permutation_type) {
133     case FF_NO_IDCT_PERM:
134         for (i = 0; i < 64; i++)
135             idct_permutation[i] = i;
136         break;
137     case FF_LIBMPEG2_IDCT_PERM:
138         for (i = 0; i < 64; i++)
139             idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
140         break;
141     case FF_SIMPLE_IDCT_PERM:
142         for (i = 0; i < 64; i++)
143             idct_permutation[i] = simple_mmx_permutation[i];
144         break;
145     case FF_TRANSPOSE_IDCT_PERM:
146         for (i = 0; i < 64; i++)
147             idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
148         break;
149     case FF_PARTTRANS_IDCT_PERM:
150         for (i = 0; i < 64; i++)
151             idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
152         break;
153     case FF_SSE2_IDCT_PERM:
154         for (i = 0; i < 64; i++)
155             idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
156         break;
157     default:
158         av_log(NULL, AV_LOG_ERROR,
159                "Internal error, IDCT permutation not set\n");
160     }
161 }
162
/**
 * Sum all samples of a 16x16 block.
 *
 * @param pix       top-left sample of the block
 * @param line_size offset in bytes between the starts of consecutive rows
 * @return sum of the 256 sample values
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        const uint8_t *line = pix + row * line_size;

        for (col = 0; col < 16; col++)
            total += line[col];
    }
    return total;
}
183
184 static int pix_norm1_c(uint8_t *pix, int line_size)
185 {
186     int s = 0, i, j;
187     uint32_t *sq = ff_square_tab + 256;
188
189     for (i = 0; i < 16; i++) {
190         for (j = 0; j < 16; j += 8) {
191 #if 0
192             s += sq[pix[0]];
193             s += sq[pix[1]];
194             s += sq[pix[2]];
195             s += sq[pix[3]];
196             s += sq[pix[4]];
197             s += sq[pix[5]];
198             s += sq[pix[6]];
199             s += sq[pix[7]];
200 #else
201 #if HAVE_FAST_64BIT
202             register uint64_t x = *(uint64_t *) pix;
203             s += sq[x         & 0xff];
204             s += sq[(x >>  8) & 0xff];
205             s += sq[(x >> 16) & 0xff];
206             s += sq[(x >> 24) & 0xff];
207             s += sq[(x >> 32) & 0xff];
208             s += sq[(x >> 40) & 0xff];
209             s += sq[(x >> 48) & 0xff];
210             s += sq[(x >> 56) & 0xff];
211 #else
212             register uint32_t x = *(uint32_t *) pix;
213             s += sq[x         & 0xff];
214             s += sq[(x >>  8) & 0xff];
215             s += sq[(x >> 16) & 0xff];
216             s += sq[(x >> 24) & 0xff];
217             x  = *(uint32_t *) (pix + 4);
218             s += sq[x         & 0xff];
219             s += sq[(x >>  8) & 0xff];
220             s += sq[(x >> 16) & 0xff];
221             s += sq[(x >> 24) & 0xff];
222 #endif
223 #endif
224             pix += 8;
225         }
226         pix += line_size - 16;
227     }
228     return s;
229 }
230
/**
 * Byte-swap an array of 32-bit words.
 *
 * @param dst destination, w words
 * @param src source, w words
 * @param w   number of words; nothing is done when w <= 0
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int n;

    for (n = 0; n < w; n++)
        dst[n] = av_bswap32(src[n]);
}
248
/**
 * Byte-swap an array of 16-bit words.
 *
 * @param dst destination, len words
 * @param src source, len words
 * @param len number of words to convert; the loop counts len down to zero
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    for (; len; len--) {
        *dst = av_bswap16(*src);
        dst++;
        src++;
    }
}
254
255 static int sse4_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
256 {
257     int s = 0, i;
258     uint32_t *sq = ff_square_tab + 256;
259
260     for (i = 0; i < h; i++) {
261         s    += sq[pix1[0] - pix2[0]];
262         s    += sq[pix1[1] - pix2[1]];
263         s    += sq[pix1[2] - pix2[2]];
264         s    += sq[pix1[3] - pix2[3]];
265         pix1 += line_size;
266         pix2 += line_size;
267     }
268     return s;
269 }
270
271 static int sse8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
272 {
273     int s = 0, i;
274     uint32_t *sq = ff_square_tab + 256;
275
276     for (i = 0; i < h; i++) {
277         s    += sq[pix1[0] - pix2[0]];
278         s    += sq[pix1[1] - pix2[1]];
279         s    += sq[pix1[2] - pix2[2]];
280         s    += sq[pix1[3] - pix2[3]];
281         s    += sq[pix1[4] - pix2[4]];
282         s    += sq[pix1[5] - pix2[5]];
283         s    += sq[pix1[6] - pix2[6]];
284         s    += sq[pix1[7] - pix2[7]];
285         pix1 += line_size;
286         pix2 += line_size;
287     }
288     return s;
289 }
290
291 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
292 {
293     int s = 0, i;
294     uint32_t *sq = ff_square_tab + 256;
295
296     for (i = 0; i < h; i++) {
297         s += sq[pix1[0]  - pix2[0]];
298         s += sq[pix1[1]  - pix2[1]];
299         s += sq[pix1[2]  - pix2[2]];
300         s += sq[pix1[3]  - pix2[3]];
301         s += sq[pix1[4]  - pix2[4]];
302         s += sq[pix1[5]  - pix2[5]];
303         s += sq[pix1[6]  - pix2[6]];
304         s += sq[pix1[7]  - pix2[7]];
305         s += sq[pix1[8]  - pix2[8]];
306         s += sq[pix1[9]  - pix2[9]];
307         s += sq[pix1[10] - pix2[10]];
308         s += sq[pix1[11] - pix2[11]];
309         s += sq[pix1[12] - pix2[12]];
310         s += sq[pix1[13] - pix2[13]];
311         s += sq[pix1[14] - pix2[14]];
312         s += sq[pix1[15] - pix2[15]];
313
314         pix1 += line_size;
315         pix2 += line_size;
316     }
317     return s;
318 }
319
/**
 * Store the per-sample difference s1 - s2 of two 8x8 blocks.
 *
 * @param block  output, 64 contiguous coefficients in row-major order
 * @param s1     minuend block
 * @param s2     subtrahend block
 * @param stride row offset in bytes, shared by s1 and s2
 */
static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        int16_t *out = block + row * 8;

        for (col = 0; col < 8; col++)
            out[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
    }
}
340
/**
 * Write an 8x8 block of coefficients to pixels, passing each value
 * through av_clip_uint8() first.
 *
 * @param block     64 contiguous input coefficients, row-major
 * @param pixels    destination, top-left sample
 * @param line_size destination row offset in bytes
 */
static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = av_clip_uint8(block[col]);

        block  += 8;
        pixels += line_size;
    }
}
361
/**
 * Write an 8x8 block of signed coefficients to pixels with a +128 bias.
 * Values below -128 become 0 and values above 127 become 255.
 *
 * @param block     64 contiguous input coefficients, row-major
 * @param pixels    destination, top-left sample
 * @param line_size destination row offset in bytes
 */
static void put_signed_pixels_clamped_c(const int16_t *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            const int coef = block[row * 8 + col];

            pixels[col] = coef < -128 ? 0
                        : coef >  127 ? 255
                        : (uint8_t) (coef + 128);
        }
        pixels += line_size;
    }
}
382
/**
 * Add an 8x8 block of coefficients to pixels without clamping; each result
 * is stored back into a uint8_t and therefore wraps modulo 256.
 *
 * @param pixels    destination, top-left sample, updated in place
 * @param block     64 contiguous coefficients, row-major
 * @param line_size destination row offset in bytes
 */
static void add_pixels8_c(uint8_t *restrict pixels, int16_t *block,
                          int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        const int16_t *coefs = block + row * 8;

        for (col = 0; col < 8; col++)
            pixels[col] += coefs[col];
        pixels += line_size;
    }
}
401
/**
 * Add an 8x8 block of coefficients to pixels, passing each sum through
 * av_clip_uint8() before storing it.
 *
 * @param block     64 contiguous coefficients, row-major
 * @param pixels    destination, top-left sample, updated in place
 * @param line_size destination row offset in bytes
 */
static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = av_clip_uint8(pixels[col] + block[col]);

        block  += 8;
        pixels += line_size;
    }
}
421
/**
 * Sum FFABS() of the 64 coefficients of a block.
 *
 * @param block 64 contiguous coefficients
 * @return accumulated FFABS() values
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    const int16_t *coef = block;
    const int16_t *end  = block + 64;
    int total = 0;

    while (coef < end) {
        total += FFABS(*coef);
        coef++;
    }
    return total;
}
430
/**
 * Set a block 16 bytes wide and h rows tall to a constant value.
 *
 * @param block     top-left byte of the block
 * @param value     byte value to store
 * @param line_size row offset in bytes
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 16);
        row += line_size;
    }
}
440
/**
 * Set a block 8 bytes wide and h rows tall to a constant value.
 *
 * @param block     top-left byte of the block
 * @param value     byte value to store
 * @param line_size row offset in bytes
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 8);
        row += line_size;
    }
}
450
/* Integer averages of two and four values with round-half-up.
 * Every argument is parenthesised so that an argument containing an
 * operator of lower precedence than '+' (such as '|', '&' or '?:') is
 * averaged as a whole instead of being split by the expansion.
 * Arguments are still evaluated exactly once each. */
#define avg2(a, b) (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
453
/**
 * Bilinear interpolation of a block 8 samples wide and h rows tall at a
 * fractional offset given in 1/16 sample units.
 *
 * The four neighbour weights are products of (16 - x16 | x16) and
 * (16 - y16 | y16); the weighted sum plus rounder is shifted right by 8.
 * Each output row reads 9 samples from the current and the next source row.
 *
 * @param dst     destination, top-left sample
 * @param src     source, top-left sample
 * @param stride  row offset in bytes, shared by dst and src
 * @param h       number of rows
 * @param x16     horizontal weight of the right-hand neighbours
 * @param y16     vertical weight of the lower neighbours
 * @param rounder value added before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16);
    const int w_tr = (x16)      * (16 - y16);
    const int w_bl = (16 - x16) * (y16);
    const int w_br = (x16)      * (y16);
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++)
            dst[col] = (w_tl * src[col]   + w_tr * src[col + 1] +
                        w_bl * below[col] + w_br * below[col + 1] +
                        rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
476
/**
 * Motion-compensate a block 8 samples wide and h rows tall whose source
 * position changes per sample.
 *
 * The source coordinate starts at (ox, oy), advances by (dxx, dyx) per
 * output column and by (dxy, dyy) per output row. The upper 16 bits of each
 * coordinate are split into an integer position (bits above shift) and a
 * fraction (the low shift bits). Samples are interpolated bilinearly.
 * When the integer position falls outside [0, width - 1) or
 * [0, height - 1), that axis is clamped with av_clip() and is no longer
 * interpolated.
 *
 * @param dst    destination, top-left sample
 * @param src    source plane, addressed as src[x + y * stride]
 * @param stride row offset in bytes, shared by dst and src
 * @param h      number of output rows
 * @param ox     initial horizontal source coordinate
 * @param oy     initial vertical source coordinate
 * @param dxx    horizontal coordinate step per output column
 * @param dxy    horizontal coordinate step per output row
 * @param dyx    vertical coordinate step per output column
 * @param dyy    vertical coordinate step per output row
 * @param shift  number of fractional bits
 * @param r      value added before the final right shift by 2 * shift
 * @param width  source width used for the bounds test
 * @param height source height used for the bounds test
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
{
    const int s     = 1 << shift;
    const int max_x = width  - 1;
    const int max_y = height - 1;
    int row, col;

    for (row = 0; row < h; row++) {
        uint8_t *out = dst + row * stride;
        int vx = ox;
        int vy = oy;

        for (col = 0; col < 8; col++) { // FIXME: optimize
            const int full_x = vx >> 16;
            const int full_y = vy >> 16;
            const int frac_x = full_x & (s - 1);
            const int frac_y = full_y & (s - 1);
            const int src_x  = full_x >> shift;
            const int src_y  = full_y >> shift;
            /* The unsigned compare rejects negative positions as well. */
            const int x_inside = (unsigned) src_x < max_x;
            const int y_inside = (unsigned) src_y < max_y;
            int index;

            if (x_inside && y_inside) {
                /* full bilinear interpolation */
                index    = src_x + src_y * stride;
                out[col] =
                    ((src[index]              * (s - frac_x) +
                      src[index + 1]          * frac_x) * (s - frac_y) +
                     (src[index + stride]     * (s - frac_x) +
                      src[index + stride + 1] * frac_x) * frac_y +
                     r) >> (shift * 2);
            } else if (x_inside) {
                /* row clamped: interpolate horizontally only */
                index    = src_x + av_clip(src_y, 0, max_y) * stride;
                out[col] =
                    ((src[index]     * (s - frac_x) +
                      src[index + 1] * frac_x) * s +
                     r) >> (shift * 2);
            } else if (y_inside) {
                /* column clamped: interpolate vertically only */
                index    = av_clip(src_x, 0, max_x) + src_y * stride;
                out[col] =
                    ((src[index]          * (s - frac_y) +
                      src[index + stride] * frac_y) * s +
                     r) >> (shift * 2);
            } else {
                /* both clamped: plain copy */
                index    = av_clip(src_x, 0, max_x) +
                           av_clip(src_y, 0, max_y) * stride;
                out[col] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
539
/**
 * Dispatch a put at integer position to the put_pixelsN_8_c routine
 * matching the block width.
 *
 * @param dst    destination
 * @param src    source
 * @param stride row offset in bytes
 * @param width  block width; only 2, 4, 8 and 16 are handled, any other
 *               value does nothing
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    if (width == 2)
        put_pixels2_8_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_8_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_8_c(dst, src, stride, height);
}
558
/**
 * Horizontal interpolation with weights 2:1 on the current sample and its
 * right neighbour. Multiplying by 683 and shifting right by 11
 * approximates the division by 3.
 *
 * @param dst    destination
 * @param src    source; one extra sample to the right of each row is read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int acc = 2 * src[col] + src[col + 1] + 1;

            dst[col] = (acc * 683) >> 11;
        }
        dst += stride;
        src += stride;
    }
}
572
/**
 * Horizontal interpolation with weights 1:2 on the current sample and its
 * right neighbour. Multiplying by 683 and shifting right by 11
 * approximates the division by 3.
 *
 * @param dst    destination
 * @param src    source; one extra sample to the right of each row is read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int acc = src[col] + 2 * src[col + 1] + 1;

            dst[col] = (acc * 683) >> 11;
        }
        dst += stride;
        src += stride;
    }
}
586
/**
 * Vertical interpolation with weights 2:1 on the current sample and the
 * one below it. Multiplying by 683 and shifting right by 11 approximates
 * the division by 3.
 *
 * @param dst    destination
 * @param src    source; one extra row below the block is read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc = 2 * src[col] + below[col] + 1;

            dst[col] = (acc * 683) >> 11;
        }
        dst += stride;
        src += stride;
    }
}
600
/**
 * 2x2 interpolation with weights 4, 3 (current row) and 3, 2 (row below).
 * The weights sum to 12; multiplying by 2731 and shifting right by 15
 * approximates the division by 12.
 *
 * @param dst    destination
 * @param src    source; one extra column and one extra row are read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc = 4 * src[col]   + 3 * src[col + 1] +
                            3 * below[col] + 2 * below[col + 1] + 6;

            dst[col] = (acc * 2731) >> 15;
        }
        dst += stride;
        src += stride;
    }
}
615
/**
 * 2x2 interpolation with weights 3, 2 (current row) and 4, 3 (row below).
 * The weights sum to 12; multiplying by 2731 and shifting right by 15
 * approximates the division by 12.
 *
 * @param dst    destination
 * @param src    source; one extra column and one extra row are read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc = 3 * src[col]   + 2 * src[col + 1] +
                            4 * below[col] + 3 * below[col + 1] + 6;

            dst[col] = (acc * 2731) >> 15;
        }
        dst += stride;
        src += stride;
    }
}
630
/**
 * Vertical interpolation with weights 1:2 on the current sample and the
 * one below it. Multiplying by 683 and shifting right by 11 approximates
 * the division by 3.
 *
 * @param dst    destination
 * @param src    source; one extra row below the block is read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc = src[col] + 2 * below[col] + 1;

            dst[col] = (acc * 683) >> 11;
        }
        dst += stride;
        src += stride;
    }
}
644
/**
 * 2x2 interpolation with weights 3, 4 (current row) and 2, 3 (row below).
 * The weights sum to 12; multiplying by 2731 and shifting right by 15
 * approximates the division by 12.
 *
 * @param dst    destination
 * @param src    source; one extra column and one extra row are read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc = 3 * src[col]   + 4 * src[col + 1] +
                            2 * below[col] + 3 * below[col + 1] + 6;

            dst[col] = (acc * 2731) >> 15;
        }
        dst += stride;
        src += stride;
    }
}
659
/**
 * 2x2 interpolation with weights 2, 3 (current row) and 3, 4 (row below).
 * The weights sum to 12; multiplying by 2731 and shifting right by 15
 * approximates the division by 12.
 *
 * @param dst    destination
 * @param src    source; one extra column and one extra row are read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc = 2 * src[col]   + 3 * src[col + 1] +
                            3 * below[col] + 4 * below[col + 1] + 6;

            dst[col] = (acc * 2731) >> 15;
        }
        dst += stride;
        src += stride;
    }
}
674
/**
 * Dispatch an average at integer position to the avg_pixelsN_8_c routine
 * matching the block width.
 *
 * @param dst    destination, also read as the other averaging operand
 * @param src    source
 * @param stride row offset in bytes
 * @param width  block width; only 2, 4, 8 and 16 are handled, any other
 *               value does nothing
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    if (width == 2)
        avg_pixels2_8_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_8_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_8_c(dst, src, stride, height);
}
693
/**
 * Horizontal 2:1 interpolation (current sample : right neighbour),
 * averaged with rounding into the existing destination sample.
 * Multiplying by 683 and shifting right by 11 approximates the division
 * by 3.
 *
 * @param dst    destination, read and updated in place
 * @param src    source; one extra sample to the right of each row is read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int acc    = 2 * src[col] + src[col + 1] + 1;
            const int interp = (acc * 683) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
708
/**
 * Horizontal 1:2 interpolation (current sample : right neighbour),
 * averaged with rounding into the existing destination sample.
 * Multiplying by 683 and shifting right by 11 approximates the division
 * by 3.
 *
 * @param dst    destination, read and updated in place
 * @param src    source; one extra sample to the right of each row is read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        for (col = 0; col < width; col++) {
            const int acc    = src[col] + 2 * src[col + 1] + 1;
            const int interp = (acc * 683) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
723
/**
 * Vertical 2:1 interpolation (current sample : sample below), averaged
 * with rounding into the existing destination sample. Multiplying by 683
 * and shifting right by 11 approximates the division by 3.
 *
 * @param dst    destination, read and updated in place
 * @param src    source; one extra row below the block is read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc    = 2 * src[col] + below[col] + 1;
            const int interp = (acc * 683) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
738
/**
 * 2x2 interpolation with weights 4, 3 (current row) and 3, 2 (row below),
 * averaged with rounding into the existing destination sample.
 * Multiplying by 2731 and shifting right by 15 approximates the division
 * by 12.
 *
 * @param dst    destination, read and updated in place
 * @param src    source; one extra column and one extra row are read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc    = 4 * src[col]   + 3 * src[col + 1] +
                               3 * below[col] + 2 * below[col + 1] + 6;
            const int interp = (acc * 2731) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
754
/**
 * 2x2 interpolation with weights 3, 2 (current row) and 4, 3 (row below),
 * averaged with rounding into the existing destination sample.
 * Multiplying by 2731 and shifting right by 15 approximates the division
 * by 12.
 *
 * @param dst    destination, read and updated in place
 * @param src    source; one extra column and one extra row are read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc    = 3 * src[col]   + 2 * src[col + 1] +
                               4 * below[col] + 3 * below[col + 1] + 6;
            const int interp = (acc * 2731) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
770
/**
 * Vertical 1:2 interpolation (current sample : sample below), averaged
 * with rounding into the existing destination sample. Multiplying by 683
 * and shifting right by 11 approximates the division by 3.
 *
 * @param dst    destination, read and updated in place
 * @param src    source; one extra row below the block is read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc    = src[col] + 2 * below[col] + 1;
            const int interp = (acc * 683) >> 11;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
785
/**
 * 2x2 interpolation with weights 3, 4 (current row) and 2, 3 (row below),
 * averaged with rounding into the existing destination sample.
 * Multiplying by 2731 and shifting right by 15 approximates the division
 * by 12.
 *
 * @param dst    destination, read and updated in place
 * @param src    source; one extra column and one extra row are read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc    = 3 * src[col]   + 4 * src[col + 1] +
                               2 * below[col] + 3 * below[col + 1] + 6;
            const int interp = (acc * 2731) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
801
/**
 * 2x2 interpolation with weights 2, 3 (current row) and 3, 4 (row below),
 * averaged with rounding into the existing destination sample.
 * Multiplying by 2731 and shifting right by 15 approximates the division
 * by 12.
 *
 * @param dst    destination, read and updated in place
 * @param src    source; one extra column and one extra row are read
 * @param stride row offset in bytes, shared by dst and src
 * @param width  block width in samples
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int row, col;

    for (row = 0; row < height; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int acc    = 2 * src[col]   + 3 * src[col + 1] +
                               3 * below[col] + 4 * below[col + 1] + 6;
            const int interp = (acc * 2731) >> 15;

            dst[col] = (dst[col] + interp + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
817
818 #define QPEL_MC(r, OPNAME, RND, OP)                                           \
819 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src,       \
820                                             int dstStride, int srcStride,     \
821                                             int h)                            \
822 {                                                                             \
823     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
824     int i;                                                                    \
825                                                                               \
826     for (i = 0; i < h; i++) {                                                 \
827         OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
828         OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
829         OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
830         OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
831         OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
832         OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
833         OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
834         OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
835         dst += dstStride;                                                     \
836         src += srcStride;                                                     \
837     }                                                                         \
838 }                                                                             \
839                                                                               \
840 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src,       \
841                                             int dstStride, int srcStride)     \
842 {                                                                             \
843     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
844     const int w = 8;                                                          \
845     int i;                                                                    \
846                                                                               \
847     for (i = 0; i < w; i++) {                                                 \
848         const int src0 = src[0 * srcStride];                                  \
849         const int src1 = src[1 * srcStride];                                  \
850         const int src2 = src[2 * srcStride];                                  \
851         const int src3 = src[3 * srcStride];                                  \
852         const int src4 = src[4 * srcStride];                                  \
853         const int src5 = src[5 * srcStride];                                  \
854         const int src6 = src[6 * srcStride];                                  \
855         const int src7 = src[7 * srcStride];                                  \
856         const int src8 = src[8 * srcStride];                                  \
857         OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
858         OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
859         OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
860         OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
861         OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
862         OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
863         OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
864         OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
865         dst++;                                                                \
866         src++;                                                                \
867     }                                                                         \
868 }                                                                             \
869                                                                               \
/* Horizontal 16-wide pass of the symmetric (20, -6, 3, -1) filter, h rows.   \
 * Each row reads src[0..16] and hands 16 weighted sums to OP(); taps that    \
 * would fall outside that window are mirrored back into it (see dst[0..2]    \
 * and dst[13..15]). The taps sum to 32; scaling, clipping and the store      \
 * are presumably done by OP, a parameter of the enclosing macro that is      \
 * not visible here. cm is not referenced directly in this body; it is        \
 * presumably consumed by the expansion of OP (TODO confirm). */              \
870 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src,      \
871                                              int dstStride, int srcStride,    \
872                                              int h)                           \
873 {                                                                             \
874     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
875     int i;                                                                    \
876                                                                               \
877     for (i = 0; i < h; i++) {                                                 \
878         OP(dst[0],  (src[0]  + src[1])  * 20 - (src[0]  + src[2])  * 6 + (src[1]  + src[3])  * 3 - (src[2]  + src[4]));  \
879         OP(dst[1],  (src[1]  + src[2])  * 20 - (src[0]  + src[3])  * 6 + (src[0]  + src[4])  * 3 - (src[1]  + src[5]));  \
880         OP(dst[2],  (src[2]  + src[3])  * 20 - (src[1]  + src[4])  * 6 + (src[0]  + src[5])  * 3 - (src[0]  + src[6]));  \
881         OP(dst[3],  (src[3]  + src[4])  * 20 - (src[2]  + src[5])  * 6 + (src[1]  + src[6])  * 3 - (src[0]  + src[7]));  \
882         OP(dst[4],  (src[4]  + src[5])  * 20 - (src[3]  + src[6])  * 6 + (src[2]  + src[7])  * 3 - (src[1]  + src[8]));  \
883         OP(dst[5],  (src[5]  + src[6])  * 20 - (src[4]  + src[7])  * 6 + (src[3]  + src[8])  * 3 - (src[2]  + src[9]));  \
884         OP(dst[6],  (src[6]  + src[7])  * 20 - (src[5]  + src[8])  * 6 + (src[4]  + src[9])  * 3 - (src[3]  + src[10])); \
885         OP(dst[7],  (src[7]  + src[8])  * 20 - (src[6]  + src[9])  * 6 + (src[5]  + src[10]) * 3 - (src[4]  + src[11])); \
886         OP(dst[8],  (src[8]  + src[9])  * 20 - (src[7]  + src[10]) * 6 + (src[6]  + src[11]) * 3 - (src[5]  + src[12])); \
887         OP(dst[9],  (src[9]  + src[10]) * 20 - (src[8]  + src[11]) * 6 + (src[7]  + src[12]) * 3 - (src[6]  + src[13])); \
888         OP(dst[10], (src[10] + src[11]) * 20 - (src[9]  + src[12]) * 6 + (src[8]  + src[13]) * 3 - (src[7]  + src[14])); \
889         OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9]  + src[14]) * 3 - (src[8]  + src[15])); \
890         OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9]  + src[16])); \
891         OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
892         OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
893         OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
894         dst += dstStride;                                                     \
895         src += srcStride;                                                     \
896     }                                                                         \
897 }                                                                             \
898                                                                               \
/* Vertical 16-wide pass of the symmetric (20, -6, 3, -1) filter.             \
 * For each of w = 16 columns, loads the 17 samples at rows 0..16 of src      \
 * and hands 16 weighted sums to OP(), one per output row. Taps that would    \
 * fall outside rows 0..16 are mirrored back inside (see rows 0..2 and        \
 * 13..15). OP is a parameter of the enclosing macro, not visible here;       \
 * cm is presumably consumed by its expansion (TODO confirm). */              \
899 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src,      \
900                                              int dstStride, int srcStride)    \
901 {                                                                             \
902     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
903     const int w = 16;                                                         \
904     int i;                                                                    \
905                                                                               \
906     for (i = 0; i < w; i++) {                                                 \
907         const int src0  = src[0  * srcStride];                                \
908         const int src1  = src[1  * srcStride];                                \
909         const int src2  = src[2  * srcStride];                                \
910         const int src3  = src[3  * srcStride];                                \
911         const int src4  = src[4  * srcStride];                                \
912         const int src5  = src[5  * srcStride];                                \
913         const int src6  = src[6  * srcStride];                                \
914         const int src7  = src[7  * srcStride];                                \
915         const int src8  = src[8  * srcStride];                                \
916         const int src9  = src[9  * srcStride];                                \
917         const int src10 = src[10 * srcStride];                                \
918         const int src11 = src[11 * srcStride];                                \
919         const int src12 = src[12 * srcStride];                                \
920         const int src13 = src[13 * srcStride];                                \
921         const int src14 = src[14 * srcStride];                                \
922         const int src15 = src[15 * srcStride];                                \
923         const int src16 = src[16 * srcStride];                                \
924         OP(dst[0  * dstStride], (src0  + src1)  * 20 - (src0  + src2)  * 6 + (src1  + src3)  * 3 - (src2  + src4));  \
925         OP(dst[1  * dstStride], (src1  + src2)  * 20 - (src0  + src3)  * 6 + (src0  + src4)  * 3 - (src1  + src5));  \
926         OP(dst[2  * dstStride], (src2  + src3)  * 20 - (src1  + src4)  * 6 + (src0  + src5)  * 3 - (src0  + src6));  \
927         OP(dst[3  * dstStride], (src3  + src4)  * 20 - (src2  + src5)  * 6 + (src1  + src6)  * 3 - (src0  + src7));  \
928         OP(dst[4  * dstStride], (src4  + src5)  * 20 - (src3  + src6)  * 6 + (src2  + src7)  * 3 - (src1  + src8));  \
929         OP(dst[5  * dstStride], (src5  + src6)  * 20 - (src4  + src7)  * 6 + (src3  + src8)  * 3 - (src2  + src9));  \
930         OP(dst[6  * dstStride], (src6  + src7)  * 20 - (src5  + src8)  * 6 + (src4  + src9)  * 3 - (src3  + src10)); \
931         OP(dst[7  * dstStride], (src7  + src8)  * 20 - (src6  + src9)  * 6 + (src5  + src10) * 3 - (src4  + src11)); \
932         OP(dst[8  * dstStride], (src8  + src9)  * 20 - (src7  + src10) * 6 + (src6  + src11) * 3 - (src5  + src12)); \
933         OP(dst[9  * dstStride], (src9  + src10) * 20 - (src8  + src11) * 6 + (src7  + src12) * 3 - (src6  + src13)); \
934         OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9  + src12) * 6 + (src8  + src13) * 3 - (src7  + src14)); \
935         OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9  + src14) * 3 - (src8  + src15)); \
936         OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9  + src16)); \
937         OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
938         OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
939         OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
940         dst++;                                                                \
941         src++;                                                                \
942     }                                                                         \
943 }                                                                             \
944                                                                               \
/* 8x8 mc10 case: runs the put-variant horizontal low-pass over 8 rows of     \
 * src into the scratch block half (stride 8), then has the OPNAME variant    \
 * of pixels8_l2_8 combine src and half into dst. pixels8_l2_8 is defined     \
 * elsewhere; presumably it averages its two inputs (TODO confirm). */        \
945 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src,                \
946                                    ptrdiff_t stride)                          \
947 {                                                                             \
948     uint8_t half[64];                                                         \
949                                                                               \
950     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);             \
951     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);             \
952 }                                                                             \
953                                                                               \
/* 8x8 mc20 case: the OPNAME variant of the horizontal low-pass is applied    \
 * to 8 rows of src and written straight to dst; src and dst share the        \
 * same stride and no scratch buffer or blending step is involved. */         \
954 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src,                \
955                                    ptrdiff_t stride)                          \
956 {                                                                             \
957     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);             \
958 }                                                                             \
959                                                                               \
/* 8x8 mc30 case: runs the put-variant horizontal low-pass over 8 rows of     \
 * src into the scratch block half (stride 8), then has the OPNAME variant    \
 * of pixels8_l2_8 combine src + 1 (one pixel to the right) and half into     \
 * dst. pixels8_l2_8 is defined elsewhere; presumably it averages its two     \
 * inputs (TODO confirm). */                                                  \
960 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src,                \
961                                    ptrdiff_t stride)                          \
962 {                                                                             \
963     uint8_t half[64];                                                         \
964                                                                               \
965     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);             \
966     OPNAME ## pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8);         \
967 }                                                                             \
968                                                                               \
/* 8x8 mc01 case: copy_block9 first copies 9 rows of src into the local       \
 * buffer full (row pitch 16), the put-variant vertical low-pass of full      \
 * goes into half (stride 8), and the OPNAME variant of pixels8_l2_8 then     \
 * combines full (starting at its first row) with half into dst.              \
 * copy_block9 and pixels8_l2_8 are defined elsewhere; the latter             \
 * presumably averages its two inputs (TODO confirm). */                      \
969 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src,                \
970                                    ptrdiff_t stride)                          \
971 {                                                                             \
972     uint8_t full[16 * 9];                                                     \
973     uint8_t half[64];                                                         \
974                                                                               \
975     copy_block9(full, src, 16, stride, 9);                                    \
976     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);                   \
977     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);                \
978 }                                                                             \
979                                                                               \
/* 8x8 mc02 case: copy_block9 copies 9 rows of src into the local buffer      \
 * full (row pitch 16); the OPNAME variant of the vertical low-pass then      \
 * filters full directly into dst. No blending step is involved. */           \
980 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src,                \
981                                    ptrdiff_t stride)                          \
982 {                                                                             \
983     uint8_t full[16 * 9];                                                     \
984                                                                               \
985     copy_block9(full, src, 16, stride, 9);                                    \
986     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);                   \
987 }                                                                             \
988                                                                               \
/* 8x8 mc03 case: copy_block9 copies 9 rows of src into the local buffer      \
 * full (row pitch 16), the put-variant vertical low-pass of full goes        \
 * into half (stride 8), and the OPNAME variant of pixels8_l2_8 combines      \
 * full + 16 (i.e. starting one row down in full) with half into dst.         \
 * pixels8_l2_8 is defined elsewhere; presumably it averages its two          \
 * inputs (TODO confirm). */                                                  \
989 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src,                \
990                                    ptrdiff_t stride)                          \
991 {                                                                             \
992     uint8_t full[16 * 9];                                                     \
993     uint8_t half[64];                                                         \
994                                                                               \
995     copy_block9(full, src, 16, stride, 9);                                    \
996     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);                   \
997     OPNAME ## pixels8_l2_8(dst, full + 16, half, stride, 16, 8, 8);           \
998 }                                                                             \
999                                                                               \
/* Non-static "old" 8x8 mc11 variant. Builds three filtered planes from a     \
 * 9-row copy of src held in full (row pitch 16): halfH (horizontal           \
 * low-pass, 9 rows), halfV (vertical low-pass of full) and halfHV            \
 * (vertical low-pass of halfH). The OPNAME variant of pixels8_l4_8 then      \
 * combines full, halfH, halfV and halfHV into dst. pixels8_l4_8 is           \
 * defined elsewhere; presumably a four-input average (TODO confirm). */      \
1000 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src,            \
1001                                        ptrdiff_t stride)                      \
1002 {                                                                             \
1003     uint8_t full[16 * 9];                                                     \
1004     uint8_t halfH[72];                                                        \
1005     uint8_t halfV[64];                                                        \
1006     uint8_t halfHV[64];                                                       \
1007                                                                               \
1008     copy_block9(full, src, 16, stride, 9);                                    \
1009     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
1010     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
1011     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
1012     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV,                   \
1013                            stride, 16, 8, 8, 8, 8);                           \
1014 }                                                                             \
1015                                                                               \
/* 8x8 mc11 case. Steps, all on a 9-row copy of src held in full (pitch 16):  \
 *  1. halfH = horizontal low-pass of full, 9 rows, stride 8;                 \
 *  2. halfH is combined in place with full via put pixels8_l2_8 (9 rows);    \
 *  3. halfHV = vertical low-pass of the updated halfH;                       \
 *  4. dst = OPNAME pixels8_l2_8 of halfH and halfHV.                         \
 * pixels8_l2_8 is defined elsewhere; presumably it averages its two          \
 * inputs (TODO confirm). */                                                  \
1016 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src,                \
1017                                    ptrdiff_t stride)                          \
1018 {                                                                             \
1019     uint8_t full[16 * 9];                                                     \
1020     uint8_t halfH[72];                                                        \
1021     uint8_t halfHV[64];                                                       \
1022                                                                               \
1023     copy_block9(full, src, 16, stride, 9);                                    \
1024     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
1025     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
1026     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
1027     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
1028 }                                                                             \
1029                                                                               \
/* Non-static "old" 8x8 mc31 variant. From a 9-row copy of src in full        \
 * (row pitch 16) it builds halfH (horizontal low-pass, 9 rows), halfV        \
 * (vertical low-pass starting at full + 1, one pixel to the right) and       \
 * halfHV (vertical low-pass of halfH). The OPNAME variant of pixels8_l4_8    \
 * then combines full + 1, halfH, halfV and halfHV into dst. pixels8_l4_8     \
 * is defined elsewhere; presumably a four-input average (TODO confirm). */   \
1030 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src,            \
1031                                        ptrdiff_t stride)                      \
1032 {                                                                             \
1033     uint8_t full[16 * 9];                                                     \
1034     uint8_t halfH[72];                                                        \
1035     uint8_t halfV[64];                                                        \
1036     uint8_t halfHV[64];                                                       \
1037                                                                               \
1038     copy_block9(full, src, 16, stride, 9);                                    \
1039     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
1040     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
1041     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
1042     OPNAME ## pixels8_l4_8(dst, full + 1, halfH, halfV, halfHV,               \
1043                            stride, 16, 8, 8, 8, 8);                           \
1044 }                                                                             \
1045                                                                               \
/* 8x8 mc31 case. Steps, all on a 9-row copy of src held in full (pitch 16):  \
 *  1. halfH = horizontal low-pass of full, 9 rows, stride 8;                 \
 *  2. halfH is combined in place with full + 1 (one pixel to the right)      \
 *     via put pixels8_l2_8 over 9 rows;                                      \
 *  3. halfHV = vertical low-pass of the updated halfH;                       \
 *  4. dst = OPNAME pixels8_l2_8 of halfH and halfHV.                         \
 * pixels8_l2_8 is defined elsewhere; presumably it averages its two          \
 * inputs (TODO confirm). */                                                  \
1046 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src,                \
1047                                    ptrdiff_t stride)                          \
1048 {                                                                             \
1049     uint8_t full[16 * 9];                                                     \
1050     uint8_t halfH[72];                                                        \
1051     uint8_t halfHV[64];                                                       \
1052                                                                               \
1053     copy_block9(full, src, 16, stride, 9);                                    \
1054     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
1055     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
1056     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
1057     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
1058 }                                                                             \
1059                                                                               \
/* Non-static "old" 8x8 mc13 variant. From a 9-row copy of src in full        \
 * (row pitch 16) it builds halfH (horizontal low-pass, 9 rows), halfV        \
 * (vertical low-pass of full) and halfHV (vertical low-pass of halfH).       \
 * The OPNAME variant of pixels8_l4_8 then combines full + 16 and             \
 * halfH + 8 (each buffer entered one row down) with halfV and halfHV         \
 * into dst. pixels8_l4_8 is defined elsewhere; presumably a four-input       \
 * average (TODO confirm). */                                                 \
1060 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src,            \
1061                                        ptrdiff_t stride)                      \
1062 {                                                                             \
1063     uint8_t full[16 * 9];                                                     \
1064     uint8_t halfH[72];                                                        \
1065     uint8_t halfV[64];                                                        \
1066     uint8_t halfHV[64];                                                       \
1067                                                                               \
1068     copy_block9(full, src, 16, stride, 9);                                    \
1069     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
1070     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
1071     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
1072     OPNAME ## pixels8_l4_8(dst, full + 16, halfH + 8, halfV, halfHV,          \
1073                            stride, 16, 8, 8, 8, 8);                           \
1074 }                                                                             \
1075                                                                               \
/* 8x8 mc13 case. Steps, all on a 9-row copy of src held in full (pitch 16):  \
 *  1. halfH = horizontal low-pass of full, 9 rows, stride 8;                 \
 *  2. halfH is combined in place with full via put pixels8_l2_8 (9 rows);    \
 *  3. halfHV = vertical low-pass of the updated halfH;                       \
 *  4. dst = OPNAME pixels8_l2_8 of halfH + 8 (halfH entered one row          \
 *     down) and halfHV.                                                      \
 * pixels8_l2_8 is defined elsewhere; presumably it averages its two          \
 * inputs (TODO confirm). */                                                  \
1076 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src,                \
1077                                    ptrdiff_t stride)                          \
1078 {                                                                             \
1079     uint8_t full[16 * 9];                                                     \
1080     uint8_t halfH[72];                                                        \
1081     uint8_t halfHV[64];                                                       \
1082                                                                               \
1083     copy_block9(full, src, 16, stride, 9);                                    \
1084     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
1085     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
1086     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
1087     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
1088 }                                                                             \
1089                                                                               \
/* Non-static "old" 8x8 mc33 variant. From a 9-row copy of src in full        \
 * (row pitch 16) it builds halfH (horizontal low-pass, 9 rows), halfV        \
 * (vertical low-pass starting at full + 1, one pixel to the right) and       \
 * halfHV (vertical low-pass of halfH). The OPNAME variant of pixels8_l4_8    \
 * then combines full + 17 (one row down and one pixel right in full),        \
 * halfH + 8 (one row down in halfH), halfV and halfHV into dst.              \
 * pixels8_l4_8 is defined elsewhere; presumably a four-input average         \
 * (TODO confirm). */                                                         \
1090 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src,            \
1091                                        ptrdiff_t stride)                      \
1092 {                                                                             \
1093     uint8_t full[16 * 9];                                                     \
1094     uint8_t halfH[72];                                                        \
1095     uint8_t halfV[64];                                                        \
1096     uint8_t halfHV[64];                                                       \
1097                                                                               \
1098     copy_block9(full, src, 16, stride, 9);                                    \
1099     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
1100     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
1101     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
1102     OPNAME ## pixels8_l4_8(dst, full + 17, halfH + 8, halfV, halfHV,          \
1103                            stride, 16, 8, 8, 8, 8);                           \
1104 }                                                                             \
1105                                                                               \
/* 8x8 mc33 case. Steps, all on a 9-row copy of src held in full (pitch 16):  \
 *  1. halfH = horizontal low-pass of full, 9 rows, stride 8;                 \
 *  2. halfH is combined in place with full + 1 (one pixel to the right)      \
 *     via put pixels8_l2_8 over 9 rows;                                      \
 *  3. halfHV = vertical low-pass of the updated halfH;                       \
 *  4. dst = OPNAME pixels8_l2_8 of halfH + 8 (halfH entered one row          \
 *     down) and halfHV.                                                      \
 * pixels8_l2_8 is defined elsewhere; presumably it averages its two          \
 * inputs (TODO confirm). */                                                  \
1106 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src,                \
1107                                    ptrdiff_t stride)                          \
1108 {                                                                             \
1109     uint8_t full[16 * 9];                                                     \
1110     uint8_t halfH[72];                                                        \
1111     uint8_t halfHV[64];                                                       \
1112                                                                               \
1113     copy_block9(full, src, 16, stride, 9);                                    \
1114     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
1115     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
1116     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
1117     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
1118 }                                                                             \
1119                                                                               \
/* 8x8 mc21 case: filters 9 rows of src horizontally into halfH (stride 8)    \
 * without an intermediate copy, filters halfH vertically into halfHV, and    \
 * lets the OPNAME variant of pixels8_l2_8 combine halfH and halfHV into      \
 * dst. pixels8_l2_8 is defined elsewhere; presumably it averages its two     \
 * inputs (TODO confirm). */                                                  \
1120 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src,                \
1121                                    ptrdiff_t stride)                          \
1122 {                                                                             \
1123     uint8_t halfH[72];                                                        \
1124     uint8_t halfHV[64];                                                       \
1125                                                                               \
1126     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
1127     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
1128     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
1129 }                                                                             \
1130                                                                               \
/* 8x8 mc23 case: filters 9 rows of src horizontally into halfH (stride 8)    \
 * without an intermediate copy, filters halfH vertically into halfHV, and    \
 * lets the OPNAME variant of pixels8_l2_8 combine halfH + 8 (halfH           \
 * entered one row down) and halfHV into dst. pixels8_l2_8 is defined         \
 * elsewhere; presumably it averages its two inputs (TODO confirm). */        \
1131 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src,                \
1132                                    ptrdiff_t stride)                          \
1133 {                                                                             \
1134     uint8_t halfH[72];                                                        \
1135     uint8_t halfHV[64];                                                       \
1136                                                                               \
1137     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
1138     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
1139     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
1140 }                                                                             \
1141                                                                               \
/* Non-static "old" 8x8 mc12 variant. From a 9-row copy of src in full        \
 * (row pitch 16) it builds halfH (horizontal low-pass, 9 rows), halfV        \
 * (vertical low-pass of full) and halfHV (vertical low-pass of halfH).       \
 * Only halfV and halfHV reach the output: the OPNAME variant of              \
 * pixels8_l2_8 combines those two into dst. pixels8_l2_8 is defined          \
 * elsewhere; presumably it averages its two inputs (TODO confirm). */        \
1142 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src,            \
1143                                        ptrdiff_t stride)                      \
1144 {                                                                             \
1145     uint8_t full[16 * 9];                                                     \
1146     uint8_t halfH[72];                                                        \
1147     uint8_t halfV[64];                                                        \
1148     uint8_t halfHV[64];                                                       \
1149                                                                               \
1150     copy_block9(full, src, 16, stride, 9);                                    \
1151     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
1152     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
1153     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
1154     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);              \
1155 }                                                                             \
1156                                                                               \
/* 8x8 mc12 case. Steps, all on a 9-row copy of src held in full (pitch 16):  \
 *  1. halfH = horizontal low-pass of full, 9 rows, stride 8;                 \
 *  2. halfH is combined in place with full via put pixels8_l2_8 (9 rows);    \
 *  3. the OPNAME variant of the vertical low-pass filters the updated        \
 *     halfH directly into dst.                                               \
 * pixels8_l2_8 is defined elsewhere; presumably it averages its two          \
 * inputs (TODO confirm). */                                                  \
1157 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src,                \
1158                                    ptrdiff_t stride)                          \
1159 {                                                                             \
1160     uint8_t full[16 * 9];                                                     \
1161     uint8_t halfH[72];                                                        \
1162                                                                               \
1163     copy_block9(full, src, 16, stride, 9);                                    \
1164     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
1165     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
1166     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
1167 }                                                                             \
1168                                                                               \
/* Non-static "old" 8x8 mc32 variant. From a 9-row copy of src in full        \
 * (row pitch 16) it builds halfH (horizontal low-pass, 9 rows), halfV        \
 * (vertical low-pass starting at full + 1, one pixel to the right) and       \
 * halfHV (vertical low-pass of halfH). Only halfV and halfHV reach the       \
 * output: the OPNAME variant of pixels8_l2_8 combines those two into dst.    \
 * pixels8_l2_8 is defined elsewhere; presumably it averages its two          \
 * inputs (TODO confirm). */                                                  \
1169 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src,            \
1170                                        ptrdiff_t stride)                      \
1171 {                                                                             \
1172     uint8_t full[16 * 9];                                                     \
1173     uint8_t halfH[72];                                                        \
1174     uint8_t halfV[64];                                                        \
1175     uint8_t halfHV[64];                                                       \
1176                                                                               \
1177     copy_block9(full, src, 16, stride, 9);                                    \
1178     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
1179     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
1180     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
1181     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);              \
1182 }                                                                             \
1183                                                                               \
/* 8x8 mc32 case. Steps, all on a 9-row copy of src held in full (pitch 16):  \
 *  1. halfH = horizontal low-pass of full, 9 rows, stride 8;                 \
 *  2. halfH is combined in place with full + 1 (one pixel to the right)      \
 *     via put pixels8_l2_8 over 9 rows;                                      \
 *  3. the OPNAME variant of the vertical low-pass filters the updated        \
 *     halfH directly into dst.                                               \
 * pixels8_l2_8 is defined elsewhere; presumably it averages its two          \
 * inputs (TODO confirm). */                                                  \
1184 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src,                \
1185                                    ptrdiff_t stride)                          \
1186 {                                                                             \
1187     uint8_t full[16 * 9];                                                     \
1188     uint8_t halfH[72];                                                        \
1189                                                                               \
1190     copy_block9(full, src, 16, stride, 9);                                    \
1191     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
1192     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
1193     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
1194 }                                                                             \
1195                                                                               \
/* 8x8 mc22 case: the put-variant horizontal low-pass filters 9 rows of src   \
 * into halfH (stride 8); the OPNAME variant of the vertical low-pass then    \
 * filters halfH directly into dst. The ninth row of halfH supplies the       \
 * extra sample the vertical pass reads below the 8 output rows. */           \
1196 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src,                \
1197                                    ptrdiff_t stride)                          \
1198 {                                                                             \
1199     uint8_t halfH[72];                                                        \
1200                                                                               \
1201     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
1202     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
1203 }                                                                             \
1204                                                                               \
/* 16x16 mc10 case: runs the put-variant horizontal low-pass over 16 rows     \
 * of src into the scratch block half (stride 16), then has the OPNAME        \
 * variant of pixels16_l2_8 combine src and half into dst. pixels16_l2_8      \
 * is defined elsewhere; presumably it averages its two inputs              \
 * (TODO confirm). */                                                         \
1205 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src,               \
1206                                     ptrdiff_t stride)                         \
1207 {                                                                             \
1208     uint8_t half[256];                                                        \
1209                                                                               \
1210     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
1211     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);          \
1212 }                                                                             \
1213                                                                               \
/* 16x16 mc20 case: the OPNAME variant of the horizontal low-pass is          \
 * applied to 16 rows of src and written straight to dst; src and dst         \
 * share the same stride and no scratch buffer or blending is involved. */    \
1214 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src,               \
1215                                     ptrdiff_t stride)                         \
1216 {                                                                             \
1217     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);           \
1218 }                                                                             \
1219                                                                               \
/* 16x16 mc30 case: runs the put-variant horizontal low-pass over 16 rows     \
 * of src into the scratch block half (stride 16), then has the OPNAME        \
 * variant of pixels16_l2_8 combine src + 1 (one pixel to the right) and      \
 * half into dst. pixels16_l2_8 is defined elsewhere; presumably it           \
 * averages its two inputs (TODO confirm). */                                 \
1220 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src,               \
1221                                     ptrdiff_t stride)                         \
1222 {                                                                             \
1223     uint8_t half[256];                                                        \
1224                                                                               \
1225     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
1226     OPNAME ## pixels16_l2_8(dst, src + 1, half, stride, stride, 16, 16);      \
1227 }                                                                             \
1228                                                                               \
1229 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src,               \
1230                                     ptrdiff_t stride)                         \
1231 {                                                                             \
1232     uint8_t full[24 * 17];                                                    \
1233     uint8_t half[256];                                                        \
1234                                                                               \
1235     copy_block17(full, src, 24, stride, 17);                                  \
1236     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
1237     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);             \
1238 }                                                                             \
1239                                                                               \
1240 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src,               \
1241                                     ptrdiff_t stride)                         \
1242 {                                                                             \
1243     uint8_t full[24 * 17];                                                    \
1244                                                                               \
1245     copy_block17(full, src, 24, stride, 17);                                  \
1246     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);                  \
1247 }                                                                             \
1248                                                                               \
1249 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src,               \
1250                                     ptrdiff_t stride)                         \
1251 {                                                                             \
1252     uint8_t full[24 * 17];                                                    \
1253     uint8_t half[256];                                                        \
1254                                                                               \
1255     copy_block17(full, src, 24, stride, 17);                                  \
1256     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
1257     OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16);        \
1258 }                                                                             \
1259                                                                               \
1260 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src,           \
1261                                         ptrdiff_t stride)                     \
1262 {                                                                             \
1263     uint8_t full[24 * 17];                                                    \
1264     uint8_t halfH[272];                                                       \
1265     uint8_t halfV[256];                                                       \
1266     uint8_t halfHV[256];                                                      \
1267                                                                               \
1268     copy_block17(full, src, 24, stride, 17);                                  \
1269     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1270     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1271     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1272     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV,                  \
1273                             stride, 24, 16, 16, 16, 16);                      \
1274 }                                                                             \
1275                                                                               \
1276 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src,               \
1277                                     ptrdiff_t stride)                         \
1278 {                                                                             \
1279     uint8_t full[24 * 17];                                                    \
1280     uint8_t halfH[272];                                                       \
1281     uint8_t halfHV[256];                                                      \
1282                                                                               \
1283     copy_block17(full, src, 24, stride, 17);                                  \
1284     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1285     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1286     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1287     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1288 }                                                                             \
1289                                                                               \
1290 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src,           \
1291                                         ptrdiff_t stride)                     \
1292 {                                                                             \
1293     uint8_t full[24 * 17];                                                    \
1294     uint8_t halfH[272];                                                       \
1295     uint8_t halfV[256];                                                       \
1296     uint8_t halfHV[256];                                                      \
1297                                                                               \
1298     copy_block17(full, src, 24, stride, 17);                                  \
1299     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1300     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1301     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1302     OPNAME ## pixels16_l4_8(dst, full + 1, halfH, halfV, halfHV,              \
1303                             stride, 24, 16, 16, 16, 16);                      \
1304 }                                                                             \
1305                                                                               \
1306 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src,               \
1307                                     ptrdiff_t stride)                         \
1308 {                                                                             \
1309     uint8_t full[24 * 17];                                                    \
1310     uint8_t halfH[272];                                                       \
1311     uint8_t halfHV[256];                                                      \
1312                                                                               \
1313     copy_block17(full, src, 24, stride, 17);                                  \
1314     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1315     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1316     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1317     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1318 }                                                                             \
1319                                                                               \
1320 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src,           \
1321                                         ptrdiff_t stride)                     \
1322 {                                                                             \
1323     uint8_t full[24 * 17];                                                    \
1324     uint8_t halfH[272];                                                       \
1325     uint8_t halfV[256];                                                       \
1326     uint8_t halfHV[256];                                                      \
1327                                                                               \
1328     copy_block17(full, src, 24, stride, 17);                                  \
1329     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1330     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1331     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1332     OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV,        \
1333                             stride, 24, 16, 16, 16, 16);                      \
1334 }                                                                             \
1335                                                                               \
1336 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src,               \
1337                                     ptrdiff_t stride)                         \
1338 {                                                                             \
1339     uint8_t full[24 * 17];                                                    \
1340     uint8_t halfH[272];                                                       \
1341     uint8_t halfHV[256];                                                      \
1342                                                                               \
1343     copy_block17(full, src, 24, stride, 17);                                  \
1344     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1345     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1346     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1347     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1348 }                                                                             \
1349                                                                               \
1350 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src,           \
1351                                         ptrdiff_t stride)                     \
1352 {                                                                             \
1353     uint8_t full[24 * 17];                                                    \
1354     uint8_t halfH[272];                                                       \
1355     uint8_t halfV[256];                                                       \
1356     uint8_t halfHV[256];                                                      \
1357                                                                               \
1358     copy_block17(full, src, 24, stride, 17);                                  \
1359     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1360     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1361     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1362     OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV,        \
1363                             stride, 24, 16, 16, 16, 16);                      \
1364 }                                                                             \
1365                                                                               \
1366 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src,               \
1367                                     ptrdiff_t stride)                         \
1368 {                                                                             \
1369     uint8_t full[24 * 17];                                                    \
1370     uint8_t halfH[272];                                                       \
1371     uint8_t halfHV[256];                                                      \
1372                                                                               \
1373     copy_block17(full, src, 24, stride, 17);                                  \
1374     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1375     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1376     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1377     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1378 }                                                                             \
1379                                                                               \
1380 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src,               \
1381                                     ptrdiff_t stride)                         \
1382 {                                                                             \
1383     uint8_t halfH[272];                                                       \
1384     uint8_t halfHV[256];                                                      \
1385                                                                               \
1386     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1387     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1388     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1389 }                                                                             \
1390                                                                               \
1391 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src,               \
1392                                     ptrdiff_t stride)                         \
1393 {                                                                             \
1394     uint8_t halfH[272];                                                       \
1395     uint8_t halfHV[256];                                                      \
1396                                                                               \
1397     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1398     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1399     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1400 }                                                                             \
1401                                                                               \
1402 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src,           \
1403                                         ptrdiff_t stride)                     \
1404 {                                                                             \
1405     uint8_t full[24 * 17];                                                    \
1406     uint8_t halfH[272];                                                       \
1407     uint8_t halfV[256];                                                       \
1408     uint8_t halfHV[256];                                                      \
1409                                                                               \
1410     copy_block17(full, src, 24, stride, 17);                                  \
1411     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1412     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1413     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1414     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1415 }                                                                             \
1416                                                                               \
1417 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src,               \
1418                                     ptrdiff_t stride)                         \
1419 {                                                                             \
1420     uint8_t full[24 * 17];                                                    \
1421     uint8_t halfH[272];                                                       \
1422                                                                               \
1423     copy_block17(full, src, 24, stride, 17);                                  \
1424     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1425     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1426     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1427 }                                                                             \
1428                                                                               \
1429 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src,           \
1430                                         ptrdiff_t stride)                     \
1431 {                                                                             \
1432     uint8_t full[24 * 17];                                                    \
1433     uint8_t halfH[272];                                                       \
1434     uint8_t halfV[256];                                                       \
1435     uint8_t halfHV[256];                                                      \
1436                                                                               \
1437     copy_block17(full, src, 24, stride, 17);                                  \
1438     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1439     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1440     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1441     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1442 }                                                                             \
1443                                                                               \
1444 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src,               \
1445                                     ptrdiff_t stride)                         \
1446 {                                                                             \
1447     uint8_t full[24 * 17];                                                    \
1448     uint8_t halfH[272];                                                       \
1449                                                                               \
1450     copy_block17(full, src, 24, stride, 17);                                  \
1451     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1452     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1453     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1454 }                                                                             \
1455                                                                               \
1456 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src,               \
1457                                     ptrdiff_t stride)                         \
1458 {                                                                             \
1459     uint8_t halfH[272];                                                       \
1460                                                                               \
1461     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1462     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1463 }
1464
1465 #define op_avg(a, b)        a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
1466 #define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5])     >> 1)
1467 #define op_put(a, b)        a = cm[((b) + 16) >> 5]
1468 #define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
1469
1470 QPEL_MC(0, put_, _, op_put)
1471 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1472 QPEL_MC(0, avg_, _, op_avg)
1473
1474 #undef op_avg
1475 #undef op_put
1476 #undef op_put_no_rnd
1477
/**
 * Fixed-size entry point: forwards to put_pixels8_8_c() with a row count
 * of 8, using the same stride argument the caller supplied.
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    put_pixels8_8_c(dst, src, stride, rows);
}
1482
/**
 * Fixed-size entry point: forwards to avg_pixels8_8_c() with a row count
 * of 8, using the same stride argument the caller supplied.
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    avg_pixels8_8_c(dst, src, stride, rows);
}
1487
/**
 * Fixed-size entry point: forwards to put_pixels16_8_c() with a row count
 * of 16, using the same stride argument the caller supplied.
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    put_pixels16_8_c(dst, src, stride, rows);
}
1492
/**
 * Fixed-size entry point: forwards to avg_pixels16_8_c() with a row count
 * of 16, using the same stride argument the caller supplied.
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    avg_pixels16_8_c(dst, src, stride, rows);
}
1497
/*
 * The mc00 names of the qpel function families are plain aliases for the
 * fixed-size pixel wrappers. The put_no_rnd names share the put wrappers.
 */

/* 8x8 blocks */
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c

/* 16x16 blocks */
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
1504
1505 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
1506                                   int dstStride, int srcStride, int h)
1507 {
1508     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1509     int i;
1510
1511     for (i = 0; i < h; i++) {
1512         dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
1513         dst[1] = cm[(9 * (src[1] + src[2]) - (src[0]  + src[3]) + 8) >> 4];
1514         dst[2] = cm[(9 * (src[2] + src[3]) - (src[1]  + src[4]) + 8) >> 4];
1515         dst[3] = cm[(9 * (src[3] + src[4]) - (src[2]  + src[5]) + 8) >> 4];
1516         dst[4] = cm[(9 * (src[4] + src[5]) - (src[3]  + src[6]) + 8) >> 4];
1517         dst[5] = cm[(9 * (src[5] + src[6]) - (src[4]  + src[7]) + 8) >> 4];
1518         dst[6] = cm[(9 * (src[6] + src[7]) - (src[5]  + src[8]) + 8) >> 4];
1519         dst[7] = cm[(9 * (src[7] + src[8]) - (src[6]  + src[9]) + 8) >> 4];
1520         dst   += dstStride;
1521         src   += srcStride;
1522     }
1523 }
1524
1525 #if CONFIG_RV40_DECODER
/**
 * RV40 mc33 entry point for 16-wide blocks: forwards to
 * put_pixels16_xy2_8_c() with a row count of 16.
 */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    put_pixels16_xy2_8_c(dst, src, stride, rows);
}
1530
/**
 * RV40 mc33 entry point for 16-wide blocks: forwards to
 * avg_pixels16_xy2_8_c() with a row count of 16.
 */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    avg_pixels16_xy2_8_c(dst, src, stride, rows);
}
1535
/**
 * RV40 mc33 entry point for 8-wide blocks: forwards to
 * put_pixels8_xy2_8_c() with a row count of 8.
 */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    put_pixels8_xy2_8_c(dst, src, stride, rows);
}
1540
/**
 * RV40 mc33 entry point for 8-wide blocks: forwards to
 * avg_pixels8_xy2_8_c() with a row count of 8.
 */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    avg_pixels8_xy2_8_c(dst, src, stride, rows);
}
1545 #endif /* CONFIG_RV40_DECODER */
1546
1547 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
1548                                   int dstStride, int srcStride, int w)
1549 {
1550     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1551     int i;
1552
1553     for (i = 0; i < w; i++) {
1554         const int src_1 = src[-srcStride];
1555         const int src0  = src[0];
1556         const int src1  = src[srcStride];
1557         const int src2  = src[2 * srcStride];
1558         const int src3  = src[3 * srcStride];
1559         const int src4  = src[4 * srcStride];
1560         const int src5  = src[5 * srcStride];
1561         const int src6  = src[6 * srcStride];
1562         const int src7  = src[7 * srcStride];
1563         const int src8  = src[8 * srcStride];
1564         const int src9  = src[9 * srcStride];
1565         dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
1566         dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0  + src3) + 8) >> 4];
1567         dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1  + src4) + 8) >> 4];
1568         dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2  + src5) + 8) >> 4];
1569         dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3  + src6) + 8) >> 4];
1570         dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4  + src7) + 8) >> 4];
1571         dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5  + src8) + 8) >> 4];
1572         dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6  + src9) + 8) >> 4];
1573         src++;
1574         dst++;
1575     }
1576 }
1577
/**
 * mspel mc10: run the horizontal lowpass over 8 rows into a packed 8x8
 * scratch buffer, then pass src and that buffer to put_pixels8_l2_8() to
 * produce dst.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);

    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1585
/**
 * mspel mc20: horizontal lowpass of 8 rows written straight into dst; source
 * and destination use the same stride.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
1590
/**
 * mspel mc30: run the horizontal lowpass over 8 rows into a packed 8x8
 * scratch buffer, then pass src + 1 (the source shifted one sample to the
 * right) and that buffer to put_pixels8_l2_8() to produce dst.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);

    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1598
/**
 * mspel mc02: vertical lowpass of 8 columns written straight into dst; source
 * and destination use the same stride.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int columns = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
1603
/**
 * mspel mc12: combine a vertical-only lowpass of src with a
 * horizontal-then-vertical lowpass of src.
 *
 * The horizontal pass covers 11 rows starting one row above src, so the
 * following vertical pass (which starts at the second row of that buffer)
 * has the rows above and below the block available.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t v_only[8 * 8];
    uint8_t h_rows[11 * 8];
    uint8_t h_then_v[8 * 8];

    /* Vertical-only pass, read directly from the source picture. */
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);

    /* Horizontal pass into packed rows, then vertical pass over those. */
    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(h_then_v, h_rows + 8, 8, 8, 8);

    put_pixels8_l2_8(dst, v_only, h_then_v, stride, 8, 8, 8);
}
1615
/**
 * mspel mc32: combine a vertical-only lowpass of src + 1 (the source shifted
 * one sample to the right) with a horizontal-then-vertical lowpass of src.
 *
 * The horizontal pass covers 11 rows starting one row above src, so the
 * following vertical pass (which starts at the second row of that buffer)
 * has the rows above and below the block available.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t v_only[8 * 8];
    uint8_t h_rows[11 * 8];
    uint8_t h_then_v[8 * 8];

    /* Vertical-only pass, read from the source one column to the right. */
    wmv2_mspel8_v_lowpass(v_only, src + 1, 8, stride, 8);

    /* Horizontal pass into packed rows, then vertical pass over those. */
    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(h_then_v, h_rows + 8, 8, 8, 8);

    put_pixels8_l2_8(dst, v_only, h_then_v, stride, 8, 8, 8);
}
1627
/**
 * mspel mc22: horizontal lowpass over 11 rows (starting one row above src)
 * into a packed scratch buffer, followed by a vertical lowpass of that
 * buffer, starting at its second row, written straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[11 * 8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);

    wmv2_mspel8_v_lowpass(dst, h_rows + 8, stride, 8, 8);
}
1635
/**
 * Sum of absolute differences between two blocks, 16 samples wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference over 16 * h sample pairs
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2,
                              int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - pix2[x]);

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1663
/**
 * Sum of absolute differences between pix1 and the avg2() of each pix2
 * sample with its right-hand neighbour, 16 samples wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; pix2[0] .. pix2[16] are read on each row
 * @param line_size distance between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], pix2[x + 1]));

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1691
/**
 * Sum of absolute differences between pix1 and the avg2() of each pix2
 * sample with the sample one row below it, 16 samples wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; h + 1 rows of it are read
 * @param line_size distance between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg2(pix2[x], below[x]));

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1721
/**
 * Sum of absolute differences between pix1 and the avg4() of each 2x2 group
 * of pix2 samples (the sample, its right-hand neighbour and the two samples
 * below them), 16 samples wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block; 17 samples on each of h + 1 rows are read
 * @param line_size distance between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                           int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        const uint8_t *below = pix2 + line_size;

        for (x = 0; x < 16; x++)
            sad += abs(pix1[x] - avg4(pix2[x], pix2[x + 1], below[x], below[x + 1]));

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1751
/**
 * Sum of absolute differences between two blocks, 8 samples wide.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance between consecutive rows, applied to both blocks
 * @param h         number of rows to compare
 * @return the accumulated absolute difference over 8 * h sample pairs
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2,
                             int line_size, int h)
{
    int sad = 0;
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            sad += abs(pix1[x] - pix2[x]);

        pix1 += line_size;
        pix2 += line_size;
    }

    return sad;
}
1771
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * against pix2 averaged horizontally (x half-pel).
 *
 * Each reference sample is avg2() of pix2[x] and pix2[x + 1], so 9 columns
 * of pix2 are read per row.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    const uint8_t *cur = pix1;
    const uint8_t *ref = pix2;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - avg2(ref[col], ref[col + 1]));
        cur += line_size;
        ref += line_size;
    }
    return sad;
}
1791
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * against pix2 averaged vertically (y half-pel).
 *
 * Each reference sample is avg2() of pix2 on the current row and on the
 * row below it, so h + 1 rows of pix2 are read.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - avg2(upper[col], lower[col]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return sad;
}
1813
/**
 * Sum of absolute differences over an 8-pixel-wide block, comparing pix1
 * against pix2 averaged over each 2x2 neighbourhood (x and y half-pel).
 *
 * Each reference sample is avg4() of pix2 at (x, y), (x + 1, y),
 * (x, y + 1) and (x + 1, y + 1), so 9 columns and h + 1 rows of pix2
 * are read.
 *
 * @param v         unused
 * @param pix1      block being compared
 * @param pix2      reference block
 * @param line_size distance in bytes between consecutive rows
 * @param h         number of rows to compare
 * @return accumulated absolute difference
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
{
    const uint8_t *cur   = pix1;
    const uint8_t *upper = pix2;
    const uint8_t *lower = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(cur[col] - avg4(upper[col], upper[col + 1],
                                       lower[col], lower[col + 1]));
        cur   += line_size;
        upper += line_size;
        lower += line_size;
    }
    return sad;
}
1835
1836 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h)
1837 {
1838     MpegEncContext *c = v;
1839     int score1 = 0, score2 = 0, x, y;
1840
1841     for (y = 0; y < h; y++) {
1842         for (x = 0; x < 16; x++)
1843             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1844         if (y + 1 < h) {
1845             for (x = 0; x < 15; x++)
1846                 score2 += FFABS(s1[x]     - s1[x + stride] -
1847                                 s1[x + 1] + s1[x + stride + 1]) -
1848                           FFABS(s2[x]     - s2[x + stride] -
1849                                 s2[x + 1] + s2[x + stride + 1]);
1850         }
1851         s1 += stride;
1852         s2 += stride;
1853     }
1854
1855     if (c)
1856         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1857     else
1858         return score1 + FFABS(score2) * 8;
1859 }
1860
1861 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h)
1862 {
1863     MpegEncContext *c = v;
1864     int score1 = 0, score2 = 0, x, y;
1865
1866     for (y = 0; y < h; y++) {
1867         for (x = 0; x < 8; x++)
1868             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1869         if (y + 1 < h) {
1870             for (x = 0; x < 7; x++)
1871                 score2 += FFABS(s1[x]     - s1[x + stride] -
1872                                 s1[x + 1] + s1[x + stride + 1]) -
1873                           FFABS(s2[x]     - s2[x + stride] -
1874                                 s2[x + 1] + s2[x + stride + 1]);
1875         }
1876         s1 += stride;
1877         s2 += stride;
1878     }
1879
1880     if (c)
1881         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1882     else
1883         return score1 + FFABS(score2) * 8;
1884 }
1885
1886 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
1887                           int16_t basis[64], int scale)
1888 {
1889     int i;
1890     unsigned int sum = 0;
1891
1892     for (i = 0; i < 8 * 8; i++) {
1893         int b = rem[i] + ((basis[i] * scale +
1894                            (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1895                           (BASIS_SHIFT - RECON_SHIFT));
1896         int w = weight[i];
1897         b >>= RECON_SHIFT;
1898         assert(-512 < b && b < 512);
1899
1900         sum += (w * b) * (w * b) >> 4;
1901     }
1902     return sum >> 2;
1903 }
1904
1905 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
1906 {
1907     int i;
1908
1909     for (i = 0; i < 8 * 8; i++)
1910         rem[i] += (basis[i] * scale +
1911                    (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1912                   (BASIS_SHIFT - RECON_SHIFT);
1913 }
1914
/**
 * Comparison function that ignores its arguments and always reports a
 * score of 0; ff_set_cmp() installs it for FF_CMP_ZERO.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    const int score = 0;

    return score;
}
1919
1920 void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
1921 {
1922     int i;
1923
1924     memset(cmp, 0, sizeof(void *) * 6);
1925
1926     for (i = 0; i < 6; i++) {
1927         switch (type & 0xFF) {
1928         case FF_CMP_SAD:
1929             cmp[i] = c->sad[i];
1930             break;
1931         case FF_CMP_SATD:
1932             cmp[i] = c->hadamard8_diff[i];
1933             break;
1934         case FF_CMP_SSE:
1935             cmp[i] = c->sse[i];
1936             break;
1937         case FF_CMP_DCT:
1938             cmp[i] = c->dct_sad[i];
1939             break;
1940         case FF_CMP_DCT264:
1941             cmp[i] = c->dct264_sad[i];
1942             break;
1943         case FF_CMP_DCTMAX:
1944             cmp[i] = c->dct_max[i];
1945             break;
1946         case FF_CMP_PSNR:
1947             cmp[i] = c->quant_psnr[i];
1948             break;
1949         case FF_CMP_BIT:
1950             cmp[i] = c->bit[i];
1951             break;
1952         case FF_CMP_RD:
1953             cmp[i] = c->rd[i];
1954             break;
1955         case FF_CMP_VSAD:
1956             cmp[i] = c->vsad[i];
1957             break;
1958         case FF_CMP_VSSE:
1959             cmp[i] = c->vsse[i];
1960             break;
1961         case FF_CMP_ZERO:
1962             cmp[i] = zero_cmp;
1963             break;
1964         case FF_CMP_NSSE:
1965             cmp[i] = c->nsse[i];
1966             break;
1967         default:
1968             av_log(NULL, AV_LOG_ERROR,
1969                    "internal error in cmp function selection\n");
1970         }
1971     }
1972 }
1973
1974 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
1975 {
1976     long i;
1977
1978     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1979         long a = *(long *) (src + i);
1980         long b = *(long *) (dst + i);
1981         *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
1982     }
1983     for (; i < w; i++)
1984         dst[i + 0] += src[i + 0];
1985 }
1986
1987 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
1988 {
1989     long i;
1990
1991 #if !HAVE_FAST_UNALIGNED
1992     if ((long) src2 & (sizeof(long) - 1)) {
1993         for (i = 0; i + 7 < w; i += 8) {
1994             dst[i + 0] = src1[i + 0] - src2[i + 0];
1995             dst[i + 1] = src1[i + 1] - src2[i + 1];
1996             dst[i + 2] = src1[i + 2] - src2[i + 2];
1997             dst[i + 3] = src1[i + 3] - src2[i + 3];
1998             dst[i + 4] = src1[i + 4] - src2[i + 4];
1999             dst[i + 5] = src1[i + 5] - src2[i + 5];
2000             dst[i + 6] = src1[i + 6] - src2[i + 6];
2001             dst[i + 7] = src1[i + 7] - src2[i + 7];
2002         }
2003     } else
2004 #endif
2005     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
2006         long a = *(long *) (src1 + i);
2007         long b = *(long *) (src2 + i);
2008         *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
2009                               ((a ^ b ^ pb_80) & pb_80);
2010     }
2011     for (; i < w; i++)
2012         dst[i + 0] = src1[i + 0] - src2[i + 0];
2013 }
2014
/**
 * Reconstruct a row from median-predicted residuals.
 *
 * For each sample the prediction is mid_pred() of the left value, the
 * value from src1 at the same position, and (left + src1[i] - left_top)
 * masked to 8 bits; the residual diff[i] is added with 8-bit wrap-around.
 *
 * @param dst      reconstructed output row
 * @param src1     row used as the second predictor input
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: initial left value; out: last reconstructed value
 * @param left_top in: initial top-left value; out: last value read from src1
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *diff, int w,
                                         int *left, int *left_top)
{
    uint8_t cur        = *left;
    uint8_t above_left = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const int above = src1[pos];
        const int grad  = (cur + above - above_left) & 0xFF;

        cur        = mid_pred(cur, above, grad) + diff[pos];
        above_left = above;
        dst[pos]   = cur;
    }

    *left     = cur;
    *left_top = above_left;
}
2034
/**
 * Compute median-prediction residuals for a row.
 *
 * For each sample the prediction is mid_pred() of the left value, the
 * value from src1 at the same position, and (left + src1[i] - left_top)
 * masked to 8 bits; dst[i] receives src2[i] minus that prediction.
 *
 * @param dst      residual output row
 * @param src1     row used as the second predictor input
 * @param src2     row being predicted
 * @param w        number of samples
 * @param left     in: initial left value; out: last value read from src2
 * @param left_top in: initial top-left value; out: last value read from src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *src2, int w,
                                         int *left, int *left_top)
{
    uint8_t cur        = *left;
    uint8_t above_left = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const int above = src1[pos];
        const int grad  = (cur + above - above_left) & 0xFF;
        const int pred  = mid_pred(cur, above, grad);

        above_left = above;
        cur        = src2[pos];
        dst[pos]   = cur - pred;
    }

    *left     = cur;
    *left_top = above_left;
}
2055
/**
 * Reconstruct a row from left-predicted residuals (running sum).
 *
 * Each output byte is the low 8 bits of the accumulator after adding the
 * corresponding source byte. The accumulator itself is not masked.
 *
 * @param dst output row
 * @param src residuals
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the accumulator after the last sample
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
                                      int w, int acc)
{
    int pos;

    for (pos = 0; pos < w; pos++) {
        acc     += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
2076
/* Byte position of each component inside a 4-byte pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Reconstruct a row of 4-byte pixels from left-predicted residuals, keeping
 * one running sum per component.
 *
 * Each output byte is the low 8 bits of its component's accumulator; the
 * accumulators themselves are not masked.
 *
 * @param dst   output row, 4 bytes per pixel
 * @param src   residuals, 4 bytes per pixel
 * @param w     number of pixels
 * @param red   in: initial red accumulator; out: final value
 * @param green in: initial green accumulator; out: final value
 * @param blue  in: initial blue accumulator; out: final value
 * @param alpha in: initial alpha accumulator; out: final value
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
                                             int w, int *red, int *green,
                                             int *blue, int *alpha)
{
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4 * px;
        uint8_t       *out = dst + 4 * px;

        /* read the whole pixel before writing any of it */
        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
2115
/**
 * Two-output butterfly: o1 = i1 + i2, o2 = i1 - i2.
 * The inputs are evaluated twice. Wrapped in do/while(0) so the macro acts
 * as a single statement, including under an unbraced if/else.
 */
#define BUTTERFLY2(o1, o2, i1, i2)              \
    do {                                        \
        (o1) = (i1) + (i2);                     \
        (o2) = (i1) - (i2);                     \
    } while (0)

/**
 * In-place butterfly: (x, y) becomes (x + y, x - y).
 * Both operands are read into temporaries before either is written.
 */
#define BUTTERFLY1(x, y)                        \
    do {                                        \
        const int bf_a = (x);                   \
        const int bf_b = (y);                   \
        (x) = bf_a + bf_b;                      \
        (y) = bf_a - bf_b;                      \
    } while (0)

/** Final butterfly stage folded into a magnitude sum: |x + y| + |x - y|. */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
2130
/**
 * Sum of absolute 8x8 Hadamard-transformed differences (SATD) between
 * src and dst.
 *
 * The difference block is transformed row-wise with three butterfly
 * stages, then column-wise; the last column stage is folded into the
 * absolute-value accumulation via BUTTERFLYA.
 *
 * @param s      unused (MpegEncContext in the generic signature)
 * @param dst    first block
 * @param src    second block
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 * @return sum of absolute transform coefficients
 */
static int hadamard8_diff8x8_c(/* MpegEncContext */ void *s, uint8_t *dst,
                               uint8_t *src, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int row, col;

    assert(h == 8);

    /* horizontal pass over the pixel differences */
    for (row = 0; row < 8; row++) {
        // FIXME: try pointer walks
        int *t           = temp + 8 * row;
        const uint8_t *p = src + stride * row;
        const uint8_t *q = dst + stride * row;

        BUTTERFLY2(t[0], t[1], p[0] - q[0], p[1] - q[1]);
        BUTTERFLY2(t[2], t[3], p[2] - q[2], p[3] - q[3]);
        BUTTERFLY2(t[4], t[5], p[4] - q[4], p[5] - q[5]);
        BUTTERFLY2(t[6], t[7], p[6] - q[6], p[7] - q[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass; the final stage only contributes magnitudes */
    for (col = 0; col < 8; col++) {
        int *t = temp + col;

        BUTTERFLY1(t[8 * 0], t[8 * 1]);
        BUTTERFLY1(t[8 * 2], t[8 * 3]);
        BUTTERFLY1(t[8 * 4], t[8 * 5]);
        BUTTERFLY1(t[8 * 6], t[8 * 7]);

        BUTTERFLY1(t[8 * 0], t[8 * 2]);
        BUTTERFLY1(t[8 * 1], t[8 * 3]);
        BUTTERFLY1(t[8 * 4], t[8 * 6]);
        BUTTERFLY1(t[8 * 5], t[8 * 7]);

        sum += BUTTERFLYA(t[8 * 0], t[8 * 4]) +
               BUTTERFLYA(t[8 * 1], t[8 * 5]) +
               BUTTERFLYA(t[8 * 2], t[8 * 6]) +
               BUTTERFLYA(t[8 * 3], t[8 * 7]);
    }
    return sum;
}
2182
/**
 * Sum of absolute 8x8 Hadamard transform coefficients of a single block,
 * with one column-0 term removed at the end.
 *
 * The pixels are transformed row-wise with three butterfly stages, then
 * column-wise; the last column stage is folded into the absolute-value
 * accumulation via BUTTERFLYA. Finally the |t0 + t4| term contributed by
 * column 0 is subtracted again (labelled "-mean" in the code).
 *
 * @param s      unused (MpegEncContext in the generic signature)
 * @param src    block to measure
 * @param dummy  unused
 * @param stride distance in bytes between consecutive rows
 * @param h      must be 8
 * @return sum of absolute transform coefficients minus the column-0 term
 */
static int hadamard8_intra8x8_c(/* MpegEncContext */ void *s, uint8_t *src,
                                uint8_t *dummy, int stride, int h)
{
    int temp[64];
    int sum = 0;
    int row, col;

    assert(h == 8);

    /* horizontal pass over the raw pixels */
    for (row = 0; row < 8; row++) {
        // FIXME: try pointer walks
        int *t           = temp + 8 * row;
        const uint8_t *p = src + stride * row;

        BUTTERFLY2(t[0], t[1], p[0], p[1]);
        BUTTERFLY2(t[2], t[3], p[2], p[3]);
        BUTTERFLY2(t[4], t[5], p[4], p[5]);
        BUTTERFLY2(t[6], t[7], p[6], p[7]);

        BUTTERFLY1(t[0], t[2]);
        BUTTERFLY1(t[1], t[3]);
        BUTTERFLY1(t[4], t[6]);
        BUTTERFLY1(t[5], t[7]);

        BUTTERFLY1(t[0], t[4]);
        BUTTERFLY1(t[1], t[5]);
        BUTTERFLY1(t[2], t[6]);
        BUTTERFLY1(t[3], t[7]);
    }

    /* vertical pass; the final stage only contributes magnitudes */
    for (col = 0; col < 8; col++) {
        int *t = temp + col;

        BUTTERFLY1(t[8 * 0], t[8 * 1]);
        BUTTERFLY1(t[8 * 2], t[8 * 3]);
        BUTTERFLY1(t[8 * 4], t[8 * 5]);
        BUTTERFLY1(t[8 * 6], t[8 * 7]);

        BUTTERFLY1(t[8 * 0], t[8 * 2]);
        BUTTERFLY1(t[8 * 1], t[8 * 3]);
        BUTTERFLY1(t[8 * 4], t[8 * 6]);
        BUTTERFLY1(t[8 * 5], t[8 * 7]);

        sum += BUTTERFLYA(t[8 * 0], t[8 * 4]) +
               BUTTERFLYA(t[8 * 1], t[8 * 5]) +
               BUTTERFLYA(t[8 * 2], t[8 * 6]) +
               BUTTERFLYA(t[8 * 3], t[8 * 7]);
    }

    sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean

    return sum;
}
2234
2235 static int dct_sad8x8_c(/* MpegEncContext */ void *c, uint8_t *src1,
2236                         uint8_t *src2, int stride, int h)
2237 {
2238     MpegEncContext *const s = (MpegEncContext *) c;
2239     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2240
2241     assert(h == 8);
2242
2243     s->dsp.diff_pixels(temp, src1, src2, stride);
2244     s->dsp.fdct(temp);
2245     return s->dsp.sum_abs_dctelem(temp);
2246 }
2247
#if CONFIG_GPL
/*
 * One 8-point integer transform pass. The user defines SRC(x) to read
 * input x and DST(x, v) to consume output x. All inputs are read into
 * locals before the first DST, so SRC and DST may refer to the same
 * storage.
 */
#define DCT8_1D                                         \
    {                                                   \
        const int s07 = SRC(0) + SRC(7);                \
        const int s16 = SRC(1) + SRC(6);                \
        const int s25 = SRC(2) + SRC(5);                \
        const int s34 = SRC(3) + SRC(4);                \
        const int a0  = s07 + s34;                      \
        const int a1  = s16 + s25;                      \
        const int a2  = s07 - s34;                      \
        const int a3  = s16 - s25;                      \
        const int d07 = SRC(0) - SRC(7);                \
        const int d16 = SRC(1) - SRC(6);                \
        const int d25 = SRC(2) - SRC(5);                \
        const int d34 = SRC(3) - SRC(4);                \
        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
        DST(0, a0 + a1);                                \
        DST(1, a4 + (a7 >> 2));                         \
        DST(2, a2 + (a3 >> 1));                         \
        DST(3, a5 + (a6 >> 2));                         \
        DST(4, a0 - a1);                                \
        DST(5, a6 - (a5 >> 2));                         \
        DST(6, (a2 >> 1) - a3);                         \
        DST(7, (a4 >> 2) - a7);                         \
    }

/**
 * Sum of absolute coefficients of the DCT8_1D integer transform applied to
 * the difference between two 8x8 blocks.
 *
 * The row pass is done in place; the column pass does not store its
 * outputs but accumulates their absolute values directly.
 *
 * @param c      MpegEncContext providing dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between consecutive rows
 * @param h      unused
 * @return sum of absolute transform coefficients
 */
static int dct264_sad8x8_c(/* MpegEncContext */ void *c, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    MpegEncContext *const ctx = (MpegEncContext *) c;
    int16_t blk[8][8];
    int sum = 0;
    int k;

    ctx->dsp.diff_pixels(blk[0], src1, src2, stride);

    /* row pass: transform each row in place */
#define SRC(x) blk[k][x]
#define DST(x, v) blk[k][x] = v
    for (k = 0; k < 8; k++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* column pass: accumulate |coefficient| instead of storing it */
#define SRC(x) blk[x][k]
#define DST(x, v) sum += FFABS(v)
    for (k = 0; k < 8; k++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return sum;
}
#endif
2302
2303 static int dct_max8x8_c(/* MpegEncContext */ void *c, uint8_t *src1,
2304                         uint8_t *src2, int stride, int h)
2305 {
2306     MpegEncContext *const s = (MpegEncContext *) c;
2307     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2308     int sum = 0, i;
2309
2310     assert(h == 8);
2311
2312     s->dsp.diff_pixels(temp, src1, src2, stride);
2313     s->dsp.fdct(temp);
2314
2315     for (i = 0; i < 64; i++)
2316         sum = FFMAX(sum, FFABS(temp[i]));
2317
2318     return sum;
2319 }
2320
2321 static int quant_psnr8x8_c(/* MpegEncContext */ void *c, uint8_t *src1,
2322                            uint8_t *src2, int stride, int h)
2323 {
2324     MpegEncContext *const s = c;
2325     LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
2326     int16_t *const bak = temp + 64;
2327     int sum = 0, i;
2328
2329     assert(h == 8);
2330     s->mb_intra = 0;
2331
2332     s->dsp.diff_pixels(temp, src1, src2, stride);
2333
2334     memcpy(bak, temp, 64 * sizeof(int16_t));
2335
2336     s->block_last_index[0 /* FIXME */] =
2337         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2338     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2339     ff_simple_idct_8(temp); // FIXME
2340
2341     for (i = 0; i < 64; i++)
2342         sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
2343
2344     return sum;
2345 }
2346
2347 static int rd8x8_c(/* MpegEncContext */ void *c, uint8_t *src1, uint8_t *src2,
2348                    int stride, int h)
2349 {
2350     MpegEncContext *const s  = (MpegEncContext *) c;
2351     const uint8_t *scantable = s->intra_scantable.permutated;
2352     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2353     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2354     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2355     int i, last, run, bits, level, distortion, start_i;
2356     const int esc_length = s->ac_esc_length;
2357     uint8_t *length, *last_length;
2358
2359     assert(h == 8);
2360
2361     copy_block8(lsrc1, src1, 8, stride, 8);
2362     copy_block8(lsrc2, src2, 8, stride, 8);
2363
2364     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2365
2366     s->block_last_index[0 /* FIXME */] =
2367     last                               =
2368         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2369
2370     bits = 0;
2371
2372     if (s->mb_intra) {
2373         start_i     = 1;
2374         length      = s->intra_ac_vlc_length;
2375         last_length = s->intra_ac_vlc_last_length;
2376         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2377     } else {
2378         start_i     = 0;
2379         length      = s->inter_ac_vlc_length;
2380         last_length = s->inter_ac_vlc_last_length;
2381     }
2382
2383     if (last >= start_i) {
2384         run = 0;
2385         for (i = start_i; i < last; i++) {
2386             int j = scantable[i];
2387             level = temp[j];
2388
2389             if (level) {
2390                 level += 64;
2391                 if ((level & (~127)) == 0)
2392                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2393                 else
2394                     bits += esc_length;
2395                 run = 0;
2396             } else
2397                 run++;
2398         }
2399         i = scantable[last];
2400
2401         level = temp[i] + 64;
2402
2403         assert(level - 64);
2404
2405         if ((level & (~127)) == 0) {
2406             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2407         } else
2408             bits += esc_length;
2409     }
2410
2411     if (last >= 0) {
2412         if (s->mb_intra)
2413             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2414         else
2415             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2416     }
2417
2418     s->dsp.idct_add(lsrc2, 8, temp);
2419
2420     distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2421
2422     return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
2423 }
2424
2425 static int bit8x8_c(/* MpegEncContext */ void *c, uint8_t *src1, uint8_t *src2,
2426                     int stride, int h)
2427 {
2428     MpegEncContext *const s  = (MpegEncContext *) c;
2429     const uint8_t *scantable = s->intra_scantable.permutated;
2430     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2431     int i, last, run, bits, level, start_i;
2432     const int esc_length = s->ac_esc_length;
2433     uint8_t *length, *last_length;
2434
2435     assert(h == 8);
2436
2437     s->dsp.diff_pixels(temp, src1, src2, stride);
2438
2439     s->block_last_index[0 /* FIXME */] =
2440     last                               =
2441         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2442
2443     bits = 0;
2444
2445     if (s->mb_intra) {
2446         start_i     = 1;
2447         length      = s->intra_ac_vlc_length;
2448         last_length = s->intra_ac_vlc_last_length;
2449         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2450     } else {
2451         start_i     = 0;
2452         length      = s->inter_ac_vlc_length;
2453         last_length = s->inter_ac_vlc_last_length;
2454     }
2455
2456     if (last >= start_i) {
2457         run = 0;
2458         for (i = start_i; i < last; i++) {
2459             int j = scantable[i];
2460             level = temp[j];
2461
2462             if (level) {
2463                 level += 64;
2464                 if ((level & (~127)) == 0)
2465                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2466                 else
2467                     bits += esc_length;
2468                 run = 0;
2469             } else
2470                 run++;
2471         }
2472         i = scantable[last];
2473
2474         level = temp[i] + 64;
2475
2476         assert(level - 64);
2477
2478         if ((level & (~127)) == 0)
2479             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2480         else
2481             bits += esc_length;
2482     }
2483
2484     return bits;
2485 }
2486
/**
 * Define vsad_intra<size>_c(): sum of absolute differences between each
 * row of a block and the row below it, over rows 0 .. h - 2 and <size>
 * columns. The context and dummy arguments are unused.
 */
#define VSAD_INTRA(size)                                                \
static int vsad_intra ## size ## _c(/* MpegEncContext */ void *c,       \
                                    uint8_t *s, uint8_t *dummy,         \
                                    int stride, int h)                  \
{                                                                       \
    const uint8_t *row = s;                                             \
    int score = 0;                                                      \
    int x, y;                                                           \
                                                                        \
    for (y = 1; y < h; y++, row += stride)                              \
        for (x = 0; x < size; x++)                                      \
            score += FFABS(row[x] - row[x + stride]);                   \
                                                                        \
    return score;                                                       \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
2508
/**
 * Vertical SAD of the difference between two 16-pixel-wide blocks: sums
 * |(s1 - s2) on a row minus (s1 - s2) on the row below| over rows
 * 0 .. h - 2.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows in the block
 * @return accumulated absolute vertical difference
 */
static int vsad16_c(/* MpegEncContext */ void *c, uint8_t *s1, uint8_t *s2,
                    int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] -
                          s1[col + stride] + s2[col + stride];
            total += FFABS(d);
        }
    }

    return total;
}
2523
/** Square of a; the argument is evaluated twice. */
#define SQ(a) ((a) * (a))
/**
 * Define vsse_intra<size>_c(): sum of squared differences between each
 * row of a block and the row below it, over rows 0 .. h - 2 and <size>
 * columns. The context and dummy arguments are unused.
 */
#define VSSE_INTRA(size)                                                \
static int vsse_intra ## size ## _c(/* MpegEncContext */ void *c,       \
                                    uint8_t *s, uint8_t *dummy,         \
                                    int stride, int h)                  \
{                                                                       \
    const uint8_t *row = s;                                             \
    int score = 0;                                                      \
    int x, y;                                                           \
                                                                        \
    for (y = 1; y < h; y++, row += stride)                              \
        for (x = 0; x < size; x++)                                      \
            score += SQ(row[x] - row[x + stride]);                      \
                                                                        \
    return score;                                                       \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
2546
/**
 * Vertical SSE of the difference between two 16-pixel-wide blocks: sums
 * the square of ((s1 - s2) on a row minus (s1 - s2) on the row below) over
 * rows 0 .. h - 2.
 *
 * @param c      unused
 * @param s1     first block
 * @param s2     second block
 * @param stride distance in bytes between consecutive rows
 * @param h      number of rows in the block
 * @return accumulated squared vertical difference
 */
static int vsse16_c(/* MpegEncContext */ void *c, uint8_t *s1, uint8_t *s2,
                    int stride, int h)
{
    int total = 0;
    int row, col;

    for (row = 1; row < h; row++, s1 += stride, s2 += stride) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] -
                          s1[col + stride] + s2[col + stride];
            total += SQ(d);
        }
    }

    return total;
}
2561
/**
 * Sum of squared differences between an int8_t vector and an int16_t
 * vector.
 *
 * @param pix1 first vector
 * @param pix2 second vector
 * @param size number of elements in each vector
 * @return accumulated squared difference
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        total += d * d;
    }
    return total;
}
2571
/**
 * Define name16() as a 16-pixel-wide comparison built from the 8x8
 * function name8(): the two upper 8x8 blocks are always scored, and the
 * two lower ones are added only when h == 16. The calls are made in
 * raster order (left, right, then the next band).
 */
#define WRAPPER8_16_SQ(name8, name16)                                   \
static int name16(void /*MpegEncContext*/ *s,                           \
                  uint8_t *dst, uint8_t *src,                           \
                  int stride, int h)                                    \
{                                                                       \
    int score;                                                          \
                                                                        \
    score  = name8(s, dst, src, stride, 8);                             \
    score += name8(s, dst + 8, src + 8, stride, 8);                     \
    if (h != 16)                                                        \
        return score;                                                   \
                                                                        \
    dst   += 8 * stride;                                                \
    src   += 8 * stride;                                                \
    score += name8(s, dst, src, stride, 8);                             \
    score += name8(s, dst + 8, src + 8, stride, 8);                     \
    return score;                                                       \
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2600
/**
 * Clip one value given as a raw 32-bit pattern, using unsigned integer
 * comparisons only.
 *
 * @param a        value to clip
 * @param mini     pattern returned when a compares above it (unsigned)
 * @param maxi     pattern returned when a with its top bit flipped
 *                 compares above maxisign (unsigned)
 * @param maxisign maxi with its top bit flipped, precomputed by the caller
 * @return mini, maxi, or a unchanged
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U << 31);

    if (a > mini)
        return mini;
    if (flipped > maxisign)
        return maxi;
    return a;
}
2611
/**
 * Clip a float vector by operating on the 32-bit patterns of the values.
 *
 * The bounds are read through min and max as 32-bit unsigned integers and
 * every element is passed through clipf_c_one(). vector_clipf_c() calls
 * this only when min < 0 and max > 0.
 *
 * Elements are handled in groups of 8: a whole group is processed whenever
 * its first index is below len, so len is expected to be a multiple of 8.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src,
                                         float *min, float *max, int len)
{
    const uint32_t lo         = *(uint32_t *) min;
    const uint32_t hi         = *(uint32_t *) max;
    const uint32_t hi_flipped = hi ^ (1U << 31);
    uint32_t *out             = (uint32_t *) dst;
    const uint32_t *in        = (const uint32_t *) src;
    int base, k;

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo, hi, hi_flipped);
}
2633
/**
 * Clip every element of src to the range [min, max] and store it in dst.
 *
 * When the bounds straddle zero (min < 0 and max > 0) the work is handed
 * to vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied to
 * each element.
 *
 * Elements are handled in groups of 8: a whole group is processed whenever
 * its first index is below len, so len is expected to be a multiple of 8.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src,
                           float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
}
2654
/**
 * Compute the dot product of two int16_t vectors.
 *
 * The sum is accumulated in an unsigned integer so that a result exceeding
 * the 32-bit range wraps around modulo 2^32 instead of triggering signed
 * integer overflow, which is undefined behaviour in C. Each individual
 * product of two int16_t values always fits in an int.
 *
 * @param v1    first input vector, order elements
 * @param v2    second input vector, order elements
 * @param order number of elements to process; 0 yields 0
 * @return the (wrapped) sum of v1[i] * v2[i]
 */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
{
    unsigned res = 0;

    while (order--)
        res += *v1++ * *v2++;

    return (int32_t) res;
}
2665
/**
 * Compute the dot product of v1 and v2 while updating v1 in place.
 *
 * For each index the product v1[i] * v2[i] is added to the result using
 * the value of v1[i] from before the update, then v1[i] is incremented by
 * mul * v3[i].
 *
 * The sum is accumulated in an unsigned integer so that a result exceeding
 * the 32-bit range wraps around modulo 2^32 instead of triggering signed
 * integer overflow, which is undefined behaviour in C.
 *
 * @param v1    first input vector, modified in place
 * @param v2    second input vector
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements to process; 0 yields 0
 * @param mul   multiplier applied to the elements of v3
 * @return the (wrapped) sum of v1[i] * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    unsigned res = 0;

    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return (int32_t) res;
}
2678
/**
 * Clip every element of src to [min, max] with av_clip() and store the
 * result in dst.
 *
 * Elements are processed in groups of 8. The first group is always
 * processed, after which len is reduced by 8 and the loop continues while
 * len is non-zero. Since len is unsigned, the loop only ends after the
 * expected number of elements when len is a non-zero multiple of 8.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    int k;

    do {
        for (k = 0; k < 8; k++)
            *dst++ = av_clip(*src++, min, max);
        len -= 8;
    } while (len > 0);
}
2694
/**
 * idct_put callback used when FF_IDCT_INT is selected.
 *
 * Runs ff_j_rev_dct() on block in place, then passes the block to
 * put_pixels_clamped_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size stride of dest, forwarded to put_pixels_clamped_c()
 * @param block     coefficient block, transformed in place
 */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    /* Transform first: the output stage consumes the transformed block. */
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
2700
/**
 * idct_add callback used when FF_IDCT_INT is selected.
 *
 * Runs ff_j_rev_dct() on block in place, then passes the block to
 * add_pixels_clamped_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size stride of dest, forwarded to add_pixels_clamped_c()
 * @param block     coefficient block, transformed in place
 */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    /* Transform first: the output stage consumes the transformed block. */
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2706
2707 /* init static data */
2708 av_cold void ff_dsputil_static_init(void)
2709 {
2710     int i;
2711
2712     for (i = 0; i < 512; i++)
2713         ff_square_tab[i] = (i - 256) * (i - 256);
2714 }
2715
2716 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2717 {
2718 #if CONFIG_ENCODERS
2719     if (avctx->bits_per_raw_sample == 10) {
2720         c->fdct    = ff_jpeg_fdct_islow_10;
2721         c->fdct248 = ff_fdct248_islow_10;
2722     } else {
2723         if (avctx->dct_algo == FF_DCT_FASTINT) {
2724             c->fdct    = ff_fdct_ifast;
2725             c->fdct248 = ff_fdct_ifast248;
2726         } else if (avctx->dct_algo == FF_DCT_FAAN) {
2727             c->fdct    = ff_faandct;
2728             c->fdct248 = ff_faandct248;
2729         } else {
2730             c->fdct    = ff_jpeg_fdct_islow_8; // slow/accurate/default
2731             c->fdct248 = ff_fdct248_islow_8;
2732         }
2733     }
2734 #endif /* CONFIG_ENCODERS */
2735
2736     if (avctx->bits_per_raw_sample == 10) {
2737         c->idct_put              = ff_simple_idct_put_10;
2738         c->idct_add              = ff_simple_idct_add_10;
2739         c->idct                  = ff_simple_idct_10;
2740         c->idct_permutation_type = FF_NO_IDCT_PERM;
2741     } else {
2742         if (avctx->idct_algo == FF_IDCT_INT) {
2743             c->idct_put              = jref_idct_put;
2744             c->idct_add              = jref_idct_add;
2745             c->idct                  = ff_j_rev_dct;
2746             c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2747         } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2748             c->idct_put              = ff_faanidct_put;
2749             c->idct_add              = ff_faanidct_add;
2750             c->idct                  = ff_faanidct;
2751             c->idct_permutation_type = FF_NO_IDCT_PERM;
2752         } else { // accurate/default
2753             c->idct_put              = ff_simple_idct_put_8;
2754             c->idct_add              = ff_simple_idct_add_8;
2755             c->idct                  = ff_simple_idct_8;
2756             c->idct_permutation_type = FF_NO_IDCT_PERM;
2757         }
2758     }
2759
2760     c->diff_pixels = diff_pixels_c;
2761
2762     c->put_pixels_clamped        = put_pixels_clamped_c;
2763     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2764     c->add_pixels_clamped        = add_pixels_clamped_c;
2765
2766     c->sum_abs_dctelem = sum_abs_dctelem_c;
2767
2768     c->gmc1 = gmc1_c;
2769     c->gmc  = ff_gmc_c;
2770
2771     c->pix_sum   = pix_sum_c;
2772     c->pix_norm1 = pix_norm1_c;
2773
2774     c->fill_block_tab[0] = fill_block16_c;
2775     c->fill_block_tab[1] = fill_block8_c;
2776
2777     /* TODO [0] 16  [1] 8 */
2778     c->pix_abs[0][0] = pix_abs16_c;
2779     c->pix_abs[0][1] = pix_abs16_x2_c;
2780     c->pix_abs[0][2] = pix_abs16_y2_c;
2781     c->pix_abs[0][3] = pix_abs16_xy2_c;
2782     c->pix_abs[1][0] = pix_abs8_c;
2783     c->pix_abs[1][1] = pix_abs8_x2_c;
2784     c->pix_abs[1][2] = pix_abs8_y2_c;
2785     c->pix_abs[1][3] = pix_abs8_xy2_c;
2786
2787     c->put_tpel_pixels_tab[0]  = put_tpel_pixels_mc00_c;
2788     c->put_tpel_pixels_tab[1]  = put_tpel_pixels_mc10_c;
2789     c->put_tpel_pixels_tab[2]  = put_tpel_pixels_mc20_c;
2790     c->put_tpel_pixels_tab[4]  = put_tpel_pixels_mc01_c;
2791     c->put_tpel_pixels_tab[5]  = put_tpel_pixels_mc11_c;
2792     c->put_tpel_pixels_tab[6]  = put_tpel_pixels_mc21_c;
2793     c->put_tpel_pixels_tab[8]  = put_tpel_pixels_mc02_c;
2794     c->put_tpel_pixels_tab[9]  = put_tpel_pixels_mc12_c;
2795     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2796
2797     c->avg_tpel_pixels_tab[0]  = avg_tpel_pixels_mc00_c;
2798     c->avg_tpel_pixels_tab[1]  = avg_tpel_pixels_mc10_c;
2799     c->avg_tpel_pixels_tab[2]  = avg_tpel_pixels_mc20_c;
2800     c->avg_tpel_pixels_tab[4]  = avg_tpel_pixels_mc01_c;
2801     c->avg_tpel_pixels_tab[5]  = avg_tpel_pixels_mc11_c;
2802     c->avg_tpel_pixels_tab[6]  = avg_tpel_pixels_mc21_c;
2803     c->avg_tpel_pixels_tab[8]  = avg_tpel_pixels_mc02_c;
2804     c->avg_tpel_pixels_tab[9]  = avg_tpel_pixels_mc12_c;
2805     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2806
2807 #define dspfunc(PFX, IDX, NUM)                              \
2808     c->PFX ## _pixels_tab[IDX][0]  = PFX ## NUM ## _mc00_c; \
2809     c->PFX ## _pixels_tab[IDX][1]  = PFX ## NUM ## _mc10_c; \
2810     c->PFX ## _pixels_tab[IDX][2]  = PFX ## NUM ## _mc20_c; \
2811     c->PFX ## _pixels_tab[IDX][3]  = PFX ## NUM ## _mc30_c; \
2812     c->PFX ## _pixels_tab[IDX][4]  = PFX ## NUM ## _mc01_c; \
2813     c->PFX ## _pixels_tab[IDX][5]  = PFX ## NUM ## _mc11_c; \
2814     c->PFX ## _pixels_tab[IDX][6]  = PFX ## NUM ## _mc21_c; \
2815     c->PFX ## _pixels_tab[IDX][7]  = PFX ## NUM ## _mc31_c; \
2816     c->PFX ## _pixels_tab[IDX][8]  = PFX ## NUM ## _mc02_c; \
2817     c->PFX ## _pixels_tab[IDX][9]  = PFX ## NUM ## _mc12_c; \
2818     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2819     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2820     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2821     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2822     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2823     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2824
2825     dspfunc(put_qpel, 0, 16);
2826     dspfunc(put_qpel, 1, 8);
2827
2828     dspfunc(put_no_rnd_qpel, 0, 16);
2829     dspfunc(put_no_rnd_qpel, 1, 8);
2830
2831     dspfunc(avg_qpel, 0, 16);
2832     dspfunc(avg_qpel, 1, 8);
2833
2834 #undef dspfunc
2835
2836     c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
2837     c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
2838     c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
2839     c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
2840     c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
2841     c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
2842     c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
2843     c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
2844
2845 #define SET_CMP_FUNC(name)                      \
2846     c->name[0] = name ## 16_c;                  \
2847     c->name[1] = name ## 8x8_c;
2848
2849     SET_CMP_FUNC(hadamard8_diff)
2850     c->hadamard8_diff[4] = hadamard8_intra16_c;
2851     c->hadamard8_diff[5] = hadamard8_intra8x8_c;
2852     SET_CMP_FUNC(dct_sad)
2853     SET_CMP_FUNC(dct_max)
2854 #if CONFIG_GPL
2855     SET_CMP_FUNC(dct264_sad)
2856 #endif
2857     c->sad[0] = pix_abs16_c;
2858     c->sad[1] = pix_abs8_c;
2859     c->sse[0] = sse16_c;
2860     c->sse[1] = sse8_c;
2861     c->sse[2] = sse4_c;
2862     SET_CMP_FUNC(quant_psnr)
2863     SET_CMP_FUNC(rd)
2864     SET_CMP_FUNC(bit)
2865     c->vsad[0] = vsad16_c;
2866     c->vsad[4] = vsad_intra16_c;
2867     c->vsad[5] = vsad_intra8_c;
2868     c->vsse[0] = vsse16_c;
2869     c->vsse[4] = vsse_intra16_c;
2870     c->vsse[5] = vsse_intra8_c;
2871     c->nsse[0] = nsse16_c;
2872     c->nsse[1] = nsse8_c;
2873
2874     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2875
2876     c->add_bytes                      = add_bytes_c;
2877     c->add_hfyu_median_prediction     = add_hfyu_median_prediction_c;
2878     c->add_hfyu_left_prediction       = add_hfyu_left_prediction_c;
2879     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2880
2881     c->diff_bytes                 = diff_bytes_c;
2882     c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
2883
2884     c->bswap_buf   = bswap_buf;
2885     c->bswap16_buf = bswap16_buf;
2886
2887     c->try_8x8basis = try_8x8basis_c;
2888     c->add_8x8basis = add_8x8basis_c;
2889
2890     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2891
2892     c->scalarproduct_int16 = scalarproduct_int16_c;
2893     c->vector_clip_int32   = vector_clip_int32_c;
2894     c->vector_clipf        = vector_clipf_c;
2895
2896     c->shrink[0] = av_image_copy_plane;
2897     c->shrink[1] = ff_shrink22;
2898     c->shrink[2] = ff_shrink44;
2899     c->shrink[3] = ff_shrink88;
2900
2901     c->add_pixels8 = add_pixels8_c;
2902
2903 #undef FUNC
2904 #undef FUNCC
2905 #define FUNC(f,  depth) f ## _ ## depth
2906 #define FUNCC(f, depth) f ## _ ## depth ## _c
2907
2908     c->draw_edges = FUNCC(draw_edges, 8);
2909
2910     c->clear_block  = FUNCC(clear_block, 8);
2911     c->clear_blocks = FUNCC(clear_blocks, 8);
2912
2913 #define BIT_DEPTH_FUNCS(depth)                  \
2914     c->get_pixels = FUNCC(get_pixels, depth);
2915
2916     switch (avctx->bits_per_raw_sample) {
2917     case 9:
2918     case 10:
2919         BIT_DEPTH_FUNCS(16);
2920         break;
2921     default:
2922         BIT_DEPTH_FUNCS(8);
2923         break;
2924     }
2925
2926     if (ARCH_ARM)
2927         ff_dsputil_init_arm(c, avctx);
2928     if (ARCH_BFIN)
2929         ff_dsputil_init_bfin(c, avctx);
2930     if (ARCH_PPC)
2931         ff_dsputil_init_ppc(c, avctx);
2932     if (ARCH_X86)
2933         ff_dsputil_init_x86(c, avctx);
2934
2935     ff_init_scantable_permutation(c->idct_permutation,
2936                                   c->idct_permutation_type);
2937 }