]> git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c
dsputil: Move hpel_template #include out of dsputil_template
[ffmpeg] / libavcodec / dsputil.c
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7  *
8  * This file is part of Libav.
9  *
10  * Libav is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * Libav is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with Libav; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24
25 /**
26  * @file
27  * DSP utils
28  */
29
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "avcodec.h"
33 #include "copy_block.h"
34 #include "dct.h"
35 #include "dsputil.h"
36 #include "simple_idct.h"
37 #include "faandct.h"
38 #include "faanidct.h"
39 #include "imgconvert.h"
40 #include "mathops.h"
41 #include "mpegvideo.h"
42 #include "config.h"
43
/* Lookup table read by the SSE/norm helpers in this file through
 * "ff_square_tab + 256", i.e. indexed with values in [-256, 255].
 * It is zero-initialized here; presumably it is filled with squares by the
 * init code elsewhere -- confirm against the initializer. */
uint32_t ff_square_tab[512] = { 0 };
45
46 #define BIT_DEPTH 16
47 #include "dsputil_template.c"
48 #undef BIT_DEPTH
49
50 #define BIT_DEPTH 8
51 #include "hpel_template.c"
52 #include "tpel_template.c"
53 #include "dsputil_template.c"
54
/* The byte 0x7f (resp. 0x80) replicated into every byte of an unsigned long:
 * 0x7f7f7f7f when unsigned long is 32 bits wide, 0x7f7f7f7f7f7f7f7f when it
 * is 64 bits wide. */
#define pb_7f (~0UL / 0xff * 0x7f)
#define pb_80 (~0UL / 0xff * 0x80)
58
/*
 * Zigzag scan used with the 248 IDCT.
 * NOTE: unlike the specification, the two fields are interleaved here.
 * The 64 entries are a permutation of 0..63; each source row below holds
 * eight consecutive scan positions.
 */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10, /* scan positions  0.. 7 */
    17, 25, 32, 40, 48, 56, 33, 41, /* scan positions  8..15 */
    18, 26,  3, 11,  4, 12, 19, 27, /* scan positions 16..23 */
    34, 42, 49, 57, 50, 58, 35, 43, /* scan positions 24..31 */
    20, 28,  5, 13,  6, 14, 21, 29, /* scan positions 32..39 */
    36, 44, 51, 59, 52, 60, 37, 45, /* scan positions 40..47 */
    22, 30,  7, 15, 23, 31, 38, 46, /* scan positions 48..55 */
    53, 61, 54, 62, 39, 47, 55, 63, /* scan positions 56..63 */
};
71
/* Alternate horizontal scan order: entry i is the coefficient index (0..63)
 * visited at scan position i. The table is a permutation of 0..63. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17, /* scan positions  0.. 7 */
    10, 11,  4,  5,  6,  7, 15, 14, /* scan positions  8..15 */
    13, 12, 19, 18, 24, 25, 32, 33, /* scan positions 16..23 */
    26, 27, 20, 21, 22, 23, 28, 29, /* scan positions 24..31 */
    30, 31, 34, 35, 40, 41, 48, 49, /* scan positions 32..39 */
    42, 43, 36, 37, 38, 39, 44, 45, /* scan positions 40..47 */
    46, 47, 50, 51, 56, 57, 58, 59, /* scan positions 48..55 */
    52, 53, 54, 55, 60, 61, 62, 63, /* scan positions 56..63 */
};
82
/* Alternate vertical scan order: entry i is the coefficient index (0..63)
 * visited at scan position i. The table is a permutation of 0..63. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10, /* scan positions  0.. 7 */
    17, 25, 32, 40, 48, 56, 57, 49, /* scan positions  8..15 */
    41, 33, 26, 18,  3, 11,  4, 12, /* scan positions 16..23 */
    19, 27, 34, 42, 50, 58, 35, 43, /* scan positions 24..31 */
    51, 59, 20, 28,  5, 13,  6, 14, /* scan positions 32..39 */
    21, 29, 36, 44, 52, 60, 37, 45, /* scan positions 40..47 */
    53, 61, 22, 30,  7, 15, 23, 31, /* scan positions 48..55 */
    38, 46, 54, 62, 39, 47, 55, 63, /* scan positions 56..63 */
};
93
/* Input permutation for the simple_idct_mmx: entry i is the position that
 * coefficient i is moved to. Copied verbatim into the IDCT permutation when
 * FF_SIMPLE_IDCT_PERM is selected. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, /* inputs  0.. 7 */
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, /* inputs  8..15 */
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, /* inputs 16..23 */
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, /* inputs 24..31 */
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, /* inputs 32..39 */
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, /* inputs 40..47 */
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, /* inputs 48..55 */
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F, /* inputs 56..63 */
};
105
/* Column reordering applied inside every row of eight coefficients when
 * FF_SSE2_IDCT_PERM is selected (indexed by the low three bits of the
 * coefficient index). */
static const uint8_t idct_sse2_row_perm[8] = {
    0, 4, 1, 5, 2, 6, 3, 7,
};
107
108 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
109                                const uint8_t *src_scantable)
110 {
111     int i, end;
112
113     st->scantable = src_scantable;
114
115     for (i = 0; i < 64; i++) {
116         int j = src_scantable[i];
117         st->permutated[i] = permutation[j];
118     }
119
120     end = -1;
121     for (i = 0; i < 64; i++) {
122         int j = st->permutated[i];
123         if (j > end)
124             end = j;
125         st->raster_end[i] = end;
126     }
127 }
128
129 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
130                                            int idct_permutation_type)
131 {
132     int i;
133
134     switch (idct_permutation_type) {
135     case FF_NO_IDCT_PERM:
136         for (i = 0; i < 64; i++)
137             idct_permutation[i] = i;
138         break;
139     case FF_LIBMPEG2_IDCT_PERM:
140         for (i = 0; i < 64; i++)
141             idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
142         break;
143     case FF_SIMPLE_IDCT_PERM:
144         for (i = 0; i < 64; i++)
145             idct_permutation[i] = simple_mmx_permutation[i];
146         break;
147     case FF_TRANSPOSE_IDCT_PERM:
148         for (i = 0; i < 64; i++)
149             idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
150         break;
151     case FF_PARTTRANS_IDCT_PERM:
152         for (i = 0; i < 64; i++)
153             idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
154         break;
155     case FF_SSE2_IDCT_PERM:
156         for (i = 0; i < 64; i++)
157             idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
158         break;
159     default:
160         av_log(NULL, AV_LOG_ERROR,
161                "Internal error, IDCT permutation not set\n");
162     }
163 }
164
/**
 * Sum the 256 samples of a 16x16 block of 8-bit pixels.
 *
 * @param pix       pointer to the top-left sample of the block
 * @param line_size distance in bytes between the starts of two rows
 * @return sum of all 16 * 16 samples
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
185
186 static int pix_norm1_c(uint8_t *pix, int line_size)
187 {
188     int s = 0, i, j;
189     uint32_t *sq = ff_square_tab + 256;
190
191     for (i = 0; i < 16; i++) {
192         for (j = 0; j < 16; j += 8) {
193 #if 0
194             s += sq[pix[0]];
195             s += sq[pix[1]];
196             s += sq[pix[2]];
197             s += sq[pix[3]];
198             s += sq[pix[4]];
199             s += sq[pix[5]];
200             s += sq[pix[6]];
201             s += sq[pix[7]];
202 #else
203 #if HAVE_FAST_64BIT
204             register uint64_t x = *(uint64_t *) pix;
205             s += sq[x         & 0xff];
206             s += sq[(x >>  8) & 0xff];
207             s += sq[(x >> 16) & 0xff];
208             s += sq[(x >> 24) & 0xff];
209             s += sq[(x >> 32) & 0xff];
210             s += sq[(x >> 40) & 0xff];
211             s += sq[(x >> 48) & 0xff];
212             s += sq[(x >> 56) & 0xff];
213 #else
214             register uint32_t x = *(uint32_t *) pix;
215             s += sq[x         & 0xff];
216             s += sq[(x >>  8) & 0xff];
217             s += sq[(x >> 16) & 0xff];
218             s += sq[(x >> 24) & 0xff];
219             x  = *(uint32_t *) (pix + 4);
220             s += sq[x         & 0xff];
221             s += sq[(x >>  8) & 0xff];
222             s += sq[(x >> 16) & 0xff];
223             s += sq[(x >> 24) & 0xff];
224 #endif
225 #endif
226             pix += 8;
227         }
228         pix += line_size - 16;
229     }
230     return s;
231 }
232
/**
 * Apply av_bswap32() to w 32-bit words, reading from src and writing to dst.
 *
 * @param dst destination array of at least w words
 * @param src source array of at least w words
 * @param w   number of words to process; nothing is done when w <= 0
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int n;

    for (n = 0; n < w; n++)
        dst[n] = av_bswap32(src[n]);
}
250
/**
 * Apply av_bswap16() to len 16-bit words, reading from src and writing to
 * dst.
 *
 * @param dst destination array of at least len words
 * @param src source array of at least len words
 * @param len number of words to process; must not be negative
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    for (; len != 0; len--) {
        *dst = av_bswap16(*src);
        dst++;
        src++;
    }
}
256
257 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
258                   int line_size, int h)
259 {
260     int s = 0, i;
261     uint32_t *sq = ff_square_tab + 256;
262
263     for (i = 0; i < h; i++) {
264         s    += sq[pix1[0] - pix2[0]];
265         s    += sq[pix1[1] - pix2[1]];
266         s    += sq[pix1[2] - pix2[2]];
267         s    += sq[pix1[3] - pix2[3]];
268         pix1 += line_size;
269         pix2 += line_size;
270     }
271     return s;
272 }
273
274 static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
275                   int line_size, int h)
276 {
277     int s = 0, i;
278     uint32_t *sq = ff_square_tab + 256;
279
280     for (i = 0; i < h; i++) {
281         s    += sq[pix1[0] - pix2[0]];
282         s    += sq[pix1[1] - pix2[1]];
283         s    += sq[pix1[2] - pix2[2]];
284         s    += sq[pix1[3] - pix2[3]];
285         s    += sq[pix1[4] - pix2[4]];
286         s    += sq[pix1[5] - pix2[5]];
287         s    += sq[pix1[6] - pix2[6]];
288         s    += sq[pix1[7] - pix2[7]];
289         pix1 += line_size;
290         pix2 += line_size;
291     }
292     return s;
293 }
294
295 static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
296                    int line_size, int h)
297 {
298     int s = 0, i;
299     uint32_t *sq = ff_square_tab + 256;
300
301     for (i = 0; i < h; i++) {
302         s += sq[pix1[0]  - pix2[0]];
303         s += sq[pix1[1]  - pix2[1]];
304         s += sq[pix1[2]  - pix2[2]];
305         s += sq[pix1[3]  - pix2[3]];
306         s += sq[pix1[4]  - pix2[4]];
307         s += sq[pix1[5]  - pix2[5]];
308         s += sq[pix1[6]  - pix2[6]];
309         s += sq[pix1[7]  - pix2[7]];
310         s += sq[pix1[8]  - pix2[8]];
311         s += sq[pix1[9]  - pix2[9]];
312         s += sq[pix1[10] - pix2[10]];
313         s += sq[pix1[11] - pix2[11]];
314         s += sq[pix1[12] - pix2[12]];
315         s += sq[pix1[13] - pix2[13]];
316         s += sq[pix1[14] - pix2[14]];
317         s += sq[pix1[15] - pix2[15]];
318
319         pix1 += line_size;
320         pix2 += line_size;
321     }
322     return s;
323 }
324
/**
 * Store the sample-wise difference s1 - s2 of two 8x8 pixel blocks.
 *
 * @param block  output, 64 coefficients stored row by row
 * @param s1     top-left sample of the minuend block
 * @param s2     top-left sample of the subtrahend block
 * @param stride row stride in bytes, shared by s1 and s2
 */
static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[8 * row + col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
    }
}
345
/**
 * Write an 8x8 block of coefficients to pixels, passing every coefficient
 * through av_clip_uint8().
 *
 * @param block     64 input coefficients stored row by row
 * @param pixels    top-left sample of the destination block
 * @param line_size destination row stride in bytes
 */
static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int row, col;

    /* write the pixels */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = av_clip_uint8(block[col]);

        block  += 8;
        pixels += line_size;
    }
}
366
/**
 * Write an 8x8 block of signed coefficients to pixels with an offset of 128:
 * values below -128 become 0, values above 127 become 255, everything else
 * is stored as value + 128.
 *
 * @param block     64 input coefficients stored row by row
 * @param pixels    top-left sample of the destination block
 * @param line_size destination row stride in bytes
 */
static void put_signed_pixels_clamped_c(const int16_t *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            const int v = block[8 * row + col];

            pixels[col] = v < -128 ? 0
                        : v >  127 ? 255
                        : (uint8_t) (v + 128);
        }
        pixels += line_size;
    }
}
387
/**
 * Add an 8x8 block of coefficients to pixels without clamping; each result
 * is truncated to 8 bits when stored.
 *
 * @param pixels    top-left sample of the block to update in place
 * @param block     64 coefficients stored row by row
 * @param line_size pixel row stride in bytes
 */
static void add_pixels8_c(uint8_t *restrict pixels, int16_t *block,
                          int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[8 * row + col];
        pixels += line_size;
    }
}
406
/**
 * Add an 8x8 block of coefficients to pixels, storing av_clip_uint8() of
 * every sum.
 *
 * @param block     64 coefficients stored row by row
 * @param pixels    top-left sample of the block to update in place
 * @param line_size pixel row stride in bytes
 */
static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int row, col;

    /* update the pixels in place */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = av_clip_uint8(pixels[col] + block[col]);

        block  += 8;
        pixels += line_size;
    }
}
426
/**
 * Sum the absolute values of the 64 coefficients of a block.
 *
 * @param block 64 coefficients
 * @return sum of FFABS() over all coefficients
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    const int16_t *coef = block;
    const int16_t *const end = block + 64;
    int total = 0;

    for (; coef < end; coef++)
        total += FFABS(*coef);
    return total;
}
435
/**
 * Set a block 16 bytes wide and h rows high to a constant value.
 *
 * @param block     top-left byte of the block
 * @param value     byte value to store
 * @param line_size row stride in bytes
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int row;

    for (row = 0; row < h; row++, block += line_size)
        memset(block, value, 16);
}
445
/**
 * Set a block 8 bytes wide and h rows high to a constant value.
 *
 * @param block     top-left byte of the block
 * @param value     byte value to store
 * @param line_size row stride in bytes
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int row;

    for (row = 0; row < h; row++, block += line_size)
        memset(block, value, 8);
}
455
/* Integer averages of two and four values, rounding halves up.
 * Every argument is parenthesized so that expression arguments such as
 * "x & 3" or "c ? a : b" are evaluated as a unit instead of being torn apart
 * by the precedence of '+'. */
#define avg2(a, b) (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
458
/**
 * Bilinear interpolation of a block 8 pixels wide and h rows high at a
 * fixed sub-pixel offset given in 1/16 pixel units.
 *
 * Each output sample is the weighted sum of the four neighbouring source
 * samples (weights add up to 256) plus rounder, shifted right by 8. The
 * source is read up to column 8 and row h.
 *
 * @param dst     top-left sample of the destination block
 * @param src     top-left sample of the source block
 * @param stride  row stride in bytes, shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal offset in 1/16 pixel
 * @param y16     vertical offset in 1/16 pixel
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16); /* top-left weight     */
    const int w_tr = (x16)      * (16 - y16); /* top-right weight    */
    const int w_bl = (16 - x16) * (y16);      /* bottom-left weight  */
    const int w_br = (x16)      * (y16);      /* bottom-right weight */
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            dst[col] = (w_tl * src[col]          + w_tr * src[col + 1] +
                        w_bl * src[stride + col] + w_br * src[stride + col + 1] +
                        rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
481
/**
 * Global motion compensation of a block 8 pixels wide and h rows high.
 *
 * For every output sample a source position is tracked in vx/vy. The upper
 * bits (after >> 16) hold the position with "shift" fractional bits; those
 * fractional bits select the bilinear weights. Positions whose right or
 * bottom neighbour would fall outside the picture are clipped to the
 * picture and interpolated in one direction only, or not at all.
 *
 * @param dst    top-left sample of the destination block
 * @param src    top-left sample of the source picture
 * @param stride row stride in bytes, shared by dst and src
 * @param h      number of rows to produce
 * @param ox     horizontal source position of the first sample
 * @param oy     vertical source position of the first sample
 * @param dxx    change of the horizontal position per output column
 * @param dxy    change of the horizontal position per output row
 * @param dyx    change of the vertical position per output column
 * @param dyy    change of the vertical position per output row
 * @param shift  number of fractional bits of the sample position
 * @param r      rounding value added before the final shift by 2 * shift
 * @param width  source width in samples
 * @param height source height in samples
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
{
    const int one       = 1 << shift;
    const int frac_mask = one - 1;
    /* Last column/row usable as the top-left corner of a 2x2 neighbourhood. */
    const int max_x     = width  - 1;
    const int max_y     = height - 1;
    int row;

    for (row = 0; row < h; row++) {
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) { // FIXME: optimize
            const int out    = row * stride + col;
            const int pos_x  = vx >> 16;
            const int pos_y  = vy >> 16;
            const int frac_x = pos_x & frac_mask;
            const int frac_y = pos_y & frac_mask;
            const int src_x  = pos_x >> shift;
            const int src_y  = pos_y >> shift;
            /* The unsigned casts make negative coordinates fail the test. */
            const int x_ok   = (unsigned) src_x < max_x;
            const int y_ok   = (unsigned) src_y < max_y;
            int index;

            if (x_ok && y_ok) {
                /* Fully inside: bilinear blend of four samples. */
                int top, bottom;

                index  = src_x + src_y * stride;
                top    = src[index]              * (one - frac_x) +
                         src[index + 1]          * frac_x;
                bottom = src[index + stride]     * (one - frac_x) +
                         src[index + stride + 1] * frac_x;
                dst[out] = (top * (one - frac_y) + bottom * frac_y + r) >>
                           (shift * 2);
            } else if (x_ok) {
                /* Row out of range: clip it, blend horizontally only. */
                index = src_x + av_clip(src_y, 0, max_y) * stride;
                dst[out] = ((src[index]     * (one - frac_x) +
                             src[index + 1] * frac_x) * one +
                            r) >> (shift * 2);
            } else if (y_ok) {
                /* Column out of range: clip it, blend vertically only. */
                index = av_clip(src_x, 0, max_x) + src_y * stride;
                dst[out] = ((src[index]          * (one - frac_y) +
                             src[index + stride] * frac_y) * one +
                            r) >> (shift * 2);
            } else {
                /* Both out of range: copy the clipped sample. */
                index = av_clip(src_x, 0, max_x) +
                        av_clip(src_y, 0, max_y) * stride;
                dst[out] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
544
545 #define QPEL_MC(r, OPNAME, RND, OP)                                           \
546 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src,       \
547                                             int dstStride, int srcStride,     \
548                                             int h)                            \
549 {                                                                             \
550     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
551     int i;                                                                    \
552                                                                               \
553     for (i = 0; i < h; i++) {                                                 \
554         OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
555         OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
556         OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
557         OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
558         OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
559         OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
560         OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
561         OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
562         dst += dstStride;                                                     \
563         src += srcStride;                                                     \
564     }                                                                         \
565 }                                                                             \
566                                                                               \
567 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src,       \
568                                             int dstStride, int srcStride)     \
569 {                                                                             \
570     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
571     const int w = 8;                                                          \
572     int i;                                                                    \
573                                                                               \
574     for (i = 0; i < w; i++) {                                                 \
575         const int src0 = src[0 * srcStride];                                  \
576         const int src1 = src[1 * srcStride];                                  \
577         const int src2 = src[2 * srcStride];                                  \
578         const int src3 = src[3 * srcStride];                                  \
579         const int src4 = src[4 * srcStride];                                  \
580         const int src5 = src[5 * srcStride];                                  \
581         const int src6 = src[6 * srcStride];                                  \
582         const int src7 = src[7 * srcStride];                                  \
583         const int src8 = src[8 * srcStride];                                  \
584         OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
585         OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
586         OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
587         OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
588         OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
589         OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
590         OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
591         OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
592         dst++;                                                                \
593         src++;                                                                \
594     }                                                                         \
595 }                                                                             \
596                                                                               \
597 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src,      \
598                                              int dstStride, int srcStride,    \
599                                              int h)                           \
600 {                                                                             \
601     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
602     int i;                                                                    \
603                                                                               \
604     for (i = 0; i < h; i++) {                                                 \
605         OP(dst[0],  (src[0]  + src[1])  * 20 - (src[0]  + src[2])  * 6 + (src[1]  + src[3])  * 3 - (src[2]  + src[4]));  \
606         OP(dst[1],  (src[1]  + src[2])  * 20 - (src[0]  + src[3])  * 6 + (src[0]  + src[4])  * 3 - (src[1]  + src[5]));  \
607         OP(dst[2],  (src[2]  + src[3])  * 20 - (src[1]  + src[4])  * 6 + (src[0]  + src[5])  * 3 - (src[0]  + src[6]));  \
608         OP(dst[3],  (src[3]  + src[4])  * 20 - (src[2]  + src[5])  * 6 + (src[1]  + src[6])  * 3 - (src[0]  + src[7]));  \
609         OP(dst[4],  (src[4]  + src[5])  * 20 - (src[3]  + src[6])  * 6 + (src[2]  + src[7])  * 3 - (src[1]  + src[8]));  \
610         OP(dst[5],  (src[5]  + src[6])  * 20 - (src[4]  + src[7])  * 6 + (src[3]  + src[8])  * 3 - (src[2]  + src[9]));  \
611         OP(dst[6],  (src[6]  + src[7])  * 20 - (src[5]  + src[8])  * 6 + (src[4]  + src[9])  * 3 - (src[3]  + src[10])); \
612         OP(dst[7],  (src[7]  + src[8])  * 20 - (src[6]  + src[9])  * 6 + (src[5]  + src[10]) * 3 - (src[4]  + src[11])); \
613         OP(dst[8],  (src[8]  + src[9])  * 20 - (src[7]  + src[10]) * 6 + (src[6]  + src[11]) * 3 - (src[5]  + src[12])); \
614         OP(dst[9],  (src[9]  + src[10]) * 20 - (src[8]  + src[11]) * 6 + (src[7]  + src[12]) * 3 - (src[6]  + src[13])); \
615         OP(dst[10], (src[10] + src[11]) * 20 - (src[9]  + src[12]) * 6 + (src[8]  + src[13]) * 3 - (src[7]  + src[14])); \
616         OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9]  + src[14]) * 3 - (src[8]  + src[15])); \
617         OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9]  + src[16])); \
618         OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
619         OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
620         OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
621         dst += dstStride;                                                     \
622         src += srcStride;                                                     \
623     }                                                                         \
624 }                                                                             \
625                                                                               \
626 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src,      \
627                                              int dstStride, int srcStride)    \
628 {                                                                             \
629     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
630     const int w = 16;                                                         \
631     int i;                                                                    \
632                                                                               \
633     for (i = 0; i < w; i++) {                                                 \
634         const int src0  = src[0  * srcStride];                                \
635         const int src1  = src[1  * srcStride];                                \
636         const int src2  = src[2  * srcStride];                                \
637         const int src3  = src[3  * srcStride];                                \
638         const int src4  = src[4  * srcStride];                                \
639         const int src5  = src[5  * srcStride];                                \
640         const int src6  = src[6  * srcStride];                                \
641         const int src7  = src[7  * srcStride];                                \
642         const int src8  = src[8  * srcStride];                                \
643         const int src9  = src[9  * srcStride];                                \
644         const int src10 = src[10 * srcStride];                                \
645         const int src11 = src[11 * srcStride];                                \
646         const int src12 = src[12 * srcStride];                                \
647         const int src13 = src[13 * srcStride];                                \
648         const int src14 = src[14 * srcStride];                                \
649         const int src15 = src[15 * srcStride];                                \
650         const int src16 = src[16 * srcStride];                                \
651         OP(dst[0  * dstStride], (src0  + src1)  * 20 - (src0  + src2)  * 6 + (src1  + src3)  * 3 - (src2  + src4));  \
652         OP(dst[1  * dstStride], (src1  + src2)  * 20 - (src0  + src3)  * 6 + (src0  + src4)  * 3 - (src1  + src5));  \
653         OP(dst[2  * dstStride], (src2  + src3)  * 20 - (src1  + src4)  * 6 + (src0  + src5)  * 3 - (src0  + src6));  \
654         OP(dst[3  * dstStride], (src3  + src4)  * 20 - (src2  + src5)  * 6 + (src1  + src6)  * 3 - (src0  + src7));  \
655         OP(dst[4  * dstStride], (src4  + src5)  * 20 - (src3  + src6)  * 6 + (src2  + src7)  * 3 - (src1  + src8));  \
656         OP(dst[5  * dstStride], (src5  + src6)  * 20 - (src4  + src7)  * 6 + (src3  + src8)  * 3 - (src2  + src9));  \
657         OP(dst[6  * dstStride], (src6  + src7)  * 20 - (src5  + src8)  * 6 + (src4  + src9)  * 3 - (src3  + src10)); \
658         OP(dst[7  * dstStride], (src7  + src8)  * 20 - (src6  + src9)  * 6 + (src5  + src10) * 3 - (src4  + src11)); \
659         OP(dst[8  * dstStride], (src8  + src9)  * 20 - (src7  + src10) * 6 + (src6  + src11) * 3 - (src5  + src12)); \
660         OP(dst[9  * dstStride], (src9  + src10) * 20 - (src8  + src11) * 6 + (src7  + src12) * 3 - (src6  + src13)); \
661         OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9  + src12) * 6 + (src8  + src13) * 3 - (src7  + src14)); \
662         OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9  + src14) * 3 - (src8  + src15)); \
663         OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9  + src16)); \
664         OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
665         OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
666         OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
667         dst++;                                                                \
668         src++;                                                                \
669     }                                                                         \
670 }                                                                             \
671                                                                               \
672 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src,                \
673                                    ptrdiff_t stride)                          \
674 {                                                                             \
675     uint8_t half[64];                                                         \
676                                                                               \
677     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);             \
678     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);             \
679 }                                                                             \
680                                                                               \
681 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src,                \
682                                    ptrdiff_t stride)                          \
683 {                                                                             \
684     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);             \
685 }                                                                             \
686                                                                               \
687 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src,                \
688                                    ptrdiff_t stride)                          \
689 {                                                                             \
690     uint8_t half[64];                                                         \
691                                                                               \
692     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);             \
693     OPNAME ## pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8);         \
694 }                                                                             \
695                                                                               \
696 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src,                \
697                                    ptrdiff_t stride)                          \
698 {                                                                             \
699     uint8_t full[16 * 9];                                                     \
700     uint8_t half[64];                                                         \
701                                                                               \
702     copy_block9(full, src, 16, stride, 9);                                    \
703     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);                   \
704     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);                \
705 }                                                                             \
706                                                                               \
707 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src,                \
708                                    ptrdiff_t stride)                          \
709 {                                                                             \
710     uint8_t full[16 * 9];                                                     \
711                                                                               \
712     copy_block9(full, src, 16, stride, 9);                                    \
713     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);                   \
714 }                                                                             \
715                                                                               \
716 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src,                \
717                                    ptrdiff_t stride)                          \
718 {                                                                             \
719     uint8_t full[16 * 9];                                                     \
720     uint8_t half[64];                                                         \
721                                                                               \
722     copy_block9(full, src, 16, stride, 9);                                    \
723     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);                   \
724     OPNAME ## pixels8_l2_8(dst, full + 16, half, stride, 16, 8, 8);           \
725 }                                                                             \
726                                                                               \
727 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src,            \
728                                        ptrdiff_t stride)                      \
729 {                                                                             \
730     uint8_t full[16 * 9];                                                     \
731     uint8_t halfH[72];                                                        \
732     uint8_t halfV[64];                                                        \
733     uint8_t halfHV[64];                                                       \
734                                                                               \
735     copy_block9(full, src, 16, stride, 9);                                    \
736     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
737     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
738     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
739     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV,                   \
740                            stride, 16, 8, 8, 8, 8);                           \
741 }                                                                             \
742                                                                               \
743 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src,                \
744                                    ptrdiff_t stride)                          \
745 {                                                                             \
746     uint8_t full[16 * 9];                                                     \
747     uint8_t halfH[72];                                                        \
748     uint8_t halfHV[64];                                                       \
749                                                                               \
750     copy_block9(full, src, 16, stride, 9);                                    \
751     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
752     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
753     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
754     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
755 }                                                                             \
756                                                                               \
757 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src,            \
758                                        ptrdiff_t stride)                      \
759 {                                                                             \
760     uint8_t full[16 * 9];                                                     \
761     uint8_t halfH[72];                                                        \
762     uint8_t halfV[64];                                                        \
763     uint8_t halfHV[64];                                                       \
764                                                                               \
765     copy_block9(full, src, 16, stride, 9);                                    \
766     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
767     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
768     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
769     OPNAME ## pixels8_l4_8(dst, full + 1, halfH, halfV, halfHV,               \
770                            stride, 16, 8, 8, 8, 8);                           \
771 }                                                                             \
772                                                                               \
773 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src,                \
774                                    ptrdiff_t stride)                          \
775 {                                                                             \
776     uint8_t full[16 * 9];                                                     \
777     uint8_t halfH[72];                                                        \
778     uint8_t halfHV[64];                                                       \
779                                                                               \
780     copy_block9(full, src, 16, stride, 9);                                    \
781     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
782     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
783     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
784     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
785 }                                                                             \
786                                                                               \
787 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src,            \
788                                        ptrdiff_t stride)                      \
789 {                                                                             \
790     uint8_t full[16 * 9];                                                     \
791     uint8_t halfH[72];                                                        \
792     uint8_t halfV[64];                                                        \
793     uint8_t halfHV[64];                                                       \
794                                                                               \
795     copy_block9(full, src, 16, stride, 9);                                    \
796     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
797     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
798     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
799     OPNAME ## pixels8_l4_8(dst, full + 16, halfH + 8, halfV, halfHV,          \
800                            stride, 16, 8, 8, 8, 8);                           \
801 }                                                                             \
802                                                                               \
803 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src,                \
804                                    ptrdiff_t stride)                          \
805 {                                                                             \
806     uint8_t full[16 * 9];                                                     \
807     uint8_t halfH[72];                                                        \
808     uint8_t halfHV[64];                                                       \
809                                                                               \
810     copy_block9(full, src, 16, stride, 9);                                    \
811     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
812     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
813     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
814     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
815 }                                                                             \
816                                                                               \
817 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src,            \
818                                        ptrdiff_t stride)                      \
819 {                                                                             \
820     uint8_t full[16 * 9];                                                     \
821     uint8_t halfH[72];                                                        \
822     uint8_t halfV[64];                                                        \
823     uint8_t halfHV[64];                                                       \
824                                                                               \
825     copy_block9(full, src, 16, stride, 9);                                    \
826     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
827     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
828     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
829     OPNAME ## pixels8_l4_8(dst, full + 17, halfH + 8, halfV, halfHV,          \
830                            stride, 16, 8, 8, 8, 8);                           \
831 }                                                                             \
832                                                                               \
833 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src,                \
834                                    ptrdiff_t stride)                          \
835 {                                                                             \
836     uint8_t full[16 * 9];                                                     \
837     uint8_t halfH[72];                                                        \
838     uint8_t halfHV[64];                                                       \
839                                                                               \
840     copy_block9(full, src, 16, stride, 9);                                    \
841     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
842     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
843     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
844     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
845 }                                                                             \
846                                                                               \
847 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src,                \
848                                    ptrdiff_t stride)                          \
849 {                                                                             \
850     uint8_t halfH[72];                                                        \
851     uint8_t halfHV[64];                                                       \
852                                                                               \
853     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
854     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
855     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
856 }                                                                             \
857                                                                               \
858 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src,                \
859                                    ptrdiff_t stride)                          \
860 {                                                                             \
861     uint8_t halfH[72];                                                        \
862     uint8_t halfHV[64];                                                       \
863                                                                               \
864     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
865     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
866     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
867 }                                                                             \
868                                                                               \
869 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src,            \
870                                        ptrdiff_t stride)                      \
871 {                                                                             \
872     uint8_t full[16 * 9];                                                     \
873     uint8_t halfH[72];                                                        \
874     uint8_t halfV[64];                                                        \
875     uint8_t halfHV[64];                                                       \
876                                                                               \
877     copy_block9(full, src, 16, stride, 9);                                    \
878     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
879     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
880     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
881     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);              \
882 }                                                                             \
883                                                                               \
884 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src,                \
885                                    ptrdiff_t stride)                          \
886 {                                                                             \
887     uint8_t full[16 * 9];                                                     \
888     uint8_t halfH[72];                                                        \
889                                                                               \
890     copy_block9(full, src, 16, stride, 9);                                    \
891     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
892     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
893     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
894 }                                                                             \
895                                                                               \
896 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src,            \
897                                        ptrdiff_t stride)                      \
898 {                                                                             \
899     uint8_t full[16 * 9];                                                     \
900     uint8_t halfH[72];                                                        \
901     uint8_t halfV[64];                                                        \
902     uint8_t halfHV[64];                                                       \
903                                                                               \
904     copy_block9(full, src, 16, stride, 9);                                    \
905     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
906     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
907     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
908     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);              \
909 }                                                                             \
910                                                                               \
911 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src,                \
912                                    ptrdiff_t stride)                          \
913 {                                                                             \
914     uint8_t full[16 * 9];                                                     \
915     uint8_t halfH[72];                                                        \
916                                                                               \
917     copy_block9(full, src, 16, stride, 9);                                    \
918     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
919     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
920     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
921 }                                                                             \
922                                                                               \
923 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src,                \
924                                    ptrdiff_t stride)                          \
925 {                                                                             \
926     uint8_t halfH[72];                                                        \
927                                                                               \
928     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
929     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
930 }                                                                             \
931                                                                               \
932 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src,               \
933                                     ptrdiff_t stride)                         \
934 {                                                                             \
935     uint8_t half[256];                                                        \
936                                                                               \
937     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
938     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);          \
939 }                                                                             \
940                                                                               \
941 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src,               \
942                                     ptrdiff_t stride)                         \
943 {                                                                             \
944     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);           \
945 }                                                                             \
946                                                                               \
947 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src,               \
948                                     ptrdiff_t stride)                         \
949 {                                                                             \
950     uint8_t half[256];                                                        \
951                                                                               \
952     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
953     OPNAME ## pixels16_l2_8(dst, src + 1, half, stride, stride, 16, 16);      \
954 }                                                                             \
955                                                                               \
956 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src,               \
957                                     ptrdiff_t stride)                         \
958 {                                                                             \
959     uint8_t full[24 * 17];                                                    \
960     uint8_t half[256];                                                        \
961                                                                               \
962     copy_block17(full, src, 24, stride, 17);                                  \
963     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
964     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);             \
965 }                                                                             \
966                                                                               \
967 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src,               \
968                                     ptrdiff_t stride)                         \
969 {                                                                             \
970     uint8_t full[24 * 17];                                                    \
971                                                                               \
972     copy_block17(full, src, 24, stride, 17);                                  \
973     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);                  \
974 }                                                                             \
975                                                                               \
976 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src,               \
977                                     ptrdiff_t stride)                         \
978 {                                                                             \
979     uint8_t full[24 * 17];                                                    \
980     uint8_t half[256];                                                        \
981                                                                               \
982     copy_block17(full, src, 24, stride, 17);                                  \
983     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
984     OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16);        \
985 }                                                                             \
986                                                                               \
987 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src,           \
988                                         ptrdiff_t stride)                     \
989 {                                                                             \
990     uint8_t full[24 * 17];                                                    \
991     uint8_t halfH[272];                                                       \
992     uint8_t halfV[256];                                                       \
993     uint8_t halfHV[256];                                                      \
994                                                                               \
995     copy_block17(full, src, 24, stride, 17);                                  \
996     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
997     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
998     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
999     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV,                  \
1000                             stride, 24, 16, 16, 16, 16);                      \
1001 }                                                                             \
1002                                                                               \
1003 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src,               \
1004                                     ptrdiff_t stride)                         \
1005 {                                                                             \
1006     uint8_t full[24 * 17];                                                    \
1007     uint8_t halfH[272];                                                       \
1008     uint8_t halfHV[256];                                                      \
1009                                                                               \
1010     copy_block17(full, src, 24, stride, 17);                                  \
1011     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1012     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1013     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1014     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1015 }                                                                             \
1016                                                                               \
1017 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src,           \
1018                                         ptrdiff_t stride)                     \
1019 {                                                                             \
1020     uint8_t full[24 * 17];                                                    \
1021     uint8_t halfH[272];                                                       \
1022     uint8_t halfV[256];                                                       \
1023     uint8_t halfHV[256];                                                      \
1024                                                                               \
1025     copy_block17(full, src, 24, stride, 17);                                  \
1026     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1027     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1028     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1029     OPNAME ## pixels16_l4_8(dst, full + 1, halfH, halfV, halfHV,              \
1030                             stride, 24, 16, 16, 16, 16);                      \
1031 }                                                                             \
1032                                                                               \
1033 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src,               \
1034                                     ptrdiff_t stride)                         \
1035 {                                                                             \
1036     uint8_t full[24 * 17];                                                    \
1037     uint8_t halfH[272];                                                       \
1038     uint8_t halfHV[256];                                                      \
1039                                                                               \
1040     copy_block17(full, src, 24, stride, 17);                                  \
1041     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1042     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1043     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1044     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1045 }                                                                             \
1046                                                                               \
1047 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src,           \
1048                                         ptrdiff_t stride)                     \
1049 {                                                                             \
1050     uint8_t full[24 * 17];                                                    \
1051     uint8_t halfH[272];                                                       \
1052     uint8_t halfV[256];                                                       \
1053     uint8_t halfHV[256];                                                      \
1054                                                                               \
1055     copy_block17(full, src, 24, stride, 17);                                  \
1056     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1057     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1058     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1059     OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV,        \
1060                             stride, 24, 16, 16, 16, 16);                      \
1061 }                                                                             \
1062                                                                               \
1063 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src,               \
1064                                     ptrdiff_t stride)                         \
1065 {                                                                             \
1066     uint8_t full[24 * 17];                                                    \
1067     uint8_t halfH[272];                                                       \
1068     uint8_t halfHV[256];                                                      \
1069                                                                               \
1070     copy_block17(full, src, 24, stride, 17);                                  \
1071     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1072     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1073     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1074     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1075 }                                                                             \
1076                                                                               \
1077 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src,           \
1078                                         ptrdiff_t stride)                     \
1079 {                                                                             \
1080     uint8_t full[24 * 17];                                                    \
1081     uint8_t halfH[272];                                                       \
1082     uint8_t halfV[256];                                                       \
1083     uint8_t halfHV[256];                                                      \
1084                                                                               \
1085     copy_block17(full, src, 24, stride, 17);                                  \
1086     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1087     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1088     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1089     OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV,        \
1090                             stride, 24, 16, 16, 16, 16);                      \
1091 }                                                                             \
1092                                                                               \
1093 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src,               \
1094                                     ptrdiff_t stride)                         \
1095 {                                                                             \
1096     uint8_t full[24 * 17];                                                    \
1097     uint8_t halfH[272];                                                       \
1098     uint8_t halfHV[256];                                                      \
1099                                                                               \
1100     copy_block17(full, src, 24, stride, 17);                                  \
1101     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1102     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1103     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1104     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1105 }                                                                             \
1106                                                                               \
1107 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src,               \
1108                                     ptrdiff_t stride)                         \
1109 {                                                                             \
1110     uint8_t halfH[272];                                                       \
1111     uint8_t halfHV[256];                                                      \
1112                                                                               \
1113     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1114     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1115     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1116 }                                                                             \
1117                                                                               \
1118 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src,               \
1119                                     ptrdiff_t stride)                         \
1120 {                                                                             \
1121     uint8_t halfH[272];                                                       \
1122     uint8_t halfHV[256];                                                      \
1123                                                                               \
1124     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1125     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1126     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1127 }                                                                             \
1128                                                                               \
1129 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src,           \
1130                                         ptrdiff_t stride)                     \
1131 {                                                                             \
1132     uint8_t full[24 * 17];                                                    \
1133     uint8_t halfH[272];                                                       \
1134     uint8_t halfV[256];                                                       \
1135     uint8_t halfHV[256];                                                      \
1136                                                                               \
1137     copy_block17(full, src, 24, stride, 17);                                  \
1138     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1139     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1140     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1141     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1142 }                                                                             \
1143                                                                               \
1144 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src,               \
1145                                     ptrdiff_t stride)                         \
1146 {                                                                             \
1147     uint8_t full[24 * 17];                                                    \
1148     uint8_t halfH[272];                                                       \
1149                                                                               \
1150     copy_block17(full, src, 24, stride, 17);                                  \
1151     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1152     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1153     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1154 }                                                                             \
1155                                                                               \
1156 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src,           \
1157                                         ptrdiff_t stride)                     \
1158 {                                                                             \
1159     uint8_t full[24 * 17];                                                    \
1160     uint8_t halfH[272];                                                       \
1161     uint8_t halfV[256];                                                       \
1162     uint8_t halfHV[256];                                                      \
1163                                                                               \
1164     copy_block17(full, src, 24, stride, 17);                                  \
1165     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1166     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1167     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1168     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1169 }                                                                             \
1170                                                                               \
1171 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src,               \
1172                                     ptrdiff_t stride)                         \
1173 {                                                                             \
1174     uint8_t full[24 * 17];                                                    \
1175     uint8_t halfH[272];                                                       \
1176                                                                               \
1177     copy_block17(full, src, 24, stride, 17);                                  \
1178     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1179     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1180     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1181 }                                                                             \
1182                                                                               \
1183 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src,               \
1184                                     ptrdiff_t stride)                         \
1185 {                                                                             \
1186     uint8_t halfH[272];                                                       \
1187                                                                               \
1188     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1189     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1190 }
1191
1192 #define op_avg(a, b)        a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
1193 #define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5])     >> 1)
1194 #define op_put(a, b)        a = cm[((b) + 16) >> 5]
1195 #define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
1196
1197 QPEL_MC(0, put_, _, op_put)
1198 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1199 QPEL_MC(0, avg_, _, op_avg)
1200
1201 #undef op_avg
1202 #undef op_put
1203 #undef op_put_no_rnd
1204
/**
 * Fixed-size 8x8 front end for put_pixels8_8_c().
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride, forwarded unchanged to the helper
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    /* The block height is fixed at 8 rows. */
    put_pixels8_8_c(dst, src, stride, 8);
}
1209
/**
 * Fixed-size 8x8 front end for avg_pixels8_8_c().
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride, forwarded unchanged to the helper
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    /* The block height is fixed at 8 rows. */
    avg_pixels8_8_c(dst, src, stride, 8);
}
1214
/**
 * Fixed-size 16x16 front end for put_pixels16_8_c().
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride, forwarded unchanged to the helper
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    /* The block height is fixed at 16 rows. */
    put_pixels16_8_c(dst, src, stride, 16);
}
1219
/**
 * Fixed-size 16x16 front end for avg_pixels16_8_c().
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride, forwarded unchanged to the helper
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    /* The block height is fixed at 16 rows. */
    avg_pixels16_8_c(dst, src, stride, 16);
}
1224
/*
 * The mc00 qpel entries are plain aliases of the fixed-size pixel helpers.
 * Note that the put_no_rnd_ names resolve to the very same put helpers as
 * the regular put_ names.
 */

/* 8x8 blocks */
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c

/* 16x16 blocks */
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
1231
1232 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
1233                                   int dstStride, int srcStride, int h)
1234 {
1235     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1236     int i;
1237
1238     for (i = 0; i < h; i++) {
1239         dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
1240         dst[1] = cm[(9 * (src[1] + src[2]) - (src[0]  + src[3]) + 8) >> 4];
1241         dst[2] = cm[(9 * (src[2] + src[3]) - (src[1]  + src[4]) + 8) >> 4];
1242         dst[3] = cm[(9 * (src[3] + src[4]) - (src[2]  + src[5]) + 8) >> 4];
1243         dst[4] = cm[(9 * (src[4] + src[5]) - (src[3]  + src[6]) + 8) >> 4];
1244         dst[5] = cm[(9 * (src[5] + src[6]) - (src[4]  + src[7]) + 8) >> 4];
1245         dst[6] = cm[(9 * (src[6] + src[7]) - (src[5]  + src[8]) + 8) >> 4];
1246         dst[7] = cm[(9 * (src[7] + src[8]) - (src[6]  + src[9]) + 8) >> 4];
1247         dst   += dstStride;
1248         src   += srcStride;
1249     }
1250 }
1251
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points. Each one forwards to the matching *_xy2_8_c
 * pixel helper, passing the block size as the row count.
 */

/** 16x16, put flavour. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

/** 16x16, avg flavour. */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

/** 8x8, put flavour. */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

/** 8x8, avg flavour. */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1273
1274 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
1275                                   int dstStride, int srcStride, int w)
1276 {
1277     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1278     int i;
1279
1280     for (i = 0; i < w; i++) {
1281         const int src_1 = src[-srcStride];
1282         const int src0  = src[0];
1283         const int src1  = src[srcStride];
1284         const int src2  = src[2 * srcStride];
1285         const int src3  = src[3 * srcStride];
1286         const int src4  = src[4 * srcStride];
1287         const int src5  = src[5 * srcStride];
1288         const int src6  = src[6 * srcStride];
1289         const int src7  = src[7 * srcStride];
1290         const int src8  = src[8 * srcStride];
1291         const int src9  = src[9 * srcStride];
1292         dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
1293         dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0  + src3) + 8) >> 4];
1294         dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1  + src4) + 8) >> 4];
1295         dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2  + src5) + 8) >> 4];
1296         dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3  + src6) + 8) >> 4];
1297         dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4  + src7) + 8) >> 4];
1298         dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5  + src8) + 8) >> 4];
1299         dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6  + src9) + 8) >> 4];
1300         src++;
1301         dst++;
1302     }
1303 }
1304
/**
 * mspel mc10: horizontally filter the 8x8 block into a scratch buffer,
 * then hand src and that buffer to put_pixels8_l2_8() to produce dst
 * (presumably an average of the two inputs -- helper defined elsewhere).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8]; /* tightly packed, 8 bytes per row */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1312
/**
 * mspel mc20: the horizontal filter output is written directly to dst;
 * source and destination share the same stride and 8 rows are produced.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1317
/**
 * mspel mc30: like mc10, except that the unfiltered input passed to
 * put_pixels8_l2_8() is the block one pixel to the right (src + 1).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8]; /* tightly packed, 8 bytes per row */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1325
/**
 * mspel mc02: the vertical filter output is written directly to dst;
 * source and destination share the same stride and 8 columns are produced.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1330
/**
 * mspel mc12: combine a vertically filtered block with a block that was
 * filtered horizontally and then vertically.
 *
 * The horizontal pass covers 11 rows starting one row above src, because
 * the vertical filter applied afterwards needs one extra row above and two
 * below the 8 output rows.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11]; /* horizontal pass, source rows -1 .. 9 */
    uint8_t v_blk[8 * 8];   /* vertical pass on the raw source      */
    uint8_t hv_blk[8 * 8];  /* vertical pass on top of h_rows       */

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_blk, src, 8, stride, 8);
    /* h_rows + 8 skips the extra top row, i.e. points at source row 0. */
    wmv2_mspel8_v_lowpass(hv_blk, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_blk, hv_blk, stride, 8, 8, 8);
}
1342
/**
 * mspel mc32: same construction as mc12, except that the purely vertical
 * pass runs on the block one pixel to the right (src + 1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11]; /* horizontal pass, source rows -1 .. 9 */
    uint8_t v_blk[8 * 8];   /* vertical pass on src + 1             */
    uint8_t hv_blk[8 * 8];  /* vertical pass on top of h_rows       */

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_blk, src + 1, 8, stride, 8);
    /* h_rows + 8 skips the extra top row, i.e. points at source row 0. */
    wmv2_mspel8_v_lowpass(hv_blk, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_blk, hv_blk, stride, 8, 8, 8);
}
1354
/**
 * mspel mc22: horizontal pass into a scratch buffer followed by a vertical
 * pass that writes straight into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11]; /* horizontal pass, source rows -1 .. 9 */

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    /* Start at scratch row 1 (source row 0); scratch rows are 8 bytes. */
    wmv2_mspel8_v_lowpass(dst, h_rows + 8, stride, 8, 8);
}
1362
1363 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1364                               int line_size, int h)
1365 {
1366     int s = 0, i;
1367
1368     for (i = 0; i < h; i++) {
1369         s    += abs(pix1[0]  - pix2[0]);
1370         s    += abs(pix1[1]  - pix2[1]);
1371         s    += abs(pix1[2]  - pix2[2]);
1372         s    += abs(pix1[3]  - pix2[3]);
1373         s    += abs(pix1[4]  - pix2[4]);
1374         s    += abs(pix1[5]  - pix2[5]);
1375         s    += abs(pix1[6]  - pix2[6]);
1376         s    += abs(pix1[7]  - pix2[7]);
1377         s    += abs(pix1[8]  - pix2[8]);
1378         s    += abs(pix1[9]  - pix2[9]);
1379         s    += abs(pix1[10] - pix2[10]);
1380         s    += abs(pix1[11] - pix2[11]);
1381         s    += abs(pix1[12] - pix2[12]);
1382         s    += abs(pix1[13] - pix2[13]);
1383         s    += abs(pix1[14] - pix2[14]);
1384         s    += abs(pix1[15] - pix2[15]);
1385         pix1 += line_size;
1386         pix2 += line_size;
1387     }
1388     return s;
1389 }
1390
1391 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1392                           int line_size, int h)
1393 {
1394     int s = 0, i;
1395
1396     for (i = 0; i < h; i++) {
1397         s    += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
1398         s    += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
1399         s    += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
1400         s    += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
1401         s    += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
1402         s    += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
1403         s    += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
1404         s    += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
1405         s    += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
1406         s    += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
1407         s    += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1408         s    += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1409         s    += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1410         s    += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1411         s    += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1412         s    += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1413         pix1 += line_size;
1414         pix2 += line_size;
1415     }
1416     return s;
1417 }
1418
1419 static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1420                           int line_size, int h)
1421 {
1422     int s = 0, i;
1423     uint8_t *pix3 = pix2 + line_size;
1424
1425     for (i = 0; i < h; i++) {
1426         s    += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
1427         s    += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
1428         s    += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
1429         s    += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
1430         s    += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
1431         s    += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
1432         s    += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
1433         s    += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
1434         s    += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
1435         s    += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
1436         s    += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1437         s    += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1438         s    += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1439         s    += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1440         s    += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1441         s    += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1442         pix1 += line_size;
1443         pix2 += line_size;
1444         pix3 += line_size;
1445     }
1446     return s;
1447 }
1448
1449 static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1450                            int line_size, int h)
1451 {
1452     int s = 0, i;
1453     uint8_t *pix3 = pix2 + line_size;
1454
1455     for (i = 0; i < h; i++) {
1456         s    += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
1457         s    += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
1458         s    += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
1459         s    += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
1460         s    += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
1461         s    += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
1462         s    += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
1463         s    += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
1464         s    += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
1465         s    += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
1466         s    += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1467         s    += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1468         s    += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1469         s    += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1470         s    += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1471         s    += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1472         pix1 += line_size;
1473         pix2 += line_size;
1474         pix3 += line_size;
1475     }
1476     return s;
1477 }
1478
1479 static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1480                              int line_size, int h)
1481 {
1482     int s = 0, i;
1483
1484     for (i = 0; i < h; i++) {
1485         s    += abs(pix1[0] - pix2[0]);
1486         s    += abs(pix1[1] - pix2[1]);
1487         s    += abs(pix1[2] - pix2[2]);
1488         s    += abs(pix1[3] - pix2[3]);
1489         s    += abs(pix1[4] - pix2[4]);
1490         s    += abs(pix1[5] - pix2[5]);
1491         s    += abs(pix1[6] - pix2[6]);
1492         s    += abs(pix1[7] - pix2[7]);
1493         pix1 += line_size;
1494         pix2 += line_size;
1495     }
1496     return s;
1497 }
1498
1499 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1500                          int line_size, int h)
1501 {
1502     int s = 0, i;
1503
1504     for (i = 0; i < h; i++) {
1505         s    += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1506         s    += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1507         s    += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1508         s    += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1509         s    += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1510         s    += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1511         s    += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1512         s    += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1513         pix1 += line_size;
1514         pix2 += line_size;
1515     }
1516     return s;
1517 }
1518
1519 static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1520                          int line_size, int h)
1521 {
1522     int s = 0, i;
1523     uint8_t *pix3 = pix2 + line_size;
1524
1525     for (i = 0; i < h; i++) {
1526         s    += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1527         s    += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1528         s    += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1529         s    += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1530         s    += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1531         s    += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1532         s    += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1533         s    += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1534         pix1 += line_size;
1535         pix2 += line_size;
1536         pix3 += line_size;
1537     }
1538     return s;
1539 }
1540
1541 static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1542                           int line_size, int h)
1543 {
1544     int s = 0, i;
1545     uint8_t *pix3 = pix2 + line_size;
1546
1547     for (i = 0; i < h; i++) {
1548         s    += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1549         s    += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1550         s    += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1551         s    += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1552         s    += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1553         s    += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1554         s    += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1555         s    += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1556         pix1 += line_size;
1557         pix2 += line_size;
1558         pix3 += line_size;
1559     }
1560     return s;
1561 }
1562
1563 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1564 {
1565     int score1 = 0, score2 = 0, x, y;
1566
1567     for (y = 0; y < h; y++) {
1568         for (x = 0; x < 16; x++)
1569             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1570         if (y + 1 < h) {
1571             for (x = 0; x < 15; x++)
1572                 score2 += FFABS(s1[x]     - s1[x + stride] -
1573                                 s1[x + 1] + s1[x + stride + 1]) -
1574                           FFABS(s2[x]     - s2[x + stride] -
1575                                 s2[x + 1] + s2[x + stride + 1]);
1576         }
1577         s1 += stride;
1578         s2 += stride;
1579     }
1580
1581     if (c)
1582         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1583     else
1584         return score1 + FFABS(score2) * 8;
1585 }
1586
1587 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1588 {
1589     int score1 = 0, score2 = 0, x, y;
1590
1591     for (y = 0; y < h; y++) {
1592         for (x = 0; x < 8; x++)
1593             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1594         if (y + 1 < h) {
1595             for (x = 0; x < 7; x++)
1596                 score2 += FFABS(s1[x]     - s1[x + stride] -
1597                                 s1[x + 1] + s1[x + stride + 1]) -
1598                           FFABS(s2[x]     - s2[x + stride] -
1599                                 s2[x + 1] + s2[x + stride + 1]);
1600         }
1601         s1 += stride;
1602         s2 += stride;
1603     }
1604
1605     if (c)
1606         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1607     else
1608         return score1 + FFABS(score2) * 8;
1609 }
1610
1611 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
1612                           int16_t basis[64], int scale)
1613 {
1614     int i;
1615     unsigned int sum = 0;
1616
1617     for (i = 0; i < 8 * 8; i++) {
1618         int b = rem[i] + ((basis[i] * scale +
1619                            (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1620                           (BASIS_SHIFT - RECON_SHIFT));
1621         int w = weight[i];
1622         b >>= RECON_SHIFT;
1623         assert(-512 < b && b < 512);
1624
1625         sum += (w * b) * (w * b) >> 4;
1626     }
1627     return sum >> 2;
1628 }
1629
1630 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
1631 {
1632     int i;
1633
1634     for (i = 0; i < 8 * 8; i++)
1635         rem[i] += (basis[i] * scale +
1636                    (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1637                   (BASIS_SHIFT - RECON_SHIFT);
1638 }
1639
1640 static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
1641                     int stride, int h)
1642 {
1643     return 0;
1644 }
1645
1646 void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
1647 {
1648     int i;
1649
1650     memset(cmp, 0, sizeof(void *) * 6);
1651
1652     for (i = 0; i < 6; i++) {
1653         switch (type & 0xFF) {
1654         case FF_CMP_SAD:
1655             cmp[i] = c->sad[i];
1656             break;
1657         case FF_CMP_SATD:
1658             cmp[i] = c->hadamard8_diff[i];
1659             break;
1660         case FF_CMP_SSE:
1661             cmp[i] = c->sse[i];
1662             break;
1663         case FF_CMP_DCT:
1664             cmp[i] = c->dct_sad[i];
1665             break;
1666         case FF_CMP_DCT264:
1667             cmp[i] = c->dct264_sad[i];
1668             break;
1669         case FF_CMP_DCTMAX:
1670             cmp[i] = c->dct_max[i];
1671             break;
1672         case FF_CMP_PSNR:
1673             cmp[i] = c->quant_psnr[i];
1674             break;
1675         case FF_CMP_BIT:
1676             cmp[i] = c->bit[i];
1677             break;
1678         case FF_CMP_RD:
1679             cmp[i] = c->rd[i];
1680             break;
1681         case FF_CMP_VSAD:
1682             cmp[i] = c->vsad[i];
1683             break;
1684         case FF_CMP_VSSE:
1685             cmp[i] = c->vsse[i];
1686             break;
1687         case FF_CMP_ZERO:
1688             cmp[i] = zero_cmp;
1689             break;
1690         case FF_CMP_NSSE:
1691             cmp[i] = c->nsse[i];
1692             break;
1693         default:
1694             av_log(NULL, AV_LOG_ERROR,
1695                    "internal error in cmp function selection\n");
1696         }
1697     }
1698 }
1699
1700 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
1701 {
1702     long i;
1703
1704     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1705         long a = *(long *) (src + i);
1706         long b = *(long *) (dst + i);
1707         *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
1708     }
1709     for (; i < w; i++)
1710         dst[i + 0] += src[i + 0];
1711 }
1712
1713 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
1714 {
1715     long i;
1716
1717 #if !HAVE_FAST_UNALIGNED
1718     if ((long) src2 & (sizeof(long) - 1)) {
1719         for (i = 0; i + 7 < w; i += 8) {
1720             dst[i + 0] = src1[i + 0] - src2[i + 0];
1721             dst[i + 1] = src1[i + 1] - src2[i + 1];
1722             dst[i + 2] = src1[i + 2] - src2[i + 2];
1723             dst[i + 3] = src1[i + 3] - src2[i + 3];
1724             dst[i + 4] = src1[i + 4] - src2[i + 4];
1725             dst[i + 5] = src1[i + 5] - src2[i + 5];
1726             dst[i + 6] = src1[i + 6] - src2[i + 6];
1727             dst[i + 7] = src1[i + 7] - src2[i + 7];
1728         }
1729     } else
1730 #endif
1731     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1732         long a = *(long *) (src1 + i);
1733         long b = *(long *) (src2 + i);
1734         *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
1735                               ((a ^ b ^ pb_80) & pb_80);
1736     }
1737     for (; i < w; i++)
1738         dst[i + 0] = src1[i + 0] - src2[i + 0];
1739 }
1740
/**
 * Reconstruct a row from median-predicted residuals.
 *
 * For every position the predictor is mid_pred() of the running left value,
 * the value from src1 at the same position and their gradient
 * (left + top - top-left, reduced to 8 bits); the residual from diff is
 * added on top and the 8-bit result is stored in dst.
 *
 * @param dst      reconstructed output row
 * @param src1     row providing the "top" samples
 * @param diff     residuals to add to the prediction
 * @param w        number of samples
 * @param left     in: initial left value, out: last reconstructed value
 * @param left_top in: initial top-left value, out: last value read from src1
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *diff, int w,
                                         int *left, int *left_top)
{
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top  = src1[x];
        const int gradient = (cur + top - top_left) & 0xFF;

        /* The sum wraps to 8 bits when stored back into cur. */
        cur      = mid_pred(cur, top, gradient) + diff[x];
        top_left = top;
        dst[x]   = cur;
    }

    *left     = cur;
    *left_top = top_left;
}
1760
/**
 * Compute median-prediction residuals for a row.
 *
 * The predictor for each position is mid_pred() of the previous sample of
 * src2 (left), the sample of src1 at the same position (top) and their
 * gradient (left + top - top-left, reduced to 8 bits). The residual
 * src2[i] - predictor is stored, truncated to 8 bits, in dst.
 *
 * @param dst      output residuals
 * @param src1     row providing the "top" samples
 * @param src2     row being predicted
 * @param w        number of samples
 * @param left     in: initial left value, out: last sample read from src2
 * @param left_top in: initial top-left value, out: last sample read from src1
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *src2, int w,
                                         int *left, int *left_top)
{
    uint8_t prev     = *left;
    uint8_t top_left = *left_top;
    int x;

    for (x = 0; x < w; x++) {
        const uint8_t top   = src1[x];
        const int gradient  = (prev + top - top_left) & 0xFF;
        const int predicted = mid_pred(prev, top, gradient);

        top_left = top;
        prev     = src2[x];
        dst[x]   = prev - predicted;
    }

    *left     = prev;
    *left_top = top_left;
}
1781
/**
 * Undo left prediction: write the running sum of src into dst.
 *
 * Each output byte is the low 8 bits of the accumulator after the matching
 * input byte has been added to it.
 *
 * @param dst output row
 * @param src residuals
 * @param w   number of samples
 * @param acc initial accumulator value
 * @return the accumulator after the last sample (not reduced to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
                                      int w, int acc)
{
    int pos = 0;

    while (pos < w) {
        acc     += src[pos];
        dst[pos] = acc;
        pos++;
    }

    return acc;
}
1802
/* Byte offsets of the four components inside one 32-bit pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Undo left prediction on packed 4-byte pixels, one running sum per
 * component.
 *
 * @param dst   output pixels, 4 bytes each
 * @param src   residual pixels, 4 bytes each
 * @param w     number of pixels
 * @param red   in: initial red accumulator, out: accumulator after the row
 * @param green same for green
 * @param blue  same for blue
 * @param alpha same for alpha
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
                                             int w, int *red, int *green,
                                             int *blue, int *alpha)
{
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4 * px;
        uint8_t       *out = dst + 4 * px;

        /* Read the whole pixel before writing it. */
        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        /* Each store keeps only the low 8 bits of its accumulator. */
        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1841
/*
 * Butterfly helpers for the Hadamard-style transforms below.
 *
 * BUTTERFLY2 and BUTTERFLY1 expand to a single statement
 * (do { } while (0)), so they stay correct inside an unbraced if/else.
 */

/* o1 = i1 + i2, o2 = i1 - i2; o1 is stored before o2 is computed. */
#define BUTTERFLY2(o1, o2, i1, i2)              \
    do {                                        \
        (o1) = (i1) + (i2);                     \
        (o2) = (i1) - (i2);                     \
    } while (0)

/* In-place butterfly: (x, y) becomes (x + y, x - y). The temporaries use
 * names that are unlikely to collide with the caller's own variables. */
#define BUTTERFLY1(x, y)                        \
    do {                                        \
        const int bfly_x_ = (x);                \
        const int bfly_y_ = (y);                \
        (x) = bfly_x_ + bfly_y_;                \
        (y) = bfly_x_ - bfly_y_;                \
    } while (0)

/* |x + y| + |x - y|; both arguments are evaluated more than once. */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
1856
1857 static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
1858                                uint8_t *src, int stride, int h)
1859 {
1860     int i, temp[64], sum = 0;
1861
1862     assert(h == 8);
1863
1864     for (i = 0; i < 8; i++) {
1865         // FIXME: try pointer walks
1866         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1867                    src[stride * i + 0] - dst[stride * i + 0],
1868                    src[stride * i + 1] - dst[stride * i + 1]);
1869         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1870                    src[stride * i + 2] - dst[stride * i + 2],
1871                    src[stride * i + 3] - dst[stride * i + 3]);
1872         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1873                    src[stride * i + 4] - dst[stride * i + 4],
1874                    src[stride * i + 5] - dst[stride * i + 5]);
1875         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1876                    src[stride * i + 6] - dst[stride * i + 6],
1877                    src[stride * i + 7] - dst[stride * i + 7]);
1878
1879         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1880         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1881         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1882         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1883
1884         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1885         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1886         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1887         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
1888     }
1889
1890     for (i = 0; i < 8; i++) {
1891         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1892         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1893         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1894         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1895
1896         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1897         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1898         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1899         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1900
1901         sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
1902                BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
1903                BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
1904                BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
1905     }
1906     return sum;
1907 }
1908
1909 static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
1910                                 uint8_t *dummy, int stride, int h)
1911 {
1912     int i, temp[64], sum = 0;
1913
1914     assert(h == 8);
1915
1916     for (i = 0; i < 8; i++) {
1917         // FIXME: try pointer walks
1918         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1919                    src[stride * i + 0], src[stride * i + 1]);
1920         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1921                    src[stride * i + 2], src[stride * i + 3]);
1922         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1923                    src[stride * i + 4], src[stride * i + 5]);
1924         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1925                    src[stride * i + 6], src[stride * i + 7]);
1926
1927         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1928         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1929         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1930         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1931
1932         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1933         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1934         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1935         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
1936     }
1937
1938     for (i = 0; i < 8; i++) {
1939         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1940         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1941         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1942         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1943
1944         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1945         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1946         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1947         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1948
1949         sum +=
1950             BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
1951             + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
1952             + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
1953             + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
1954     }
1955
1956     sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
1957
1958     return sum;
1959 }
1960
1961 static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
1962                         uint8_t *src2, int stride, int h)
1963 {
1964     LOCAL_ALIGNED_16(int16_t, temp, [64]);
1965
1966     assert(h == 8);
1967
1968     s->dsp.diff_pixels(temp, src1, src2, stride);
1969     s->dsp.fdct(temp);
1970     return s->dsp.sum_abs_dctelem(temp);
1971 }
1972
#if CONFIG_GPL
/*
 * One 8-point integer transform pass built from adds, subtracts and shifts.
 * The user supplies SRC(x) to read input x and DST(x, v) to consume
 * output x; both must be defined before the macro is expanded.
 */
#define DCT8_1D                                         \
    {                                                   \
        const int s07 = SRC(0) + SRC(7);                \
        const int s16 = SRC(1) + SRC(6);                \
        const int s25 = SRC(2) + SRC(5);                \
        const int s34 = SRC(3) + SRC(4);                \
        const int a0  = s07 + s34;                      \
        const int a1  = s16 + s25;                      \
        const int a2  = s07 - s34;                      \
        const int a3  = s16 - s25;                      \
        const int d07 = SRC(0) - SRC(7);                \
        const int d16 = SRC(1) - SRC(6);                \
        const int d25 = SRC(2) - SRC(5);                \
        const int d34 = SRC(3) - SRC(4);                \
        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
        DST(0, a0 + a1);                                \
        DST(1, a4 + (a7 >> 2));                         \
        DST(2, a2 + (a3 >> 1));                         \
        DST(3, a5 + (a6 >> 2));                         \
        DST(4, a0 - a1);                                \
        DST(5, a6 - (a5 >> 2));                         \
        DST(6, (a2 >> 1) - a3);                         \
        DST(7, (a4 >> 2) - a7);                         \
    }

/**
 * Sum of absolute coefficients of the DCT8_1D integer transform applied to
 * the output of the context's diff_pixels() for two 8x8 blocks.
 *
 * @param s      context providing dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between rows of both blocks
 * @param h      unused
 */
static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    int16_t dct[8][8];
    int sum = 0;
    int i;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* First pass: transform every row in place. */
#define SRC(x) dct[i][x]
#define DST(x, v) dct[i][x] = v
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* Second pass: transform every column; the outputs are not stored,
     * only their magnitudes are accumulated. */
#define SRC(x) dct[x][i]
#define DST(x, v) sum += FFABS(v)
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return sum;
}
#endif
2026
2027 static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
2028                         uint8_t *src2, int stride, int h)
2029 {
2030     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2031     int sum = 0, i;
2032
2033     assert(h == 8);
2034
2035     s->dsp.diff_pixels(temp, src1, src2, stride);
2036     s->dsp.fdct(temp);
2037
2038     for (i = 0; i < 64; i++)
2039         sum = FFMAX(sum, FFABS(temp[i]));
2040
2041     return sum;
2042 }
2043
2044 static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
2045                            uint8_t *src2, int stride, int h)
2046 {
2047     LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
2048     int16_t *const bak = temp + 64;
2049     int sum = 0, i;
2050
2051     assert(h == 8);
2052     s->mb_intra = 0;
2053
2054     s->dsp.diff_pixels(temp, src1, src2, stride);
2055
2056     memcpy(bak, temp, 64 * sizeof(int16_t));
2057
2058     s->block_last_index[0 /* FIXME */] =
2059         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2060     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2061     ff_simple_idct_8(temp); // FIXME
2062
2063     for (i = 0; i < 64; i++)
2064         sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
2065
2066     return sum;
2067 }
2068
2069 static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2070                    int stride, int h)
2071 {
2072     const uint8_t *scantable = s->intra_scantable.permutated;
2073     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2074     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2075     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2076     int i, last, run, bits, level, distortion, start_i;
2077     const int esc_length = s->ac_esc_length;
2078     uint8_t *length, *last_length;
2079
2080     assert(h == 8);
2081
2082     copy_block8(lsrc1, src1, 8, stride, 8);
2083     copy_block8(lsrc2, src2, 8, stride, 8);
2084
2085     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2086
2087     s->block_last_index[0 /* FIXME */] =
2088     last                               =
2089         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2090
2091     bits = 0;
2092
2093     if (s->mb_intra) {
2094         start_i     = 1;
2095         length      = s->intra_ac_vlc_length;
2096         last_length = s->intra_ac_vlc_last_length;
2097         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2098     } else {
2099         start_i     = 0;
2100         length      = s->inter_ac_vlc_length;
2101         last_length = s->inter_ac_vlc_last_length;
2102     }
2103
2104     if (last >= start_i) {
2105         run = 0;
2106         for (i = start_i; i < last; i++) {
2107             int j = scantable[i];
2108             level = temp[j];
2109
2110             if (level) {
2111                 level += 64;
2112                 if ((level & (~127)) == 0)
2113                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2114                 else
2115                     bits += esc_length;
2116                 run = 0;
2117             } else
2118                 run++;
2119         }
2120         i = scantable[last];
2121
2122         level = temp[i] + 64;
2123
2124         assert(level - 64);
2125
2126         if ((level & (~127)) == 0) {
2127             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2128         } else
2129             bits += esc_length;
2130     }
2131
2132     if (last >= 0) {
2133         if (s->mb_intra)
2134             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2135         else
2136             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2137     }
2138
2139     s->dsp.idct_add(lsrc2, 8, temp);
2140
2141     distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2142
2143     return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
2144 }
2145
2146 static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2147                     int stride, int h)
2148 {
2149     const uint8_t *scantable = s->intra_scantable.permutated;
2150     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2151     int i, last, run, bits, level, start_i;
2152     const int esc_length = s->ac_esc_length;
2153     uint8_t *length, *last_length;
2154
2155     assert(h == 8);
2156
2157     s->dsp.diff_pixels(temp, src1, src2, stride);
2158
2159     s->block_last_index[0 /* FIXME */] =
2160     last                               =
2161         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2162
2163     bits = 0;
2164
2165     if (s->mb_intra) {
2166         start_i     = 1;
2167         length      = s->intra_ac_vlc_length;
2168         last_length = s->intra_ac_vlc_last_length;
2169         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2170     } else {
2171         start_i     = 0;
2172         length      = s->inter_ac_vlc_length;
2173         last_length = s->inter_ac_vlc_last_length;
2174     }
2175
2176     if (last >= start_i) {
2177         run = 0;
2178         for (i = start_i; i < last; i++) {
2179             int j = scantable[i];
2180             level = temp[j];
2181
2182             if (level) {
2183                 level += 64;
2184                 if ((level & (~127)) == 0)
2185                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2186                 else
2187                     bits += esc_length;
2188                 run = 0;
2189             } else
2190                 run++;
2191         }
2192         i = scantable[last];
2193
2194         level = temp[i] + 64;
2195
2196         assert(level - 64);
2197
2198         if ((level & (~127)) == 0)
2199             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2200         else
2201             bits += esc_length;
2202     }
2203
2204     return bits;
2205 }
2206
2207 #define VSAD_INTRA(size)                                                \
2208 static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
2209                                     uint8_t *s, uint8_t *dummy,         \
2210                                     int stride, int h)                  \
2211 {                                                                       \
2212     int score = 0, x, y;                                                \
2213                                                                         \
2214     for (y = 1; y < h; y++) {                                           \
2215         for (x = 0; x < size; x += 4) {                                 \
2216             score += FFABS(s[x]     - s[x + stride])     +              \
2217                      FFABS(s[x + 1] - s[x + stride + 1]) +              \
2218                      FFABS(s[x + 2] - s[x + 2 + stride]) +              \
2219                      FFABS(s[x + 3] - s[x + 3 + stride]);               \
2220         }                                                               \
2221         s += stride;                                                    \
2222     }                                                                   \
2223                                                                         \
2224     return score;                                                       \
2225 }
2226 VSAD_INTRA(8)
2227 VSAD_INTRA(16)
2228
2229 static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2230                     int stride, int h)
2231 {
2232     int score = 0, x, y;
2233
2234     for (y = 1; y < h; y++) {
2235         for (x = 0; x < 16; x++)
2236             score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2237         s1 += stride;
2238         s2 += stride;
2239     }
2240
2241     return score;
2242 }
2243
2244 #define SQ(a) ((a) * (a))
2245 #define VSSE_INTRA(size)                                                \
2246 static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
2247                                     uint8_t *s, uint8_t *dummy,         \
2248                                     int stride, int h)                  \
2249 {                                                                       \
2250     int score = 0, x, y;                                                \
2251                                                                         \
2252     for (y = 1; y < h; y++) {                                           \
2253         for (x = 0; x < size; x += 4) {                                 \
2254             score += SQ(s[x]     - s[x + stride]) +                     \
2255                      SQ(s[x + 1] - s[x + stride + 1]) +                 \
2256                      SQ(s[x + 2] - s[x + stride + 2]) +                 \
2257                      SQ(s[x + 3] - s[x + stride + 3]);                  \
2258         }                                                               \
2259         s += stride;                                                    \
2260     }                                                                   \
2261                                                                         \
2262     return score;                                                       \
2263 }
2264 VSSE_INTRA(8)
2265 VSSE_INTRA(16)
2266
2267 static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2268                     int stride, int h)
2269 {
2270     int score = 0, x, y;
2271
2272     for (y = 1; y < h; y++) {
2273         for (x = 0; x < 16; x++)
2274             score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2275         s1 += stride;
2276         s2 += stride;
2277     }
2278
2279     return score;
2280 }
2281
/**
 * Sum of squared differences between an int8_t and an int16_t vector.
 *
 * @param pix1 first vector
 * @param pix2 second vector
 * @param size number of elements in each vector
 * @return sum over i of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int delta = pix1[k] - pix2[k];

        total += delta * delta;
    }

    return total;
}
2291
2292 #define WRAPPER8_16_SQ(name8, name16)                                   \
2293 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
2294                   int stride, int h)                                    \
2295 {                                                                       \
2296     int score = 0;                                                      \
2297                                                                         \
2298     score += name8(s, dst, src, stride, 8);                             \
2299     score += name8(s, dst + 8, src + 8, stride, 8);                     \
2300     if (h == 16) {                                                      \
2301         dst   += 8 * stride;                                            \
2302         src   += 8 * stride;                                            \
2303         score += name8(s, dst, src, stride, 8);                         \
2304         score += name8(s, dst + 8, src + 8, stride, 8);                 \
2305     }                                                                   \
2306     return score;                                                       \
2307 }
2308
2309 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2310 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2311 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2312 #if CONFIG_GPL
2313 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2314 #endif
2315 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2316 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2317 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2318 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2319
/**
 * Clip one value using unsigned integer comparisons only.
 *
 * The caller in this file passes the raw 32-bit patterns of floats, with
 * mini taken from a negative and maxi from a positive bound.
 *
 * @param a        value to clip
 * @param mini     returned when a compares greater than it (unsigned)
 * @param maxi     returned when a with bit 31 flipped exceeds maxisign
 * @param maxisign maxi with bit 31 flipped
 * @return mini, maxi or a unchanged
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t sign_flipped = a ^ (1U << 31);

    if (a > mini)
        return mini;

    return sign_flipped > maxisign ? maxi : a;
}
2330
/**
 * Clip a float vector to [*min, *max] by comparing raw 32-bit patterns with
 * clipf_c_one(). The only caller in view uses it for *min < 0 < *max.
 *
 * Elements are handled in groups of 8, so a len that is not a multiple of 8
 * makes the function touch elements up to the next multiple of 8.
 *
 * The bit patterns are moved with memcpy() rather than by casting the float
 * pointers to uint32_t pointers, which would access float storage through
 * an incompatible lvalue type. Like the clipping itself, this relies on
 * float being 32 bits wide.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src,
                                         float *min, float *max, int len)
{
    uint32_t mini, maxi, maxisign;
    int i, j;

    memcpy(&mini, min, sizeof(mini));
    memcpy(&maxi, max, sizeof(maxi));
    maxisign = maxi ^ (1U << 31);

    for (i = 0; i < len; i += 8) {
        for (j = 0; j < 8; j++) {
            uint32_t bits;

            memcpy(&bits, src + i + j, sizeof(bits));
            bits = clipf_c_one(bits, mini, maxi, maxisign);
            memcpy(dst + i + j, &bits, sizeof(bits));
        }
    }
}
2352
/**
 * Clip every element of a float vector to [min, max].
 *
 * When the bounds have opposite signs the work is delegated to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied element
 * by element. Elements are handled in groups of 8, so a len that is not a
 * multiple of 8 makes the function touch elements up to the next multiple
 * of 8.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src,
                           float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2373
/**
 * Dot product of two int16_t vectors.
 *
 * The sum is accumulated in an unsigned variable so that a total that does
 * not fit in 32 bits wraps around instead of overflowing a signed int.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements; values <= 0 yield 0
 * @return sum of v1[i] * v2[i], reduced to 32 bits
 */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
{
    unsigned res = 0;
    int i;

    for (i = 0; i < order; i++)
        res += v1[i] * v2[i]; /* a single product always fits in int */

    return (int32_t) res;
}
2384
/**
 * Dot product of v1 and v2, combined with the update v1[i] += mul * v3[i].
 *
 * The dot product uses the values of v1 from before the update. It is
 * accumulated in an unsigned variable so that a total that does not fit in
 * 32 bits wraps around instead of overflowing a signed int.
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector of the dot product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements; values <= 0 leave v1 alone and yield 0
 * @param mul   scale factor applied to v3
 * @return sum of v1[i] * v2[i], reduced to 32 bits
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    unsigned res = 0;
    int i;

    for (i = 0; i < order; i++) {
        res   += v1[i] * v2[i];
        v1[i] += mul * v3[i]; /* stored back truncated to 16 bits */
    }

    return (int32_t) res;
}
2397
/**
 * Clip every element of an int32_t vector to [min, max] with av_clip().
 *
 * Exactly len elements are processed, so len == 0 is a no-op and len does
 * not have to be a multiple of 8.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    unsigned int i;

    for (i = 0; i < len; i++)
        dst[i] = av_clip(src[i], min, max);
}
2413
/**
 * IDCT "put" wrapper: runs ff_j_rev_dct() on the coefficient block and then
 * passes the block to put_pixels_clamped_c() together with the destination.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to put_pixels_clamped_c()
 * @param block     coefficient block; presumably transformed in place
 */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
2419
/**
 * IDCT "add" wrapper: runs ff_j_rev_dct() on the coefficient block and then
 * passes the block to add_pixels_clamped_c() together with the destination.
 *
 * @param dest      destination pixels
 * @param line_size forwarded unchanged to add_pixels_clamped_c()
 * @param block     coefficient block; presumably transformed in place
 */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2425
2426 /* init static data */
2427 av_cold void ff_dsputil_static_init(void)
2428 {
2429     int i;
2430
2431     for (i = 0; i < 512; i++)
2432         ff_square_tab[i] = (i - 256) * (i - 256);
2433 }
2434
2435 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2436 {
2437     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
2438
2439 #if CONFIG_ENCODERS
2440     if (avctx->bits_per_raw_sample == 10) {
2441         c->fdct    = ff_jpeg_fdct_islow_10;
2442         c->fdct248 = ff_fdct248_islow_10;
2443     } else {
2444         if (avctx->dct_algo == FF_DCT_FASTINT) {
2445             c->fdct    = ff_fdct_ifast;
2446             c->fdct248 = ff_fdct_ifast248;
2447         } else if (avctx->dct_algo == FF_DCT_FAAN) {
2448             c->fdct    = ff_faandct;
2449             c->fdct248 = ff_faandct248;
2450         } else {
2451             c->fdct    = ff_jpeg_fdct_islow_8; // slow/accurate/default
2452             c->fdct248 = ff_fdct248_islow_8;
2453         }
2454     }
2455 #endif /* CONFIG_ENCODERS */
2456
2457     if (avctx->bits_per_raw_sample == 10) {
2458         c->idct_put              = ff_simple_idct_put_10;
2459         c->idct_add              = ff_simple_idct_add_10;
2460         c->idct                  = ff_simple_idct_10;
2461         c->idct_permutation_type = FF_NO_IDCT_PERM;
2462     } else {
2463         if (avctx->idct_algo == FF_IDCT_INT) {
2464             c->idct_put              = jref_idct_put;
2465             c->idct_add              = jref_idct_add;
2466             c->idct                  = ff_j_rev_dct;
2467             c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2468         } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2469             c->idct_put              = ff_faanidct_put;
2470             c->idct_add              = ff_faanidct_add;
2471             c->idct                  = ff_faanidct;
2472             c->idct_permutation_type = FF_NO_IDCT_PERM;
2473         } else { // accurate/default
2474             c->idct_put              = ff_simple_idct_put_8;
2475             c->idct_add              = ff_simple_idct_add_8;
2476             c->idct                  = ff_simple_idct_8;
2477             c->idct_permutation_type = FF_NO_IDCT_PERM;
2478         }
2479     }
2480
2481     c->diff_pixels = diff_pixels_c;
2482
2483     c->put_pixels_clamped        = put_pixels_clamped_c;
2484     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2485     c->add_pixels_clamped        = add_pixels_clamped_c;
2486
2487     c->sum_abs_dctelem = sum_abs_dctelem_c;
2488
2489     c->gmc1 = gmc1_c;
2490     c->gmc  = ff_gmc_c;
2491
2492     c->pix_sum   = pix_sum_c;
2493     c->pix_norm1 = pix_norm1_c;
2494
2495     c->fill_block_tab[0] = fill_block16_c;
2496     c->fill_block_tab[1] = fill_block8_c;
2497
2498     /* TODO [0] 16  [1] 8 */
2499     c->pix_abs[0][0] = pix_abs16_c;
2500     c->pix_abs[0][1] = pix_abs16_x2_c;
2501     c->pix_abs[0][2] = pix_abs16_y2_c;
2502     c->pix_abs[0][3] = pix_abs16_xy2_c;
2503     c->pix_abs[1][0] = pix_abs8_c;
2504     c->pix_abs[1][1] = pix_abs8_x2_c;
2505     c->pix_abs[1][2] = pix_abs8_y2_c;
2506     c->pix_abs[1][3] = pix_abs8_xy2_c;
2507
2508 #define dspfunc(PFX, IDX, NUM)                              \
2509     c->PFX ## _pixels_tab[IDX][0]  = PFX ## NUM ## _mc00_c; \
2510     c->PFX ## _pixels_tab[IDX][1]  = PFX ## NUM ## _mc10_c; \
2511     c->PFX ## _pixels_tab[IDX][2]  = PFX ## NUM ## _mc20_c; \
2512     c->PFX ## _pixels_tab[IDX][3]  = PFX ## NUM ## _mc30_c; \
2513     c->PFX ## _pixels_tab[IDX][4]  = PFX ## NUM ## _mc01_c; \
2514     c->PFX ## _pixels_tab[IDX][5]  = PFX ## NUM ## _mc11_c; \
2515     c->PFX ## _pixels_tab[IDX][6]  = PFX ## NUM ## _mc21_c; \
2516     c->PFX ## _pixels_tab[IDX][7]  = PFX ## NUM ## _mc31_c; \
2517     c->PFX ## _pixels_tab[IDX][8]  = PFX ## NUM ## _mc02_c; \
2518     c->PFX ## _pixels_tab[IDX][9]  = PFX ## NUM ## _mc12_c; \
2519     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2520     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2521     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2522     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2523     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2524     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2525
2526     dspfunc(put_qpel, 0, 16);
2527     dspfunc(put_qpel, 1, 8);
2528
2529     dspfunc(put_no_rnd_qpel, 0, 16);
2530     dspfunc(put_no_rnd_qpel, 1, 8);
2531
2532     dspfunc(avg_qpel, 0, 16);
2533     dspfunc(avg_qpel, 1, 8);
2534
2535 #undef dspfunc
2536
2537     c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
2538     c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
2539     c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
2540     c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
2541     c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
2542     c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
2543     c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
2544     c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
2545
2546 #define SET_CMP_FUNC(name)                      \
2547     c->name[0] = name ## 16_c;                  \
2548     c->name[1] = name ## 8x8_c;
2549
2550     SET_CMP_FUNC(hadamard8_diff)
2551     c->hadamard8_diff[4] = hadamard8_intra16_c;
2552     c->hadamard8_diff[5] = hadamard8_intra8x8_c;
2553     SET_CMP_FUNC(dct_sad)
2554     SET_CMP_FUNC(dct_max)
2555 #if CONFIG_GPL
2556     SET_CMP_FUNC(dct264_sad)
2557 #endif
2558     c->sad[0] = pix_abs16_c;
2559     c->sad[1] = pix_abs8_c;
2560     c->sse[0] = sse16_c;
2561     c->sse[1] = sse8_c;
2562     c->sse[2] = sse4_c;
2563     SET_CMP_FUNC(quant_psnr)
2564     SET_CMP_FUNC(rd)
2565     SET_CMP_FUNC(bit)
2566     c->vsad[0] = vsad16_c;
2567     c->vsad[4] = vsad_intra16_c;
2568     c->vsad[5] = vsad_intra8_c;
2569     c->vsse[0] = vsse16_c;
2570     c->vsse[4] = vsse_intra16_c;
2571     c->vsse[5] = vsse_intra8_c;
2572     c->nsse[0] = nsse16_c;
2573     c->nsse[1] = nsse8_c;
2574
2575     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2576
2577     c->add_bytes                      = add_bytes_c;
2578     c->add_hfyu_median_prediction     = add_hfyu_median_prediction_c;
2579     c->add_hfyu_left_prediction       = add_hfyu_left_prediction_c;
2580     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2581
2582     c->diff_bytes                 = diff_bytes_c;
2583     c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
2584
2585     c->bswap_buf   = bswap_buf;
2586     c->bswap16_buf = bswap16_buf;
2587
2588     c->try_8x8basis = try_8x8basis_c;
2589     c->add_8x8basis = add_8x8basis_c;
2590
2591     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2592
2593     c->scalarproduct_int16 = scalarproduct_int16_c;
2594     c->vector_clip_int32   = vector_clip_int32_c;
2595     c->vector_clipf        = vector_clipf_c;
2596
2597     c->shrink[0] = av_image_copy_plane;
2598     c->shrink[1] = ff_shrink22;
2599     c->shrink[2] = ff_shrink44;
2600     c->shrink[3] = ff_shrink88;
2601
2602     c->add_pixels8 = add_pixels8_c;
2603
2604 #undef FUNC
2605 #undef FUNCC
2606 #define FUNC(f,  depth) f ## _ ## depth
2607 #define FUNCC(f, depth) f ## _ ## depth ## _c
2608
2609     c->draw_edges = FUNCC(draw_edges, 8);
2610
2611     c->clear_block  = FUNCC(clear_block, 8);
2612     c->clear_blocks = FUNCC(clear_blocks, 8);
2613
2614 #define BIT_DEPTH_FUNCS(depth)                  \
2615     c->get_pixels = FUNCC(get_pixels, depth);
2616
2617     switch (avctx->bits_per_raw_sample) {
2618     case 9:
2619     case 10:
2620         BIT_DEPTH_FUNCS(16);
2621         break;
2622     default:
2623         BIT_DEPTH_FUNCS(8);
2624         break;
2625     }
2626
2627     if (ARCH_ARM)
2628         ff_dsputil_init_arm(c, avctx, high_bit_depth);
2629     if (ARCH_BFIN)
2630         ff_dsputil_init_bfin(c, avctx, high_bit_depth);
2631     if (ARCH_PPC)
2632         ff_dsputil_init_ppc(c, avctx, high_bit_depth);
2633     if (ARCH_X86)
2634         ff_dsputil_init_x86(c, avctx, high_bit_depth);
2635
2636     ff_init_scantable_permutation(c->idct_permutation,
2637                                   c->idct_permutation_type);
2638 }