]> git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c
dsputil_template: Move bits that are used templatized into separate file
[ffmpeg] / libavcodec / dsputil.c
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7  *
8  * This file is part of Libav.
9  *
10  * Libav is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * Libav is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with Libav; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24
25 /**
26  * @file
27  * DSP utils
28  */
29
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "avcodec.h"
33 #include "copy_block.h"
34 #include "dct.h"
35 #include "dsputil.h"
36 #include "simple_idct.h"
37 #include "faandct.h"
38 #include "faanidct.h"
39 #include "imgconvert.h"
40 #include "mathops.h"
41 #include "mpegvideo.h"
42 #include "config.h"
43
/* Lookup table with 512 entries; it is only zero-initialized here.
 * Users in this file address it through a pointer to its middle
 * (ff_square_tab + 256), so offsets in the range -256..255 are valid.
 * NOTE(review): presumably filled with squared values by an init routine
 * outside this excerpt -- confirm. */
uint32_t ff_square_tab[512] = { 0, };
45
/* Instantiate the encoder template once with BIT_DEPTH set to 16. */
#define BIT_DEPTH 16
#include "dsputilenc_template.c"
#undef BIT_DEPTH

/* Instantiate all templates with BIT_DEPTH set to 8. BIT_DEPTH is not
 * undefined afterwards, so it stays 8 for the remainder of this file. */
#define BIT_DEPTH 8
#include "hpel_template.c"
#include "tpel_template.c"
#include "dsputil_template.c"
#include "dsputilenc_template.c"
55
/* Byte-replicated constants. ~0UL / 255 is 0x0101...01 (the value 1 in every
 * byte of an unsigned long), so multiplying it by a byte value copies that
 * value into every byte: 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f for pb_7f,
 * depending on the width of unsigned long. */
#define pb_7f (~0UL / 255 * 0x7f)
#define pb_80 (~0UL / 255 * 0x80)
59
/* Specific zigzag scan for 248 idct. NOTE that unlike the
 * specification, we interleave the fields.
 * The table has 64 entries and contains each value 0..63 exactly once. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
72
/* Alternate horizontal scan order.
 * The table has 64 entries and contains each value 0..63 exactly once.
 * NOTE(review): presumably the values are raster positions (row * 8 + column)
 * inside an 8x8 block -- confirm against the users of this table. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
83
/* Alternate vertical scan order.
 * The table has 64 entries and contains each value 0..63 exactly once.
 * NOTE(review): presumably the values are raster positions (row * 8 + column)
 * inside an 8x8 block -- confirm against the users of this table. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
94
/* Input permutation for the simple_idct_mmx.
 * ff_init_scantable_permutation() copies this table verbatim into the
 * caller's idct_permutation[] when FF_SIMPLE_IDCT_PERM is requested. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
106
/* Replacement for the low three bits of an index; used by
 * ff_init_scantable_permutation() for FF_SSE2_IDCT_PERM, where the upper
 * bits (i & 0x38) are kept unchanged. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
108
109 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
110                                const uint8_t *src_scantable)
111 {
112     int i, end;
113
114     st->scantable = src_scantable;
115
116     for (i = 0; i < 64; i++) {
117         int j = src_scantable[i];
118         st->permutated[i] = permutation[j];
119     }
120
121     end = -1;
122     for (i = 0; i < 64; i++) {
123         int j = st->permutated[i];
124         if (j > end)
125             end = j;
126         st->raster_end[i] = end;
127     }
128 }
129
130 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
131                                            int idct_permutation_type)
132 {
133     int i;
134
135     switch (idct_permutation_type) {
136     case FF_NO_IDCT_PERM:
137         for (i = 0; i < 64; i++)
138             idct_permutation[i] = i;
139         break;
140     case FF_LIBMPEG2_IDCT_PERM:
141         for (i = 0; i < 64; i++)
142             idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
143         break;
144     case FF_SIMPLE_IDCT_PERM:
145         for (i = 0; i < 64; i++)
146             idct_permutation[i] = simple_mmx_permutation[i];
147         break;
148     case FF_TRANSPOSE_IDCT_PERM:
149         for (i = 0; i < 64; i++)
150             idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
151         break;
152     case FF_PARTTRANS_IDCT_PERM:
153         for (i = 0; i < 64; i++)
154             idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
155         break;
156     case FF_SSE2_IDCT_PERM:
157         for (i = 0; i < 64; i++)
158             idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
159         break;
160     default:
161         av_log(NULL, AV_LOG_ERROR,
162                "Internal error, IDCT permutation not set\n");
163     }
164 }
165
/**
 * Sum all pixel values of a 16x16 block.
 *
 * @param pix       pointer to the first pixel of the block
 * @param line_size distance in bytes between the starts of two rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
186
187 static int pix_norm1_c(uint8_t *pix, int line_size)
188 {
189     int s = 0, i, j;
190     uint32_t *sq = ff_square_tab + 256;
191
192     for (i = 0; i < 16; i++) {
193         for (j = 0; j < 16; j += 8) {
194 #if 0
195             s += sq[pix[0]];
196             s += sq[pix[1]];
197             s += sq[pix[2]];
198             s += sq[pix[3]];
199             s += sq[pix[4]];
200             s += sq[pix[5]];
201             s += sq[pix[6]];
202             s += sq[pix[7]];
203 #else
204 #if HAVE_FAST_64BIT
205             register uint64_t x = *(uint64_t *) pix;
206             s += sq[x         & 0xff];
207             s += sq[(x >>  8) & 0xff];
208             s += sq[(x >> 16) & 0xff];
209             s += sq[(x >> 24) & 0xff];
210             s += sq[(x >> 32) & 0xff];
211             s += sq[(x >> 40) & 0xff];
212             s += sq[(x >> 48) & 0xff];
213             s += sq[(x >> 56) & 0xff];
214 #else
215             register uint32_t x = *(uint32_t *) pix;
216             s += sq[x         & 0xff];
217             s += sq[(x >>  8) & 0xff];
218             s += sq[(x >> 16) & 0xff];
219             s += sq[(x >> 24) & 0xff];
220             x  = *(uint32_t *) (pix + 4);
221             s += sq[x         & 0xff];
222             s += sq[(x >>  8) & 0xff];
223             s += sq[(x >> 16) & 0xff];
224             s += sq[(x >> 24) & 0xff];
225 #endif
226 #endif
227             pix += 8;
228         }
229         pix += line_size - 16;
230     }
231     return s;
232 }
233
/**
 * Byte-swap an array of 32-bit words.
 *
 * @param dst destination array, w elements are written
 * @param src source array, w elements are read
 * @param w   number of 32-bit elements; nothing is done when w <= 0
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int n;

    /* Elements are handled strictly in ascending order. */
    for (n = 0; n < w; n++)
        dst[n] = av_bswap32(src[n]);
}
251
/**
 * Byte-swap an array of 16-bit words.
 *
 * @param dst destination array, len elements are written
 * @param src source array, len elements are read
 * @param len number of 16-bit elements; nothing is done when len <= 0
 *
 * A counted loop is used so that a negative len is a no-op, matching
 * bswap_buf(). A "while (len--)" loop would not stop for negative values
 * and would run far past the end of both buffers.
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;

    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
257
258 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
259                   int line_size, int h)
260 {
261     int s = 0, i;
262     uint32_t *sq = ff_square_tab + 256;
263
264     for (i = 0; i < h; i++) {
265         s    += sq[pix1[0] - pix2[0]];
266         s    += sq[pix1[1] - pix2[1]];
267         s    += sq[pix1[2] - pix2[2]];
268         s    += sq[pix1[3] - pix2[3]];
269         pix1 += line_size;
270         pix2 += line_size;
271     }
272     return s;
273 }
274
275 static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
276                   int line_size, int h)
277 {
278     int s = 0, i;
279     uint32_t *sq = ff_square_tab + 256;
280
281     for (i = 0; i < h; i++) {
282         s    += sq[pix1[0] - pix2[0]];
283         s    += sq[pix1[1] - pix2[1]];
284         s    += sq[pix1[2] - pix2[2]];
285         s    += sq[pix1[3] - pix2[3]];
286         s    += sq[pix1[4] - pix2[4]];
287         s    += sq[pix1[5] - pix2[5]];
288         s    += sq[pix1[6] - pix2[6]];
289         s    += sq[pix1[7] - pix2[7]];
290         pix1 += line_size;
291         pix2 += line_size;
292     }
293     return s;
294 }
295
296 static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
297                    int line_size, int h)
298 {
299     int s = 0, i;
300     uint32_t *sq = ff_square_tab + 256;
301
302     for (i = 0; i < h; i++) {
303         s += sq[pix1[0]  - pix2[0]];
304         s += sq[pix1[1]  - pix2[1]];
305         s += sq[pix1[2]  - pix2[2]];
306         s += sq[pix1[3]  - pix2[3]];
307         s += sq[pix1[4]  - pix2[4]];
308         s += sq[pix1[5]  - pix2[5]];
309         s += sq[pix1[6]  - pix2[6]];
310         s += sq[pix1[7]  - pix2[7]];
311         s += sq[pix1[8]  - pix2[8]];
312         s += sq[pix1[9]  - pix2[9]];
313         s += sq[pix1[10] - pix2[10]];
314         s += sq[pix1[11] - pix2[11]];
315         s += sq[pix1[12] - pix2[12]];
316         s += sq[pix1[13] - pix2[13]];
317         s += sq[pix1[14] - pix2[14]];
318         s += sq[pix1[15] - pix2[15]];
319
320         pix1 += line_size;
321         pix2 += line_size;
322     }
323     return s;
324 }
325
/**
 * Compute the per-pixel difference of two 8x8 blocks.
 *
 * @param block  output, 64 contiguous values stored row by row (s1 - s2)
 * @param s1     first 8x8 source block
 * @param s2     second 8x8 source block
 * @param stride distance in bytes between rows, shared by s1 and s2
 */
static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        int16_t *out = block + row * 8;

        for (col = 0; col < 8; col++)
            out[col] = s1[col] - s2[col];
        s1 += stride;
        s2 += stride;
    }
}
346
/**
 * Store an 8x8 block of 16-bit values as pixels.
 *
 * Every value is passed through av_clip_uint8() before it is written.
 *
 * @param block     64 contiguous input values, row by row
 * @param pixels    destination of the top-left pixel
 * @param line_size distance in bytes between destination rows
 */
static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int row, col;

    /* write the pixels */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = av_clip_uint8(block[col]);

        block  += 8;
        pixels += line_size;
    }
}
367
/**
 * Store an 8x8 block of signed 16-bit values as unsigned pixels.
 *
 * Values below -128 become 0, values above 127 become 255, and every other
 * value v is stored as v + 128.
 *
 * @param block     64 contiguous input values, row by row
 * @param pixels    destination of the top-left pixel
 * @param line_size distance in bytes between destination rows
 */
static void put_signed_pixels_clamped_c(const int16_t *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        const int16_t *in = block + row * 8;

        for (col = 0; col < 8; col++) {
            const int16_t v = in[col];
            uint8_t out;

            if (v < -128)
                out = 0;
            else if (v > 127)
                out = 255;
            else
                out = (uint8_t) (v + 128);
            pixels[col] = out;
        }
        pixels += line_size;
    }
}
388
/**
 * Add an 8x8 block of 16-bit values to pixels without clamping.
 *
 * Each sum is stored back into a uint8_t, so results outside 0..255 wrap
 * around modulo 256.
 *
 * @param pixels    top-left pixel of the block to modify
 * @param block     64 contiguous values to add, row by row
 * @param line_size distance in bytes between pixel rows
 */
static void add_pixels8_c(uint8_t *restrict pixels, int16_t *block,
                          int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] += block[col];
        block  += 8;
        pixels += line_size;
    }
}
407
/**
 * Add an 8x8 block of 16-bit values to pixels.
 *
 * Every sum is passed through av_clip_uint8() before it is written back.
 *
 * @param block     64 contiguous values to add, row by row
 * @param pixels    top-left pixel of the block to modify
 * @param line_size distance in bytes between pixel rows
 */
static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int row, col;

    /* update the pixels in place */
    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            pixels[col] = av_clip_uint8(pixels[col] + block[col]);
        block  += 8;
        pixels += line_size;
    }
}
427
/**
 * Sum FFABS() of the 64 values in a block.
 *
 * @param block 64 contiguous 16-bit values
 * @return accumulated FFABS() results
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    const int16_t *const end = block + 64;
    const int16_t *cur;
    int total = 0;

    for (cur = block; cur != end; cur++)
        total += FFABS(*cur);
    return total;
}
436
/**
 * Fill h rows of 16 bytes each with a constant value.
 *
 * @param block     first byte of the first row
 * @param value     byte value to store
 * @param line_size distance in bytes between rows
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 16);
        row += line_size;
    }
}
446
/**
 * Fill h rows of 8 bytes each with a constant value.
 *
 * @param block     first byte of the first row
 * @param value     byte value to store
 * @param line_size distance in bytes between rows
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 8);
        row += line_size;
    }
}
456
/* Rounded-up average of two (avg2) or four (avg4) values.
 * Every argument is parenthesized so that arguments containing operators of
 * lower precedence than '+' (e.g. '|', '^', '?:') are evaluated as a unit.
 * Each argument is evaluated exactly once. */
#define avg2(a, b) (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
459
/**
 * Bilinear interpolation of a block 8 pixels wide at a 1/16-pel offset.
 *
 * Each output pixel is a weighted sum of the source pixel, its right
 * neighbour, the pixel below and the pixel below-right. The four weights
 * add up to 256; rounder is added before the sum is shifted right by 8.
 *
 * @param dst     destination block
 * @param src     source block; 9 columns and h + 1 rows are read
 * @param stride  distance in bytes between rows, shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal offset used to form the weights
 * @param y16     vertical offset used to form the weights
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int w_cur        = (16 - x16) * (16 - y16);
    const int w_right      = x16        * (16 - y16);
    const int w_below      = (16 - x16) * y16;
    const int w_belowright = x16        * y16;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *next = src + stride;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_cur        * src[col]      +
                        w_right      * src[col + 1]  +
                        w_below      * next[col]     +
                        w_belowright * next[col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
482
/**
 * Motion compensation of a block 8 pixels wide with a per-pixel source
 * position that advances linearly in x and y.
 *
 * For each output pixel the source position is taken from the upper bits
 * (>> 16) of a running coordinate, split into an integer part and a
 * fraction of `shift` bits, and the output is interpolated from up to four
 * neighbouring source pixels. When the integer position falls outside
 * [0, width - 1) horizontally or [0, height - 1) vertically, that coordinate
 * is clipped and interpolation is done only along the other axis (or not at
 * all if both are outside).
 *
 * @param dst     destination block
 * @param src     source picture
 * @param stride  distance in bytes between rows, shared by dst and src
 * @param h       number of rows to produce
 * @param ox, oy  starting coordinates for the first row
 * @param dxx     increment of the x coordinate per output column
 * @param dyx     increment of the y coordinate per output column
 * @param dxy     increment of the starting x coordinate per output row
 * @param dyy     increment of the starting y coordinate per output row
 * @param shift   number of fractional bits in the source position
 * @param r       rounding value added before the final shift
 * @param width   source width used for the bounds checks
 * @param height  source height used for the bounds checks
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    const int last_x    = width  - 1;
    const int last_y    = height - 1;
    int row;

    for (row = 0; row < h; row++) {
        uint8_t *out = dst + row * stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) { // FIXME: optimize
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            const int frac_x = src_x & frac_mask;
            const int frac_y = src_y & frac_mask;
            int inside_x, inside_y, index;

            src_x >>= shift;
            src_y >>= shift;

            /* The unsigned compare rejects negative positions as well. */
            inside_x = (unsigned) src_x < (unsigned) last_x;
            inside_y = (unsigned) src_y < (unsigned) last_y;

            if (inside_x && inside_y) {
                /* Full bilinear interpolation. */
                index    = src_x + src_y * stride;
                out[col] =
                    ((src[index]              * (s - frac_x) +
                      src[index + 1]          * frac_x) * (s - frac_y) +
                     (src[index + stride]     * (s - frac_x) +
                      src[index + stride + 1] * frac_x) * frac_y +
                     r) >> (shift * 2);
            } else if (inside_x) {
                /* Row clipped: interpolate horizontally only. */
                index    = src_x + av_clip(src_y, 0, last_y) * stride;
                out[col] =
                    ((src[index]     * (s - frac_x) +
                      src[index + 1] * frac_x) * s +
                     r) >> (shift * 2);
            } else if (inside_y) {
                /* Column clipped: interpolate vertically only. */
                index    = av_clip(src_x, 0, last_x) + src_y * stride;
                out[col] =
                    ((src[index]          * (s - frac_y) +
                      src[index + stride] * frac_y) * s +
                     r) >> (shift * 2);
            } else {
                /* Both clipped: plain copy of the nearest pixel. */
                index    = av_clip(src_x, 0, last_x) +
                           av_clip(src_y, 0, last_y) * stride;
                out[col] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
545
546 #define QPEL_MC(r, OPNAME, RND, OP)                                           \
547 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src,       \
548                                             int dstStride, int srcStride,     \
549                                             int h)                            \
550 {                                                                             \
551     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
552     int i;                                                                    \
553                                                                               \
554     for (i = 0; i < h; i++) {                                                 \
555         OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
556         OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
557         OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
558         OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
559         OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
560         OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
561         OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
562         OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
563         dst += dstStride;                                                     \
564         src += srcStride;                                                     \
565     }                                                                         \
566 }                                                                             \
567                                                                               \
568 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src,       \
569                                             int dstStride, int srcStride)     \
570 {                                                                             \
571     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
572     const int w = 8;                                                          \
573     int i;                                                                    \
574                                                                               \
575     for (i = 0; i < w; i++) {                                                 \
576         const int src0 = src[0 * srcStride];                                  \
577         const int src1 = src[1 * srcStride];                                  \
578         const int src2 = src[2 * srcStride];                                  \
579         const int src3 = src[3 * srcStride];                                  \
580         const int src4 = src[4 * srcStride];                                  \
581         const int src5 = src[5 * srcStride];                                  \
582         const int src6 = src[6 * srcStride];                                  \
583         const int src7 = src[7 * srcStride];                                  \
584         const int src8 = src[8 * srcStride];                                  \
585         OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
586         OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
587         OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
588         OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
589         OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
590         OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
591         OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
592         OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
593         dst++;                                                                \
594         src++;                                                                \
595     }                                                                         \
596 }                                                                             \
597                                                                               \
598 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src,      \
599                                              int dstStride, int srcStride,    \
600                                              int h)                           \
601 {                                                                             \
602     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
603     int i;                                                                    \
604                                                                               \
605     for (i = 0; i < h; i++) {                                                 \
606         OP(dst[0],  (src[0]  + src[1])  * 20 - (src[0]  + src[2])  * 6 + (src[1]  + src[3])  * 3 - (src[2]  + src[4]));  \
607         OP(dst[1],  (src[1]  + src[2])  * 20 - (src[0]  + src[3])  * 6 + (src[0]  + src[4])  * 3 - (src[1]  + src[5]));  \
608         OP(dst[2],  (src[2]  + src[3])  * 20 - (src[1]  + src[4])  * 6 + (src[0]  + src[5])  * 3 - (src[0]  + src[6]));  \
609         OP(dst[3],  (src[3]  + src[4])  * 20 - (src[2]  + src[5])  * 6 + (src[1]  + src[6])  * 3 - (src[0]  + src[7]));  \
610         OP(dst[4],  (src[4]  + src[5])  * 20 - (src[3]  + src[6])  * 6 + (src[2]  + src[7])  * 3 - (src[1]  + src[8]));  \
611         OP(dst[5],  (src[5]  + src[6])  * 20 - (src[4]  + src[7])  * 6 + (src[3]  + src[8])  * 3 - (src[2]  + src[9]));  \
612         OP(dst[6],  (src[6]  + src[7])  * 20 - (src[5]  + src[8])  * 6 + (src[4]  + src[9])  * 3 - (src[3]  + src[10])); \
613         OP(dst[7],  (src[7]  + src[8])  * 20 - (src[6]  + src[9])  * 6 + (src[5]  + src[10]) * 3 - (src[4]  + src[11])); \
614         OP(dst[8],  (src[8]  + src[9])  * 20 - (src[7]  + src[10]) * 6 + (src[6]  + src[11]) * 3 - (src[5]  + src[12])); \
615         OP(dst[9],  (src[9]  + src[10]) * 20 - (src[8]  + src[11]) * 6 + (src[7]  + src[12]) * 3 - (src[6]  + src[13])); \
616         OP(dst[10], (src[10] + src[11]) * 20 - (src[9]  + src[12]) * 6 + (src[8]  + src[13]) * 3 - (src[7]  + src[14])); \
617         OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9]  + src[14]) * 3 - (src[8]  + src[15])); \
618         OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9]  + src[16])); \
619         OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
620         OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
621         OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
622         dst += dstStride;                                                     \
623         src += srcStride;                                                     \
624     }                                                                         \
625 }                                                                             \
626                                                                               \
627 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src,      \
628                                              int dstStride, int srcStride)    \
629 {                                                                             \
630     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
631     const int w = 16;                                                         \
632     int i;                                                                    \
633                                                                               \
634     for (i = 0; i < w; i++) {                                                 \
635         const int src0  = src[0  * srcStride];                                \
636         const int src1  = src[1  * srcStride];                                \
637         const int src2  = src[2  * srcStride];                                \
638         const int src3  = src[3  * srcStride];                                \
639         const int src4  = src[4  * srcStride];                                \
640         const int src5  = src[5  * srcStride];                                \
641         const int src6  = src[6  * srcStride];                                \
642         const int src7  = src[7  * srcStride];                                \
643         const int src8  = src[8  * srcStride];                                \
644         const int src9  = src[9  * srcStride];                                \
645         const int src10 = src[10 * srcStride];                                \
646         const int src11 = src[11 * srcStride];                                \
647         const int src12 = src[12 * srcStride];                                \
648         const int src13 = src[13 * srcStride];                                \
649         const int src14 = src[14 * srcStride];                                \
650         const int src15 = src[15 * srcStride];                                \
651         const int src16 = src[16 * srcStride];                                \
652         OP(dst[0  * dstStride], (src0  + src1)  * 20 - (src0  + src2)  * 6 + (src1  + src3)  * 3 - (src2  + src4));  \
653         OP(dst[1  * dstStride], (src1  + src2)  * 20 - (src0  + src3)  * 6 + (src0  + src4)  * 3 - (src1  + src5));  \
654         OP(dst[2  * dstStride], (src2  + src3)  * 20 - (src1  + src4)  * 6 + (src0  + src5)  * 3 - (src0  + src6));  \
655         OP(dst[3  * dstStride], (src3  + src4)  * 20 - (src2  + src5)  * 6 + (src1  + src6)  * 3 - (src0  + src7));  \
656         OP(dst[4  * dstStride], (src4  + src5)  * 20 - (src3  + src6)  * 6 + (src2  + src7)  * 3 - (src1  + src8));  \
657         OP(dst[5  * dstStride], (src5  + src6)  * 20 - (src4  + src7)  * 6 + (src3  + src8)  * 3 - (src2  + src9));  \
658         OP(dst[6  * dstStride], (src6  + src7)  * 20 - (src5  + src8)  * 6 + (src4  + src9)  * 3 - (src3  + src10)); \
659         OP(dst[7  * dstStride], (src7  + src8)  * 20 - (src6  + src9)  * 6 + (src5  + src10) * 3 - (src4  + src11)); \
660         OP(dst[8  * dstStride], (src8  + src9)  * 20 - (src7  + src10) * 6 + (src6  + src11) * 3 - (src5  + src12)); \
661         OP(dst[9  * dstStride], (src9  + src10) * 20 - (src8  + src11) * 6 + (src7  + src12) * 3 - (src6  + src13)); \
662         OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9  + src12) * 6 + (src8  + src13) * 3 - (src7  + src14)); \
663         OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9  + src14) * 3 - (src8  + src15)); \
664         OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9  + src16)); \
665         OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
666         OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
667         OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
668         dst++;                                                                \
669         src++;                                                                \
670     }                                                                         \
671 }                                                                             \
672                                                                               \
673 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src,                \
674                                    ptrdiff_t stride)                          \
675 {                                                                             \
676     uint8_t half[64];                                                         \
677                                                                               \
678     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);             \
679     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);             \
680 }                                                                             \
681                                                                               \
682 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src,                \
683                                    ptrdiff_t stride)                          \
684 {                                                                             \
685     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);             \
686 }                                                                             \
687                                                                               \
688 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src,                \
689                                    ptrdiff_t stride)                          \
690 {                                                                             \
691     uint8_t half[64];                                                         \
692                                                                               \
693     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);             \
694     OPNAME ## pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8);         \
695 }                                                                             \
696                                                                               \
697 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src,                \
698                                    ptrdiff_t stride)                          \
699 {                                                                             \
700     uint8_t full[16 * 9];                                                     \
701     uint8_t half[64];                                                         \
702                                                                               \
703     copy_block9(full, src, 16, stride, 9);                                    \
704     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);                   \
705     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);                \
706 }                                                                             \
707                                                                               \
708 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src,                \
709                                    ptrdiff_t stride)                          \
710 {                                                                             \
711     uint8_t full[16 * 9];                                                     \
712                                                                               \
713     copy_block9(full, src, 16, stride, 9);                                    \
714     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);                   \
715 }                                                                             \
716                                                                               \
717 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src,                \
718                                    ptrdiff_t stride)                          \
719 {                                                                             \
720     uint8_t full[16 * 9];                                                     \
721     uint8_t half[64];                                                         \
722                                                                               \
723     copy_block9(full, src, 16, stride, 9);                                    \
724     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);                   \
725     OPNAME ## pixels8_l2_8(dst, full + 16, half, stride, 16, 8, 8);           \
726 }                                                                             \
727                                                                               \
728 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src,            \
729                                        ptrdiff_t stride)                      \
730 {                                                                             \
731     uint8_t full[16 * 9];                                                     \
732     uint8_t halfH[72];                                                        \
733     uint8_t halfV[64];                                                        \
734     uint8_t halfHV[64];                                                       \
735                                                                               \
736     copy_block9(full, src, 16, stride, 9);                                    \
737     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
738     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
739     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
740     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV,                   \
741                            stride, 16, 8, 8, 8, 8);                           \
742 }                                                                             \
743                                                                               \
744 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src,                \
745                                    ptrdiff_t stride)                          \
746 {                                                                             \
747     uint8_t full[16 * 9];                                                     \
748     uint8_t halfH[72];                                                        \
749     uint8_t halfHV[64];                                                       \
750                                                                               \
751     copy_block9(full, src, 16, stride, 9);                                    \
752     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
753     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
754     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
755     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
756 }                                                                             \
757                                                                               \
758 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src,            \
759                                        ptrdiff_t stride)                      \
760 {                                                                             \
761     uint8_t full[16 * 9];                                                     \
762     uint8_t halfH[72];                                                        \
763     uint8_t halfV[64];                                                        \
764     uint8_t halfHV[64];                                                       \
765                                                                               \
766     copy_block9(full, src, 16, stride, 9);                                    \
767     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
768     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
769     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
770     OPNAME ## pixels8_l4_8(dst, full + 1, halfH, halfV, halfHV,               \
771                            stride, 16, 8, 8, 8, 8);                           \
772 }                                                                             \
773                                                                               \
774 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src,                \
775                                    ptrdiff_t stride)                          \
776 {                                                                             \
777     uint8_t full[16 * 9];                                                     \
778     uint8_t halfH[72];                                                        \
779     uint8_t halfHV[64];                                                       \
780                                                                               \
781     copy_block9(full, src, 16, stride, 9);                                    \
782     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
783     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
784     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
785     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
786 }                                                                             \
787                                                                               \
788 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src,            \
789                                        ptrdiff_t stride)                      \
790 {                                                                             \
791     uint8_t full[16 * 9];                                                     \
792     uint8_t halfH[72];                                                        \
793     uint8_t halfV[64];                                                        \
794     uint8_t halfHV[64];                                                       \
795                                                                               \
796     copy_block9(full, src, 16, stride, 9);                                    \
797     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
798     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
799     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
800     OPNAME ## pixels8_l4_8(dst, full + 16, halfH + 8, halfV, halfHV,          \
801                            stride, 16, 8, 8, 8, 8);                           \
802 }                                                                             \
803                                                                               \
804 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src,                \
805                                    ptrdiff_t stride)                          \
806 {                                                                             \
807     uint8_t full[16 * 9];                                                     \
808     uint8_t halfH[72];                                                        \
809     uint8_t halfHV[64];                                                       \
810                                                                               \
811     copy_block9(full, src, 16, stride, 9);                                    \
812     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
813     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
814     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
815     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
816 }                                                                             \
817                                                                               \
818 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src,            \
819                                        ptrdiff_t stride)                      \
820 {                                                                             \
821     uint8_t full[16 * 9];                                                     \
822     uint8_t halfH[72];                                                        \
823     uint8_t halfV[64];                                                        \
824     uint8_t halfHV[64];                                                       \
825                                                                               \
826     copy_block9(full, src, 16, stride, 9);                                    \
827     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
828     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
829     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
830     OPNAME ## pixels8_l4_8(dst, full + 17, halfH + 8, halfV, halfHV,          \
831                            stride, 16, 8, 8, 8, 8);                           \
832 }                                                                             \
833                                                                               \
834 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src,                \
835                                    ptrdiff_t stride)                          \
836 {                                                                             \
837     uint8_t full[16 * 9];                                                     \
838     uint8_t halfH[72];                                                        \
839     uint8_t halfHV[64];                                                       \
840                                                                               \
841     copy_block9(full, src, 16, stride, 9);                                    \
842     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
843     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
844     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
845     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
846 }                                                                             \
847                                                                               \
848 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src,                \
849                                    ptrdiff_t stride)                          \
850 {                                                                             \
851     uint8_t halfH[72];                                                        \
852     uint8_t halfHV[64];                                                       \
853                                                                               \
854     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
855     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
856     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
857 }                                                                             \
858                                                                               \
859 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src,                \
860                                    ptrdiff_t stride)                          \
861 {                                                                             \
862     uint8_t halfH[72];                                                        \
863     uint8_t halfHV[64];                                                       \
864                                                                               \
865     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
866     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
867     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
868 }                                                                             \
869                                                                               \
870 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src,            \
871                                        ptrdiff_t stride)                      \
872 {                                                                             \
873     uint8_t full[16 * 9];                                                     \
874     uint8_t halfH[72];                                                        \
875     uint8_t halfV[64];                                                        \
876     uint8_t halfHV[64];                                                       \
877                                                                               \
878     copy_block9(full, src, 16, stride, 9);                                    \
879     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
880     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
881     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
882     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);              \
883 }                                                                             \
884                                                                               \
885 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src,                \
886                                    ptrdiff_t stride)                          \
887 {                                                                             \
888     uint8_t full[16 * 9];                                                     \
889     uint8_t halfH[72];                                                        \
890                                                                               \
891     copy_block9(full, src, 16, stride, 9);                                    \
892     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
893     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
894     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
895 }                                                                             \
896                                                                               \
897 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src,            \
898                                        ptrdiff_t stride)                      \
899 {                                                                             \
900     uint8_t full[16 * 9];                                                     \
901     uint8_t halfH[72];                                                        \
902     uint8_t halfV[64];                                                        \
903     uint8_t halfHV[64];                                                       \
904                                                                               \
905     copy_block9(full, src, 16, stride, 9);                                    \
906     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
907     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
908     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
909     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);              \
910 }                                                                             \
911                                                                               \
912 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src,                \
913                                    ptrdiff_t stride)                          \
914 {                                                                             \
915     uint8_t full[16 * 9];                                                     \
916     uint8_t halfH[72];                                                        \
917                                                                               \
918     copy_block9(full, src, 16, stride, 9);                                    \
919     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
920     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
921     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
922 }                                                                             \
923                                                                               \
924 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src,                \
925                                    ptrdiff_t stride)                          \
926 {                                                                             \
927     uint8_t halfH[72];                                                        \
928                                                                               \
929     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
930     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
931 }                                                                             \
932                                                                               \
933 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src,               \
934                                     ptrdiff_t stride)                         \
935 {                                                                             \
936     uint8_t half[256];                                                        \
937                                                                               \
938     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
939     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);          \
940 }                                                                             \
941                                                                               \
942 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src,               \
943                                     ptrdiff_t stride)                         \
944 {                                                                             \
945     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);           \
946 }                                                                             \
947                                                                               \
948 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src,               \
949                                     ptrdiff_t stride)                         \
950 {                                                                             \
951     uint8_t half[256];                                                        \
952                                                                               \
953     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
954     OPNAME ## pixels16_l2_8(dst, src + 1, half, stride, stride, 16, 16);      \
955 }                                                                             \
956                                                                               \
957 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src,               \
958                                     ptrdiff_t stride)                         \
959 {                                                                             \
960     uint8_t full[24 * 17];                                                    \
961     uint8_t half[256];                                                        \
962                                                                               \
963     copy_block17(full, src, 24, stride, 17);                                  \
964     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
965     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);             \
966 }                                                                             \
967                                                                               \
968 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src,               \
969                                     ptrdiff_t stride)                         \
970 {                                                                             \
971     uint8_t full[24 * 17];                                                    \
972                                                                               \
973     copy_block17(full, src, 24, stride, 17);                                  \
974     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);                  \
975 }                                                                             \
976                                                                               \
977 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src,               \
978                                     ptrdiff_t stride)                         \
979 {                                                                             \
980     uint8_t full[24 * 17];                                                    \
981     uint8_t half[256];                                                        \
982                                                                               \
983     copy_block17(full, src, 24, stride, 17);                                  \
984     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
985     OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16);        \
986 }                                                                             \
987                                                                               \
988 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src,           \
989                                         ptrdiff_t stride)                     \
990 {                                                                             \
991     uint8_t full[24 * 17];                                                    \
992     uint8_t halfH[272];                                                       \
993     uint8_t halfV[256];                                                       \
994     uint8_t halfHV[256];                                                      \
995                                                                               \
996     copy_block17(full, src, 24, stride, 17);                                  \
997     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
998     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
999     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1000     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV,                  \
1001                             stride, 24, 16, 16, 16, 16);                      \
1002 }                                                                             \
1003                                                                               \
1004 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src,               \
1005                                     ptrdiff_t stride)                         \
1006 {                                                                             \
1007     uint8_t full[24 * 17];                                                    \
1008     uint8_t halfH[272];                                                       \
1009     uint8_t halfHV[256];                                                      \
1010                                                                               \
1011     copy_block17(full, src, 24, stride, 17);                                  \
1012     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1013     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1014     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1015     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1016 }                                                                             \
1017                                                                               \
1018 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src,           \
1019                                         ptrdiff_t stride)                     \
1020 {                                                                             \
1021     uint8_t full[24 * 17];                                                    \
1022     uint8_t halfH[272];                                                       \
1023     uint8_t halfV[256];                                                       \
1024     uint8_t halfHV[256];                                                      \
1025                                                                               \
1026     copy_block17(full, src, 24, stride, 17);                                  \
1027     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1028     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1029     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1030     OPNAME ## pixels16_l4_8(dst, full + 1, halfH, halfV, halfHV,              \
1031                             stride, 24, 16, 16, 16, 16);                      \
1032 }                                                                             \
1033                                                                               \
1034 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src,               \
1035                                     ptrdiff_t stride)                         \
1036 {                                                                             \
1037     uint8_t full[24 * 17];                                                    \
1038     uint8_t halfH[272];                                                       \
1039     uint8_t halfHV[256];                                                      \
1040                                                                               \
1041     copy_block17(full, src, 24, stride, 17);                                  \
1042     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1043     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1044     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1045     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1046 }                                                                             \
1047                                                                               \
1048 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src,           \
1049                                         ptrdiff_t stride)                     \
1050 {                                                                             \
1051     uint8_t full[24 * 17];                                                    \
1052     uint8_t halfH[272];                                                       \
1053     uint8_t halfV[256];                                                       \
1054     uint8_t halfHV[256];                                                      \
1055                                                                               \
1056     copy_block17(full, src, 24, stride, 17);                                  \
1057     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1058     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1059     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1060     OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV,        \
1061                             stride, 24, 16, 16, 16, 16);                      \
1062 }                                                                             \
1063                                                                               \
1064 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src,               \
1065                                     ptrdiff_t stride)                         \
1066 {                                                                             \
1067     uint8_t full[24 * 17];                                                    \
1068     uint8_t halfH[272];                                                       \
1069     uint8_t halfHV[256];                                                      \
1070                                                                               \
1071     copy_block17(full, src, 24, stride, 17);                                  \
1072     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1073     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1074     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1075     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1076 }                                                                             \
1077                                                                               \
1078 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src,           \
1079                                         ptrdiff_t stride)                     \
1080 {                                                                             \
1081     uint8_t full[24 * 17];                                                    \
1082     uint8_t halfH[272];                                                       \
1083     uint8_t halfV[256];                                                       \
1084     uint8_t halfHV[256];                                                      \
1085                                                                               \
1086     copy_block17(full, src, 24, stride, 17);                                  \
1087     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1088     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1089     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1090     OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV,        \
1091                             stride, 24, 16, 16, 16, 16);                      \
1092 }                                                                             \
1093                                                                               \
1094 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src,               \
1095                                     ptrdiff_t stride)                         \
1096 {                                                                             \
1097     uint8_t full[24 * 17];                                                    \
1098     uint8_t halfH[272];                                                       \
1099     uint8_t halfHV[256];                                                      \
1100                                                                               \
1101     copy_block17(full, src, 24, stride, 17);                                  \
1102     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1103     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1104     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1105     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1106 }                                                                             \
1107                                                                               \
1108 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src,               \
1109                                     ptrdiff_t stride)                         \
1110 {                                                                             \
1111     uint8_t halfH[272];                                                       \
1112     uint8_t halfHV[256];                                                      \
1113                                                                               \
1114     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1115     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1116     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1117 }                                                                             \
1118                                                                               \
1119 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src,               \
1120                                     ptrdiff_t stride)                         \
1121 {                                                                             \
1122     uint8_t halfH[272];                                                       \
1123     uint8_t halfHV[256];                                                      \
1124                                                                               \
1125     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1126     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1127     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1128 }                                                                             \
1129                                                                               \
1130 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src,           \
1131                                         ptrdiff_t stride)                     \
1132 {                                                                             \
1133     uint8_t full[24 * 17];                                                    \
1134     uint8_t halfH[272];                                                       \
1135     uint8_t halfV[256];                                                       \
1136     uint8_t halfHV[256];                                                      \
1137                                                                               \
1138     copy_block17(full, src, 24, stride, 17);                                  \
1139     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1140     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1141     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1142     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1143 }                                                                             \
1144                                                                               \
1145 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src,               \
1146                                     ptrdiff_t stride)                         \
1147 {                                                                             \
1148     uint8_t full[24 * 17];                                                    \
1149     uint8_t halfH[272];                                                       \
1150                                                                               \
1151     copy_block17(full, src, 24, stride, 17);                                  \
1152     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1153     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1154     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1155 }                                                                             \
1156                                                                               \
1157 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src,           \
1158                                         ptrdiff_t stride)                     \
1159 {                                                                             \
1160     uint8_t full[24 * 17];                                                    \
1161     uint8_t halfH[272];                                                       \
1162     uint8_t halfV[256];                                                       \
1163     uint8_t halfHV[256];                                                      \
1164                                                                               \
1165     copy_block17(full, src, 24, stride, 17);                                  \
1166     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1167     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1168     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1169     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1170 }                                                                             \
1171                                                                               \
1172 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src,               \
1173                                     ptrdiff_t stride)                         \
1174 {                                                                             \
1175     uint8_t full[24 * 17];                                                    \
1176     uint8_t halfH[272];                                                       \
1177                                                                               \
1178     copy_block17(full, src, 24, stride, 17);                                  \
1179     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1180     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1181     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1182 }                                                                             \
1183                                                                               \
1184 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src,               \
1185                                     ptrdiff_t stride)                         \
1186 {                                                                             \
1187     uint8_t halfH[272];                                                       \
1188                                                                               \
1189     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1190     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1191 }
1192
1193 #define op_avg(a, b)        a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
1194 #define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5])     >> 1)
1195 #define op_put(a, b)        a = cm[((b) + 16) >> 5]
1196 #define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
1197
1198 QPEL_MC(0, put_, _, op_put)
1199 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1200 QPEL_MC(0, avg_, _, op_avg)
1201
1202 #undef op_avg
1203 #undef op_put
1204 #undef op_put_no_rnd
1205
/**
 * Thin wrapper: forwards dst, src and stride to put_pixels8_8_c() with the
 * final argument fixed to 8 (presumably the row count, giving the 8x8 block
 * the name refers to).
 */
1206 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1207 {
1208     put_pixels8_8_c(dst, src, stride, 8);
1209 }
1210
/**
 * Thin wrapper: forwards dst, src and stride to avg_pixels8_8_c() with the
 * final argument fixed to 8 (presumably the row count, giving the 8x8 block
 * the name refers to).
 */
1211 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1212 {
1213     avg_pixels8_8_c(dst, src, stride, 8);
1214 }
1215
/**
 * Thin wrapper: forwards dst, src and stride to put_pixels16_8_c() with the
 * final argument fixed to 16 (presumably the row count, giving the 16x16
 * block the name refers to).
 */
1216 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1217 {
1218     put_pixels16_8_c(dst, src, stride, 16);
1219 }
1220
/**
 * Thin wrapper: forwards dst, src and stride to avg_pixels16_8_c() with the
 * final argument fixed to 16 (presumably the row count, giving the 16x16
 * block the name refers to).
 */
1221 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1222 {
1223     avg_pixels16_8_c(dst, src, stride, 16);
1224 }
1225
/*
 * The mc00 qpel entry points are plain aliases of the fixed-size pixel
 * wrappers (mc00 is presumably the position with no fractional offset, so
 * no filtering is needed). The put_no_rnd variants map to the same put
 * functions as the rounding ones.
 */
1226 #define put_qpel8_mc00_c         ff_put_pixels8x8_c
1227 #define avg_qpel8_mc00_c         ff_avg_pixels8x8_c
1228 #define put_qpel16_mc00_c        ff_put_pixels16x16_c
1229 #define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
1230 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
1231 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1232
1233 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
1234                                   int dstStride, int srcStride, int h)
1235 {
1236     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1237     int i;
1238
1239     for (i = 0; i < h; i++) {
1240         dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
1241         dst[1] = cm[(9 * (src[1] + src[2]) - (src[0]  + src[3]) + 8) >> 4];
1242         dst[2] = cm[(9 * (src[2] + src[3]) - (src[1]  + src[4]) + 8) >> 4];
1243         dst[3] = cm[(9 * (src[3] + src[4]) - (src[2]  + src[5]) + 8) >> 4];
1244         dst[4] = cm[(9 * (src[4] + src[5]) - (src[3]  + src[6]) + 8) >> 4];
1245         dst[5] = cm[(9 * (src[5] + src[6]) - (src[4]  + src[7]) + 8) >> 4];
1246         dst[6] = cm[(9 * (src[6] + src[7]) - (src[5]  + src[8]) + 8) >> 4];
1247         dst[7] = cm[(9 * (src[7] + src[8]) - (src[6]  + src[9]) + 8) >> 4];
1248         dst   += dstStride;
1249         src   += srcStride;
1250     }
1251 }
1252
/*
 * RV40 mc33 entry points, compiled only when the RV40 decoder is enabled.
 * Each one is a thin wrapper that forwards to the matching
 * {put,avg}_pixels{8,16}_xy2_8_c() function with the block height fixed to
 * the block width.
 */
1253 #if CONFIG_RV40_DECODER
/** put variant, 16 wide: calls put_pixels16_xy2_8_c() with height 16. */
1254 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1255 {
1256     put_pixels16_xy2_8_c(dst, src, stride, 16);
1257 }
1258
/** avg variant, 16 wide: calls avg_pixels16_xy2_8_c() with height 16. */
1259 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1260 {
1261     avg_pixels16_xy2_8_c(dst, src, stride, 16);
1262 }
1263
/** put variant, 8 wide: calls put_pixels8_xy2_8_c() with height 8. */
1264 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1265 {
1266     put_pixels8_xy2_8_c(dst, src, stride, 8);
1267 }
1268
/** avg variant, 8 wide: calls avg_pixels8_xy2_8_c() with height 8. */
1269 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1270 {
1271     avg_pixels8_xy2_8_c(dst, src, stride, 8);
1272 }
1273 #endif /* CONFIG_RV40_DECODER */
1274
1275 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
1276                                   int dstStride, int srcStride, int w)
1277 {
1278     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1279     int i;
1280
1281     for (i = 0; i < w; i++) {
1282         const int src_1 = src[-srcStride];
1283         const int src0  = src[0];
1284         const int src1  = src[srcStride];
1285         const int src2  = src[2 * srcStride];
1286         const int src3  = src[3 * srcStride];
1287         const int src4  = src[4 * srcStride];
1288         const int src5  = src[5 * srcStride];
1289         const int src6  = src[6 * srcStride];
1290         const int src7  = src[7 * srcStride];
1291         const int src8  = src[8 * srcStride];
1292         const int src9  = src[9 * srcStride];
1293         dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
1294         dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0  + src3) + 8) >> 4];
1295         dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1  + src4) + 8) >> 4];
1296         dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2  + src5) + 8) >> 4];
1297         dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3  + src6) + 8) >> 4];
1298         dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4  + src7) + 8) >> 4];
1299         dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5  + src8) + 8) >> 4];
1300         dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6  + src9) + 8) >> 4];
1301         src++;
1302         dst++;
1303     }
1304 }
1305
/**
 * mspel position (1,0): filter the 8x8 block horizontally into a temporary
 * buffer, then hand src and that buffer to put_pixels8_l2_8(), which
 * combines the two inputs into dst (presumably by averaging).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1313
/**
 * mspel position (2,0): write the horizontally filtered 8x8 block straight
 * to dst, using stride for both the source and the destination.
 */
1314 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1315 {
1316     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1317 }
1318
/**
 * mspel position (3,0): filter the 8x8 block horizontally into a temporary
 * buffer, then hand src shifted one pixel to the right and that buffer to
 * put_pixels8_l2_8(), which combines the two inputs into dst (presumably by
 * averaging).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1326
/**
 * mspel position (0,2): write the vertically filtered block (8 columns)
 * straight to dst, using stride for both the source and the destination.
 */
1327 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1328 {
1329     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1330 }
1331
/**
 * mspel position (1,2).
 *
 * Builds two intermediate 8x8 blocks and passes both to put_pixels8_l2_8(),
 * which combines them into dst (presumably by averaging):
 *  - the vertically filtered source, and
 *  - the horizontally-then-vertically filtered source. The horizontal pass
 *    covers 11 rows starting one row above src, because the vertical filter
 *    needs one row above and two rows below the 8 output rows.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11];
    uint8_t v_only[8 * 8];
    uint8_t hv[8 * 8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    /* Skip the extra top row of h_rows so the filter is centred on src. */
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
1343
/**
 * mspel position (3,2).
 *
 * Same construction as position (1,2), except that the vertically filtered
 * block is taken from src shifted one pixel to the right. The two
 * intermediate blocks are combined into dst by put_pixels8_l2_8()
 * (presumably by averaging).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11];
    uint8_t v_only[8 * 8];
    uint8_t hv[8 * 8];

    /* 11 rows starting one row above src feed the vertical pass below. */
    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
1355
/**
 * mspel position (2,2): horizontal filter into a temporary buffer, then
 * vertical filter of that buffer directly into dst.
 *
 * The horizontal pass produces 11 rows starting one row above src; the
 * vertical pass starts at the second of those rows so that it has one row
 * above and two rows below the 8 rows it outputs.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, h_rows + 8, stride, 8, 8);
}
1363
1364 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1365                               int line_size, int h)
1366 {
1367     int s = 0, i;
1368
1369     for (i = 0; i < h; i++) {
1370         s    += abs(pix1[0]  - pix2[0]);
1371         s    += abs(pix1[1]  - pix2[1]);
1372         s    += abs(pix1[2]  - pix2[2]);
1373         s    += abs(pix1[3]  - pix2[3]);
1374         s    += abs(pix1[4]  - pix2[4]);
1375         s    += abs(pix1[5]  - pix2[5]);
1376         s    += abs(pix1[6]  - pix2[6]);
1377         s    += abs(pix1[7]  - pix2[7]);
1378         s    += abs(pix1[8]  - pix2[8]);
1379         s    += abs(pix1[9]  - pix2[9]);
1380         s    += abs(pix1[10] - pix2[10]);
1381         s    += abs(pix1[11] - pix2[11]);
1382         s    += abs(pix1[12] - pix2[12]);
1383         s    += abs(pix1[13] - pix2[13]);
1384         s    += abs(pix1[14] - pix2[14]);
1385         s    += abs(pix1[15] - pix2[15]);
1386         pix1 += line_size;
1387         pix2 += line_size;
1388     }
1389     return s;
1390 }
1391
1392 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1393                           int line_size, int h)
1394 {
1395     int s = 0, i;
1396
1397     for (i = 0; i < h; i++) {
1398         s    += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
1399         s    += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
1400         s    += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
1401         s    += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
1402         s    += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
1403         s    += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
1404         s    += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
1405         s    += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
1406         s    += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
1407         s    += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
1408         s    += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1409         s    += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1410         s    += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1411         s    += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1412         s    += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1413         s    += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1414         pix1 += line_size;
1415         pix2 += line_size;
1416     }
1417     return s;
1418 }
1419
1420 static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1421                           int line_size, int h)
1422 {
1423     int s = 0, i;
1424     uint8_t *pix3 = pix2 + line_size;
1425
1426     for (i = 0; i < h; i++) {
1427         s    += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
1428         s    += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
1429         s    += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
1430         s    += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
1431         s    += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
1432         s    += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
1433         s    += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
1434         s    += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
1435         s    += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
1436         s    += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
1437         s    += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1438         s    += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1439         s    += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1440         s    += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1441         s    += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1442         s    += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1443         pix1 += line_size;
1444         pix2 += line_size;
1445         pix3 += line_size;
1446     }
1447     return s;
1448 }
1449
1450 static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1451                            int line_size, int h)
1452 {
1453     int s = 0, i;
1454     uint8_t *pix3 = pix2 + line_size;
1455
1456     for (i = 0; i < h; i++) {
1457         s    += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
1458         s    += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
1459         s    += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
1460         s    += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
1461         s    += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
1462         s    += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
1463         s    += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
1464         s    += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
1465         s    += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
1466         s    += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
1467         s    += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1468         s    += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1469         s    += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1470         s    += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1471         s    += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1472         s    += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1473         pix1 += line_size;
1474         pix2 += line_size;
1475         pix3 += line_size;
1476     }
1477     return s;
1478 }
1479
1480 static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1481                              int line_size, int h)
1482 {
1483     int s = 0, i;
1484
1485     for (i = 0; i < h; i++) {
1486         s    += abs(pix1[0] - pix2[0]);
1487         s    += abs(pix1[1] - pix2[1]);
1488         s    += abs(pix1[2] - pix2[2]);
1489         s    += abs(pix1[3] - pix2[3]);
1490         s    += abs(pix1[4] - pix2[4]);
1491         s    += abs(pix1[5] - pix2[5]);
1492         s    += abs(pix1[6] - pix2[6]);
1493         s    += abs(pix1[7] - pix2[7]);
1494         pix1 += line_size;
1495         pix2 += line_size;
1496     }
1497     return s;
1498 }
1499
1500 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1501                          int line_size, int h)
1502 {
1503     int s = 0, i;
1504
1505     for (i = 0; i < h; i++) {
1506         s    += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1507         s    += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1508         s    += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1509         s    += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1510         s    += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1511         s    += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1512         s    += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1513         s    += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1514         pix1 += line_size;
1515         pix2 += line_size;
1516     }
1517     return s;
1518 }
1519
1520 static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1521                          int line_size, int h)
1522 {
1523     int s = 0, i;
1524     uint8_t *pix3 = pix2 + line_size;
1525
1526     for (i = 0; i < h; i++) {
1527         s    += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1528         s    += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1529         s    += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1530         s    += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1531         s    += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1532         s    += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1533         s    += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1534         s    += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1535         pix1 += line_size;
1536         pix2 += line_size;
1537         pix3 += line_size;
1538     }
1539     return s;
1540 }
1541
1542 static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1543                           int line_size, int h)
1544 {
1545     int s = 0, i;
1546     uint8_t *pix3 = pix2 + line_size;
1547
1548     for (i = 0; i < h; i++) {
1549         s    += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1550         s    += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1551         s    += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1552         s    += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1553         s    += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1554         s    += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1555         s    += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1556         s    += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1557         pix1 += line_size;
1558         pix2 += line_size;
1559         pix3 += line_size;
1560     }
1561     return s;
1562 }
1563
1564 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1565 {
1566     int score1 = 0, score2 = 0, x, y;
1567
1568     for (y = 0; y < h; y++) {
1569         for (x = 0; x < 16; x++)
1570             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1571         if (y + 1 < h) {
1572             for (x = 0; x < 15; x++)
1573                 score2 += FFABS(s1[x]     - s1[x + stride] -
1574                                 s1[x + 1] + s1[x + stride + 1]) -
1575                           FFABS(s2[x]     - s2[x + stride] -
1576                                 s2[x + 1] + s2[x + stride + 1]);
1577         }
1578         s1 += stride;
1579         s2 += stride;
1580     }
1581
1582     if (c)
1583         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1584     else
1585         return score1 + FFABS(score2) * 8;
1586 }
1587
1588 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1589 {
1590     int score1 = 0, score2 = 0, x, y;
1591
1592     for (y = 0; y < h; y++) {
1593         for (x = 0; x < 8; x++)
1594             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1595         if (y + 1 < h) {
1596             for (x = 0; x < 7; x++)
1597                 score2 += FFABS(s1[x]     - s1[x + stride] -
1598                                 s1[x + 1] + s1[x + stride + 1]) -
1599                           FFABS(s2[x]     - s2[x + stride] -
1600                                 s2[x + 1] + s2[x + stride + 1]);
1601         }
1602         s1 += stride;
1603         s2 += stride;
1604     }
1605
1606     if (c)
1607         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1608     else
1609         return score1 + FFABS(score2) * 8;
1610 }
1611
1612 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
1613                           int16_t basis[64], int scale)
1614 {
1615     int i;
1616     unsigned int sum = 0;
1617
1618     for (i = 0; i < 8 * 8; i++) {
1619         int b = rem[i] + ((basis[i] * scale +
1620                            (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1621                           (BASIS_SHIFT - RECON_SHIFT));
1622         int w = weight[i];
1623         b >>= RECON_SHIFT;
1624         assert(-512 < b && b < 512);
1625
1626         sum += (w * b) * (w * b) >> 4;
1627     }
1628     return sum >> 2;
1629 }
1630
1631 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
1632 {
1633     int i;
1634
1635     for (i = 0; i < 8 * 8; i++)
1636         rem[i] += (basis[i] * scale +
1637                    (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1638                   (BASIS_SHIFT - RECON_SHIFT);
1639 }
1640
/**
 * Comparison function that ignores all of its arguments and always
 * returns 0; ff_set_cmp() installs it for FF_CMP_ZERO.
 */
1641 static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
1642                     int stride, int h)
1643 {
1644     return 0;
1645 }
1646
1647 void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
1648 {
1649     int i;
1650
1651     memset(cmp, 0, sizeof(void *) * 6);
1652
1653     for (i = 0; i < 6; i++) {
1654         switch (type & 0xFF) {
1655         case FF_CMP_SAD:
1656             cmp[i] = c->sad[i];
1657             break;
1658         case FF_CMP_SATD:
1659             cmp[i] = c->hadamard8_diff[i];
1660             break;
1661         case FF_CMP_SSE:
1662             cmp[i] = c->sse[i];
1663             break;
1664         case FF_CMP_DCT:
1665             cmp[i] = c->dct_sad[i];
1666             break;
1667         case FF_CMP_DCT264:
1668             cmp[i] = c->dct264_sad[i];
1669             break;
1670         case FF_CMP_DCTMAX:
1671             cmp[i] = c->dct_max[i];
1672             break;
1673         case FF_CMP_PSNR:
1674             cmp[i] = c->quant_psnr[i];
1675             break;
1676         case FF_CMP_BIT:
1677             cmp[i] = c->bit[i];
1678             break;
1679         case FF_CMP_RD:
1680             cmp[i] = c->rd[i];
1681             break;
1682         case FF_CMP_VSAD:
1683             cmp[i] = c->vsad[i];
1684             break;
1685         case FF_CMP_VSSE:
1686             cmp[i] = c->vsse[i];
1687             break;
1688         case FF_CMP_ZERO:
1689             cmp[i] = zero_cmp;
1690             break;
1691         case FF_CMP_NSSE:
1692             cmp[i] = c->nsse[i];
1693             break;
1694         default:
1695             av_log(NULL, AV_LOG_ERROR,
1696                    "internal error in cmp function selection\n");
1697         }
1698     }
1699 }
1700
1701 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
1702 {
1703     long i;
1704
1705     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1706         long a = *(long *) (src + i);
1707         long b = *(long *) (dst + i);
1708         *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
1709     }
1710     for (; i < w; i++)
1711         dst[i + 0] += src[i + 0];
1712 }
1713
1714 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
1715 {
1716     long i;
1717
1718 #if !HAVE_FAST_UNALIGNED
1719     if ((long) src2 & (sizeof(long) - 1)) {
1720         for (i = 0; i + 7 < w; i += 8) {
1721             dst[i + 0] = src1[i + 0] - src2[i + 0];
1722             dst[i + 1] = src1[i + 1] - src2[i + 1];
1723             dst[i + 2] = src1[i + 2] - src2[i + 2];
1724             dst[i + 3] = src1[i + 3] - src2[i + 3];
1725             dst[i + 4] = src1[i + 4] - src2[i + 4];
1726             dst[i + 5] = src1[i + 5] - src2[i + 5];
1727             dst[i + 6] = src1[i + 6] - src2[i + 6];
1728             dst[i + 7] = src1[i + 7] - src2[i + 7];
1729         }
1730     } else
1731 #endif
1732     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1733         long a = *(long *) (src1 + i);
1734         long b = *(long *) (src2 + i);
1735         *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
1736                               ((a ^ b ^ pb_80) & pb_80);
1737     }
1738     for (; i < w; i++)
1739         dst[i + 0] = src1[i + 0] - src2[i + 0];
1740 }
1741
/**
 * Reconstruct a row from residuals using a three-way predictor.
 *
 * For every position the prediction is
 * mid_pred(left, src1[i], (left + src1[i] - left_top) & 0xFF); the residual
 * diff[i] is added to it and the result, truncated to 8 bits, is stored in
 * dst[i] and becomes the new "left" value. The new "left_top" is src1[i].
 *
 * NOTE(review): mid_pred() is defined elsewhere; presumably it returns the
 * median of its three arguments - confirm.
 *
 * @param dst      output row, w bytes
 * @param src1     reference row supplying the second predictor input
 * @param diff     residuals to add to the prediction
 * @param w        number of bytes to process
 * @param left     in: initial left value; out: last reconstructed value
 * @param left_top in: initial left-top value; out: last src1 value read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *diff, int w,
                                         int *left, int *left_top)
{
    uint8_t left_val = *left;
    uint8_t top_left = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const uint8_t top  = src1[pos];
        const int gradient = (left_val + top - top_left) & 0xFF;

        /* Assignment to uint8_t keeps only the low 8 bits. */
        left_val = mid_pred(left_val, top, gradient) + diff[pos];
        top_left = top;
        dst[pos] = left_val;
    }

    *left     = left_val;
    *left_top = top_left;
}
1761
/**
 * Compute residuals of a row against a three-way predictor.
 *
 * For every position the prediction is
 * mid_pred(left, src1[i], (left + src1[i] - left_top) & 0xFF) and
 * dst[i] = src2[i] - prediction (stored as 8 bits). Afterwards "left"
 * becomes src2[i] and "left_top" becomes src1[i].
 *
 * NOTE(review): mid_pred() is defined elsewhere; presumably it returns the
 * median of its three arguments - confirm.
 *
 * @param dst      output residuals, w bytes
 * @param src1     reference row supplying the second predictor input
 * @param src2     row being predicted
 * @param w        number of bytes to process
 * @param left     in: initial left value; out: last src2 value read
 * @param left_top in: initial left-top value; out: last src1 value read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *src2, int w,
                                         int *left, int *left_top)
{
    uint8_t left_val = *left;
    uint8_t top_left = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const uint8_t top    = src1[pos];
        const int gradient   = (left_val + top - top_left) & 0xFF;
        const int prediction = mid_pred(left_val, top, gradient);

        top_left = top;
        left_val = src2[pos];
        dst[pos] = left_val - prediction;
    }

    *left     = left_val;
    *left_top = top_left;
}
1782
/**
 * Running-sum reconstruction: each input byte is added to the accumulator
 * and the low 8 bits of the accumulator are written to the output.
 *
 * @param dst destination, w bytes are written
 * @param src source bytes, w bytes are read
 * @param w   number of bytes to process
 * @param acc initial accumulator value
 * @return the accumulator after the last addition (not reduced to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
                                      int w, int acc)
{
    int pos;

    for (pos = 0; pos < w; pos++) {
        acc     += src[pos];
        dst[pos] = acc;
    }

    return acc;
}
1803
/* Byte offset of each channel inside a 4-byte pixel; depends on endianness. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-channel running-sum reconstruction for 4-byte pixels.
 *
 * For each of the w pixels, every channel byte of src is added to that
 * channel's accumulator and the low 8 bits of the accumulator are stored
 * in the same byte position of dst.
 *
 * @param dst   destination, 4 * w bytes are written
 * @param src   source, 4 * w bytes are read
 * @param w     number of pixels
 * @param red   in: initial red accumulator; out: final value
 * @param green in: initial green accumulator; out: final value
 * @param blue  in: initial blue accumulator; out: final value
 * @param alpha in: initial alpha accumulator; out: final value
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
                                             int w, int *red, int *green,
                                             int *blue, int *alpha)
{
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *const in = src + 4 * px;
        uint8_t *const out      = dst + 4 * px;

        /* Read the whole pixel before writing it. */
        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1842
/*
 * Butterfly helpers for the Hadamard transforms below.
 *
 * The statement-like macros are wrapped in do { } while (0) so that each
 * expands to exactly one statement. They can then be used safely as the
 * body of an unbraced if/else or loop, and always take a trailing ';'.
 */

/* o1 = i1 + i2, o2 = i1 - i2; i1 and i2 are each evaluated twice. */
#define BUTTERFLY2(o1, o2, i1, i2)              \
    do {                                        \
        o1 = (i1) + (i2);                       \
        o2 = (i1) - (i2);                       \
    } while (0)

/*
 * In-place butterfly: (x, y) <- (x + y, x - y).
 * x and y must be lvalues and must not refer to variables named a or b,
 * which are used as temporaries inside the macro.
 */
#define BUTTERFLY1(x, y)                        \
    do {                                        \
        int a, b;                               \
        a = x;                                  \
        b = y;                                  \
        x = a + b;                              \
        y = a - b;                              \
    } while (0)

/* |x + y| + |x - y| without storing the intermediate butterfly outputs. */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
1857
1858 static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
1859                                uint8_t *src, int stride, int h)
1860 {
1861     int i, temp[64], sum = 0;
1862
1863     assert(h == 8);
1864
1865     for (i = 0; i < 8; i++) {
1866         // FIXME: try pointer walks
1867         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1868                    src[stride * i + 0] - dst[stride * i + 0],
1869                    src[stride * i + 1] - dst[stride * i + 1]);
1870         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1871                    src[stride * i + 2] - dst[stride * i + 2],
1872                    src[stride * i + 3] - dst[stride * i + 3]);
1873         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1874                    src[stride * i + 4] - dst[stride * i + 4],
1875                    src[stride * i + 5] - dst[stride * i + 5]);
1876         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1877                    src[stride * i + 6] - dst[stride * i + 6],
1878                    src[stride * i + 7] - dst[stride * i + 7]);
1879
1880         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1881         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1882         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1883         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1884
1885         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1886         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1887         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1888         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
1889     }
1890
1891     for (i = 0; i < 8; i++) {
1892         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1893         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1894         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1895         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1896
1897         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1898         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1899         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1900         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1901
1902         sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
1903                BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
1904                BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
1905                BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
1906     }
1907     return sum;
1908 }
1909
1910 static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
1911                                 uint8_t *dummy, int stride, int h)
1912 {
1913     int i, temp[64], sum = 0;
1914
1915     assert(h == 8);
1916
1917     for (i = 0; i < 8; i++) {
1918         // FIXME: try pointer walks
1919         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1920                    src[stride * i + 0], src[stride * i + 1]);
1921         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1922                    src[stride * i + 2], src[stride * i + 3]);
1923         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1924                    src[stride * i + 4], src[stride * i + 5]);
1925         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1926                    src[stride * i + 6], src[stride * i + 7]);
1927
1928         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1929         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1930         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1931         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1932
1933         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1934         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1935         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1936         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
1937     }
1938
1939     for (i = 0; i < 8; i++) {
1940         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1941         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1942         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1943         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1944
1945         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1946         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1947         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1948         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1949
1950         sum +=
1951             BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
1952             + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
1953             + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
1954             + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
1955     }
1956
1957     sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
1958
1959     return sum;
1960 }
1961
1962 static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
1963                         uint8_t *src2, int stride, int h)
1964 {
1965     LOCAL_ALIGNED_16(int16_t, temp, [64]);
1966
1967     assert(h == 8);
1968
1969     s->dsp.diff_pixels(temp, src1, src2, stride);
1970     s->dsp.fdct(temp);
1971     return s->dsp.sum_abs_dctelem(temp);
1972 }
1973
#if CONFIG_GPL
/*
 * One 8-point integer transform step.
 *
 * The user must define SRC(n) to read input n and DST(n, v) to consume
 * output n. All eight inputs are read before any DST() is issued, so SRC
 * and DST may refer to the same storage.
 */
#define DCT8_1D                                         \
    {                                                   \
        const int s07 = SRC(0) + SRC(7);                \
        const int s16 = SRC(1) + SRC(6);                \
        const int s25 = SRC(2) + SRC(5);                \
        const int s34 = SRC(3) + SRC(4);                \
        const int a0  = s07 + s34;                      \
        const int a1  = s16 + s25;                      \
        const int a2  = s07 - s34;                      \
        const int a3  = s16 - s25;                      \
        const int d07 = SRC(0) - SRC(7);                \
        const int d16 = SRC(1) - SRC(6);                \
        const int d25 = SRC(2) - SRC(5);                \
        const int d34 = SRC(3) - SRC(4);                \
        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
        DST(0, a0 + a1);                                \
        DST(1, a4 + (a7 >> 2));                         \
        DST(2, a2 + (a3 >> 1));                         \
        DST(3, a5 + (a6 >> 2));                         \
        DST(4, a0 - a1);                                \
        DST(5, a6 - (a5 >> 2));                         \
        DST(6, (a2 >> 1) - a3);                         \
        DST(7, (a4 >> 2) - a7);                         \
    }

/**
 * Score an 8x8 block pair with the DCT8_1D integer transform: the pixel
 * difference is transformed along the second array index in place, then
 * along the first index, where the outputs are not stored but their
 * absolute values are accumulated.
 *
 * @param s      context providing dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride row stride passed to diff_pixels
 * @param h      not used by this function
 * @return sum of the absolute values of the transformed coefficients
 */
static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    int16_t dct[8][8];
    int sum = 0;
    int i;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* First pass: transform dct[i][0..7] in place. */
#define SRC(x) dct[i][x]
#define DST(x, v) dct[i][x] = v
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* Second pass: transform dct[0..7][i] and accumulate |output|. */
#define SRC(x) dct[x][i]
#define DST(x, v) sum += FFABS(v)
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return sum;
}
#endif
2027
2028 static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
2029                         uint8_t *src2, int stride, int h)
2030 {
2031     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2032     int sum = 0, i;
2033
2034     assert(h == 8);
2035
2036     s->dsp.diff_pixels(temp, src1, src2, stride);
2037     s->dsp.fdct(temp);
2038
2039     for (i = 0; i < 64; i++)
2040         sum = FFMAX(sum, FFABS(temp[i]));
2041
2042     return sum;
2043 }
2044
2045 static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
2046                            uint8_t *src2, int stride, int h)
2047 {
2048     LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
2049     int16_t *const bak = temp + 64;
2050     int sum = 0, i;
2051
2052     assert(h == 8);
2053     s->mb_intra = 0;
2054
2055     s->dsp.diff_pixels(temp, src1, src2, stride);
2056
2057     memcpy(bak, temp, 64 * sizeof(int16_t));
2058
2059     s->block_last_index[0 /* FIXME */] =
2060         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2061     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2062     ff_simple_idct_8(temp); // FIXME
2063
2064     for (i = 0; i < 64; i++)
2065         sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
2066
2067     return sum;
2068 }
2069
2070 static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2071                    int stride, int h)
2072 {
2073     const uint8_t *scantable = s->intra_scantable.permutated;
2074     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2075     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2076     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2077     int i, last, run, bits, level, distortion, start_i;
2078     const int esc_length = s->ac_esc_length;
2079     uint8_t *length, *last_length;
2080
2081     assert(h == 8);
2082
2083     copy_block8(lsrc1, src1, 8, stride, 8);
2084     copy_block8(lsrc2, src2, 8, stride, 8);
2085
2086     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2087
2088     s->block_last_index[0 /* FIXME */] =
2089     last                               =
2090         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2091
2092     bits = 0;
2093
2094     if (s->mb_intra) {
2095         start_i     = 1;
2096         length      = s->intra_ac_vlc_length;
2097         last_length = s->intra_ac_vlc_last_length;
2098         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2099     } else {
2100         start_i     = 0;
2101         length      = s->inter_ac_vlc_length;
2102         last_length = s->inter_ac_vlc_last_length;
2103     }
2104
2105     if (last >= start_i) {
2106         run = 0;
2107         for (i = start_i; i < last; i++) {
2108             int j = scantable[i];
2109             level = temp[j];
2110
2111             if (level) {
2112                 level += 64;
2113                 if ((level & (~127)) == 0)
2114                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2115                 else
2116                     bits += esc_length;
2117                 run = 0;
2118             } else
2119                 run++;
2120         }
2121         i = scantable[last];
2122
2123         level = temp[i] + 64;
2124
2125         assert(level - 64);
2126
2127         if ((level & (~127)) == 0) {
2128             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2129         } else
2130             bits += esc_length;
2131     }
2132
2133     if (last >= 0) {
2134         if (s->mb_intra)
2135             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2136         else
2137             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2138     }
2139
2140     s->dsp.idct_add(lsrc2, 8, temp);
2141
2142     distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2143
2144     return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
2145 }
2146
2147 static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2148                     int stride, int h)
2149 {
2150     const uint8_t *scantable = s->intra_scantable.permutated;
2151     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2152     int i, last, run, bits, level, start_i;
2153     const int esc_length = s->ac_esc_length;
2154     uint8_t *length, *last_length;
2155
2156     assert(h == 8);
2157
2158     s->dsp.diff_pixels(temp, src1, src2, stride);
2159
2160     s->block_last_index[0 /* FIXME */] =
2161     last                               =
2162         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2163
2164     bits = 0;
2165
2166     if (s->mb_intra) {
2167         start_i     = 1;
2168         length      = s->intra_ac_vlc_length;
2169         last_length = s->intra_ac_vlc_last_length;
2170         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2171     } else {
2172         start_i     = 0;
2173         length      = s->inter_ac_vlc_length;
2174         last_length = s->inter_ac_vlc_last_length;
2175     }
2176
2177     if (last >= start_i) {
2178         run = 0;
2179         for (i = start_i; i < last; i++) {
2180             int j = scantable[i];
2181             level = temp[j];
2182
2183             if (level) {
2184                 level += 64;
2185                 if ((level & (~127)) == 0)
2186                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2187                 else
2188                     bits += esc_length;
2189                 run = 0;
2190             } else
2191                 run++;
2192         }
2193         i = scantable[last];
2194
2195         level = temp[i] + 64;
2196
2197         assert(level - 64);
2198
2199         if ((level & (~127)) == 0)
2200             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2201         else
2202             bits += esc_length;
2203     }
2204
2205     return bits;
2206 }
2207
2208 #define VSAD_INTRA(size)                                                \
2209 static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
2210                                     uint8_t *s, uint8_t *dummy,         \
2211                                     int stride, int h)                  \
2212 {                                                                       \
2213     int score = 0, x, y;                                                \
2214                                                                         \
2215     for (y = 1; y < h; y++) {                                           \
2216         for (x = 0; x < size; x += 4) {                                 \
2217             score += FFABS(s[x]     - s[x + stride])     +              \
2218                      FFABS(s[x + 1] - s[x + stride + 1]) +              \
2219                      FFABS(s[x + 2] - s[x + 2 + stride]) +              \
2220                      FFABS(s[x + 3] - s[x + 3 + stride]);               \
2221         }                                                               \
2222         s += stride;                                                    \
2223     }                                                                   \
2224                                                                         \
2225     return score;                                                       \
2226 }
2227 VSAD_INTRA(8)
2228 VSAD_INTRA(16)
2229
2230 static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2231                     int stride, int h)
2232 {
2233     int score = 0, x, y;
2234
2235     for (y = 1; y < h; y++) {
2236         for (x = 0; x < 16; x++)
2237             score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2238         s1 += stride;
2239         s2 += stride;
2240     }
2241
2242     return score;
2243 }
2244
2245 #define SQ(a) ((a) * (a))
2246 #define VSSE_INTRA(size)                                                \
2247 static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
2248                                     uint8_t *s, uint8_t *dummy,         \
2249                                     int stride, int h)                  \
2250 {                                                                       \
2251     int score = 0, x, y;                                                \
2252                                                                         \
2253     for (y = 1; y < h; y++) {                                           \
2254         for (x = 0; x < size; x += 4) {                                 \
2255             score += SQ(s[x]     - s[x + stride]) +                     \
2256                      SQ(s[x + 1] - s[x + stride + 1]) +                 \
2257                      SQ(s[x + 2] - s[x + stride + 2]) +                 \
2258                      SQ(s[x + 3] - s[x + stride + 3]);                  \
2259         }                                                               \
2260         s += stride;                                                    \
2261     }                                                                   \
2262                                                                         \
2263     return score;                                                       \
2264 }
2265 VSSE_INTRA(8)
2266 VSSE_INTRA(16)
2267
2268 static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2269                     int stride, int h)
2270 {
2271     int score = 0, x, y;
2272
2273     for (y = 1; y < h; y++) {
2274         for (x = 0; x < 16; x++)
2275             score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2276         s1 += stride;
2277         s2 += stride;
2278     }
2279
2280     return score;
2281 }
2282
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array, size elements
 * @param pix2 second array, size elements
 * @param size number of elements to compare
 * @return sum over all elements of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int delta = pix1[k] - pix2[k];

        total += delta * delta;
    }

    return total;
}
2292
2293 #define WRAPPER8_16_SQ(name8, name16)                                   \
2294 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
2295                   int stride, int h)                                    \
2296 {                                                                       \
2297     int score = 0;                                                      \
2298                                                                         \
2299     score += name8(s, dst, src, stride, 8);                             \
2300     score += name8(s, dst + 8, src + 8, stride, 8);                     \
2301     if (h == 16) {                                                      \
2302         dst   += 8 * stride;                                            \
2303         src   += 8 * stride;                                            \
2304         score += name8(s, dst, src, stride, 8);                         \
2305         score += name8(s, dst + 8, src + 8, stride, 8);                 \
2306     }                                                                   \
2307     return score;                                                       \
2308 }
2309
2310 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2311 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2312 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2313 #if CONFIG_GPL
2314 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2315 #endif
2316 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2317 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2318 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2319 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2320
/**
 * Clip a single 32-bit pattern using unsigned integer comparisons.
 *
 * Returns mini when a compares above mini; otherwise returns maxi when a
 * with its top bit flipped compares above maxisign; otherwise returns a
 * unchanged. The caller in this file passes the reinterpreted bits of
 * floats, with mini taken from a negative bound and maxi from a positive
 * one.
 *
 * @param a        value to clip
 * @param mini     lower bound pattern
 * @param maxi     upper bound pattern
 * @param maxisign maxi with bit 31 flipped
 * @return the clipped pattern
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t sign_bit = 1U << 31;

    if (a > mini)
        return mini;

    return ((a ^ sign_bit) > maxisign) ? maxi : a;
}
2331
/**
 * Clip a float vector by operating on the raw 32-bit patterns of its
 * elements with clipf_c_one().
 *
 * Elements are processed in groups of eight, so the number of elements
 * touched is len rounded up to a multiple of 8.
 * NOTE(review): callers presumably guarantee that len is a multiple of 8 -
 * confirm.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src,
                                         float *min, float *max, int len)
{
    const uint32_t mini     = *(uint32_t *) min;
    const uint32_t maxi     = *(uint32_t *) max;
    const uint32_t maxisign = maxi ^ (1U << 31);
    uint32_t *const out     = (uint32_t *) dst;
    const uint32_t *const in = (const uint32_t *) src;
    int base, k;

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], mini, maxi, maxisign);
}
2353
/**
 * Clip every element of a float vector to [min, max].
 *
 * When min is negative and max is positive the work is delegated to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied to each
 * element. Elements are processed in groups of eight, so the number of
 * elements touched is len rounded up to a multiple of 8.
 * NOTE(review): callers presumably guarantee that len is a multiple of 8 -
 * confirm.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src,
                           float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
}
2374
/**
 * Dot product of two int16_t vectors, accumulated in an int.
 *
 * @param v1    first vector, order elements
 * @param v2    second vector, order elements
 * @param order number of elements
 * @return sum of v1[i] * v2[i]
 */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
{
    int acc = 0;

    for (; order; order--) {
        acc += *v1 * *v2;
        v1++;
        v2++;
    }

    return acc;
}
2385
/**
 * Dot product of v1 and v2 combined with an in-place multiply-add on v1.
 *
 * For each element, the product of the current v1[i] and v2[i] is added to
 * the result first; v1[i] is then increased by mul * v3[i] (stored back as
 * int16_t).
 *
 * @param v1    first vector, read and updated in place
 * @param v2    second vector
 * @param v3    vector scaled by mul and added to v1
 * @param order number of elements
 * @param mul   multiplier applied to v3
 * @return sum of v1[i] * v2[i] using the values of v1 before the update
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    int acc = 0;
    int k   = 0;

    for (; order; order--, k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }

    return acc;
}
2398
/**
 * Clip every element of an int32_t vector to [min, max] with av_clip().
 *
 * Elements are handled in groups of eight and the loop condition is tested
 * only after a group has been processed, so at least eight elements are
 * always read and written.
 * NOTE(review): this presumably relies on len being a non-zero multiple of
 * 8; with any other value the unsigned counter never reaches 0 at the
 * check - confirm the caller contract.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);

        dst += 8;
        src += 8;
        len -= 8;
    } while (len > 0);
}
2414
/**
 * Run ff_j_rev_dct() on block in place, then hand the result to
 * put_pixels_clamped_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size value forwarded to put_pixels_clamped_c()
 * @param block     coefficient block, modified in place
 */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
2420
/**
 * Run ff_j_rev_dct() on block in place, then hand the result to
 * add_pixels_clamped_c() together with dest and line_size.
 *
 * @param dest      destination pixels
 * @param line_size value forwarded to add_pixels_clamped_c()
 * @param block     coefficient block, modified in place
 */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2426
2427 /* init static data */
2428 av_cold void ff_dsputil_static_init(void)
2429 {
2430     int i;
2431
2432     for (i = 0; i < 512; i++)
2433         ff_square_tab[i] = (i - 256) * (i - 256);
2434 }
2435
2436 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2437 {
2438     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
2439
2440 #if CONFIG_ENCODERS
2441     if (avctx->bits_per_raw_sample == 10) {
2442         c->fdct    = ff_jpeg_fdct_islow_10;
2443         c->fdct248 = ff_fdct248_islow_10;
2444     } else {
2445         if (avctx->dct_algo == FF_DCT_FASTINT) {
2446             c->fdct    = ff_fdct_ifast;
2447             c->fdct248 = ff_fdct_ifast248;
2448         } else if (avctx->dct_algo == FF_DCT_FAAN) {
2449             c->fdct    = ff_faandct;
2450             c->fdct248 = ff_faandct248;
2451         } else {
2452             c->fdct    = ff_jpeg_fdct_islow_8; // slow/accurate/default
2453             c->fdct248 = ff_fdct248_islow_8;
2454         }
2455     }
2456 #endif /* CONFIG_ENCODERS */
2457
2458     if (avctx->bits_per_raw_sample == 10) {
2459         c->idct_put              = ff_simple_idct_put_10;
2460         c->idct_add              = ff_simple_idct_add_10;
2461         c->idct                  = ff_simple_idct_10;
2462         c->idct_permutation_type = FF_NO_IDCT_PERM;
2463     } else {
2464         if (avctx->idct_algo == FF_IDCT_INT) {
2465             c->idct_put              = jref_idct_put;
2466             c->idct_add              = jref_idct_add;
2467             c->idct                  = ff_j_rev_dct;
2468             c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2469         } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2470             c->idct_put              = ff_faanidct_put;
2471             c->idct_add              = ff_faanidct_add;
2472             c->idct                  = ff_faanidct;
2473             c->idct_permutation_type = FF_NO_IDCT_PERM;
2474         } else { // accurate/default
2475             c->idct_put              = ff_simple_idct_put_8;
2476             c->idct_add              = ff_simple_idct_add_8;
2477             c->idct                  = ff_simple_idct_8;
2478             c->idct_permutation_type = FF_NO_IDCT_PERM;
2479         }
2480     }
2481
2482     c->diff_pixels = diff_pixels_c;
2483
2484     c->put_pixels_clamped        = put_pixels_clamped_c;
2485     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2486     c->add_pixels_clamped        = add_pixels_clamped_c;
2487
2488     c->sum_abs_dctelem = sum_abs_dctelem_c;
2489
2490     c->gmc1 = gmc1_c;
2491     c->gmc  = ff_gmc_c;
2492
2493     c->pix_sum   = pix_sum_c;
2494     c->pix_norm1 = pix_norm1_c;
2495
2496     c->fill_block_tab[0] = fill_block16_c;
2497     c->fill_block_tab[1] = fill_block8_c;
2498
2499     /* TODO [0] 16  [1] 8 */
2500     c->pix_abs[0][0] = pix_abs16_c;
2501     c->pix_abs[0][1] = pix_abs16_x2_c;
2502     c->pix_abs[0][2] = pix_abs16_y2_c;
2503     c->pix_abs[0][3] = pix_abs16_xy2_c;
2504     c->pix_abs[1][0] = pix_abs8_c;
2505     c->pix_abs[1][1] = pix_abs8_x2_c;
2506     c->pix_abs[1][2] = pix_abs8_y2_c;
2507     c->pix_abs[1][3] = pix_abs8_xy2_c;
2508
2509 #define dspfunc(PFX, IDX, NUM)                              \
2510     c->PFX ## _pixels_tab[IDX][0]  = PFX ## NUM ## _mc00_c; \
2511     c->PFX ## _pixels_tab[IDX][1]  = PFX ## NUM ## _mc10_c; \
2512     c->PFX ## _pixels_tab[IDX][2]  = PFX ## NUM ## _mc20_c; \
2513     c->PFX ## _pixels_tab[IDX][3]  = PFX ## NUM ## _mc30_c; \
2514     c->PFX ## _pixels_tab[IDX][4]  = PFX ## NUM ## _mc01_c; \
2515     c->PFX ## _pixels_tab[IDX][5]  = PFX ## NUM ## _mc11_c; \
2516     c->PFX ## _pixels_tab[IDX][6]  = PFX ## NUM ## _mc21_c; \
2517     c->PFX ## _pixels_tab[IDX][7]  = PFX ## NUM ## _mc31_c; \
2518     c->PFX ## _pixels_tab[IDX][8]  = PFX ## NUM ## _mc02_c; \
2519     c->PFX ## _pixels_tab[IDX][9]  = PFX ## NUM ## _mc12_c; \
2520     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2521     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2522     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2523     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2524     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2525     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2526
2527     dspfunc(put_qpel, 0, 16);
2528     dspfunc(put_qpel, 1, 8);
2529
2530     dspfunc(put_no_rnd_qpel, 0, 16);
2531     dspfunc(put_no_rnd_qpel, 1, 8);
2532
2533     dspfunc(avg_qpel, 0, 16);
2534     dspfunc(avg_qpel, 1, 8);
2535
2536 #undef dspfunc
2537
2538     c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
2539     c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
2540     c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
2541     c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
2542     c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
2543     c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
2544     c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
2545     c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
2546
2547 #define SET_CMP_FUNC(name)                      \
2548     c->name[0] = name ## 16_c;                  \
2549     c->name[1] = name ## 8x8_c;
2550
2551     SET_CMP_FUNC(hadamard8_diff)
2552     c->hadamard8_diff[4] = hadamard8_intra16_c;
2553     c->hadamard8_diff[5] = hadamard8_intra8x8_c;
2554     SET_CMP_FUNC(dct_sad)
2555     SET_CMP_FUNC(dct_max)
2556 #if CONFIG_GPL
2557     SET_CMP_FUNC(dct264_sad)
2558 #endif
2559     c->sad[0] = pix_abs16_c;
2560     c->sad[1] = pix_abs8_c;
2561     c->sse[0] = sse16_c;
2562     c->sse[1] = sse8_c;
2563     c->sse[2] = sse4_c;
2564     SET_CMP_FUNC(quant_psnr)
2565     SET_CMP_FUNC(rd)
2566     SET_CMP_FUNC(bit)
2567     c->vsad[0] = vsad16_c;
2568     c->vsad[4] = vsad_intra16_c;
2569     c->vsad[5] = vsad_intra8_c;
2570     c->vsse[0] = vsse16_c;
2571     c->vsse[4] = vsse_intra16_c;
2572     c->vsse[5] = vsse_intra8_c;
2573     c->nsse[0] = nsse16_c;
2574     c->nsse[1] = nsse8_c;
2575
2576     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2577
2578     c->add_bytes                      = add_bytes_c;
2579     c->add_hfyu_median_prediction     = add_hfyu_median_prediction_c;
2580     c->add_hfyu_left_prediction       = add_hfyu_left_prediction_c;
2581     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2582
2583     c->diff_bytes                 = diff_bytes_c;
2584     c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
2585
2586     c->bswap_buf   = bswap_buf;
2587     c->bswap16_buf = bswap16_buf;
2588
2589     c->try_8x8basis = try_8x8basis_c;
2590     c->add_8x8basis = add_8x8basis_c;
2591
2592     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2593
2594     c->scalarproduct_int16 = scalarproduct_int16_c;
2595     c->vector_clip_int32   = vector_clip_int32_c;
2596     c->vector_clipf        = vector_clipf_c;
2597
2598     c->shrink[0] = av_image_copy_plane;
2599     c->shrink[1] = ff_shrink22;
2600     c->shrink[2] = ff_shrink44;
2601     c->shrink[3] = ff_shrink88;
2602
2603     c->add_pixels8 = add_pixels8_c;
2604
2605 #undef FUNC
2606 #undef FUNCC
2607 #define FUNC(f,  depth) f ## _ ## depth
2608 #define FUNCC(f, depth) f ## _ ## depth ## _c
2609
2610     c->draw_edges = FUNCC(draw_edges, 8);
2611
2612     c->clear_block  = FUNCC(clear_block, 8);
2613     c->clear_blocks = FUNCC(clear_blocks, 8);
2614
2615 #define BIT_DEPTH_FUNCS(depth)                  \
2616     c->get_pixels = FUNCC(get_pixels, depth);
2617
2618     switch (avctx->bits_per_raw_sample) {
2619     case 9:
2620     case 10:
2621         BIT_DEPTH_FUNCS(16);
2622         break;
2623     default:
2624         BIT_DEPTH_FUNCS(8);
2625         break;
2626     }
2627
2628     if (ARCH_ARM)
2629         ff_dsputil_init_arm(c, avctx, high_bit_depth);
2630     if (ARCH_BFIN)
2631         ff_dsputil_init_bfin(c, avctx, high_bit_depth);
2632     if (ARCH_PPC)
2633         ff_dsputil_init_ppc(c, avctx, high_bit_depth);
2634     if (ARCH_X86)
2635         ff_dsputil_init_x86(c, avctx, high_bit_depth);
2636
2637     ff_init_scantable_permutation(c->idct_permutation,
2638                                   c->idct_permutation_type);
2639 }