]> git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c
Merge commit 'c94e2e85cb6af8a570d8542a830556243bd32873'
[ffmpeg] / libavcodec / dsputil.c
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24
25 /**
26  * @file
27  * DSP utils
28  */
29
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "libavutil/internal.h"
33 #include "avcodec.h"
34 #include "copy_block.h"
35 #include "dct.h"
36 #include "dsputil.h"
37 #include "simple_idct.h"
38 #include "faandct.h"
39 #include "faanidct.h"
40 #include "imgconvert.h"
41 #include "mathops.h"
42 #include "mpegvideo.h"
43 #include "config.h"
44 #include "diracdsp.h"
45
/**
 * 512-entry lookup table, all zero at definition.
 * Users in this file index it through a pointer advanced by 256
 * (ff_square_tab + 256), which makes indices -256..255 valid.
 * NOTE(review): presumably filled with squared values by an init routine
 * elsewhere -- confirm.
 */
uint32_t ff_square_tab[512] = { 0 };
47
48 #define BIT_DEPTH 16
49 #include "dsputilenc_template.c"
50 #undef BIT_DEPTH
51
52 #define BIT_DEPTH 8
53 #include "hpel_template.c"
54 #include "tpel_template.c"
55 #include "dsputil_template.c"
56 #include "dsputilenc_template.c"
57
/**
 * Alternate horizontal scan order: 64 entries, each value 0..63 appearing
 * exactly once. Laid out here as four lines of sixteen entries.
 */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17, 10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33, 26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49, 42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59, 52, 53, 54, 55, 60, 61, 62, 63,
};
68
/**
 * Alternate vertical scan order: 64 entries holding the values 0..63.
 * Laid out here as four lines of sixteen entries.
 */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10, 17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12, 19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14, 21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63,
};
79
/**
 * Coefficient input order expected by simple_idct_mmx.
 * 64 entries, each value 0x00..0x3F appearing exactly once; copied verbatim
 * into the IDCT permutation for FF_SIMPLE_IDCT_PERM.
 */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
91
/**
 * Column reordering used for FF_SSE2_IDCT_PERM: replaces the low three bits
 * (i & 7) of a coefficient index.
 */
static const uint8_t idct_sse2_row_perm[8] = {
    0, 4, 1, 5,
    2, 6, 3, 7,
};
93
94 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
95                                const uint8_t *src_scantable)
96 {
97     int i, end;
98
99     st->scantable = src_scantable;
100
101     for (i = 0; i < 64; i++) {
102         int j = src_scantable[i];
103         st->permutated[i] = permutation[j];
104     }
105
106     end = -1;
107     for (i = 0; i < 64; i++) {
108         int j = st->permutated[i];
109         if (j > end)
110             end = j;
111         st->raster_end[i] = end;
112     }
113 }
114
115 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
116                                            int idct_permutation_type)
117 {
118     int i;
119
120     switch (idct_permutation_type) {
121     case FF_NO_IDCT_PERM:
122         for (i = 0; i < 64; i++)
123             idct_permutation[i] = i;
124         break;
125     case FF_LIBMPEG2_IDCT_PERM:
126         for (i = 0; i < 64; i++)
127             idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
128         break;
129     case FF_SIMPLE_IDCT_PERM:
130         for (i = 0; i < 64; i++)
131             idct_permutation[i] = simple_mmx_permutation[i];
132         break;
133     case FF_TRANSPOSE_IDCT_PERM:
134         for (i = 0; i < 64; i++)
135             idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
136         break;
137     case FF_PARTTRANS_IDCT_PERM:
138         for (i = 0; i < 64; i++)
139             idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
140         break;
141     case FF_SSE2_IDCT_PERM:
142         for (i = 0; i < 64; i++)
143             idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
144         break;
145     default:
146         av_log(NULL, AV_LOG_ERROR,
147                "Internal error, IDCT permutation not set\n");
148     }
149 }
150
/**
 * Sum all samples of a 16x16 block.
 *
 * @param pix       top-left sample of the block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @return sum of the 256 samples
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++, pix += line_size)
        for (col = 0; col < 16; col++)
            total += pix[col];

    return total;
}
171
172 static int pix_norm1_c(uint8_t *pix, int line_size)
173 {
174     int s = 0, i, j;
175     uint32_t *sq = ff_square_tab + 256;
176
177     for (i = 0; i < 16; i++) {
178         for (j = 0; j < 16; j += 8) {
179 #if 0
180             s += sq[pix[0]];
181             s += sq[pix[1]];
182             s += sq[pix[2]];
183             s += sq[pix[3]];
184             s += sq[pix[4]];
185             s += sq[pix[5]];
186             s += sq[pix[6]];
187             s += sq[pix[7]];
188 #else
189 #if HAVE_FAST_64BIT
190             register uint64_t x = *(uint64_t *) pix;
191             s += sq[x         & 0xff];
192             s += sq[(x >>  8) & 0xff];
193             s += sq[(x >> 16) & 0xff];
194             s += sq[(x >> 24) & 0xff];
195             s += sq[(x >> 32) & 0xff];
196             s += sq[(x >> 40) & 0xff];
197             s += sq[(x >> 48) & 0xff];
198             s += sq[(x >> 56) & 0xff];
199 #else
200             register uint32_t x = *(uint32_t *) pix;
201             s += sq[x         & 0xff];
202             s += sq[(x >>  8) & 0xff];
203             s += sq[(x >> 16) & 0xff];
204             s += sq[(x >> 24) & 0xff];
205             x  = *(uint32_t *) (pix + 4);
206             s += sq[x         & 0xff];
207             s += sq[(x >>  8) & 0xff];
208             s += sq[(x >> 16) & 0xff];
209             s += sq[(x >> 24) & 0xff];
210 #endif
211 #endif
212             pix += 8;
213         }
214         pix += line_size - 16;
215     }
216     return s;
217 }
218
/**
 * Byte-swap w 32-bit words from src into dst, in ascending index order.
 * Nothing is written when w is zero or negative.
 *
 * @param dst destination words
 * @param src source words
 * @param w   number of words to convert
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int n;

    for (n = 0; n < w; n++)
        dst[n] = av_bswap32(src[n]);
}
236
/**
 * Byte-swap len 16-bit words from src into dst.
 *
 * @param dst destination words
 * @param src source words
 * @param len number of words to convert; the loop counts len down to zero
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    for (; len != 0; len--) {
        *dst = av_bswap16(*src);
        dst++;
        src++;
    }
}
242
243 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
244                   int line_size, int h)
245 {
246     int s = 0, i;
247     uint32_t *sq = ff_square_tab + 256;
248
249     for (i = 0; i < h; i++) {
250         s    += sq[pix1[0] - pix2[0]];
251         s    += sq[pix1[1] - pix2[1]];
252         s    += sq[pix1[2] - pix2[2]];
253         s    += sq[pix1[3] - pix2[3]];
254         pix1 += line_size;
255         pix2 += line_size;
256     }
257     return s;
258 }
259
260 static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
261                   int line_size, int h)
262 {
263     int s = 0, i;
264     uint32_t *sq = ff_square_tab + 256;
265
266     for (i = 0; i < h; i++) {
267         s    += sq[pix1[0] - pix2[0]];
268         s    += sq[pix1[1] - pix2[1]];
269         s    += sq[pix1[2] - pix2[2]];
270         s    += sq[pix1[3] - pix2[3]];
271         s    += sq[pix1[4] - pix2[4]];
272         s    += sq[pix1[5] - pix2[5]];
273         s    += sq[pix1[6] - pix2[6]];
274         s    += sq[pix1[7] - pix2[7]];
275         pix1 += line_size;
276         pix2 += line_size;
277     }
278     return s;
279 }
280
281 static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
282                    int line_size, int h)
283 {
284     int s = 0, i;
285     uint32_t *sq = ff_square_tab + 256;
286
287     for (i = 0; i < h; i++) {
288         s += sq[pix1[0]  - pix2[0]];
289         s += sq[pix1[1]  - pix2[1]];
290         s += sq[pix1[2]  - pix2[2]];
291         s += sq[pix1[3]  - pix2[3]];
292         s += sq[pix1[4]  - pix2[4]];
293         s += sq[pix1[5]  - pix2[5]];
294         s += sq[pix1[6]  - pix2[6]];
295         s += sq[pix1[7]  - pix2[7]];
296         s += sq[pix1[8]  - pix2[8]];
297         s += sq[pix1[9]  - pix2[9]];
298         s += sq[pix1[10] - pix2[10]];
299         s += sq[pix1[11] - pix2[11]];
300         s += sq[pix1[12] - pix2[12]];
301         s += sq[pix1[13] - pix2[13]];
302         s += sq[pix1[14] - pix2[14]];
303         s += sq[pix1[15] - pix2[15]];
304
305         pix1 += line_size;
306         pix2 += line_size;
307     }
308     return s;
309 }
310
311 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
312                           const uint8_t *s2, int stride)
313 {
314     int i;
315
316     /* read the pixels */
317     for (i = 0; i < 8; i++) {
318         block[0] = s1[0] - s2[0];
319         block[1] = s1[1] - s2[1];
320         block[2] = s1[2] - s2[2];
321         block[3] = s1[3] - s2[3];
322         block[4] = s1[4] - s2[4];
323         block[5] = s1[5] - s2[5];
324         block[6] = s1[6] - s2[6];
325         block[7] = s1[7] - s2[7];
326         s1      += stride;
327         s2      += stride;
328         block   += 8;
329     }
330 }
331
332 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
333                                  int line_size)
334 {
335     int i;
336
337     /* read the pixels */
338     for (i = 0; i < 8; i++) {
339         pixels[0] = av_clip_uint8(block[0]);
340         pixels[1] = av_clip_uint8(block[1]);
341         pixels[2] = av_clip_uint8(block[2]);
342         pixels[3] = av_clip_uint8(block[3]);
343         pixels[4] = av_clip_uint8(block[4]);
344         pixels[5] = av_clip_uint8(block[5]);
345         pixels[6] = av_clip_uint8(block[6]);
346         pixels[7] = av_clip_uint8(block[7]);
347
348         pixels += line_size;
349         block  += 8;
350     }
351 }
352
353 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
354                                  int line_size)
355 {
356     int i;
357
358     /* read the pixels */
359     for(i=0;i<4;i++) {
360         pixels[0] = av_clip_uint8(block[0]);
361         pixels[1] = av_clip_uint8(block[1]);
362         pixels[2] = av_clip_uint8(block[2]);
363         pixels[3] = av_clip_uint8(block[3]);
364
365         pixels += line_size;
366         block += 8;
367     }
368 }
369
370 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
371                                  int line_size)
372 {
373     int i;
374
375     /* read the pixels */
376     for(i=0;i<2;i++) {
377         pixels[0] = av_clip_uint8(block[0]);
378         pixels[1] = av_clip_uint8(block[1]);
379
380         pixels += line_size;
381         block += 8;
382     }
383 }
384
385 static void put_signed_pixels_clamped_c(const int16_t *block,
386                                         uint8_t *av_restrict pixels,
387                                         int line_size)
388 {
389     int i, j;
390
391     for (i = 0; i < 8; i++) {
392         for (j = 0; j < 8; j++) {
393             if (*block < -128)
394                 *pixels = 0;
395             else if (*block > 127)
396                 *pixels = 255;
397             else
398                 *pixels = (uint8_t) (*block + 128);
399             block++;
400             pixels++;
401         }
402         pixels += (line_size - 8);
403     }
404 }
405
406 static void add_pixels8_c(uint8_t *av_restrict pixels, int16_t *block,
407                           int line_size)
408 {
409     int i;
410
411     for (i = 0; i < 8; i++) {
412         pixels[0] += block[0];
413         pixels[1] += block[1];
414         pixels[2] += block[2];
415         pixels[3] += block[3];
416         pixels[4] += block[4];
417         pixels[5] += block[5];
418         pixels[6] += block[6];
419         pixels[7] += block[7];
420         pixels    += line_size;
421         block     += 8;
422     }
423 }
424
425 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
426                                  int line_size)
427 {
428     int i;
429
430     /* read the pixels */
431     for (i = 0; i < 8; i++) {
432         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
433         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
434         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
435         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
436         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
437         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
438         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
439         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
440         pixels   += line_size;
441         block    += 8;
442     }
443 }
444
445 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
446                           int line_size)
447 {
448     int i;
449
450     /* read the pixels */
451     for(i=0;i<4;i++) {
452         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
453         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
454         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
455         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
456         pixels += line_size;
457         block += 8;
458     }
459 }
460
461 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
462                           int line_size)
463 {
464     int i;
465
466     /* read the pixels */
467     for(i=0;i<2;i++) {
468         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
469         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
470         pixels += line_size;
471         block += 8;
472     }
473 }
474
/**
 * Sum the absolute values of the 64 coefficients of a block.
 *
 * @param block 64 coefficients
 * @return sum of FFABS() over all of them
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    int total = 0;
    int k;

    for (k = 0; k < 64; k++) {
        /* Copy to a local first: FFABS is a macro. */
        const int coef = block[k];

        total += FFABS(coef);
    }
    return total;
}
483
/**
 * Set the first 16 bytes of each of h rows to value.
 *
 * @param block     first row
 * @param value     byte to store
 * @param line_size row stride in bytes
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 16);
        row += line_size;
    }
}
493
/**
 * Set the first 8 bytes of each of h rows to value.
 *
 * @param block     first row
 * @param value     byte to store
 * @param line_size row stride in bytes
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 8);
        row += line_size;
    }
}
503
/* Rounded averages of two and four values. Every argument is parenthesised
 * so that expressions containing lower-precedence operators (|, &, ?:, ...)
 * are averaged as a whole. Arguments are evaluated exactly once. */
#define avg2(a, b) (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
506
/**
 * Bilinear interpolation of an 8-sample-wide block with a 1/16 sample
 * offset.
 *
 * Each output sample is a weighted sum of the source sample, its right
 * neighbour, the sample below and the sample below-right; the four weights
 * add up to 256 and the sum is shifted right by 8 after adding rounder.
 *
 * @param dst     destination block
 * @param src     source block; 9 samples per row and h + 1 rows are read
 * @param stride  row stride in bytes, shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand samples, out of 16
 * @param y16     vertical weight of the lower samples, out of 16
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16); /* top-left     */
    const int w_tr = (x16)      * (16 - y16); /* top-right    */
    const int w_bl = (16 - x16) * (y16);      /* bottom-left  */
    const int w_br = (x16)      * (y16);      /* bottom-right */
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++)
            dst[col] = (w_tl * src[col]   + w_tr * src[col + 1] +
                        w_bl * below[col] + w_br * below[col + 1] +
                        rounder) >> 8;

        dst += stride;
        src += stride;
    }
}
529
/**
 * Warp an 8-sample-wide block: every destination sample is fetched from a
 * source position that advances by (dxx, dyx) per column and (dxy, dyy)
 * per row, with bilinear interpolation where the neighbours are available.
 *
 * A position is split as follows: bits 16 and up of vx/vy form a coordinate
 * whose low `shift` bits are the interpolation fraction and whose remaining
 * bits are the integer sample coordinate.
 *
 * @param dst    destination block, written at dst[y * stride + x]
 * @param src    source picture
 * @param stride row stride in bytes, shared by dst and src
 * @param h      number of rows to produce
 * @param ox     horizontal position of the first sample of the first row
 * @param oy     vertical position of the first sample of the first row
 * @param dxx    horizontal position increment per column
 * @param dxy    horizontal position increment per row
 * @param dyx    vertical position increment per column
 * @param dyy    vertical position increment per row
 * @param shift  number of fraction bits
 * @param r      value added before the final shift
 * @param width  source width in samples
 * @param height source height in samples
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
{
    const int s = 1 << shift;
    int y;

    /* From here on width/height are the largest valid coordinates; a
     * coordinate strictly below them still has a neighbour at +1. */
    width--;
    height--;

    for (y = 0; y < h; y++, ox += dxy, oy += dyy) {
        int vx = ox;
        int vy = oy;
        int x;

        for (x = 0; x < 8; x++, vx += dxx, vy += dyx) { // FIXME: optimize
            uint8_t *out     = &dst[y * stride + x];
            int src_x        = vx >> 16;
            int src_y        = vy >> 16;
            const int frac_x = src_x & (s - 1);
            const int frac_y = src_y & (s - 1);
            int in_x, in_y;

            src_x >>= shift;
            src_y >>= shift;

            /* The unsigned compare rejects negative coordinates together
             * with those that are too large. */
            in_x = (unsigned) src_x < width;
            in_y = (unsigned) src_y < height;

            if (in_x && in_y) {
                /* Both neighbours available: full bilinear blend. */
                const int index  = src_x + src_y * stride;
                const int top    = src[index]              * (s - frac_x) +
                                   src[index + 1]          * frac_x;
                const int bottom = src[index + stride]     * (s - frac_x) +
                                   src[index + stride + 1] * frac_x;

                *out = (top * (s - frac_y) + bottom * frac_y + r) >> (shift * 2);
            } else if (in_x) {
                /* Row clipped: blend horizontally only. */
                const int index = src_x + av_clip(src_y, 0, height) * stride;

                *out = ((src[index]     * (s - frac_x) +
                         src[index + 1] * frac_x) * s +
                        r) >> (shift * 2);
            } else if (in_y) {
                /* Column clipped: blend vertically only. */
                const int index = av_clip(src_x, 0, width) + src_y * stride;

                *out = ((src[index]          * (s - frac_y) +
                         src[index + stride] * frac_y) * s +
                        r) >> (shift * 2);
            } else {
                /* Both clipped: copy the nearest edge sample. */
                const int index = av_clip(src_x, 0, width) +
                                  av_clip(src_y, 0, height) * stride;

                *out = src[index];
            }
        }
    }
}
592
593 #define QPEL_MC(r, OPNAME, RND, OP)                                           \
594 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src,       \
595                                             int dstStride, int srcStride,     \
596                                             int h)                            \
597 {                                                                             \
598     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
599     int i;                                                                    \
600                                                                               \
601     for (i = 0; i < h; i++) {                                                 \
602         OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
603         OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
604         OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
605         OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
606         OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
607         OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
608         OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
609         OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
610         dst += dstStride;                                                     \
611         src += srcStride;                                                     \
612     }                                                                         \
613 }                                                                             \
614                                                                               \
615 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src,       \
616                                             int dstStride, int srcStride)     \
617 {                                                                             \
618     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
619     const int w = 8;                                                          \
620     int i;                                                                    \
621                                                                               \
622     for (i = 0; i < w; i++) {                                                 \
623         const int src0 = src[0 * srcStride];                                  \
624         const int src1 = src[1 * srcStride];                                  \
625         const int src2 = src[2 * srcStride];                                  \
626         const int src3 = src[3 * srcStride];                                  \
627         const int src4 = src[4 * srcStride];                                  \
628         const int src5 = src[5 * srcStride];                                  \
629         const int src6 = src[6 * srcStride];                                  \
630         const int src7 = src[7 * srcStride];                                  \
631         const int src8 = src[8 * srcStride];                                  \
632         OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
633         OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
634         OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
635         OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
636         OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
637         OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
638         OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
639         OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
640         dst++;                                                                \
641         src++;                                                                \
642     }                                                                         \
643 }                                                                             \
644                                                                               \
645 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src,      \
646                                              int dstStride, int srcStride,    \
647                                              int h)                           \
648 {                                                                             \
649     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
650     int i;                                                                    \
651                                                                               \
652     for (i = 0; i < h; i++) {                                                 \
653         OP(dst[0],  (src[0]  + src[1])  * 20 - (src[0]  + src[2])  * 6 + (src[1]  + src[3])  * 3 - (src[2]  + src[4]));  \
654         OP(dst[1],  (src[1]  + src[2])  * 20 - (src[0]  + src[3])  * 6 + (src[0]  + src[4])  * 3 - (src[1]  + src[5]));  \
655         OP(dst[2],  (src[2]  + src[3])  * 20 - (src[1]  + src[4])  * 6 + (src[0]  + src[5])  * 3 - (src[0]  + src[6]));  \
656         OP(dst[3],  (src[3]  + src[4])  * 20 - (src[2]  + src[5])  * 6 + (src[1]  + src[6])  * 3 - (src[0]  + src[7]));  \
657         OP(dst[4],  (src[4]  + src[5])  * 20 - (src[3]  + src[6])  * 6 + (src[2]  + src[7])  * 3 - (src[1]  + src[8]));  \
658         OP(dst[5],  (src[5]  + src[6])  * 20 - (src[4]  + src[7])  * 6 + (src[3]  + src[8])  * 3 - (src[2]  + src[9]));  \
659         OP(dst[6],  (src[6]  + src[7])  * 20 - (src[5]  + src[8])  * 6 + (src[4]  + src[9])  * 3 - (src[3]  + src[10])); \
660         OP(dst[7],  (src[7]  + src[8])  * 20 - (src[6]  + src[9])  * 6 + (src[5]  + src[10]) * 3 - (src[4]  + src[11])); \
661         OP(dst[8],  (src[8]  + src[9])  * 20 - (src[7]  + src[10]) * 6 + (src[6]  + src[11]) * 3 - (src[5]  + src[12])); \
662         OP(dst[9],  (src[9]  + src[10]) * 20 - (src[8]  + src[11]) * 6 + (src[7]  + src[12]) * 3 - (src[6]  + src[13])); \
663         OP(dst[10], (src[10] + src[11]) * 20 - (src[9]  + src[12]) * 6 + (src[8]  + src[13]) * 3 - (src[7]  + src[14])); \
664         OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9]  + src[14]) * 3 - (src[8]  + src[15])); \
665         OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9]  + src[16])); \
666         OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
667         OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
668         OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
669         dst += dstStride;                                                     \
670         src += srcStride;                                                     \
671     }                                                                         \
672 }                                                                             \
673                                                                               \
674 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src,      \
675                                              int dstStride, int srcStride)    \
676 {                                                                             \
677     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
678     const int w = 16;                                                         \
679     int i;                                                                    \
680                                                                               \
681     for (i = 0; i < w; i++) {                                                 \
682         const int src0  = src[0  * srcStride];                                \
683         const int src1  = src[1  * srcStride];                                \
684         const int src2  = src[2  * srcStride];                                \
685         const int src3  = src[3  * srcStride];                                \
686         const int src4  = src[4  * srcStride];                                \
687         const int src5  = src[5  * srcStride];                                \
688         const int src6  = src[6  * srcStride];                                \
689         const int src7  = src[7  * srcStride];                                \
690         const int src8  = src[8  * srcStride];                                \
691         const int src9  = src[9  * srcStride];                                \
692         const int src10 = src[10 * srcStride];                                \
693         const int src11 = src[11 * srcStride];                                \
694         const int src12 = src[12 * srcStride];                                \
695         const int src13 = src[13 * srcStride];                                \
696         const int src14 = src[14 * srcStride];                                \
697         const int src15 = src[15 * srcStride];                                \
698         const int src16 = src[16 * srcStride];                                \
699         OP(dst[0  * dstStride], (src0  + src1)  * 20 - (src0  + src2)  * 6 + (src1  + src3)  * 3 - (src2  + src4));  \
700         OP(dst[1  * dstStride], (src1  + src2)  * 20 - (src0  + src3)  * 6 + (src0  + src4)  * 3 - (src1  + src5));  \
701         OP(dst[2  * dstStride], (src2  + src3)  * 20 - (src1  + src4)  * 6 + (src0  + src5)  * 3 - (src0  + src6));  \
702         OP(dst[3  * dstStride], (src3  + src4)  * 20 - (src2  + src5)  * 6 + (src1  + src6)  * 3 - (src0  + src7));  \
703         OP(dst[4  * dstStride], (src4  + src5)  * 20 - (src3  + src6)  * 6 + (src2  + src7)  * 3 - (src1  + src8));  \
704         OP(dst[5  * dstStride], (src5  + src6)  * 20 - (src4  + src7)  * 6 + (src3  + src8)  * 3 - (src2  + src9));  \
705         OP(dst[6  * dstStride], (src6  + src7)  * 20 - (src5  + src8)  * 6 + (src4  + src9)  * 3 - (src3  + src10)); \
706         OP(dst[7  * dstStride], (src7  + src8)  * 20 - (src6  + src9)  * 6 + (src5  + src10) * 3 - (src4  + src11)); \
707         OP(dst[8  * dstStride], (src8  + src9)  * 20 - (src7  + src10) * 6 + (src6  + src11) * 3 - (src5  + src12)); \
708         OP(dst[9  * dstStride], (src9  + src10) * 20 - (src8  + src11) * 6 + (src7  + src12) * 3 - (src6  + src13)); \
709         OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9  + src12) * 6 + (src8  + src13) * 3 - (src7  + src14)); \
710         OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9  + src14) * 3 - (src8  + src15)); \
711         OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9  + src16)); \
712         OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
713         OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
714         OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
715         dst++;                                                                \
716         src++;                                                                \
717     }                                                                         \
718 }                                                                             \
719                                                                               \
720 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src,                \
721                                    ptrdiff_t stride)                          \
722 {                                                                             \
723     uint8_t half[64];                                                         \
724                                                                               \
725     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);             \
726     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);             \
727 }                                                                             \
728                                                                               \
729 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src,                \
730                                    ptrdiff_t stride)                          \
731 {                                                                             \
732     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);             \
733 }                                                                             \
734                                                                               \
735 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src,                \
736                                    ptrdiff_t stride)                          \
737 {                                                                             \
738     uint8_t half[64];                                                         \
739                                                                               \
740     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);             \
741     OPNAME ## pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8);         \
742 }                                                                             \
743                                                                               \
744 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src,                \
745                                    ptrdiff_t stride)                          \
746 {                                                                             \
747     uint8_t full[16 * 9];                                                     \
748     uint8_t half[64];                                                         \
749                                                                               \
750     copy_block9(full, src, 16, stride, 9);                                    \
751     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);                   \
752     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);                \
753 }                                                                             \
754                                                                               \
755 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src,                \
756                                    ptrdiff_t stride)                          \
757 {                                                                             \
758     uint8_t full[16 * 9];                                                     \
759                                                                               \
760     copy_block9(full, src, 16, stride, 9);                                    \
761     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);                   \
762 }                                                                             \
763                                                                               \
764 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src,                \
765                                    ptrdiff_t stride)                          \
766 {                                                                             \
767     uint8_t full[16 * 9];                                                     \
768     uint8_t half[64];                                                         \
769                                                                               \
770     copy_block9(full, src, 16, stride, 9);                                    \
771     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);                   \
772     OPNAME ## pixels8_l2_8(dst, full + 16, half, stride, 16, 8, 8);           \
773 }                                                                             \
774                                                                               \
775 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src,            \
776                                        ptrdiff_t stride)                      \
777 {                                                                             \
778     uint8_t full[16 * 9];                                                     \
779     uint8_t halfH[72];                                                        \
780     uint8_t halfV[64];                                                        \
781     uint8_t halfHV[64];                                                       \
782                                                                               \
783     copy_block9(full, src, 16, stride, 9);                                    \
784     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
785     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
786     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
787     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV,                   \
788                            stride, 16, 8, 8, 8, 8);                           \
789 }                                                                             \
790                                                                               \
791 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src,                \
792                                    ptrdiff_t stride)                          \
793 {                                                                             \
794     uint8_t full[16 * 9];                                                     \
795     uint8_t halfH[72];                                                        \
796     uint8_t halfHV[64];                                                       \
797                                                                               \
798     copy_block9(full, src, 16, stride, 9);                                    \
799     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
800     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
801     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
802     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
803 }                                                                             \
804                                                                               \
805 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src,            \
806                                        ptrdiff_t stride)                      \
807 {                                                                             \
808     uint8_t full[16 * 9];                                                     \
809     uint8_t halfH[72];                                                        \
810     uint8_t halfV[64];                                                        \
811     uint8_t halfHV[64];                                                       \
812                                                                               \
813     copy_block9(full, src, 16, stride, 9);                                    \
814     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
815     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
816     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
817     OPNAME ## pixels8_l4_8(dst, full + 1, halfH, halfV, halfHV,               \
818                            stride, 16, 8, 8, 8, 8);                           \
819 }                                                                             \
820                                                                               \
821 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src,                \
822                                    ptrdiff_t stride)                          \
823 {                                                                             \
824     uint8_t full[16 * 9];                                                     \
825     uint8_t halfH[72];                                                        \
826     uint8_t halfHV[64];                                                       \
827                                                                               \
828     copy_block9(full, src, 16, stride, 9);                                    \
829     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
830     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
831     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
832     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
833 }                                                                             \
834                                                                               \
835 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src,            \
836                                        ptrdiff_t stride)                      \
837 {                                                                             \
838     uint8_t full[16 * 9];                                                     \
839     uint8_t halfH[72];                                                        \
840     uint8_t halfV[64];                                                        \
841     uint8_t halfHV[64];                                                       \
842                                                                               \
843     copy_block9(full, src, 16, stride, 9);                                    \
844     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
845     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
846     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
847     OPNAME ## pixels8_l4_8(dst, full + 16, halfH + 8, halfV, halfHV,          \
848                            stride, 16, 8, 8, 8, 8);                           \
849 }                                                                             \
850                                                                               \
851 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src,                \
852                                    ptrdiff_t stride)                          \
853 {                                                                             \
854     uint8_t full[16 * 9];                                                     \
855     uint8_t halfH[72];                                                        \
856     uint8_t halfHV[64];                                                       \
857                                                                               \
858     copy_block9(full, src, 16, stride, 9);                                    \
859     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
860     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
861     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
862     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
863 }                                                                             \
864                                                                               \
865 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src,            \
866                                        ptrdiff_t stride)                      \
867 {                                                                             \
868     uint8_t full[16 * 9];                                                     \
869     uint8_t halfH[72];                                                        \
870     uint8_t halfV[64];                                                        \
871     uint8_t halfHV[64];                                                       \
872                                                                               \
873     copy_block9(full, src, 16, stride, 9);                                    \
874     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
875     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
876     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
877     OPNAME ## pixels8_l4_8(dst, full + 17, halfH + 8, halfV, halfHV,          \
878                            stride, 16, 8, 8, 8, 8);                           \
879 }                                                                             \
880                                                                               \
881 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src,                \
882                                    ptrdiff_t stride)                          \
883 {                                                                             \
884     uint8_t full[16 * 9];                                                     \
885     uint8_t halfH[72];                                                        \
886     uint8_t halfHV[64];                                                       \
887                                                                               \
888     copy_block9(full, src, 16, stride, 9);                                    \
889     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
890     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
891     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
892     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
893 }                                                                             \
894                                                                               \
895 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src,                \
896                                    ptrdiff_t stride)                          \
897 {                                                                             \
898     uint8_t halfH[72];                                                        \
899     uint8_t halfHV[64];                                                       \
900                                                                               \
901     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
902     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
903     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
904 }                                                                             \
905                                                                               \
906 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src,                \
907                                    ptrdiff_t stride)                          \
908 {                                                                             \
909     uint8_t halfH[72];                                                        \
910     uint8_t halfHV[64];                                                       \
911                                                                               \
912     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
913     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
914     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
915 }                                                                             \
916                                                                               \
917 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src,            \
918                                        ptrdiff_t stride)                      \
919 {                                                                             \
920     uint8_t full[16 * 9];                                                     \
921     uint8_t halfH[72];                                                        \
922     uint8_t halfV[64];                                                        \
923     uint8_t halfHV[64];                                                       \
924                                                                               \
925     copy_block9(full, src, 16, stride, 9);                                    \
926     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
927     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
928     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
929     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);              \
930 }                                                                             \
931                                                                               \
932 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src,                \
933                                    ptrdiff_t stride)                          \
934 {                                                                             \
935     uint8_t full[16 * 9];                                                     \
936     uint8_t halfH[72];                                                        \
937                                                                               \
938     copy_block9(full, src, 16, stride, 9);                                    \
939     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
940     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
941     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
942 }                                                                             \
943                                                                               \
944 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src,            \
945                                        ptrdiff_t stride)                      \
946 {                                                                             \
947     uint8_t full[16 * 9];                                                     \
948     uint8_t halfH[72];                                                        \
949     uint8_t halfV[64];                                                        \
950     uint8_t halfHV[64];                                                       \
951                                                                               \
952     copy_block9(full, src, 16, stride, 9);                                    \
953     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
954     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
955     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
956     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);              \
957 }                                                                             \
958                                                                               \
959 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src,                \
960                                    ptrdiff_t stride)                          \
961 {                                                                             \
962     uint8_t full[16 * 9];                                                     \
963     uint8_t halfH[72];                                                        \
964                                                                               \
965     copy_block9(full, src, 16, stride, 9);                                    \
966     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
967     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
968     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
969 }                                                                             \
970                                                                               \
971 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src,                \
972                                    ptrdiff_t stride)                          \
973 {                                                                             \
974     uint8_t halfH[72];                                                        \
975                                                                               \
976     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
977     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
978 }                                                                             \
979                                                                               \
980 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src,               \
981                                     ptrdiff_t stride)                         \
982 {                                                                             \
983     uint8_t half[256];                                                        \
984                                                                               \
985     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
986     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);          \
987 }                                                                             \
988                                                                               \
989 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src,               \
990                                     ptrdiff_t stride)                         \
991 {                                                                             \
992     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);           \
993 }                                                                             \
994                                                                               \
995 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src,               \
996                                     ptrdiff_t stride)                         \
997 {                                                                             \
998     uint8_t half[256];                                                        \
999                                                                               \
1000     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
1001     OPNAME ## pixels16_l2_8(dst, src + 1, half, stride, stride, 16, 16);      \
1002 }                                                                             \
1003                                                                               \
1004 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src,               \
1005                                     ptrdiff_t stride)                         \
1006 {                                                                             \
1007     uint8_t full[24 * 17];                                                    \
1008     uint8_t half[256];                                                        \
1009                                                                               \
1010     copy_block17(full, src, 24, stride, 17);                                  \
1011     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
1012     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);             \
1013 }                                                                             \
1014                                                                               \
1015 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src,               \
1016                                     ptrdiff_t stride)                         \
1017 {                                                                             \
1018     uint8_t full[24 * 17];                                                    \
1019                                                                               \
1020     copy_block17(full, src, 24, stride, 17);                                  \
1021     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);                  \
1022 }                                                                             \
1023                                                                               \
1024 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src,               \
1025                                     ptrdiff_t stride)                         \
1026 {                                                                             \
1027     uint8_t full[24 * 17];                                                    \
1028     uint8_t half[256];                                                        \
1029                                                                               \
1030     copy_block17(full, src, 24, stride, 17);                                  \
1031     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
1032     OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16);        \
1033 }                                                                             \
1034                                                                               \
1035 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src,           \
1036                                         ptrdiff_t stride)                     \
1037 {                                                                             \
1038     uint8_t full[24 * 17];                                                    \
1039     uint8_t halfH[272];                                                       \
1040     uint8_t halfV[256];                                                       \
1041     uint8_t halfHV[256];                                                      \
1042                                                                               \
1043     copy_block17(full, src, 24, stride, 17);                                  \
1044     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1045     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1046     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1047     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV,                  \
1048                             stride, 24, 16, 16, 16, 16);                      \
1049 }                                                                             \
1050                                                                               \
1051 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src,               \
1052                                     ptrdiff_t stride)                         \
1053 {                                                                             \
1054     uint8_t full[24 * 17];                                                    \
1055     uint8_t halfH[272];                                                       \
1056     uint8_t halfHV[256];                                                      \
1057                                                                               \
1058     copy_block17(full, src, 24, stride, 17);                                  \
1059     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1060     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1061     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1062     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1063 }                                                                             \
1064                                                                               \
1065 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src,           \
1066                                         ptrdiff_t stride)                     \
1067 {                                                                             \
1068     uint8_t full[24 * 17];                                                    \
1069     uint8_t halfH[272];                                                       \
1070     uint8_t halfV[256];                                                       \
1071     uint8_t halfHV[256];                                                      \
1072                                                                               \
1073     copy_block17(full, src, 24, stride, 17);                                  \
1074     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1075     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1076     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1077     OPNAME ## pixels16_l4_8(dst, full + 1, halfH, halfV, halfHV,              \
1078                             stride, 24, 16, 16, 16, 16);                      \
1079 }                                                                             \
1080                                                                               \
1081 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src,               \
1082                                     ptrdiff_t stride)                         \
1083 {                                                                             \
1084     uint8_t full[24 * 17];                                                    \
1085     uint8_t halfH[272];                                                       \
1086     uint8_t halfHV[256];                                                      \
1087                                                                               \
1088     copy_block17(full, src, 24, stride, 17);                                  \
1089     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1090     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1091     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1092     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1093 }                                                                             \
1094                                                                               \
1095 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src,           \
1096                                         ptrdiff_t stride)                     \
1097 {                                                                             \
1098     uint8_t full[24 * 17];                                                    \
1099     uint8_t halfH[272];                                                       \
1100     uint8_t halfV[256];                                                       \
1101     uint8_t halfHV[256];                                                      \
1102                                                                               \
1103     copy_block17(full, src, 24, stride, 17);                                  \
1104     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1105     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1106     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1107     OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV,        \
1108                             stride, 24, 16, 16, 16, 16);                      \
1109 }                                                                             \
1110                                                                               \
1111 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src,               \
1112                                     ptrdiff_t stride)                         \
1113 {                                                                             \
1114     uint8_t full[24 * 17];                                                    \
1115     uint8_t halfH[272];                                                       \
1116     uint8_t halfHV[256];                                                      \
1117                                                                               \
1118     copy_block17(full, src, 24, stride, 17);                                  \
1119     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1120     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1121     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1122     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1123 }                                                                             \
1124                                                                               \
1125 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src,           \
1126                                         ptrdiff_t stride)                     \
1127 {                                                                             \
1128     uint8_t full[24 * 17];                                                    \
1129     uint8_t halfH[272];                                                       \
1130     uint8_t halfV[256];                                                       \
1131     uint8_t halfHV[256];                                                      \
1132                                                                               \
1133     copy_block17(full, src, 24, stride, 17);                                  \
1134     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1135     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1136     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1137     OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV,        \
1138                             stride, 24, 16, 16, 16, 16);                      \
1139 }                                                                             \
1140                                                                               \
1141 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src,               \
1142                                     ptrdiff_t stride)                         \
1143 {                                                                             \
1144     uint8_t full[24 * 17];                                                    \
1145     uint8_t halfH[272];                                                       \
1146     uint8_t halfHV[256];                                                      \
1147                                                                               \
1148     copy_block17(full, src, 24, stride, 17);                                  \
1149     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1150     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1151     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1152     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1153 }                                                                             \
1154                                                                               \
1155 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src,               \
1156                                     ptrdiff_t stride)                         \
1157 {                                                                             \
1158     uint8_t halfH[272];                                                       \
1159     uint8_t halfHV[256];                                                      \
1160                                                                               \
1161     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1162     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1163     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1164 }                                                                             \
1165                                                                               \
1166 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src,               \
1167                                     ptrdiff_t stride)                         \
1168 {                                                                             \
1169     uint8_t halfH[272];                                                       \
1170     uint8_t halfHV[256];                                                      \
1171                                                                               \
1172     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1173     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1174     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1175 }                                                                             \
1176                                                                               \
1177 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src,           \
1178                                         ptrdiff_t stride)                     \
1179 {                                                                             \
1180     uint8_t full[24 * 17];                                                    \
1181     uint8_t halfH[272];                                                       \
1182     uint8_t halfV[256];                                                       \
1183     uint8_t halfHV[256];                                                      \
1184                                                                               \
1185     copy_block17(full, src, 24, stride, 17);                                  \
1186     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1187     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1188     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1189     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1190 }                                                                             \
1191                                                                               \
1192 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src,               \
1193                                     ptrdiff_t stride)                         \
1194 {                                                                             \
1195     uint8_t full[24 * 17];                                                    \
1196     uint8_t halfH[272];                                                       \
1197                                                                               \
1198     copy_block17(full, src, 24, stride, 17);                                  \
1199     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1200     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1201     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1202 }                                                                             \
1203                                                                               \
1204 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src,           \
1205                                         ptrdiff_t stride)                     \
1206 {                                                                             \
1207     uint8_t full[24 * 17];                                                    \
1208     uint8_t halfH[272];                                                       \
1209     uint8_t halfV[256];                                                       \
1210     uint8_t halfHV[256];                                                      \
1211                                                                               \
1212     copy_block17(full, src, 24, stride, 17);                                  \
1213     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1214     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1215     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1216     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1217 }                                                                             \
1218                                                                               \
1219 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src,               \
1220                                     ptrdiff_t stride)                         \
1221 {                                                                             \
1222     uint8_t full[24 * 17];                                                    \
1223     uint8_t halfH[272];                                                       \
1224                                                                               \
1225     copy_block17(full, src, 24, stride, 17);                                  \
1226     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1227     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1228     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1229 }                                                                             \
1230                                                                               \
1231 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src,               \
1232                                     ptrdiff_t stride)                         \
1233 {                                                                             \
1234     uint8_t halfH[272];                                                       \
1235                                                                               \
1236     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1237     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1238 }
1239
1240 #define op_avg(a, b)        a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
1241 #define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5])     >> 1)
1242 #define op_put(a, b)        a = cm[((b) + 16) >> 5]
1243 #define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
1244
1245 QPEL_MC(0, put_, _, op_put)
1246 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1247 QPEL_MC(0, avg_, _, op_avg)
1248
1249 #undef op_avg
1250 #undef op_put
1251 #undef op_put_no_rnd
1252
/**
 * Fixed-size wrapper: forwards dst, src and stride to put_pixels8_8_c()
 * with a height of 8 rows.
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}
1257
/**
 * Fixed-size wrapper: forwards dst, src and stride to avg_pixels8_8_c()
 * with a height of 8 rows.
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}
1262
/**
 * Fixed-size wrapper: forwards dst, src and stride to put_pixels16_8_c()
 * with a height of 16 rows.
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}
1267
/**
 * Fixed-size wrapper: forwards dst, src and stride to avg_pixels16_8_c()
 * with a height of 16 rows.
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
1272
/*
 * The mc00 qpel entry points (presumably the position with no sub-pel
 * offset -- confirm against the table that uses them) are plain aliases of
 * the fixed-size wrappers above. The no_rnd put variants map to the same
 * functions as the rounding ones.
 */
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1279
1280 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
1281                                   int dstStride, int srcStride, int h)
1282 {
1283     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1284     int i;
1285
1286     for (i = 0; i < h; i++) {
1287         dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
1288         dst[1] = cm[(9 * (src[1] + src[2]) - (src[0]  + src[3]) + 8) >> 4];
1289         dst[2] = cm[(9 * (src[2] + src[3]) - (src[1]  + src[4]) + 8) >> 4];
1290         dst[3] = cm[(9 * (src[3] + src[4]) - (src[2]  + src[5]) + 8) >> 4];
1291         dst[4] = cm[(9 * (src[4] + src[5]) - (src[3]  + src[6]) + 8) >> 4];
1292         dst[5] = cm[(9 * (src[5] + src[6]) - (src[4]  + src[7]) + 8) >> 4];
1293         dst[6] = cm[(9 * (src[6] + src[7]) - (src[5]  + src[8]) + 8) >> 4];
1294         dst[7] = cm[(9 * (src[7] + src[8]) - (src[6]  + src[9]) + 8) >> 4];
1295         dst   += dstStride;
1296         src   += srcStride;
1297     }
1298 }
1299
#if CONFIG_DIRAC_DECODER
/*
 * Generate the C Dirac pixel functions for one operation prefix OPNAME
 * (instantiated below for "put" and "avg").
 *
 * For widths 8, 16 and 32 three flavours are emitted:
 *   - plain: forwards src[0] to OPNAME_pixelsN_8_c
 *   - _l2:   forwards src[0], src[1] to OPNAME_pixelsN_l2_8
 *   - _l4:   forwards src[0] .. src[3] to OPNAME_pixelsN_l4_8
 * The same stride is used for the destination and every source. The
 * 32-wide versions are built from two 16-wide calls, the second one offset
 * by 16 bytes in dst and in each source.
 */
#define DIRAC_MC(OPNAME)                                                      \
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5],   \
                                       int stride, int h)                     \
{                                                                             \
    OPNAME ## _pixels8_8_c(dst, src[0], stride, h);                           \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5],  \
                                        int stride, int h)                    \
{                                                                             \
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);                          \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5],  \
                                        int stride, int h)                    \
{                                                                             \
    OPNAME ## _pixels16_8_c(dst,      src[0],      stride, h);                \
    OPNAME ## _pixels16_8_c(dst + 16, src[0] + 16, stride, h);                \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst,                       \
                                          const uint8_t *src[5],              \
                                          int stride, int h)                  \
{                                                                             \
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1],                              \
                            stride, stride, stride, h);                       \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst,                      \
                                           const uint8_t *src[5],             \
                                           int stride, int h)                 \
{                                                                             \
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1],                             \
                             stride, stride, stride, h);                      \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst,                      \
                                           const uint8_t *src[5],             \
                                           int stride, int h)                 \
{                                                                             \
    OPNAME ## _pixels16_l2_8(dst,      src[0],      src[1],                   \
                             stride, stride, stride, h);                      \
    OPNAME ## _pixels16_l2_8(dst + 16, src[0] + 16, src[1] + 16,              \
                             stride, stride, stride, h);                      \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst,                       \
                                          const uint8_t *src[5],              \
                                          int stride, int h)                  \
{                                                                             \
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3],              \
                            stride, stride, stride, stride, stride, h);       \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst,                      \
                                           const uint8_t *src[5],             \
                                           int stride, int h)                 \
{                                                                             \
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3],             \
                             stride, stride, stride, stride, stride, h);      \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst,                      \
                                           const uint8_t *src[5],             \
                                           int stride, int h)                 \
{                                                                             \
    OPNAME ## _pixels16_l4_8(dst,                                             \
                             src[0], src[1], src[2], src[3],                  \
                             stride, stride, stride, stride, stride, h);      \
    OPNAME ## _pixels16_l4_8(dst + 16,                                        \
                             src[0] + 16, src[1] + 16,                        \
                             src[2] + 16, src[3] + 16,                        \
                             stride, stride, stride, stride, stride, h);      \
}

DIRAC_MC(put)
DIRAC_MC(avg)
#endif
1344
1345 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
1346                                   int dstStride, int srcStride, int w)
1347 {
1348     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1349     int i;
1350
1351     for (i = 0; i < w; i++) {
1352         const int src_1 = src[-srcStride];
1353         const int src0  = src[0];
1354         const int src1  = src[srcStride];
1355         const int src2  = src[2 * srcStride];
1356         const int src3  = src[3 * srcStride];
1357         const int src4  = src[4 * srcStride];
1358         const int src5  = src[5 * srcStride];
1359         const int src6  = src[6 * srcStride];
1360         const int src7  = src[7 * srcStride];
1361         const int src8  = src[8 * srcStride];
1362         const int src9  = src[9 * srcStride];
1363         dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
1364         dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0  + src3) + 8) >> 4];
1365         dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1  + src4) + 8) >> 4];
1366         dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2  + src5) + 8) >> 4];
1367         dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3  + src6) + 8) >> 4];
1368         dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4  + src7) + 8) >> 4];
1369         dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5  + src8) + 8) >> 4];
1370         dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6  + src9) + 8) >> 4];
1371         src++;
1372         dst++;
1373     }
1374 }
1375
/**
 * Horizontally filter an 8x8 block into a temporary buffer, then combine
 * that buffer with the unshifted source via put_pixels8_l2_8() into dst.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8]; /* 8 rows of 8 bytes, row pitch 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1383
/**
 * Write the horizontally filtered 8x8 block straight into dst; source and
 * destination use the same stride.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1388
/**
 * Horizontally filter an 8x8 block into a temporary buffer, then combine
 * that buffer with the source shifted one pixel to the right (src + 1) via
 * put_pixels8_l2_8() into dst.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8]; /* 8 rows of 8 bytes, row pitch 8 */

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1396
/**
 * Write the vertically filtered 8x8 block (8 columns) straight into dst;
 * source and destination use the same stride.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1401
/**
 * Combine a vertically filtered block with a horizontally-then-vertically
 * filtered block.
 *
 * The horizontal pass covers 11 rows starting one row above src, so that
 * the vertical pass over it (started at its second row) has the rows
 * -1 .. 9 it reads available.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11]; /* 11 horizontally filtered rows, pitch 8 */
    uint8_t v_only[8 * 8];  /* vertical filter of the source          */
    uint8_t hv[8 * 8];      /* vertical filter of h_rows              */

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
1413
/**
 * Same as the mc12 variant, except that the vertical-only filter is applied
 * to the source shifted one pixel to the right (src + 1).
 *
 * The horizontal pass covers 11 rows starting one row above src, so that
 * the vertical pass over it (started at its second row) has the rows
 * -1 .. 9 it reads available.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11]; /* 11 horizontally filtered rows, pitch 8 */
    uint8_t v_only[8 * 8];  /* vertical filter of src + 1             */
    uint8_t hv[8 * 8];      /* vertical filter of h_rows              */

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hv, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, hv, stride, 8, 8, 8);
}
1425
/**
 * Apply the horizontal filter to 11 rows (starting one row above src) into
 * a temporary buffer, then run the vertical filter over that buffer,
 * starting at its second row, directly into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11]; /* 11 horizontally filtered rows, pitch 8 */

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, h_rows + 8, stride, 8, 8);
}
1433
1434 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1435                               int line_size, int h)
1436 {
1437     int s = 0, i;
1438
1439     for (i = 0; i < h; i++) {
1440         s    += abs(pix1[0]  - pix2[0]);
1441         s    += abs(pix1[1]  - pix2[1]);
1442         s    += abs(pix1[2]  - pix2[2]);
1443         s    += abs(pix1[3]  - pix2[3]);
1444         s    += abs(pix1[4]  - pix2[4]);
1445         s    += abs(pix1[5]  - pix2[5]);
1446         s    += abs(pix1[6]  - pix2[6]);
1447         s    += abs(pix1[7]  - pix2[7]);
1448         s    += abs(pix1[8]  - pix2[8]);
1449         s    += abs(pix1[9]  - pix2[9]);
1450         s    += abs(pix1[10] - pix2[10]);
1451         s    += abs(pix1[11] - pix2[11]);
1452         s    += abs(pix1[12] - pix2[12]);
1453         s    += abs(pix1[13] - pix2[13]);
1454         s    += abs(pix1[14] - pix2[14]);
1455         s    += abs(pix1[15] - pix2[15]);
1456         pix1 += line_size;
1457         pix2 += line_size;
1458     }
1459     return s;
1460 }
1461
1462 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1463                           int line_size, int h)
1464 {
1465     int s = 0, i;
1466
1467     for (i = 0; i < h; i++) {
1468         s    += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
1469         s    += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
1470         s    += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
1471         s    += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
1472         s    += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
1473         s    += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
1474         s    += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
1475         s    += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
1476         s    += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
1477         s    += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
1478         s    += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1479         s    += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1480         s    += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1481         s    += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1482         s    += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1483         s    += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1484         pix1 += line_size;
1485         pix2 += line_size;
1486     }
1487     return s;
1488 }
1489
1490 static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1491                           int line_size, int h)
1492 {
1493     int s = 0, i;
1494     uint8_t *pix3 = pix2 + line_size;
1495
1496     for (i = 0; i < h; i++) {
1497         s    += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
1498         s    += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
1499         s    += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
1500         s    += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
1501         s    += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
1502         s    += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
1503         s    += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
1504         s    += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
1505         s    += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
1506         s    += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
1507         s    += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1508         s    += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1509         s    += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1510         s    += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1511         s    += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1512         s    += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1513         pix1 += line_size;
1514         pix2 += line_size;
1515         pix3 += line_size;
1516     }
1517     return s;
1518 }
1519
1520 static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1521                            int line_size, int h)
1522 {
1523     int s = 0, i;
1524     uint8_t *pix3 = pix2 + line_size;
1525
1526     for (i = 0; i < h; i++) {
1527         s    += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
1528         s    += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
1529         s    += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
1530         s    += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
1531         s    += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
1532         s    += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
1533         s    += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
1534         s    += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
1535         s    += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
1536         s    += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
1537         s    += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1538         s    += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1539         s    += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1540         s    += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1541         s    += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1542         s    += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1543         pix1 += line_size;
1544         pix2 += line_size;
1545         pix3 += line_size;
1546     }
1547     return s;
1548 }
1549
1550 static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1551                              int line_size, int h)
1552 {
1553     int s = 0, i;
1554
1555     for (i = 0; i < h; i++) {
1556         s    += abs(pix1[0] - pix2[0]);
1557         s    += abs(pix1[1] - pix2[1]);
1558         s    += abs(pix1[2] - pix2[2]);
1559         s    += abs(pix1[3] - pix2[3]);
1560         s    += abs(pix1[4] - pix2[4]);
1561         s    += abs(pix1[5] - pix2[5]);
1562         s    += abs(pix1[6] - pix2[6]);
1563         s    += abs(pix1[7] - pix2[7]);
1564         pix1 += line_size;
1565         pix2 += line_size;
1566     }
1567     return s;
1568 }
1569
1570 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1571                          int line_size, int h)
1572 {
1573     int s = 0, i;
1574
1575     for (i = 0; i < h; i++) {
1576         s    += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1577         s    += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1578         s    += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1579         s    += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1580         s    += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1581         s    += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1582         s    += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1583         s    += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1584         pix1 += line_size;
1585         pix2 += line_size;
1586     }
1587     return s;
1588 }
1589
1590 static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1591                          int line_size, int h)
1592 {
1593     int s = 0, i;
1594     uint8_t *pix3 = pix2 + line_size;
1595
1596     for (i = 0; i < h; i++) {
1597         s    += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1598         s    += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1599         s    += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1600         s    += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1601         s    += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1602         s    += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1603         s    += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1604         s    += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1605         pix1 += line_size;
1606         pix2 += line_size;
1607         pix3 += line_size;
1608     }
1609     return s;
1610 }
1611
1612 static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1613                           int line_size, int h)
1614 {
1615     int s = 0, i;
1616     uint8_t *pix3 = pix2 + line_size;
1617
1618     for (i = 0; i < h; i++) {
1619         s    += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1620         s    += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1621         s    += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1622         s    += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1623         s    += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1624         s    += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1625         s    += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1626         s    += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1627         pix1 += line_size;
1628         pix2 += line_size;
1629         pix3 += line_size;
1630     }
1631     return s;
1632 }
1633
1634 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1635 {
1636     int score1 = 0, score2 = 0, x, y;
1637
1638     for (y = 0; y < h; y++) {
1639         for (x = 0; x < 16; x++)
1640             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1641         if (y + 1 < h) {
1642             for (x = 0; x < 15; x++)
1643                 score2 += FFABS(s1[x]     - s1[x + stride] -
1644                                 s1[x + 1] + s1[x + stride + 1]) -
1645                           FFABS(s2[x]     - s2[x + stride] -
1646                                 s2[x + 1] + s2[x + stride + 1]);
1647         }
1648         s1 += stride;
1649         s2 += stride;
1650     }
1651
1652     if (c)
1653         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1654     else
1655         return score1 + FFABS(score2) * 8;
1656 }
1657
1658 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1659 {
1660     int score1 = 0, score2 = 0, x, y;
1661
1662     for (y = 0; y < h; y++) {
1663         for (x = 0; x < 8; x++)
1664             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1665         if (y + 1 < h) {
1666             for (x = 0; x < 7; x++)
1667                 score2 += FFABS(s1[x]     - s1[x + stride] -
1668                                 s1[x + 1] + s1[x + stride + 1]) -
1669                           FFABS(s2[x]     - s2[x + stride] -
1670                                 s2[x + 1] + s2[x + stride + 1]);
1671         }
1672         s1 += stride;
1673         s2 += stride;
1674     }
1675
1676     if (c)
1677         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1678     else
1679         return score1 + FFABS(score2) * 8;
1680 }
1681
1682 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
1683                           int16_t basis[64], int scale)
1684 {
1685     int i;
1686     unsigned int sum = 0;
1687
1688     for (i = 0; i < 8 * 8; i++) {
1689         int b = rem[i] + ((basis[i] * scale +
1690                            (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1691                           (BASIS_SHIFT - RECON_SHIFT));
1692         int w = weight[i];
1693         b >>= RECON_SHIFT;
1694         av_assert2(-512 < b && b < 512);
1695
1696         sum += (w * b) * (w * b) >> 4;
1697     }
1698     return sum >> 2;
1699 }
1700
1701 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
1702 {
1703     int i;
1704
1705     for (i = 0; i < 8 * 8; i++)
1706         rem[i] += (basis[i] * scale +
1707                    (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1708                   (BASIS_SHIFT - RECON_SHIFT);
1709 }
1710
/**
 * Comparison function that ignores all of its arguments and always
 * returns 0. Installed by ff_set_cmp() for FF_CMP_ZERO.
 */
static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
                    int stride, int h)
{
    return 0;
}
1716
1717 void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
1718 {
1719     int i;
1720
1721     memset(cmp, 0, sizeof(void *) * 6);
1722
1723     for (i = 0; i < 6; i++) {
1724         switch (type & 0xFF) {
1725         case FF_CMP_SAD:
1726             cmp[i] = c->sad[i];
1727             break;
1728         case FF_CMP_SATD:
1729             cmp[i] = c->hadamard8_diff[i];
1730             break;
1731         case FF_CMP_SSE:
1732             cmp[i] = c->sse[i];
1733             break;
1734         case FF_CMP_DCT:
1735             cmp[i] = c->dct_sad[i];
1736             break;
1737         case FF_CMP_DCT264:
1738             cmp[i] = c->dct264_sad[i];
1739             break;
1740         case FF_CMP_DCTMAX:
1741             cmp[i] = c->dct_max[i];
1742             break;
1743         case FF_CMP_PSNR:
1744             cmp[i] = c->quant_psnr[i];
1745             break;
1746         case FF_CMP_BIT:
1747             cmp[i] = c->bit[i];
1748             break;
1749         case FF_CMP_RD:
1750             cmp[i] = c->rd[i];
1751             break;
1752         case FF_CMP_VSAD:
1753             cmp[i] = c->vsad[i];
1754             break;
1755         case FF_CMP_VSSE:
1756             cmp[i] = c->vsse[i];
1757             break;
1758         case FF_CMP_ZERO:
1759             cmp[i] = zero_cmp;
1760             break;
1761         case FF_CMP_NSSE:
1762             cmp[i] = c->nsse[i];
1763             break;
1764 #if CONFIG_DWT
1765         case FF_CMP_W53:
1766             cmp[i]= c->w53[i];
1767             break;
1768         case FF_CMP_W97:
1769             cmp[i]= c->w97[i];
1770             break;
1771 #endif
1772         default:
1773             av_log(NULL, AV_LOG_ERROR,
1774                    "internal error in cmp function selection\n");
1775         }
1776     }
1777 }
1778
/*
 * Butterfly helpers for the 8x8 Hadamard transforms below.
 *
 * The statement macros are wrapped in do { } while (0) so that each use
 * followed by a semicolon is exactly one statement, which keeps them safe
 * under an unbraced if/else.
 */

/* o1 = i1 + i2, o2 = i1 - i2; i1 and i2 are each evaluated twice. */
#define BUTTERFLY2(o1, o2, i1, i2)              \
    do {                                        \
        (o1) = (i1) + (i2);                     \
        (o2) = (i1) - (i2);                     \
    } while (0)

/* In-place butterfly: x becomes x + y and y becomes x - y (old values).
 * The temporaries carry a bf_ prefix so they cannot capture an argument
 * that happens to be called a or b. */
#define BUTTERFLY1(x, y)                        \
    do {                                        \
        const int bf_a = (x);                   \
        const int bf_b = (y);                   \
        (x) = bf_a + bf_b;                      \
        (y) = bf_a - bf_b;                      \
    } while (0)

/* |x + y| + |x - y|; both arguments are evaluated several times. */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
1793
1794 static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
1795                                uint8_t *src, int stride, int h)
1796 {
1797     int i, temp[64], sum = 0;
1798
1799     av_assert2(h == 8);
1800
1801     for (i = 0; i < 8; i++) {
1802         // FIXME: try pointer walks
1803         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1804                    src[stride * i + 0] - dst[stride * i + 0],
1805                    src[stride * i + 1] - dst[stride * i + 1]);
1806         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1807                    src[stride * i + 2] - dst[stride * i + 2],
1808                    src[stride * i + 3] - dst[stride * i + 3]);
1809         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1810                    src[stride * i + 4] - dst[stride * i + 4],
1811                    src[stride * i + 5] - dst[stride * i + 5]);
1812         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1813                    src[stride * i + 6] - dst[stride * i + 6],
1814                    src[stride * i + 7] - dst[stride * i + 7]);
1815
1816         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1817         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1818         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1819         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1820
1821         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1822         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1823         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1824         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
1825     }
1826
1827     for (i = 0; i < 8; i++) {
1828         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1829         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1830         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1831         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1832
1833         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1834         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1835         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1836         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1837
1838         sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
1839                BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
1840                BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
1841                BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
1842     }
1843     return sum;
1844 }
1845
1846 static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
1847                                 uint8_t *dummy, int stride, int h)
1848 {
1849     int i, temp[64], sum = 0;
1850
1851     av_assert2(h == 8);
1852
1853     for (i = 0; i < 8; i++) {
1854         // FIXME: try pointer walks
1855         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1856                    src[stride * i + 0], src[stride * i + 1]);
1857         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1858                    src[stride * i + 2], src[stride * i + 3]);
1859         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1860                    src[stride * i + 4], src[stride * i + 5]);
1861         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1862                    src[stride * i + 6], src[stride * i + 7]);
1863
1864         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1865         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1866         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1867         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1868
1869         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1870         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1871         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1872         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
1873     }
1874
1875     for (i = 0; i < 8; i++) {
1876         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1877         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1878         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1879         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1880
1881         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1882         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1883         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1884         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1885
1886         sum +=
1887             BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
1888             + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
1889             + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
1890             + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
1891     }
1892
1893     sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
1894
1895     return sum;
1896 }
1897
1898 static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
1899                         uint8_t *src2, int stride, int h)
1900 {
1901     LOCAL_ALIGNED_16(int16_t, temp, [64]);
1902
1903     av_assert2(h == 8);
1904
1905     s->dsp.diff_pixels(temp, src1, src2, stride);
1906     s->dsp.fdct(temp);
1907     return s->dsp.sum_abs_dctelem(temp);
1908 }
1909
#if CONFIG_GPL
/*
 * One 8-point integer transform. The user defines SRC(x) to read input
 * element x and DST(x, v) to consume output element x with value v.
 * All SRC() reads happen before the first DST(), so SRC and DST may refer
 * to the same storage.
 */
#define DCT8_1D                                         \
    {                                                   \
        const int s07 = SRC(0) + SRC(7);                \
        const int s16 = SRC(1) + SRC(6);                \
        const int s25 = SRC(2) + SRC(5);                \
        const int s34 = SRC(3) + SRC(4);                \
        const int a0  = s07 + s34;                      \
        const int a1  = s16 + s25;                      \
        const int a2  = s07 - s34;                      \
        const int a3  = s16 - s25;                      \
        const int d07 = SRC(0) - SRC(7);                \
        const int d16 = SRC(1) - SRC(6);                \
        const int d25 = SRC(2) - SRC(5);                \
        const int d34 = SRC(3) - SRC(4);                \
        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
        DST(0, a0 + a1);                                \
        DST(1, a4 + (a7 >> 2));                         \
        DST(2, a2 + (a3 >> 1));                         \
        DST(3, a5 + (a6 >> 2));                         \
        DST(4, a0 - a1);                                \
        DST(5, a6 - (a5 >> 2));                         \
        DST(6, (a2 >> 1) - a3);                         \
        DST(7, (a4 >> 2) - a7);                         \
    }

/**
 * Apply DCT8_1D to the rows and then the columns of the pixel difference
 * of two 8x8 blocks, and return the sum of absolute values of the final
 * outputs.
 *
 * @param s      context supplying diff_pixels
 * @param src1   first 8x8 pixel block
 * @param src2   second 8x8 pixel block
 * @param stride distance in bytes between rows of both blocks
 * @param h      not used by this function
 */
static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    int16_t dct[8][8];
    int n, total = 0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform every row in place */
#define SRC(x) dct[n][x]
#define DST(x, v) dct[n][x] = v
    for (n = 0; n < 8; n++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* second pass: transform every column, accumulating |output| only */
#define SRC(x) dct[x][n]
#define DST(x, v) total += FFABS(v)
    for (n = 0; n < 8; n++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return total;
}
#endif
1963
1964 static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
1965                         uint8_t *src2, int stride, int h)
1966 {
1967     LOCAL_ALIGNED_16(int16_t, temp, [64]);
1968     int sum = 0, i;
1969
1970     av_assert2(h == 8);
1971
1972     s->dsp.diff_pixels(temp, src1, src2, stride);
1973     s->dsp.fdct(temp);
1974
1975     for (i = 0; i < 64; i++)
1976         sum = FFMAX(sum, FFABS(temp[i]));
1977
1978     return sum;
1979 }
1980
1981 static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
1982                            uint8_t *src2, int stride, int h)
1983 {
1984     LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
1985     int16_t *const bak = temp + 64;
1986     int sum = 0, i;
1987
1988     av_assert2(h == 8);
1989     s->mb_intra = 0;
1990
1991     s->dsp.diff_pixels(temp, src1, src2, stride);
1992
1993     memcpy(bak, temp, 64 * sizeof(int16_t));
1994
1995     s->block_last_index[0 /* FIXME */] =
1996         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
1997     s->dct_unquantize_inter(s, temp, 0, s->qscale);
1998     ff_simple_idct_8(temp); // FIXME
1999
2000     for (i = 0; i < 64; i++)
2001         sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
2002
2003     return sum;
2004 }
2005
2006 static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2007                    int stride, int h)
2008 {
2009     const uint8_t *scantable = s->intra_scantable.permutated;
2010     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2011     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2012     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2013     int i, last, run, bits, level, distortion, start_i;
2014     const int esc_length = s->ac_esc_length;
2015     uint8_t *length, *last_length;
2016
2017     av_assert2(h == 8);
2018
2019     copy_block8(lsrc1, src1, 8, stride, 8);
2020     copy_block8(lsrc2, src2, 8, stride, 8);
2021
2022     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2023
2024     s->block_last_index[0 /* FIXME */] =
2025     last                               =
2026         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2027
2028     bits = 0;
2029
2030     if (s->mb_intra) {
2031         start_i     = 1;
2032         length      = s->intra_ac_vlc_length;
2033         last_length = s->intra_ac_vlc_last_length;
2034         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2035     } else {
2036         start_i     = 0;
2037         length      = s->inter_ac_vlc_length;
2038         last_length = s->inter_ac_vlc_last_length;
2039     }
2040
2041     if (last >= start_i) {
2042         run = 0;
2043         for (i = start_i; i < last; i++) {
2044             int j = scantable[i];
2045             level = temp[j];
2046
2047             if (level) {
2048                 level += 64;
2049                 if ((level & (~127)) == 0)
2050                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2051                 else
2052                     bits += esc_length;
2053                 run = 0;
2054             } else
2055                 run++;
2056         }
2057         i = scantable[last];
2058
2059         level = temp[i] + 64;
2060
2061         av_assert2(level - 64);
2062
2063         if ((level & (~127)) == 0) {
2064             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2065         } else
2066             bits += esc_length;
2067     }
2068
2069     if (last >= 0) {
2070         if (s->mb_intra)
2071             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2072         else
2073             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2074     }
2075
2076     s->dsp.idct_add(lsrc2, 8, temp);
2077
2078     distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2079
2080     return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
2081 }
2082
2083 static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2084                     int stride, int h)
2085 {
2086     const uint8_t *scantable = s->intra_scantable.permutated;
2087     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2088     int i, last, run, bits, level, start_i;
2089     const int esc_length = s->ac_esc_length;
2090     uint8_t *length, *last_length;
2091
2092     av_assert2(h == 8);
2093
2094     s->dsp.diff_pixels(temp, src1, src2, stride);
2095
2096     s->block_last_index[0 /* FIXME */] =
2097     last                               =
2098         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2099
2100     bits = 0;
2101
2102     if (s->mb_intra) {
2103         start_i     = 1;
2104         length      = s->intra_ac_vlc_length;
2105         last_length = s->intra_ac_vlc_last_length;
2106         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2107     } else {
2108         start_i     = 0;
2109         length      = s->inter_ac_vlc_length;
2110         last_length = s->inter_ac_vlc_last_length;
2111     }
2112
2113     if (last >= start_i) {
2114         run = 0;
2115         for (i = start_i; i < last; i++) {
2116             int j = scantable[i];
2117             level = temp[j];
2118
2119             if (level) {
2120                 level += 64;
2121                 if ((level & (~127)) == 0)
2122                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2123                 else
2124                     bits += esc_length;
2125                 run = 0;
2126             } else
2127                 run++;
2128         }
2129         i = scantable[last];
2130
2131         level = temp[i] + 64;
2132
2133         av_assert2(level - 64);
2134
2135         if ((level & (~127)) == 0)
2136             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2137         else
2138             bits += esc_length;
2139     }
2140
2141     return bits;
2142 }
2143
2144 #define VSAD_INTRA(size)                                                \
2145 static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
2146                                     uint8_t *s, uint8_t *dummy,         \
2147                                     int stride, int h)                  \
2148 {                                                                       \
2149     int score = 0, x, y;                                                \
2150                                                                         \
2151     for (y = 1; y < h; y++) {                                           \
2152         for (x = 0; x < size; x += 4) {                                 \
2153             score += FFABS(s[x]     - s[x + stride])     +              \
2154                      FFABS(s[x + 1] - s[x + stride + 1]) +              \
2155                      FFABS(s[x + 2] - s[x + 2 + stride]) +              \
2156                      FFABS(s[x + 3] - s[x + 3 + stride]);               \
2157         }                                                               \
2158         s += stride;                                                    \
2159     }                                                                   \
2160                                                                         \
2161     return score;                                                       \
2162 }
2163 VSAD_INTRA(8)
2164 VSAD_INTRA(16)
2165
2166 static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2167                     int stride, int h)
2168 {
2169     int score = 0, x, y;
2170
2171     for (y = 1; y < h; y++) {
2172         for (x = 0; x < 16; x++)
2173             score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2174         s1 += stride;
2175         s2 += stride;
2176     }
2177
2178     return score;
2179 }
2180
2181 #define SQ(a) ((a) * (a))
2182 #define VSSE_INTRA(size)                                                \
2183 static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
2184                                     uint8_t *s, uint8_t *dummy,         \
2185                                     int stride, int h)                  \
2186 {                                                                       \
2187     int score = 0, x, y;                                                \
2188                                                                         \
2189     for (y = 1; y < h; y++) {                                           \
2190         for (x = 0; x < size; x += 4) {                                 \
2191             score += SQ(s[x]     - s[x + stride]) +                     \
2192                      SQ(s[x + 1] - s[x + stride + 1]) +                 \
2193                      SQ(s[x + 2] - s[x + stride + 2]) +                 \
2194                      SQ(s[x + 3] - s[x + stride + 3]);                  \
2195         }                                                               \
2196         s += stride;                                                    \
2197     }                                                                   \
2198                                                                         \
2199     return score;                                                       \
2200 }
2201 VSSE_INTRA(8)
2202 VSSE_INTRA(16)
2203
2204 static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2205                     int stride, int h)
2206 {
2207     int score = 0, x, y;
2208
2209     for (y = 1; y < h; y++) {
2210         for (x = 0; x < 16; x++)
2211             score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2212         s1 += stride;
2213         s2 += stride;
2214     }
2215
2216     return score;
2217 }
2218
/**
 * Sum of squared differences between an int8_t vector and an int16_t
 * vector.
 *
 * @param pix1 first vector
 * @param pix2 second vector
 * @param size number of elements compared; nothing is read if size <= 0
 * @return sum over all elements of (pix1[k] - pix2[k])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        total += d * d;
    }

    return total;
}
2228
2229 #define WRAPPER8_16_SQ(name8, name16)                                   \
2230 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
2231                   int stride, int h)                                    \
2232 {                                                                       \
2233     int score = 0;                                                      \
2234                                                                         \
2235     score += name8(s, dst, src, stride, 8);                             \
2236     score += name8(s, dst + 8, src + 8, stride, 8);                     \
2237     if (h == 16) {                                                      \
2238         dst   += 8 * stride;                                            \
2239         src   += 8 * stride;                                            \
2240         score += name8(s, dst, src, stride, 8);                         \
2241         score += name8(s, dst + 8, src + 8, stride, 8);                 \
2242     }                                                                   \
2243     return score;                                                       \
2244 }
2245
2246 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2247 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2248 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2249 #if CONFIG_GPL
2250 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2251 #endif
2252 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2253 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2254 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2255 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2256
/**
 * Clip one value using unsigned integer comparisons only.
 *
 * @param a        value to clip
 * @param mini     returned when a > mini (unsigned compare)
 * @param maxi     returned when (a with bit 31 flipped) > maxisign
 * @param maxisign threshold for the flipped comparison; the caller in this
 *                 file passes maxi with bit 31 flipped
 * @return mini, maxi or a, tested in that order
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U << 31);

    if (a > mini)
        return mini;

    return flipped > maxisign ? maxi : a;
}
2267
/**
 * Clip a float vector by running clipf_c_one() on the 32-bit patterns of
 * its elements. vector_clipf_c() calls this only when min < 0 and max > 0.
 *
 * Elements are processed in groups of 8, so a len that is not a multiple
 * of 8 is effectively rounded up to the next multiple.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src,
                                         float *min, float *max, int len)
{
    const uint32_t lo_bits      = *(uint32_t *) min;
    const uint32_t hi_bits      = *(uint32_t *) max;
    const uint32_t hi_bits_sign = hi_bits ^ (1U << 31);
    uint32_t *out               = (uint32_t *) dst;
    const uint32_t *in          = (const uint32_t *) src;
    int base, k;

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits,
                                        hi_bits, hi_bits_sign);
    }
}
2289
/**
 * Clip every element of a float vector to [min, max].
 *
 * When min < 0 and max > 0 the work is delegated to
 * vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied
 * to each element. Elements are processed in groups of 8, so a len that is
 * not a multiple of 8 is effectively rounded up to the next multiple.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clipf_c(float *dst, const float *src,
                           float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2310
/**
 * Dot product of two int16_t vectors.
 *
 * The sum is accumulated in an unsigned variable so that a sum exceeding
 * the int32_t range wraps modulo 2^32 instead of invoking signed-overflow
 * undefined behaviour. Each individual product always fits in an int.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements; values <= 0 yield 0
 * @return the (possibly wrapped) sum of v1[i] * v2[i]
 */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
{
    unsigned res = 0;
    int i;

    for (i = 0; i < order; i++)
        res += v1[i] * v2[i];

    return (int32_t) res;
}
2321
/**
 * Compute the dot product of v1 and v2 and, in the same pass, update v1
 * in place with v1[i] += mul * v3[i].
 *
 * Each product uses the value of v1[i] from before its update. The sum is
 * accumulated in an unsigned variable so that exceeding the int32_t range
 * wraps modulo 2^32 instead of invoking signed-overflow undefined
 * behaviour.
 *
 * @param v1    first vector; updated in place
 * @param v2    second vector of the dot product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements; values <= 0 do nothing and yield 0
 * @param mul   multiplier applied to v3
 * @return the (possibly wrapped) sum of old v1[i] * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    unsigned res = 0;
    int i;

    for (i = 0; i < order; i++) {
        res   += v1[i] * v2[i];
        v1[i] += mul * v3[i];
    }

    return (int32_t) res;
}
2334
/**
 * Clip every element of an int32_t vector to [min, max] using av_clip().
 *
 * Exactly len elements are processed. This replaces an unrolled
 * do/while that subtracted 8 from the unsigned length, which wrapped
 * around and ran past both buffers when len was 0 or not a multiple
 * of 8. Results for non-zero multiples of 8 are unchanged.
 *
 * @param dst destination vector
 * @param src source vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    unsigned int i;

    for (i = 0; i < len; i++)
        dst[i] = av_clip(src[i], min, max);
}
2350
/**
 * Run ff_j_rev_dct() on block in place, then hand the result to
 * put_pixels_clamped_c() together with dest and line_size.
 */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);

    put_pixels_clamped_c(block, dest, line_size);
}
2356
/**
 * Run ff_j_rev_dct() on block in place, then hand the result to
 * add_pixels_clamped_c() together with dest and line_size.
 */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);

    add_pixels_clamped_c(block, dest, line_size);
}
2362
/**
 * Run ff_j_rev_dct4() on block in place, then hand the result to
 * put_pixels_clamped4_c() together with dest and line_size.
 * ff_dsputil_init() installs this as idct_put when lowres == 1.
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4(block);

    put_pixels_clamped4_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct4() on block in place, then hand the result to
 * add_pixels_clamped4_c() together with dest and line_size.
 * ff_dsputil_init() installs this as idct_add when lowres == 1.
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4(block);

    add_pixels_clamped4_c(block, dest, line_size);
}
2373
/**
 * Run ff_j_rev_dct2() on block in place, then hand the result to
 * put_pixels_clamped2_c() together with dest and line_size.
 * ff_dsputil_init() installs this as idct_put when lowres == 2.
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2(block);

    put_pixels_clamped2_c(block, dest, line_size);
}
/**
 * Run ff_j_rev_dct2() on block in place, then hand the result to
 * add_pixels_clamped2_c() together with dest and line_size.
 * ff_dsputil_init() installs this as idct_add when lowres == 2.
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2(block);

    add_pixels_clamped2_c(block, dest, line_size);
}
2384
/**
 * Single-coefficient "put": stores (block[0] + 4) >> 3, clipped to
 * 0..255 by av_clip_uint8(), into dest[0]. line_size is unused.
 * ff_dsputil_init() installs this as idct_put when lowres == 3.
 */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
{
    const int dc = (block[0] + 4) >> 3;

    dest[0] = av_clip_uint8(dc);
}
/**
 * Single-coefficient "add": adds (block[0] + 4) >> 3 to dest[0] and stores
 * the sum clipped to 0..255 by av_clip_uint8(). line_size is unused.
 * ff_dsputil_init() installs this as idct_add when lowres == 3.
 */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
{
    const int dc = (block[0] + 4) >> 3;

    dest[0] = av_clip_uint8(dest[0] + dc);
}
2393
2394 /* draw the edges of width 'w' of an image of size width, height */
2395 // FIXME: Check that this is OK for MPEG-4 interlaced.
2396 static void draw_edges_8_c(uint8_t *buf, int wrap, int width, int height,
2397                            int w, int h, int sides)
2398 {
2399     uint8_t *ptr = buf, *last_line;
2400     int i;
2401
2402     /* left and right */
2403     for (i = 0; i < height; i++) {
2404         memset(ptr - w, ptr[0], w);
2405         memset(ptr + width, ptr[width - 1], w);
2406         ptr += wrap;
2407     }
2408
2409     /* top and bottom + corners */
2410     buf -= w;
2411     last_line = buf + (height - 1) * wrap;
2412     if (sides & EDGE_TOP)
2413         for (i = 0; i < h; i++)
2414             // top
2415             memcpy(buf - (i + 1) * wrap, buf, width + w + w);
2416     if (sides & EDGE_BOTTOM)
2417         for (i = 0; i < h; i++)
2418             // bottom
2419             memcpy(last_line + (i + 1) * wrap, last_line, width + w + w);
2420 }
2421
/**
 * Zero one block of 64 int16_t coefficients.
 *
 * @param block block to clear
 */
static void clear_block_8_c(int16_t *block)
{
    memset(block, 0, 64 * sizeof(*block));
}
2426
/**
 * Zero six consecutive blocks of 64 int16_t coefficients each.
 *
 * @param blocks first of the six contiguous blocks
 */
static void clear_blocks_8_c(int16_t *blocks)
{
    memset(blocks, 0, 6 * 64 * sizeof(*blocks));
}
2431
2432 /* init static data */
2433 av_cold void ff_dsputil_static_init(void)
2434 {
2435     int i;
2436
2437     for (i = 0; i < 512; i++)
2438         ff_square_tab[i] = (i - 256) * (i - 256);
2439 }
2440
2441 int ff_check_alignment(void)
2442 {
2443     static int did_fail = 0;
2444     LOCAL_ALIGNED_16(int, aligned, [4]);
2445
2446     if ((intptr_t)aligned & 15) {
2447         if (!did_fail) {
2448 #if HAVE_MMX || HAVE_ALTIVEC
2449             av_log(NULL, AV_LOG_ERROR,
2450                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2451                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2452                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2453                 "Do not report crashes to FFmpeg developers.\n");
2454 #endif
2455             did_fail=1;
2456         }
2457         return -1;
2458     }
2459     return 0;
2460 }
2461
2462 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2463 {
2464     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
2465
2466     ff_check_alignment();
2467
2468 #if CONFIG_ENCODERS
2469     if (avctx->bits_per_raw_sample == 10) {
2470         c->fdct    = ff_jpeg_fdct_islow_10;
2471         c->fdct248 = ff_fdct248_islow_10;
2472     } else {
2473         if (avctx->dct_algo == FF_DCT_FASTINT) {
2474             c->fdct    = ff_fdct_ifast;
2475             c->fdct248 = ff_fdct_ifast248;
2476         } else if (avctx->dct_algo == FF_DCT_FAAN) {
2477             c->fdct    = ff_faandct;
2478             c->fdct248 = ff_faandct248;
2479         } else {
2480             c->fdct    = ff_jpeg_fdct_islow_8; // slow/accurate/default
2481             c->fdct248 = ff_fdct248_islow_8;
2482         }
2483     }
2484 #endif /* CONFIG_ENCODERS */
2485
2486     if (avctx->lowres==1) {
2487         c->idct_put              = ff_jref_idct4_put;
2488         c->idct_add              = ff_jref_idct4_add;
2489         c->idct                  = ff_j_rev_dct4;
2490         c->idct_permutation_type = FF_NO_IDCT_PERM;
2491     } else if (avctx->lowres==2) {
2492         c->idct_put              =  ff_jref_idct2_put;
2493         c->idct_add              =  ff_jref_idct2_add;
2494         c->idct                  =  ff_j_rev_dct2;
2495         c->idct_permutation_type = FF_NO_IDCT_PERM;
2496     } else if (avctx->lowres==3) {
2497         c->idct_put              =  ff_jref_idct1_put;
2498         c->idct_add              =  ff_jref_idct1_add;
2499         c->idct                  =  ff_j_rev_dct1;
2500         c->idct_permutation_type = FF_NO_IDCT_PERM;
2501     } else {
2502         if (avctx->bits_per_raw_sample == 10) {
2503             c->idct_put              = ff_simple_idct_put_10;
2504             c->idct_add              = ff_simple_idct_add_10;
2505             c->idct                  = ff_simple_idct_10;
2506             c->idct_permutation_type = FF_NO_IDCT_PERM;
2507         } else if (avctx->bits_per_raw_sample == 12) {
2508             c->idct_put              = ff_simple_idct_put_12;
2509             c->idct_add              = ff_simple_idct_add_12;
2510             c->idct                  = ff_simple_idct_12;
2511             c->idct_permutation_type = FF_NO_IDCT_PERM;
2512         } else {
2513         if (avctx->idct_algo == FF_IDCT_INT) {
2514             c->idct_put              = jref_idct_put;
2515             c->idct_add              = jref_idct_add;
2516             c->idct                  = ff_j_rev_dct;
2517             c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2518         } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2519             c->idct_put              = ff_faanidct_put;
2520             c->idct_add              = ff_faanidct_add;
2521             c->idct                  = ff_faanidct;
2522             c->idct_permutation_type = FF_NO_IDCT_PERM;
2523         } else { // accurate/default
2524             c->idct_put              = ff_simple_idct_put_8;
2525             c->idct_add              = ff_simple_idct_add_8;
2526             c->idct                  = ff_simple_idct_8;
2527             c->idct_permutation_type = FF_NO_IDCT_PERM;
2528         }
2529         }
2530     }
2531
2532     c->diff_pixels = diff_pixels_c;
2533
2534     c->put_pixels_clamped        = put_pixels_clamped_c;
2535     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2536     c->add_pixels_clamped        = add_pixels_clamped_c;
2537
2538     c->sum_abs_dctelem = sum_abs_dctelem_c;
2539
2540     c->gmc1 = gmc1_c;
2541     c->gmc  = ff_gmc_c;
2542
2543     c->pix_sum   = pix_sum_c;
2544     c->pix_norm1 = pix_norm1_c;
2545
2546     c->fill_block_tab[0] = fill_block16_c;
2547     c->fill_block_tab[1] = fill_block8_c;
2548
2549     /* TODO [0] 16  [1] 8 */
2550     c->pix_abs[0][0] = pix_abs16_c;
2551     c->pix_abs[0][1] = pix_abs16_x2_c;
2552     c->pix_abs[0][2] = pix_abs16_y2_c;
2553     c->pix_abs[0][3] = pix_abs16_xy2_c;
2554     c->pix_abs[1][0] = pix_abs8_c;
2555     c->pix_abs[1][1] = pix_abs8_x2_c;
2556     c->pix_abs[1][2] = pix_abs8_y2_c;
2557     c->pix_abs[1][3] = pix_abs8_xy2_c;
2558
2559 #define dspfunc(PFX, IDX, NUM)                              \
2560     c->PFX ## _pixels_tab[IDX][0]  = PFX ## NUM ## _mc00_c; \
2561     c->PFX ## _pixels_tab[IDX][1]  = PFX ## NUM ## _mc10_c; \
2562     c->PFX ## _pixels_tab[IDX][2]  = PFX ## NUM ## _mc20_c; \
2563     c->PFX ## _pixels_tab[IDX][3]  = PFX ## NUM ## _mc30_c; \
2564     c->PFX ## _pixels_tab[IDX][4]  = PFX ## NUM ## _mc01_c; \
2565     c->PFX ## _pixels_tab[IDX][5]  = PFX ## NUM ## _mc11_c; \
2566     c->PFX ## _pixels_tab[IDX][6]  = PFX ## NUM ## _mc21_c; \
2567     c->PFX ## _pixels_tab[IDX][7]  = PFX ## NUM ## _mc31_c; \
2568     c->PFX ## _pixels_tab[IDX][8]  = PFX ## NUM ## _mc02_c; \
2569     c->PFX ## _pixels_tab[IDX][9]  = PFX ## NUM ## _mc12_c; \
2570     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2571     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2572     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2573     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2574     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2575     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2576
2577     dspfunc(put_qpel, 0, 16);
2578     dspfunc(put_qpel, 1, 8);
2579
2580     dspfunc(put_no_rnd_qpel, 0, 16);
2581     dspfunc(put_no_rnd_qpel, 1, 8);
2582
2583     dspfunc(avg_qpel, 0, 16);
2584     dspfunc(avg_qpel, 1, 8);
2585
2586 #undef dspfunc
2587
2588     c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
2589     c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
2590     c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
2591     c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
2592     c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
2593     c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
2594     c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
2595     c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
2596
2597 #define SET_CMP_FUNC(name)                      \
2598     c->name[0] = name ## 16_c;                  \
2599     c->name[1] = name ## 8x8_c;
2600
2601     SET_CMP_FUNC(hadamard8_diff)
2602     c->hadamard8_diff[4] = hadamard8_intra16_c;
2603     c->hadamard8_diff[5] = hadamard8_intra8x8_c;
2604     SET_CMP_FUNC(dct_sad)
2605     SET_CMP_FUNC(dct_max)
2606 #if CONFIG_GPL
2607     SET_CMP_FUNC(dct264_sad)
2608 #endif
2609     c->sad[0] = pix_abs16_c;
2610     c->sad[1] = pix_abs8_c;
2611     c->sse[0] = sse16_c;
2612     c->sse[1] = sse8_c;
2613     c->sse[2] = sse4_c;
2614     SET_CMP_FUNC(quant_psnr)
2615     SET_CMP_FUNC(rd)
2616     SET_CMP_FUNC(bit)
2617     c->vsad[0] = vsad16_c;
2618     c->vsad[4] = vsad_intra16_c;
2619     c->vsad[5] = vsad_intra8_c;
2620     c->vsse[0] = vsse16_c;
2621     c->vsse[4] = vsse_intra16_c;
2622     c->vsse[5] = vsse_intra8_c;
2623     c->nsse[0] = nsse16_c;
2624     c->nsse[1] = nsse8_c;
2625 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2626     ff_dsputil_init_dwt(c);
2627 #endif
2628
2629     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2630
2631     c->bswap_buf   = bswap_buf;
2632     c->bswap16_buf = bswap16_buf;
2633
2634     c->try_8x8basis = try_8x8basis_c;
2635     c->add_8x8basis = add_8x8basis_c;
2636
2637     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2638
2639     c->scalarproduct_int16 = scalarproduct_int16_c;
2640     c->vector_clip_int32   = vector_clip_int32_c;
2641     c->vector_clipf        = vector_clipf_c;
2642
2643     c->shrink[0] = av_image_copy_plane;
2644     c->shrink[1] = ff_shrink22;
2645     c->shrink[2] = ff_shrink44;
2646     c->shrink[3] = ff_shrink88;
2647
2648     c->add_pixels8 = add_pixels8_c;
2649
2650     c->draw_edges = draw_edges_8_c;
2651
2652     c->clear_block  = clear_block_8_c;
2653     c->clear_blocks = clear_blocks_8_c;
2654
2655     switch (avctx->bits_per_raw_sample) {
2656     case 9:
2657     case 10:
2658     case 12:
2659     case 14:
2660         c->get_pixels = get_pixels_16_c;
2661         break;
2662     default:
2663         if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
2664             c->get_pixels = get_pixels_8_c;
2665         }
2666         break;
2667     }
2668
2669
2670     if (ARCH_ALPHA)
2671         ff_dsputil_init_alpha(c, avctx);
2672     if (ARCH_ARM)
2673         ff_dsputil_init_arm(c, avctx, high_bit_depth);
2674     if (ARCH_BFIN)
2675         ff_dsputil_init_bfin(c, avctx, high_bit_depth);
2676     if (ARCH_PPC)
2677         ff_dsputil_init_ppc(c, avctx, high_bit_depth);
2678     if (ARCH_X86)
2679         ff_dsputil_init_x86(c, avctx, high_bit_depth);
2680
2681     ff_init_scantable_permutation(c->idct_permutation,
2682                                   c->idct_permutation_type);
2683 }
2684
2685 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2686 {
2687     ff_dsputil_init(c, avctx);
2688 }
2689
2690 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2691 {
2692     ff_dsputil_init(c, avctx);
2693 }