]> git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c
Merge commit '4754345027eb85cfa51aeb88beec68d7b036c11e'
[ffmpeg] / libavcodec / dsputil.c
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24
25 /**
26  * @file
27  * DSP utils
28  */
29
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "libavutil/internal.h"
33 #include "avcodec.h"
34 #include "copy_block.h"
35 #include "dct.h"
36 #include "dsputil.h"
37 #include "simple_idct.h"
38 #include "faandct.h"
39 #include "faanidct.h"
40 #include "imgconvert.h"
41 #include "mathops.h"
42 #include "mpegvideo.h"
43 #include "config.h"
44 #include "diracdsp.h"
45
/* 512-entry lookup table, zero-initialized at this point.
 * The SSE / norm helpers in this file index it through a pointer biased by
 * 256 (ff_square_tab + 256), so indices in the range -256..255 are valid.
 * NOTE(review): presumably filled with squares by an init routine that is
 * not part of this section - confirm. */
uint32_t ff_square_tab[512] = { 0, };
47
/* Include dsputil_template.c once with BIT_DEPTH set to 16 ... */
#define BIT_DEPTH 16
#include "dsputil_template.c"
#undef BIT_DEPTH

/* ... then tpel_template.c and dsputil_template.c with BIT_DEPTH set to 8.
 * BIT_DEPTH is not undefined again, so it stays 8 for the rest of the file. */
#define BIT_DEPTH 8
#include "tpel_template.c"
#include "dsputil_template.c"
55
/* Byte-replicated constants: ~0UL / 255 is 0x0101...01, which multiplied by
 * 0x7f / 0x80 puts that value into every byte of an unsigned long
 * (0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f, ...). The width follows unsigned long,
 * which is not necessarily the CPU's native arithmetic size. */
#define pb_7f (~0UL / 255 * 0x7f)
#define pb_80 (~0UL / 255 * 0x80)
59
/* Specific zigzag scan for the 2-4-8 IDCT. NOTE that unlike the
 * specification, we interleave the fields.
 * 64 entries, laid out here as 8 rows of 8 values. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
72
/* Alternate horizontal scan table, 64 entries.
 * NOTE(review): presumably maps scan position -> raster index inside an 8x8
 * block, in the form ff_init_scantable() takes as src_scantable - confirm. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
83
/* Alternate vertical scan table, 64 entries.
 * NOTE(review): presumably maps scan position -> raster index inside an 8x8
 * block, in the form ff_init_scantable() takes as src_scantable - confirm. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
94
/* Input permutation for the simple_idct_mmx.
 * Copied verbatim into the caller's table by ff_init_scantable_permutation()
 * when the permutation type is FF_SIMPLE_IDCT_PERM. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
106
/* Permutation of the low three index bits (i & 7) used for
 * FF_SSE2_IDCT_PERM; the bits selected by 0x38 are kept unchanged there. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
108
109 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
110                                const uint8_t *src_scantable)
111 {
112     int i, end;
113
114     st->scantable = src_scantable;
115
116     for (i = 0; i < 64; i++) {
117         int j = src_scantable[i];
118         st->permutated[i] = permutation[j];
119     }
120
121     end = -1;
122     for (i = 0; i < 64; i++) {
123         int j = st->permutated[i];
124         if (j > end)
125             end = j;
126         st->raster_end[i] = end;
127     }
128 }
129
130 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
131                                            int idct_permutation_type)
132 {
133     int i;
134
135     switch (idct_permutation_type) {
136     case FF_NO_IDCT_PERM:
137         for (i = 0; i < 64; i++)
138             idct_permutation[i] = i;
139         break;
140     case FF_LIBMPEG2_IDCT_PERM:
141         for (i = 0; i < 64; i++)
142             idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
143         break;
144     case FF_SIMPLE_IDCT_PERM:
145         for (i = 0; i < 64; i++)
146             idct_permutation[i] = simple_mmx_permutation[i];
147         break;
148     case FF_TRANSPOSE_IDCT_PERM:
149         for (i = 0; i < 64; i++)
150             idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
151         break;
152     case FF_PARTTRANS_IDCT_PERM:
153         for (i = 0; i < 64; i++)
154             idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
155         break;
156     case FF_SSE2_IDCT_PERM:
157         for (i = 0; i < 64; i++)
158             idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
159         break;
160     default:
161         av_log(NULL, AV_LOG_ERROR,
162                "Internal error, IDCT permutation not set\n");
163     }
164 }
165
/**
 * Sum all samples of a 16x16 block.
 *
 * @param pix       top-left sample of the block
 * @param line_size distance in bytes between the starts of two rows
 * @return sum of the 256 samples
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
186
187 static int pix_norm1_c(uint8_t *pix, int line_size)
188 {
189     int s = 0, i, j;
190     uint32_t *sq = ff_square_tab + 256;
191
192     for (i = 0; i < 16; i++) {
193         for (j = 0; j < 16; j += 8) {
194 #if 0
195             s += sq[pix[0]];
196             s += sq[pix[1]];
197             s += sq[pix[2]];
198             s += sq[pix[3]];
199             s += sq[pix[4]];
200             s += sq[pix[5]];
201             s += sq[pix[6]];
202             s += sq[pix[7]];
203 #else
204 #if HAVE_FAST_64BIT
205             register uint64_t x = *(uint64_t *) pix;
206             s += sq[x         & 0xff];
207             s += sq[(x >>  8) & 0xff];
208             s += sq[(x >> 16) & 0xff];
209             s += sq[(x >> 24) & 0xff];
210             s += sq[(x >> 32) & 0xff];
211             s += sq[(x >> 40) & 0xff];
212             s += sq[(x >> 48) & 0xff];
213             s += sq[(x >> 56) & 0xff];
214 #else
215             register uint32_t x = *(uint32_t *) pix;
216             s += sq[x         & 0xff];
217             s += sq[(x >>  8) & 0xff];
218             s += sq[(x >> 16) & 0xff];
219             s += sq[(x >> 24) & 0xff];
220             x  = *(uint32_t *) (pix + 4);
221             s += sq[x         & 0xff];
222             s += sq[(x >>  8) & 0xff];
223             s += sq[(x >> 16) & 0xff];
224             s += sq[(x >> 24) & 0xff];
225 #endif
226 #endif
227             pix += 8;
228         }
229         pix += line_size - 16;
230     }
231     return s;
232 }
233
/**
 * Byte-swap a buffer of 32-bit words.
 *
 * Elements are processed in ascending order, one read followed by one write
 * per element.
 *
 * @param dst destination buffer, at least w elements
 * @param src source buffer, at least w elements
 * @param w   number of 32-bit elements; nothing is done when w <= 0
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int n;

    for (n = 0; n < w; n++)
        dst[n] = av_bswap32(src[n]);
}
251
/**
 * Byte-swap a buffer of 16-bit words.
 *
 * @param dst destination buffer, at least len elements
 * @param src source buffer, at least len elements
 * @param len number of 16-bit elements; a value <= 0 is a no-op (the loop is
 *            index-bounded like bswap_buf() instead of counting len down to
 *            zero, which ran past the buffers for a negative len)
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;

    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
257
258 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
259                   int line_size, int h)
260 {
261     int s = 0, i;
262     uint32_t *sq = ff_square_tab + 256;
263
264     for (i = 0; i < h; i++) {
265         s    += sq[pix1[0] - pix2[0]];
266         s    += sq[pix1[1] - pix2[1]];
267         s    += sq[pix1[2] - pix2[2]];
268         s    += sq[pix1[3] - pix2[3]];
269         pix1 += line_size;
270         pix2 += line_size;
271     }
272     return s;
273 }
274
275 static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
276                   int line_size, int h)
277 {
278     int s = 0, i;
279     uint32_t *sq = ff_square_tab + 256;
280
281     for (i = 0; i < h; i++) {
282         s    += sq[pix1[0] - pix2[0]];
283         s    += sq[pix1[1] - pix2[1]];
284         s    += sq[pix1[2] - pix2[2]];
285         s    += sq[pix1[3] - pix2[3]];
286         s    += sq[pix1[4] - pix2[4]];
287         s    += sq[pix1[5] - pix2[5]];
288         s    += sq[pix1[6] - pix2[6]];
289         s    += sq[pix1[7] - pix2[7]];
290         pix1 += line_size;
291         pix2 += line_size;
292     }
293     return s;
294 }
295
296 static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
297                    int line_size, int h)
298 {
299     int s = 0, i;
300     uint32_t *sq = ff_square_tab + 256;
301
302     for (i = 0; i < h; i++) {
303         s += sq[pix1[0]  - pix2[0]];
304         s += sq[pix1[1]  - pix2[1]];
305         s += sq[pix1[2]  - pix2[2]];
306         s += sq[pix1[3]  - pix2[3]];
307         s += sq[pix1[4]  - pix2[4]];
308         s += sq[pix1[5]  - pix2[5]];
309         s += sq[pix1[6]  - pix2[6]];
310         s += sq[pix1[7]  - pix2[7]];
311         s += sq[pix1[8]  - pix2[8]];
312         s += sq[pix1[9]  - pix2[9]];
313         s += sq[pix1[10] - pix2[10]];
314         s += sq[pix1[11] - pix2[11]];
315         s += sq[pix1[12] - pix2[12]];
316         s += sq[pix1[13] - pix2[13]];
317         s += sq[pix1[14] - pix2[14]];
318         s += sq[pix1[15] - pix2[15]];
319
320         pix1 += line_size;
321         pix2 += line_size;
322     }
323     return s;
324 }
325
326 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
327                           const uint8_t *s2, int stride)
328 {
329     int i;
330
331     /* read the pixels */
332     for (i = 0; i < 8; i++) {
333         block[0] = s1[0] - s2[0];
334         block[1] = s1[1] - s2[1];
335         block[2] = s1[2] - s2[2];
336         block[3] = s1[3] - s2[3];
337         block[4] = s1[4] - s2[4];
338         block[5] = s1[5] - s2[5];
339         block[6] = s1[6] - s2[6];
340         block[7] = s1[7] - s2[7];
341         s1      += stride;
342         s2      += stride;
343         block   += 8;
344     }
345 }
346
347 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
348                                  int line_size)
349 {
350     int i;
351
352     /* read the pixels */
353     for (i = 0; i < 8; i++) {
354         pixels[0] = av_clip_uint8(block[0]);
355         pixels[1] = av_clip_uint8(block[1]);
356         pixels[2] = av_clip_uint8(block[2]);
357         pixels[3] = av_clip_uint8(block[3]);
358         pixels[4] = av_clip_uint8(block[4]);
359         pixels[5] = av_clip_uint8(block[5]);
360         pixels[6] = av_clip_uint8(block[6]);
361         pixels[7] = av_clip_uint8(block[7]);
362
363         pixels += line_size;
364         block  += 8;
365     }
366 }
367
368 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
369                                  int line_size)
370 {
371     int i;
372
373     /* read the pixels */
374     for(i=0;i<4;i++) {
375         pixels[0] = av_clip_uint8(block[0]);
376         pixels[1] = av_clip_uint8(block[1]);
377         pixels[2] = av_clip_uint8(block[2]);
378         pixels[3] = av_clip_uint8(block[3]);
379
380         pixels += line_size;
381         block += 8;
382     }
383 }
384
385 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
386                                  int line_size)
387 {
388     int i;
389
390     /* read the pixels */
391     for(i=0;i<2;i++) {
392         pixels[0] = av_clip_uint8(block[0]);
393         pixels[1] = av_clip_uint8(block[1]);
394
395         pixels += line_size;
396         block += 8;
397     }
398 }
399
400 static void put_signed_pixels_clamped_c(const int16_t *block,
401                                         uint8_t *av_restrict pixels,
402                                         int line_size)
403 {
404     int i, j;
405
406     for (i = 0; i < 8; i++) {
407         for (j = 0; j < 8; j++) {
408             if (*block < -128)
409                 *pixels = 0;
410             else if (*block > 127)
411                 *pixels = 255;
412             else
413                 *pixels = (uint8_t) (*block + 128);
414             block++;
415             pixels++;
416         }
417         pixels += (line_size - 8);
418     }
419 }
420
421 static void add_pixels8_c(uint8_t *av_restrict pixels, int16_t *block,
422                           int line_size)
423 {
424     int i;
425
426     for (i = 0; i < 8; i++) {
427         pixels[0] += block[0];
428         pixels[1] += block[1];
429         pixels[2] += block[2];
430         pixels[3] += block[3];
431         pixels[4] += block[4];
432         pixels[5] += block[5];
433         pixels[6] += block[6];
434         pixels[7] += block[7];
435         pixels    += line_size;
436         block     += 8;
437     }
438 }
439
440 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
441                                  int line_size)
442 {
443     int i;
444
445     /* read the pixels */
446     for (i = 0; i < 8; i++) {
447         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
448         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
449         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
450         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
451         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
452         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
453         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
454         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
455         pixels   += line_size;
456         block    += 8;
457     }
458 }
459
460 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
461                           int line_size)
462 {
463     int i;
464
465     /* read the pixels */
466     for(i=0;i<4;i++) {
467         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
468         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
469         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
470         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
471         pixels += line_size;
472         block += 8;
473     }
474 }
475
476 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
477                           int line_size)
478 {
479     int i;
480
481     /* read the pixels */
482     for(i=0;i<2;i++) {
483         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
484         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
485         pixels += line_size;
486         block += 8;
487     }
488 }
489
/**
 * Sum FFABS() of all 64 elements of a block.
 *
 * @param block 64 contiguous values
 * @return sum of the absolute values
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    int total = 0;
    int n;

    for (n = 0; n < 64; n++) {
        /* Read the element once; FFABS() may evaluate its argument twice. */
        const int coeff = block[n];

        total += FFABS(coeff);
    }
    return total;
}
498
/**
 * Set a 16-byte-wide block to a constant value.
 *
 * @param block     first byte of the top row
 * @param value     byte written to every position
 * @param line_size row stride in bytes
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 16);
        row += line_size;
    }
}
508
/**
 * Set an 8-byte-wide block to a constant value.
 *
 * @param block     first byte of the top row
 * @param value     byte written to every position
 * @param line_size row stride in bytes
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 8);
        row += line_size;
    }
}
518
/* Rounded-up average of two / four values. Every argument is parenthesized
 * so that argument expressions containing operators of lower precedence
 * than '+' (e.g. '|', '&', '?:') are averaged as a whole. Arguments are
 * evaluated exactly once each. */
#define avg2(a, b) (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
521
/**
 * Bilinear interpolation of an 8-pixel-wide block at a 1/16-pel offset.
 *
 * Each output pixel is the weighted sum of the source pixel, its right
 * neighbour and the two pixels below them, plus rounder, shifted right by 8.
 * The four weights always add up to 256.
 *
 * @param dst     destination, 8 pixels are written per row
 * @param src     source; 9 pixels of h + 1 rows are read
 * @param stride  row stride in bytes, shared by dst and src
 * @param h       number of rows
 * @param x16     horizontal weight of the right-hand pixels, in 1/16 units
 * @param y16     vertical weight of the lower pixels, in 1/16 units
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16); /* top-left     */
    const int w_tr = (x16)      * (16 - y16); /* top-right    */
    const int w_bl = (16 - x16) * (y16);      /* bottom-left  */
    const int w_br = (x16)      * (y16);      /* bottom-right */
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++)
            dst[col] = (w_tl * src[col]   + w_tr * src[col + 1] +
                        w_bl * below[col] + w_br * below[col + 1] +
                        rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
544
/**
 * Motion compensation of an 8-pixel-wide block with a per-pixel source
 * position that changes linearly across the block.
 *
 * For every output pixel the source position (vx, vy) is reduced by 16 bits;
 * the low `shift` bits of the result are the interpolation fraction and the
 * remaining bits the integer sample position. Positions inside the valid
 * area are interpolated bilinearly; positions outside are clipped to the
 * border and interpolated along the remaining axis only, or copied when
 * both axes are outside.
 *
 * @param dst      destination, 8 pixels are written per row
 * @param src      source picture
 * @param stride   row stride in bytes, shared by dst and src
 * @param h        number of rows to produce
 * @param ox, oy   source position of the first pixel of the first row
 * @param dxx, dyx increments of the x / y source position per output pixel
 * @param dxy, dyy increments of the x / y source position per output row
 * @param shift    number of fractional bits of the sample position
 * @param r        rounding value added before the final shift
 * @param width    horizontal extent of the area that may be read
 * @param height   vertical extent of the area that may be read
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
{
    const int s = 1 << shift;
    int row;

    /* Shrink the limits by one so that a position accepted by the range
     * checks below always has a right / lower neighbour within the area. */
    width--;
    height--;

    for (row = 0; row < h; row++) {
        uint8_t *out = dst + row * stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) {
            int pos_x = vx >> 16;
            int pos_y = vy >> 16;
            const int frac_x = pos_x & (s - 1);
            const int frac_y = pos_y & (s - 1);
            int x_inside, y_inside;
            int index;

            pos_x >>= shift;
            pos_y >>= shift;

            /* The unsigned comparison rejects negative positions as well. */
            x_inside = (unsigned) pos_x < width;
            y_inside = (unsigned) pos_y < height;

            if (x_inside && y_inside) {
                /* Fully inside: blend the four neighbours. */
                index    = pos_x + pos_y * stride;
                out[col] =
                    ((src[index]                        * (s - frac_x) +
                      src[index + 1]          * frac_x) * (s - frac_y) +
                     (src[index + stride]               * (s - frac_x) +
                      src[index + stride + 1] * frac_x) *      frac_y  +
                     r) >> (shift * 2);
            } else if (x_inside) {
                /* Row out of range: clip it, blend horizontally only. */
                index    = pos_x + av_clip(pos_y, 0, height) * stride;
                out[col] =
                    ((src[index]               * (s - frac_x) +
                      src[index + 1] * frac_x) *  s           +
                     r) >> (shift * 2);
            } else if (y_inside) {
                /* Column out of range: clip it, blend vertically only. */
                index    = av_clip(pos_x, 0, width) + pos_y * stride;
                out[col] =
                    ((src[index]                    * (s - frac_y) +
                      src[index + stride] * frac_y) *  s           +
                     r) >> (shift * 2);
            } else {
                /* Both out of range: copy the clipped sample. */
                index    = av_clip(pos_x, 0, width) +
                           av_clip(pos_y, 0, height) * stride;
                out[col] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
607
608 #define QPEL_MC(r, OPNAME, RND, OP)                                           \
609 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src,       \
610                                             int dstStride, int srcStride,     \
611                                             int h)                            \
612 {                                                                             \
613     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
614     int i;                                                                    \
615                                                                               \
616     for (i = 0; i < h; i++) {                                                 \
617         OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
618         OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
619         OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
620         OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
621         OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
622         OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
623         OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
624         OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
625         dst += dstStride;                                                     \
626         src += srcStride;                                                     \
627     }                                                                         \
628 }                                                                             \
629                                                                               \
630 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src,       \
631                                             int dstStride, int srcStride)     \
632 {                                                                             \
633     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
634     const int w = 8;                                                          \
635     int i;                                                                    \
636                                                                               \
637     for (i = 0; i < w; i++) {                                                 \
638         const int src0 = src[0 * srcStride];                                  \
639         const int src1 = src[1 * srcStride];                                  \
640         const int src2 = src[2 * srcStride];                                  \
641         const int src3 = src[3 * srcStride];                                  \
642         const int src4 = src[4 * srcStride];                                  \
643         const int src5 = src[5 * srcStride];                                  \
644         const int src6 = src[6 * srcStride];                                  \
645         const int src7 = src[7 * srcStride];                                  \
646         const int src8 = src[8 * srcStride];                                  \
647         OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
648         OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
649         OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
650         OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
651         OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
652         OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
653         OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
654         OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
655         dst++;                                                                \
656         src++;                                                                \
657     }                                                                         \
658 }                                                                             \
659                                                                               \
660 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src,      \
661                                              int dstStride, int srcStride,    \
662                                              int h)                           \
663 {                                                                             \
664     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
665     int i;                                                                    \
666                                                                               \
667     for (i = 0; i < h; i++) {                                                 \
668         OP(dst[0],  (src[0]  + src[1])  * 20 - (src[0]  + src[2])  * 6 + (src[1]  + src[3])  * 3 - (src[2]  + src[4]));  \
669         OP(dst[1],  (src[1]  + src[2])  * 20 - (src[0]  + src[3])  * 6 + (src[0]  + src[4])  * 3 - (src[1]  + src[5]));  \
670         OP(dst[2],  (src[2]  + src[3])  * 20 - (src[1]  + src[4])  * 6 + (src[0]  + src[5])  * 3 - (src[0]  + src[6]));  \
671         OP(dst[3],  (src[3]  + src[4])  * 20 - (src[2]  + src[5])  * 6 + (src[1]  + src[6])  * 3 - (src[0]  + src[7]));  \
672         OP(dst[4],  (src[4]  + src[5])  * 20 - (src[3]  + src[6])  * 6 + (src[2]  + src[7])  * 3 - (src[1]  + src[8]));  \
673         OP(dst[5],  (src[5]  + src[6])  * 20 - (src[4]  + src[7])  * 6 + (src[3]  + src[8])  * 3 - (src[2]  + src[9]));  \
674         OP(dst[6],  (src[6]  + src[7])  * 20 - (src[5]  + src[8])  * 6 + (src[4]  + src[9])  * 3 - (src[3]  + src[10])); \
675         OP(dst[7],  (src[7]  + src[8])  * 20 - (src[6]  + src[9])  * 6 + (src[5]  + src[10]) * 3 - (src[4]  + src[11])); \
676         OP(dst[8],  (src[8]  + src[9])  * 20 - (src[7]  + src[10]) * 6 + (src[6]  + src[11]) * 3 - (src[5]  + src[12])); \
677         OP(dst[9],  (src[9]  + src[10]) * 20 - (src[8]  + src[11]) * 6 + (src[7]  + src[12]) * 3 - (src[6]  + src[13])); \
678         OP(dst[10], (src[10] + src[11]) * 20 - (src[9]  + src[12]) * 6 + (src[8]  + src[13]) * 3 - (src[7]  + src[14])); \
679         OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9]  + src[14]) * 3 - (src[8]  + src[15])); \
680         OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9]  + src[16])); \
681         OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
682         OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
683         OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
684         dst += dstStride;                                                     \
685         src += srcStride;                                                     \
686     }                                                                         \
687 }                                                                             \
688                                                                               \
689 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src,      \
690                                              int dstStride, int srcStride)    \
691 {                                                                             \
692     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
693     const int w = 16;                                                         \
694     int i;                                                                    \
695                                                                               \
696     for (i = 0; i < w; i++) {                                                 \
697         const int src0  = src[0  * srcStride];                                \
698         const int src1  = src[1  * srcStride];                                \
699         const int src2  = src[2  * srcStride];                                \
700         const int src3  = src[3  * srcStride];                                \
701         const int src4  = src[4  * srcStride];                                \
702         const int src5  = src[5  * srcStride];                                \
703         const int src6  = src[6  * srcStride];                                \
704         const int src7  = src[7  * srcStride];                                \
705         const int src8  = src[8  * srcStride];                                \
706         const int src9  = src[9  * srcStride];                                \
707         const int src10 = src[10 * srcStride];                                \
708         const int src11 = src[11 * srcStride];                                \
709         const int src12 = src[12 * srcStride];                                \
710         const int src13 = src[13 * srcStride];                                \
711         const int src14 = src[14 * srcStride];                                \
712         const int src15 = src[15 * srcStride];                                \
713         const int src16 = src[16 * srcStride];                                \
714         OP(dst[0  * dstStride], (src0  + src1)  * 20 - (src0  + src2)  * 6 + (src1  + src3)  * 3 - (src2  + src4));  \
715         OP(dst[1  * dstStride], (src1  + src2)  * 20 - (src0  + src3)  * 6 + (src0  + src4)  * 3 - (src1  + src5));  \
716         OP(dst[2  * dstStride], (src2  + src3)  * 20 - (src1  + src4)  * 6 + (src0  + src5)  * 3 - (src0  + src6));  \
717         OP(dst[3  * dstStride], (src3  + src4)  * 20 - (src2  + src5)  * 6 + (src1  + src6)  * 3 - (src0  + src7));  \
718         OP(dst[4  * dstStride], (src4  + src5)  * 20 - (src3  + src6)  * 6 + (src2  + src7)  * 3 - (src1  + src8));  \
719         OP(dst[5  * dstStride], (src5  + src6)  * 20 - (src4  + src7)  * 6 + (src3  + src8)  * 3 - (src2  + src9));  \
720         OP(dst[6  * dstStride], (src6  + src7)  * 20 - (src5  + src8)  * 6 + (src4  + src9)  * 3 - (src3  + src10)); \
721         OP(dst[7  * dstStride], (src7  + src8)  * 20 - (src6  + src9)  * 6 + (src5  + src10) * 3 - (src4  + src11)); \
722         OP(dst[8  * dstStride], (src8  + src9)  * 20 - (src7  + src10) * 6 + (src6  + src11) * 3 - (src5  + src12)); \
723         OP(dst[9  * dstStride], (src9  + src10) * 20 - (src8  + src11) * 6 + (src7  + src12) * 3 - (src6  + src13)); \
724         OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9  + src12) * 6 + (src8  + src13) * 3 - (src7  + src14)); \
725         OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9  + src14) * 3 - (src8  + src15)); \
726         OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9  + src16)); \
727         OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
728         OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
729         OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
730         dst++;                                                                \
731         src++;                                                                \
732     }                                                                         \
733 }                                                                             \
734                                                                               \
735 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src,                \
736                                    ptrdiff_t stride)                          \
737 {                                                                             \
738     uint8_t filt[64];                                                         \
739     /* h-filter src into filt, then l2-combine src with filt into dst */      \
740     put ## RND ## mpeg4_qpel8_h_lowpass(filt, src, 8, stride, 8);             \
741     OPNAME ## pixels8_l2_8(dst, src, filt, stride, stride, 8, 8);             \
742 }                                                                             \
743                                                                               \
744 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src,                \
745                                    ptrdiff_t stride)                          \
746 { /* dst is written directly by the h_lowpass call; no temporaries */         \
747     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);             \
748 }                                                                             \
749                                                                               \
750 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src,                \
751                                    ptrdiff_t stride)                          \
752 {                                                                             \
753     uint8_t filt[64];                                                         \
754     /* h-filter src into filt, then l2-combine src + 1 with filt into dst */  \
755     put ## RND ## mpeg4_qpel8_h_lowpass(filt, src, 8, stride, 8);             \
756     OPNAME ## pixels8_l2_8(dst, src + 1, filt, stride, stride, 8, 8);         \
757 }                                                                             \
758                                                                               \
759 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src,                \
760                                    ptrdiff_t stride)                          \
761 {                                                                             \
762     uint8_t tile[16 * 9];                                                     \
763     uint8_t filt[64];                                                         \
764     /* copy src to tile, v-filter tile into filt, l2-combine tile, filt */    \
765     copy_block9(tile, src, 16, stride, 9);                                    \
766     put ## RND ## mpeg4_qpel8_v_lowpass(filt, tile, 8, 16);                   \
767     OPNAME ## pixels8_l2_8(dst, tile, filt, stride, 16, 8, 8);                \
768 }                                                                             \
769                                                                               \
770 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src,                \
771                                    ptrdiff_t stride)                          \
772 {                                                                             \
773     uint8_t tile[16 * 9];                                                     \
774     /* copy src to tile, then v-filter tile straight into dst */              \
775     copy_block9(tile, src, 16, stride, 9);                                    \
776     OPNAME ## mpeg4_qpel8_v_lowpass(dst, tile, stride, 16);                   \
777 }                                                                             \
778                                                                               \
779 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src,                \
780                                    ptrdiff_t stride)                          \
781 {                                                                             \
782     uint8_t tile[16 * 9];                                                     \
783     uint8_t filt[64];                                                         \
784     /* copy src to tile, v-filter tile to filt, l2-combine tile + 16, filt */ \
785     copy_block9(tile, src, 16, stride, 9);                                    \
786     put ## RND ## mpeg4_qpel8_v_lowpass(filt, tile, 8, 16);                   \
787     OPNAME ## pixels8_l2_8(dst, tile + 16, filt, stride, 16, 8, 8);           \
788 }                                                                             \
789                                                                               \
790 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src,            \
791                                        ptrdiff_t stride)                      \
792 {                                                                             \
793     uint8_t tile[16 * 9];                                                     \
794     uint8_t hpass[72];                                                        \
795     uint8_t vpass[64];                                                        \
796     uint8_t hvpass[64];                                                       \
797     /* vpass from tile; l4-combine tile, hpass, vpass, hvpass */              \
798     copy_block9(tile, src, 16, stride, 9);                                    \
799     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, tile, 8, 16, 9);               \
800     put ## RND ## mpeg4_qpel8_v_lowpass(vpass, tile, 8, 16);                  \
801     put ## RND ## mpeg4_qpel8_v_lowpass(hvpass, hpass, 8, 8);                 \
802     OPNAME ## pixels8_l4_8(dst, tile, hpass, vpass, hvpass,                   \
803                            stride, 16, 8, 8, 8, 8);                           \
804 }                                                                             \
805                                                                               \
806 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src,                \
807                                    ptrdiff_t stride)                          \
808 {                                                                             \
809     uint8_t tile[16 * 9];                                                     \
810     uint8_t hpass[72];                                                        \
811     uint8_t hvpass[64];                                                       \
812     /* hpass merged with tile in place; final merge uses hpass */             \
813     copy_block9(tile, src, 16, stride, 9);                                    \
814     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, tile, 8, 16, 9);               \
815     put ## RND ## pixels8_l2_8(hpass, hpass, tile, 8, 8, 16, 9);              \
816     put ## RND ## mpeg4_qpel8_v_lowpass(hvpass, hpass, 8, 8);                 \
817     OPNAME ## pixels8_l2_8(dst, hpass, hvpass, stride, 8, 8, 8);              \
818 }                                                                             \
819                                                                               \
820 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src,            \
821                                        ptrdiff_t stride)                      \
822 {                                                                             \
823     uint8_t tile[16 * 9];                                                     \
824     uint8_t hpass[72];                                                        \
825     uint8_t vpass[64];                                                        \
826     uint8_t hvpass[64];                                                       \
827     /* vpass from tile + 1; l4-combine tile + 1, hpass, vpass, hvpass */      \
828     copy_block9(tile, src, 16, stride, 9);                                    \
829     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, tile, 8, 16, 9);               \
830     put ## RND ## mpeg4_qpel8_v_lowpass(vpass, tile + 1, 8, 16);              \
831     put ## RND ## mpeg4_qpel8_v_lowpass(hvpass, hpass, 8, 8);                 \
832     OPNAME ## pixels8_l4_8(dst, tile + 1, hpass, vpass, hvpass,               \
833                            stride, 16, 8, 8, 8, 8);                           \
834 }                                                                             \
835                                                                               \
836 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src,                \
837                                    ptrdiff_t stride)                          \
838 {                                                                             \
839     uint8_t tile[16 * 9];                                                     \
840     uint8_t hpass[72];                                                        \
841     uint8_t hvpass[64];                                                       \
842     /* hpass merged with tile + 1 in place; final merge uses hpass */         \
843     copy_block9(tile, src, 16, stride, 9);                                    \
844     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, tile, 8, 16, 9);               \
845     put ## RND ## pixels8_l2_8(hpass, hpass, tile + 1, 8, 8, 16, 9);          \
846     put ## RND ## mpeg4_qpel8_v_lowpass(hvpass, hpass, 8, 8);                 \
847     OPNAME ## pixels8_l2_8(dst, hpass, hvpass, stride, 8, 8, 8);              \
848 }                                                                             \
849                                                                               \
850 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src,            \
851                                        ptrdiff_t stride)                      \
852 {                                                                             \
853     uint8_t tile[16 * 9];                                                     \
854     uint8_t hpass[72];                                                        \
855     uint8_t vpass[64];                                                        \
856     uint8_t hvpass[64];                                                       \
857     /* vpass from tile; l4-combine tile + 16, hpass + 8, vpass, hvpass */     \
858     copy_block9(tile, src, 16, stride, 9);                                    \
859     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, tile, 8, 16, 9);               \
860     put ## RND ## mpeg4_qpel8_v_lowpass(vpass, tile, 8, 16);                  \
861     put ## RND ## mpeg4_qpel8_v_lowpass(hvpass, hpass, 8, 8);                 \
862     OPNAME ## pixels8_l4_8(dst, tile + 16, hpass + 8, vpass, hvpass,          \
863                            stride, 16, 8, 8, 8, 8);                           \
864 }                                                                             \
865                                                                               \
866 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src,                \
867                                    ptrdiff_t stride)                          \
868 {                                                                             \
869     uint8_t tile[16 * 9];                                                     \
870     uint8_t hpass[72];                                                        \
871     uint8_t hvpass[64];                                                       \
872     /* hpass merged with tile in place; final merge uses hpass + 8 */         \
873     copy_block9(tile, src, 16, stride, 9);                                    \
874     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, tile, 8, 16, 9);               \
875     put ## RND ## pixels8_l2_8(hpass, hpass, tile, 8, 8, 16, 9);              \
876     put ## RND ## mpeg4_qpel8_v_lowpass(hvpass, hpass, 8, 8);                 \
877     OPNAME ## pixels8_l2_8(dst, hpass + 8, hvpass, stride, 8, 8, 8);          \
878 }                                                                             \
879                                                                               \
880 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src,            \
881                                        ptrdiff_t stride)                      \
882 {                                                                             \
883     uint8_t tile[16 * 9];                                                     \
884     uint8_t hpass[72];                                                        \
885     uint8_t vpass[64];                                                        \
886     uint8_t hvpass[64];                                                       \
887     /* vpass from tile + 1; l4-combine tile + 17, hpass + 8, vpass, hvpass */ \
888     copy_block9(tile, src, 16, stride, 9);                                    \
889     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, tile, 8, 16, 9);               \
890     put ## RND ## mpeg4_qpel8_v_lowpass(vpass, tile + 1, 8, 16);              \
891     put ## RND ## mpeg4_qpel8_v_lowpass(hvpass, hpass, 8, 8);                 \
892     OPNAME ## pixels8_l4_8(dst, tile + 17, hpass + 8, vpass, hvpass,          \
893                            stride, 16, 8, 8, 8, 8);                           \
894 }                                                                             \
895                                                                               \
896 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src,                \
897                                    ptrdiff_t stride)                          \
898 {                                                                             \
899     uint8_t tile[16 * 9];                                                     \
900     uint8_t hpass[72];                                                        \
901     uint8_t hvpass[64];                                                       \
902     /* hpass merged with tile + 1 in place; final merge uses hpass + 8 */     \
903     copy_block9(tile, src, 16, stride, 9);                                    \
904     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, tile, 8, 16, 9);               \
905     put ## RND ## pixels8_l2_8(hpass, hpass, tile + 1, 8, 8, 16, 9);          \
906     put ## RND ## mpeg4_qpel8_v_lowpass(hvpass, hpass, 8, 8);                 \
907     OPNAME ## pixels8_l2_8(dst, hpass + 8, hvpass, stride, 8, 8, 8);          \
908 }                                                                             \
909                                                                               \
910 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src,                \
911                                    ptrdiff_t stride)                          \
912 {                                                                             \
913     uint8_t hpass[72];                                                        \
914     uint8_t hvpass[64];                                                       \
915     /* h-filter src, v-filter result, l2-combine hpass with hvpass */         \
916     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, src, 8, stride, 9);            \
917     put ## RND ## mpeg4_qpel8_v_lowpass(hvpass, hpass, 8, 8);                 \
918     OPNAME ## pixels8_l2_8(dst, hpass, hvpass, stride, 8, 8, 8);              \
919 }                                                                             \
920                                                                               \
921 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src,                \
922                                    ptrdiff_t stride)                          \
923 {                                                                             \
924     uint8_t hpass[72];                                                        \
925     uint8_t hvpass[64];                                                       \
926     /* h-filter src, v-filter result, l2-combine hpass + 8 with hvpass */     \
927     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, src, 8, stride, 9);            \
928     put ## RND ## mpeg4_qpel8_v_lowpass(hvpass, hpass, 8, 8);                 \
929     OPNAME ## pixels8_l2_8(dst, hpass + 8, hvpass, stride, 8, 8, 8);          \
930 }                                                                             \
931                                                                               \
932 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src,            \
933                                        ptrdiff_t stride)                      \
934 {                                                                             \
935     uint8_t tile[16 * 9];                                                     \
936     uint8_t hpass[72];                                                        \
937     uint8_t vpass[64];                                                        \
938     uint8_t hvpass[64];                                                       \
939     /* l2-combine the v-filtered tile (vpass) with hvpass into dst */         \
940     copy_block9(tile, src, 16, stride, 9);                                    \
941     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, tile, 8, 16, 9);               \
942     put ## RND ## mpeg4_qpel8_v_lowpass(vpass, tile, 8, 16);                  \
943     put ## RND ## mpeg4_qpel8_v_lowpass(hvpass, hpass, 8, 8);                 \
944     OPNAME ## pixels8_l2_8(dst, vpass, hvpass, stride, 8, 8, 8);              \
945 }                                                                             \
946                                                                               \
947 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src,                \
948                                    ptrdiff_t stride)                          \
949 {                                                                             \
950     uint8_t tile[16 * 9];                                                     \
951     uint8_t hpass[72];                                                        \
952     /* merge hpass with tile in place, then v-filter it into dst */           \
953     copy_block9(tile, src, 16, stride, 9);                                    \
954     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, tile, 8, 16, 9);               \
955     put ## RND ## pixels8_l2_8(hpass, hpass, tile, 8, 8, 16, 9);              \
956     OPNAME ## mpeg4_qpel8_v_lowpass(dst, hpass, stride, 8);                   \
957 }                                                                             \
958                                                                               \
959 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src,            \
960                                        ptrdiff_t stride)                      \
961 {                                                                             \
962     uint8_t tile[16 * 9];                                                     \
963     uint8_t hpass[72];                                                        \
964     uint8_t vpass[64];                                                        \
965     uint8_t hvpass[64];                                                       \
966     /* l2-combine v-filtered tile + 1 (vpass) with hvpass into dst */         \
967     copy_block9(tile, src, 16, stride, 9);                                    \
968     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, tile, 8, 16, 9);               \
969     put ## RND ## mpeg4_qpel8_v_lowpass(vpass, tile + 1, 8, 16);              \
970     put ## RND ## mpeg4_qpel8_v_lowpass(hvpass, hpass, 8, 8);                 \
971     OPNAME ## pixels8_l2_8(dst, vpass, hvpass, stride, 8, 8, 8);              \
972 }                                                                             \
973                                                                               \
974 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src,                \
975                                    ptrdiff_t stride)                          \
976 {                                                                             \
977     uint8_t tile[16 * 9];                                                     \
978     uint8_t hpass[72];                                                        \
979     /* merge hpass with tile + 1 in place, then v-filter it into dst */       \
980     copy_block9(tile, src, 16, stride, 9);                                    \
981     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, tile, 8, 16, 9);               \
982     put ## RND ## pixels8_l2_8(hpass, hpass, tile + 1, 8, 8, 16, 9);          \
983     OPNAME ## mpeg4_qpel8_v_lowpass(dst, hpass, stride, 8);                   \
984 }                                                                             \
985                                                                               \
986 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src,                \
987                                    ptrdiff_t stride)                          \
988 {                                                                             \
989     uint8_t hpass[72];                                                        \
990     /* h-filter src into hpass, then v-filter hpass into dst */               \
991     put ## RND ## mpeg4_qpel8_h_lowpass(hpass, src, 8, stride, 9);            \
992     OPNAME ## mpeg4_qpel8_v_lowpass(dst, hpass, stride, 8);                   \
993 }                                                                             \
994                                                                               \
995 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src,               \
996                                     ptrdiff_t stride)                         \
997 {                                                                             \
998     uint8_t filt[256];                                                        \
999     /* h-filter src into filt, then l2-combine src with filt into dst */      \
1000     put ## RND ## mpeg4_qpel16_h_lowpass(filt, src, 16, stride, 16);          \
1001     OPNAME ## pixels16_l2_8(dst, src, filt, stride, stride, 16, 16);          \
1002 }                                                                             \
1003                                                                               \
1004 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src,               \
1005                                     ptrdiff_t stride)                         \
1006 { /* dst is written directly by the h_lowpass call; no temporaries */         \
1007     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);           \
1008 }                                                                             \
1009                                                                               \
1010 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src,               \
1011                                     ptrdiff_t stride)                         \
1012 {                                                                             \
1013     uint8_t filt[256];                                                        \
1014     /* h-filter src into filt, then l2-combine src + 1 with filt into dst */  \
1015     put ## RND ## mpeg4_qpel16_h_lowpass(filt, src, 16, stride, 16);          \
1016     OPNAME ## pixels16_l2_8(dst, src + 1, filt, stride, stride, 16, 16);      \
1017 }                                                                             \
1018                                                                               \
1019 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src,               \
1020                                     ptrdiff_t stride)                         \
1021 {                                                                             \
1022     uint8_t tile[24 * 17];                                                    \
1023     uint8_t filt[256];                                                        \
1024     /* copy src to tile, v-filter tile into filt, l2-combine tile, filt */    \
1025     copy_block17(tile, src, 24, stride, 17);                                  \
1026     put ## RND ## mpeg4_qpel16_v_lowpass(filt, tile, 16, 24);                 \
1027     OPNAME ## pixels16_l2_8(dst, tile, filt, stride, 24, 16, 16);             \
1028 }                                                                             \
1029                                                                               \
1030 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src,               \
1031                                     ptrdiff_t stride)                         \
1032 {                                                                             \
1033     uint8_t tile[24 * 17];                                                    \
1034     /* copy src to tile, then v-filter tile straight into dst */              \
1035     copy_block17(tile, src, 24, stride, 17);                                  \
1036     OPNAME ## mpeg4_qpel16_v_lowpass(dst, tile, stride, 24);                  \
1037 }                                                                             \
1038                                                                               \
1039 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src,               \
1040                                     ptrdiff_t stride)                         \
1041 {                                                                             \
1042     uint8_t tile[24 * 17];                                                    \
1043     uint8_t filt[256];                                                        \
1044     /* copy src to tile, v-filter tile to filt, l2-combine tile + 24, filt */ \
1045     copy_block17(tile, src, 24, stride, 17);                                  \
1046     put ## RND ## mpeg4_qpel16_v_lowpass(filt, tile, 16, 24);                 \
1047     OPNAME ## pixels16_l2_8(dst, tile + 24, filt, stride, 24, 16, 16);        \
1048 }                                                                             \
1049                                                                               \
1050 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src,           \
1051                                         ptrdiff_t stride)                     \
1052 {                                                                             \
1053     uint8_t tile[24 * 17];                                                    \
1054     uint8_t hpass[272];                                                       \
1055     uint8_t vpass[256];                                                       \
1056     uint8_t hvpass[256];                                                      \
1057     /* vpass from tile; l4-combine tile, hpass, vpass, hvpass */              \
1058     copy_block17(tile, src, 24, stride, 17);                                  \
1059     put ## RND ## mpeg4_qpel16_h_lowpass(hpass, tile, 16, 24, 17);            \
1060     put ## RND ## mpeg4_qpel16_v_lowpass(vpass, tile, 16, 24);                \
1061     put ## RND ## mpeg4_qpel16_v_lowpass(hvpass, hpass, 16, 16);              \
1062     OPNAME ## pixels16_l4_8(dst, tile, hpass, vpass, hvpass,                  \
1063                             stride, 24, 16, 16, 16, 16);                      \
1064 }                                                                             \
1065                                                                               \
1066 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src,               \
1067                                     ptrdiff_t stride)                         \
1068 {                                                                             \
1069     uint8_t tile[24 * 17];                                                    \
1070     uint8_t hpass[272];                                                       \
1071     uint8_t hvpass[256];                                                      \
1072     /* hpass merged with tile in place; final merge uses hpass */             \
1073     copy_block17(tile, src, 24, stride, 17);                                  \
1074     put ## RND ## mpeg4_qpel16_h_lowpass(hpass, tile, 16, 24, 17);            \
1075     put ## RND ## pixels16_l2_8(hpass, hpass, tile, 16, 16, 24, 17);          \
1076     put ## RND ## mpeg4_qpel16_v_lowpass(hvpass, hpass, 16, 16);              \
1077     OPNAME ## pixels16_l2_8(dst, hpass, hvpass, stride, 16, 16, 16);          \
1078 }                                                                             \
1079                                                                               \
1080 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src,           \
1081                                         ptrdiff_t stride)                     \
1082 {                                                                             \
1083     uint8_t tile[24 * 17];                                                    \
1084     uint8_t hpass[272];                                                       \
1085     uint8_t vpass[256];                                                       \
1086     uint8_t hvpass[256];                                                      \
1087     /* vpass from tile + 1; l4-combine tile + 1, hpass, vpass, hvpass */      \
1088     copy_block17(tile, src, 24, stride, 17);                                  \
1089     put ## RND ## mpeg4_qpel16_h_lowpass(hpass, tile, 16, 24, 17);            \
1090     put ## RND ## mpeg4_qpel16_v_lowpass(vpass, tile + 1, 16, 24);            \
1091     put ## RND ## mpeg4_qpel16_v_lowpass(hvpass, hpass, 16, 16);              \
1092     OPNAME ## pixels16_l4_8(dst, tile + 1, hpass, vpass, hvpass,              \
1093                             stride, 24, 16, 16, 16, 16);                      \
1094 }                                                                             \
1095                                                                               \
1096 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src,               \
1097                                     ptrdiff_t stride)                         \
1098 {                                                                             \
1099     uint8_t tile[24 * 17];                                                    \
1100     uint8_t hpass[272];                                                       \
1101     uint8_t hvpass[256];                                                      \
1102     /* hpass merged with tile + 1 in place; final merge uses hpass */         \
1103     copy_block17(tile, src, 24, stride, 17);                                  \
1104     put ## RND ## mpeg4_qpel16_h_lowpass(hpass, tile, 16, 24, 17);            \
1105     put ## RND ## pixels16_l2_8(hpass, hpass, tile + 1, 16, 16, 24, 17);      \
1106     put ## RND ## mpeg4_qpel16_v_lowpass(hvpass, hpass, 16, 16);              \
1107     OPNAME ## pixels16_l2_8(dst, hpass, hvpass, stride, 16, 16, 16);          \
1108 }                                                                             \
1109                                                                               \
1110 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src,           \
1111                                         ptrdiff_t stride)                     \
1112 {                                                                             \
1113     uint8_t full[24 * 17];                                                    \
1114     uint8_t halfH[272];                                                       \
1115     uint8_t halfV[256];                                                       \
1116     uint8_t halfHV[256];                                                      \
1117                                                                               \
1118     copy_block17(full, src, 24, stride, 17);                                  \
1119     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1120     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1121     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1122     OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV,        \
1123                             stride, 24, 16, 16, 16, 16);                      \
1124 }                                                                             \
1125                                                                               \
1126 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src,               \
1127                                     ptrdiff_t stride)                         \
1128 {                                                                             \
1129     uint8_t full[24 * 17];                                                    \
1130     uint8_t halfH[272];                                                       \
1131     uint8_t halfHV[256];                                                      \
1132                                                                               \
1133     copy_block17(full, src, 24, stride, 17);                                  \
1134     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1135     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1136     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1137     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1138 }                                                                             \
1139                                                                               \
1140 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src,           \
1141                                         ptrdiff_t stride)                     \
1142 {                                                                             \
1143     uint8_t full[24 * 17];                                                    \
1144     uint8_t halfH[272];                                                       \
1145     uint8_t halfV[256];                                                       \
1146     uint8_t halfHV[256];                                                      \
1147                                                                               \
1148     copy_block17(full, src, 24, stride, 17);                                  \
1149     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1150     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1151     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1152     OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV,        \
1153                             stride, 24, 16, 16, 16, 16);                      \
1154 }                                                                             \
1155                                                                               \
1156 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src,               \
1157                                     ptrdiff_t stride)                         \
1158 {                                                                             \
1159     uint8_t full[24 * 17];                                                    \
1160     uint8_t halfH[272];                                                       \
1161     uint8_t halfHV[256];                                                      \
1162                                                                               \
1163     copy_block17(full, src, 24, stride, 17);                                  \
1164     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1165     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1166     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1167     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1168 }                                                                             \
1169                                                                               \
1170 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src,               \
1171                                     ptrdiff_t stride)                         \
1172 {                                                                             \
1173     uint8_t halfH[272];                                                       \
1174     uint8_t halfHV[256];                                                      \
1175                                                                               \
1176     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1177     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1178     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1179 }                                                                             \
1180                                                                               \
1181 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src,               \
1182                                     ptrdiff_t stride)                         \
1183 {                                                                             \
1184     uint8_t halfH[272];                                                       \
1185     uint8_t halfHV[256];                                                      \
1186                                                                               \
1187     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1188     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1189     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1190 }                                                                             \
1191                                                                               \
1192 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src,           \
1193                                         ptrdiff_t stride)                     \
1194 {                                                                             \
1195     uint8_t full[24 * 17];                                                    \
1196     uint8_t halfH[272];                                                       \
1197     uint8_t halfV[256];                                                       \
1198     uint8_t halfHV[256];                                                      \
1199                                                                               \
1200     copy_block17(full, src, 24, stride, 17);                                  \
1201     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1202     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1203     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1204     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1205 }                                                                             \
1206                                                                               \
1207 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src,               \
1208                                     ptrdiff_t stride)                         \
1209 {                                                                             \
1210     uint8_t full[24 * 17];                                                    \
1211     uint8_t halfH[272];                                                       \
1212                                                                               \
1213     copy_block17(full, src, 24, stride, 17);                                  \
1214     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1215     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1216     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1217 }                                                                             \
1218                                                                               \
1219 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src,           \
1220                                         ptrdiff_t stride)                     \
1221 {                                                                             \
1222     uint8_t full[24 * 17];                                                    \
1223     uint8_t halfH[272];                                                       \
1224     uint8_t halfV[256];                                                       \
1225     uint8_t halfHV[256];                                                      \
1226                                                                               \
1227     copy_block17(full, src, 24, stride, 17);                                  \
1228     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1229     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1230     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1231     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1232 }                                                                             \
1233                                                                               \
1234 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src,               \
1235                                     ptrdiff_t stride)                         \
1236 {                                                                             \
1237     uint8_t full[24 * 17];                                                    \
1238     uint8_t halfH[272];                                                       \
1239                                                                               \
1240     copy_block17(full, src, 24, stride, 17);                                  \
1241     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1242     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1243     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1244 }                                                                             \
1245                                                                               \
1246 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src,               \
1247                                     ptrdiff_t stride)                         \
1248 {                                                                             \
1249     uint8_t halfH[272];                                                       \
1250                                                                               \
1251     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1252     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1253 }
1254
/*
 * Store operators passed to QPEL_MC as its last argument.
 *
 * Each one takes a destination lvalue `a` and a filter sum `b`, adds a bias
 * to `b` (16 for the rounding variants, 15 for the no_rnd variants), shifts
 * the result right by 5 and uses it as an index into `cm`, which therefore
 * has to be in scope wherever the operator is expanded.
 * The put variants store that value directly; the avg variants average it
 * with the value already held in `a` (with a +1 bias for op_avg, without
 * for op_avg_no_rnd).
 */
#define op_avg(a, b)        a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
#define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5])     >> 1)
#define op_put(a, b)        a = cm[((b) + 16) >> 5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]

/* Instantiate the qpel function families: put, put_no_rnd and avg. */
QPEL_MC(0, put_, _, op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_, _, op_avg)

/*
 * NOTE(review): op_avg_no_rnd is not used by any instantiation above and is
 * not part of the #undef list below, so it stays defined past this point.
 * Confirm whether that is intentional.
 */
#undef op_avg
#undef op_put
#undef op_put_no_rnd
1267
/**
 * Exported fixed-size entry point: forwards to put_pixels8_8_c() with a
 * height of 8, using the same stride for the call.
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}
1272
/**
 * Exported fixed-size entry point: forwards to avg_pixels8_8_c() with a
 * height of 8, using the same stride for the call.
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}
1277
/**
 * Exported fixed-size entry point: forwards to put_pixels16_8_c() with a
 * height of 16, using the same stride for the call.
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}
1282
/**
 * Exported fixed-size entry point: forwards to avg_pixels16_8_c() with a
 * height of 16, using the same stride for the call.
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
1287
/*
 * The mc00 names are plain aliases for the fixed-size put/avg wrappers.
 * The put_no_rnd aliases resolve to the same functions as the put ones.
 */
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1294
1295 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
1296                                   int dstStride, int srcStride, int h)
1297 {
1298     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1299     int i;
1300
1301     for (i = 0; i < h; i++) {
1302         dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
1303         dst[1] = cm[(9 * (src[1] + src[2]) - (src[0]  + src[3]) + 8) >> 4];
1304         dst[2] = cm[(9 * (src[2] + src[3]) - (src[1]  + src[4]) + 8) >> 4];
1305         dst[3] = cm[(9 * (src[3] + src[4]) - (src[2]  + src[5]) + 8) >> 4];
1306         dst[4] = cm[(9 * (src[4] + src[5]) - (src[3]  + src[6]) + 8) >> 4];
1307         dst[5] = cm[(9 * (src[5] + src[6]) - (src[4]  + src[7]) + 8) >> 4];
1308         dst[6] = cm[(9 * (src[6] + src[7]) - (src[5]  + src[8]) + 8) >> 4];
1309         dst[7] = cm[(9 * (src[7] + src[8]) - (src[6]  + src[9]) + 8) >> 4];
1310         dst   += dstStride;
1311         src   += srcStride;
1312     }
1313 }
1314
#if CONFIG_RV40_DECODER
/*
 * RV40 mc33 entry points. Each one forwards to the matching
 * {put,avg}_pixels{16,8}_xy2_8_c() function with the block height set
 * equal to the block width.
 */

/** 16x16 put variant: calls put_pixels16_xy2_8_c() with h = 16. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

/** 16x16 avg variant: calls avg_pixels16_xy2_8_c() with h = 16. */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

/** 8x8 put variant: calls put_pixels8_xy2_8_c() with h = 8. */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

/** 8x8 avg variant: calls avg_pixels8_xy2_8_c() with h = 8. */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1336
#if CONFIG_DIRAC_DECODER
/*
 * DIRAC_MC(OPNAME) defines nine exported functions named
 * ff_<OPNAME>_dirac_pixels{8,16,32}{,_l2,_l4}_c.
 *
 *  - the plain variants forward src[0] to OPNAME_pixelsN_8_c();
 *  - the _l2 variants forward src[0] and src[1] to OPNAME_pixelsN_l2_8();
 *  - the _l4 variants forward src[0] .. src[3] to OPNAME_pixelsN_l4_8().
 *
 * The single `stride` argument is used for the destination and for every
 * source. The 32-wide variants are built from two 16-wide calls, the second
 * one offset by 16 samples in dst and in each source. src[4] is never read
 * by these functions.
 */
#define DIRAC_MC(OPNAME)\
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
     OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
    OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
    OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
}
/* Generate the put and the avg family. */
DIRAC_MC(put)
DIRAC_MC(avg)
#endif
1381
1382 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
1383                                   int dstStride, int srcStride, int w)
1384 {
1385     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1386     int i;
1387
1388     for (i = 0; i < w; i++) {
1389         const int src_1 = src[-srcStride];
1390         const int src0  = src[0];
1391         const int src1  = src[srcStride];
1392         const int src2  = src[2 * srcStride];
1393         const int src3  = src[3 * srcStride];
1394         const int src4  = src[4 * srcStride];
1395         const int src5  = src[5 * srcStride];
1396         const int src6  = src[6 * srcStride];
1397         const int src7  = src[7 * srcStride];
1398         const int src8  = src[8 * srcStride];
1399         const int src9  = src[9 * srcStride];
1400         dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
1401         dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0  + src3) + 8) >> 4];
1402         dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1  + src4) + 8) >> 4];
1403         dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2  + src5) + 8) >> 4];
1404         dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3  + src6) + 8) >> 4];
1405         dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4  + src7) + 8) >> 4];
1406         dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5  + src8) + 8) >> 4];
1407         dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6  + src9) + 8) >> 4];
1408         src++;
1409         dst++;
1410     }
1411 }
1412
/**
 * mspel position (1,0): runs the horizontal lowpass over an 8x8 block into
 * a temporary buffer, then hands src and that buffer to put_pixels8_l2_8()
 * (presumably a two-source average -- defined elsewhere).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, hfilt, stride, stride, 8, 8);
}
1420
/**
 * mspel position (2,0): horizontal lowpass of 8 rows written straight to
 * dst; the same stride is used for source and destination.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1425
/**
 * mspel position (3,0): runs the horizontal lowpass over an 8x8 block into
 * a temporary buffer, then hands src + 1 and that buffer to
 * put_pixels8_l2_8() (presumably a two-source average -- defined elsewhere).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, hfilt, stride, stride, 8, 8);
}
1433
/**
 * mspel position (0,2): vertical lowpass of 8 columns written straight to
 * dst; the same stride is used for source and destination.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1438
/**
 * mspel position (1,2).
 *
 * Builds two intermediate 8x8 blocks and passes both to put_pixels8_l2_8():
 *  - vfilt:  vertical lowpass of the source itself;
 *  - hvfilt: vertical lowpass of the horizontally filtered source.
 *
 * The horizontal pass covers 11 rows starting one row above src, so the
 * vertical pass (started at the second row of that buffer) has the rows
 * above and below the block that it reads.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hfilt[8 * 11];
    uint8_t vfilt[8 * 8];
    uint8_t hvfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvfilt, hfilt + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
1450
/**
 * mspel position (3,2).
 *
 * Same construction as the (1,2) position except that the plain vertical
 * pass starts at src + 1:
 *  - vfilt:  vertical lowpass of the source shifted right by one sample;
 *  - hvfilt: vertical lowpass of the horizontally filtered source.
 *
 * Both blocks are passed to put_pixels8_l2_8(). The horizontal pass covers
 * 11 rows starting one row above src to feed the vertical filter's taps.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hfilt[8 * 11];
    uint8_t vfilt[8 * 8];
    uint8_t hvfilt[8 * 8];

    wmv2_mspel8_h_lowpass(hfilt, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(vfilt, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(hvfilt, hfilt + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vfilt, hvfilt, stride, 8, 8, 8);
}
1462
/**
 * mspel position (2,2): horizontal lowpass over 11 rows (starting one row
 * above src) into a temporary buffer, followed by a vertical lowpass of
 * that buffer, started at its second row, written directly to dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hfilt[8 * 11];

    wmv2_mspel8_h_lowpass(hfilt, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hfilt + 8, stride, 8, 8);
}
1470
1471 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1472                               int line_size, int h)
1473 {
1474     int s = 0, i;
1475
1476     for (i = 0; i < h; i++) {
1477         s    += abs(pix1[0]  - pix2[0]);
1478         s    += abs(pix1[1]  - pix2[1]);
1479         s    += abs(pix1[2]  - pix2[2]);
1480         s    += abs(pix1[3]  - pix2[3]);
1481         s    += abs(pix1[4]  - pix2[4]);
1482         s    += abs(pix1[5]  - pix2[5]);
1483         s    += abs(pix1[6]  - pix2[6]);
1484         s    += abs(pix1[7]  - pix2[7]);
1485         s    += abs(pix1[8]  - pix2[8]);
1486         s    += abs(pix1[9]  - pix2[9]);
1487         s    += abs(pix1[10] - pix2[10]);
1488         s    += abs(pix1[11] - pix2[11]);
1489         s    += abs(pix1[12] - pix2[12]);
1490         s    += abs(pix1[13] - pix2[13]);
1491         s    += abs(pix1[14] - pix2[14]);
1492         s    += abs(pix1[15] - pix2[15]);
1493         pix1 += line_size;
1494         pix2 += line_size;
1495     }
1496     return s;
1497 }
1498
1499 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1500                           int line_size, int h)
1501 {
1502     int s = 0, i;
1503
1504     for (i = 0; i < h; i++) {
1505         s    += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
1506         s    += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
1507         s    += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
1508         s    += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
1509         s    += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
1510         s    += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
1511         s    += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
1512         s    += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
1513         s    += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
1514         s    += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
1515         s    += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1516         s    += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1517         s    += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1518         s    += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1519         s    += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1520         s    += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1521         pix1 += line_size;
1522         pix2 += line_size;
1523     }
1524     return s;
1525 }
1526
1527 static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1528                           int line_size, int h)
1529 {
1530     int s = 0, i;
1531     uint8_t *pix3 = pix2 + line_size;
1532
1533     for (i = 0; i < h; i++) {
1534         s    += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
1535         s    += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
1536         s    += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
1537         s    += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
1538         s    += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
1539         s    += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
1540         s    += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
1541         s    += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
1542         s    += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
1543         s    += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
1544         s    += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1545         s    += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1546         s    += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1547         s    += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1548         s    += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1549         s    += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1550         pix1 += line_size;
1551         pix2 += line_size;
1552         pix3 += line_size;
1553     }
1554     return s;
1555 }
1556
1557 static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1558                            int line_size, int h)
1559 {
1560     int s = 0, i;
1561     uint8_t *pix3 = pix2 + line_size;
1562
1563     for (i = 0; i < h; i++) {
1564         s    += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
1565         s    += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
1566         s    += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
1567         s    += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
1568         s    += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
1569         s    += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
1570         s    += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
1571         s    += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
1572         s    += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
1573         s    += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
1574         s    += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1575         s    += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1576         s    += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1577         s    += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1578         s    += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1579         s    += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1580         pix1 += line_size;
1581         pix2 += line_size;
1582         pix3 += line_size;
1583     }
1584     return s;
1585 }
1586
1587 static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1588                              int line_size, int h)
1589 {
1590     int s = 0, i;
1591
1592     for (i = 0; i < h; i++) {
1593         s    += abs(pix1[0] - pix2[0]);
1594         s    += abs(pix1[1] - pix2[1]);
1595         s    += abs(pix1[2] - pix2[2]);
1596         s    += abs(pix1[3] - pix2[3]);
1597         s    += abs(pix1[4] - pix2[4]);
1598         s    += abs(pix1[5] - pix2[5]);
1599         s    += abs(pix1[6] - pix2[6]);
1600         s    += abs(pix1[7] - pix2[7]);
1601         pix1 += line_size;
1602         pix2 += line_size;
1603     }
1604     return s;
1605 }
1606
1607 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1608                          int line_size, int h)
1609 {
1610     int s = 0, i;
1611
1612     for (i = 0; i < h; i++) {
1613         s    += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1614         s    += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1615         s    += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1616         s    += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1617         s    += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1618         s    += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1619         s    += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1620         s    += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1621         pix1 += line_size;
1622         pix2 += line_size;
1623     }
1624     return s;
1625 }
1626
1627 static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1628                          int line_size, int h)
1629 {
1630     int s = 0, i;
1631     uint8_t *pix3 = pix2 + line_size;
1632
1633     for (i = 0; i < h; i++) {
1634         s    += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1635         s    += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1636         s    += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1637         s    += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1638         s    += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1639         s    += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1640         s    += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1641         s    += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1642         pix1 += line_size;
1643         pix2 += line_size;
1644         pix3 += line_size;
1645     }
1646     return s;
1647 }
1648
1649 static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1650                           int line_size, int h)
1651 {
1652     int s = 0, i;
1653     uint8_t *pix3 = pix2 + line_size;
1654
1655     for (i = 0; i < h; i++) {
1656         s    += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1657         s    += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1658         s    += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1659         s    += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1660         s    += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1661         s    += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1662         s    += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1663         s    += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1664         pix1 += line_size;
1665         pix2 += line_size;
1666         pix3 += line_size;
1667     }
1668     return s;
1669 }
1670
1671 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1672 {
1673     int score1 = 0, score2 = 0, x, y;
1674
1675     for (y = 0; y < h; y++) {
1676         for (x = 0; x < 16; x++)
1677             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1678         if (y + 1 < h) {
1679             for (x = 0; x < 15; x++)
1680                 score2 += FFABS(s1[x]     - s1[x + stride] -
1681                                 s1[x + 1] + s1[x + stride + 1]) -
1682                           FFABS(s2[x]     - s2[x + stride] -
1683                                 s2[x + 1] + s2[x + stride + 1]);
1684         }
1685         s1 += stride;
1686         s2 += stride;
1687     }
1688
1689     if (c)
1690         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1691     else
1692         return score1 + FFABS(score2) * 8;
1693 }
1694
1695 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1696 {
1697     int score1 = 0, score2 = 0, x, y;
1698
1699     for (y = 0; y < h; y++) {
1700         for (x = 0; x < 8; x++)
1701             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1702         if (y + 1 < h) {
1703             for (x = 0; x < 7; x++)
1704                 score2 += FFABS(s1[x]     - s1[x + stride] -
1705                                 s1[x + 1] + s1[x + stride + 1]) -
1706                           FFABS(s2[x]     - s2[x + stride] -
1707                                 s2[x + 1] + s2[x + stride + 1]);
1708         }
1709         s1 += stride;
1710         s2 += stride;
1711     }
1712
1713     if (c)
1714         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1715     else
1716         return score1 + FFABS(score2) * 8;
1717 }
1718
1719 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
1720                           int16_t basis[64], int scale)
1721 {
1722     int i;
1723     unsigned int sum = 0;
1724
1725     for (i = 0; i < 8 * 8; i++) {
1726         int b = rem[i] + ((basis[i] * scale +
1727                            (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1728                           (BASIS_SHIFT - RECON_SHIFT));
1729         int w = weight[i];
1730         b >>= RECON_SHIFT;
1731         av_assert2(-512 < b && b < 512);
1732
1733         sum += (w * b) * (w * b) >> 4;
1734     }
1735     return sum >> 2;
1736 }
1737
1738 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
1739 {
1740     int i;
1741
1742     for (i = 0; i < 8 * 8; i++)
1743         rem[i] += (basis[i] * scale +
1744                    (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1745                   (BASIS_SHIFT - RECON_SHIFT);
1746 }
1747
1748 static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
1749                     int stride, int h)
1750 {
1751     return 0;
1752 }
1753
1754 void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
1755 {
1756     int i;
1757
1758     memset(cmp, 0, sizeof(void *) * 6);
1759
1760     for (i = 0; i < 6; i++) {
1761         switch (type & 0xFF) {
1762         case FF_CMP_SAD:
1763             cmp[i] = c->sad[i];
1764             break;
1765         case FF_CMP_SATD:
1766             cmp[i] = c->hadamard8_diff[i];
1767             break;
1768         case FF_CMP_SSE:
1769             cmp[i] = c->sse[i];
1770             break;
1771         case FF_CMP_DCT:
1772             cmp[i] = c->dct_sad[i];
1773             break;
1774         case FF_CMP_DCT264:
1775             cmp[i] = c->dct264_sad[i];
1776             break;
1777         case FF_CMP_DCTMAX:
1778             cmp[i] = c->dct_max[i];
1779             break;
1780         case FF_CMP_PSNR:
1781             cmp[i] = c->quant_psnr[i];
1782             break;
1783         case FF_CMP_BIT:
1784             cmp[i] = c->bit[i];
1785             break;
1786         case FF_CMP_RD:
1787             cmp[i] = c->rd[i];
1788             break;
1789         case FF_CMP_VSAD:
1790             cmp[i] = c->vsad[i];
1791             break;
1792         case FF_CMP_VSSE:
1793             cmp[i] = c->vsse[i];
1794             break;
1795         case FF_CMP_ZERO:
1796             cmp[i] = zero_cmp;
1797             break;
1798         case FF_CMP_NSSE:
1799             cmp[i] = c->nsse[i];
1800             break;
1801 #if CONFIG_DWT
1802         case FF_CMP_W53:
1803             cmp[i]= c->w53[i];
1804             break;
1805         case FF_CMP_W97:
1806             cmp[i]= c->w97[i];
1807             break;
1808 #endif
1809         default:
1810             av_log(NULL, AV_LOG_ERROR,
1811                    "internal error in cmp function selection\n");
1812         }
1813     }
1814 }
1815
1816 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
1817 {
1818     long i;
1819
1820     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1821         long a = *(long *) (src + i);
1822         long b = *(long *) (dst + i);
1823         *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
1824     }
1825     for (; i < w; i++)
1826         dst[i + 0] += src[i + 0];
1827 }
1828
1829 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w)
1830 {
1831     long i;
1832
1833 #if !HAVE_FAST_UNALIGNED
1834     if ((long) src2 & (sizeof(long) - 1)) {
1835         for (i = 0; i + 7 < w; i += 8) {
1836             dst[i + 0] = src1[i + 0] - src2[i + 0];
1837             dst[i + 1] = src1[i + 1] - src2[i + 1];
1838             dst[i + 2] = src1[i + 2] - src2[i + 2];
1839             dst[i + 3] = src1[i + 3] - src2[i + 3];
1840             dst[i + 4] = src1[i + 4] - src2[i + 4];
1841             dst[i + 5] = src1[i + 5] - src2[i + 5];
1842             dst[i + 6] = src1[i + 6] - src2[i + 6];
1843             dst[i + 7] = src1[i + 7] - src2[i + 7];
1844         }
1845     } else
1846 #endif
1847     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1848         long a = *(long *) (src1 + i);
1849         long b = *(long *) (src2 + i);
1850         *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
1851                               ((a ^ b ^ pb_80) & pb_80);
1852     }
1853     for (; i < w; i++)
1854         dst[i + 0] = src1[i + 0] - src2[i + 0];
1855 }
1856
/**
 * Reconstruct w bytes from median-predicted differences.
 *
 * For every position the predictor is mid_pred() of the running left value,
 * src1[i], and (left + src1[i] - left_top) masked to 8 bits; diff[i] is
 * added to it and the 8-bit result is stored in dst[i]. The running left
 * and left-top values are read from and written back through the
 * left / left_top pointers.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *diff, int w,
                                         int *left, int *left_top)
{
    uint8_t cur     = *left;
    uint8_t cur_top = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const int grad = (cur + src1[pos] - cur_top) & 0xFF;

        cur      = mid_pred(cur, src1[pos], grad) + diff[pos];
        cur_top  = src1[pos];
        dst[pos] = cur;
    }

    *left     = cur;
    *left_top = cur_top;
}
1876
/**
 * Produce w bytes of median-prediction residual.
 *
 * For every position the predictor is mid_pred() of the running left value,
 * src1[i], and (left + src1[i] - left_top) masked to 8 bits; dst[i] receives
 * src2[i] minus that predictor (8-bit wrap). src2[i] becomes the new left
 * value and src1[i] the new left-top value; both are written back through
 * the left / left_top pointers at the end.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *src2, int w,
                                         int *left, int *left_top)
{
    uint8_t cur     = *left;
    uint8_t cur_top = *left_top;
    int pos;

    for (pos = 0; pos < w; pos++) {
        const int grad = (cur + src1[pos] - cur_top) & 0xFF;
        const int pred = mid_pred(cur, src1[pos], grad);

        cur_top  = src1[pos];
        cur      = src2[pos];
        dst[pos] = cur - pred;
    }

    *left     = cur;
    *left_top = cur_top;
}
1897
/**
 * Running-sum ("left") prediction decode.
 *
 * Starting from acc, every src byte is added to the accumulator and the low
 * 8 bits of the accumulator are stored to dst.
 *
 * @return the full (not truncated) accumulator after w bytes
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
                                      int w, int acc)
{
    const uint8_t *end = src + (w > 0 ? w : 0);

    while (src < end) {
        acc   += *src++;
        *dst++ = acc;
    }

    return acc;
}
1918
/* Byte offsets of the four channels inside one 32-bit pixel; they are
 * mirrored when HAVE_BIGENDIAN is set. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Running-sum ("left") prediction decode for w four-byte pixels.
 *
 * Each channel keeps its own accumulator, seeded from and written back to
 * *red / *green / *blue / *alpha. For every pixel the four source bytes are
 * added to their accumulators and the low 8 bits of each are stored to dst.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
                                             int w, int *red, int *green,
                                             int *blue, int *alpha)
{
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in  = src + 4 * px;
        uint8_t       *out = dst + 4 * px;

        /* Read the whole pixel before writing any of it. */
        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1957
/*
 * Butterfly helpers for the Hadamard transforms below.
 *
 * BUTTERFLY2 and BUTTERFLY1 are wrapped in do { } while (0) so that each
 * expands to exactly one statement and stays correct under an unbraced
 * if / else. Arguments are evaluated more than once, so they must be free
 * of side effects.
 */

/* o1 = i1 + i2, o2 = i1 - i2 */
#define BUTTERFLY2(o1, o2, i1, i2)              \
    do {                                        \
        (o1) = (i1) + (i2);                     \
        (o2) = (i1) - (i2);                     \
    } while (0)

/* In-place butterfly: (x, y) <- (x + y, x - y) */
#define BUTTERFLY1(x, y)                        \
    do {                                        \
        const int bf_a_ = (x);                  \
        const int bf_b_ = (y);                  \
        (x) = bf_a_ + bf_b_;                    \
        (y) = bf_a_ - bf_b_;                    \
    } while (0)

/* |x + y| + |x - y|: final butterfly stage folded into the absolute sum */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
1972
1973 static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
1974                                uint8_t *src, int stride, int h)
1975 {
1976     int i, temp[64], sum = 0;
1977
1978     av_assert2(h == 8);
1979
1980     for (i = 0; i < 8; i++) {
1981         // FIXME: try pointer walks
1982         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1983                    src[stride * i + 0] - dst[stride * i + 0],
1984                    src[stride * i + 1] - dst[stride * i + 1]);
1985         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1986                    src[stride * i + 2] - dst[stride * i + 2],
1987                    src[stride * i + 3] - dst[stride * i + 3]);
1988         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1989                    src[stride * i + 4] - dst[stride * i + 4],
1990                    src[stride * i + 5] - dst[stride * i + 5]);
1991         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1992                    src[stride * i + 6] - dst[stride * i + 6],
1993                    src[stride * i + 7] - dst[stride * i + 7]);
1994
1995         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1996         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1997         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1998         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1999
2000         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
2001         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
2002         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
2003         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
2004     }
2005
2006     for (i = 0; i < 8; i++) {
2007         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
2008         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
2009         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
2010         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
2011
2012         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
2013         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
2014         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
2015         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
2016
2017         sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
2018                BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
2019                BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
2020                BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
2021     }
2022     return sum;
2023 }
2024
2025 static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
2026                                 uint8_t *dummy, int stride, int h)
2027 {
2028     int i, temp[64], sum = 0;
2029
2030     av_assert2(h == 8);
2031
2032     for (i = 0; i < 8; i++) {
2033         // FIXME: try pointer walks
2034         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
2035                    src[stride * i + 0], src[stride * i + 1]);
2036         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
2037                    src[stride * i + 2], src[stride * i + 3]);
2038         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
2039                    src[stride * i + 4], src[stride * i + 5]);
2040         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
2041                    src[stride * i + 6], src[stride * i + 7]);
2042
2043         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
2044         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
2045         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
2046         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
2047
2048         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
2049         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
2050         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
2051         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
2052     }
2053
2054     for (i = 0; i < 8; i++) {
2055         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
2056         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
2057         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
2058         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
2059
2060         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
2061         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
2062         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
2063         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
2064
2065         sum +=
2066             BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
2067             + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
2068             + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
2069             + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
2070     }
2071
2072     sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
2073
2074     return sum;
2075 }
2076
2077 static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
2078                         uint8_t *src2, int stride, int h)
2079 {
2080     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2081
2082     av_assert2(h == 8);
2083
2084     s->dsp.diff_pixels(temp, src1, src2, stride);
2085     s->dsp.fdct(temp);
2086     return s->dsp.sum_abs_dctelem(temp);
2087 }
2088
#if CONFIG_GPL
/*
 * One 8-point integer transform pass. The user supplies SRC(x) to read
 * input x and DST(x, v) to consume output x with value v. All eight inputs
 * are read before the first DST is issued, so SRC and DST may refer to the
 * same storage.
 */
#define DCT8_1D                                         \
    {                                                   \
        const int s07 = SRC(0) + SRC(7);                \
        const int s16 = SRC(1) + SRC(6);                \
        const int s25 = SRC(2) + SRC(5);                \
        const int s34 = SRC(3) + SRC(4);                \
        const int a0  = s07 + s34;                      \
        const int a1  = s16 + s25;                      \
        const int a2  = s07 - s34;                      \
        const int a3  = s16 - s25;                      \
        const int d07 = SRC(0) - SRC(7);                \
        const int d16 = SRC(1) - SRC(6);                \
        const int d25 = SRC(2) - SRC(5);                \
        const int d34 = SRC(3) - SRC(4);                \
        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
        DST(0, a0 + a1);                                \
        DST(1, a4 + (a7 >> 2));                         \
        DST(2, a2 + (a3 >> 1));                         \
        DST(3, a5 + (a6 >> 2));                         \
        DST(4, a0 - a1);                                \
        DST(5, a6 - (a5 >> 2));                         \
        DST(6, (a2 >> 1) - a3);                         \
        DST(7, (a4 >> 2) - a7);                         \
    }

/**
 * Sum of absolute coefficients of the DCT8_1D-based 8x8 transform applied
 * to the difference src1 - src2.
 *
 * Rows are transformed in place first; the column pass does not store its
 * outputs but adds their absolute values straight into the result.
 */
static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    int16_t dct[8][8];
    int sum = 0;
    int i;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* Row pass: results overwrite the row they were computed from. */
#define SRC(x) dct[i][x]
#define DST(x, v) dct[i][x] = v
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* Column pass: accumulate |coefficient| instead of storing it. */
#define SRC(x) dct[x][i]
#define DST(x, v) sum += FFABS(v)
    for (i = 0; i < 8; i++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return sum;
}
#endif
2142
2143 static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
2144                         uint8_t *src2, int stride, int h)
2145 {
2146     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2147     int sum = 0, i;
2148
2149     av_assert2(h == 8);
2150
2151     s->dsp.diff_pixels(temp, src1, src2, stride);
2152     s->dsp.fdct(temp);
2153
2154     for (i = 0; i < 64; i++)
2155         sum = FFMAX(sum, FFABS(temp[i]));
2156
2157     return sum;
2158 }
2159
2160 static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
2161                            uint8_t *src2, int stride, int h)
2162 {
2163     LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
2164     int16_t *const bak = temp + 64;
2165     int sum = 0, i;
2166
2167     av_assert2(h == 8);
2168     s->mb_intra = 0;
2169
2170     s->dsp.diff_pixels(temp, src1, src2, stride);
2171
2172     memcpy(bak, temp, 64 * sizeof(int16_t));
2173
2174     s->block_last_index[0 /* FIXME */] =
2175         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2176     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2177     ff_simple_idct_8(temp); // FIXME
2178
2179     for (i = 0; i < 64; i++)
2180         sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
2181
2182     return sum;
2183 }
2184
2185 static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2186                    int stride, int h)
2187 {
2188     const uint8_t *scantable = s->intra_scantable.permutated;
2189     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2190     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2191     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2192     int i, last, run, bits, level, distortion, start_i;
2193     const int esc_length = s->ac_esc_length;
2194     uint8_t *length, *last_length;
2195
2196     av_assert2(h == 8);
2197
2198     copy_block8(lsrc1, src1, 8, stride, 8);
2199     copy_block8(lsrc2, src2, 8, stride, 8);
2200
2201     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2202
2203     s->block_last_index[0 /* FIXME */] =
2204     last                               =
2205         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2206
2207     bits = 0;
2208
2209     if (s->mb_intra) {
2210         start_i     = 1;
2211         length      = s->intra_ac_vlc_length;
2212         last_length = s->intra_ac_vlc_last_length;
2213         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2214     } else {
2215         start_i     = 0;
2216         length      = s->inter_ac_vlc_length;
2217         last_length = s->inter_ac_vlc_last_length;
2218     }
2219
2220     if (last >= start_i) {
2221         run = 0;
2222         for (i = start_i; i < last; i++) {
2223             int j = scantable[i];
2224             level = temp[j];
2225
2226             if (level) {
2227                 level += 64;
2228                 if ((level & (~127)) == 0)
2229                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2230                 else
2231                     bits += esc_length;
2232                 run = 0;
2233             } else
2234                 run++;
2235         }
2236         i = scantable[last];
2237
2238         level = temp[i] + 64;
2239
2240         av_assert2(level - 64);
2241
2242         if ((level & (~127)) == 0) {
2243             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2244         } else
2245             bits += esc_length;
2246     }
2247
2248     if (last >= 0) {
2249         if (s->mb_intra)
2250             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2251         else
2252             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2253     }
2254
2255     s->dsp.idct_add(lsrc2, 8, temp);
2256
2257     distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2258
2259     return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
2260 }
2261
2262 static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2263                     int stride, int h)
2264 {
2265     const uint8_t *scantable = s->intra_scantable.permutated;
2266     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2267     int i, last, run, bits, level, start_i;
2268     const int esc_length = s->ac_esc_length;
2269     uint8_t *length, *last_length;
2270
2271     av_assert2(h == 8);
2272
2273     s->dsp.diff_pixels(temp, src1, src2, stride);
2274
2275     s->block_last_index[0 /* FIXME */] =
2276     last                               =
2277         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2278
2279     bits = 0;
2280
2281     if (s->mb_intra) {
2282         start_i     = 1;
2283         length      = s->intra_ac_vlc_length;
2284         last_length = s->intra_ac_vlc_last_length;
2285         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2286     } else {
2287         start_i     = 0;
2288         length      = s->inter_ac_vlc_length;
2289         last_length = s->inter_ac_vlc_last_length;
2290     }
2291
2292     if (last >= start_i) {
2293         run = 0;
2294         for (i = start_i; i < last; i++) {
2295             int j = scantable[i];
2296             level = temp[j];
2297
2298             if (level) {
2299                 level += 64;
2300                 if ((level & (~127)) == 0)
2301                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2302                 else
2303                     bits += esc_length;
2304                 run = 0;
2305             } else
2306                 run++;
2307         }
2308         i = scantable[last];
2309
2310         level = temp[i] + 64;
2311
2312         av_assert2(level - 64);
2313
2314         if ((level & (~127)) == 0)
2315             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2316         else
2317             bits += esc_length;
2318     }
2319
2320     return bits;
2321 }
2322
2323 #define VSAD_INTRA(size)                                                \
2324 static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
2325                                     uint8_t *s, uint8_t *dummy,         \
2326                                     int stride, int h)                  \
2327 {                                                                       \
2328     int score = 0, x, y;                                                \
2329                                                                         \
2330     for (y = 1; y < h; y++) {                                           \
2331         for (x = 0; x < size; x += 4) {                                 \
2332             score += FFABS(s[x]     - s[x + stride])     +              \
2333                      FFABS(s[x + 1] - s[x + stride + 1]) +              \
2334                      FFABS(s[x + 2] - s[x + 2 + stride]) +              \
2335                      FFABS(s[x + 3] - s[x + 3 + stride]);               \
2336         }                                                               \
2337         s += stride;                                                    \
2338     }                                                                   \
2339                                                                         \
2340     return score;                                                       \
2341 }
2342 VSAD_INTRA(8)
2343 VSAD_INTRA(16)
2344
2345 static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2346                     int stride, int h)
2347 {
2348     int score = 0, x, y;
2349
2350     for (y = 1; y < h; y++) {
2351         for (x = 0; x < 16; x++)
2352             score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2353         s1 += stride;
2354         s2 += stride;
2355     }
2356
2357     return score;
2358 }
2359
2360 #define SQ(a) ((a) * (a))
2361 #define VSSE_INTRA(size)                                                \
2362 static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
2363                                     uint8_t *s, uint8_t *dummy,         \
2364                                     int stride, int h)                  \
2365 {                                                                       \
2366     int score = 0, x, y;                                                \
2367                                                                         \
2368     for (y = 1; y < h; y++) {                                           \
2369         for (x = 0; x < size; x += 4) {                                 \
2370             score += SQ(s[x]     - s[x + stride]) +                     \
2371                      SQ(s[x + 1] - s[x + stride + 1]) +                 \
2372                      SQ(s[x + 2] - s[x + stride + 2]) +                 \
2373                      SQ(s[x + 3] - s[x + stride + 3]);                  \
2374         }                                                               \
2375         s += stride;                                                    \
2376     }                                                                   \
2377                                                                         \
2378     return score;                                                       \
2379 }
2380 VSSE_INTRA(8)
2381 VSSE_INTRA(16)
2382
2383 static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2384                     int stride, int h)
2385 {
2386     int score = 0, x, y;
2387
2388     for (y = 1; y < h; y++) {
2389         for (x = 0; x < 16; x++)
2390             score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2391         s1 += stride;
2392         s2 += stride;
2393     }
2394
2395     return score;
2396 }
2397
/**
 * Sum of squared differences between an int8_t vector and an int16_t
 * vector of `size` elements.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        total += d * d;
    }

    return total;
}
2407
2408 #define WRAPPER8_16_SQ(name8, name16)                                   \
2409 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
2410                   int stride, int h)                                    \
2411 {                                                                       \
2412     int score = 0;                                                      \
2413                                                                         \
2414     score += name8(s, dst, src, stride, 8);                             \
2415     score += name8(s, dst + 8, src + 8, stride, 8);                     \
2416     if (h == 16) {                                                      \
2417         dst   += 8 * stride;                                            \
2418         src   += 8 * stride;                                            \
2419         score += name8(s, dst, src, stride, 8);                         \
2420         score += name8(s, dst + 8, src + 8, stride, 8);                 \
2421     }                                                                   \
2422     return score;                                                       \
2423 }
2424
2425 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2426 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2427 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2428 #if CONFIG_GPL
2429 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2430 #endif
2431 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2432 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2433 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2434 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2435
/**
 * Clip one 32-bit pattern using unsigned integer comparisons.
 *
 * Returns mini when a > mini, otherwise maxi when a with its top bit
 * flipped is greater than maxisign, otherwise a unchanged. The caller in
 * this file passes float bit patterns with a negative minimum and a
 * positive maximum, and maxisign = maxi with its top bit flipped.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U << 31);

    if (a > mini)
        return mini;
    return flipped > maxisign ? maxi : a;
}
2446
/**
 * Clip a float vector by operating on the 32-bit patterns of the values.
 *
 * *min and *max are reinterpreted as uint32_t and every element is passed
 * through clipf_c_one(). Elements are processed in groups of eight, so a
 * len that is not a multiple of 8 is rounded up to the next group.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src,
                                         float *min, float *max, int len)
{
    const uint32_t lo_bits   = *(uint32_t *) min;
    const uint32_t hi_bits   = *(uint32_t *) max;
    const uint32_t hi_flip   = hi_bits ^ (1U << 31);
    uint32_t *out            = (uint32_t *) dst;
    const uint32_t *in       = (const uint32_t *) src;
    int base, k;

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits, hi_flip);
}
2468
/**
 * Clip every element of src to [min, max] and store it in dst.
 *
 * When min is negative and max is positive the bit-pattern based helper is
 * used; otherwise av_clipf() is applied per element. Elements are processed
 * in groups of eight, so a len that is not a multiple of 8 is rounded up to
 * the next group.
 */
static void vector_clipf_c(float *dst, const float *src,
                           float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
}
2489
/**
 * Dot product of two int16_t vectors.
 *
 * Each product fits in an int, but the running sum may exceed the int
 * range; it is therefore accumulated in an unsigned variable so that it
 * wraps modulo 2^32 instead of invoking signed-overflow undefined
 * behaviour.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements to multiply and accumulate
 * @return the (wrapping) sum of v1[i] * v2[i]
 */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
{
    unsigned res = 0;

    while (order--)
        res += *v1++ * *v2++;

    return res;
}
2500
/**
 * Dot product of v1 and v2 combined with an in-place multiply-add on v1.
 *
 * For every element the product v1[i] * v2[i] (using the value of v1[i]
 * before the update) is accumulated, then v1[i] += mul * v3[i] is stored
 * back as int16_t. The sum is accumulated in an unsigned variable so that
 * it wraps modulo 2^32 instead of invoking signed-overflow undefined
 * behaviour.
 *
 * @param v1    first vector, updated in place
 * @param v2    second vector of the dot product
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of elements to process
 * @param mul   scale factor applied to v3
 * @return the (wrapping) sum of the original v1[i] * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    unsigned res = 0;

    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}
2513
/**
 * Clip every element of src to [min, max] with av_clip() and store it in
 * dst.
 *
 * Eight elements are processed per iteration and at least one iteration is
 * always executed; len is unsigned and decremented by 8 each time, so the
 * loop only terminates when len is a non-zero multiple of 8.
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            *dst++ = av_clip(*src++, min, max);
        len -= 8;
    } while (len > 0);
}
2529
/* Run ff_j_rev_dct() on block in place, then store the clamped result
 * to dest via put_pixels_clamped_c(). */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}

/* Run ff_j_rev_dct() on block in place, then add the result to dest with
 * clamping via add_pixels_clamped_c(). */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2541
/* Run ff_j_rev_dct4() on block in place, then store the clamped result
 * to dest via put_pixels_clamped4_c(). */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4(block);
    put_pixels_clamped4_c(block, dest, line_size);
}

/* Run ff_j_rev_dct4() on block in place, then add the result to dest with
 * clamping via add_pixels_clamped4_c(). */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4(block);
    add_pixels_clamped4_c(block, dest, line_size);
}
2552
/* Run ff_j_rev_dct2() on block in place, then store the clamped result
 * to dest via put_pixels_clamped2_c(). */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2(block);
    put_pixels_clamped2_c(block, dest, line_size);
}

/* Run ff_j_rev_dct2() on block in place, then add the result to dest with
 * clamping via add_pixels_clamped2_c(). */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2(block);
    add_pixels_clamped2_c(block, dest, line_size);
}
2563
/* Single-pixel "IDCT": the output pixel is block[0] rounded and scaled
 * down by 8, clipped to the uint8_t range. line_size is unused. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
{
    const int dc = (block[0] + 4) >> 3;

    dest[0] = av_clip_uint8(dc);
}

/* Single-pixel "IDCT" add: block[0] rounded and scaled down by 8 is added
 * to the existing pixel, clipped to the uint8_t range. line_size is unused. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
{
    const int dc = (block[0] + 4) >> 3;

    dest[0] = av_clip_uint8(dest[0] + dc);
}
2572
2573 /* init static data */
2574 av_cold void ff_dsputil_static_init(void)
2575 {
2576     int i;
2577
2578     for (i = 0; i < 512; i++)
2579         ff_square_tab[i] = (i - 256) * (i - 256);
2580 }
2581
2582 int ff_check_alignment(void)
2583 {
2584     static int did_fail = 0;
2585     LOCAL_ALIGNED_16(int, aligned, [4]);
2586
2587     if ((intptr_t)aligned & 15) {
2588         if (!did_fail) {
2589 #if HAVE_MMX || HAVE_ALTIVEC
2590             av_log(NULL, AV_LOG_ERROR,
2591                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2592                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2593                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2594                 "Do not report crashes to FFmpeg developers.\n");
2595 #endif
2596             did_fail=1;
2597         }
2598         return -1;
2599     }
2600     return 0;
2601 }
2602
2603 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2604 {
2605     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
2606
2607     ff_check_alignment();
2608
2609 #if CONFIG_ENCODERS
2610     if (avctx->bits_per_raw_sample == 10) {
2611         c->fdct    = ff_jpeg_fdct_islow_10;
2612         c->fdct248 = ff_fdct248_islow_10;
2613     } else {
2614         if (avctx->dct_algo == FF_DCT_FASTINT) {
2615             c->fdct    = ff_fdct_ifast;
2616             c->fdct248 = ff_fdct_ifast248;
2617         } else if (avctx->dct_algo == FF_DCT_FAAN) {
2618             c->fdct    = ff_faandct;
2619             c->fdct248 = ff_faandct248;
2620         } else {
2621             c->fdct    = ff_jpeg_fdct_islow_8; // slow/accurate/default
2622             c->fdct248 = ff_fdct248_islow_8;
2623         }
2624     }
2625 #endif /* CONFIG_ENCODERS */
2626
2627     if (avctx->lowres==1) {
2628         c->idct_put              = ff_jref_idct4_put;
2629         c->idct_add              = ff_jref_idct4_add;
2630         c->idct                  = ff_j_rev_dct4;
2631         c->idct_permutation_type = FF_NO_IDCT_PERM;
2632     } else if (avctx->lowres==2) {
2633         c->idct_put              =  ff_jref_idct2_put;
2634         c->idct_add              =  ff_jref_idct2_add;
2635         c->idct                  =  ff_j_rev_dct2;
2636         c->idct_permutation_type = FF_NO_IDCT_PERM;
2637     } else if (avctx->lowres==3) {
2638         c->idct_put              =  ff_jref_idct1_put;
2639         c->idct_add              =  ff_jref_idct1_add;
2640         c->idct                  =  ff_j_rev_dct1;
2641         c->idct_permutation_type = FF_NO_IDCT_PERM;
2642     } else {
2643         if (avctx->bits_per_raw_sample == 10) {
2644             c->idct_put              = ff_simple_idct_put_10;
2645             c->idct_add              = ff_simple_idct_add_10;
2646             c->idct                  = ff_simple_idct_10;
2647             c->idct_permutation_type = FF_NO_IDCT_PERM;
2648         } else if (avctx->bits_per_raw_sample == 12) {
2649             c->idct_put              = ff_simple_idct_put_12;
2650             c->idct_add              = ff_simple_idct_add_12;
2651             c->idct                  = ff_simple_idct_12;
2652             c->idct_permutation_type = FF_NO_IDCT_PERM;
2653         } else {
2654         if (avctx->idct_algo == FF_IDCT_INT) {
2655             c->idct_put              = jref_idct_put;
2656             c->idct_add              = jref_idct_add;
2657             c->idct                  = ff_j_rev_dct;
2658             c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2659         } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2660             c->idct_put              = ff_faanidct_put;
2661             c->idct_add              = ff_faanidct_add;
2662             c->idct                  = ff_faanidct;
2663             c->idct_permutation_type = FF_NO_IDCT_PERM;
2664         } else { // accurate/default
2665             c->idct_put              = ff_simple_idct_put_8;
2666             c->idct_add              = ff_simple_idct_add_8;
2667             c->idct                  = ff_simple_idct_8;
2668             c->idct_permutation_type = FF_NO_IDCT_PERM;
2669         }
2670         }
2671     }
2672
2673     c->diff_pixels = diff_pixels_c;
2674
2675     c->put_pixels_clamped        = put_pixels_clamped_c;
2676     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2677     c->add_pixels_clamped        = add_pixels_clamped_c;
2678
2679     c->sum_abs_dctelem = sum_abs_dctelem_c;
2680
2681     c->gmc1 = gmc1_c;
2682     c->gmc  = ff_gmc_c;
2683
2684     c->pix_sum   = pix_sum_c;
2685     c->pix_norm1 = pix_norm1_c;
2686
2687     c->fill_block_tab[0] = fill_block16_c;
2688     c->fill_block_tab[1] = fill_block8_c;
2689
2690     /* TODO [0] 16  [1] 8 */
2691     c->pix_abs[0][0] = pix_abs16_c;
2692     c->pix_abs[0][1] = pix_abs16_x2_c;
2693     c->pix_abs[0][2] = pix_abs16_y2_c;
2694     c->pix_abs[0][3] = pix_abs16_xy2_c;
2695     c->pix_abs[1][0] = pix_abs8_c;
2696     c->pix_abs[1][1] = pix_abs8_x2_c;
2697     c->pix_abs[1][2] = pix_abs8_y2_c;
2698     c->pix_abs[1][3] = pix_abs8_xy2_c;
2699
2700 #define dspfunc(PFX, IDX, NUM)                              \
2701     c->PFX ## _pixels_tab[IDX][0]  = PFX ## NUM ## _mc00_c; \
2702     c->PFX ## _pixels_tab[IDX][1]  = PFX ## NUM ## _mc10_c; \
2703     c->PFX ## _pixels_tab[IDX][2]  = PFX ## NUM ## _mc20_c; \
2704     c->PFX ## _pixels_tab[IDX][3]  = PFX ## NUM ## _mc30_c; \
2705     c->PFX ## _pixels_tab[IDX][4]  = PFX ## NUM ## _mc01_c; \
2706     c->PFX ## _pixels_tab[IDX][5]  = PFX ## NUM ## _mc11_c; \
2707     c->PFX ## _pixels_tab[IDX][6]  = PFX ## NUM ## _mc21_c; \
2708     c->PFX ## _pixels_tab[IDX][7]  = PFX ## NUM ## _mc31_c; \
2709     c->PFX ## _pixels_tab[IDX][8]  = PFX ## NUM ## _mc02_c; \
2710     c->PFX ## _pixels_tab[IDX][9]  = PFX ## NUM ## _mc12_c; \
2711     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2712     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2713     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2714     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2715     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2716     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2717
2718     dspfunc(put_qpel, 0, 16);
2719     dspfunc(put_qpel, 1, 8);
2720
2721     dspfunc(put_no_rnd_qpel, 0, 16);
2722     dspfunc(put_no_rnd_qpel, 1, 8);
2723
2724     dspfunc(avg_qpel, 0, 16);
2725     dspfunc(avg_qpel, 1, 8);
2726
2727 #undef dspfunc
2728
2729     c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
2730     c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
2731     c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
2732     c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
2733     c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
2734     c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
2735     c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
2736     c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
2737
2738 #define SET_CMP_FUNC(name)                      \
2739     c->name[0] = name ## 16_c;                  \
2740     c->name[1] = name ## 8x8_c;
2741
2742     SET_CMP_FUNC(hadamard8_diff)
2743     c->hadamard8_diff[4] = hadamard8_intra16_c;
2744     c->hadamard8_diff[5] = hadamard8_intra8x8_c;
2745     SET_CMP_FUNC(dct_sad)
2746     SET_CMP_FUNC(dct_max)
2747 #if CONFIG_GPL
2748     SET_CMP_FUNC(dct264_sad)
2749 #endif
2750     c->sad[0] = pix_abs16_c;
2751     c->sad[1] = pix_abs8_c;
2752     c->sse[0] = sse16_c;
2753     c->sse[1] = sse8_c;
2754     c->sse[2] = sse4_c;
2755     SET_CMP_FUNC(quant_psnr)
2756     SET_CMP_FUNC(rd)
2757     SET_CMP_FUNC(bit)
2758     c->vsad[0] = vsad16_c;
2759     c->vsad[4] = vsad_intra16_c;
2760     c->vsad[5] = vsad_intra8_c;
2761     c->vsse[0] = vsse16_c;
2762     c->vsse[4] = vsse_intra16_c;
2763     c->vsse[5] = vsse_intra8_c;
2764     c->nsse[0] = nsse16_c;
2765     c->nsse[1] = nsse8_c;
2766 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2767     ff_dsputil_init_dwt(c);
2768 #endif
2769
2770     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2771
2772     c->add_bytes                      = add_bytes_c;
2773     c->add_hfyu_median_prediction     = add_hfyu_median_prediction_c;
2774     c->add_hfyu_left_prediction       = add_hfyu_left_prediction_c;
2775     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2776
2777     c->diff_bytes                 = diff_bytes_c;
2778     c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
2779
2780     c->bswap_buf   = bswap_buf;
2781     c->bswap16_buf = bswap16_buf;
2782
2783     c->try_8x8basis = try_8x8basis_c;
2784     c->add_8x8basis = add_8x8basis_c;
2785
2786     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2787
2788     c->scalarproduct_int16 = scalarproduct_int16_c;
2789     c->vector_clip_int32   = vector_clip_int32_c;
2790     c->vector_clipf        = vector_clipf_c;
2791
2792     c->shrink[0] = av_image_copy_plane;
2793     c->shrink[1] = ff_shrink22;
2794     c->shrink[2] = ff_shrink44;
2795     c->shrink[3] = ff_shrink88;
2796
2797     c->add_pixels8 = add_pixels8_c;
2798
2799 #undef FUNC
2800 #undef FUNCC
2801 #define FUNC(f,  depth) f ## _ ## depth
2802 #define FUNCC(f, depth) f ## _ ## depth ## _c
2803
2804     c->draw_edges = FUNCC(draw_edges, 8);
2805
2806     c->clear_block  = FUNCC(clear_block, 8);
2807     c->clear_blocks = FUNCC(clear_blocks, 8);
2808
2809 #define BIT_DEPTH_FUNCS(depth)                  \
2810     c->get_pixels = FUNCC(get_pixels, depth);
2811
2812     switch (avctx->bits_per_raw_sample) {
2813     case 9:
2814     case 10:
2815     case 12:
2816     case 14:
2817         BIT_DEPTH_FUNCS(16);
2818         break;
2819     default:
2820         if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
2821             BIT_DEPTH_FUNCS(8);
2822         }
2823         break;
2824     }
2825
2826
2827     if (ARCH_ALPHA)
2828         ff_dsputil_init_alpha(c, avctx);
2829     if (ARCH_ARM)
2830         ff_dsputil_init_arm(c, avctx, high_bit_depth);
2831     if (ARCH_BFIN)
2832         ff_dsputil_init_bfin(c, avctx, high_bit_depth);
2833     if (ARCH_PPC)
2834         ff_dsputil_init_ppc(c, avctx, high_bit_depth);
2835     if (ARCH_X86)
2836         ff_dsputil_init_x86(c, avctx, high_bit_depth);
2837
2838     ff_init_scantable_permutation(c->idct_permutation,
2839                                   c->idct_permutation_type);
2840 }
2841
2842 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2843 {
2844     ff_dsputil_init(c, avctx);
2845 }
2846
2847 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2848 {
2849     ff_dsputil_init(c, avctx);
2850 }