]> git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c
Merge commit 'ba71c74017c287681153ec8f6f1cba650d797275'
[ffmpeg] / libavcodec / dsputil.c
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24
25 /**
26  * @file
27  * DSP utils
28  */
29
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "libavutil/internal.h"
33 #include "avcodec.h"
34 #include "copy_block.h"
35 #include "dct.h"
36 #include "dsputil.h"
37 #include "simple_idct.h"
38 #include "faandct.h"
39 #include "faanidct.h"
40 #include "imgconvert.h"
41 #include "mathops.h"
42 #include "mpegvideo.h"
43 #include "config.h"
44 #include "diracdsp.h"
45
/**
 * 512-entry table of 32-bit values, all zero at definition time.
 * Users in this file address it as (ff_square_tab + 256) and index that
 * pointer with a signed byte difference.
 * NOTE(review): the code that fills in the entries is not visible here.
 */
uint32_t ff_square_tab[512] = { 0 };
47
48 #define BIT_DEPTH 16
49 #include "dsputilenc_template.c"
50 #undef BIT_DEPTH
51
52 #define BIT_DEPTH 8
53 #include "hpel_template.c"
54 #include "tpel_template.c"
55 #include "dsputil_template.c"
56 #include "dsputilenc_template.c"
57
/* Byte-replicated constants as wide as unsigned long: ~0UL / 0xff yields
 * 0x01 in every byte, which is then scaled by the byte value. The result is
 * 0x7f7f7f7f with a 32-bit long and 0x7f7f7f7f7f7f7f7f with a 64-bit long
 * (likewise for 0x80). */
#define pb_7f (~0UL / 0xffUL * 0x7fUL)
#define pb_80 (~0UL / 0xffUL * 0x80UL)
61
/**
 * Alternate horizontal scan order.
 * 64 entries, laid out here as 8 rows of 8; every value in 0..63 occurs
 * exactly once.
 */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17, /* entries  0.. 7 */
    10, 11,  4,  5,  6,  7, 15, 14, /* entries  8..15 */
    13, 12, 19, 18, 24, 25, 32, 33, /* entries 16..23 */
    26, 27, 20, 21, 22, 23, 28, 29, /* entries 24..31 */
    30, 31, 34, 35, 40, 41, 48, 49, /* entries 32..39 */
    42, 43, 36, 37, 38, 39, 44, 45, /* entries 40..47 */
    46, 47, 50, 51, 56, 57, 58, 59, /* entries 48..55 */
    52, 53, 54, 55, 60, 61, 62, 63, /* entries 56..63 */
};
72
/**
 * Alternate vertical scan order.
 * 64 entries, laid out here as 8 rows of 8; every value in 0..63 occurs
 * exactly once.
 */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10, /* entries  0.. 7 */
    17, 25, 32, 40, 48, 56, 57, 49, /* entries  8..15 */
    41, 33, 26, 18,  3, 11,  4, 12, /* entries 16..23 */
    19, 27, 34, 42, 50, 58, 35, 43, /* entries 24..31 */
    51, 59, 20, 28,  5, 13,  6, 14, /* entries 32..39 */
    21, 29, 36, 44, 52, 60, 37, 45, /* entries 40..47 */
    53, 61, 22, 30,  7, 15, 23, 31, /* entries 48..55 */
    38, 46, 54, 62, 39, 47, 55, 63, /* entries 56..63 */
};
83
/* Input permutation for the simple_idct_mmx.
 * 64 entries; every value in 0x00..0x3f occurs exactly once. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0c, 0x05, 0x0d,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1c, 0x15, 0x1d,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2c, 0x25, 0x2d,
    0x12, 0x1a, 0x16, 0x1b, 0x13, 0x1e, 0x17, 0x1f,
    0x02, 0x0a, 0x06, 0x0b, 0x03, 0x0e, 0x07, 0x0f,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3c, 0x35, 0x3d,
    0x22, 0x2a, 0x26, 0x2b, 0x23, 0x2e, 0x27, 0x2f,
    0x32, 0x3a, 0x36, 0x3b, 0x33, 0x3e, 0x37, 0x3f,
};
95
/* Per-row column permutation; ff_init_scantable_permutation() indexes it
 * with the low three bits of the coefficient index for FF_SSE2_IDCT_PERM. */
static const uint8_t idct_sse2_row_perm[8] = {
    0, 4, 1, 5,
    2, 6, 3, 7,
};
97
98 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
99                                const uint8_t *src_scantable)
100 {
101     int i, end;
102
103     st->scantable = src_scantable;
104
105     for (i = 0; i < 64; i++) {
106         int j = src_scantable[i];
107         st->permutated[i] = permutation[j];
108     }
109
110     end = -1;
111     for (i = 0; i < 64; i++) {
112         int j = st->permutated[i];
113         if (j > end)
114             end = j;
115         st->raster_end[i] = end;
116     }
117 }
118
119 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
120                                            int idct_permutation_type)
121 {
122     int i;
123
124     switch (idct_permutation_type) {
125     case FF_NO_IDCT_PERM:
126         for (i = 0; i < 64; i++)
127             idct_permutation[i] = i;
128         break;
129     case FF_LIBMPEG2_IDCT_PERM:
130         for (i = 0; i < 64; i++)
131             idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
132         break;
133     case FF_SIMPLE_IDCT_PERM:
134         for (i = 0; i < 64; i++)
135             idct_permutation[i] = simple_mmx_permutation[i];
136         break;
137     case FF_TRANSPOSE_IDCT_PERM:
138         for (i = 0; i < 64; i++)
139             idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
140         break;
141     case FF_PARTTRANS_IDCT_PERM:
142         for (i = 0; i < 64; i++)
143             idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
144         break;
145     case FF_SSE2_IDCT_PERM:
146         for (i = 0; i < 64; i++)
147             idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
148         break;
149     default:
150         av_log(NULL, AV_LOG_ERROR,
151                "Internal error, IDCT permutation not set\n");
152     }
153 }
154
/**
 * Sum all samples of a 16x16 block.
 *
 * @param pix       top-left sample of the block
 * @param line_size distance in bytes between the starts of two rows
 * @return sum of the 256 samples
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
175
176 static int pix_norm1_c(uint8_t *pix, int line_size)
177 {
178     int s = 0, i, j;
179     uint32_t *sq = ff_square_tab + 256;
180
181     for (i = 0; i < 16; i++) {
182         for (j = 0; j < 16; j += 8) {
183 #if 0
184             s += sq[pix[0]];
185             s += sq[pix[1]];
186             s += sq[pix[2]];
187             s += sq[pix[3]];
188             s += sq[pix[4]];
189             s += sq[pix[5]];
190             s += sq[pix[6]];
191             s += sq[pix[7]];
192 #else
193 #if HAVE_FAST_64BIT
194             register uint64_t x = *(uint64_t *) pix;
195             s += sq[x         & 0xff];
196             s += sq[(x >>  8) & 0xff];
197             s += sq[(x >> 16) & 0xff];
198             s += sq[(x >> 24) & 0xff];
199             s += sq[(x >> 32) & 0xff];
200             s += sq[(x >> 40) & 0xff];
201             s += sq[(x >> 48) & 0xff];
202             s += sq[(x >> 56) & 0xff];
203 #else
204             register uint32_t x = *(uint32_t *) pix;
205             s += sq[x         & 0xff];
206             s += sq[(x >>  8) & 0xff];
207             s += sq[(x >> 16) & 0xff];
208             s += sq[(x >> 24) & 0xff];
209             x  = *(uint32_t *) (pix + 4);
210             s += sq[x         & 0xff];
211             s += sq[(x >>  8) & 0xff];
212             s += sq[(x >> 16) & 0xff];
213             s += sq[(x >> 24) & 0xff];
214 #endif
215 #endif
216             pix += 8;
217         }
218         pix += line_size - 16;
219     }
220     return s;
221 }
222
/**
 * Byte-swap a buffer of 32-bit words.
 *
 * @param dst destination, w words are written
 * @param src source, w words are read
 * @param w   number of 32-bit words; nothing is done when w <= 0
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int n;

    /* Elements are processed strictly in ascending order. */
    for (n = 0; n < w; n++)
        dst[n] = av_bswap32(src[n]);
}
240
/**
 * Byte-swap a buffer of 16-bit words.
 *
 * @param dst destination, len words are written
 * @param src source, len words are read
 * @param len number of 16-bit words to process
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    for (; len; len--) {
        *dst = av_bswap16(*src);
        dst++;
        src++;
    }
}
246
247 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
248                   int line_size, int h)
249 {
250     int s = 0, i;
251     uint32_t *sq = ff_square_tab + 256;
252
253     for (i = 0; i < h; i++) {
254         s    += sq[pix1[0] - pix2[0]];
255         s    += sq[pix1[1] - pix2[1]];
256         s    += sq[pix1[2] - pix2[2]];
257         s    += sq[pix1[3] - pix2[3]];
258         pix1 += line_size;
259         pix2 += line_size;
260     }
261     return s;
262 }
263
264 static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
265                   int line_size, int h)
266 {
267     int s = 0, i;
268     uint32_t *sq = ff_square_tab + 256;
269
270     for (i = 0; i < h; i++) {
271         s    += sq[pix1[0] - pix2[0]];
272         s    += sq[pix1[1] - pix2[1]];
273         s    += sq[pix1[2] - pix2[2]];
274         s    += sq[pix1[3] - pix2[3]];
275         s    += sq[pix1[4] - pix2[4]];
276         s    += sq[pix1[5] - pix2[5]];
277         s    += sq[pix1[6] - pix2[6]];
278         s    += sq[pix1[7] - pix2[7]];
279         pix1 += line_size;
280         pix2 += line_size;
281     }
282     return s;
283 }
284
285 static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
286                    int line_size, int h)
287 {
288     int s = 0, i;
289     uint32_t *sq = ff_square_tab + 256;
290
291     for (i = 0; i < h; i++) {
292         s += sq[pix1[0]  - pix2[0]];
293         s += sq[pix1[1]  - pix2[1]];
294         s += sq[pix1[2]  - pix2[2]];
295         s += sq[pix1[3]  - pix2[3]];
296         s += sq[pix1[4]  - pix2[4]];
297         s += sq[pix1[5]  - pix2[5]];
298         s += sq[pix1[6]  - pix2[6]];
299         s += sq[pix1[7]  - pix2[7]];
300         s += sq[pix1[8]  - pix2[8]];
301         s += sq[pix1[9]  - pix2[9]];
302         s += sq[pix1[10] - pix2[10]];
303         s += sq[pix1[11] - pix2[11]];
304         s += sq[pix1[12] - pix2[12]];
305         s += sq[pix1[13] - pix2[13]];
306         s += sq[pix1[14] - pix2[14]];
307         s += sq[pix1[15] - pix2[15]];
308
309         pix1 += line_size;
310         pix2 += line_size;
311     }
312     return s;
313 }
314
315 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
316                           const uint8_t *s2, int stride)
317 {
318     int i;
319
320     /* read the pixels */
321     for (i = 0; i < 8; i++) {
322         block[0] = s1[0] - s2[0];
323         block[1] = s1[1] - s2[1];
324         block[2] = s1[2] - s2[2];
325         block[3] = s1[3] - s2[3];
326         block[4] = s1[4] - s2[4];
327         block[5] = s1[5] - s2[5];
328         block[6] = s1[6] - s2[6];
329         block[7] = s1[7] - s2[7];
330         s1      += stride;
331         s2      += stride;
332         block   += 8;
333     }
334 }
335
336 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
337                                  int line_size)
338 {
339     int i;
340
341     /* read the pixels */
342     for (i = 0; i < 8; i++) {
343         pixels[0] = av_clip_uint8(block[0]);
344         pixels[1] = av_clip_uint8(block[1]);
345         pixels[2] = av_clip_uint8(block[2]);
346         pixels[3] = av_clip_uint8(block[3]);
347         pixels[4] = av_clip_uint8(block[4]);
348         pixels[5] = av_clip_uint8(block[5]);
349         pixels[6] = av_clip_uint8(block[6]);
350         pixels[7] = av_clip_uint8(block[7]);
351
352         pixels += line_size;
353         block  += 8;
354     }
355 }
356
357 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
358                                  int line_size)
359 {
360     int i;
361
362     /* read the pixels */
363     for(i=0;i<4;i++) {
364         pixels[0] = av_clip_uint8(block[0]);
365         pixels[1] = av_clip_uint8(block[1]);
366         pixels[2] = av_clip_uint8(block[2]);
367         pixels[3] = av_clip_uint8(block[3]);
368
369         pixels += line_size;
370         block += 8;
371     }
372 }
373
374 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
375                                  int line_size)
376 {
377     int i;
378
379     /* read the pixels */
380     for(i=0;i<2;i++) {
381         pixels[0] = av_clip_uint8(block[0]);
382         pixels[1] = av_clip_uint8(block[1]);
383
384         pixels += line_size;
385         block += 8;
386     }
387 }
388
389 static void put_signed_pixels_clamped_c(const int16_t *block,
390                                         uint8_t *av_restrict pixels,
391                                         int line_size)
392 {
393     int i, j;
394
395     for (i = 0; i < 8; i++) {
396         for (j = 0; j < 8; j++) {
397             if (*block < -128)
398                 *pixels = 0;
399             else if (*block > 127)
400                 *pixels = 255;
401             else
402                 *pixels = (uint8_t) (*block + 128);
403             block++;
404             pixels++;
405         }
406         pixels += (line_size - 8);
407     }
408 }
409
410 static void add_pixels8_c(uint8_t *av_restrict pixels, int16_t *block,
411                           int line_size)
412 {
413     int i;
414
415     for (i = 0; i < 8; i++) {
416         pixels[0] += block[0];
417         pixels[1] += block[1];
418         pixels[2] += block[2];
419         pixels[3] += block[3];
420         pixels[4] += block[4];
421         pixels[5] += block[5];
422         pixels[6] += block[6];
423         pixels[7] += block[7];
424         pixels    += line_size;
425         block     += 8;
426     }
427 }
428
429 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
430                                  int line_size)
431 {
432     int i;
433
434     /* read the pixels */
435     for (i = 0; i < 8; i++) {
436         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
437         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
438         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
439         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
440         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
441         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
442         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
443         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
444         pixels   += line_size;
445         block    += 8;
446     }
447 }
448
449 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
450                           int line_size)
451 {
452     int i;
453
454     /* read the pixels */
455     for(i=0;i<4;i++) {
456         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
457         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
458         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
459         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
460         pixels += line_size;
461         block += 8;
462     }
463 }
464
465 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
466                           int line_size)
467 {
468     int i;
469
470     /* read the pixels */
471     for(i=0;i<2;i++) {
472         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
473         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
474         pixels += line_size;
475         block += 8;
476     }
477 }
478
/**
 * Sum FFABS() over the 64 coefficients of a block.
 *
 * @param block 64 coefficients
 * @return accumulated sum
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    const int16_t *coef      = block;
    const int16_t *const end = block + 64;
    int total = 0;

    while (coef < end) {
        total += FFABS(*coef);
        coef++;
    }
    return total;
}
487
/**
 * Fill a block that is 16 bytes wide with a constant value.
 *
 * @param block     top-left byte of the block
 * @param value     byte to store
 * @param line_size distance in bytes between rows
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 16);
        row += line_size;
    }
}
497
/**
 * Fill a block that is 8 bytes wide with a constant value.
 *
 * @param block     top-left byte of the block
 * @param value     byte to store
 * @param line_size distance in bytes between rows
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    uint8_t *row = block;
    int remaining;

    for (remaining = h; remaining > 0; remaining--) {
        memset(row, value, 8);
        row += line_size;
    }
}
507
/* Rounded-up averages of two and four values. Every argument is
 * parenthesised so that expressions containing operators of lower
 * precedence than '+' (such as |, & or ?:) expand as intended. Arguments
 * are still evaluated exactly once each. */
#define avg2(a, b) (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
510
/**
 * Bilinear interpolation of a block 8 samples wide at a fixed fractional
 * offset given in 1/16 sample units.
 *
 * Each output sample is a weighted sum of the source sample, its right
 * neighbour, the sample one row below and that sample's right neighbour;
 * the four weights add up to 256 and the sum is shifted right by 8 after
 * adding the rounder.
 *
 * @param dst     destination block
 * @param src     source block; 9 columns and h + 1 rows are read
 * @param stride  distance in bytes between rows, shared by dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal fraction, weight of the right-hand samples
 * @param y16     vertical fraction, weight of the lower samples
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int inv_x = 16 - x16;
    const int inv_y = 16 - y16;
    const int w_tl  = inv_x * inv_y; /* top-left     */
    const int w_tr  = x16   * inv_y; /* top-right    */
    const int w_bl  = inv_x * y16;   /* bottom-left  */
    const int w_br  = x16   * y16;   /* bottom-right */
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++)
            dst[col] = (w_tl * src[col]   + w_tr * src[col + 1] +
                        w_bl * below[col] + w_br * below[col + 1] +
                        rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
533
/**
 * Motion compensation of a block 8 samples wide with a per-sample source
 * position that changes linearly across the block.
 *
 * For every output sample the source position (vx, vy) is shifted right by
 * 16; the low `shift` bits of the result are the interpolation fraction and
 * the remaining bits the integer sample position. Positions that fall inside
 * [0, width - 1) x [0, height - 1) are bilinearly interpolated. When one
 * coordinate is outside it is clipped and only the other direction is
 * interpolated; when both are outside the clipped sample is copied.
 *
 * @param dst      destination block
 * @param src      source picture
 * @param stride   distance in bytes between rows, shared by dst and src
 * @param h        number of rows to produce
 * @param ox, oy   source position of the first sample of the first row
 * @param dxx, dyx increments of vx / vy per output column
 * @param dxy, dyy increments of ox / oy per output row
 * @param shift    number of fractional bits left after the >> 16
 * @param r        value added before the final >> (2 * shift)
 * @param width    source width used for the bounds checks
 * @param height   source height used for the bounds checks
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
{
    const int s      = 1 << shift;
    const int mask   = s - 1;
    const int last_x = width  - 1;
    const int last_y = height - 1;
    int row;

    for (row = 0; row < h; row++) {
        uint8_t *out = dst + row * stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) { // FIXME: optimize
            const int pos_x  = vx >> 16;
            const int pos_y  = vy >> 16;
            const int frac_x = pos_x & mask;
            const int frac_y = pos_y & mask;
            const int src_x  = pos_x >> shift;
            const int src_y  = pos_y >> shift;
            /* The unsigned compare also rejects negative coordinates. */
            const int x_inside = (unsigned) src_x < last_x;
            const int y_inside = (unsigned) src_y < last_y;
            int index;

            if (x_inside && y_inside) {
                /* Full bilinear interpolation. */
                index    = src_x + src_y * stride;
                out[col] = ((src[index]              * (s - frac_x) +
                             src[index + 1]          * frac_x) * (s - frac_y) +
                            (src[index + stride]     * (s - frac_x) +
                             src[index + stride + 1] * frac_x) * frac_y +
                            r) >> (shift * 2);
            } else if (x_inside) {
                /* Row clipped: interpolate horizontally only. */
                index    = src_x + av_clip(src_y, 0, last_y) * stride;
                out[col] = ((src[index]     * (s - frac_x) +
                             src[index + 1] * frac_x) * s +
                            r) >> (shift * 2);
            } else if (y_inside) {
                /* Column clipped: interpolate vertically only. */
                index    = av_clip(src_x, 0, last_x) + src_y * stride;
                out[col] = ((src[index]          * (s - frac_y) +
                             src[index + stride] * frac_y) * s +
                            r) >> (shift * 2);
            } else {
                /* Both clipped: plain copy. */
                index    = av_clip(src_x, 0, last_x) +
                           av_clip(src_y, 0, last_y) * stride;
                out[col] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
596
597 #define QPEL_MC(r, OPNAME, RND, OP)                                           \
598 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src,       \
599                                             int dstStride, int srcStride,     \
600                                             int h)                            \
601 {                                                                             \
602     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
603     int i;                                                                    \
604                                                                               \
605     for (i = 0; i < h; i++) {                                                 \
606         OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
607         OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
608         OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
609         OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
610         OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
611         OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
612         OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
613         OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
614         dst += dstStride;                                                     \
615         src += srcStride;                                                     \
616     }                                                                         \
617 }                                                                             \
618                                                                               \
619 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src,       \
620                                             int dstStride, int srcStride)     \
621 {                                                                             \
622     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
623     const int w = 8;                                                          \
624     int i;                                                                    \
625                                                                               \
626     for (i = 0; i < w; i++) {                                                 \
627         const int src0 = src[0 * srcStride];                                  \
628         const int src1 = src[1 * srcStride];                                  \
629         const int src2 = src[2 * srcStride];                                  \
630         const int src3 = src[3 * srcStride];                                  \
631         const int src4 = src[4 * srcStride];                                  \
632         const int src5 = src[5 * srcStride];                                  \
633         const int src6 = src[6 * srcStride];                                  \
634         const int src7 = src[7 * srcStride];                                  \
635         const int src8 = src[8 * srcStride];                                  \
636         OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
637         OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
638         OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
639         OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
640         OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
641         OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
642         OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
643         OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
644         dst++;                                                                \
645         src++;                                                                \
646     }                                                                         \
647 }                                                                             \
648                                                                               \
649 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src,      \
650                                              int dstStride, int srcStride,    \
651                                              int h)                           \
652 {                                                                             \
653     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
654     int i;                                                                    \
655                                                                               \
656     for (i = 0; i < h; i++) {                                                 \
657         OP(dst[0],  (src[0]  + src[1])  * 20 - (src[0]  + src[2])  * 6 + (src[1]  + src[3])  * 3 - (src[2]  + src[4]));  \
658         OP(dst[1],  (src[1]  + src[2])  * 20 - (src[0]  + src[3])  * 6 + (src[0]  + src[4])  * 3 - (src[1]  + src[5]));  \
659         OP(dst[2],  (src[2]  + src[3])  * 20 - (src[1]  + src[4])  * 6 + (src[0]  + src[5])  * 3 - (src[0]  + src[6]));  \
660         OP(dst[3],  (src[3]  + src[4])  * 20 - (src[2]  + src[5])  * 6 + (src[1]  + src[6])  * 3 - (src[0]  + src[7]));  \
661         OP(dst[4],  (src[4]  + src[5])  * 20 - (src[3]  + src[6])  * 6 + (src[2]  + src[7])  * 3 - (src[1]  + src[8]));  \
662         OP(dst[5],  (src[5]  + src[6])  * 20 - (src[4]  + src[7])  * 6 + (src[3]  + src[8])  * 3 - (src[2]  + src[9]));  \
663         OP(dst[6],  (src[6]  + src[7])  * 20 - (src[5]  + src[8])  * 6 + (src[4]  + src[9])  * 3 - (src[3]  + src[10])); \
664         OP(dst[7],  (src[7]  + src[8])  * 20 - (src[6]  + src[9])  * 6 + (src[5]  + src[10]) * 3 - (src[4]  + src[11])); \
665         OP(dst[8],  (src[8]  + src[9])  * 20 - (src[7]  + src[10]) * 6 + (src[6]  + src[11]) * 3 - (src[5]  + src[12])); \
666         OP(dst[9],  (src[9]  + src[10]) * 20 - (src[8]  + src[11]) * 6 + (src[7]  + src[12]) * 3 - (src[6]  + src[13])); \
667         OP(dst[10], (src[10] + src[11]) * 20 - (src[9]  + src[12]) * 6 + (src[8]  + src[13]) * 3 - (src[7]  + src[14])); \
668         OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9]  + src[14]) * 3 - (src[8]  + src[15])); \
669         OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9]  + src[16])); \
670         OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
671         OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
672         OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
673         dst += dstStride;                                                     \
674         src += srcStride;                                                     \
675     }                                                                         \
676 }                                                                             \
677                                                                               \
678 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src,      \
679                                              int dstStride, int srcStride)    \
680 {                                                                             \
681     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
682     const int w = 16;                                                         \
683     int i;                                                                    \
684                                                                               \
685     for (i = 0; i < w; i++) {                                                 \
686         const int src0  = src[0  * srcStride];                                \
687         const int src1  = src[1  * srcStride];                                \
688         const int src2  = src[2  * srcStride];                                \
689         const int src3  = src[3  * srcStride];                                \
690         const int src4  = src[4  * srcStride];                                \
691         const int src5  = src[5  * srcStride];                                \
692         const int src6  = src[6  * srcStride];                                \
693         const int src7  = src[7  * srcStride];                                \
694         const int src8  = src[8  * srcStride];                                \
695         const int src9  = src[9  * srcStride];                                \
696         const int src10 = src[10 * srcStride];                                \
697         const int src11 = src[11 * srcStride];                                \
698         const int src12 = src[12 * srcStride];                                \
699         const int src13 = src[13 * srcStride];                                \
700         const int src14 = src[14 * srcStride];                                \
701         const int src15 = src[15 * srcStride];                                \
702         const int src16 = src[16 * srcStride];                                \
703         OP(dst[0  * dstStride], (src0  + src1)  * 20 - (src0  + src2)  * 6 + (src1  + src3)  * 3 - (src2  + src4));  \
704         OP(dst[1  * dstStride], (src1  + src2)  * 20 - (src0  + src3)  * 6 + (src0  + src4)  * 3 - (src1  + src5));  \
705         OP(dst[2  * dstStride], (src2  + src3)  * 20 - (src1  + src4)  * 6 + (src0  + src5)  * 3 - (src0  + src6));  \
706         OP(dst[3  * dstStride], (src3  + src4)  * 20 - (src2  + src5)  * 6 + (src1  + src6)  * 3 - (src0  + src7));  \
707         OP(dst[4  * dstStride], (src4  + src5)  * 20 - (src3  + src6)  * 6 + (src2  + src7)  * 3 - (src1  + src8));  \
708         OP(dst[5  * dstStride], (src5  + src6)  * 20 - (src4  + src7)  * 6 + (src3  + src8)  * 3 - (src2  + src9));  \
709         OP(dst[6  * dstStride], (src6  + src7)  * 20 - (src5  + src8)  * 6 + (src4  + src9)  * 3 - (src3  + src10)); \
710         OP(dst[7  * dstStride], (src7  + src8)  * 20 - (src6  + src9)  * 6 + (src5  + src10) * 3 - (src4  + src11)); \
711         OP(dst[8  * dstStride], (src8  + src9)  * 20 - (src7  + src10) * 6 + (src6  + src11) * 3 - (src5  + src12)); \
712         OP(dst[9  * dstStride], (src9  + src10) * 20 - (src8  + src11) * 6 + (src7  + src12) * 3 - (src6  + src13)); \
713         OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9  + src12) * 6 + (src8  + src13) * 3 - (src7  + src14)); \
714         OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9  + src14) * 3 - (src8  + src15)); \
715         OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9  + src16)); \
716         OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
717         OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
718         OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
719         dst++;                                                                \
720         src++;                                                                \
721     }                                                                         \
722 }                                                                             \
723                                                                               \
724 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src,                \
725                                    ptrdiff_t stride)                          \
726 { /* Combines src and its h_lowpass (half) into dst via pixels8_l2_8. */      \
727     uint8_t half[64];       /* h_lowpass of src: 8 rows, pitch 8 */           \
728                                                                               \
729     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);             \
730     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);             \
731 }                                                                             \
732                                                                               \
733 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src,                \
734                                    ptrdiff_t stride)                          \
735 { /* Runs the OPNAME h_lowpass from src straight into dst, 8 rows. */         \
736     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);             \
737 }                                                                             \
738                                                                               \
739 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src,                \
740                                    ptrdiff_t stride)                          \
741 { /* Like mc10, but pixels8_l2_8 pairs half with src + 1, not src. */         \
742     uint8_t half[64];       /* h_lowpass of src: 8 rows, pitch 8 */           \
743                                                                               \
744     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);             \
745     OPNAME ## pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8);         \
746 }                                                                             \
747                                                                               \
748 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src,                \
749                                    ptrdiff_t stride)                          \
750 { /* Combines full and its v_lowpass (half) into dst via pixels8_l2_8. */     \
751     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
752     uint8_t half[64];       /* v_lowpass of full: 8 rows, pitch 8 */          \
753                                                                               \
754     copy_block9(full, src, 16, stride, 9);                                    \
755     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);                   \
756     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);                \
757 }                                                                             \
758                                                                               \
759 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src,                \
760                                    ptrdiff_t stride)                          \
761 { /* Runs the OPNAME v_lowpass from the copy in full straight into dst. */    \
762     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
763                                                                               \
764     copy_block9(full, src, 16, stride, 9);                                    \
765     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);                   \
766 }                                                                             \
767                                                                               \
768 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src,                \
769                                    ptrdiff_t stride)                          \
770 { /* Like mc01, but pixels8_l2_8 pairs half with full + 16 (next row). */     \
771     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
772     uint8_t half[64];       /* v_lowpass of full: 8 rows, pitch 8 */          \
773                                                                               \
774     copy_block9(full, src, 16, stride, 9);                                    \
775     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);                   \
776     OPNAME ## pixels8_l2_8(dst, full + 16, half, stride, 16, 8, 8);           \
777 }                                                                             \
778                                                                               \
779 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src,            \
780                                        ptrdiff_t stride)                      \
781 { /* Combines full, halfH, halfV and halfHV into dst via pixels8_l4_8. */     \
782     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
783     uint8_t halfH[72];      /* h_lowpass of full: 9 rows, pitch 8 */          \
784     uint8_t halfV[64];      /* v_lowpass of full: 8 rows, pitch 8 */          \
785     uint8_t halfHV[64];     /* v_lowpass of halfH: 8 rows, pitch 8 */         \
786                                                                               \
787     copy_block9(full, src, 16, stride, 9);                                    \
788     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
789     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
790     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
791     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV,                   \
792                            stride, 16, 8, 8, 8, 8);                           \
793 }                                                                             \
794                                                                               \
795 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src,                \
796                                    ptrdiff_t stride)                          \
797 { /* Combines halfH and halfHV into dst via OPNAME pixels8_l2_8. */           \
798     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
799     uint8_t halfH[72];      /* h_lowpass of full, l2 with full */             \
800     uint8_t halfHV[64];     /* v_lowpass of halfH: 8 rows, pitch 8 */         \
801                                                                               \
802     copy_block9(full, src, 16, stride, 9);                                    \
803     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
804     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
805     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
806     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
807 }                                                                             \
808                                                                               \
809 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src,            \
810                                        ptrdiff_t stride)                      \
811 { /* Like mc11_old, but halfV and the l4 input both start at full + 1. */     \
812     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
813     uint8_t halfH[72];      /* h_lowpass of full: 9 rows, pitch 8 */          \
814     uint8_t halfV[64];      /* v_lowpass of full + 1: 8 rows, pitch 8 */      \
815     uint8_t halfHV[64];     /* v_lowpass of halfH: 8 rows, pitch 8 */         \
816                                                                               \
817     copy_block9(full, src, 16, stride, 9);                                    \
818     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
819     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
820     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
821     OPNAME ## pixels8_l4_8(dst, full + 1, halfH, halfV, halfHV,               \
822                            stride, 16, 8, 8, 8, 8);                           \
823 }                                                                             \
824                                                                               \
825 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src,                \
826                                    ptrdiff_t stride)                          \
827 { /* Like mc11, but halfH is l2-combined with full + 1 instead of full. */    \
828     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
829     uint8_t halfH[72];      /* h_lowpass of full, l2 with full + 1 */         \
830     uint8_t halfHV[64];     /* v_lowpass of halfH: 8 rows, pitch 8 */         \
831                                                                               \
832     copy_block9(full, src, 16, stride, 9);                                    \
833     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
834     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
835     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
836     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
837 }                                                                             \
838                                                                               \
839 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src,            \
840                                        ptrdiff_t stride)                      \
841 { /* Like mc11_old, but l4 reads full + 16 and halfH + 8 (next rows). */      \
842     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
843     uint8_t halfH[72];      /* h_lowpass of full: 9 rows, pitch 8 */          \
844     uint8_t halfV[64];      /* v_lowpass of full: 8 rows, pitch 8 */          \
845     uint8_t halfHV[64];     /* v_lowpass of halfH: 8 rows, pitch 8 */         \
846                                                                               \
847     copy_block9(full, src, 16, stride, 9);                                    \
848     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
849     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
850     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
851     OPNAME ## pixels8_l4_8(dst, full + 16, halfH + 8, halfV, halfHV,          \
852                            stride, 16, 8, 8, 8, 8);                           \
853 }                                                                             \
854                                                                               \
855 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src,                \
856                                    ptrdiff_t stride)                          \
857 { /* Like mc11, but the final l2 reads halfH + 8 (halfH's next row). */       \
858     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
859     uint8_t halfH[72];      /* h_lowpass of full, l2 with full */             \
860     uint8_t halfHV[64];     /* v_lowpass of halfH: 8 rows, pitch 8 */         \
861                                                                               \
862     copy_block9(full, src, 16, stride, 9);                                    \
863     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
864     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
865     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
866     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
867 }                                                                             \
868                                                                               \
869 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src,            \
870                                        ptrdiff_t stride)                      \
871 { /* Like mc31_old, but l4 reads full + 17 and halfH + 8 instead. */          \
872     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
873     uint8_t halfH[72];      /* h_lowpass of full: 9 rows, pitch 8 */          \
874     uint8_t halfV[64];      /* v_lowpass of full + 1: 8 rows, pitch 8 */      \
875     uint8_t halfHV[64];     /* v_lowpass of halfH: 8 rows, pitch 8 */         \
876                                                                               \
877     copy_block9(full, src, 16, stride, 9);                                    \
878     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
879     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
880     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
881     OPNAME ## pixels8_l4_8(dst, full + 17, halfH + 8, halfV, halfHV,          \
882                            stride, 16, 8, 8, 8, 8);                           \
883 }                                                                             \
884                                                                               \
885 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src,                \
886                                    ptrdiff_t stride)                          \
887 { /* Like mc31, but the final l2 reads halfH + 8 (halfH's next row). */       \
888     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
889     uint8_t halfH[72];      /* h_lowpass of full, l2 with full + 1 */         \
890     uint8_t halfHV[64];     /* v_lowpass of halfH: 8 rows, pitch 8 */         \
891                                                                               \
892     copy_block9(full, src, 16, stride, 9);                                    \
893     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
894     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
895     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
896     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
897 }                                                                             \
898                                                                               \
899 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src,                \
900                                    ptrdiff_t stride)                          \
901 { /* Combines halfH and halfHV into dst; src is filtered without a copy. */   \
902     uint8_t halfH[72];      /* h_lowpass of src: 9 rows, pitch 8 */           \
903     uint8_t halfHV[64];     /* v_lowpass of halfH: 8 rows, pitch 8 */         \
904                                                                               \
905     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
906     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
907     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
908 }                                                                             \
909                                                                               \
910 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src,                \
911                                    ptrdiff_t stride)                          \
912 { /* Like mc21, but the final l2 reads halfH + 8 (halfH's next row). */       \
913     uint8_t halfH[72];      /* h_lowpass of src: 9 rows, pitch 8 */           \
914     uint8_t halfHV[64];     /* v_lowpass of halfH: 8 rows, pitch 8 */         \
915                                                                               \
916     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
917     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
918     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
919 }                                                                             \
920                                                                               \
921 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src,            \
922                                        ptrdiff_t stride)                      \
923 { /* Combines halfV and halfHV into dst via OPNAME pixels8_l2_8. */           \
924     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
925     uint8_t halfH[72];      /* h_lowpass of full: 9 rows, pitch 8 */          \
926     uint8_t halfV[64];      /* v_lowpass of full: 8 rows, pitch 8 */          \
927     uint8_t halfHV[64];     /* v_lowpass of halfH: 8 rows, pitch 8 */         \
928                                                                               \
929     copy_block9(full, src, 16, stride, 9);                                    \
930     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
931     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
932     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
933     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);              \
934 }                                                                             \
935                                                                               \
936 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src,                \
937                                    ptrdiff_t stride)                          \
938 { /* Runs the OPNAME v_lowpass over halfH straight into dst. */               \
939     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
940     uint8_t halfH[72];      /* h_lowpass of full, l2 with full */             \
941                                                                               \
942     copy_block9(full, src, 16, stride, 9);                                    \
943     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
944     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
945     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
946 }                                                                             \
947                                                                               \
948 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src,            \
949                                        ptrdiff_t stride)                      \
950 { /* Like mc12_old, but halfV is filtered from full + 1, not full. */         \
951     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
952     uint8_t halfH[72];      /* h_lowpass of full: 9 rows, pitch 8 */          \
953     uint8_t halfV[64];      /* v_lowpass of full + 1: 8 rows, pitch 8 */      \
954     uint8_t halfHV[64];     /* v_lowpass of halfH: 8 rows, pitch 8 */         \
955                                                                               \
956     copy_block9(full, src, 16, stride, 9);                                    \
957     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
958     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
959     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
960     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);              \
961 }                                                                             \
962                                                                               \
963 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src,                \
964                                    ptrdiff_t stride)                          \
965 { /* Like mc12, but halfH is l2-combined with full + 1 instead of full. */    \
966     uint8_t full[16 * 9];   /* 9 rows copied from src, pitch 16 */            \
967     uint8_t halfH[72];      /* h_lowpass of full, l2 with full + 1 */         \
968                                                                               \
969     copy_block9(full, src, 16, stride, 9);                                    \
970     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
971     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
972     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
973 }                                                                             \
974                                                                               \
975 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src,                \
976                                    ptrdiff_t stride)                          \
977 { /* Runs the OPNAME v_lowpass over halfH = h_lowpass(src) into dst. */       \
978     uint8_t halfH[72];      /* h_lowpass of src: 9 rows, pitch 8 */           \
979                                                                               \
980     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
981     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
982 }                                                                             \
983                                                                               \
984 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src,               \
985                                     ptrdiff_t stride)                         \
986 { /* Combines src and its h_lowpass (half) into dst via pixels16_l2_8. */     \
987     uint8_t half[256];      /* h_lowpass of src: 16 rows, pitch 16 */         \
988                                                                               \
989     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
990     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);          \
991 }                                                                             \
992                                                                               \
993 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src,               \
994                                     ptrdiff_t stride)                         \
995 { /* Runs the OPNAME h_lowpass from src straight into dst, 16 rows. */        \
996     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);           \
997 }                                                                             \
998                                                                               \
999 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src,               \
1000                                     ptrdiff_t stride)                         \
1001 { /* Like mc10, but pixels16_l2_8 pairs half with src + 1, not src. */        \
1002     uint8_t half[256];      /* h_lowpass of src: 16 rows, pitch 16 */         \
1003                                                                               \
1004     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
1005     OPNAME ## pixels16_l2_8(dst, src + 1, half, stride, stride, 16, 16);      \
1006 }                                                                             \
1007                                                                               \
1008 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src,               \
1009                                     ptrdiff_t stride)                         \
1010 { /* Combines full and its v_lowpass (half) into dst via pixels16_l2_8. */    \
1011     uint8_t full[24 * 17];  /* 17 rows copied from src, pitch 24 */           \
1012     uint8_t half[256];      /* v_lowpass of full: 16 rows, pitch 16 */        \
1013                                                                               \
1014     copy_block17(full, src, 24, stride, 17);                                  \
1015     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
1016     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);             \
1017 }                                                                             \
1018                                                                               \
1019 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src,               \
1020                                     ptrdiff_t stride)                         \
1021 { /* Runs the OPNAME v_lowpass from the copy in full straight into dst. */    \
1022     uint8_t full[24 * 17];  /* 17 rows copied from src, pitch 24 */           \
1023                                                                               \
1024     copy_block17(full, src, 24, stride, 17);                                  \
1025     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);                  \
1026 }                                                                             \
1027                                                                               \
1028 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src,               \
1029                                     ptrdiff_t stride)                         \
1030 { /* Like mc01, but pixels16_l2_8 pairs half with full + 24 (next row). */    \
1031     uint8_t full[24 * 17];  /* 17 rows copied from src, pitch 24 */           \
1032     uint8_t half[256];      /* v_lowpass of full: 16 rows, pitch 16 */        \
1033                                                                               \
1034     copy_block17(full, src, 24, stride, 17);                                  \
1035     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
1036     OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16);        \
1037 }                                                                             \
1038                                                                               \
1039 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src,           \
1040                                         ptrdiff_t stride)                     \
1041 { /* Combines full, halfH, halfV and halfHV into dst via pixels16_l4_8. */    \
1042     uint8_t full[24 * 17];  /* 17 rows copied from src, pitch 24 */           \
1043     uint8_t halfH[272];     /* h_lowpass of full: 17 rows, pitch 16 */        \
1044     uint8_t halfV[256];     /* v_lowpass of full: 16 rows, pitch 16 */        \
1045     uint8_t halfHV[256];    /* v_lowpass of halfH: 16 rows, pitch 16 */       \
1046                                                                               \
1047     copy_block17(full, src, 24, stride, 17);                                  \
1048     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1049     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1050     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1051     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV,                  \
1052                             stride, 24, 16, 16, 16, 16);                      \
1053 }                                                                             \
1054                                                                               \
1055 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src,               \
1056                                     ptrdiff_t stride)                         \
1057 { /* Combines halfH and halfHV into dst via OPNAME pixels16_l2_8. */          \
1058     uint8_t full[24 * 17];  /* 17 rows copied from src, pitch 24 */           \
1059     uint8_t halfH[272];     /* h_lowpass of full, l2 with full */             \
1060     uint8_t halfHV[256];    /* v_lowpass of halfH: 16 rows, pitch 16 */       \
1061                                                                               \
1062     copy_block17(full, src, 24, stride, 17);                                  \
1063     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1064     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1065     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1066     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1067 }                                                                             \
1068                                                                               \
1069 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src,           \
1070                                         ptrdiff_t stride)                     \
1071 { /* Like mc11_old, but halfV and the l4 input both start at full + 1. */     \
1072     uint8_t full[24 * 17];  /* 17 rows copied from src, pitch 24 */           \
1073     uint8_t halfH[272];     /* h_lowpass of full: 17 rows, pitch 16 */        \
1074     uint8_t halfV[256];     /* v_lowpass of full + 1: 16 rows, pitch 16 */    \
1075     uint8_t halfHV[256];    /* v_lowpass of halfH: 16 rows, pitch 16 */       \
1076                                                                               \
1077     copy_block17(full, src, 24, stride, 17);                                  \
1078     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1079     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1080     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1081     OPNAME ## pixels16_l4_8(dst, full + 1, halfH, halfV, halfHV,              \
1082                             stride, 24, 16, 16, 16, 16);                      \
1083 }                                                                             \
1084                                                                               \
1085 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src,               \
1086                                     ptrdiff_t stride)                         \
1087 { /* Like mc11, but halfH is l2-combined with full + 1 instead of full. */    \
1088     uint8_t full[24 * 17];  /* 17 rows copied from src, pitch 24 */           \
1089     uint8_t halfH[272];     /* h_lowpass of full, l2 with full + 1 */         \
1090     uint8_t halfHV[256];    /* v_lowpass of halfH: 16 rows, pitch 16 */       \
1091                                                                               \
1092     copy_block17(full, src, 24, stride, 17);                                  \
1093     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1094     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1095     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1096     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1097 }                                                                             \
1098                                                                               \
1099 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src,           \
1100                                         ptrdiff_t stride)                     \
1101 {                                                                             \
1102     uint8_t full[24 * 17];                                                    \
1103     uint8_t halfH[272];                                                       \
1104     uint8_t halfV[256];                                                       \
1105     uint8_t halfHV[256];                                                      \
1106                                                                               \
1107     copy_block17(full, src, 24, stride, 17);                                  \
1108     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1109     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1110     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1111     OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV,        \
1112                             stride, 24, 16, 16, 16, 16);                      \
1113 }                                                                             \
1114                                                                               \
1115 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src,               \
1116                                     ptrdiff_t stride)                         \
1117 {                                                                             \
1118     uint8_t full[24 * 17];                                                    \
1119     uint8_t halfH[272];                                                       \
1120     uint8_t halfHV[256];                                                      \
1121                                                                               \
1122     copy_block17(full, src, 24, stride, 17);                                  \
1123     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1124     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1125     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1126     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1127 }                                                                             \
1128                                                                               \
1129 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src,           \
1130                                         ptrdiff_t stride)                     \
1131 {                                                                             \
1132     uint8_t full[24 * 17];                                                    \
1133     uint8_t halfH[272];                                                       \
1134     uint8_t halfV[256];                                                       \
1135     uint8_t halfHV[256];                                                      \
1136                                                                               \
1137     copy_block17(full, src, 24, stride, 17);                                  \
1138     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1139     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1140     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1141     OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV,        \
1142                             stride, 24, 16, 16, 16, 16);                      \
1143 }                                                                             \
1144                                                                               \
1145 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src,               \
1146                                     ptrdiff_t stride)                         \
1147 {                                                                             \
1148     uint8_t full[24 * 17];                                                    \
1149     uint8_t halfH[272];                                                       \
1150     uint8_t halfHV[256];                                                      \
1151                                                                               \
1152     copy_block17(full, src, 24, stride, 17);                                  \
1153     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1154     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1155     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1156     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1157 }                                                                             \
1158                                                                               \
1159 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src,               \
1160                                     ptrdiff_t stride)                         \
1161 {                                                                             \
1162     uint8_t halfH[272];                                                       \
1163     uint8_t halfHV[256];                                                      \
1164                                                                               \
1165     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1166     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1167     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1168 }                                                                             \
1169                                                                               \
1170 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src,               \
1171                                     ptrdiff_t stride)                         \
1172 {                                                                             \
1173     uint8_t halfH[272];                                                       \
1174     uint8_t halfHV[256];                                                      \
1175                                                                               \
1176     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1177     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1178     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1179 }                                                                             \
1180                                                                               \
1181 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src,           \
1182                                         ptrdiff_t stride)                     \
1183 {                                                                             \
1184     uint8_t full[24 * 17];                                                    \
1185     uint8_t halfH[272];                                                       \
1186     uint8_t halfV[256];                                                       \
1187     uint8_t halfHV[256];                                                      \
1188                                                                               \
1189     copy_block17(full, src, 24, stride, 17);                                  \
1190     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1191     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1192     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1193     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1194 }                                                                             \
1195                                                                               \
1196 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src,               \
1197                                     ptrdiff_t stride)                         \
1198 {                                                                             \
1199     uint8_t full[24 * 17];                                                    \
1200     uint8_t halfH[272];                                                       \
1201                                                                               \
1202     copy_block17(full, src, 24, stride, 17);                                  \
1203     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1204     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1205     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1206 }                                                                             \
1207                                                                               \
1208 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src,           \
1209                                         ptrdiff_t stride)                     \
1210 {                                                                             \
1211     uint8_t full[24 * 17];                                                    \
1212     uint8_t halfH[272];                                                       \
1213     uint8_t halfV[256];                                                       \
1214     uint8_t halfHV[256];                                                      \
1215                                                                               \
1216     copy_block17(full, src, 24, stride, 17);                                  \
1217     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1218     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1219     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1220     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1221 }                                                                             \
1222                                                                               \
1223 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src,               \
1224                                     ptrdiff_t stride)                         \
1225 {                                                                             \
1226     uint8_t full[24 * 17];                                                    \
1227     uint8_t halfH[272];                                                       \
1228                                                                               \
1229     copy_block17(full, src, 24, stride, 17);                                  \
1230     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1231     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1232     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1233 }                                                                             \
1234                                                                               \
1235 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src,               \
1236                                     ptrdiff_t stride)                         \
1237 {                                                                             \
1238     uint8_t halfH[272];                                                       \
1239                                                                               \
1240     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1241     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1242 }
1243
1244 #define op_avg(a, b)        a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
1245 #define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5])     >> 1)
1246 #define op_put(a, b)        a = cm[((b) + 16) >> 5]
1247 #define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
1248
1249 QPEL_MC(0, put_, _, op_put)
1250 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1251 QPEL_MC(0, avg_, _, op_avg)
1252
1253 #undef op_avg
1254 #undef op_put
1255 #undef op_put_no_rnd
1256
/**
 * Fixed-size 8x8 entry point: forwards to put_pixels8_8_c() with the
 * height hard-wired to 8 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride value passed through unchanged as the line stride
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}
1261
/**
 * Fixed-size 8x8 entry point: forwards to avg_pixels8_8_c() with the
 * height hard-wired to 8 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride value passed through unchanged as the line stride
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}
1266
/**
 * Fixed-size 16x16 entry point: forwards to put_pixels16_8_c() with the
 * height hard-wired to 16 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride value passed through unchanged as the line stride
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}
1271
/**
 * Fixed-size 16x16 entry point: forwards to avg_pixels16_8_c() with the
 * height hard-wired to 16 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride value passed through unchanged as the line stride
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
1276
/*
 * The mc00 (no sub-pel offset) qpel names are plain aliases of the
 * fixed-size copy/average entry points. The no_rnd variants map to the
 * same put functions as the rounding ones.
 */
/* 8x8 blocks */
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c
/* 16x16 blocks */
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
1283
1284 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
1285                                   int dstStride, int srcStride, int h)
1286 {
1287     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1288     int i;
1289
1290     for (i = 0; i < h; i++) {
1291         dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
1292         dst[1] = cm[(9 * (src[1] + src[2]) - (src[0]  + src[3]) + 8) >> 4];
1293         dst[2] = cm[(9 * (src[2] + src[3]) - (src[1]  + src[4]) + 8) >> 4];
1294         dst[3] = cm[(9 * (src[3] + src[4]) - (src[2]  + src[5]) + 8) >> 4];
1295         dst[4] = cm[(9 * (src[4] + src[5]) - (src[3]  + src[6]) + 8) >> 4];
1296         dst[5] = cm[(9 * (src[5] + src[6]) - (src[4]  + src[7]) + 8) >> 4];
1297         dst[6] = cm[(9 * (src[6] + src[7]) - (src[5]  + src[8]) + 8) >> 4];
1298         dst[7] = cm[(9 * (src[7] + src[8]) - (src[6]  + src[9]) + 8) >> 4];
1299         dst   += dstStride;
1300         src   += srcStride;
1301     }
1302 }
1303
#if CONFIG_DIRAC_DECODER
/*
 * DIRAC_MC(OPNAME) generates the ff_<OPNAME>_dirac_pixels{8,16,32}[_l2|_l4]_c
 * wrappers, all sharing the signature (dst, src[5], stride, h):
 *   - the plain variants forward src[0] to OPNAME_pixelsN_8_c;
 *   - the _l2 variants forward src[0] and src[1] to OPNAME_pixelsN_l2_8;
 *   - the _l4 variants forward src[0] .. src[3] to OPNAME_pixelsN_l4_8.
 * The same stride is used for the destination and for every source.
 * The 32-wide versions are built from two 16-wide calls at byte offsets
 * 0 and 16. The macro is instantiated below for "put" and "avg".
 */
#define DIRAC_MC(OPNAME)\
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
     OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
    OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
    OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
}
DIRAC_MC(put)
DIRAC_MC(avg)
#endif
1348
1349 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
1350                                   int dstStride, int srcStride, int w)
1351 {
1352     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1353     int i;
1354
1355     for (i = 0; i < w; i++) {
1356         const int src_1 = src[-srcStride];
1357         const int src0  = src[0];
1358         const int src1  = src[srcStride];
1359         const int src2  = src[2 * srcStride];
1360         const int src3  = src[3 * srcStride];
1361         const int src4  = src[4 * srcStride];
1362         const int src5  = src[5 * srcStride];
1363         const int src6  = src[6 * srcStride];
1364         const int src7  = src[7 * srcStride];
1365         const int src8  = src[8 * srcStride];
1366         const int src9  = src[9 * srcStride];
1367         dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
1368         dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0  + src3) + 8) >> 4];
1369         dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1  + src4) + 8) >> 4];
1370         dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2  + src5) + 8) >> 4];
1371         dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3  + src6) + 8) >> 4];
1372         dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4  + src7) + 8) >> 4];
1373         dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5  + src8) + 8) >> 4];
1374         dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6  + src9) + 8) >> 4];
1375         src++;
1376         dst++;
1377     }
1378 }
1379
/**
 * mspel position (1,0): horizontally filter the 8x8 block into a scratch
 * buffer, then hand src and the filtered block to put_pixels8_l2_8()
 * (presumably a two-source average -- defined elsewhere in the file).
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1387
/**
 * mspel position (2,0): horizontal lowpass only, filtering 8 rows of src
 * straight into dst with the same stride on both sides.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
1392
/**
 * mspel position (3,0): horizontally filter the 8x8 block into a scratch
 * buffer, then hand the block one pixel to the right of src (src + 1)
 * and the filtered block to put_pixels8_l2_8() (presumably a two-source
 * average -- defined elsewhere in the file).
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1400
/**
 * mspel position (0,2): vertical lowpass only, filtering 8 columns of src
 * straight into dst with the same stride on both sides.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
1405
/**
 * mspel position (1,2).
 *
 * Builds two intermediate 8x8 blocks and passes both to put_pixels8_l2_8()
 * (presumably a two-source average -- defined elsewhere in the file):
 *  - v_only: vertical lowpass of src;
 *  - h_then_v: vertical lowpass of the horizontally filtered rows.
 * The horizontal pass covers 11 rows starting one row above src, because
 * the vertical filter reads one row above and two rows below its 8-row
 * window.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11];
    uint8_t v_only[8 * 8];
    uint8_t h_then_v[8 * 8];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src, 8, stride, 8);
    /* skip the extra top row so the vertical window lines up with src */
    wmv2_mspel8_v_lowpass(h_then_v, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, h_then_v, stride, 8, 8, 8);
}
1417
/**
 * mspel position (3,2).
 *
 * Same scheme as the (1,2) case, except that the vertical-only block is
 * computed from src + 1 (one pixel to the right). Both intermediate
 * blocks are passed to put_pixels8_l2_8() (presumably a two-source
 * average -- defined elsewhere in the file).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11];
    uint8_t v_only[8 * 8];
    uint8_t h_then_v[8 * 8];

    /* 11 rows starting one row above src feed the vertical filter below */
    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(v_only, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(h_then_v, h_rows + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, v_only, h_then_v, stride, 8, 8, 8);
}
1429
/**
 * mspel position (2,2): horizontal lowpass followed by vertical lowpass.
 *
 * The horizontal pass writes 11 rows (starting one row above src) into a
 * scratch buffer; the vertical pass then runs from the second scratch row
 * and writes directly into dst.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t h_rows[8 * 11];

    wmv2_mspel8_h_lowpass(h_rows, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, h_rows + 8, stride, 8, 8);
}
1437
1438 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1439                               int line_size, int h)
1440 {
1441     int s = 0, i;
1442
1443     for (i = 0; i < h; i++) {
1444         s    += abs(pix1[0]  - pix2[0]);
1445         s    += abs(pix1[1]  - pix2[1]);
1446         s    += abs(pix1[2]  - pix2[2]);
1447         s    += abs(pix1[3]  - pix2[3]);
1448         s    += abs(pix1[4]  - pix2[4]);
1449         s    += abs(pix1[5]  - pix2[5]);
1450         s    += abs(pix1[6]  - pix2[6]);
1451         s    += abs(pix1[7]  - pix2[7]);
1452         s    += abs(pix1[8]  - pix2[8]);
1453         s    += abs(pix1[9]  - pix2[9]);
1454         s    += abs(pix1[10] - pix2[10]);
1455         s    += abs(pix1[11] - pix2[11]);
1456         s    += abs(pix1[12] - pix2[12]);
1457         s    += abs(pix1[13] - pix2[13]);
1458         s    += abs(pix1[14] - pix2[14]);
1459         s    += abs(pix1[15] - pix2[15]);
1460         pix1 += line_size;
1461         pix2 += line_size;
1462     }
1463     return s;
1464 }
1465
1466 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1467                           int line_size, int h)
1468 {
1469     int s = 0, i;
1470
1471     for (i = 0; i < h; i++) {
1472         s    += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
1473         s    += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
1474         s    += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
1475         s    += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
1476         s    += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
1477         s    += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
1478         s    += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
1479         s    += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
1480         s    += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
1481         s    += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
1482         s    += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1483         s    += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1484         s    += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1485         s    += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1486         s    += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1487         s    += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1488         pix1 += line_size;
1489         pix2 += line_size;
1490     }
1491     return s;
1492 }
1493
1494 static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1495                           int line_size, int h)
1496 {
1497     int s = 0, i;
1498     uint8_t *pix3 = pix2 + line_size;
1499
1500     for (i = 0; i < h; i++) {
1501         s    += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
1502         s    += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
1503         s    += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
1504         s    += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
1505         s    += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
1506         s    += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
1507         s    += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
1508         s    += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
1509         s    += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
1510         s    += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
1511         s    += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1512         s    += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1513         s    += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1514         s    += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1515         s    += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1516         s    += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1517         pix1 += line_size;
1518         pix2 += line_size;
1519         pix3 += line_size;
1520     }
1521     return s;
1522 }
1523
1524 static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1525                            int line_size, int h)
1526 {
1527     int s = 0, i;
1528     uint8_t *pix3 = pix2 + line_size;
1529
1530     for (i = 0; i < h; i++) {
1531         s    += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
1532         s    += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
1533         s    += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
1534         s    += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
1535         s    += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
1536         s    += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
1537         s    += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
1538         s    += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
1539         s    += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
1540         s    += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
1541         s    += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1542         s    += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1543         s    += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1544         s    += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1545         s    += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1546         s    += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1547         pix1 += line_size;
1548         pix2 += line_size;
1549         pix3 += line_size;
1550     }
1551     return s;
1552 }
1553
1554 static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1555                              int line_size, int h)
1556 {
1557     int s = 0, i;
1558
1559     for (i = 0; i < h; i++) {
1560         s    += abs(pix1[0] - pix2[0]);
1561         s    += abs(pix1[1] - pix2[1]);
1562         s    += abs(pix1[2] - pix2[2]);
1563         s    += abs(pix1[3] - pix2[3]);
1564         s    += abs(pix1[4] - pix2[4]);
1565         s    += abs(pix1[5] - pix2[5]);
1566         s    += abs(pix1[6] - pix2[6]);
1567         s    += abs(pix1[7] - pix2[7]);
1568         pix1 += line_size;
1569         pix2 += line_size;
1570     }
1571     return s;
1572 }
1573
1574 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1575                          int line_size, int h)
1576 {
1577     int s = 0, i;
1578
1579     for (i = 0; i < h; i++) {
1580         s    += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1581         s    += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1582         s    += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1583         s    += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1584         s    += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1585         s    += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1586         s    += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1587         s    += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1588         pix1 += line_size;
1589         pix2 += line_size;
1590     }
1591     return s;
1592 }
1593
1594 static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1595                          int line_size, int h)
1596 {
1597     int s = 0, i;
1598     uint8_t *pix3 = pix2 + line_size;
1599
1600     for (i = 0; i < h; i++) {
1601         s    += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1602         s    += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1603         s    += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1604         s    += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1605         s    += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1606         s    += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1607         s    += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1608         s    += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1609         pix1 += line_size;
1610         pix2 += line_size;
1611         pix3 += line_size;
1612     }
1613     return s;
1614 }
1615
1616 static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1617                           int line_size, int h)
1618 {
1619     int s = 0, i;
1620     uint8_t *pix3 = pix2 + line_size;
1621
1622     for (i = 0; i < h; i++) {
1623         s    += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1624         s    += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1625         s    += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1626         s    += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1627         s    += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1628         s    += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1629         s    += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1630         s    += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1631         pix1 += line_size;
1632         pix2 += line_size;
1633         pix3 += line_size;
1634     }
1635     return s;
1636 }
1637
1638 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1639 {
1640     int score1 = 0, score2 = 0, x, y;
1641
1642     for (y = 0; y < h; y++) {
1643         for (x = 0; x < 16; x++)
1644             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1645         if (y + 1 < h) {
1646             for (x = 0; x < 15; x++)
1647                 score2 += FFABS(s1[x]     - s1[x + stride] -
1648                                 s1[x + 1] + s1[x + stride + 1]) -
1649                           FFABS(s2[x]     - s2[x + stride] -
1650                                 s2[x + 1] + s2[x + stride + 1]);
1651         }
1652         s1 += stride;
1653         s2 += stride;
1654     }
1655
1656     if (c)
1657         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1658     else
1659         return score1 + FFABS(score2) * 8;
1660 }
1661
1662 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1663 {
1664     int score1 = 0, score2 = 0, x, y;
1665
1666     for (y = 0; y < h; y++) {
1667         for (x = 0; x < 8; x++)
1668             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1669         if (y + 1 < h) {
1670             for (x = 0; x < 7; x++)
1671                 score2 += FFABS(s1[x]     - s1[x + stride] -
1672                                 s1[x + 1] + s1[x + stride + 1]) -
1673                           FFABS(s2[x]     - s2[x + stride] -
1674                                 s2[x + 1] + s2[x + stride + 1]);
1675         }
1676         s1 += stride;
1677         s2 += stride;
1678     }
1679
1680     if (c)
1681         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1682     else
1683         return score1 + FFABS(score2) * 8;
1684 }
1685
1686 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
1687                           int16_t basis[64], int scale)
1688 {
1689     int i;
1690     unsigned int sum = 0;
1691
1692     for (i = 0; i < 8 * 8; i++) {
1693         int b = rem[i] + ((basis[i] * scale +
1694                            (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1695                           (BASIS_SHIFT - RECON_SHIFT));
1696         int w = weight[i];
1697         b >>= RECON_SHIFT;
1698         av_assert2(-512 < b && b < 512);
1699
1700         sum += (w * b) * (w * b) >> 4;
1701     }
1702     return sum >> 2;
1703 }
1704
1705 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
1706 {
1707     int i;
1708
1709     for (i = 0; i < 8 * 8; i++)
1710         rem[i] += (basis[i] * scale +
1711                    (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1712                   (BASIS_SHIFT - RECON_SHIFT);
1713 }
1714
/**
 * Comparison function that ignores all of its arguments and always
 * reports a score of 0. Installed by ff_set_cmp() for FF_CMP_ZERO.
 */
static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
                    int stride, int h)
{
    return 0;
}
1720
1721 void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
1722 {
1723     int i;
1724
1725     memset(cmp, 0, sizeof(void *) * 6);
1726
1727     for (i = 0; i < 6; i++) {
1728         switch (type & 0xFF) {
1729         case FF_CMP_SAD:
1730             cmp[i] = c->sad[i];
1731             break;
1732         case FF_CMP_SATD:
1733             cmp[i] = c->hadamard8_diff[i];
1734             break;
1735         case FF_CMP_SSE:
1736             cmp[i] = c->sse[i];
1737             break;
1738         case FF_CMP_DCT:
1739             cmp[i] = c->dct_sad[i];
1740             break;
1741         case FF_CMP_DCT264:
1742             cmp[i] = c->dct264_sad[i];
1743             break;
1744         case FF_CMP_DCTMAX:
1745             cmp[i] = c->dct_max[i];
1746             break;
1747         case FF_CMP_PSNR:
1748             cmp[i] = c->quant_psnr[i];
1749             break;
1750         case FF_CMP_BIT:
1751             cmp[i] = c->bit[i];
1752             break;
1753         case FF_CMP_RD:
1754             cmp[i] = c->rd[i];
1755             break;
1756         case FF_CMP_VSAD:
1757             cmp[i] = c->vsad[i];
1758             break;
1759         case FF_CMP_VSSE:
1760             cmp[i] = c->vsse[i];
1761             break;
1762         case FF_CMP_ZERO:
1763             cmp[i] = zero_cmp;
1764             break;
1765         case FF_CMP_NSSE:
1766             cmp[i] = c->nsse[i];
1767             break;
1768 #if CONFIG_DWT
1769         case FF_CMP_W53:
1770             cmp[i]= c->w53[i];
1771             break;
1772         case FF_CMP_W97:
1773             cmp[i]= c->w97[i];
1774             break;
1775 #endif
1776         default:
1777             av_log(NULL, AV_LOG_ERROR,
1778                    "internal error in cmp function selection\n");
1779         }
1780     }
1781 }
1782
1783 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
1784 {
1785     long i;
1786
1787     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1788         long a = *(long *) (src + i);
1789         long b = *(long *) (dst + i);
1790         *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
1791     }
1792     for (; i < w; i++)
1793         dst[i + 0] += src[i + 0];
1794 }
1795
1796 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w)
1797 {
1798     long i;
1799
1800 #if !HAVE_FAST_UNALIGNED
1801     if ((long) src2 & (sizeof(long) - 1)) {
1802         for (i = 0; i + 7 < w; i += 8) {
1803             dst[i + 0] = src1[i + 0] - src2[i + 0];
1804             dst[i + 1] = src1[i + 1] - src2[i + 1];
1805             dst[i + 2] = src1[i + 2] - src2[i + 2];
1806             dst[i + 3] = src1[i + 3] - src2[i + 3];
1807             dst[i + 4] = src1[i + 4] - src2[i + 4];
1808             dst[i + 5] = src1[i + 5] - src2[i + 5];
1809             dst[i + 6] = src1[i + 6] - src2[i + 6];
1810             dst[i + 7] = src1[i + 7] - src2[i + 7];
1811         }
1812     } else
1813 #endif
1814     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1815         long a = *(long *) (src1 + i);
1816         long b = *(long *) (src2 + i);
1817         *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
1818                               ((a ^ b ^ pb_80) & pb_80);
1819     }
1820     for (; i < w; i++)
1821         dst[i + 0] = src1[i + 0] - src2[i + 0];
1822 }
1823
/**
 * Reconstruct a row from median-predicted residuals.
 *
 * For each position the predictor is mid_pred() of the previous output
 * value, the value in src1 at the same position, and
 * (previous + src1[i] - previous src1 value) & 0xFF. The residual diff[i]
 * is added and the result is stored as a byte.
 *
 * @param dst      output row, w bytes
 * @param src1     reference row, w bytes
 * @param diff     residuals, w bytes
 * @param w        number of samples
 * @param left     in: value preceding dst[0]; out: last value written
 * @param left_top in: value preceding src1[0]; out: last src1 value read
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *diff, int w,
                                         int *left, int *left_top)
{
    uint8_t cur      = *left;
    uint8_t top_left = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const int top  = src1[n];
        const int grad = (cur + top - top_left) & 0xFF;

        /* stored back into a uint8_t, so the sum wraps modulo 256 */
        cur      = mid_pred(cur, top, grad) + diff[n];
        top_left = top;
        dst[n]   = cur;
    }

    *left     = cur;
    *left_top = top_left;
}
1843
/**
 * Produce median-prediction residuals for a row.
 *
 * For each position the predictor is mid_pred() of the previous src2 value,
 * src1[i], and (previous + src1[i] - previous src1 value) & 0xFF; the
 * residual src2[i] - predictor is stored as a byte.
 *
 * @param dst      residual output, w bytes
 * @param src1     reference row, w bytes
 * @param src2     row being predicted, w bytes
 * @param w        number of samples
 * @param left     in: value preceding src2[0]; out: last src2 value read
 * @param left_top in: value preceding src1[0]; out: last src1 value read
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *src2, int w,
                                         int *left, int *left_top)
{
    uint8_t prev     = *left;
    uint8_t top_left = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const int top       = src1[n];
        const int grad      = (prev + top - top_left) & 0xFF;
        const int predicted = mid_pred(prev, top, grad);

        top_left = top;
        prev     = src2[n];
        dst[n]   = prev - predicted;
    }

    *left     = prev;
    *left_top = top_left;
}
1864
/**
 * Running-sum ("left") reconstruction: each output byte is the low 8 bits
 * of the accumulator after adding the corresponding source byte.
 *
 * @param dst output, w bytes
 * @param src residuals, w bytes
 * @param w   number of samples
 * @param acc starting accumulator value
 * @return the accumulator after the last sample (not reduced to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
                                      int w, int acc)
{
    const uint8_t *const end = src + (w > 0 ? w : 0);

    while (src < end) {
        acc   += *src++;
        *dst++ = acc;
    }

    return acc;
}
1885
/* Byte offsets of the four components inside one 32-bit pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Running-sum ("left") reconstruction of w four-byte pixels, with one
 * independent accumulator per component.
 *
 * @param dst   output, 4 * w bytes
 * @param src   residuals, 4 * w bytes
 * @param w     number of pixels
 * @param red   in: starting accumulator; out: accumulator after the last pixel
 * @param green same, for the G component
 * @param blue  same, for the B component
 * @param alpha same, for the A component
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
                                             int w, int *red, int *green,
                                             int *blue, int *alpha)
{
    int acc_r = *red, acc_g = *green, acc_b = *blue, acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++, src += 4, dst += 4) {
        /* read the whole pixel before writing it */
        acc_b += src[B];
        acc_g += src[G];
        acc_r += src[R];
        acc_a += src[A];

        dst[B] = acc_b;
        dst[G] = acc_g;
        dst[R] = acc_r;
        dst[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1924
/* Butterfly from two inputs into two separate outputs: sum and difference.
 * Expands to two statements. */
#define BUTTERFLY2(o1, o2, i1, i2)              \
    o1 = (i1) + (i2);                           \
    o2 = (i1) - (i2);

/* In-place butterfly: x becomes x + y, y becomes x - y (old values). */
#define BUTTERFLY1(x, y)                        \
    {                                           \
        const int bf_p = x;                     \
        const int bf_q = y;                     \
        x = bf_p + bf_q;                        \
        y = bf_p - bf_q;                        \
    }

/* Final butterfly stage folded into a sum of absolute values. */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
1939
1940 static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
1941                                uint8_t *src, int stride, int h)
1942 {
1943     int i, temp[64], sum = 0;
1944
1945     av_assert2(h == 8);
1946
1947     for (i = 0; i < 8; i++) {
1948         // FIXME: try pointer walks
1949         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1950                    src[stride * i + 0] - dst[stride * i + 0],
1951                    src[stride * i + 1] - dst[stride * i + 1]);
1952         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1953                    src[stride * i + 2] - dst[stride * i + 2],
1954                    src[stride * i + 3] - dst[stride * i + 3]);
1955         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1956                    src[stride * i + 4] - dst[stride * i + 4],
1957                    src[stride * i + 5] - dst[stride * i + 5]);
1958         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1959                    src[stride * i + 6] - dst[stride * i + 6],
1960                    src[stride * i + 7] - dst[stride * i + 7]);
1961
1962         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1963         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1964         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1965         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1966
1967         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1968         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1969         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1970         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
1971     }
1972
1973     for (i = 0; i < 8; i++) {
1974         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1975         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1976         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1977         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1978
1979         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1980         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1981         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1982         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1983
1984         sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
1985                BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
1986                BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
1987                BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
1988     }
1989     return sum;
1990 }
1991
1992 static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
1993                                 uint8_t *dummy, int stride, int h)
1994 {
1995     int i, temp[64], sum = 0;
1996
1997     av_assert2(h == 8);
1998
1999     for (i = 0; i < 8; i++) {
2000         // FIXME: try pointer walks
2001         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
2002                    src[stride * i + 0], src[stride * i + 1]);
2003         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
2004                    src[stride * i + 2], src[stride * i + 3]);
2005         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
2006                    src[stride * i + 4], src[stride * i + 5]);
2007         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
2008                    src[stride * i + 6], src[stride * i + 7]);
2009
2010         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
2011         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
2012         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
2013         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
2014
2015         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
2016         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
2017         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
2018         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
2019     }
2020
2021     for (i = 0; i < 8; i++) {
2022         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
2023         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
2024         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
2025         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
2026
2027         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
2028         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
2029         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
2030         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
2031
2032         sum +=
2033             BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
2034             + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
2035             + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
2036             + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
2037     }
2038
2039     sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
2040
2041     return sum;
2042 }
2043
2044 static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
2045                         uint8_t *src2, int stride, int h)
2046 {
2047     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2048
2049     av_assert2(h == 8);
2050
2051     s->dsp.diff_pixels(temp, src1, src2, stride);
2052     s->dsp.fdct(temp);
2053     return s->dsp.sum_abs_dctelem(temp);
2054 }
2055
#if CONFIG_GPL
/*
 * One 8-point integer transform. The caller defines SRC(n) to read input n
 * and DST(n, v) to consume output n. All inputs are read into locals before
 * the first DST, so SRC and DST may refer to the same storage.
 */
#define DCT8_1D                                         \
    {                                                   \
        const int s07 = SRC(0) + SRC(7);                \
        const int s16 = SRC(1) + SRC(6);                \
        const int s25 = SRC(2) + SRC(5);                \
        const int s34 = SRC(3) + SRC(4);                \
        const int a0  = s07 + s34;                      \
        const int a1  = s16 + s25;                      \
        const int a2  = s07 - s34;                      \
        const int a3  = s16 - s25;                      \
        const int d07 = SRC(0) - SRC(7);                \
        const int d16 = SRC(1) - SRC(6);                \
        const int d25 = SRC(2) - SRC(5);                \
        const int d34 = SRC(3) - SRC(4);                \
        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
        DST(0, a0 + a1);                                \
        DST(1, a4 + (a7 >> 2));                         \
        DST(2, a2 + (a3 >> 1));                         \
        DST(3, a5 + (a6 >> 2));                         \
        DST(4, a0 - a1);                                \
        DST(5, a6 - (a5 >> 2));                         \
        DST(6, (a2 >> 1) - a3);                         \
        DST(7, (a4 >> 2) - a7);                         \
    }

/**
 * Apply DCT8_1D to the rows and then the columns of the pixel difference of
 * two 8x8 blocks, returning the sum of absolute values of the final outputs.
 */
static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    int16_t dct[8][8];
    int line;
    int total = 0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* row pass, written back in place */
#define SRC(x) dct[line][x]
#define DST(x, v) dct[line][x] = v
    for (line = 0; line < 8; line++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* column pass, outputs go straight into the sum */
#define SRC(x) dct[x][line]
#define DST(x, v) total += FFABS(v)
    for (line = 0; line < 8; line++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return total;
}
#endif
2109
2110 static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
2111                         uint8_t *src2, int stride, int h)
2112 {
2113     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2114     int sum = 0, i;
2115
2116     av_assert2(h == 8);
2117
2118     s->dsp.diff_pixels(temp, src1, src2, stride);
2119     s->dsp.fdct(temp);
2120
2121     for (i = 0; i < 64; i++)
2122         sum = FFMAX(sum, FFABS(temp[i]));
2123
2124     return sum;
2125 }
2126
2127 static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
2128                            uint8_t *src2, int stride, int h)
2129 {
2130     LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
2131     int16_t *const bak = temp + 64;
2132     int sum = 0, i;
2133
2134     av_assert2(h == 8);
2135     s->mb_intra = 0;
2136
2137     s->dsp.diff_pixels(temp, src1, src2, stride);
2138
2139     memcpy(bak, temp, 64 * sizeof(int16_t));
2140
2141     s->block_last_index[0 /* FIXME */] =
2142         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2143     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2144     ff_simple_idct_8(temp); // FIXME
2145
2146     for (i = 0; i < 64; i++)
2147         sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
2148
2149     return sum;
2150 }
2151
2152 static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2153                    int stride, int h)
2154 {
2155     const uint8_t *scantable = s->intra_scantable.permutated;
2156     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2157     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2158     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2159     int i, last, run, bits, level, distortion, start_i;
2160     const int esc_length = s->ac_esc_length;
2161     uint8_t *length, *last_length;
2162
2163     av_assert2(h == 8);
2164
2165     copy_block8(lsrc1, src1, 8, stride, 8);
2166     copy_block8(lsrc2, src2, 8, stride, 8);
2167
2168     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2169
2170     s->block_last_index[0 /* FIXME */] =
2171     last                               =
2172         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2173
2174     bits = 0;
2175
2176     if (s->mb_intra) {
2177         start_i     = 1;
2178         length      = s->intra_ac_vlc_length;
2179         last_length = s->intra_ac_vlc_last_length;
2180         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2181     } else {
2182         start_i     = 0;
2183         length      = s->inter_ac_vlc_length;
2184         last_length = s->inter_ac_vlc_last_length;
2185     }
2186
2187     if (last >= start_i) {
2188         run = 0;
2189         for (i = start_i; i < last; i++) {
2190             int j = scantable[i];
2191             level = temp[j];
2192
2193             if (level) {
2194                 level += 64;
2195                 if ((level & (~127)) == 0)
2196                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2197                 else
2198                     bits += esc_length;
2199                 run = 0;
2200             } else
2201                 run++;
2202         }
2203         i = scantable[last];
2204
2205         level = temp[i] + 64;
2206
2207         av_assert2(level - 64);
2208
2209         if ((level & (~127)) == 0) {
2210             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2211         } else
2212             bits += esc_length;
2213     }
2214
2215     if (last >= 0) {
2216         if (s->mb_intra)
2217             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2218         else
2219             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2220     }
2221
2222     s->dsp.idct_add(lsrc2, 8, temp);
2223
2224     distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2225
2226     return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
2227 }
2228
2229 static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2230                     int stride, int h)
2231 {
2232     const uint8_t *scantable = s->intra_scantable.permutated;
2233     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2234     int i, last, run, bits, level, start_i;
2235     const int esc_length = s->ac_esc_length;
2236     uint8_t *length, *last_length;
2237
2238     av_assert2(h == 8);
2239
2240     s->dsp.diff_pixels(temp, src1, src2, stride);
2241
2242     s->block_last_index[0 /* FIXME */] =
2243     last                               =
2244         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2245
2246     bits = 0;
2247
2248     if (s->mb_intra) {
2249         start_i     = 1;
2250         length      = s->intra_ac_vlc_length;
2251         last_length = s->intra_ac_vlc_last_length;
2252         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2253     } else {
2254         start_i     = 0;
2255         length      = s->inter_ac_vlc_length;
2256         last_length = s->inter_ac_vlc_last_length;
2257     }
2258
2259     if (last >= start_i) {
2260         run = 0;
2261         for (i = start_i; i < last; i++) {
2262             int j = scantable[i];
2263             level = temp[j];
2264
2265             if (level) {
2266                 level += 64;
2267                 if ((level & (~127)) == 0)
2268                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2269                 else
2270                     bits += esc_length;
2271                 run = 0;
2272             } else
2273                 run++;
2274         }
2275         i = scantable[last];
2276
2277         level = temp[i] + 64;
2278
2279         av_assert2(level - 64);
2280
2281         if ((level & (~127)) == 0)
2282             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2283         else
2284             bits += esc_length;
2285     }
2286
2287     return bits;
2288 }
2289
2290 #define VSAD_INTRA(size)                                                \
2291 static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
2292                                     uint8_t *s, uint8_t *dummy,         \
2293                                     int stride, int h)                  \
2294 {                                                                       \
2295     int score = 0, x, y;                                                \
2296                                                                         \
2297     for (y = 1; y < h; y++) {                                           \
2298         for (x = 0; x < size; x += 4) {                                 \
2299             score += FFABS(s[x]     - s[x + stride])     +              \
2300                      FFABS(s[x + 1] - s[x + stride + 1]) +              \
2301                      FFABS(s[x + 2] - s[x + 2 + stride]) +              \
2302                      FFABS(s[x + 3] - s[x + 3 + stride]);               \
2303         }                                                               \
2304         s += stride;                                                    \
2305     }                                                                   \
2306                                                                         \
2307     return score;                                                       \
2308 }
2309 VSAD_INTRA(8)
2310 VSAD_INTRA(16)
2311
2312 static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2313                     int stride, int h)
2314 {
2315     int score = 0, x, y;
2316
2317     for (y = 1; y < h; y++) {
2318         for (x = 0; x < 16; x++)
2319             score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2320         s1 += stride;
2321         s2 += stride;
2322     }
2323
2324     return score;
2325 }
2326
2327 #define SQ(a) ((a) * (a))
2328 #define VSSE_INTRA(size)                                                \
2329 static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
2330                                     uint8_t *s, uint8_t *dummy,         \
2331                                     int stride, int h)                  \
2332 {                                                                       \
2333     int score = 0, x, y;                                                \
2334                                                                         \
2335     for (y = 1; y < h; y++) {                                           \
2336         for (x = 0; x < size; x += 4) {                                 \
2337             score += SQ(s[x]     - s[x + stride]) +                     \
2338                      SQ(s[x + 1] - s[x + stride + 1]) +                 \
2339                      SQ(s[x + 2] - s[x + stride + 2]) +                 \
2340                      SQ(s[x + 3] - s[x + stride + 3]);                  \
2341         }                                                               \
2342         s += stride;                                                    \
2343     }                                                                   \
2344                                                                         \
2345     return score;                                                       \
2346 }
2347 VSSE_INTRA(8)
2348 VSSE_INTRA(16)
2349
2350 static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2351                     int stride, int h)
2352 {
2353     int score = 0, x, y;
2354
2355     for (y = 1; y < h; y++) {
2356         for (x = 0; x < 16; x++)
2357             score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2358         s1 += stride;
2359         s2 += stride;
2360     }
2361
2362     return score;
2363 }
2364
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array, size entries
 * @param pix2 second array, size entries
 * @param size number of entries to compare
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        total += d * d;
    }

    return total;
}
2374
2375 #define WRAPPER8_16_SQ(name8, name16)                                   \
2376 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
2377                   int stride, int h)                                    \
2378 {                                                                       \
2379     int score = 0;                                                      \
2380                                                                         \
2381     score += name8(s, dst, src, stride, 8);                             \
2382     score += name8(s, dst + 8, src + 8, stride, 8);                     \
2383     if (h == 16) {                                                      \
2384         dst   += 8 * stride;                                            \
2385         src   += 8 * stride;                                            \
2386         score += name8(s, dst, src, stride, 8);                         \
2387         score += name8(s, dst + 8, src + 8, stride, 8);                 \
2388     }                                                                   \
2389     return score;                                                       \
2390 }
2391
2392 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2393 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2394 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2395 #if CONFIG_GPL
2396 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2397 #endif
2398 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2399 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2400 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2401 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2402
/**
 * Clip one value using unsigned integer comparisons.
 *
 * @param a        value to clip
 * @param mini     returned when a > mini (unsigned compare)
 * @param maxi     returned when (a ^ 0x80000000) > maxisign
 * @param maxisign threshold for the second comparison
 * @return mini, maxi or a unchanged, tested in that order
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U << 31);

    if (a > mini)
        return mini;

    return flipped > maxisign ? maxi : a;
}
2413
/**
 * Clip len floats through clipf_c_one(), operating on their 32-bit
 * representation. vector_clipf_c() uses this when min < 0 and max > 0.
 *
 * Elements are processed in groups of eight; a whole group is always
 * written, so len is effectively rounded up to a multiple of 8.
 *
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src,
                                         float *min, float *max, int len)
{
    const uint32_t lo_bits  = *(uint32_t *) min;
    const uint32_t hi_bits  = *(uint32_t *) max;
    const uint32_t hi_flip  = hi_bits ^ (1U << 31);
    uint32_t *out           = (uint32_t *) dst;
    const uint32_t *in      = (const uint32_t *) src;
    int base, k;

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits, hi_flip);
}
2435
/**
 * Clip len floats from src into [min, max] and store them in dst.
 *
 * When the bounds have opposite signs the integer-compare variant is used;
 * otherwise av_clipf() is applied per element. Either way elements are
 * processed in groups of eight, and a whole group is always written.
 */
static void vector_clipf_c(float *dst, const float *src,
                           float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8)
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
}
2456
/**
 * Dot product of two int16_t vectors.
 *
 * @param v1    first vector, order entries
 * @param v2    second vector, order entries
 * @param order number of entries
 * @return sum of v1[i] * v2[i], accumulated in an int
 */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
{
    int acc = 0;

    for (; order != 0; order--) {
        acc += v1[0] * v2[0];
        v1++;
        v2++;
    }

    return acc;
}
2467
/**
 * Dot product of v1 and v2, combined with the update v1[i] += mul * v3[i].
 *
 * Each product uses the value of v1[i] from before its update.
 *
 * @param v1    first vector, order entries; modified in place
 * @param v2    second vector, order entries
 * @param v3    vector that is scaled by mul and added to v1
 * @param order number of entries
 * @param mul   scale factor for v3
 * @return sum of (old v1[i]) * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    int acc = 0;

    for (; order != 0; order--) {
        acc   += v1[0] * v2[0];
        v1[0] += mul * v3[0];
        v1++;
        v2++;
        v3++;
    }

    return acc;
}
2480
/**
 * Clip int32_t values from src into [min, max] with av_clip() and store
 * them in dst.
 *
 * Works in groups of eight and always processes at least one group; len is
 * decremented by 8 per group and the loop stops when it reaches 0, so len
 * is expected to be a non-zero multiple of 8.
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);

        dst += 8;
        src += 8;
        len -= 8;
    } while (len > 0);
}
2496
/* Run ff_j_rev_dct() on block in place, then hand the result to
 * put_pixels_clamped_c() to write it to dest (row stride line_size). */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
2502
/* Run ff_j_rev_dct() on block in place, then hand the result to
 * add_pixels_clamped_c() to combine it with dest (row stride line_size). */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2508
/* Run ff_j_rev_dct4() on block in place, then hand the result to
 * put_pixels_clamped4_c() to write it to dest (row stride line_size). */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
/* Run ff_j_rev_dct4() on block in place, then hand the result to
 * add_pixels_clamped4_c() to combine it with dest (row stride line_size). */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
2519
/* Run ff_j_rev_dct2() on block in place, then hand the result to
 * put_pixels_clamped2_c() to write it to dest (row stride line_size). */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
/* Run ff_j_rev_dct2() on block in place, then hand the result to
 * add_pixels_clamped2_c() to combine it with dest (row stride line_size). */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
2530
/* Single-sample variant: dest[0] is set to (block[0] + 4) >> 3, clipped to
 * the uint8_t range. line_size is unused. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
{
    const int dc = (block[0] + 4) >> 3;

    dest[0] = av_clip_uint8(dc);
}
/* Single-sample variant: (block[0] + 4) >> 3 is added to dest[0] and the sum
 * is clipped to the uint8_t range. line_size is unused. */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
{
    const int dc = (block[0] + 4) >> 3;

    dest[0] = av_clip_uint8(dest[0] + dc);
}
2539
2540 /* draw the edges of width 'w' of an image of size width, height */
2541 // FIXME: Check that this is OK for MPEG-4 interlaced.
2542 static void draw_edges_8_c(uint8_t *buf, int wrap, int width, int height,
2543                            int w, int h, int sides)
2544 {
2545     uint8_t *ptr = buf, *last_line;
2546     int i;
2547
2548     /* left and right */
2549     for (i = 0; i < height; i++) {
2550         memset(ptr - w, ptr[0], w);
2551         memset(ptr + width, ptr[width - 1], w);
2552         ptr += wrap;
2553     }
2554
2555     /* top and bottom + corners */
2556     buf -= w;
2557     last_line = buf + (height - 1) * wrap;
2558     if (sides & EDGE_TOP)
2559         for (i = 0; i < h; i++)
2560             // top
2561             memcpy(buf - (i + 1) * wrap, buf, width + w + w);
2562     if (sides & EDGE_BOTTOM)
2563         for (i = 0; i < h; i++)
2564             // bottom
2565             memcpy(last_line + (i + 1) * wrap, last_line, width + w + w);
2566 }
2567
/* Zero one block of 64 int16_t coefficients. */
static void clear_block_8_c(int16_t *block)
{
    memset(block, 0, 64 * sizeof(*block));
}
2572
/* Zero six consecutive blocks of 64 int16_t coefficients each. */
static void clear_blocks_8_c(int16_t *blocks)
{
    memset(blocks, 0, 6 * 64 * sizeof(*blocks));
}
2577
2578 /* init static data */
2579 av_cold void ff_dsputil_static_init(void)
2580 {
2581     int i;
2582
2583     for (i = 0; i < 512; i++)
2584         ff_square_tab[i] = (i - 256) * (i - 256);
2585 }
2586
2587 int ff_check_alignment(void)
2588 {
2589     static int did_fail = 0;
2590     LOCAL_ALIGNED_16(int, aligned, [4]);
2591
2592     if ((intptr_t)aligned & 15) {
2593         if (!did_fail) {
2594 #if HAVE_MMX || HAVE_ALTIVEC
2595             av_log(NULL, AV_LOG_ERROR,
2596                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2597                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2598                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2599                 "Do not report crashes to FFmpeg developers.\n");
2600 #endif
2601             did_fail=1;
2602         }
2603         return -1;
2604     }
2605     return 0;
2606 }
2607
2608 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2609 {
2610     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
2611
2612     ff_check_alignment();
2613
2614 #if CONFIG_ENCODERS
2615     if (avctx->bits_per_raw_sample == 10) {
2616         c->fdct    = ff_jpeg_fdct_islow_10;
2617         c->fdct248 = ff_fdct248_islow_10;
2618     } else {
2619         if (avctx->dct_algo == FF_DCT_FASTINT) {
2620             c->fdct    = ff_fdct_ifast;
2621             c->fdct248 = ff_fdct_ifast248;
2622         } else if (avctx->dct_algo == FF_DCT_FAAN) {
2623             c->fdct    = ff_faandct;
2624             c->fdct248 = ff_faandct248;
2625         } else {
2626             c->fdct    = ff_jpeg_fdct_islow_8; // slow/accurate/default
2627             c->fdct248 = ff_fdct248_islow_8;
2628         }
2629     }
2630 #endif /* CONFIG_ENCODERS */
2631
2632     if (avctx->lowres==1) {
2633         c->idct_put              = ff_jref_idct4_put;
2634         c->idct_add              = ff_jref_idct4_add;
2635         c->idct                  = ff_j_rev_dct4;
2636         c->idct_permutation_type = FF_NO_IDCT_PERM;
2637     } else if (avctx->lowres==2) {
2638         c->idct_put              =  ff_jref_idct2_put;
2639         c->idct_add              =  ff_jref_idct2_add;
2640         c->idct                  =  ff_j_rev_dct2;
2641         c->idct_permutation_type = FF_NO_IDCT_PERM;
2642     } else if (avctx->lowres==3) {
2643         c->idct_put              =  ff_jref_idct1_put;
2644         c->idct_add              =  ff_jref_idct1_add;
2645         c->idct                  =  ff_j_rev_dct1;
2646         c->idct_permutation_type = FF_NO_IDCT_PERM;
2647     } else {
2648         if (avctx->bits_per_raw_sample == 10) {
2649             c->idct_put              = ff_simple_idct_put_10;
2650             c->idct_add              = ff_simple_idct_add_10;
2651             c->idct                  = ff_simple_idct_10;
2652             c->idct_permutation_type = FF_NO_IDCT_PERM;
2653         } else if (avctx->bits_per_raw_sample == 12) {
2654             c->idct_put              = ff_simple_idct_put_12;
2655             c->idct_add              = ff_simple_idct_add_12;
2656             c->idct                  = ff_simple_idct_12;
2657             c->idct_permutation_type = FF_NO_IDCT_PERM;
2658         } else {
2659         if (avctx->idct_algo == FF_IDCT_INT) {
2660             c->idct_put              = jref_idct_put;
2661             c->idct_add              = jref_idct_add;
2662             c->idct                  = ff_j_rev_dct;
2663             c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2664         } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2665             c->idct_put              = ff_faanidct_put;
2666             c->idct_add              = ff_faanidct_add;
2667             c->idct                  = ff_faanidct;
2668             c->idct_permutation_type = FF_NO_IDCT_PERM;
2669         } else { // accurate/default
2670             c->idct_put              = ff_simple_idct_put_8;
2671             c->idct_add              = ff_simple_idct_add_8;
2672             c->idct                  = ff_simple_idct_8;
2673             c->idct_permutation_type = FF_NO_IDCT_PERM;
2674         }
2675         }
2676     }
2677
2678     c->diff_pixels = diff_pixels_c;
2679
2680     c->put_pixels_clamped        = put_pixels_clamped_c;
2681     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2682     c->add_pixels_clamped        = add_pixels_clamped_c;
2683
2684     c->sum_abs_dctelem = sum_abs_dctelem_c;
2685
2686     c->gmc1 = gmc1_c;
2687     c->gmc  = ff_gmc_c;
2688
2689     c->pix_sum   = pix_sum_c;
2690     c->pix_norm1 = pix_norm1_c;
2691
2692     c->fill_block_tab[0] = fill_block16_c;
2693     c->fill_block_tab[1] = fill_block8_c;
2694
2695     /* TODO [0] 16  [1] 8 */
2696     c->pix_abs[0][0] = pix_abs16_c;
2697     c->pix_abs[0][1] = pix_abs16_x2_c;
2698     c->pix_abs[0][2] = pix_abs16_y2_c;
2699     c->pix_abs[0][3] = pix_abs16_xy2_c;
2700     c->pix_abs[1][0] = pix_abs8_c;
2701     c->pix_abs[1][1] = pix_abs8_x2_c;
2702     c->pix_abs[1][2] = pix_abs8_y2_c;
2703     c->pix_abs[1][3] = pix_abs8_xy2_c;
2704
2705 #define dspfunc(PFX, IDX, NUM)                              \
2706     c->PFX ## _pixels_tab[IDX][0]  = PFX ## NUM ## _mc00_c; \
2707     c->PFX ## _pixels_tab[IDX][1]  = PFX ## NUM ## _mc10_c; \
2708     c->PFX ## _pixels_tab[IDX][2]  = PFX ## NUM ## _mc20_c; \
2709     c->PFX ## _pixels_tab[IDX][3]  = PFX ## NUM ## _mc30_c; \
2710     c->PFX ## _pixels_tab[IDX][4]  = PFX ## NUM ## _mc01_c; \
2711     c->PFX ## _pixels_tab[IDX][5]  = PFX ## NUM ## _mc11_c; \
2712     c->PFX ## _pixels_tab[IDX][6]  = PFX ## NUM ## _mc21_c; \
2713     c->PFX ## _pixels_tab[IDX][7]  = PFX ## NUM ## _mc31_c; \
2714     c->PFX ## _pixels_tab[IDX][8]  = PFX ## NUM ## _mc02_c; \
2715     c->PFX ## _pixels_tab[IDX][9]  = PFX ## NUM ## _mc12_c; \
2716     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2717     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2718     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2719     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2720     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2721     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2722
2723     dspfunc(put_qpel, 0, 16);
2724     dspfunc(put_qpel, 1, 8);
2725
2726     dspfunc(put_no_rnd_qpel, 0, 16);
2727     dspfunc(put_no_rnd_qpel, 1, 8);
2728
2729     dspfunc(avg_qpel, 0, 16);
2730     dspfunc(avg_qpel, 1, 8);
2731
2732 #undef dspfunc
2733
2734     c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
2735     c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
2736     c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
2737     c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
2738     c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
2739     c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
2740     c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
2741     c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
2742
2743 #define SET_CMP_FUNC(name)                      \
2744     c->name[0] = name ## 16_c;                  \
2745     c->name[1] = name ## 8x8_c;
2746
2747     SET_CMP_FUNC(hadamard8_diff)
2748     c->hadamard8_diff[4] = hadamard8_intra16_c;
2749     c->hadamard8_diff[5] = hadamard8_intra8x8_c;
2750     SET_CMP_FUNC(dct_sad)
2751     SET_CMP_FUNC(dct_max)
2752 #if CONFIG_GPL
2753     SET_CMP_FUNC(dct264_sad)
2754 #endif
2755     c->sad[0] = pix_abs16_c;
2756     c->sad[1] = pix_abs8_c;
2757     c->sse[0] = sse16_c;
2758     c->sse[1] = sse8_c;
2759     c->sse[2] = sse4_c;
2760     SET_CMP_FUNC(quant_psnr)
2761     SET_CMP_FUNC(rd)
2762     SET_CMP_FUNC(bit)
2763     c->vsad[0] = vsad16_c;
2764     c->vsad[4] = vsad_intra16_c;
2765     c->vsad[5] = vsad_intra8_c;
2766     c->vsse[0] = vsse16_c;
2767     c->vsse[4] = vsse_intra16_c;
2768     c->vsse[5] = vsse_intra8_c;
2769     c->nsse[0] = nsse16_c;
2770     c->nsse[1] = nsse8_c;
2771 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2772     ff_dsputil_init_dwt(c);
2773 #endif
2774
2775     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2776
2777     c->add_bytes                      = add_bytes_c;
2778     c->add_hfyu_median_prediction     = add_hfyu_median_prediction_c;
2779     c->add_hfyu_left_prediction       = add_hfyu_left_prediction_c;
2780     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2781
2782     c->diff_bytes                 = diff_bytes_c;
2783     c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
2784
2785     c->bswap_buf   = bswap_buf;
2786     c->bswap16_buf = bswap16_buf;
2787
2788     c->try_8x8basis = try_8x8basis_c;
2789     c->add_8x8basis = add_8x8basis_c;
2790
2791     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2792
2793     c->scalarproduct_int16 = scalarproduct_int16_c;
2794     c->vector_clip_int32   = vector_clip_int32_c;
2795     c->vector_clipf        = vector_clipf_c;
2796
2797     c->shrink[0] = av_image_copy_plane;
2798     c->shrink[1] = ff_shrink22;
2799     c->shrink[2] = ff_shrink44;
2800     c->shrink[3] = ff_shrink88;
2801
2802     c->add_pixels8 = add_pixels8_c;
2803
2804     c->draw_edges = draw_edges_8_c;
2805
2806     c->clear_block  = clear_block_8_c;
2807     c->clear_blocks = clear_blocks_8_c;
2808
2809     switch (avctx->bits_per_raw_sample) {
2810     case 9:
2811     case 10:
2812     case 12:
2813     case 14:
2814         c->get_pixels = get_pixels_16_c;
2815         break;
2816     default:
2817         if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
2818             c->get_pixels = get_pixels_8_c;
2819         }
2820         break;
2821     }
2822
2823
2824     if (ARCH_ALPHA)
2825         ff_dsputil_init_alpha(c, avctx);
2826     if (ARCH_ARM)
2827         ff_dsputil_init_arm(c, avctx, high_bit_depth);
2828     if (ARCH_BFIN)
2829         ff_dsputil_init_bfin(c, avctx, high_bit_depth);
2830     if (ARCH_PPC)
2831         ff_dsputil_init_ppc(c, avctx, high_bit_depth);
2832     if (ARCH_X86)
2833         ff_dsputil_init_x86(c, avctx, high_bit_depth);
2834
2835     ff_init_scantable_permutation(c->idct_permutation,
2836                                   c->idct_permutation_type);
2837 }
2838
2839 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2840 {
2841     ff_dsputil_init(c, avctx);
2842 }
2843
2844 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2845 {
2846     ff_dsputil_init(c, avctx);
2847 }