]> git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c
Merge commit 'b97f2c7c49e35724f3963677f8aeee28089dc64e'
[ffmpeg] / libavcodec / dsputil.c
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24
25 /**
26  * @file
27  * DSP utils
28  */
29
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "libavutil/internal.h"
33 #include "avcodec.h"
34 #include "copy_block.h"
35 #include "dct.h"
36 #include "dsputil.h"
37 #include "simple_idct.h"
38 #include "faandct.h"
39 #include "faanidct.h"
40 #include "imgconvert.h"
41 #include "mathops.h"
42 #include "mpegvideo.h"
43 #include "config.h"
44 #include "diracdsp.h"
45
/* 512-entry lookup table, all zero at load time.  The routines in this file
 * index it through a pointer offset by 256 (ff_square_tab + 256), so that
 * signed indices in [-256, 255] are valid.
 * NOTE(review): presumably filled with squares by an init routine that is
 * not part of this excerpt -- confirm. */
uint32_t ff_square_tab[512] = { 0 };
47
48 #define BIT_DEPTH 16
49 #include "dsputilenc_template.c"
50 #undef BIT_DEPTH
51
52 #define BIT_DEPTH 8
53 #include "hpel_template.c"
54 #include "tpel_template.c"
55 #include "dsputil_template.c"
56 #include "dsputilenc_template.c"
57
/* Byte-replicated constants as wide as unsigned long: ~0UL / 255 yields
 * 0x0101...01, which is then scaled so that every byte holds 0x7f
 * (respectively 0x80), e.g. 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f. */
#define pb_7f ((~0UL / 255) * 0x7f)
#define pb_80 ((~0UL / 255) * 0x80)
61
/* Zigzag scan order dedicated to the 248 IDCT.  Unlike the specification,
 * the two fields are interleaved here.  The 64 entries are listed in scan
 * order, sixteen per line; every index 0..63 appears exactly once. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10, 17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27, 34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29, 36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46, 53, 61, 54, 62, 39, 47, 55, 63,
};
74
/* Alternate horizontal scan order: 64 block indices listed in scan order,
 * sixteen per line; every index 0..63 appears exactly once. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17, 10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33, 26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49, 42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59, 52, 53, 54, 55, 60, 61, 62, 63,
};
85
/* Alternate vertical scan order: 64 block indices listed in scan order,
 * sixteen per line; every index 0..63 appears exactly once. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10, 17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12, 19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14, 21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31, 38, 46, 54, 62, 39, 47, 55, 63,
};
96
/* Coefficient input permutation expected by simple_idct_mmx; copied verbatim
 * into the IDCT permutation for FF_SIMPLE_IDCT_PERM.  Sixteen entries per
 * line; every index 0x00..0x3F appears exactly once. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
108
/* Column reordering within one row for FF_SSE2_IDCT_PERM; indexed by the
 * low three bits of the coefficient index. */
static const uint8_t idct_sse2_row_perm[8] = {
    0, 4, 1, 5, 2, 6, 3, 7,
};
110
111 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
112                                const uint8_t *src_scantable)
113 {
114     int i, end;
115
116     st->scantable = src_scantable;
117
118     for (i = 0; i < 64; i++) {
119         int j = src_scantable[i];
120         st->permutated[i] = permutation[j];
121     }
122
123     end = -1;
124     for (i = 0; i < 64; i++) {
125         int j = st->permutated[i];
126         if (j > end)
127             end = j;
128         st->raster_end[i] = end;
129     }
130 }
131
132 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
133                                            int idct_permutation_type)
134 {
135     int i;
136
137     switch (idct_permutation_type) {
138     case FF_NO_IDCT_PERM:
139         for (i = 0; i < 64; i++)
140             idct_permutation[i] = i;
141         break;
142     case FF_LIBMPEG2_IDCT_PERM:
143         for (i = 0; i < 64; i++)
144             idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
145         break;
146     case FF_SIMPLE_IDCT_PERM:
147         for (i = 0; i < 64; i++)
148             idct_permutation[i] = simple_mmx_permutation[i];
149         break;
150     case FF_TRANSPOSE_IDCT_PERM:
151         for (i = 0; i < 64; i++)
152             idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
153         break;
154     case FF_PARTTRANS_IDCT_PERM:
155         for (i = 0; i < 64; i++)
156             idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
157         break;
158     case FF_SSE2_IDCT_PERM:
159         for (i = 0; i < 64; i++)
160             idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
161         break;
162     default:
163         av_log(NULL, AV_LOG_ERROR,
164                "Internal error, IDCT permutation not set\n");
165     }
166 }
167
/**
 * Sum all samples of a 16x16 block.
 *
 * @param pix       top-left sample of the block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @return sum of the 256 sample values
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
188
189 static int pix_norm1_c(uint8_t *pix, int line_size)
190 {
191     int s = 0, i, j;
192     uint32_t *sq = ff_square_tab + 256;
193
194     for (i = 0; i < 16; i++) {
195         for (j = 0; j < 16; j += 8) {
196 #if 0
197             s += sq[pix[0]];
198             s += sq[pix[1]];
199             s += sq[pix[2]];
200             s += sq[pix[3]];
201             s += sq[pix[4]];
202             s += sq[pix[5]];
203             s += sq[pix[6]];
204             s += sq[pix[7]];
205 #else
206 #if HAVE_FAST_64BIT
207             register uint64_t x = *(uint64_t *) pix;
208             s += sq[x         & 0xff];
209             s += sq[(x >>  8) & 0xff];
210             s += sq[(x >> 16) & 0xff];
211             s += sq[(x >> 24) & 0xff];
212             s += sq[(x >> 32) & 0xff];
213             s += sq[(x >> 40) & 0xff];
214             s += sq[(x >> 48) & 0xff];
215             s += sq[(x >> 56) & 0xff];
216 #else
217             register uint32_t x = *(uint32_t *) pix;
218             s += sq[x         & 0xff];
219             s += sq[(x >>  8) & 0xff];
220             s += sq[(x >> 16) & 0xff];
221             s += sq[(x >> 24) & 0xff];
222             x  = *(uint32_t *) (pix + 4);
223             s += sq[x         & 0xff];
224             s += sq[(x >>  8) & 0xff];
225             s += sq[(x >> 16) & 0xff];
226             s += sq[(x >> 24) & 0xff];
227 #endif
228 #endif
229             pix += 8;
230         }
231         pix += line_size - 16;
232     }
233     return s;
234 }
235
/**
 * Byte-swap w 32-bit words from src into dst, in ascending index order.
 *
 * @param dst destination array with room for w words
 * @param src source array of w words
 * @param w   number of words; nothing is done when w <= 0
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int n;

    for (n = 0; n < w; n++)
        dst[n] = av_bswap32(src[n]);
}
253
/**
 * Byte-swap len 16-bit words from src into dst, in ascending index order.
 *
 * The loop is bounded by an index comparison, so a zero or negative len does
 * nothing.  The former `while (len--)` form never stopped at zero for a
 * negative len and overran both buffers; this also matches how bswap_buf()
 * treats its count.
 *
 * @param dst destination array with room for len words
 * @param src source array of len words
 * @param len number of words to convert
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;

    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
259
260 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
261                   int line_size, int h)
262 {
263     int s = 0, i;
264     uint32_t *sq = ff_square_tab + 256;
265
266     for (i = 0; i < h; i++) {
267         s    += sq[pix1[0] - pix2[0]];
268         s    += sq[pix1[1] - pix2[1]];
269         s    += sq[pix1[2] - pix2[2]];
270         s    += sq[pix1[3] - pix2[3]];
271         pix1 += line_size;
272         pix2 += line_size;
273     }
274     return s;
275 }
276
277 static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
278                   int line_size, int h)
279 {
280     int s = 0, i;
281     uint32_t *sq = ff_square_tab + 256;
282
283     for (i = 0; i < h; i++) {
284         s    += sq[pix1[0] - pix2[0]];
285         s    += sq[pix1[1] - pix2[1]];
286         s    += sq[pix1[2] - pix2[2]];
287         s    += sq[pix1[3] - pix2[3]];
288         s    += sq[pix1[4] - pix2[4]];
289         s    += sq[pix1[5] - pix2[5]];
290         s    += sq[pix1[6] - pix2[6]];
291         s    += sq[pix1[7] - pix2[7]];
292         pix1 += line_size;
293         pix2 += line_size;
294     }
295     return s;
296 }
297
298 static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
299                    int line_size, int h)
300 {
301     int s = 0, i;
302     uint32_t *sq = ff_square_tab + 256;
303
304     for (i = 0; i < h; i++) {
305         s += sq[pix1[0]  - pix2[0]];
306         s += sq[pix1[1]  - pix2[1]];
307         s += sq[pix1[2]  - pix2[2]];
308         s += sq[pix1[3]  - pix2[3]];
309         s += sq[pix1[4]  - pix2[4]];
310         s += sq[pix1[5]  - pix2[5]];
311         s += sq[pix1[6]  - pix2[6]];
312         s += sq[pix1[7]  - pix2[7]];
313         s += sq[pix1[8]  - pix2[8]];
314         s += sq[pix1[9]  - pix2[9]];
315         s += sq[pix1[10] - pix2[10]];
316         s += sq[pix1[11] - pix2[11]];
317         s += sq[pix1[12] - pix2[12]];
318         s += sq[pix1[13] - pix2[13]];
319         s += sq[pix1[14] - pix2[14]];
320         s += sq[pix1[15] - pix2[15]];
321
322         pix1 += line_size;
323         pix2 += line_size;
324     }
325     return s;
326 }
327
328 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
329                           const uint8_t *s2, int stride)
330 {
331     int i;
332
333     /* read the pixels */
334     for (i = 0; i < 8; i++) {
335         block[0] = s1[0] - s2[0];
336         block[1] = s1[1] - s2[1];
337         block[2] = s1[2] - s2[2];
338         block[3] = s1[3] - s2[3];
339         block[4] = s1[4] - s2[4];
340         block[5] = s1[5] - s2[5];
341         block[6] = s1[6] - s2[6];
342         block[7] = s1[7] - s2[7];
343         s1      += stride;
344         s2      += stride;
345         block   += 8;
346     }
347 }
348
349 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
350                                  int line_size)
351 {
352     int i;
353
354     /* read the pixels */
355     for (i = 0; i < 8; i++) {
356         pixels[0] = av_clip_uint8(block[0]);
357         pixels[1] = av_clip_uint8(block[1]);
358         pixels[2] = av_clip_uint8(block[2]);
359         pixels[3] = av_clip_uint8(block[3]);
360         pixels[4] = av_clip_uint8(block[4]);
361         pixels[5] = av_clip_uint8(block[5]);
362         pixels[6] = av_clip_uint8(block[6]);
363         pixels[7] = av_clip_uint8(block[7]);
364
365         pixels += line_size;
366         block  += 8;
367     }
368 }
369
370 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
371                                  int line_size)
372 {
373     int i;
374
375     /* read the pixels */
376     for(i=0;i<4;i++) {
377         pixels[0] = av_clip_uint8(block[0]);
378         pixels[1] = av_clip_uint8(block[1]);
379         pixels[2] = av_clip_uint8(block[2]);
380         pixels[3] = av_clip_uint8(block[3]);
381
382         pixels += line_size;
383         block += 8;
384     }
385 }
386
387 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
388                                  int line_size)
389 {
390     int i;
391
392     /* read the pixels */
393     for(i=0;i<2;i++) {
394         pixels[0] = av_clip_uint8(block[0]);
395         pixels[1] = av_clip_uint8(block[1]);
396
397         pixels += line_size;
398         block += 8;
399     }
400 }
401
402 static void put_signed_pixels_clamped_c(const int16_t *block,
403                                         uint8_t *av_restrict pixels,
404                                         int line_size)
405 {
406     int i, j;
407
408     for (i = 0; i < 8; i++) {
409         for (j = 0; j < 8; j++) {
410             if (*block < -128)
411                 *pixels = 0;
412             else if (*block > 127)
413                 *pixels = 255;
414             else
415                 *pixels = (uint8_t) (*block + 128);
416             block++;
417             pixels++;
418         }
419         pixels += (line_size - 8);
420     }
421 }
422
423 static void add_pixels8_c(uint8_t *av_restrict pixels, int16_t *block,
424                           int line_size)
425 {
426     int i;
427
428     for (i = 0; i < 8; i++) {
429         pixels[0] += block[0];
430         pixels[1] += block[1];
431         pixels[2] += block[2];
432         pixels[3] += block[3];
433         pixels[4] += block[4];
434         pixels[5] += block[5];
435         pixels[6] += block[6];
436         pixels[7] += block[7];
437         pixels    += line_size;
438         block     += 8;
439     }
440 }
441
442 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
443                                  int line_size)
444 {
445     int i;
446
447     /* read the pixels */
448     for (i = 0; i < 8; i++) {
449         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
450         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
451         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
452         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
453         pixels[4] = av_clip_uint8(pixels[4] + block[4]);
454         pixels[5] = av_clip_uint8(pixels[5] + block[5]);
455         pixels[6] = av_clip_uint8(pixels[6] + block[6]);
456         pixels[7] = av_clip_uint8(pixels[7] + block[7]);
457         pixels   += line_size;
458         block    += 8;
459     }
460 }
461
462 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
463                           int line_size)
464 {
465     int i;
466
467     /* read the pixels */
468     for(i=0;i<4;i++) {
469         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
470         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
471         pixels[2] = av_clip_uint8(pixels[2] + block[2]);
472         pixels[3] = av_clip_uint8(pixels[3] + block[3]);
473         pixels += line_size;
474         block += 8;
475     }
476 }
477
478 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
479                           int line_size)
480 {
481     int i;
482
483     /* read the pixels */
484     for(i=0;i<2;i++) {
485         pixels[0] = av_clip_uint8(pixels[0] + block[0]);
486         pixels[1] = av_clip_uint8(pixels[1] + block[1]);
487         pixels += line_size;
488         block += 8;
489     }
490 }
491
/**
 * Sum the absolute values of the 64 coefficients of a block.
 *
 * @param block 64 contiguous coefficients
 * @return sum of FFABS() over all 64 entries
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    const int16_t *coeff     = block;
    const int16_t *const end = block + 64;
    int total = 0;

    while (coeff < end) {
        total += FFABS(*coeff);
        coeff++;
    }
    return total;
}
500
/**
 * Set h rows of 16 bytes each to value.
 *
 * @param block     first byte of the first row
 * @param value     byte to store
 * @param line_size distance in bytes between the starts of consecutive rows
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h > 0) {
        memset(block, value, 16);
        block += line_size;
        h--;
    }
}
510
/**
 * Set h rows of 8 bytes each to value.
 *
 * @param block     first byte of the first row
 * @param value     byte to store
 * @param line_size distance in bytes between the starts of consecutive rows
 * @param h         number of rows; nothing is written when h <= 0
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h > 0) {
        memset(block, value, 8);
        block += line_size;
        h--;
    }
}
520
/* Rounded averages of two and of four values.  Every argument is
 * parenthesised so that an argument containing a lower-precedence operator
 * (a shift, a conditional, ...) is evaluated as a unit instead of being
 * absorbed into the surrounding sum. */
#define avg2(a, b) (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
523
/**
 * Bilinear interpolation of a block 8 samples wide and h rows tall at a
 * fixed sub-sample offset given in 1/16 units.
 *
 * Each output sample is the weighted sum of the four neighbouring source
 * samples (weights sum to 256), plus rounder, shifted right by 8.  Nine
 * source columns and h + 1 source rows are read.
 *
 * @param dst     destination block
 * @param src     source block
 * @param stride  row stride in bytes, used for both dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal weight of the right-hand neighbour
 * @param y16     vertical weight of the lower neighbour
 * @param rounder value added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int w_tl = (16 - x16) * (16 - y16);   /* top-left     */
    const int w_tr = x16        * (16 - y16);   /* top-right    */
    const int w_bl = (16 - x16) * y16;          /* bottom-left  */
    const int w_br = x16        * y16;          /* bottom-right */
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (w_tl * src[col]          + w_tr * src[col + 1] +
                        w_bl * src[stride + col] + w_br * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
546
/**
 * Warp a block 8 samples wide and h rows tall with an affine motion field
 * and bilinear interpolation.
 *
 * For every output sample the source position is (vx >> 16, vy >> 16); the
 * low `shift` bits of each are the sub-sample fraction, the remaining bits
 * the integer coordinate.  vx/vy start at ox/oy, advance by dxx/dyx per
 * output column and by dxy/dyy per output row.  Integer coordinates outside
 * [0, width - 1) or [0, height - 1) are clipped and interpolation along that
 * axis is dropped.
 *
 * @param dst    destination block
 * @param src    source plane
 * @param stride row stride in bytes, used for both dst and src
 * @param h      number of rows to produce
 * @param ox     start value of vx for the first row
 * @param oy     start value of vy for the first row
 * @param dxx    vx increment per output column
 * @param dxy    vx start increment per output row
 * @param dyx    vy increment per output column
 * @param dyy    vy start increment per output row
 * @param shift  number of fractional bits in the sample position
 * @param r      value added before the final shift by 2 * shift
 * @param width  source width used for clipping
 * @param height source height used for clipping
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
{
    const int s         = 1 << shift;
    const int frac_mask = s - 1;
    const int max_x     = width  - 1;
    const int max_y     = height - 1;
    int y;

    for (y = 0; y < h; y++) {
        uint8_t *out = dst + y * stride;
        int vx = ox;
        int vy = oy;
        int x;

        for (x = 0; x < 8; x++) { // FIXME: optimize
            const int pos_x  = vx >> 16;
            const int pos_y  = vy >> 16;
            const int frac_x = pos_x & frac_mask;
            const int frac_y = pos_y & frac_mask;
            const int src_x  = pos_x >> shift;
            const int src_y  = pos_y >> shift;
            /* The unsigned compare rejects negative coordinates and
             * coordinates >= max in a single test. */
            const int x_inside = (unsigned) src_x < (unsigned) max_x;
            const int y_inside = (unsigned) src_y < (unsigned) max_y;
            const int col      = x_inside ? src_x : av_clip(src_x, 0, max_x);
            const int row      = y_inside ? src_y : av_clip(src_y, 0, max_y);
            const int index    = col + row * stride;

            if (x_inside && y_inside) {
                /* Full bilinear interpolation. */
                out[x] = ((src[index]              * (s - frac_x) +
                           src[index + 1]          * frac_x) * (s - frac_y) +
                          (src[index + stride]     * (s - frac_x) +
                           src[index + stride + 1] * frac_x) * frac_y +
                          r) >> (shift * 2);
            } else if (x_inside) {
                /* Row clipped: horizontal interpolation only. */
                out[x] = ((src[index]     * (s - frac_x) +
                           src[index + 1] * frac_x) * s +
                          r) >> (shift * 2);
            } else if (y_inside) {
                /* Column clipped: vertical interpolation only. */
                out[x] = ((src[index]          * (s - frac_y) +
                           src[index + stride] * frac_y) * s +
                          r) >> (shift * 2);
            } else {
                /* Both clipped: plain copy of the clipped sample. */
                out[x] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
609
610 #define QPEL_MC(r, OPNAME, RND, OP)                                           \
611 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src,       \
612                                             int dstStride, int srcStride,     \
613                                             int h)                            \
614 {                                                                             \
615     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
616     int i;                                                                    \
617                                                                               \
618     for (i = 0; i < h; i++) {                                                 \
619         OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
620         OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
621         OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
622         OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
623         OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
624         OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
625         OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
626         OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
627         dst += dstStride;                                                     \
628         src += srcStride;                                                     \
629     }                                                                         \
630 }                                                                             \
631                                                                               \
632 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src,       \
633                                             int dstStride, int srcStride)     \
634 {                                                                             \
635     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
636     const int w = 8;                                                          \
637     int i;                                                                    \
638                                                                               \
639     for (i = 0; i < w; i++) {                                                 \
640         const int src0 = src[0 * srcStride];                                  \
641         const int src1 = src[1 * srcStride];                                  \
642         const int src2 = src[2 * srcStride];                                  \
643         const int src3 = src[3 * srcStride];                                  \
644         const int src4 = src[4 * srcStride];                                  \
645         const int src5 = src[5 * srcStride];                                  \
646         const int src6 = src[6 * srcStride];                                  \
647         const int src7 = src[7 * srcStride];                                  \
648         const int src8 = src[8 * srcStride];                                  \
649         OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
650         OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
651         OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
652         OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
653         OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
654         OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
655         OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
656         OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
657         dst++;                                                                \
658         src++;                                                                \
659     }                                                                         \
660 }                                                                             \
661                                                                               \
662 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src,      \
663                                              int dstStride, int srcStride,    \
664                                              int h)                           \
665 {                                                                             \
666     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
667     int i;                                                                    \
668                                                                               \
669     for (i = 0; i < h; i++) {                                                 \
670         OP(dst[0],  (src[0]  + src[1])  * 20 - (src[0]  + src[2])  * 6 + (src[1]  + src[3])  * 3 - (src[2]  + src[4]));  \
671         OP(dst[1],  (src[1]  + src[2])  * 20 - (src[0]  + src[3])  * 6 + (src[0]  + src[4])  * 3 - (src[1]  + src[5]));  \
672         OP(dst[2],  (src[2]  + src[3])  * 20 - (src[1]  + src[4])  * 6 + (src[0]  + src[5])  * 3 - (src[0]  + src[6]));  \
673         OP(dst[3],  (src[3]  + src[4])  * 20 - (src[2]  + src[5])  * 6 + (src[1]  + src[6])  * 3 - (src[0]  + src[7]));  \
674         OP(dst[4],  (src[4]  + src[5])  * 20 - (src[3]  + src[6])  * 6 + (src[2]  + src[7])  * 3 - (src[1]  + src[8]));  \
675         OP(dst[5],  (src[5]  + src[6])  * 20 - (src[4]  + src[7])  * 6 + (src[3]  + src[8])  * 3 - (src[2]  + src[9]));  \
676         OP(dst[6],  (src[6]  + src[7])  * 20 - (src[5]  + src[8])  * 6 + (src[4]  + src[9])  * 3 - (src[3]  + src[10])); \
677         OP(dst[7],  (src[7]  + src[8])  * 20 - (src[6]  + src[9])  * 6 + (src[5]  + src[10]) * 3 - (src[4]  + src[11])); \
678         OP(dst[8],  (src[8]  + src[9])  * 20 - (src[7]  + src[10]) * 6 + (src[6]  + src[11]) * 3 - (src[5]  + src[12])); \
679         OP(dst[9],  (src[9]  + src[10]) * 20 - (src[8]  + src[11]) * 6 + (src[7]  + src[12]) * 3 - (src[6]  + src[13])); \
680         OP(dst[10], (src[10] + src[11]) * 20 - (src[9]  + src[12]) * 6 + (src[8]  + src[13]) * 3 - (src[7]  + src[14])); \
681         OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9]  + src[14]) * 3 - (src[8]  + src[15])); \
682         OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9]  + src[16])); \
683         OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
684         OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
685         OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
686         dst += dstStride;                                                     \
687         src += srcStride;                                                     \
688     }                                                                         \
689 }                                                                             \
690                                                                               \
691 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src,      \
692                                              int dstStride, int srcStride)    \
693 {                                                                             \
694     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;                           \
695     const int w = 16;                                                         \
696     int i;                                                                    \
697                                                                               \
698     for (i = 0; i < w; i++) {                                                 \
699         const int src0  = src[0  * srcStride];                                \
700         const int src1  = src[1  * srcStride];                                \
701         const int src2  = src[2  * srcStride];                                \
702         const int src3  = src[3  * srcStride];                                \
703         const int src4  = src[4  * srcStride];                                \
704         const int src5  = src[5  * srcStride];                                \
705         const int src6  = src[6  * srcStride];                                \
706         const int src7  = src[7  * srcStride];                                \
707         const int src8  = src[8  * srcStride];                                \
708         const int src9  = src[9  * srcStride];                                \
709         const int src10 = src[10 * srcStride];                                \
710         const int src11 = src[11 * srcStride];                                \
711         const int src12 = src[12 * srcStride];                                \
712         const int src13 = src[13 * srcStride];                                \
713         const int src14 = src[14 * srcStride];                                \
714         const int src15 = src[15 * srcStride];                                \
715         const int src16 = src[16 * srcStride];                                \
716         OP(dst[0  * dstStride], (src0  + src1)  * 20 - (src0  + src2)  * 6 + (src1  + src3)  * 3 - (src2  + src4));  \
717         OP(dst[1  * dstStride], (src1  + src2)  * 20 - (src0  + src3)  * 6 + (src0  + src4)  * 3 - (src1  + src5));  \
718         OP(dst[2  * dstStride], (src2  + src3)  * 20 - (src1  + src4)  * 6 + (src0  + src5)  * 3 - (src0  + src6));  \
719         OP(dst[3  * dstStride], (src3  + src4)  * 20 - (src2  + src5)  * 6 + (src1  + src6)  * 3 - (src0  + src7));  \
720         OP(dst[4  * dstStride], (src4  + src5)  * 20 - (src3  + src6)  * 6 + (src2  + src7)  * 3 - (src1  + src8));  \
721         OP(dst[5  * dstStride], (src5  + src6)  * 20 - (src4  + src7)  * 6 + (src3  + src8)  * 3 - (src2  + src9));  \
722         OP(dst[6  * dstStride], (src6  + src7)  * 20 - (src5  + src8)  * 6 + (src4  + src9)  * 3 - (src3  + src10)); \
723         OP(dst[7  * dstStride], (src7  + src8)  * 20 - (src6  + src9)  * 6 + (src5  + src10) * 3 - (src4  + src11)); \
724         OP(dst[8  * dstStride], (src8  + src9)  * 20 - (src7  + src10) * 6 + (src6  + src11) * 3 - (src5  + src12)); \
725         OP(dst[9  * dstStride], (src9  + src10) * 20 - (src8  + src11) * 6 + (src7  + src12) * 3 - (src6  + src13)); \
726         OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9  + src12) * 6 + (src8  + src13) * 3 - (src7  + src14)); \
727         OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9  + src14) * 3 - (src8  + src15)); \
728         OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9  + src16)); \
729         OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
730         OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
731         OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
732         dst++;                                                                \
733         src++;                                                                \
734     }                                                                         \
735 }                                                                             \
736                                                                               \
737 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src,                \
738                                    ptrdiff_t stride)                          \
739 { /* Combines src with its horizontal-lowpass output into dst. */             \
740     uint8_t half[64]; /* scratch for the lowpass pass, row stride 8 */        \
741     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
742     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);             \
743     OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);             \
744 }                                                                             \
745                                                                               \
746 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src,                \
747                                    ptrdiff_t stride)                          \
748 { /* Hands src directly to the OPNAME horizontal lowpass, targeting dst. */   \
749     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);             \
750 }                                                                             \
751                                                                               \
752 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src,                \
753                                    ptrdiff_t stride)                          \
754 { /* Same steps as qpel8_mc10_c, but l2 reads src + 1 instead of src. */      \
755     uint8_t half[64]; /* scratch for the lowpass pass, row stride 8 */        \
756     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
757     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);             \
758     OPNAME ## pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8);         \
759 }                                                                             \
760                                                                               \
761 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src,                \
762                                    ptrdiff_t stride)                          \
763 { /* Combines a local copy of src with its vertical-lowpass output. */        \
764     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
765     uint8_t half[64]; /* scratch for the lowpass pass, row stride 8 */        \
766     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
767     copy_block9(full, src, 16, stride, 9);                                    \
768     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);                   \
769     OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);                \
770 }                                                                             \
771                                                                               \
772 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src,                \
773                                    ptrdiff_t stride)                          \
774 { /* Copies src into full, then runs the OPNAME v-lowpass into dst. */        \
775     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
776                                                                               \
777     copy_block9(full, src, 16, stride, 9);                                    \
778     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);                   \
779 }                                                                             \
780                                                                               \
781 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src,                \
782                                    ptrdiff_t stride)                          \
783 { /* Same steps as qpel8_mc01_c, but l2 reads full + 16 (the next row). */    \
784     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
785     uint8_t half[64]; /* scratch for the lowpass pass, row stride 8 */        \
786     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
787     copy_block9(full, src, 16, stride, 9);                                    \
788     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);                   \
789     OPNAME ## pixels8_l2_8(dst, full + 16, half, stride, 16, 8, 8);           \
790 }                                                                             \
791                                                                               \
792 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src,            \
793                                        ptrdiff_t stride)                      \
794 { /* l4 combines full, halfH, halfV and halfHV into dst. */                   \
795     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
796     uint8_t halfH[72]; /* h-lowpass of full (stride 8, count 9) */            \
797     uint8_t halfV[64]; /* v-lowpass of full, stride 8 */                      \
798     uint8_t halfHV[64]; /* v-lowpass of halfH, stride 8 */                    \
799     /* NOTE(review): pixels8_l4 presumably averages four inputs -- confirm */ \
800     copy_block9(full, src, 16, stride, 9);                                    \
801     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
802     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
803     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
804     OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV,                   \
805                            stride, 16, 8, 8, 8, 8);                           \
806 }                                                                             \
807                                                                               \
808 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src,                \
809                                    ptrdiff_t stride)                          \
810 { /* halfH is first l2-merged with full, then v-lowpassed into halfHV. */     \
811     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
812     uint8_t halfH[72]; /* h-lowpass of full, later l2-merged in place */      \
813     uint8_t halfHV[64]; /* v-lowpass of halfH, stride 8 */                    \
814     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
815     copy_block9(full, src, 16, stride, 9);                                    \
816     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
817     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
818     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
819     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
820 }                                                                             \
821                                                                               \
822 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src,            \
823                                        ptrdiff_t stride)                      \
824 { /* As qpel8_mc11_old_c, but halfV and l4 are fed from full + 1. */          \
825     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
826     uint8_t halfH[72]; /* h-lowpass of full (stride 8, count 9) */            \
827     uint8_t halfV[64]; /* v-lowpass of full + 1, stride 8 */                  \
828     uint8_t halfHV[64]; /* v-lowpass of halfH, stride 8 */                    \
829     /* NOTE(review): pixels8_l4 presumably averages four inputs -- confirm */ \
830     copy_block9(full, src, 16, stride, 9);                                    \
831     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
832     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
833     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
834     OPNAME ## pixels8_l4_8(dst, full + 1, halfH, halfV, halfHV,               \
835                            stride, 16, 8, 8, 8, 8);                           \
836 }                                                                             \
837                                                                               \
838 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src,                \
839                                    ptrdiff_t stride)                          \
840 { /* As qpel8_mc11_c, but halfH is l2-merged with full + 1. */                \
841     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
842     uint8_t halfH[72]; /* h-lowpass of full, later l2-merged in place */      \
843     uint8_t halfHV[64]; /* v-lowpass of halfH, stride 8 */                    \
844     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
845     copy_block9(full, src, 16, stride, 9);                                    \
846     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
847     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
848     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
849     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
850 }                                                                             \
851                                                                               \
852 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src,            \
853                                        ptrdiff_t stride)                      \
854 { /* As qpel8_mc11_old_c, but l4 reads full + 16 and halfH + 8. */            \
855     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
856     uint8_t halfH[72]; /* h-lowpass of full (stride 8, count 9) */            \
857     uint8_t halfV[64]; /* v-lowpass of full, stride 8 */                      \
858     uint8_t halfHV[64]; /* v-lowpass of halfH, stride 8 */                    \
859     /* NOTE(review): pixels8_l4 presumably averages four inputs -- confirm */ \
860     copy_block9(full, src, 16, stride, 9);                                    \
861     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
862     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
863     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
864     OPNAME ## pixels8_l4_8(dst, full + 16, halfH + 8, halfV, halfHV,          \
865                            stride, 16, 8, 8, 8, 8);                           \
866 }                                                                             \
867                                                                               \
868 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src,                \
869                                    ptrdiff_t stride)                          \
870 { /* As qpel8_mc11_c, but the final l2 reads halfH + 8 (the next row). */     \
871     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
872     uint8_t halfH[72]; /* h-lowpass of full, later l2-merged in place */      \
873     uint8_t halfHV[64]; /* v-lowpass of halfH, stride 8 */                    \
874     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
875     copy_block9(full, src, 16, stride, 9);                                    \
876     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
877     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
878     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
879     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
880 }                                                                             \
881                                                                               \
882 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src,            \
883                                        ptrdiff_t stride)                      \
884 { /* As qpel8_mc31_old_c, but l4 reads full + 17 and halfH + 8. */            \
885     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
886     uint8_t halfH[72]; /* h-lowpass of full (stride 8, count 9) */            \
887     uint8_t halfV[64]; /* v-lowpass of full + 1, stride 8 */                  \
888     uint8_t halfHV[64]; /* v-lowpass of halfH, stride 8 */                    \
889     /* NOTE(review): pixels8_l4 presumably averages four inputs -- confirm */ \
890     copy_block9(full, src, 16, stride, 9);                                    \
891     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
892     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
893     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
894     OPNAME ## pixels8_l4_8(dst, full + 17, halfH + 8, halfV, halfHV,          \
895                            stride, 16, 8, 8, 8, 8);                           \
896 }                                                                             \
897                                                                               \
898 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src,                \
899                                    ptrdiff_t stride)                          \
900 { /* As qpel8_mc31_c, but the final l2 reads halfH + 8 (the next row). */     \
901     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
902     uint8_t halfH[72]; /* h-lowpass of full, later l2-merged in place */      \
903     uint8_t halfHV[64]; /* v-lowpass of halfH, stride 8 */                    \
904     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
905     copy_block9(full, src, 16, stride, 9);                                    \
906     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
907     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
908     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
909     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
910 }                                                                             \
911                                                                               \
912 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src,                \
913                                    ptrdiff_t stride)                          \
914 { /* H-lowpass of src, v-lowpass of that, then l2 of both into dst. */        \
915     uint8_t halfH[72]; /* h-lowpass of src (stride 8, count 9) */             \
916     uint8_t halfHV[64]; /* v-lowpass of halfH, stride 8 */                    \
917     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
918     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
919     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
920     OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);              \
921 }                                                                             \
922                                                                               \
923 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src,                \
924                                    ptrdiff_t stride)                          \
925 { /* As qpel8_mc21_c, but the final l2 reads halfH + 8 (the next row). */     \
926     uint8_t halfH[72]; /* h-lowpass of src (stride 8, count 9) */             \
927     uint8_t halfHV[64]; /* v-lowpass of halfH, stride 8 */                    \
928     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
929     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
930     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
931     OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8);          \
932 }                                                                             \
933                                                                               \
934 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src,            \
935                                        ptrdiff_t stride)                      \
936 { /* l2 of halfV (v-lowpass of full) and halfHV (v-lowpass of halfH). */      \
937     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
938     uint8_t halfH[72]; /* h-lowpass of full (stride 8, count 9) */            \
939     uint8_t halfV[64]; /* v-lowpass of full, stride 8 */                      \
940     uint8_t halfHV[64]; /* v-lowpass of halfH, stride 8 */                    \
941     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
942     copy_block9(full, src, 16, stride, 9);                                    \
943     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
944     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);                  \
945     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
946     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);              \
947 }                                                                             \
948                                                                               \
949 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src,                \
950                                    ptrdiff_t stride)                          \
951 { /* halfH is l2-merged with full, then v-lowpassed straight into dst. */     \
952     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
953     uint8_t halfH[72]; /* h-lowpass of full, later l2-merged in place */      \
954     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
955     copy_block9(full, src, 16, stride, 9);                                    \
956     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
957     put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);              \
958     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
959 }                                                                             \
960                                                                               \
961 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src,            \
962                                        ptrdiff_t stride)                      \
963 { /* As qpel8_mc12_old_c, but halfV is v-lowpassed from full + 1. */          \
964     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
965     uint8_t halfH[72]; /* h-lowpass of full (stride 8, count 9) */            \
966     uint8_t halfV[64]; /* v-lowpass of full + 1, stride 8 */                  \
967     uint8_t halfHV[64]; /* v-lowpass of halfH, stride 8 */                    \
968     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
969     copy_block9(full, src, 16, stride, 9);                                    \
970     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
971     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16);              \
972     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);                 \
973     OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);              \
974 }                                                                             \
975                                                                               \
976 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src,                \
977                                    ptrdiff_t stride)                          \
978 { /* As qpel8_mc12_c, but halfH is l2-merged with full + 1. */                \
979     uint8_t full[16 * 9]; /* filled from src by copy_block9, stride 16 */     \
980     uint8_t halfH[72]; /* h-lowpass of full, later l2-merged in place */      \
981     /* NOTE(review): pixels8_l2 presumably averages both inputs -- confirm */ \
982     copy_block9(full, src, 16, stride, 9);                                    \
983     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);               \
984     put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9);          \
985     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
986 }                                                                             \
987                                                                               \
988 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src,                \
989                                    ptrdiff_t stride)                          \
990 { /* H-lowpass of src into halfH, then OPNAME v-lowpass of halfH to dst. */   \
991     uint8_t halfH[72]; /* h-lowpass of src (stride 8, count 9) */             \
992                                                                               \
993     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);            \
994     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);                   \
995 }                                                                             \
996                                                                               \
997 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src,               \
998                                     ptrdiff_t stride)                         \
999 { /* Combines src with its horizontal-lowpass output into dst. */             \
1000     uint8_t half[256]; /* scratch for the lowpass pass, row stride 16 */      \
1001     /* NOTE(review): pixels16_l2 presumably averages its inputs -- confirm */ \
1002     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
1003     OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);          \
1004 }                                                                             \
1005                                                                               \
1006 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src,               \
1007                                     ptrdiff_t stride)                         \
1008 { /* Hands src directly to the OPNAME horizontal lowpass, targeting dst. */   \
1009     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);           \
1010 }                                                                             \
1011                                                                               \
1012 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src,               \
1013                                     ptrdiff_t stride)                         \
1014 { /* Same steps as qpel16_mc10_c, but l2 reads src + 1 instead of src. */     \
1015     uint8_t half[256]; /* scratch for the lowpass pass, row stride 16 */      \
1016     /* NOTE(review): pixels16_l2 presumably averages its inputs -- confirm */ \
1017     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);          \
1018     OPNAME ## pixels16_l2_8(dst, src + 1, half, stride, stride, 16, 16);      \
1019 }                                                                             \
1020                                                                               \
1021 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src,               \
1022                                     ptrdiff_t stride)                         \
1023 { /* Combines a local copy of src with its vertical-lowpass output. */        \
1024     uint8_t full[24 * 17]; /* filled from src by copy_block17, stride 24 */   \
1025     uint8_t half[256]; /* scratch for the lowpass pass, row stride 16 */      \
1026     /* NOTE(review): pixels16_l2 presumably averages its inputs -- confirm */ \
1027     copy_block17(full, src, 24, stride, 17);                                  \
1028     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
1029     OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);             \
1030 }                                                                             \
1031                                                                               \
1032 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src,               \
1033                                     ptrdiff_t stride)                         \
1034 { /* Copies src into full, then runs the OPNAME v-lowpass into dst. */        \
1035     uint8_t full[24 * 17]; /* filled from src by copy_block17, stride 24 */   \
1036                                                                               \
1037     copy_block17(full, src, 24, stride, 17);                                  \
1038     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);                  \
1039 }                                                                             \
1040                                                                               \
1041 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src,               \
1042                                     ptrdiff_t stride)                         \
1043 { /* Same steps as qpel16_mc01_c, but l2 reads full + 24 (the next row). */   \
1044     uint8_t full[24 * 17]; /* filled from src by copy_block17, stride 24 */   \
1045     uint8_t half[256]; /* scratch for the lowpass pass, row stride 16 */      \
1046     /* NOTE(review): pixels16_l2 presumably averages its inputs -- confirm */ \
1047     copy_block17(full, src, 24, stride, 17);                                  \
1048     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);                 \
1049     OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16);        \
1050 }                                                                             \
1051                                                                               \
1052 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src,           \
1053                                         ptrdiff_t stride)                     \
1054 { /* l4 combines full, halfH, halfV and halfHV into dst. */                   \
1055     uint8_t full[24 * 17]; /* filled from src by copy_block17, stride 24 */   \
1056     uint8_t halfH[272]; /* h-lowpass of full (stride 16, count 17) */         \
1057     uint8_t halfV[256]; /* v-lowpass of full, stride 16 */                    \
1058     uint8_t halfHV[256]; /* v-lowpass of halfH, stride 16 */                  \
1059     /* NOTE(review): pixels16_l4 presumably averages 4 inputs -- confirm */   \
1060     copy_block17(full, src, 24, stride, 17);                                  \
1061     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1062     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1063     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1064     OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV,                  \
1065                             stride, 24, 16, 16, 16, 16);                      \
1066 }                                                                             \
1067                                                                               \
1068 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src,               \
1069                                     ptrdiff_t stride)                         \
1070 { /* halfH is first l2-merged with full, then v-lowpassed into halfHV. */     \
1071     uint8_t full[24 * 17]; /* filled from src by copy_block17, stride 24 */   \
1072     uint8_t halfH[272]; /* h-lowpass of full, later l2-merged in place */     \
1073     uint8_t halfHV[256]; /* v-lowpass of halfH, stride 16 */                  \
1074     /* NOTE(review): pixels16_l2 presumably averages its inputs -- confirm */ \
1075     copy_block17(full, src, 24, stride, 17);                                  \
1076     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1077     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1078     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1079     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1080 }                                                                             \
1081                                                                               \
1082 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src,           \
1083                                         ptrdiff_t stride)                     \
1084 { /* As qpel16_mc11_old_c, but halfV and l4 are fed from full + 1. */         \
1085     uint8_t full[24 * 17]; /* filled from src by copy_block17, stride 24 */   \
1086     uint8_t halfH[272]; /* h-lowpass of full (stride 16, count 17) */         \
1087     uint8_t halfV[256]; /* v-lowpass of full + 1, stride 16 */                \
1088     uint8_t halfHV[256]; /* v-lowpass of halfH, stride 16 */                  \
1089     /* NOTE(review): pixels16_l4 presumably averages 4 inputs -- confirm */   \
1090     copy_block17(full, src, 24, stride, 17);                                  \
1091     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1092     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1093     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1094     OPNAME ## pixels16_l4_8(dst, full + 1, halfH, halfV, halfHV,              \
1095                             stride, 24, 16, 16, 16, 16);                      \
1096 }                                                                             \
1097                                                                               \
1098 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src,               \
1099                                     ptrdiff_t stride)                         \
1100 { /* As qpel16_mc11_c, but halfH is l2-merged with full + 1. */               \
1101     uint8_t full[24 * 17]; /* filled from src by copy_block17, stride 24 */   \
1102     uint8_t halfH[272]; /* h-lowpass of full, later l2-merged in place */     \
1103     uint8_t halfHV[256]; /* v-lowpass of halfH, stride 16 */                  \
1104     /* NOTE(review): pixels16_l2 presumably averages its inputs -- confirm */ \
1105     copy_block17(full, src, 24, stride, 17);                                  \
1106     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1107     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1108     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1109     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1110 }                                                                             \
1111                                                                               \
1112 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src,           \
1113                                         ptrdiff_t stride)                     \
1114 {                                                                             \
1115     uint8_t full[24 * 17];                                                    \
1116     uint8_t halfH[272];                                                       \
1117     uint8_t halfV[256];                                                       \
1118     uint8_t halfHV[256];                                                      \
1119                                                                               \
1120     copy_block17(full, src, 24, stride, 17);                                  \
1121     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1122     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1123     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1124     OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV,        \
1125                             stride, 24, 16, 16, 16, 16);                      \
1126 }                                                                             \
1127                                                                               \
1128 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src,               \
1129                                     ptrdiff_t stride)                         \
1130 {                                                                             \
1131     uint8_t full[24 * 17];                                                    \
1132     uint8_t halfH[272];                                                       \
1133     uint8_t halfHV[256];                                                      \
1134                                                                               \
1135     copy_block17(full, src, 24, stride, 17);                                  \
1136     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1137     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1138     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1139     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1140 }                                                                             \
1141                                                                               \
1142 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src,           \
1143                                         ptrdiff_t stride)                     \
1144 {                                                                             \
1145     uint8_t full[24 * 17];                                                    \
1146     uint8_t halfH[272];                                                       \
1147     uint8_t halfV[256];                                                       \
1148     uint8_t halfHV[256];                                                      \
1149                                                                               \
1150     copy_block17(full, src, 24, stride, 17);                                  \
1151     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1152     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1153     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1154     OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV,        \
1155                             stride, 24, 16, 16, 16, 16);                      \
1156 }                                                                             \
1157                                                                               \
1158 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src,               \
1159                                     ptrdiff_t stride)                         \
1160 {                                                                             \
1161     uint8_t full[24 * 17];                                                    \
1162     uint8_t halfH[272];                                                       \
1163     uint8_t halfHV[256];                                                      \
1164                                                                               \
1165     copy_block17(full, src, 24, stride, 17);                                  \
1166     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1167     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1168     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1169     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1170 }                                                                             \
1171                                                                               \
1172 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src,               \
1173                                     ptrdiff_t stride)                         \
1174 {                                                                             \
1175     uint8_t halfH[272];                                                       \
1176     uint8_t halfHV[256];                                                      \
1177                                                                               \
1178     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1179     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1180     OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);          \
1181 }                                                                             \
1182                                                                               \
1183 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src,               \
1184                                     ptrdiff_t stride)                         \
1185 {                                                                             \
1186     uint8_t halfH[272];                                                       \
1187     uint8_t halfHV[256];                                                      \
1188                                                                               \
1189     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1190     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1191     OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16);     \
1192 }                                                                             \
1193                                                                               \
1194 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src,           \
1195                                         ptrdiff_t stride)                     \
1196 {                                                                             \
1197     uint8_t full[24 * 17];                                                    \
1198     uint8_t halfH[272];                                                       \
1199     uint8_t halfV[256];                                                       \
1200     uint8_t halfHV[256];                                                      \
1201                                                                               \
1202     copy_block17(full, src, 24, stride, 17);                                  \
1203     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1204     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);                \
1205     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1206     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1207 }                                                                             \
1208                                                                               \
1209 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src,               \
1210                                     ptrdiff_t stride)                         \
1211 {                                                                             \
1212     uint8_t full[24 * 17];                                                    \
1213     uint8_t halfH[272];                                                       \
1214                                                                               \
1215     copy_block17(full, src, 24, stride, 17);                                  \
1216     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1217     put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);          \
1218     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1219 }                                                                             \
1220                                                                               \
1221 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src,           \
1222                                         ptrdiff_t stride)                     \
1223 {                                                                             \
1224     uint8_t full[24 * 17];                                                    \
1225     uint8_t halfH[272];                                                       \
1226     uint8_t halfV[256];                                                       \
1227     uint8_t halfHV[256];                                                      \
1228                                                                               \
1229     copy_block17(full, src, 24, stride, 17);                                  \
1230     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1231     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24);            \
1232     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);              \
1233     OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);          \
1234 }                                                                             \
1235                                                                               \
1236 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src,               \
1237                                     ptrdiff_t stride)                         \
1238 {                                                                             \
1239     uint8_t full[24 * 17];                                                    \
1240     uint8_t halfH[272];                                                       \
1241                                                                               \
1242     copy_block17(full, src, 24, stride, 17);                                  \
1243     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);            \
1244     put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17);      \
1245     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1246 }                                                                             \
1247                                                                               \
1248 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src,               \
1249                                     ptrdiff_t stride)                         \
1250 {                                                                             \
1251     uint8_t halfH[272];                                                       \
1252                                                                               \
1253     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);         \
1254     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);                 \
1255 }
1256
1257 #define op_avg(a, b)        a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
1258 #define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5])     >> 1)
1259 #define op_put(a, b)        a = cm[((b) + 16) >> 5]
1260 #define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
1261
1262 QPEL_MC(0, put_, _, op_put)
1263 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1264 QPEL_MC(0, avg_, _, op_avg)
1265
1266 #undef op_avg
1267 #undef op_put
1268 #undef op_put_no_rnd
1269
/**
 * Fixed-size 8x8 entry point: forwards to put_pixels8_8_c() with the height
 * pinned to 8 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride, passed through unchanged
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    put_pixels8_8_c(dst, src, stride, rows);
}
1274
/**
 * Fixed-size 8x8 entry point: forwards to avg_pixels8_8_c() with the height
 * pinned to 8 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride, passed through unchanged
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    avg_pixels8_8_c(dst, src, stride, rows);
}
1279
/**
 * Fixed-size 16x16 entry point: forwards to put_pixels16_8_c() with the
 * height pinned to 16 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride, passed through unchanged
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    put_pixels16_8_c(dst, src, stride, rows);
}
1284
/**
 * Fixed-size 16x16 entry point: forwards to avg_pixels16_8_c() with the
 * height pinned to 16 rows.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride, passed through unchanged
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 16;

    avg_pixels16_8_c(dst, src, stride, rows);
}
1289
/*
 * Map the mc00 qpel names onto the fixed-size pixel wrappers.
 * The put_no_rnd names resolve to the same wrappers as the plain put names.
 * NOTE(review): mc00 is presumably the position with no sub-pel offset,
 * where the rounding mode cannot matter -- confirm against the qpel tables.
 */

/* 8x8 blocks */
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c

/* 16x16 blocks */
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
1296
1297 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
1298                                   int dstStride, int srcStride, int h)
1299 {
1300     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1301     int i;
1302
1303     for (i = 0; i < h; i++) {
1304         dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
1305         dst[1] = cm[(9 * (src[1] + src[2]) - (src[0]  + src[3]) + 8) >> 4];
1306         dst[2] = cm[(9 * (src[2] + src[3]) - (src[1]  + src[4]) + 8) >> 4];
1307         dst[3] = cm[(9 * (src[3] + src[4]) - (src[2]  + src[5]) + 8) >> 4];
1308         dst[4] = cm[(9 * (src[4] + src[5]) - (src[3]  + src[6]) + 8) >> 4];
1309         dst[5] = cm[(9 * (src[5] + src[6]) - (src[4]  + src[7]) + 8) >> 4];
1310         dst[6] = cm[(9 * (src[6] + src[7]) - (src[5]  + src[8]) + 8) >> 4];
1311         dst[7] = cm[(9 * (src[7] + src[8]) - (src[6]  + src[9]) + 8) >> 4];
1312         dst   += dstStride;
1313         src   += srcStride;
1314     }
1315 }
1316
#if CONFIG_DIRAC_DECODER
/*
 * Generate the C Dirac pixel functions for one operation name (instantiated
 * below for put and avg).
 *
 * For each block width 8, 16 and 32 three variants are emitted:
 *   - plain: forwards src[0] to OPNAME_pixelsN_8_c
 *   - _l2:   forwards src[0] and src[1] to OPNAME_pixelsN_l2_8
 *   - _l4:   forwards src[0] .. src[3] to OPNAME_pixelsN_l4_8
 * Every stride argument of the underlying helper receives the same `stride`.
 * The 32-wide variants are two 16-wide calls at column offsets 0 and 16.
 * src[4] is never read by any of the generated functions.
 */
#define DIRAC_MC(OPNAME)                                                      \
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5],   \
                                       int stride, int h)                     \
{                                                                             \
    OPNAME ## _pixels8_8_c(dst, src[0], stride, h);                           \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5],  \
                                        int stride, int h)                    \
{                                                                             \
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);                          \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5],  \
                                        int stride, int h)                    \
{                                                                             \
    OPNAME ## _pixels16_8_c(dst,      src[0],      stride, h);                \
    OPNAME ## _pixels16_8_c(dst + 16, src[0] + 16, stride, h);                \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst,                       \
                                          const uint8_t *src[5],              \
                                          int stride, int h)                  \
{                                                                             \
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1],                              \
                            stride, stride, stride, h);                       \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst,                      \
                                           const uint8_t *src[5],             \
                                           int stride, int h)                 \
{                                                                             \
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1],                             \
                             stride, stride, stride, h);                      \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst,                      \
                                           const uint8_t *src[5],             \
                                           int stride, int h)                 \
{                                                                             \
    OPNAME ## _pixels16_l2_8(dst,      src[0],      src[1],                   \
                             stride, stride, stride, h);                      \
    OPNAME ## _pixels16_l2_8(dst + 16, src[0] + 16, src[1] + 16,              \
                             stride, stride, stride, h);                      \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst,                       \
                                          const uint8_t *src[5],              \
                                          int stride, int h)                  \
{                                                                             \
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3],              \
                            stride, stride, stride, stride, stride, h);       \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst,                      \
                                           const uint8_t *src[5],             \
                                           int stride, int h)                 \
{                                                                             \
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3],             \
                             stride, stride, stride, stride, stride, h);      \
}                                                                             \
                                                                              \
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst,                      \
                                           const uint8_t *src[5],             \
                                           int stride, int h)                 \
{                                                                             \
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3],             \
                             stride, stride, stride, stride, stride, h);      \
    OPNAME ## _pixels16_l4_8(dst + 16, src[0] + 16, src[1] + 16,              \
                             src[2] + 16, src[3] + 16,                        \
                             stride, stride, stride, stride, stride, h);      \
}

DIRAC_MC(put)
DIRAC_MC(avg)
#endif
1361
1362 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
1363                                   int dstStride, int srcStride, int w)
1364 {
1365     const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1366     int i;
1367
1368     for (i = 0; i < w; i++) {
1369         const int src_1 = src[-srcStride];
1370         const int src0  = src[0];
1371         const int src1  = src[srcStride];
1372         const int src2  = src[2 * srcStride];
1373         const int src3  = src[3 * srcStride];
1374         const int src4  = src[4 * srcStride];
1375         const int src5  = src[5 * srcStride];
1376         const int src6  = src[6 * srcStride];
1377         const int src7  = src[7 * srcStride];
1378         const int src8  = src[8 * srcStride];
1379         const int src9  = src[9 * srcStride];
1380         dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
1381         dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0  + src3) + 8) >> 4];
1382         dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1  + src4) + 8) >> 4];
1383         dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2  + src5) + 8) >> 4];
1384         dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3  + src6) + 8) >> 4];
1385         dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4  + src7) + 8) >> 4];
1386         dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5  + src8) + 8) >> 4];
1387         dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6  + src9) + 8) >> 4];
1388         src++;
1389         dst++;
1390     }
1391 }
1392
/**
 * 8x8 mspel case mc10: filters src horizontally into a temporary 8x8 block
 * and combines that block with src itself through put_pixels8_l2_8().
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride shared by dst and src
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, filtered, stride, stride, 8, 8);
}
1400
/**
 * 8x8 mspel case mc20: writes the horizontally filtered source straight
 * into dst.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride shared by dst and src
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int rows = 8;

    wmv2_mspel8_h_lowpass(dst, src, stride, stride, rows);
}
1405
/**
 * 8x8 mspel case mc30: filters src horizontally into a temporary 8x8 block
 * and combines that block with the source shifted one pixel to the right
 * (src + 1) through put_pixels8_l2_8().
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride shared by dst and src
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t filtered[8 * 8];

    wmv2_mspel8_h_lowpass(filtered, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, filtered, stride, stride, 8, 8);
}
1413
/**
 * 8x8 mspel case mc02: writes the vertically filtered source straight
 * into dst.
 *
 * @param dst    destination block
 * @param src    source block
 * @param stride line stride shared by dst and src
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    const int columns = 8;

    wmv2_mspel8_v_lowpass(dst, src, stride, stride, columns);
}
1418
/**
 * 8x8 mspel case mc12: combines the vertically filtered source with the
 * horizontally-then-vertically filtered source through put_pixels8_l2_8().
 *
 * @param dst    destination block
 * @param src    source block; rows from one above the block onwards are read
 * @param stride line stride shared by dst and src
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t rows_h[8 * 11];
    uint8_t block_v[8 * 8];
    uint8_t block_hv[8 * 8];

    /* 11 horizontally filtered rows, starting one row above the block. */
    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(block_v, src, 8, stride, 8);
    /* Start at the second stored row so the first acts as the row above. */
    wmv2_mspel8_v_lowpass(block_hv, rows_h + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, block_v, block_hv, stride, 8, 8, 8);
}
1430
/**
 * 8x8 mspel case mc32: like mc12, except that the purely vertical filter is
 * applied to the source shifted one pixel to the right (src + 1).
 *
 * @param dst    destination block
 * @param src    source block; rows from one above the block onwards are read
 * @param stride line stride shared by dst and src
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t rows_h[8 * 11];
    uint8_t block_v[8 * 8];
    uint8_t block_hv[8 * 8];

    /* 11 horizontally filtered rows, starting one row above the block. */
    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(block_v, src + 1, 8, stride, 8);
    /* Start at the second stored row so the first acts as the row above. */
    wmv2_mspel8_v_lowpass(block_hv, rows_h + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, block_v, block_hv, stride, 8, 8, 8);
}
1442
/**
 * 8x8 mspel case mc22: filters the source horizontally into a temporary
 * buffer, then filters that buffer vertically straight into dst.
 *
 * @param dst    destination block
 * @param src    source block; rows from one above the block onwards are read
 * @param stride line stride shared by dst and src
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t rows_h[8 * 11];

    /* 11 horizontally filtered rows, starting one row above the block. */
    wmv2_mspel8_h_lowpass(rows_h, src - stride, 8, stride, 11);
    /* Start at the second stored row so the first acts as the row above. */
    wmv2_mspel8_v_lowpass(dst, rows_h + 8, stride, 8, 8);
}
1450
1451 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1452                               int line_size, int h)
1453 {
1454     int s = 0, i;
1455
1456     for (i = 0; i < h; i++) {
1457         s    += abs(pix1[0]  - pix2[0]);
1458         s    += abs(pix1[1]  - pix2[1]);
1459         s    += abs(pix1[2]  - pix2[2]);
1460         s    += abs(pix1[3]  - pix2[3]);
1461         s    += abs(pix1[4]  - pix2[4]);
1462         s    += abs(pix1[5]  - pix2[5]);
1463         s    += abs(pix1[6]  - pix2[6]);
1464         s    += abs(pix1[7]  - pix2[7]);
1465         s    += abs(pix1[8]  - pix2[8]);
1466         s    += abs(pix1[9]  - pix2[9]);
1467         s    += abs(pix1[10] - pix2[10]);
1468         s    += abs(pix1[11] - pix2[11]);
1469         s    += abs(pix1[12] - pix2[12]);
1470         s    += abs(pix1[13] - pix2[13]);
1471         s    += abs(pix1[14] - pix2[14]);
1472         s    += abs(pix1[15] - pix2[15]);
1473         pix1 += line_size;
1474         pix2 += line_size;
1475     }
1476     return s;
1477 }
1478
1479 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1480                           int line_size, int h)
1481 {
1482     int s = 0, i;
1483
1484     for (i = 0; i < h; i++) {
1485         s    += abs(pix1[0]  - avg2(pix2[0],  pix2[1]));
1486         s    += abs(pix1[1]  - avg2(pix2[1],  pix2[2]));
1487         s    += abs(pix1[2]  - avg2(pix2[2],  pix2[3]));
1488         s    += abs(pix1[3]  - avg2(pix2[3],  pix2[4]));
1489         s    += abs(pix1[4]  - avg2(pix2[4],  pix2[5]));
1490         s    += abs(pix1[5]  - avg2(pix2[5],  pix2[6]));
1491         s    += abs(pix1[6]  - avg2(pix2[6],  pix2[7]));
1492         s    += abs(pix1[7]  - avg2(pix2[7],  pix2[8]));
1493         s    += abs(pix1[8]  - avg2(pix2[8],  pix2[9]));
1494         s    += abs(pix1[9]  - avg2(pix2[9],  pix2[10]));
1495         s    += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1496         s    += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1497         s    += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1498         s    += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1499         s    += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1500         s    += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1501         pix1 += line_size;
1502         pix2 += line_size;
1503     }
1504     return s;
1505 }
1506
1507 static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1508                           int line_size, int h)
1509 {
1510     int s = 0, i;
1511     uint8_t *pix3 = pix2 + line_size;
1512
1513     for (i = 0; i < h; i++) {
1514         s    += abs(pix1[0]  - avg2(pix2[0],  pix3[0]));
1515         s    += abs(pix1[1]  - avg2(pix2[1],  pix3[1]));
1516         s    += abs(pix1[2]  - avg2(pix2[2],  pix3[2]));
1517         s    += abs(pix1[3]  - avg2(pix2[3],  pix3[3]));
1518         s    += abs(pix1[4]  - avg2(pix2[4],  pix3[4]));
1519         s    += abs(pix1[5]  - avg2(pix2[5],  pix3[5]));
1520         s    += abs(pix1[6]  - avg2(pix2[6],  pix3[6]));
1521         s    += abs(pix1[7]  - avg2(pix2[7],  pix3[7]));
1522         s    += abs(pix1[8]  - avg2(pix2[8],  pix3[8]));
1523         s    += abs(pix1[9]  - avg2(pix2[9],  pix3[9]));
1524         s    += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1525         s    += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1526         s    += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1527         s    += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1528         s    += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1529         s    += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1530         pix1 += line_size;
1531         pix2 += line_size;
1532         pix3 += line_size;
1533     }
1534     return s;
1535 }
1536
1537 static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1538                            int line_size, int h)
1539 {
1540     int s = 0, i;
1541     uint8_t *pix3 = pix2 + line_size;
1542
1543     for (i = 0; i < h; i++) {
1544         s    += abs(pix1[0]  - avg4(pix2[0],  pix2[1],  pix3[0],  pix3[1]));
1545         s    += abs(pix1[1]  - avg4(pix2[1],  pix2[2],  pix3[1],  pix3[2]));
1546         s    += abs(pix1[2]  - avg4(pix2[2],  pix2[3],  pix3[2],  pix3[3]));
1547         s    += abs(pix1[3]  - avg4(pix2[3],  pix2[4],  pix3[3],  pix3[4]));
1548         s    += abs(pix1[4]  - avg4(pix2[4],  pix2[5],  pix3[4],  pix3[5]));
1549         s    += abs(pix1[5]  - avg4(pix2[5],  pix2[6],  pix3[5],  pix3[6]));
1550         s    += abs(pix1[6]  - avg4(pix2[6],  pix2[7],  pix3[6],  pix3[7]));
1551         s    += abs(pix1[7]  - avg4(pix2[7],  pix2[8],  pix3[7],  pix3[8]));
1552         s    += abs(pix1[8]  - avg4(pix2[8],  pix2[9],  pix3[8],  pix3[9]));
1553         s    += abs(pix1[9]  - avg4(pix2[9],  pix2[10], pix3[9],  pix3[10]));
1554         s    += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1555         s    += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1556         s    += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1557         s    += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1558         s    += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1559         s    += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1560         pix1 += line_size;
1561         pix2 += line_size;
1562         pix3 += line_size;
1563     }
1564     return s;
1565 }
1566
1567 static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1568                              int line_size, int h)
1569 {
1570     int s = 0, i;
1571
1572     for (i = 0; i < h; i++) {
1573         s    += abs(pix1[0] - pix2[0]);
1574         s    += abs(pix1[1] - pix2[1]);
1575         s    += abs(pix1[2] - pix2[2]);
1576         s    += abs(pix1[3] - pix2[3]);
1577         s    += abs(pix1[4] - pix2[4]);
1578         s    += abs(pix1[5] - pix2[5]);
1579         s    += abs(pix1[6] - pix2[6]);
1580         s    += abs(pix1[7] - pix2[7]);
1581         pix1 += line_size;
1582         pix2 += line_size;
1583     }
1584     return s;
1585 }
1586
1587 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1588                          int line_size, int h)
1589 {
1590     int s = 0, i;
1591
1592     for (i = 0; i < h; i++) {
1593         s    += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1594         s    += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1595         s    += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1596         s    += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1597         s    += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1598         s    += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1599         s    += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1600         s    += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1601         pix1 += line_size;
1602         pix2 += line_size;
1603     }
1604     return s;
1605 }
1606
1607 static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1608                          int line_size, int h)
1609 {
1610     int s = 0, i;
1611     uint8_t *pix3 = pix2 + line_size;
1612
1613     for (i = 0; i < h; i++) {
1614         s    += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1615         s    += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1616         s    += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1617         s    += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1618         s    += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1619         s    += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1620         s    += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1621         s    += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1622         pix1 += line_size;
1623         pix2 += line_size;
1624         pix3 += line_size;
1625     }
1626     return s;
1627 }
1628
1629 static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1630                           int line_size, int h)
1631 {
1632     int s = 0, i;
1633     uint8_t *pix3 = pix2 + line_size;
1634
1635     for (i = 0; i < h; i++) {
1636         s    += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1637         s    += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1638         s    += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1639         s    += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1640         s    += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1641         s    += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1642         s    += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1643         s    += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1644         pix1 += line_size;
1645         pix2 += line_size;
1646         pix3 += line_size;
1647     }
1648     return s;
1649 }
1650
1651 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1652 {
1653     int score1 = 0, score2 = 0, x, y;
1654
1655     for (y = 0; y < h; y++) {
1656         for (x = 0; x < 16; x++)
1657             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1658         if (y + 1 < h) {
1659             for (x = 0; x < 15; x++)
1660                 score2 += FFABS(s1[x]     - s1[x + stride] -
1661                                 s1[x + 1] + s1[x + stride + 1]) -
1662                           FFABS(s2[x]     - s2[x + stride] -
1663                                 s2[x + 1] + s2[x + stride + 1]);
1664         }
1665         s1 += stride;
1666         s2 += stride;
1667     }
1668
1669     if (c)
1670         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1671     else
1672         return score1 + FFABS(score2) * 8;
1673 }
1674
1675 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1676 {
1677     int score1 = 0, score2 = 0, x, y;
1678
1679     for (y = 0; y < h; y++) {
1680         for (x = 0; x < 8; x++)
1681             score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1682         if (y + 1 < h) {
1683             for (x = 0; x < 7; x++)
1684                 score2 += FFABS(s1[x]     - s1[x + stride] -
1685                                 s1[x + 1] + s1[x + stride + 1]) -
1686                           FFABS(s2[x]     - s2[x + stride] -
1687                                 s2[x + 1] + s2[x + stride + 1]);
1688         }
1689         s1 += stride;
1690         s2 += stride;
1691     }
1692
1693     if (c)
1694         return score1 + FFABS(score2) * c->avctx->nsse_weight;
1695     else
1696         return score1 + FFABS(score2) * 8;
1697 }
1698
1699 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
1700                           int16_t basis[64], int scale)
1701 {
1702     int i;
1703     unsigned int sum = 0;
1704
1705     for (i = 0; i < 8 * 8; i++) {
1706         int b = rem[i] + ((basis[i] * scale +
1707                            (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1708                           (BASIS_SHIFT - RECON_SHIFT));
1709         int w = weight[i];
1710         b >>= RECON_SHIFT;
1711         av_assert2(-512 < b && b < 512);
1712
1713         sum += (w * b) * (w * b) >> 4;
1714     }
1715     return sum >> 2;
1716 }
1717
1718 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
1719 {
1720     int i;
1721
1722     for (i = 0; i < 8 * 8; i++)
1723         rem[i] += (basis[i] * scale +
1724                    (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1725                   (BASIS_SHIFT - RECON_SHIFT);
1726 }
1727
1728 static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
1729                     int stride, int h)
1730 {
1731     return 0;
1732 }
1733
1734 void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
1735 {
1736     int i;
1737
1738     memset(cmp, 0, sizeof(void *) * 6);
1739
1740     for (i = 0; i < 6; i++) {
1741         switch (type & 0xFF) {
1742         case FF_CMP_SAD:
1743             cmp[i] = c->sad[i];
1744             break;
1745         case FF_CMP_SATD:
1746             cmp[i] = c->hadamard8_diff[i];
1747             break;
1748         case FF_CMP_SSE:
1749             cmp[i] = c->sse[i];
1750             break;
1751         case FF_CMP_DCT:
1752             cmp[i] = c->dct_sad[i];
1753             break;
1754         case FF_CMP_DCT264:
1755             cmp[i] = c->dct264_sad[i];
1756             break;
1757         case FF_CMP_DCTMAX:
1758             cmp[i] = c->dct_max[i];
1759             break;
1760         case FF_CMP_PSNR:
1761             cmp[i] = c->quant_psnr[i];
1762             break;
1763         case FF_CMP_BIT:
1764             cmp[i] = c->bit[i];
1765             break;
1766         case FF_CMP_RD:
1767             cmp[i] = c->rd[i];
1768             break;
1769         case FF_CMP_VSAD:
1770             cmp[i] = c->vsad[i];
1771             break;
1772         case FF_CMP_VSSE:
1773             cmp[i] = c->vsse[i];
1774             break;
1775         case FF_CMP_ZERO:
1776             cmp[i] = zero_cmp;
1777             break;
1778         case FF_CMP_NSSE:
1779             cmp[i] = c->nsse[i];
1780             break;
1781 #if CONFIG_DWT
1782         case FF_CMP_W53:
1783             cmp[i]= c->w53[i];
1784             break;
1785         case FF_CMP_W97:
1786             cmp[i]= c->w97[i];
1787             break;
1788 #endif
1789         default:
1790             av_log(NULL, AV_LOG_ERROR,
1791                    "internal error in cmp function selection\n");
1792         }
1793     }
1794 }
1795
1796 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
1797 {
1798     long i;
1799
1800     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1801         long a = *(long *) (src + i);
1802         long b = *(long *) (dst + i);
1803         *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
1804     }
1805     for (; i < w; i++)
1806         dst[i + 0] += src[i + 0];
1807 }
1808
1809 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w)
1810 {
1811     long i;
1812
1813 #if !HAVE_FAST_UNALIGNED
1814     if ((long) src2 & (sizeof(long) - 1)) {
1815         for (i = 0; i + 7 < w; i += 8) {
1816             dst[i + 0] = src1[i + 0] - src2[i + 0];
1817             dst[i + 1] = src1[i + 1] - src2[i + 1];
1818             dst[i + 2] = src1[i + 2] - src2[i + 2];
1819             dst[i + 3] = src1[i + 3] - src2[i + 3];
1820             dst[i + 4] = src1[i + 4] - src2[i + 4];
1821             dst[i + 5] = src1[i + 5] - src2[i + 5];
1822             dst[i + 6] = src1[i + 6] - src2[i + 6];
1823             dst[i + 7] = src1[i + 7] - src2[i + 7];
1824         }
1825     } else
1826 #endif
1827     for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1828         long a = *(long *) (src1 + i);
1829         long b = *(long *) (src2 + i);
1830         *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
1831                               ((a ^ b ^ pb_80) & pb_80);
1832     }
1833     for (; i < w; i++)
1834         dst[i + 0] = src1[i + 0] - src2[i + 0];
1835 }
1836
/**
 * Reconstruct a row from residuals using median prediction.
 *
 * For every position the predictor is mid_pred() of the left value, the
 * value from src1 at the same position, and (left + src1 - left_top) & 0xFF.
 * The residual from diff is added and the result, truncated to 8 bits, is
 * stored in dst and becomes the new left value.
 *
 * @param dst      reconstructed output row
 * @param src1     reference row read at the same index as the output
 * @param diff     residuals
 * @param w        number of samples
 * @param left     in: initial left value; out: last reconstructed value
 * @param left_top in: initial left-top value; out: last src1 value used
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *diff, int w,
                                         int *left, int *left_top)
{
    uint8_t cur     = *left;
    uint8_t up_left = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const int up       = src1[n];
        const int gradient = (cur + up - up_left) & 0xFF;

        cur     = mid_pred(cur, up, gradient) + diff[n];
        up_left = up;
        dst[n]  = cur;
    }

    *left     = cur;
    *left_top = up_left;
}
1856
/**
 * Compute median-prediction residuals for a row.
 *
 * For every position the predictor is mid_pred() of the left value, the
 * value from src1 at the same position, and (left + src1 - left_top) & 0xFF.
 * dst receives src2 minus that predictor (truncated to 8 bits); the src2
 * sample becomes the new left value.
 *
 * @param dst      residual output
 * @param src1     reference row read at the same index as the input
 * @param src2     row being predicted
 * @param w        number of samples
 * @param left     in: initial left value; out: last src2 value used
 * @param left_top in: initial left-top value; out: last src1 value used
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *src2, int w,
                                         int *left, int *left_top)
{
    uint8_t cur     = *left;
    uint8_t up_left = *left_top;
    int n;

    for (n = 0; n < w; n++) {
        const int up       = src1[n];
        const int gradient = (cur + up - up_left) & 0xFF;
        const int pred     = mid_pred(cur, up, gradient);

        up_left = up;
        cur     = src2[n];
        dst[n]  = cur - pred;
    }

    *left     = cur;
    *left_top = up_left;
}
1877
/**
 * Running-sum reconstruction: each output byte is the low 8 bits of the
 * accumulator after adding the corresponding input byte.
 *
 * @param dst output bytes
 * @param src input residuals
 * @param w   number of bytes
 * @param acc initial accumulator value
 * @return the accumulator after the last byte (not truncated to 8 bits)
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
                                      int w, int acc)
{
    const uint8_t *in = src;
    uint8_t *out      = dst;
    int remaining;

    for (remaining = w; remaining > 0; remaining--) {
        acc   += *in++;
        *out++ = acc;
    }

    return acc;
}
1898
/* Byte offsets of the four channels inside one 32-bit pixel. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * Per-channel running-sum reconstruction of 4-byte pixels.
 *
 * Each of the four channel accumulators is increased by the matching input
 * byte, and its low 8 bits are written to the matching output byte.
 *
 * @param dst   output pixels, 4 bytes each
 * @param src   input residuals, 4 bytes per pixel
 * @param w     number of pixels
 * @param red   in/out accumulator for the R byte
 * @param green in/out accumulator for the G byte
 * @param blue  in/out accumulator for the B byte
 * @param alpha in/out accumulator for the A byte
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
                                             int w, int *red, int *green,
                                             int *blue, int *alpha)
{
    int acc_r = *red;
    int acc_g = *green;
    int acc_b = *blue;
    int acc_a = *alpha;
    int px;

    for (px = 0; px < w; px++) {
        const uint8_t *in = src + 4 * px;
        uint8_t *out      = dst + 4 * px;

        acc_b += in[B];
        acc_g += in[G];
        acc_r += in[R];
        acc_a += in[A];

        out[B] = acc_b;
        out[G] = acc_g;
        out[R] = acc_r;
        out[A] = acc_a;
    }

    *red   = acc_r;
    *green = acc_g;
    *blue  = acc_b;
    *alpha = acc_a;
}
#undef B
#undef G
#undef R
#undef A
1937
/*
 * Butterfly helpers.
 *
 * BUTTERFLY2 and BUTTERFLY1 are statement macros: they expand to a single
 * do { } while (0) statement, so they must be followed by a semicolon and
 * are safe in brace-less if/else bodies.
 */

/* o1 = i1 + i2, o2 = i1 - i2. Each input expression is evaluated twice. */
#define BUTTERFLY2(o1, o2, i1, i2)              \
    do {                                        \
        (o1) = (i1) + (i2);                     \
        (o2) = (i1) - (i2);                     \
    } while (0)

/* In-place butterfly: x becomes x + y and y becomes x - y (old values).
 * The temporaries carry a distinctive name so that arguments called
 * "a" or "b" are not captured by the macro's own locals. */
#define BUTTERFLY1(x, y)                        \
    do {                                        \
        const int butterfly1_a_ = (x);          \
        const int butterfly1_b_ = (y);          \
        (x) = butterfly1_a_ + butterfly1_b_;    \
        (y) = butterfly1_a_ - butterfly1_b_;    \
    } while (0)

/* |x + y| + |x - y| as an expression; both arguments are evaluated
 * several times. */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
1952
1953 static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
1954                                uint8_t *src, int stride, int h)
1955 {
1956     int i, temp[64], sum = 0;
1957
1958     av_assert2(h == 8);
1959
1960     for (i = 0; i < 8; i++) {
1961         // FIXME: try pointer walks
1962         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1963                    src[stride * i + 0] - dst[stride * i + 0],
1964                    src[stride * i + 1] - dst[stride * i + 1]);
1965         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1966                    src[stride * i + 2] - dst[stride * i + 2],
1967                    src[stride * i + 3] - dst[stride * i + 3]);
1968         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1969                    src[stride * i + 4] - dst[stride * i + 4],
1970                    src[stride * i + 5] - dst[stride * i + 5]);
1971         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1972                    src[stride * i + 6] - dst[stride * i + 6],
1973                    src[stride * i + 7] - dst[stride * i + 7]);
1974
1975         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1976         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1977         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1978         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1979
1980         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1981         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1982         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1983         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
1984     }
1985
1986     for (i = 0; i < 8; i++) {
1987         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1988         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1989         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1990         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1991
1992         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1993         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1994         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1995         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1996
1997         sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
1998                BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
1999                BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
2000                BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
2001     }
2002     return sum;
2003 }
2004
2005 static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
2006                                 uint8_t *dummy, int stride, int h)
2007 {
2008     int i, temp[64], sum = 0;
2009
2010     av_assert2(h == 8);
2011
2012     for (i = 0; i < 8; i++) {
2013         // FIXME: try pointer walks
2014         BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
2015                    src[stride * i + 0], src[stride * i + 1]);
2016         BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
2017                    src[stride * i + 2], src[stride * i + 3]);
2018         BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
2019                    src[stride * i + 4], src[stride * i + 5]);
2020         BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
2021                    src[stride * i + 6], src[stride * i + 7]);
2022
2023         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
2024         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
2025         BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
2026         BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
2027
2028         BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
2029         BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
2030         BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
2031         BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
2032     }
2033
2034     for (i = 0; i < 8; i++) {
2035         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
2036         BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
2037         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
2038         BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
2039
2040         BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
2041         BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
2042         BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
2043         BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
2044
2045         sum +=
2046             BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
2047             + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
2048             + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
2049             + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
2050     }
2051
2052     sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
2053
2054     return sum;
2055 }
2056
2057 static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
2058                         uint8_t *src2, int stride, int h)
2059 {
2060     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2061
2062     av_assert2(h == 8);
2063
2064     s->dsp.diff_pixels(temp, src1, src2, stride);
2065     s->dsp.fdct(temp);
2066     return s->dsp.sum_abs_dctelem(temp);
2067 }
2068
#if CONFIG_GPL
/*
 * One 8-point integer transform pass. The caller defines SRC(x) to read
 * input x and DST(x, v) to consume output x; the whole pass is one brace
 * block, so it can be used as a loop body.
 */
#define DCT8_1D                                         \
    {                                                   \
        const int s07 = SRC(0) + SRC(7);                \
        const int s16 = SRC(1) + SRC(6);                \
        const int s25 = SRC(2) + SRC(5);                \
        const int s34 = SRC(3) + SRC(4);                \
        const int a0  = s07 + s34;                      \
        const int a1  = s16 + s25;                      \
        const int a2  = s07 - s34;                      \
        const int a3  = s16 - s25;                      \
        const int d07 = SRC(0) - SRC(7);                \
        const int d16 = SRC(1) - SRC(6);                \
        const int d25 = SRC(2) - SRC(5);                \
        const int d34 = SRC(3) - SRC(4);                \
        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
        DST(0, a0 + a1);                                \
        DST(1, a4 + (a7 >> 2));                         \
        DST(2, a2 + (a3 >> 1));                         \
        DST(3, a5 + (a6 >> 2));                         \
        DST(4, a0 - a1);                                \
        DST(5, a6 - (a5 >> 2));                         \
        DST(6, (a2 >> 1) - a3);                         \
        DST(7, (a4 >> 2) - a7);                         \
    }

/**
 * Score an 8x8 block pair: take the pixel difference, apply DCT8_1D to every
 * row in place, then apply it to every column while summing the absolute
 * values of the outputs.
 *
 * @param s      context supplying dsp.diff_pixels
 * @param src1   first block
 * @param src2   second block
 * @param stride distance in bytes between rows
 * @param h      not used by this function
 * @return sum of absolute second-pass outputs
 */
static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    int16_t dct[8][8];
    int sad = 0;
    int n;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* First pass: transform each row, storing results back in place. */
#define SRC(x) dct[n][x]
#define DST(x, v) dct[n][x] = v
    for (n = 0; n < 8; n++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    /* Second pass: transform each column; outputs are only accumulated. */
#define SRC(x) dct[x][n]
#define DST(x, v) sad += FFABS(v)
    for (n = 0; n < 8; n++) {
        DCT8_1D
    }
#undef SRC
#undef DST

    return sad;
}
#endif
2122
2123 static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
2124                         uint8_t *src2, int stride, int h)
2125 {
2126     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2127     int sum = 0, i;
2128
2129     av_assert2(h == 8);
2130
2131     s->dsp.diff_pixels(temp, src1, src2, stride);
2132     s->dsp.fdct(temp);
2133
2134     for (i = 0; i < 64; i++)
2135         sum = FFMAX(sum, FFABS(temp[i]));
2136
2137     return sum;
2138 }
2139
2140 static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
2141                            uint8_t *src2, int stride, int h)
2142 {
2143     LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
2144     int16_t *const bak = temp + 64;
2145     int sum = 0, i;
2146
2147     av_assert2(h == 8);
2148     s->mb_intra = 0;
2149
2150     s->dsp.diff_pixels(temp, src1, src2, stride);
2151
2152     memcpy(bak, temp, 64 * sizeof(int16_t));
2153
2154     s->block_last_index[0 /* FIXME */] =
2155         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2156     s->dct_unquantize_inter(s, temp, 0, s->qscale);
2157     ff_simple_idct_8(temp); // FIXME
2158
2159     for (i = 0; i < 64; i++)
2160         sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
2161
2162     return sum;
2163 }
2164
2165 static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2166                    int stride, int h)
2167 {
2168     const uint8_t *scantable = s->intra_scantable.permutated;
2169     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2170     LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2171     LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2172     int i, last, run, bits, level, distortion, start_i;
2173     const int esc_length = s->ac_esc_length;
2174     uint8_t *length, *last_length;
2175
2176     av_assert2(h == 8);
2177
2178     copy_block8(lsrc1, src1, 8, stride, 8);
2179     copy_block8(lsrc2, src2, 8, stride, 8);
2180
2181     s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2182
2183     s->block_last_index[0 /* FIXME */] =
2184     last                               =
2185         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2186
2187     bits = 0;
2188
2189     if (s->mb_intra) {
2190         start_i     = 1;
2191         length      = s->intra_ac_vlc_length;
2192         last_length = s->intra_ac_vlc_last_length;
2193         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2194     } else {
2195         start_i     = 0;
2196         length      = s->inter_ac_vlc_length;
2197         last_length = s->inter_ac_vlc_last_length;
2198     }
2199
2200     if (last >= start_i) {
2201         run = 0;
2202         for (i = start_i; i < last; i++) {
2203             int j = scantable[i];
2204             level = temp[j];
2205
2206             if (level) {
2207                 level += 64;
2208                 if ((level & (~127)) == 0)
2209                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2210                 else
2211                     bits += esc_length;
2212                 run = 0;
2213             } else
2214                 run++;
2215         }
2216         i = scantable[last];
2217
2218         level = temp[i] + 64;
2219
2220         av_assert2(level - 64);
2221
2222         if ((level & (~127)) == 0) {
2223             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2224         } else
2225             bits += esc_length;
2226     }
2227
2228     if (last >= 0) {
2229         if (s->mb_intra)
2230             s->dct_unquantize_intra(s, temp, 0, s->qscale);
2231         else
2232             s->dct_unquantize_inter(s, temp, 0, s->qscale);
2233     }
2234
2235     s->dsp.idct_add(lsrc2, 8, temp);
2236
2237     distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2238
2239     return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
2240 }
2241
2242 static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2243                     int stride, int h)
2244 {
2245     const uint8_t *scantable = s->intra_scantable.permutated;
2246     LOCAL_ALIGNED_16(int16_t, temp, [64]);
2247     int i, last, run, bits, level, start_i;
2248     const int esc_length = s->ac_esc_length;
2249     uint8_t *length, *last_length;
2250
2251     av_assert2(h == 8);
2252
2253     s->dsp.diff_pixels(temp, src1, src2, stride);
2254
2255     s->block_last_index[0 /* FIXME */] =
2256     last                               =
2257         s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2258
2259     bits = 0;
2260
2261     if (s->mb_intra) {
2262         start_i     = 1;
2263         length      = s->intra_ac_vlc_length;
2264         last_length = s->intra_ac_vlc_last_length;
2265         bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2266     } else {
2267         start_i     = 0;
2268         length      = s->inter_ac_vlc_length;
2269         last_length = s->inter_ac_vlc_last_length;
2270     }
2271
2272     if (last >= start_i) {
2273         run = 0;
2274         for (i = start_i; i < last; i++) {
2275             int j = scantable[i];
2276             level = temp[j];
2277
2278             if (level) {
2279                 level += 64;
2280                 if ((level & (~127)) == 0)
2281                     bits += length[UNI_AC_ENC_INDEX(run, level)];
2282                 else
2283                     bits += esc_length;
2284                 run = 0;
2285             } else
2286                 run++;
2287         }
2288         i = scantable[last];
2289
2290         level = temp[i] + 64;
2291
2292         av_assert2(level - 64);
2293
2294         if ((level & (~127)) == 0)
2295             bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2296         else
2297             bits += esc_length;
2298     }
2299
2300     return bits;
2301 }
2302
2303 #define VSAD_INTRA(size)                                                \
2304 static int vsad_intra ## size ## _c(MpegEncContext *c,                  \
2305                                     uint8_t *s, uint8_t *dummy,         \
2306                                     int stride, int h)                  \
2307 {                                                                       \
2308     int score = 0, x, y;                                                \
2309                                                                         \
2310     for (y = 1; y < h; y++) {                                           \
2311         for (x = 0; x < size; x += 4) {                                 \
2312             score += FFABS(s[x]     - s[x + stride])     +              \
2313                      FFABS(s[x + 1] - s[x + stride + 1]) +              \
2314                      FFABS(s[x + 2] - s[x + 2 + stride]) +              \
2315                      FFABS(s[x + 3] - s[x + 3 + stride]);               \
2316         }                                                               \
2317         s += stride;                                                    \
2318     }                                                                   \
2319                                                                         \
2320     return score;                                                       \
2321 }
2322 VSAD_INTRA(8)
2323 VSAD_INTRA(16)
2324
2325 static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2326                     int stride, int h)
2327 {
2328     int score = 0, x, y;
2329
2330     for (y = 1; y < h; y++) {
2331         for (x = 0; x < 16; x++)
2332             score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2333         s1 += stride;
2334         s2 += stride;
2335     }
2336
2337     return score;
2338 }
2339
2340 #define SQ(a) ((a) * (a))
2341 #define VSSE_INTRA(size)                                                \
2342 static int vsse_intra ## size ## _c(MpegEncContext *c,                  \
2343                                     uint8_t *s, uint8_t *dummy,         \
2344                                     int stride, int h)                  \
2345 {                                                                       \
2346     int score = 0, x, y;                                                \
2347                                                                         \
2348     for (y = 1; y < h; y++) {                                           \
2349         for (x = 0; x < size; x += 4) {                                 \
2350             score += SQ(s[x]     - s[x + stride]) +                     \
2351                      SQ(s[x + 1] - s[x + stride + 1]) +                 \
2352                      SQ(s[x + 2] - s[x + stride + 2]) +                 \
2353                      SQ(s[x + 3] - s[x + stride + 3]);                  \
2354         }                                                               \
2355         s += stride;                                                    \
2356     }                                                                   \
2357                                                                         \
2358     return score;                                                       \
2359 }
2360 VSSE_INTRA(8)
2361 VSSE_INTRA(16)
2362
2363 static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2364                     int stride, int h)
2365 {
2366     int score = 0, x, y;
2367
2368     for (y = 1; y < h; y++) {
2369         for (x = 0; x < 16; x++)
2370             score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2371         s1 += stride;
2372         s2 += stride;
2373     }
2374
2375     return score;
2376 }
2377
/**
 * Sum of squared differences between an int8_t array and an int16_t array.
 *
 * @param pix1 first array
 * @param pix2 second array
 * @param size number of elements compared
 * @return accumulated squared differences (0 when size <= 0)
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int total = 0;
    int k;

    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];

        total += d * d;
    }

    return total;
}
2387
2388 #define WRAPPER8_16_SQ(name8, name16)                                   \
2389 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,        \
2390                   int stride, int h)                                    \
2391 {                                                                       \
2392     int score = 0;                                                      \
2393                                                                         \
2394     score += name8(s, dst, src, stride, 8);                             \
2395     score += name8(s, dst + 8, src + 8, stride, 8);                     \
2396     if (h == 16) {                                                      \
2397         dst   += 8 * stride;                                            \
2398         src   += 8 * stride;                                            \
2399         score += name8(s, dst, src, stride, 8);                         \
2400         score += name8(s, dst + 8, src + 8, stride, 8);                 \
2401     }                                                                   \
2402     return score;                                                       \
2403 }
2404
2405 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2406 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2407 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2408 #if CONFIG_GPL
2409 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2410 #endif
2411 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2412 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2413 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2414 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2415
/**
 * Clip one value using unsigned integer comparisons only.
 *
 * @param a        value to clip
 * @param mini     returned when a > mini (unsigned compare)
 * @param maxi     returned when (a ^ 0x80000000) > maxisign (unsigned compare)
 * @param maxisign threshold for the sign-flipped comparison
 * @return mini, maxi or a, tested in that order
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    const uint32_t flipped = a ^ (1U << 31);

    if (a > mini)
        return mini;

    return flipped > maxisign ? maxi : a;
}
2426
/**
 * Clip a float vector by operating on the raw 32-bit patterns of the values
 * with clipf_c_one(). The caller in this file uses it when min < 0 < max.
 *
 * Elements are processed in groups of 8, so every started group is fully
 * read and written.
 *
 * @param dst output vector
 * @param src input vector
 * @param min pointer to the lower bound
 * @param max pointer to the upper bound
 * @param len number of elements; groups of 8 are processed while the group
 *            start index is below len
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src,
                                         float *min, float *max, int len)
{
    const uint32_t lo_bits   = *(uint32_t *) min;
    const uint32_t hi_bits   = *(uint32_t *) max;
    const uint32_t hi_flip   = hi_bits ^ (1U << 31);
    uint32_t *out            = (uint32_t *) dst;
    const uint32_t *in       = (const uint32_t *) src;
    int base, k;

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            out[base + k] = clipf_c_one(in[base + k], lo_bits, hi_bits, hi_flip);
    }
}
2448
/**
 * Clip every element of a float vector to [min, max].
 *
 * When the bounds have opposite signs (min < 0 < max) the work is delegated
 * to vector_clipf_c_opposite_sign(); otherwise av_clipf() is applied per
 * element. Elements are processed in groups of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements; groups of 8 are processed while the group
 *            start index is below len
 */
static void vector_clipf_c(float *dst, const float *src,
                           float min, float max, int len)
{
    int base, k;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (base = 0; base < len; base += 8) {
        for (k = 0; k < 8; k++)
            dst[base + k] = av_clipf(src[base + k], min, max);
    }
}
2469
/**
 * Dot product of two int16_t vectors, accumulated in an int.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @return sum of v1[i] * v2[i]
 */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
{
    int acc = 0;

    for (; order; order--) {
        acc += *v1 * *v2;
        v1++;
        v2++;
    }

    return acc;
}
2480
/**
 * Compute the dot product of v1 and v2 while updating v1 in place with
 * v1[i] += mul * v3[i]. Each product uses the value of v1[i] from before
 * its update.
 *
 * @param v1    first vector, modified in place
 * @param v2    second vector
 * @param v3    vector that is scaled and added to v1
 * @param order number of elements
 * @param mul   scale factor applied to v3
 * @return sum of (old v1[i]) * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    int acc = 0;

    for (; order; order--) {
        acc += *v1 * *v2;
        *v1 += mul * *v3;
        v1++;
        v2++;
        v3++;
    }

    return acc;
}
2493
/**
 * Clip every element of an int32_t vector to [min, max] with av_clip().
 *
 * Elements are processed in groups of 8 and the first group is always
 * processed; because len is unsigned, the loop only terminates when len
 * reaches exactly 0, so callers must pass a non-zero multiple of 8.
 *
 * @param dst output vector
 * @param src input vector
 * @param min lower bound
 * @param max upper bound
 * @param len number of elements (non-zero multiple of 8)
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++)
            dst[k] = av_clip(src[k], min, max);

        dst += 8;
        src += 8;
        len -= 8;
    } while (len != 0);
}
2509
/**
 * Run ff_j_rev_dct() on the block in place, then store the result to dest
 * through put_pixels_clamped_c().
 *
 * @param dest      destination pixels
 * @param line_size passed through to put_pixels_clamped_c()
 * @param block     coefficient block, modified in place
 */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    int16_t *const coeffs = block;

    ff_j_rev_dct(coeffs);
    put_pixels_clamped_c(coeffs, dest, line_size);
}
2515
/**
 * Run ff_j_rev_dct() on the block in place, then add the result to dest
 * through add_pixels_clamped_c().
 *
 * @param dest      destination pixels
 * @param line_size passed through to add_pixels_clamped_c()
 * @param block     coefficient block, modified in place
 */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    int16_t *const coeffs = block;

    ff_j_rev_dct(coeffs);
    add_pixels_clamped_c(coeffs, dest, line_size);
}
2521
/**
 * Run ff_j_rev_dct4() on the block in place, then store the result to dest
 * through put_pixels_clamped4_c().
 *
 * @param dest      destination pixels
 * @param line_size passed through to put_pixels_clamped4_c()
 * @param block     coefficient block, modified in place
 */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
{
    int16_t *const coeffs = block;

    ff_j_rev_dct4(coeffs);
    put_pixels_clamped4_c(coeffs, dest, line_size);
}
/**
 * Run ff_j_rev_dct4() on the block in place, then add the result to dest
 * through add_pixels_clamped4_c().
 *
 * @param dest      destination pixels
 * @param line_size passed through to add_pixels_clamped4_c()
 * @param block     coefficient block, modified in place
 */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
{
    int16_t *const coeffs = block;

    ff_j_rev_dct4(coeffs);
    add_pixels_clamped4_c(coeffs, dest, line_size);
}
2532
/**
 * Run ff_j_rev_dct2() on the block in place, then store the result to dest
 * through put_pixels_clamped2_c().
 *
 * @param dest      destination pixels
 * @param line_size passed through to put_pixels_clamped2_c()
 * @param block     coefficient block, modified in place
 */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
{
    int16_t *const coeffs = block;

    ff_j_rev_dct2(coeffs);
    put_pixels_clamped2_c(coeffs, dest, line_size);
}
/**
 * Run ff_j_rev_dct2() on the block in place, then add the result to dest
 * through add_pixels_clamped2_c().
 *
 * @param dest      destination pixels
 * @param line_size passed through to add_pixels_clamped2_c()
 * @param block     coefficient block, modified in place
 */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
{
    int16_t *const coeffs = block;

    ff_j_rev_dct2(coeffs);
    add_pixels_clamped2_c(coeffs, dest, line_size);
}
2543
/**
 * Single-coefficient case: write (block[0] + 4) >> 3, clipped to 8 bits,
 * to the first destination pixel.
 *
 * @param dest      destination pixel
 * @param line_size unused
 * @param block     coefficient block; only block[0] is read
 */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
{
    const int dc = (block[0] + 4) >> 3;

    *dest = av_clip_uint8(dc);
}
/**
 * Single-coefficient case: add (block[0] + 4) >> 3 to the first destination
 * pixel and clip the sum to 8 bits.
 *
 * @param dest      destination pixel, updated in place
 * @param line_size unused
 * @param block     coefficient block; only block[0] is read
 */
static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
{
    const int dc = (block[0] + 4) >> 3;

    *dest = av_clip_uint8(*dest + dc);
}
2552
2553 /* draw the edges of width 'w' of an image of size width, height */
2554 // FIXME: Check that this is OK for MPEG-4 interlaced.
2555 static void draw_edges_8_c(uint8_t *buf, int wrap, int width, int height,
2556                            int w, int h, int sides)
2557 {
2558     uint8_t *ptr = buf, *last_line;
2559     int i;
2560
2561     /* left and right */
2562     for (i = 0; i < height; i++) {
2563         memset(ptr - w, ptr[0], w);
2564         memset(ptr + width, ptr[width - 1], w);
2565         ptr += wrap;
2566     }
2567
2568     /* top and bottom + corners */
2569     buf -= w;
2570     last_line = buf + (height - 1) * wrap;
2571     if (sides & EDGE_TOP)
2572         for (i = 0; i < h; i++)
2573             // top
2574             memcpy(buf - (i + 1) * wrap, buf, width + w + w);
2575     if (sides & EDGE_BOTTOM)
2576         for (i = 0; i < h; i++)
2577             // bottom
2578             memcpy(last_line + (i + 1) * wrap, last_line, width + w + w);
2579 }
2580
/**
 * Zero one block of 64 int16_t coefficients.
 *
 * @param block block to clear
 */
static void clear_block_8_c(int16_t *block)
{
    const size_t bytes = 64 * sizeof(*block);

    memset(block, 0, bytes);
}
2585
/**
 * Zero six consecutive blocks of 64 int16_t coefficients each.
 *
 * @param blocks start of the 6 * 64 coefficients to clear
 */
static void clear_blocks_8_c(int16_t *blocks)
{
    const size_t bytes = 6 * 64 * sizeof(*blocks);

    memset(blocks, 0, bytes);
}
2590
2591 /* init static data */
2592 av_cold void ff_dsputil_static_init(void)
2593 {
2594     int i;
2595
2596     for (i = 0; i < 512; i++)
2597         ff_square_tab[i] = (i - 256) * (i - 256);
2598 }
2599
2600 int ff_check_alignment(void)
2601 {
2602     static int did_fail = 0;
2603     LOCAL_ALIGNED_16(int, aligned, [4]);
2604
2605     if ((intptr_t)aligned & 15) {
2606         if (!did_fail) {
2607 #if HAVE_MMX || HAVE_ALTIVEC
2608             av_log(NULL, AV_LOG_ERROR,
2609                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2610                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2611                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2612                 "Do not report crashes to FFmpeg developers.\n");
2613 #endif
2614             did_fail=1;
2615         }
2616         return -1;
2617     }
2618     return 0;
2619 }
2620
2621 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2622 {
2623     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
2624
2625     ff_check_alignment();
2626
2627 #if CONFIG_ENCODERS
2628     if (avctx->bits_per_raw_sample == 10) {
2629         c->fdct    = ff_jpeg_fdct_islow_10;
2630         c->fdct248 = ff_fdct248_islow_10;
2631     } else {
2632         if (avctx->dct_algo == FF_DCT_FASTINT) {
2633             c->fdct    = ff_fdct_ifast;
2634             c->fdct248 = ff_fdct_ifast248;
2635         } else if (avctx->dct_algo == FF_DCT_FAAN) {
2636             c->fdct    = ff_faandct;
2637             c->fdct248 = ff_faandct248;
2638         } else {
2639             c->fdct    = ff_jpeg_fdct_islow_8; // slow/accurate/default
2640             c->fdct248 = ff_fdct248_islow_8;
2641         }
2642     }
2643 #endif /* CONFIG_ENCODERS */
2644
2645     if (avctx->lowres==1) {
2646         c->idct_put              = ff_jref_idct4_put;
2647         c->idct_add              = ff_jref_idct4_add;
2648         c->idct                  = ff_j_rev_dct4;
2649         c->idct_permutation_type = FF_NO_IDCT_PERM;
2650     } else if (avctx->lowres==2) {
2651         c->idct_put              =  ff_jref_idct2_put;
2652         c->idct_add              =  ff_jref_idct2_add;
2653         c->idct                  =  ff_j_rev_dct2;
2654         c->idct_permutation_type = FF_NO_IDCT_PERM;
2655     } else if (avctx->lowres==3) {
2656         c->idct_put              =  ff_jref_idct1_put;
2657         c->idct_add              =  ff_jref_idct1_add;
2658         c->idct                  =  ff_j_rev_dct1;
2659         c->idct_permutation_type = FF_NO_IDCT_PERM;
2660     } else {
2661         if (avctx->bits_per_raw_sample == 10) {
2662             c->idct_put              = ff_simple_idct_put_10;
2663             c->idct_add              = ff_simple_idct_add_10;
2664             c->idct                  = ff_simple_idct_10;
2665             c->idct_permutation_type = FF_NO_IDCT_PERM;
2666         } else if (avctx->bits_per_raw_sample == 12) {
2667             c->idct_put              = ff_simple_idct_put_12;
2668             c->idct_add              = ff_simple_idct_add_12;
2669             c->idct                  = ff_simple_idct_12;
2670             c->idct_permutation_type = FF_NO_IDCT_PERM;
2671         } else {
2672         if (avctx->idct_algo == FF_IDCT_INT) {
2673             c->idct_put              = jref_idct_put;
2674             c->idct_add              = jref_idct_add;
2675             c->idct                  = ff_j_rev_dct;
2676             c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2677         } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2678             c->idct_put              = ff_faanidct_put;
2679             c->idct_add              = ff_faanidct_add;
2680             c->idct                  = ff_faanidct;
2681             c->idct_permutation_type = FF_NO_IDCT_PERM;
2682         } else { // accurate/default
2683             c->idct_put              = ff_simple_idct_put_8;
2684             c->idct_add              = ff_simple_idct_add_8;
2685             c->idct                  = ff_simple_idct_8;
2686             c->idct_permutation_type = FF_NO_IDCT_PERM;
2687         }
2688         }
2689     }
2690
2691     c->diff_pixels = diff_pixels_c;
2692
2693     c->put_pixels_clamped        = put_pixels_clamped_c;
2694     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2695     c->add_pixels_clamped        = add_pixels_clamped_c;
2696
2697     c->sum_abs_dctelem = sum_abs_dctelem_c;
2698
2699     c->gmc1 = gmc1_c;
2700     c->gmc  = ff_gmc_c;
2701
2702     c->pix_sum   = pix_sum_c;
2703     c->pix_norm1 = pix_norm1_c;
2704
2705     c->fill_block_tab[0] = fill_block16_c;
2706     c->fill_block_tab[1] = fill_block8_c;
2707
2708     /* TODO [0] 16  [1] 8 */
2709     c->pix_abs[0][0] = pix_abs16_c;
2710     c->pix_abs[0][1] = pix_abs16_x2_c;
2711     c->pix_abs[0][2] = pix_abs16_y2_c;
2712     c->pix_abs[0][3] = pix_abs16_xy2_c;
2713     c->pix_abs[1][0] = pix_abs8_c;
2714     c->pix_abs[1][1] = pix_abs8_x2_c;
2715     c->pix_abs[1][2] = pix_abs8_y2_c;
2716     c->pix_abs[1][3] = pix_abs8_xy2_c;
2717
2718 #define dspfunc(PFX, IDX, NUM)                              \
2719     c->PFX ## _pixels_tab[IDX][0]  = PFX ## NUM ## _mc00_c; \
2720     c->PFX ## _pixels_tab[IDX][1]  = PFX ## NUM ## _mc10_c; \
2721     c->PFX ## _pixels_tab[IDX][2]  = PFX ## NUM ## _mc20_c; \
2722     c->PFX ## _pixels_tab[IDX][3]  = PFX ## NUM ## _mc30_c; \
2723     c->PFX ## _pixels_tab[IDX][4]  = PFX ## NUM ## _mc01_c; \
2724     c->PFX ## _pixels_tab[IDX][5]  = PFX ## NUM ## _mc11_c; \
2725     c->PFX ## _pixels_tab[IDX][6]  = PFX ## NUM ## _mc21_c; \
2726     c->PFX ## _pixels_tab[IDX][7]  = PFX ## NUM ## _mc31_c; \
2727     c->PFX ## _pixels_tab[IDX][8]  = PFX ## NUM ## _mc02_c; \
2728     c->PFX ## _pixels_tab[IDX][9]  = PFX ## NUM ## _mc12_c; \
2729     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2730     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2731     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2732     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2733     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2734     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2735
2736     dspfunc(put_qpel, 0, 16);
2737     dspfunc(put_qpel, 1, 8);
2738
2739     dspfunc(put_no_rnd_qpel, 0, 16);
2740     dspfunc(put_no_rnd_qpel, 1, 8);
2741
2742     dspfunc(avg_qpel, 0, 16);
2743     dspfunc(avg_qpel, 1, 8);
2744
2745 #undef dspfunc
2746
2747     c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
2748     c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
2749     c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
2750     c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
2751     c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
2752     c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
2753     c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
2754     c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
2755
2756 #define SET_CMP_FUNC(name)                      \
2757     c->name[0] = name ## 16_c;                  \
2758     c->name[1] = name ## 8x8_c;
2759
2760     SET_CMP_FUNC(hadamard8_diff)
2761     c->hadamard8_diff[4] = hadamard8_intra16_c;
2762     c->hadamard8_diff[5] = hadamard8_intra8x8_c;
2763     SET_CMP_FUNC(dct_sad)
2764     SET_CMP_FUNC(dct_max)
2765 #if CONFIG_GPL
2766     SET_CMP_FUNC(dct264_sad)
2767 #endif
2768     c->sad[0] = pix_abs16_c;
2769     c->sad[1] = pix_abs8_c;
2770     c->sse[0] = sse16_c;
2771     c->sse[1] = sse8_c;
2772     c->sse[2] = sse4_c;
2773     SET_CMP_FUNC(quant_psnr)
2774     SET_CMP_FUNC(rd)
2775     SET_CMP_FUNC(bit)
2776     c->vsad[0] = vsad16_c;
2777     c->vsad[4] = vsad_intra16_c;
2778     c->vsad[5] = vsad_intra8_c;
2779     c->vsse[0] = vsse16_c;
2780     c->vsse[4] = vsse_intra16_c;
2781     c->vsse[5] = vsse_intra8_c;
2782     c->nsse[0] = nsse16_c;
2783     c->nsse[1] = nsse8_c;
2784 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2785     ff_dsputil_init_dwt(c);
2786 #endif
2787
2788     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2789
2790     c->add_bytes                      = add_bytes_c;
2791     c->add_hfyu_median_prediction     = add_hfyu_median_prediction_c;
2792     c->add_hfyu_left_prediction       = add_hfyu_left_prediction_c;
2793     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2794
2795     c->diff_bytes                 = diff_bytes_c;
2796     c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
2797
2798     c->bswap_buf   = bswap_buf;
2799     c->bswap16_buf = bswap16_buf;
2800
2801     c->try_8x8basis = try_8x8basis_c;
2802     c->add_8x8basis = add_8x8basis_c;
2803
2804     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2805
2806     c->scalarproduct_int16 = scalarproduct_int16_c;
2807     c->vector_clip_int32   = vector_clip_int32_c;
2808     c->vector_clipf        = vector_clipf_c;
2809
2810     c->shrink[0] = av_image_copy_plane;
2811     c->shrink[1] = ff_shrink22;
2812     c->shrink[2] = ff_shrink44;
2813     c->shrink[3] = ff_shrink88;
2814
2815     c->add_pixels8 = add_pixels8_c;
2816
2817     c->draw_edges = draw_edges_8_c;
2818
2819     c->clear_block  = clear_block_8_c;
2820     c->clear_blocks = clear_blocks_8_c;
2821
2822     switch (avctx->bits_per_raw_sample) {
2823     case 9:
2824     case 10:
2825     case 12:
2826     case 14:
2827         c->get_pixels = get_pixels_16_c;
2828         break;
2829     default:
2830         if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
2831             c->get_pixels = get_pixels_8_c;
2832         }
2833         break;
2834     }
2835
2836
2837     if (ARCH_ALPHA)
2838         ff_dsputil_init_alpha(c, avctx);
2839     if (ARCH_ARM)
2840         ff_dsputil_init_arm(c, avctx, high_bit_depth);
2841     if (ARCH_BFIN)
2842         ff_dsputil_init_bfin(c, avctx, high_bit_depth);
2843     if (ARCH_PPC)
2844         ff_dsputil_init_ppc(c, avctx, high_bit_depth);
2845     if (ARCH_X86)
2846         ff_dsputil_init_x86(c, avctx, high_bit_depth);
2847
2848     ff_init_scantable_permutation(c->idct_permutation,
2849                                   c->idct_permutation_type);
2850 }
2851
2852 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2853 {
2854     ff_dsputil_init(c, avctx);
2855 }
2856
2857 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2858 {
2859     ff_dsputil_init(c, avctx);
2860 }