3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
33 #include "copy_block.h"
36 #include "simple_idct.h"
39 #include "imgconvert.h"
41 #include "mpegvideo.h"
/* 512-entry square table, addressed via (ff_square_tab + 256) so that a
 * signed difference in [-255, 255] can be squared by lookup (see the
 * sse*_c / pix_norm1_c functions below).  Zero-initialized here; the code
 * that fills it is not visible in this chunk. */
44 uint32_t ff_square_tab[512] = { 0, };
47 #include "dsputil_template.c"
51 #include "hpel_template.c"
52 #include "tpel_template.c"
53 #include "dsputil_template.c"
55 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL / 255 yields 0x0101...01 (one set byte per byte of unsigned long);
 * the final multiply replicates the constant into every byte lane. */
56 #define pb_7f (~0UL / 255 * 0x7f)
57 #define pb_80 (~0UL / 255 * 0x80)
59 /* Specific zigzag scan for 248 idct. NOTE that unlike the
60 * specification, we interleave the fields */
61 const uint8_t ff_zigzag248_direct[64] = {
62 0, 8, 1, 9, 16, 24, 2, 10,
63 17, 25, 32, 40, 48, 56, 33, 41,
64 18, 26, 3, 11, 4, 12, 19, 27,
65 34, 42, 49, 57, 50, 58, 35, 43,
66 20, 28, 5, 13, 6, 14, 21, 29,
67 36, 44, 51, 59, 52, 60, 37, 45,
68 22, 30, 7, 15, 23, 31, 38, 46,
69 53, 61, 54, 62, 39, 47, 55, 63,
/* Horizontal-priority alternate scan order.  NOTE(review): the closing
 * "};" of each table is elided in this chunk. */
72 const uint8_t ff_alternate_horizontal_scan[64] = {
73 0, 1, 2, 3, 8, 9, 16, 17,
74 10, 11, 4, 5, 6, 7, 15, 14,
75 13, 12, 19, 18, 24, 25, 32, 33,
76 26, 27, 20, 21, 22, 23, 28, 29,
77 30, 31, 34, 35, 40, 41, 48, 49,
78 42, 43, 36, 37, 38, 39, 44, 45,
79 46, 47, 50, 51, 56, 57, 58, 59,
80 52, 53, 54, 55, 60, 61, 62, 63,
/* Vertical-priority alternate scan order. */
83 const uint8_t ff_alternate_vertical_scan[64] = {
84 0, 8, 16, 24, 1, 9, 2, 10,
85 17, 25, 32, 40, 48, 56, 57, 49,
86 41, 33, 26, 18, 3, 11, 4, 12,
87 19, 27, 34, 42, 50, 58, 35, 43,
88 51, 59, 20, 28, 5, 13, 6, 14,
89 21, 29, 36, 44, 52, 60, 37, 45,
90 53, 61, 22, 30, 7, 15, 23, 31,
91 38, 46, 54, 62, 39, 47, 55, 63,
94 /* Input permutation for the simple_idct_mmx */
95 static const uint8_t simple_mmx_permutation[64] = {
96 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
97 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
98 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
99 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
100 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
101 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
102 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
103 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Per-row coefficient order consumed by the FF_SSE2_IDCT_PERM case in
 * ff_init_scantable_permutation() below. */
106 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Build the permutated scan table (and raster_end[] helper data) for a
 * given source scan order and IDCT coefficient permutation.
 * NOTE(review): several lines are elided in this chunk — the declarations
 * of i/end, closing braces, and the code that updates 'end'. */
108 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
109 const uint8_t *src_scantable)
113 st->scantable = src_scantable;
    /* First pass: run every scan position through the IDCT permutation. */
115 for (i = 0; i < 64; i++) {
116 int j = src_scantable[i];
117 st->permutated[i] = permutation[j];
    /* Second pass: record per scan position the raster end index; the
     * computation of 'end' itself is not visible here. */
121 for (i = 0; i < 64; i++) {
122 int j = st->permutated[i];
125 st->raster_end[i] = end;
/* Fill idct_permutation[] (64 entries) according to the requested
 * permutation type.  NOTE(review): the 'break' statements between cases
 * and the loop-variable declaration are elided in this chunk; each case
 * visibly writes all 64 entries. */
129 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
130 int idct_permutation_type)
134 switch (idct_permutation_type) {
135 case FF_NO_IDCT_PERM:
    /* identity permutation */
136 for (i = 0; i < 64; i++)
137 idct_permutation[i] = i;
139 case FF_LIBMPEG2_IDCT_PERM:
    /* keep the row bits (0x38), rotate the three column bits */
140 for (i = 0; i < 64; i++)
141 idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
143 case FF_SIMPLE_IDCT_PERM:
    /* table-driven permutation for the simple MMX IDCT */
144 for (i = 0; i < 64; i++)
145 idct_permutation[i] = simple_mmx_permutation[i];
147 case FF_TRANSPOSE_IDCT_PERM:
    /* swap row and column indices */
148 for (i = 0; i < 64; i++)
149 idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
151 case FF_PARTTRANS_IDCT_PERM:
    /* partial transpose: swap the low two bits of row and column */
152 for (i = 0; i < 64; i++)
153 idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
155 case FF_SSE2_IDCT_PERM:
    /* keep the row, permute columns per idct_sse2_row_perm[] */
156 for (i = 0; i < 64; i++)
157 idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
    /* default case: unknown permutation type is an internal error */
160 av_log(NULL, AV_LOG_ERROR,
161 "Internal error, IDCT permutation not set\n");
/* Sum of all pixel values of one 16x16 block.
 * NOTE(review): the accumulation statements, declarations, and the return
 * are elided in this chunk; the visible loops walk 16 rows in 8-pixel
 * steps and then rewind the pointer to the start of the next row. */
165 static int pix_sum_c(uint8_t *pix, int line_size)
169 for (i = 0; i < 16; i++) {
170 for (j = 0; j < 16; j += 8) {
181 pix += line_size - 16;
/* Sum of squared pixel values of one 16x16 block, using the square
 * lookup table.  Two load variants are visible: one 64-bit load covering
 * 8 bytes, and a pair of 32-bit loads; the preprocessor lines selecting
 * between them, the 's += sq[x & 0xff]' lines for byte 0, declarations
 * and the return are all elided in this chunk. */
186 static int pix_norm1_c(uint8_t *pix, int line_size)
189 uint32_t *sq = ff_square_tab + 256;
191 for (i = 0; i < 16; i++) {
192 for (j = 0; j < 16; j += 8) {
    /* 64-bit path: extract and square each byte of a single 8-byte load */
204 register uint64_t x = *(uint64_t *) pix;
206 s += sq[(x >> 8) & 0xff];
207 s += sq[(x >> 16) & 0xff];
208 s += sq[(x >> 24) & 0xff];
209 s += sq[(x >> 32) & 0xff];
210 s += sq[(x >> 40) & 0xff];
211 s += sq[(x >> 48) & 0xff];
212 s += sq[(x >> 56) & 0xff];
    /* 32-bit path: two 4-byte loads per 8 pixels */
214 register uint32_t x = *(uint32_t *) pix;
216 s += sq[(x >> 8) & 0xff];
217 s += sq[(x >> 16) & 0xff];
218 s += sq[(x >> 24) & 0xff];
219 x = *(uint32_t *) (pix + 4);
221 s += sq[(x >> 8) & 0xff];
222 s += sq[(x >> 16) & 0xff];
223 s += sq[(x >> 24) & 0xff];
228 pix += line_size - 16;
/* Byte-swap 'w' 32-bit words from src into dst, unrolled 8 at a time.
 * NOTE(review): the loop header for the remaining w % 8 tail words
 * (containing the final visible statement) is elided in this chunk. */
233 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
237 for (i = 0; i + 8 <= w; i += 8) {
238 dst[i + 0] = av_bswap32(src[i + 0]);
239 dst[i + 1] = av_bswap32(src[i + 1]);
240 dst[i + 2] = av_bswap32(src[i + 2]);
241 dst[i + 3] = av_bswap32(src[i + 3]);
242 dst[i + 4] = av_bswap32(src[i + 4]);
243 dst[i + 5] = av_bswap32(src[i + 5]);
244 dst[i + 6] = av_bswap32(src[i + 6]);
245 dst[i + 7] = av_bswap32(src[i + 7]);
    /* tail: one word per iteration (loop header elided) */
248 dst[i + 0] = av_bswap32(src[i + 0]);
/* Byte-swap 'len' 16-bit words from src into dst.  NOTE(review): the
 * loop header driving this statement is elided in this chunk. */
251 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
254 *dst++ = av_bswap16(*src++);
/* Sum of squared errors between two 4-pixel-wide blocks over h rows.
 * Differences (range [-255, 255]) are squared by lookup via the biased
 * square-table pointer.  The MpegEncContext argument is unused in the
 * visible body (presumably kept for a common compare-function signature
 * — confirm against the caller).  NOTE(review): the per-row pointer
 * advance and the return are elided in this chunk. */
257 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
258 int line_size, int h)
261 uint32_t *sq = ff_square_tab + 256;
263 for (i = 0; i < h; i++) {
264 s += sq[pix1[0] - pix2[0]];
265 s += sq[pix1[1] - pix2[1]];
266 s += sq[pix1[2] - pix2[2]];
267 s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors between two 8-pixel-wide blocks over h rows
 * (8-wide variant of sse4_c above).  NOTE(review): per-row pointer
 * advance, declarations and the return are elided in this chunk. */
274 static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
275 int line_size, int h)
278 uint32_t *sq = ff_square_tab + 256;
280 for (i = 0; i < h; i++) {
281 s += sq[pix1[0] - pix2[0]];
282 s += sq[pix1[1] - pix2[1]];
283 s += sq[pix1[2] - pix2[2]];
284 s += sq[pix1[3] - pix2[3]];
285 s += sq[pix1[4] - pix2[4]];
286 s += sq[pix1[5] - pix2[5]];
287 s += sq[pix1[6] - pix2[6]];
288 s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors between two 16-pixel-wide blocks over h rows
 * (16-wide variant of sse4_c/sse8_c above).  NOTE(review): per-row
 * pointer advance, declarations and the return are elided in this chunk. */
295 static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
296 int line_size, int h)
299 uint32_t *sq = ff_square_tab + 256;
301 for (i = 0; i < h; i++) {
302 s += sq[pix1[0] - pix2[0]];
303 s += sq[pix1[1] - pix2[1]];
304 s += sq[pix1[2] - pix2[2]];
305 s += sq[pix1[3] - pix2[3]];
306 s += sq[pix1[4] - pix2[4]];
307 s += sq[pix1[5] - pix2[5]];
308 s += sq[pix1[6] - pix2[6]];
309 s += sq[pix1[7] - pix2[7]];
310 s += sq[pix1[8] - pix2[8]];
311 s += sq[pix1[9] - pix2[9]];
312 s += sq[pix1[10] - pix2[10]];
313 s += sq[pix1[11] - pix2[11]];
314 s += sq[pix1[12] - pix2[12]];
315 s += sq[pix1[13] - pix2[13]];
316 s += sq[pix1[14] - pix2[14]];
317 s += sq[pix1[15] - pix2[15]];
/* Compute block[i] = s1[i] - s2[i] for one 8x8 block of pixels, stored
 * as int16 coefficients.  NOTE(review): the per-row pointer advances
 * (presumably block += 8, s1/s2 += stride) are elided in this chunk. */
325 static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
326 const uint8_t *s2, int stride)
330 /* read the pixels */
331 for (i = 0; i < 8; i++) {
332 block[0] = s1[0] - s2[0];
333 block[1] = s1[1] - s2[1];
334 block[2] = s1[2] - s2[2];
335 block[3] = s1[3] - s2[3];
336 block[4] = s1[4] - s2[4];
337 block[5] = s1[5] - s2[5];
338 block[6] = s1[6] - s2[6];
339 block[7] = s1[7] - s2[7];
/* Clamp one 8x8 int16 block to [0, 255] and store it into pixels.
 * NOTE(review): the line_size parameter line and the per-row pointer
 * advances are elided in this chunk. */
346 static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
351 /* read the pixels */
352 for (i = 0; i < 8; i++) {
353 pixels[0] = av_clip_uint8(block[0]);
354 pixels[1] = av_clip_uint8(block[1]);
355 pixels[2] = av_clip_uint8(block[2]);
356 pixels[3] = av_clip_uint8(block[3]);
357 pixels[4] = av_clip_uint8(block[4]);
358 pixels[5] = av_clip_uint8(block[5]);
359 pixels[6] = av_clip_uint8(block[6]);
360 pixels[7] = av_clip_uint8(block[7]);
/* Store an 8x8 block of signed coefficients with a +128 bias: values are
 * clamped so the biased result fits a uint8 (the visible branch handles
 * *block > 127; the low-side clamp and the per-element increments are
 * elided in this chunk). */
367 static void put_signed_pixels_clamped_c(const int16_t *block,
368 uint8_t *restrict pixels,
373 for (i = 0; i < 8; i++) {
374 for (j = 0; j < 8; j++) {
377 else if (*block > 127)
    /* in-range value: apply the +128 bias and store */
380 *pixels = (uint8_t) (*block + 128);
384 pixels += (line_size - 8);
/* Add one 8x8 int16 block onto pixels, without clamping.
 * NOTE(review): the stride parameter line and the per-row pointer
 * advances are elided in this chunk. */
388 static void add_pixels8_c(uint8_t *restrict pixels, int16_t *block,
393 for (i = 0; i < 8; i++) {
394 pixels[0] += block[0];
395 pixels[1] += block[1];
396 pixels[2] += block[2];
397 pixels[3] += block[3];
398 pixels[4] += block[4];
399 pixels[5] += block[5];
400 pixels[6] += block[6];
401 pixels[7] += block[7];
/* Add one 8x8 int16 block onto pixels with clamping to [0, 255].
 * NOTE(review): the line_size parameter line and the per-row pointer
 * advances are elided in this chunk. */
407 static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
412 /* read the pixels */
413 for (i = 0; i < 8; i++) {
414 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
415 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
416 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
417 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
418 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
419 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
420 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
421 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
/* Sum of absolute values of all 64 coefficients of a DCT block.
 * NOTE(review): the sum declaration and the return are elided here. */
427 static int sum_abs_dctelem_c(int16_t *block)
431 for (i = 0; i < 64; i++)
432 sum += FFABS(block[i]);
/* Fill a 16-pixel-wide block (h rows) with a constant byte value.
 * NOTE(review): the per-row 'block += line_size' advance is elided. */
436 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
440 for (i = 0; i < h; i++) {
441 memset(block, value, 16);
/* Fill an 8-pixel-wide block (h rows) with a constant byte value.
 * NOTE(review): the per-row 'block += line_size' advance is elided. */
446 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
450 for (i = 0; i < h; i++) {
451 memset(block, value, 8);
/* Round-to-nearest average of two / four pixel values.
 * Every macro parameter is now fully parenthesized (CERT C PRE01-C): the
 * previous definitions expanded e.g. avg2(x ? y : z, w) as
 * ((x ? y : z + w + 1) >> 1), silently mis-grouping any argument that is
 * not a primary expression.  All existing call sites pass simple array
 * accesses, so the expansion for current callers is unchanged. */
#define avg2(a, b) (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/* 1/16-pel bilinear interpolation with a single fractional offset
 * (x16, y16), each in [0, 16): the four weights A..D sum to 256 and
 * dst[i] is the weighted average of the 2x2 source neighbourhood,
 * rounded via 'rounder' and shifted down by 8.  Writes 8 pixels per row
 * for h rows.  NOTE(review): the per-row dst/src advances are elided in
 * this chunk. */
459 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
460 int x16, int y16, int rounder)
462 const int A = (16 - x16) * (16 - y16);
463 const int B = (x16) * (16 - y16);
464 const int C = (16 - x16) * (y16);
465 const int D = (x16) * (y16);
468 for (i = 0; i < h; i++) {
469 dst[0] = (A * src[0] + B * src[1] + C * src[stride + 0] + D * src[stride + 1] + rounder) >> 8;
470 dst[1] = (A * src[1] + B * src[2] + C * src[stride + 1] + D * src[stride + 2] + rounder) >> 8;
471 dst[2] = (A * src[2] + B * src[3] + C * src[stride + 2] + D * src[stride + 3] + rounder) >> 8;
472 dst[3] = (A * src[3] + B * src[4] + C * src[stride + 3] + D * src[stride + 4] + rounder) >> 8;
473 dst[4] = (A * src[4] + B * src[5] + C * src[stride + 4] + D * src[stride + 5] + rounder) >> 8;
474 dst[5] = (A * src[5] + B * src[6] + C * src[stride + 5] + D * src[stride + 6] + rounder) >> 8;
475 dst[6] = (A * src[6] + B * src[7] + C * src[stride + 6] + D * src[stride + 7] + rounder) >> 8;
476 dst[7] = (A * src[7] + B * src[8] + C * src[stride + 7] + D * src[stride + 8] + rounder) >> 8;
/* Global motion compensation: samples an affine per-pixel vector field
 * (parameters ox/oy and the dxx/dxy/dyx/dyy gradients) at 1/(1<<shift)
 * precision over an 8-pixel-wide block of h rows.  When the source
 * coordinate lies inside the picture both dimensions are bilinearly
 * interpolated; when one dimension falls outside it is clamped and only
 * the other is interpolated; when both are outside the nearest edge
 * pixel is copied.  NOTE(review): the vx/vy initialization and per-pixel
 * update statements, the '+ r >> (2*shift)' rounding tail of each
 * interpolation, and the src_x/src_y >>= shift lines appear to be elided
 * in this chunk — confirm against the full source. */
482 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
483 int dxx, int dxy, int dyx, int dyy, int shift, int r,
484 int width, int height)
487 const int s = 1 << shift;
492 for (y = 0; y < h; y++) {
497 for (x = 0; x < 8; x++) { // FIXME: optimize
499 int src_x = vx >> 16;
500 int src_y = vy >> 16;
501 int frac_x = src_x & (s - 1);
502 int frac_y = src_y & (s - 1);
    /* The (unsigned) casts make a single compare reject both negative
     * and too-large coordinates. */
507 if ((unsigned) src_x < width) {
508 if ((unsigned) src_y < height) {
    /* fully inside: bilinear interpolation of the 2x2 neighbourhood */
509 index = src_x + src_y * stride;
510 dst[y * stride + x] =
511 ((src[index] * (s - frac_x) +
512 src[index + 1] * frac_x) * (s - frac_y) +
513 (src[index + stride] * (s - frac_x) +
514 src[index + stride + 1] * frac_x) * frac_y +
    /* y outside: clamp y, interpolate in x only */
517 index = src_x + av_clip(src_y, 0, height) * stride;
518 dst[y * stride + x] =
519 ((src[index] * (s - frac_x) +
520 src[index + 1] * frac_x) * s +
524 if ((unsigned) src_y < height) {
    /* x outside: clamp x, interpolate in y only */
525 index = av_clip(src_x, 0, width) + src_y * stride;
526 dst[y * stride + x] =
527 ((src[index] * (s - frac_y) +
528 src[index + stride] * frac_y) * s +
    /* both outside: copy the nearest edge pixel */
531 index = av_clip(src_x, 0, width) +
532 av_clip(src_y, 0, height) * stride;
533 dst[y * stride + x] = src[index];
545 #define QPEL_MC(r, OPNAME, RND, OP) \
546 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, \
547 int dstStride, int srcStride, \
550 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
553 for (i = 0; i < h; i++) { \
554 OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
555 OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
556 OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
557 OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
558 OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
559 OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
560 OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
561 OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
567 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, \
568 int dstStride, int srcStride) \
570 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
574 for (i = 0; i < w; i++) { \
575 const int src0 = src[0 * srcStride]; \
576 const int src1 = src[1 * srcStride]; \
577 const int src2 = src[2 * srcStride]; \
578 const int src3 = src[3 * srcStride]; \
579 const int src4 = src[4 * srcStride]; \
580 const int src5 = src[5 * srcStride]; \
581 const int src6 = src[6 * srcStride]; \
582 const int src7 = src[7 * srcStride]; \
583 const int src8 = src[8 * srcStride]; \
584 OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
585 OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
586 OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
587 OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
588 OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
589 OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
590 OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
591 OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
597 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, \
598 int dstStride, int srcStride, \
601 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
604 for (i = 0; i < h; i++) { \
605 OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
606 OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
607 OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
608 OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
609 OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
610 OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[9])); \
611 OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[9]) * 3 - (src[3] + src[10])); \
612 OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[9]) * 6 + (src[5] + src[10]) * 3 - (src[4] + src[11])); \
613 OP(dst[8], (src[8] + src[9]) * 20 - (src[7] + src[10]) * 6 + (src[6] + src[11]) * 3 - (src[5] + src[12])); \
614 OP(dst[9], (src[9] + src[10]) * 20 - (src[8] + src[11]) * 6 + (src[7] + src[12]) * 3 - (src[6] + src[13])); \
615 OP(dst[10], (src[10] + src[11]) * 20 - (src[9] + src[12]) * 6 + (src[8] + src[13]) * 3 - (src[7] + src[14])); \
616 OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9] + src[14]) * 3 - (src[8] + src[15])); \
617 OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9] + src[16])); \
618 OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
619 OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
620 OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
626 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, \
627 int dstStride, int srcStride) \
629 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
633 for (i = 0; i < w; i++) { \
634 const int src0 = src[0 * srcStride]; \
635 const int src1 = src[1 * srcStride]; \
636 const int src2 = src[2 * srcStride]; \
637 const int src3 = src[3 * srcStride]; \
638 const int src4 = src[4 * srcStride]; \
639 const int src5 = src[5 * srcStride]; \
640 const int src6 = src[6 * srcStride]; \
641 const int src7 = src[7 * srcStride]; \
642 const int src8 = src[8 * srcStride]; \
643 const int src9 = src[9 * srcStride]; \
644 const int src10 = src[10 * srcStride]; \
645 const int src11 = src[11 * srcStride]; \
646 const int src12 = src[12 * srcStride]; \
647 const int src13 = src[13 * srcStride]; \
648 const int src14 = src[14 * srcStride]; \
649 const int src15 = src[15 * srcStride]; \
650 const int src16 = src[16 * srcStride]; \
651 OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
652 OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
653 OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
654 OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
655 OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
656 OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src9)); \
657 OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src9) * 3 - (src3 + src10)); \
658 OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src9) * 6 + (src5 + src10) * 3 - (src4 + src11)); \
659 OP(dst[8 * dstStride], (src8 + src9) * 20 - (src7 + src10) * 6 + (src6 + src11) * 3 - (src5 + src12)); \
660 OP(dst[9 * dstStride], (src9 + src10) * 20 - (src8 + src11) * 6 + (src7 + src12) * 3 - (src6 + src13)); \
661 OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9 + src12) * 6 + (src8 + src13) * 3 - (src7 + src14)); \
662 OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9 + src14) * 3 - (src8 + src15)); \
663 OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9 + src16)); \
664 OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
665 OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
666 OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
672 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, \
677 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
678 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8); \
681 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, \
684 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8); \
687 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, \
692 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
693 OPNAME ## pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8); \
696 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, \
699 uint8_t full[16 * 9]; \
702 copy_block9(full, src, 16, stride, 9); \
703 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
704 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8); \
707 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, \
710 uint8_t full[16 * 9]; \
712 copy_block9(full, src, 16, stride, 9); \
713 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16); \
716 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, \
719 uint8_t full[16 * 9]; \
722 copy_block9(full, src, 16, stride, 9); \
723 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
724 OPNAME ## pixels8_l2_8(dst, full + 16, half, stride, 16, 8, 8); \
727 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, \
730 uint8_t full[16 * 9]; \
733 uint8_t halfHV[64]; \
735 copy_block9(full, src, 16, stride, 9); \
736 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
737 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
738 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
739 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, \
740 stride, 16, 8, 8, 8, 8); \
743 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, \
746 uint8_t full[16 * 9]; \
748 uint8_t halfHV[64]; \
750 copy_block9(full, src, 16, stride, 9); \
751 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
752 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
753 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
754 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
757 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, \
760 uint8_t full[16 * 9]; \
763 uint8_t halfHV[64]; \
765 copy_block9(full, src, 16, stride, 9); \
766 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
767 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
768 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
769 OPNAME ## pixels8_l4_8(dst, full + 1, halfH, halfV, halfHV, \
770 stride, 16, 8, 8, 8, 8); \
773 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, \
776 uint8_t full[16 * 9]; \
778 uint8_t halfHV[64]; \
780 copy_block9(full, src, 16, stride, 9); \
781 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
782 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
783 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
784 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
787 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, \
790 uint8_t full[16 * 9]; \
793 uint8_t halfHV[64]; \
795 copy_block9(full, src, 16, stride, 9); \
796 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
797 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
798 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
799 OPNAME ## pixels8_l4_8(dst, full + 16, halfH + 8, halfV, halfHV, \
800 stride, 16, 8, 8, 8, 8); \
803 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, \
806 uint8_t full[16 * 9]; \
808 uint8_t halfHV[64]; \
810 copy_block9(full, src, 16, stride, 9); \
811 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
812 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
813 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
814 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
817 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, \
820 uint8_t full[16 * 9]; \
823 uint8_t halfHV[64]; \
825 copy_block9(full, src, 16, stride, 9); \
826 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
827 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
828 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
829 OPNAME ## pixels8_l4_8(dst, full + 17, halfH + 8, halfV, halfHV, \
830 stride, 16, 8, 8, 8, 8); \
833 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, \
836 uint8_t full[16 * 9]; \
838 uint8_t halfHV[64]; \
840 copy_block9(full, src, 16, stride, 9); \
841 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
842 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
843 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
844 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
847 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, \
851 uint8_t halfHV[64]; \
853 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
854 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
855 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
858 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, \
862 uint8_t halfHV[64]; \
864 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
865 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
866 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
869 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, \
872 uint8_t full[16 * 9]; \
875 uint8_t halfHV[64]; \
877 copy_block9(full, src, 16, stride, 9); \
878 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
879 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
880 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
881 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); \
884 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, \
887 uint8_t full[16 * 9]; \
890 copy_block9(full, src, 16, stride, 9); \
891 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
892 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
893 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
896 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, \
899 uint8_t full[16 * 9]; \
902 uint8_t halfHV[64]; \
904 copy_block9(full, src, 16, stride, 9); \
905 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
906 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
907 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
908 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); \
911 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, \
914 uint8_t full[16 * 9]; \
917 copy_block9(full, src, 16, stride, 9); \
918 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
919 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
920 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
923 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, \
928 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
929 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
932 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, \
937 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
938 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16); \
941 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, \
944 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16); \
947 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, \
952 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
953 OPNAME ## pixels16_l2_8(dst, src + 1, half, stride, stride, 16, 16); \
956 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, \
959 uint8_t full[24 * 17]; \
962 copy_block17(full, src, 24, stride, 17); \
963 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
964 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16); \
967 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, \
970 uint8_t full[24 * 17]; \
972 copy_block17(full, src, 24, stride, 17); \
973 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24); \
976 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, \
979 uint8_t full[24 * 17]; \
982 copy_block17(full, src, 24, stride, 17); \
983 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
984 OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16); \
987 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, \
990 uint8_t full[24 * 17]; \
991 uint8_t halfH[272]; \
992 uint8_t halfV[256]; \
993 uint8_t halfHV[256]; \
995 copy_block17(full, src, 24, stride, 17); \
996 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
997 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
998 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
999 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, \
1000 stride, 24, 16, 16, 16, 16); \
1003 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, \
1006 uint8_t full[24 * 17]; \
1007 uint8_t halfH[272]; \
1008 uint8_t halfHV[256]; \
1010 copy_block17(full, src, 24, stride, 17); \
1011 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1012 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1013 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1014 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1017 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, \
1020 uint8_t full[24 * 17]; \
1021 uint8_t halfH[272]; \
1022 uint8_t halfV[256]; \
1023 uint8_t halfHV[256]; \
1025 copy_block17(full, src, 24, stride, 17); \
1026 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1027 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1028 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1029 OPNAME ## pixels16_l4_8(dst, full + 1, halfH, halfV, halfHV, \
1030 stride, 24, 16, 16, 16, 16); \
1033 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, \
1036 uint8_t full[24 * 17]; \
1037 uint8_t halfH[272]; \
1038 uint8_t halfHV[256]; \
1040 copy_block17(full, src, 24, stride, 17); \
1041 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1042 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1043 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1044 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1047 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, \
1050 uint8_t full[24 * 17]; \
1051 uint8_t halfH[272]; \
1052 uint8_t halfV[256]; \
1053 uint8_t halfHV[256]; \
1055 copy_block17(full, src, 24, stride, 17); \
1056 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1057 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1058 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1059 OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV, \
1060 stride, 24, 16, 16, 16, 16); \
1063 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, \
1066 uint8_t full[24 * 17]; \
1067 uint8_t halfH[272]; \
1068 uint8_t halfHV[256]; \
1070 copy_block17(full, src, 24, stride, 17); \
1071 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1072 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1073 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1074 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1077 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, \
1080 uint8_t full[24 * 17]; \
1081 uint8_t halfH[272]; \
1082 uint8_t halfV[256]; \
1083 uint8_t halfHV[256]; \
1085 copy_block17(full, src, 24, stride, 17); \
1086 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1087 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1088 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1089 OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV, \
1090 stride, 24, 16, 16, 16, 16); \
1093 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, \
1096 uint8_t full[24 * 17]; \
1097 uint8_t halfH[272]; \
1098 uint8_t halfHV[256]; \
1100 copy_block17(full, src, 24, stride, 17); \
1101 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1102 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1103 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1104 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1107 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, \
1110 uint8_t halfH[272]; \
1111 uint8_t halfHV[256]; \
1113 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1114 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1115 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1118 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, \
1121 uint8_t halfH[272]; \
1122 uint8_t halfHV[256]; \
1124 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1125 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1126 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1129 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, \
1132 uint8_t full[24 * 17]; \
1133 uint8_t halfH[272]; \
1134 uint8_t halfV[256]; \
1135 uint8_t halfHV[256]; \
1137 copy_block17(full, src, 24, stride, 17); \
1138 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1139 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1140 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1141 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); \
1144 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, \
1147 uint8_t full[24 * 17]; \
1148 uint8_t halfH[272]; \
1150 copy_block17(full, src, 24, stride, 17); \
1151 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1152 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1153 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
1156 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, \
1159 uint8_t full[24 * 17]; \
1160 uint8_t halfH[272]; \
1161 uint8_t halfV[256]; \
1162 uint8_t halfHV[256]; \
1164 copy_block17(full, src, 24, stride, 17); \
1165 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1166 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1167 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1168 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); \
1171 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, \
1174 uint8_t full[24 * 17]; \
1175 uint8_t halfH[272]; \
1177 copy_block17(full, src, 24, stride, 17); \
1178 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1179 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1180 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
1183 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, \
1186 uint8_t halfH[272]; \
1188 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1189 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
/* Per-pixel store primitives plugged into QPEL_MC below.  Each takes the
 * destination lvalue 'a' and a raw filter sum 'b'; 'cm' is the clipping
 * table in scope at every expansion site.  The "+16 >> 5" forms round the
 * 5-bit filter sum to nearest; the "+15 >> 5" forms are the no-rounding
 * variants used by the _no_rnd_ function family. */
#define op_avg(a, b) a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
#define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5]) >> 1)
#define op_put(a, b) a = cm[((b) + 16) >> 5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
/* Instantiate the three qpel MC function families: put, put_no_rnd, avg. */
QPEL_MC(0, put_, _, op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_, _, op_avg)
#undef op_put_no_rnd
/* Fixed-size half-pel copy/average wrappers: the view of this fragment was
 * missing the function braces; restored as complete definitions.  Each
 * forwards to the width-generic 8-bit hpel primitive with the row count
 * locked to the block size. */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}

void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}

void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}

void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
/* The (0,0) quarter-pel position is a plain pixel copy/average, so the
 * mc00 entries alias the hpel wrappers above instead of getting their own
 * filtered implementations. */
#define put_qpel8_mc00_c ff_put_pixels8x8_c
#define avg_qpel8_mc00_c ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1232 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
1233 int dstStride, int srcStride, int h)
1235 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1238 for (i = 0; i < h; i++) {
1239 dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
1240 dst[1] = cm[(9 * (src[1] + src[2]) - (src[0] + src[3]) + 8) >> 4];
1241 dst[2] = cm[(9 * (src[2] + src[3]) - (src[1] + src[4]) + 8) >> 4];
1242 dst[3] = cm[(9 * (src[3] + src[4]) - (src[2] + src[5]) + 8) >> 4];
1243 dst[4] = cm[(9 * (src[4] + src[5]) - (src[3] + src[6]) + 8) >> 4];
1244 dst[5] = cm[(9 * (src[5] + src[6]) - (src[4] + src[7]) + 8) >> 4];
1245 dst[6] = cm[(9 * (src[6] + src[7]) - (src[5] + src[8]) + 8) >> 4];
1246 dst[7] = cm[(9 * (src[7] + src[8]) - (src[6] + src[9]) + 8) >> 4];
#if CONFIG_RV40_DECODER
/* RV40 (3/4, 3/4) subpel positions fall back to the plain hpel (x+y)/2
 * averages; the fragment in view lacked function braces — restored. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1274 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
1275 int dstStride, int srcStride, int w)
1277 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1280 for (i = 0; i < w; i++) {
1281 const int src_1 = src[-srcStride];
1282 const int src0 = src[0];
1283 const int src1 = src[srcStride];
1284 const int src2 = src[2 * srcStride];
1285 const int src3 = src[3 * srcStride];
1286 const int src4 = src[4 * srcStride];
1287 const int src5 = src[5 * srcStride];
1288 const int src6 = src[6 * srcStride];
1289 const int src7 = src[7 * srcStride];
1290 const int src8 = src[8 * srcStride];
1291 const int src9 = src[9 * srcStride];
1292 dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
1293 dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0 + src3) + 8) >> 4];
1294 dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1 + src4) + 8) >> 4];
1295 dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2 + src5) + 8) >> 4];
1296 dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3 + src6) + 8) >> 4];
1297 dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4 + src7) + 8) >> 4];
1298 dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5 + src8) + 8) >> 4];
1299 dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6 + src9) + 8) >> 4];
/* WMV2 mspel position functions.  The fragments in view were missing the
 * local half-pel scratch buffer declarations and braces; restored.  Naming:
 * mcXY = X quarter-sample horizontal, Y vertical.  The half buffers hold
 * 8x8 (64) filtered pixels; halfH holds 8x11 (88) rows because the vertical
 * pass needs one row above and two below the block. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH + 8, stride, 8, 8);
}
1363 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1364 int line_size, int h)
1368 for (i = 0; i < h; i++) {
1369 s += abs(pix1[0] - pix2[0]);
1370 s += abs(pix1[1] - pix2[1]);
1371 s += abs(pix1[2] - pix2[2]);
1372 s += abs(pix1[3] - pix2[3]);
1373 s += abs(pix1[4] - pix2[4]);
1374 s += abs(pix1[5] - pix2[5]);
1375 s += abs(pix1[6] - pix2[6]);
1376 s += abs(pix1[7] - pix2[7]);
1377 s += abs(pix1[8] - pix2[8]);
1378 s += abs(pix1[9] - pix2[9]);
1379 s += abs(pix1[10] - pix2[10]);
1380 s += abs(pix1[11] - pix2[11]);
1381 s += abs(pix1[12] - pix2[12]);
1382 s += abs(pix1[13] - pix2[13]);
1383 s += abs(pix1[14] - pix2[14]);
1384 s += abs(pix1[15] - pix2[15]);
1391 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1392 int line_size, int h)
1396 for (i = 0; i < h; i++) {
1397 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1398 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1399 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1400 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1401 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1402 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1403 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1404 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1405 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1406 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1407 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1408 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1409 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1410 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1411 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1412 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1419 static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1420 int line_size, int h)
1423 uint8_t *pix3 = pix2 + line_size;
1425 for (i = 0; i < h; i++) {
1426 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1427 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1428 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1429 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1430 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1431 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1432 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1433 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1434 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1435 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1436 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1437 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1438 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1439 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1440 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1441 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1449 static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1450 int line_size, int h)
1453 uint8_t *pix3 = pix2 + line_size;
1455 for (i = 0; i < h; i++) {
1456 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1457 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1458 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1459 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1460 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1461 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1462 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1463 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1464 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1465 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1466 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1467 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1468 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1469 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1470 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1471 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1479 static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1480 int line_size, int h)
1484 for (i = 0; i < h; i++) {
1485 s += abs(pix1[0] - pix2[0]);
1486 s += abs(pix1[1] - pix2[1]);
1487 s += abs(pix1[2] - pix2[2]);
1488 s += abs(pix1[3] - pix2[3]);
1489 s += abs(pix1[4] - pix2[4]);
1490 s += abs(pix1[5] - pix2[5]);
1491 s += abs(pix1[6] - pix2[6]);
1492 s += abs(pix1[7] - pix2[7]);
1499 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1500 int line_size, int h)
1504 for (i = 0; i < h; i++) {
1505 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1506 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1507 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1508 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1509 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1510 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1511 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1512 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1519 static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1520 int line_size, int h)
1523 uint8_t *pix3 = pix2 + line_size;
1525 for (i = 0; i < h; i++) {
1526 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1527 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1528 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1529 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1530 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1531 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1532 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1533 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1541 static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1542 int line_size, int h)
1545 uint8_t *pix3 = pix2 + line_size;
1547 for (i = 0; i < h; i++) {
1548 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1549 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1550 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1551 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1552 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1553 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1554 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1555 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1563 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1565 int score1 = 0, score2 = 0, x, y;
1567 for (y = 0; y < h; y++) {
1568 for (x = 0; x < 16; x++)
1569 score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1571 for (x = 0; x < 15; x++)
1572 score2 += FFABS(s1[x] - s1[x + stride] -
1573 s1[x + 1] + s1[x + stride + 1]) -
1574 FFABS(s2[x] - s2[x + stride] -
1575 s2[x + 1] + s2[x + stride + 1]);
1582 return score1 + FFABS(score2) * c->avctx->nsse_weight;
1584 return score1 + FFABS(score2) * 8;
1587 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1589 int score1 = 0, score2 = 0, x, y;
1591 for (y = 0; y < h; y++) {
1592 for (x = 0; x < 8; x++)
1593 score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1595 for (x = 0; x < 7; x++)
1596 score2 += FFABS(s1[x] - s1[x + stride] -
1597 s1[x + 1] + s1[x + stride + 1]) -
1598 FFABS(s2[x] - s2[x + stride] -
1599 s2[x + 1] + s2[x + stride + 1]);
1606 return score1 + FFABS(score2) * c->avctx->nsse_weight;
1608 return score1 + FFABS(score2) * 8;
1611 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
1612 int16_t basis[64], int scale)
1615 unsigned int sum = 0;
1617 for (i = 0; i < 8 * 8; i++) {
1618 int b = rem[i] + ((basis[i] * scale +
1619 (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1620 (BASIS_SHIFT - RECON_SHIFT));
1623 assert(-512 < b && b < 512);
1625 sum += (w * b) * (w * b) >> 4;
1630 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
1634 for (i = 0; i < 8 * 8; i++)
1635 rem[i] += (basis[i] * scale +
1636 (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1637 (BASIS_SHIFT - RECON_SHIFT);
1640 static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
1646 void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
1650 memset(cmp, 0, sizeof(void *) * 6);
1652 for (i = 0; i < 6; i++) {
1653 switch (type & 0xFF) {
1658 cmp[i] = c->hadamard8_diff[i];
1664 cmp[i] = c->dct_sad[i];
1667 cmp[i] = c->dct264_sad[i];
1670 cmp[i] = c->dct_max[i];
1673 cmp[i] = c->quant_psnr[i];
1682 cmp[i] = c->vsad[i];
1685 cmp[i] = c->vsse[i];
1691 cmp[i] = c->nsse[i];
1694 av_log(NULL, AV_LOG_ERROR,
1695 "internal error in cmp function selection\n");
1700 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
1704 for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1705 long a = *(long *) (src + i);
1706 long b = *(long *) (dst + i);
1707 *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
1710 dst[i + 0] += src[i + 0];
1713 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
1717 #if !HAVE_FAST_UNALIGNED
1718 if ((long) src2 & (sizeof(long) - 1)) {
1719 for (i = 0; i + 7 < w; i += 8) {
1720 dst[i + 0] = src1[i + 0] - src2[i + 0];
1721 dst[i + 1] = src1[i + 1] - src2[i + 1];
1722 dst[i + 2] = src1[i + 2] - src2[i + 2];
1723 dst[i + 3] = src1[i + 3] - src2[i + 3];
1724 dst[i + 4] = src1[i + 4] - src2[i + 4];
1725 dst[i + 5] = src1[i + 5] - src2[i + 5];
1726 dst[i + 6] = src1[i + 6] - src2[i + 6];
1727 dst[i + 7] = src1[i + 7] - src2[i + 7];
1731 for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1732 long a = *(long *) (src1 + i);
1733 long b = *(long *) (src2 + i);
1734 *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
1735 ((a ^ b ^ pb_80) & pb_80);
1738 dst[i + 0] = src1[i + 0] - src2[i + 0];
1741 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
1742 const uint8_t *diff, int w,
1743 int *left, int *left_top)
1751 for (i = 0; i < w; i++) {
1752 l = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF) + diff[i];
1761 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
1762 const uint8_t *src2, int w,
1763 int *left, int *left_top)
1771 for (i = 0; i < w; i++) {
1772 const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
1782 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
1787 for (i = 0; i < w - 1; i++) {
1795 for (; i < w; i++) {
1814 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
1815 int w, int *red, int *green,
1816 int *blue, int *alpha)
1818 int i, r = *red, g = *green, b = *blue, a = *alpha;
1820 for (i = 0; i < w; i++) {
1821 b += src[4 * i + B];
1822 g += src[4 * i + G];
1823 r += src[4 * i + R];
1824 a += src[4 * i + A];
1842 #define BUTTERFLY2(o1, o2, i1, i2) \
1846 #define BUTTERFLY1(x, y) \
1855 #define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
1857 static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
1858 uint8_t *src, int stride, int h)
1860 int i, temp[64], sum = 0;
1864 for (i = 0; i < 8; i++) {
1865 // FIXME: try pointer walks
1866 BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1867 src[stride * i + 0] - dst[stride * i + 0],
1868 src[stride * i + 1] - dst[stride * i + 1]);
1869 BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1870 src[stride * i + 2] - dst[stride * i + 2],
1871 src[stride * i + 3] - dst[stride * i + 3]);
1872 BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1873 src[stride * i + 4] - dst[stride * i + 4],
1874 src[stride * i + 5] - dst[stride * i + 5]);
1875 BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1876 src[stride * i + 6] - dst[stride * i + 6],
1877 src[stride * i + 7] - dst[stride * i + 7]);
1879 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1880 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1881 BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1882 BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1884 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1885 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1886 BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1887 BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
1890 for (i = 0; i < 8; i++) {
1891 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1892 BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1893 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1894 BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1896 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1897 BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1898 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1899 BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1901 sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
1902 BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
1903 BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
1904 BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
1909 static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
1910 uint8_t *dummy, int stride, int h)
1912 int i, temp[64], sum = 0;
1916 for (i = 0; i < 8; i++) {
1917 // FIXME: try pointer walks
1918 BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1919 src[stride * i + 0], src[stride * i + 1]);
1920 BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1921 src[stride * i + 2], src[stride * i + 3]);
1922 BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1923 src[stride * i + 4], src[stride * i + 5]);
1924 BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1925 src[stride * i + 6], src[stride * i + 7]);
1927 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1928 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1929 BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1930 BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1932 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1933 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1934 BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1935 BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
1938 for (i = 0; i < 8; i++) {
1939 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1940 BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1941 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1942 BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1944 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1945 BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1946 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1947 BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1950 BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
1951 + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
1952 + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
1953 + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
1956 sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
1961 static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
1962 uint8_t *src2, int stride, int h)
1964 LOCAL_ALIGNED_16(int16_t, temp, [64]);
1968 s->dsp.diff_pixels(temp, src1, src2, stride);
1970 return s->dsp.sum_abs_dctelem(temp);
1976 const int s07 = SRC(0) + SRC(7); \
1977 const int s16 = SRC(1) + SRC(6); \
1978 const int s25 = SRC(2) + SRC(5); \
1979 const int s34 = SRC(3) + SRC(4); \
1980 const int a0 = s07 + s34; \
1981 const int a1 = s16 + s25; \
1982 const int a2 = s07 - s34; \
1983 const int a3 = s16 - s25; \
1984 const int d07 = SRC(0) - SRC(7); \
1985 const int d16 = SRC(1) - SRC(6); \
1986 const int d25 = SRC(2) - SRC(5); \
1987 const int d34 = SRC(3) - SRC(4); \
1988 const int a4 = d16 + d25 + (d07 + (d07 >> 1)); \
1989 const int a5 = d07 - d34 - (d25 + (d25 >> 1)); \
1990 const int a6 = d07 + d34 - (d16 + (d16 >> 1)); \
1991 const int a7 = d16 - d25 + (d34 + (d34 >> 1)); \
1993 DST(1, a4 + (a7 >> 2)); \
1994 DST(2, a2 + (a3 >> 1)); \
1995 DST(3, a5 + (a6 >> 2)); \
1997 DST(5, a6 - (a5 >> 2)); \
1998 DST(6, (a2 >> 1) - a3); \
1999 DST(7, (a4 >> 2) - a7); \
2002 static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
2003 uint8_t *src2, int stride, int h)
2008 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2010 #define SRC(x) dct[i][x]
2011 #define DST(x, v) dct[i][x] = v
2012 for (i = 0; i < 8; i++)
2017 #define SRC(x) dct[x][i]
2018 #define DST(x, v) sum += FFABS(v)
2019 for (i = 0; i < 8; i++)
2027 static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
2028 uint8_t *src2, int stride, int h)
2030 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2035 s->dsp.diff_pixels(temp, src1, src2, stride);
2038 for (i = 0; i < 64; i++)
2039 sum = FFMAX(sum, FFABS(temp[i]));
2044 static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
2045 uint8_t *src2, int stride, int h)
2047 LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
2048 int16_t *const bak = temp + 64;
2054 s->dsp.diff_pixels(temp, src1, src2, stride);
2056 memcpy(bak, temp, 64 * sizeof(int16_t));
2058 s->block_last_index[0 /* FIXME */] =
2059 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2060 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2061 ff_simple_idct_8(temp); // FIXME
2063 for (i = 0; i < 64; i++)
2064 sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
2069 static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2072 const uint8_t *scantable = s->intra_scantable.permutated;
2073 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2074 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2075 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2076 int i, last, run, bits, level, distortion, start_i;
2077 const int esc_length = s->ac_esc_length;
2078 uint8_t *length, *last_length;
2082 copy_block8(lsrc1, src1, 8, stride, 8);
2083 copy_block8(lsrc2, src2, 8, stride, 8);
2085 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2087 s->block_last_index[0 /* FIXME */] =
2089 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2095 length = s->intra_ac_vlc_length;
2096 last_length = s->intra_ac_vlc_last_length;
2097 bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2100 length = s->inter_ac_vlc_length;
2101 last_length = s->inter_ac_vlc_last_length;
2104 if (last >= start_i) {
2106 for (i = start_i; i < last; i++) {
2107 int j = scantable[i];
2112 if ((level & (~127)) == 0)
2113 bits += length[UNI_AC_ENC_INDEX(run, level)];
2120 i = scantable[last];
2122 level = temp[i] + 64;
2126 if ((level & (~127)) == 0) {
2127 bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2134 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2136 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2139 s->dsp.idct_add(lsrc2, 8, temp);
2141 distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2143 return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
2146 static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2149 const uint8_t *scantable = s->intra_scantable.permutated;
2150 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2151 int i, last, run, bits, level, start_i;
2152 const int esc_length = s->ac_esc_length;
2153 uint8_t *length, *last_length;
2157 s->dsp.diff_pixels(temp, src1, src2, stride);
2159 s->block_last_index[0 /* FIXME */] =
2161 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2167 length = s->intra_ac_vlc_length;
2168 last_length = s->intra_ac_vlc_last_length;
2169 bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2172 length = s->inter_ac_vlc_length;
2173 last_length = s->inter_ac_vlc_last_length;
2176 if (last >= start_i) {
2178 for (i = start_i; i < last; i++) {
2179 int j = scantable[i];
2184 if ((level & (~127)) == 0)
2185 bits += length[UNI_AC_ENC_INDEX(run, level)];
2192 i = scantable[last];
2194 level = temp[i] + 64;
2198 if ((level & (~127)) == 0)
2199 bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2207 #define VSAD_INTRA(size) \
2208 static int vsad_intra ## size ## _c(MpegEncContext *c, \
2209 uint8_t *s, uint8_t *dummy, \
2210 int stride, int h) \
2212 int score = 0, x, y; \
2214 for (y = 1; y < h; y++) { \
2215 for (x = 0; x < size; x += 4) { \
2216 score += FFABS(s[x] - s[x + stride]) + \
2217 FFABS(s[x + 1] - s[x + stride + 1]) + \
2218 FFABS(s[x + 2] - s[x + 2 + stride]) + \
2219 FFABS(s[x + 3] - s[x + 3 + stride]); \
2229 static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2232 int score = 0, x, y;
2234 for (y = 1; y < h; y++) {
2235 for (x = 0; x < 16; x++)
2236 score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2244 #define SQ(a) ((a) * (a))
2245 #define VSSE_INTRA(size) \
2246 static int vsse_intra ## size ## _c(MpegEncContext *c, \
2247 uint8_t *s, uint8_t *dummy, \
2248 int stride, int h) \
2250 int score = 0, x, y; \
2252 for (y = 1; y < h; y++) { \
2253 for (x = 0; x < size; x += 4) { \
2254 score += SQ(s[x] - s[x + stride]) + \
2255 SQ(s[x + 1] - s[x + stride + 1]) + \
2256 SQ(s[x + 2] - s[x + stride + 2]) + \
2257 SQ(s[x + 3] - s[x + stride + 3]); \
2267 static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2270 int score = 0, x, y;
2272 for (y = 1; y < h; y++) {
2273 for (x = 0; x < 16; x++)
2274 score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
/* Sum of squared differences between an int8 vector and an int16 vector of
 * 'size' elements.  Restored the missing accumulator and return. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int score = 0, i;

    for (i = 0; i < size; i++)
        score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);

    return score;
}
2292 #define WRAPPER8_16_SQ(name8, name16) \
2293 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \
2294 int stride, int h) \
2298 score += name8(s, dst, src, stride, 8); \
2299 score += name8(s, dst + 8, src + 8, stride, 8); \
2301 dst += 8 * stride; \
2302 src += 8 * stride; \
2303 score += name8(s, dst, src, stride, 8); \
2304 score += name8(s, dst + 8, src + 8, stride, 8); \
2309 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2310 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2311 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2313 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2315 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2316 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2317 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2318 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Clip one IEEE-754 binary32 value (passed as raw bits) when min < 0 < max:
 * 'mini' is the bit pattern of the (negative) minimum, 'maxi' of the
 * (positive) maximum, 'maxisign' is maxi with the sign bit set.  Negative
 * floats compare as large unsigned ints, so a > mini means "more negative
 * than min"; flipping the sign bit makes positive values comparable against
 * maxisign.  Restored the missing first branch and final return. */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if (a > mini)
        return mini;
    else if ((a ^ (1U << 31)) > maxisign)
        return maxi;
    else
        return a;
}
/* Clip 'len' floats to [*min, *max] when the bounds have opposite signs,
 * operating on the raw bit patterns via clipf_c_one (avoids float compares).
 * len is processed in unrolled groups of 8; callers pass len a multiple of
 * 8 — TODO confirm against callers.  Restored the missing prologue. */
static void vector_clipf_c_opposite_sign(float *dst, const float *src,
                                         float *min, float *max, int len)
{
    int i;
    uint32_t mini        = *(uint32_t *) min;
    uint32_t maxi        = *(uint32_t *) max;
    uint32_t maxisign    = maxi ^ (1U << 31);
    uint32_t *dsti       = (uint32_t *) dst;
    const uint32_t *srci = (const uint32_t *) src;

    for (i = 0; i < len; i += 8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/* Clip 'len' floats to [min, max].  Bounds of opposite sign take the
 * integer-compare fast path; otherwise clip with av_clipf in unrolled
 * groups of 8.  Restored the missing else branch and braces. */
static void vector_clipf_c(float *dst, const float *src,
                           float min, float max, int len)
{
    int i;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for (i = 0; i < len; i += 8) {
            dst[i]     = av_clipf(src[i], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
/* Dot product of two int16 vectors of 'order' elements, accumulated in a
 * 32-bit int.  Restored the missing loop control and return. */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
{
    int res = 0;

    while (order--)
        res += *v1++ * *v2++;

    return res;
}
/* Returns the dot product of v1 and v2 while simultaneously doing
 * v1[i] += mul * v3[i] in place (the product uses the pre-update v1).
 * Restored the missing parameters, loop control and return. */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    int res = 0;

    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }

    return res;
}
/* Clip 'len' int32 values to [min, max] into dst.  len must be a positive
 * multiple of 8 (processed in groups of eight; a non-multiple would wrap
 * the unsigned counter).  The clamp is written out explicitly, behaviorally
 * identical to av_clip; restored the missing loop control. */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int j;
        for (j = 0; j < 8; j++) {
            int32_t v = *src++;
            *dst++ = v < min ? min : (v > max ? max : v);
        }
        len -= 8;
    } while (len > 0);
}
/* jpeg-reference IDCT wrappers: run ff_j_rev_dct in place on the block,
 * then store (put) or accumulate (add) the clamped result into the frame.
 * Restored the missing function braces. */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}

static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2426 /* init static data */
2427 av_cold void ff_dsputil_static_init(void)
2431 for (i = 0; i < 512; i++)
2432 ff_square_tab[i] = (i - 256) * (i - 256);
2435 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2437 const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
2440 if (avctx->bits_per_raw_sample == 10) {
2441 c->fdct = ff_jpeg_fdct_islow_10;
2442 c->fdct248 = ff_fdct248_islow_10;
2444 if (avctx->dct_algo == FF_DCT_FASTINT) {
2445 c->fdct = ff_fdct_ifast;
2446 c->fdct248 = ff_fdct_ifast248;
2447 } else if (avctx->dct_algo == FF_DCT_FAAN) {
2448 c->fdct = ff_faandct;
2449 c->fdct248 = ff_faandct248;
2451 c->fdct = ff_jpeg_fdct_islow_8; // slow/accurate/default
2452 c->fdct248 = ff_fdct248_islow_8;
2455 #endif /* CONFIG_ENCODERS */
2457 if (avctx->bits_per_raw_sample == 10) {
2458 c->idct_put = ff_simple_idct_put_10;
2459 c->idct_add = ff_simple_idct_add_10;
2460 c->idct = ff_simple_idct_10;
2461 c->idct_permutation_type = FF_NO_IDCT_PERM;
2463 if (avctx->idct_algo == FF_IDCT_INT) {
2464 c->idct_put = jref_idct_put;
2465 c->idct_add = jref_idct_add;
2466 c->idct = ff_j_rev_dct;
2467 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2468 } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2469 c->idct_put = ff_faanidct_put;
2470 c->idct_add = ff_faanidct_add;
2471 c->idct = ff_faanidct;
2472 c->idct_permutation_type = FF_NO_IDCT_PERM;
2473 } else { // accurate/default
2474 c->idct_put = ff_simple_idct_put_8;
2475 c->idct_add = ff_simple_idct_add_8;
2476 c->idct = ff_simple_idct_8;
2477 c->idct_permutation_type = FF_NO_IDCT_PERM;
2481 c->diff_pixels = diff_pixels_c;
2483 c->put_pixels_clamped = put_pixels_clamped_c;
2484 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2485 c->add_pixels_clamped = add_pixels_clamped_c;
2487 c->sum_abs_dctelem = sum_abs_dctelem_c;
2492 c->pix_sum = pix_sum_c;
2493 c->pix_norm1 = pix_norm1_c;
2495 c->fill_block_tab[0] = fill_block16_c;
2496 c->fill_block_tab[1] = fill_block8_c;
2498 /* TODO [0] 16 [1] 8 */
2499 c->pix_abs[0][0] = pix_abs16_c;
2500 c->pix_abs[0][1] = pix_abs16_x2_c;
2501 c->pix_abs[0][2] = pix_abs16_y2_c;
2502 c->pix_abs[0][3] = pix_abs16_xy2_c;
2503 c->pix_abs[1][0] = pix_abs8_c;
2504 c->pix_abs[1][1] = pix_abs8_x2_c;
2505 c->pix_abs[1][2] = pix_abs8_y2_c;
2506 c->pix_abs[1][3] = pix_abs8_xy2_c;
2508 #define dspfunc(PFX, IDX, NUM) \
2509 c->PFX ## _pixels_tab[IDX][0] = PFX ## NUM ## _mc00_c; \
2510 c->PFX ## _pixels_tab[IDX][1] = PFX ## NUM ## _mc10_c; \
2511 c->PFX ## _pixels_tab[IDX][2] = PFX ## NUM ## _mc20_c; \
2512 c->PFX ## _pixels_tab[IDX][3] = PFX ## NUM ## _mc30_c; \
2513 c->PFX ## _pixels_tab[IDX][4] = PFX ## NUM ## _mc01_c; \
2514 c->PFX ## _pixels_tab[IDX][5] = PFX ## NUM ## _mc11_c; \
2515 c->PFX ## _pixels_tab[IDX][6] = PFX ## NUM ## _mc21_c; \
2516 c->PFX ## _pixels_tab[IDX][7] = PFX ## NUM ## _mc31_c; \
2517 c->PFX ## _pixels_tab[IDX][8] = PFX ## NUM ## _mc02_c; \
2518 c->PFX ## _pixels_tab[IDX][9] = PFX ## NUM ## _mc12_c; \
2519 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2520 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2521 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2522 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2523 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2524 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2526 dspfunc(put_qpel, 0, 16);
2527 dspfunc(put_qpel, 1, 8);
2529 dspfunc(put_no_rnd_qpel, 0, 16);
2530 dspfunc(put_no_rnd_qpel, 1, 8);
2532 dspfunc(avg_qpel, 0, 16);
2533 dspfunc(avg_qpel, 1, 8);
2537 c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
2538 c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
2539 c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
2540 c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
2541 c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
2542 c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
2543 c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
2544 c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
2546 #define SET_CMP_FUNC(name) \
2547 c->name[0] = name ## 16_c; \
2548 c->name[1] = name ## 8x8_c;
2550 SET_CMP_FUNC(hadamard8_diff)
2551 c->hadamard8_diff[4] = hadamard8_intra16_c;
2552 c->hadamard8_diff[5] = hadamard8_intra8x8_c;
2553 SET_CMP_FUNC(dct_sad)
2554 SET_CMP_FUNC(dct_max)
2556 SET_CMP_FUNC(dct264_sad)
2558 c->sad[0] = pix_abs16_c;
2559 c->sad[1] = pix_abs8_c;
2560 c->sse[0] = sse16_c;
2563 SET_CMP_FUNC(quant_psnr)
2566 c->vsad[0] = vsad16_c;
2567 c->vsad[4] = vsad_intra16_c;
2568 c->vsad[5] = vsad_intra8_c;
2569 c->vsse[0] = vsse16_c;
2570 c->vsse[4] = vsse_intra16_c;
2571 c->vsse[5] = vsse_intra8_c;
2572 c->nsse[0] = nsse16_c;
2573 c->nsse[1] = nsse8_c;
2575 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2577 c->add_bytes = add_bytes_c;
2578 c->add_hfyu_median_prediction = add_hfyu_median_prediction_c;
2579 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2580 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2582 c->diff_bytes = diff_bytes_c;
2583 c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
2585 c->bswap_buf = bswap_buf;
2586 c->bswap16_buf = bswap16_buf;
2588 c->try_8x8basis = try_8x8basis_c;
2589 c->add_8x8basis = add_8x8basis_c;
2591 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2593 c->scalarproduct_int16 = scalarproduct_int16_c;
2594 c->vector_clip_int32 = vector_clip_int32_c;
2595 c->vector_clipf = vector_clipf_c;
2597 c->shrink[0] = av_image_copy_plane;
2598 c->shrink[1] = ff_shrink22;
2599 c->shrink[2] = ff_shrink44;
2600 c->shrink[3] = ff_shrink88;
2602 c->add_pixels8 = add_pixels8_c;
2606 #define FUNC(f, depth) f ## _ ## depth
2607 #define FUNCC(f, depth) f ## _ ## depth ## _c
2609 c->draw_edges = FUNCC(draw_edges, 8);
2611 c->clear_block = FUNCC(clear_block, 8);
2612 c->clear_blocks = FUNCC(clear_blocks, 8);
2614 #define BIT_DEPTH_FUNCS(depth) \
2615 c->get_pixels = FUNCC(get_pixels, depth);
2617 switch (avctx->bits_per_raw_sample) {
2620 BIT_DEPTH_FUNCS(16);
2628 ff_dsputil_init_arm(c, avctx, high_bit_depth);
2630 ff_dsputil_init_bfin(c, avctx, high_bit_depth);
2632 ff_dsputil_init_ppc(c, avctx, high_bit_depth);
2634 ff_dsputil_init_x86(c, avctx, high_bit_depth);
2636 ff_init_scantable_permutation(c->idct_permutation,
2637 c->idct_permutation_type);