3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
33 #include "copy_block.h"
36 #include "simple_idct.h"
39 #include "imgconvert.h"
41 #include "mpegvideo.h"
/* x^2 lookup table, indexed as (ff_square_tab + 256)[delta] so that signed
 * differences in [-256, 255] can be squared without a branch.  Declared
 * zero-filled here; presumably populated by an init routine — the code
 * visible in this file only reads it (TODO confirm against the init path). */
uint32_t ff_square_tab[512] = { 0, };
47 #include "dsputilenc_template.c"
51 #include "hpel_template.c"
52 #include "tpel_template.c"
53 #include "dsputil_template.c"
54 #include "dsputilenc_template.c"
/* 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's
 * native arithmetic size: ~0UL / 255 replicates 0x01 into every byte of an
 * unsigned long, so these are the byte values 0x7f / 0x80 replicated. */
#define pb_7f (~0UL / 255 * 0x7f)
#define pb_80 (~0UL / 255 * 0x80)
/* Specific zigzag scan for 248 idct. NOTE that unlike the
 * specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* Alternate (horizontal-first) scan order for interlaced content. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate (vertical-first) scan order for interlaced content. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Per-row (low 3 bits) coefficient reordering used by the SSE2 IDCT. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
109 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
110 const uint8_t *src_scantable)
114 st->scantable = src_scantable;
116 for (i = 0; i < 64; i++) {
117 int j = src_scantable[i];
118 st->permutated[i] = permutation[j];
122 for (i = 0; i < 64; i++) {
123 int j = st->permutated[i];
126 st->raster_end[i] = end;
130 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
131 int idct_permutation_type)
135 switch (idct_permutation_type) {
136 case FF_NO_IDCT_PERM:
137 for (i = 0; i < 64; i++)
138 idct_permutation[i] = i;
140 case FF_LIBMPEG2_IDCT_PERM:
141 for (i = 0; i < 64; i++)
142 idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
144 case FF_SIMPLE_IDCT_PERM:
145 for (i = 0; i < 64; i++)
146 idct_permutation[i] = simple_mmx_permutation[i];
148 case FF_TRANSPOSE_IDCT_PERM:
149 for (i = 0; i < 64; i++)
150 idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
152 case FF_PARTTRANS_IDCT_PERM:
153 for (i = 0; i < 64; i++)
154 idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
156 case FF_SSE2_IDCT_PERM:
157 for (i = 0; i < 64; i++)
158 idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
161 av_log(NULL, AV_LOG_ERROR,
162 "Internal error, IDCT permutation not set\n");
/**
 * Sum of all 256 pixels of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size distance in bytes between rows
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s   += pix[0];
            s   += pix[1];
            s   += pix[2];
            s   += pix[3];
            s   += pix[4];
            s   += pix[5];
            s   += pix[6];
            s   += pix[7];
            pix += 8;
        }
        /* after the two 8-pixel groups, pix has advanced 16: step to next row */
        pix += line_size - 16;
    }
    return s;
}
187 static int pix_norm1_c(uint8_t *pix, int line_size)
190 uint32_t *sq = ff_square_tab + 256;
192 for (i = 0; i < 16; i++) {
193 for (j = 0; j < 16; j += 8) {
205 register uint64_t x = *(uint64_t *) pix;
207 s += sq[(x >> 8) & 0xff];
208 s += sq[(x >> 16) & 0xff];
209 s += sq[(x >> 24) & 0xff];
210 s += sq[(x >> 32) & 0xff];
211 s += sq[(x >> 40) & 0xff];
212 s += sq[(x >> 48) & 0xff];
213 s += sq[(x >> 56) & 0xff];
215 register uint32_t x = *(uint32_t *) pix;
217 s += sq[(x >> 8) & 0xff];
218 s += sq[(x >> 16) & 0xff];
219 s += sq[(x >> 24) & 0xff];
220 x = *(uint32_t *) (pix + 4);
222 s += sq[(x >> 8) & 0xff];
223 s += sq[(x >> 16) & 0xff];
224 s += sq[(x >> 24) & 0xff];
229 pix += line_size - 16;
/**
 * Byte-swap an array of w 32-bit words (src may equal dst).
 * Unrolled by 8; the trailing loop handles the remaining w % 8 words.
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int i;

    for (i = 0; i + 8 <= w; i += 8) {
        dst[i + 0] = av_bswap32(src[i + 0]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    for (; i < w; i++)
        dst[i + 0] = av_bswap32(src[i + 0]);
}
/** Byte-swap an array of len 16-bit words (src may equal dst). */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
258 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
259 int line_size, int h)
262 uint32_t *sq = ff_square_tab + 256;
264 for (i = 0; i < h; i++) {
265 s += sq[pix1[0] - pix2[0]];
266 s += sq[pix1[1] - pix2[1]];
267 s += sq[pix1[2] - pix2[2]];
268 s += sq[pix1[3] - pix2[3]];
275 static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
276 int line_size, int h)
279 uint32_t *sq = ff_square_tab + 256;
281 for (i = 0; i < h; i++) {
282 s += sq[pix1[0] - pix2[0]];
283 s += sq[pix1[1] - pix2[1]];
284 s += sq[pix1[2] - pix2[2]];
285 s += sq[pix1[3] - pix2[3]];
286 s += sq[pix1[4] - pix2[4]];
287 s += sq[pix1[5] - pix2[5]];
288 s += sq[pix1[6] - pix2[6]];
289 s += sq[pix1[7] - pix2[7]];
296 static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
297 int line_size, int h)
300 uint32_t *sq = ff_square_tab + 256;
302 for (i = 0; i < h; i++) {
303 s += sq[pix1[0] - pix2[0]];
304 s += sq[pix1[1] - pix2[1]];
305 s += sq[pix1[2] - pix2[2]];
306 s += sq[pix1[3] - pix2[3]];
307 s += sq[pix1[4] - pix2[4]];
308 s += sq[pix1[5] - pix2[5]];
309 s += sq[pix1[6] - pix2[6]];
310 s += sq[pix1[7] - pix2[7]];
311 s += sq[pix1[8] - pix2[8]];
312 s += sq[pix1[9] - pix2[9]];
313 s += sq[pix1[10] - pix2[10]];
314 s += sq[pix1[11] - pix2[11]];
315 s += sq[pix1[12] - pix2[12]];
316 s += sq[pix1[13] - pix2[13]];
317 s += sq[pix1[14] - pix2[14]];
318 s += sq[pix1[15] - pix2[15]];
/**
 * Write the element-wise difference of two 8x8 pixel blocks into a
 * row-major 64-entry coefficient block: block[8*i + j] = s1 - s2.
 */
static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride)
{
    int i;

    /* read the pixels */
    for (i = 0; i < 8; i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
/**
 * Store an 8x8 coefficient block to pixels, clamping each value to [0, 255].
 */
static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;

    /* read the pixels */
    for (i = 0; i < 8; i++) {
        pixels[0] = av_clip_uint8(block[0]);
        pixels[1] = av_clip_uint8(block[1]);
        pixels[2] = av_clip_uint8(block[2]);
        pixels[3] = av_clip_uint8(block[3]);
        pixels[4] = av_clip_uint8(block[4]);
        pixels[5] = av_clip_uint8(block[5]);
        pixels[6] = av_clip_uint8(block[6]);
        pixels[7] = av_clip_uint8(block[7]);
        pixels += line_size;
        block  += 8;
    }
}
/**
 * Store an 8x8 signed coefficient block to pixels: each value is offset by
 * +128, after clamping the input to [-128, 127] (so the output is [0, 255]).
 */
static void put_signed_pixels_clamped_c(const int16_t *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t) (*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}
/**
 * Add an 8x8 coefficient block to pixels, without clamping (the uint8_t
 * assignment wraps modulo 256; callers rely on values staying in range).
 */
static void add_pixels8_c(uint8_t *restrict pixels, int16_t *block,
                          int line_size)
{
    int i;

    for (i = 0; i < 8; i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block  += 8;
    }
}
/**
 * Add an 8x8 coefficient block to pixels, clamping each sum to [0, 255].
 */
static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;

    /* read the pixels */
    for (i = 0; i < 8; i++) {
        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
        pixels[4] = av_clip_uint8(pixels[4] + block[4]);
        pixels[5] = av_clip_uint8(pixels[5] + block[5]);
        pixels[6] = av_clip_uint8(pixels[6] + block[6]);
        pixels[7] = av_clip_uint8(pixels[7] + block[7]);
        pixels += line_size;
        block  += 8;
    }
}
/** Sum of absolute values of all 64 coefficients of a DCT block. */
static int sum_abs_dctelem_c(int16_t *block)
{
    int sum = 0, i;

    for (i = 0; i < 64; i++)
        sum += FFABS(block[i]);
    return sum;
}
/** Fill a 16-wide, h-high block with a constant byte value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/** Fill an 8-wide, h-high block with a constant byte value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* Rounded 2- and 4-way pixel averages.  Every argument is parenthesized so
 * callers may pass compound expressions (e.g. shifts) safely; note the
 * arguments are still evaluated once each only if the caller avoids
 * side-effecting expressions. */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/**
 * One-warp-point global motion compensation of an 8-wide, h-high block.
 * x16/y16 are 1/16-pel fractional offsets; the four bilinear weights
 * A+B+C+D sum to 256, and rounder is added before the >> 8 normalization.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (x16)      * (16 - y16);
    const int C = (16 - x16) * (y16);
    const int D = (x16)      * (y16);
    int i;

    for (i = 0; i < h; i++) {
        dst[0] = (A * src[0] + B * src[1] + C * src[stride + 0] + D * src[stride + 1] + rounder) >> 8;
        dst[1] = (A * src[1] + B * src[2] + C * src[stride + 1] + D * src[stride + 2] + rounder) >> 8;
        dst[2] = (A * src[2] + B * src[3] + C * src[stride + 2] + D * src[stride + 3] + rounder) >> 8;
        dst[3] = (A * src[3] + B * src[4] + C * src[stride + 3] + D * src[stride + 4] + rounder) >> 8;
        dst[4] = (A * src[4] + B * src[5] + C * src[stride + 4] + D * src[stride + 5] + rounder) >> 8;
        dst[5] = (A * src[5] + B * src[6] + C * src[stride + 5] + D * src[stride + 6] + rounder) >> 8;
        dst[6] = (A * src[6] + B * src[7] + C * src[stride + 6] + D * src[stride + 7] + rounder) >> 8;
        dst[7] = (A * src[7] + B * src[8] + C * src[stride + 7] + D * src[stride + 8] + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
/**
 * Generic global motion compensation of an 8-wide, h-high block with a full
 * affine motion field (dxx/dxy/dyx/dyy are 16.16 fixed-point increments,
 * ox/oy the starting position).  Pixels outside the width x height source
 * are clamped to the nearest edge; r is the rounding constant added before
 * the >> (shift * 2) normalization of the bilinear interpolation.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
{
    int y, vx, vy;
    const int s = 1 << shift;

    /* convert to the last valid coordinate for the edge clamps below */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        int x;

        vx = ox;
        vy = oy;
        for (x = 0; x < 8; x++) { // FIXME: optimize
            int index;
            int src_x  = vx >> 16;
            int src_y  = vy >> 16;
            int frac_x = src_x & (s - 1);
            int frac_y = src_y & (s - 1);
            src_x    >>= shift;
            src_y    >>= shift;

            if ((unsigned) src_x < width) {
                if ((unsigned) src_y < height) {
                    /* fully inside: bilinear interpolation of 4 pixels */
                    index = src_x + src_y * stride;
                    dst[y * stride + x] =
                        ((src[index]              * (s - frac_x) +
                          src[index + 1]          * frac_x)       * (s - frac_y) +
                         (src[index + stride]     * (s - frac_x) +
                          src[index + stride + 1] * frac_x)       * frac_y +
                         r) >> (shift * 2);
                } else {
                    /* vertically outside: clamp y, interpolate horizontally */
                    index = src_x + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] =
                        ((src[index]     * (s - frac_x) +
                          src[index + 1] * frac_x) * s +
                         r) >> (shift * 2);
                }
            } else {
                if ((unsigned) src_y < height) {
                    /* horizontally outside: clamp x, interpolate vertically */
                    index = av_clip(src_x, 0, width) + src_y * stride;
                    dst[y * stride + x] =
                        ((src[index]          * (s - frac_y) +
                          src[index + stride] * frac_y) * s +
                         r) >> (shift * 2);
                } else {
                    /* both outside: nearest corner pixel */
                    index = av_clip(src_x, 0, width) +
                            av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = src[index];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
546 #define QPEL_MC(r, OPNAME, RND, OP) \
547 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, \
548 int dstStride, int srcStride, \
551 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
554 for (i = 0; i < h; i++) { \
555 OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
556 OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
557 OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
558 OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
559 OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
560 OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
561 OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
562 OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
568 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, \
569 int dstStride, int srcStride) \
571 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
575 for (i = 0; i < w; i++) { \
576 const int src0 = src[0 * srcStride]; \
577 const int src1 = src[1 * srcStride]; \
578 const int src2 = src[2 * srcStride]; \
579 const int src3 = src[3 * srcStride]; \
580 const int src4 = src[4 * srcStride]; \
581 const int src5 = src[5 * srcStride]; \
582 const int src6 = src[6 * srcStride]; \
583 const int src7 = src[7 * srcStride]; \
584 const int src8 = src[8 * srcStride]; \
585 OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
586 OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
587 OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
588 OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
589 OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
590 OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
591 OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
592 OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
598 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, \
599 int dstStride, int srcStride, \
602 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
605 for (i = 0; i < h; i++) { \
606 OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
607 OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
608 OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
609 OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
610 OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
611 OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[9])); \
612 OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[9]) * 3 - (src[3] + src[10])); \
613 OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[9]) * 6 + (src[5] + src[10]) * 3 - (src[4] + src[11])); \
614 OP(dst[8], (src[8] + src[9]) * 20 - (src[7] + src[10]) * 6 + (src[6] + src[11]) * 3 - (src[5] + src[12])); \
615 OP(dst[9], (src[9] + src[10]) * 20 - (src[8] + src[11]) * 6 + (src[7] + src[12]) * 3 - (src[6] + src[13])); \
616 OP(dst[10], (src[10] + src[11]) * 20 - (src[9] + src[12]) * 6 + (src[8] + src[13]) * 3 - (src[7] + src[14])); \
617 OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9] + src[14]) * 3 - (src[8] + src[15])); \
618 OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9] + src[16])); \
619 OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
620 OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
621 OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
627 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, \
628 int dstStride, int srcStride) \
630 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
634 for (i = 0; i < w; i++) { \
635 const int src0 = src[0 * srcStride]; \
636 const int src1 = src[1 * srcStride]; \
637 const int src2 = src[2 * srcStride]; \
638 const int src3 = src[3 * srcStride]; \
639 const int src4 = src[4 * srcStride]; \
640 const int src5 = src[5 * srcStride]; \
641 const int src6 = src[6 * srcStride]; \
642 const int src7 = src[7 * srcStride]; \
643 const int src8 = src[8 * srcStride]; \
644 const int src9 = src[9 * srcStride]; \
645 const int src10 = src[10 * srcStride]; \
646 const int src11 = src[11 * srcStride]; \
647 const int src12 = src[12 * srcStride]; \
648 const int src13 = src[13 * srcStride]; \
649 const int src14 = src[14 * srcStride]; \
650 const int src15 = src[15 * srcStride]; \
651 const int src16 = src[16 * srcStride]; \
652 OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
653 OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
654 OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
655 OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
656 OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
657 OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src9)); \
658 OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src9) * 3 - (src3 + src10)); \
659 OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src9) * 6 + (src5 + src10) * 3 - (src4 + src11)); \
660 OP(dst[8 * dstStride], (src8 + src9) * 20 - (src7 + src10) * 6 + (src6 + src11) * 3 - (src5 + src12)); \
661 OP(dst[9 * dstStride], (src9 + src10) * 20 - (src8 + src11) * 6 + (src7 + src12) * 3 - (src6 + src13)); \
662 OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9 + src12) * 6 + (src8 + src13) * 3 - (src7 + src14)); \
663 OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9 + src14) * 3 - (src8 + src15)); \
664 OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9 + src16)); \
665 OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
666 OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
667 OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
673 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, \
678 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
679 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8); \
682 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, \
685 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8); \
688 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, \
693 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
694 OPNAME ## pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8); \
697 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, \
700 uint8_t full[16 * 9]; \
703 copy_block9(full, src, 16, stride, 9); \
704 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
705 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8); \
708 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, \
711 uint8_t full[16 * 9]; \
713 copy_block9(full, src, 16, stride, 9); \
714 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16); \
717 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, \
720 uint8_t full[16 * 9]; \
723 copy_block9(full, src, 16, stride, 9); \
724 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
725 OPNAME ## pixels8_l2_8(dst, full + 16, half, stride, 16, 8, 8); \
728 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, \
731 uint8_t full[16 * 9]; \
734 uint8_t halfHV[64]; \
736 copy_block9(full, src, 16, stride, 9); \
737 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
738 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
739 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
740 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, \
741 stride, 16, 8, 8, 8, 8); \
744 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, \
747 uint8_t full[16 * 9]; \
749 uint8_t halfHV[64]; \
751 copy_block9(full, src, 16, stride, 9); \
752 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
753 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
754 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
755 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
758 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, \
761 uint8_t full[16 * 9]; \
764 uint8_t halfHV[64]; \
766 copy_block9(full, src, 16, stride, 9); \
767 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
768 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
769 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
770 OPNAME ## pixels8_l4_8(dst, full + 1, halfH, halfV, halfHV, \
771 stride, 16, 8, 8, 8, 8); \
774 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, \
777 uint8_t full[16 * 9]; \
779 uint8_t halfHV[64]; \
781 copy_block9(full, src, 16, stride, 9); \
782 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
783 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
784 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
785 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
788 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, \
791 uint8_t full[16 * 9]; \
794 uint8_t halfHV[64]; \
796 copy_block9(full, src, 16, stride, 9); \
797 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
798 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
799 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
800 OPNAME ## pixels8_l4_8(dst, full + 16, halfH + 8, halfV, halfHV, \
801 stride, 16, 8, 8, 8, 8); \
804 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, \
807 uint8_t full[16 * 9]; \
809 uint8_t halfHV[64]; \
811 copy_block9(full, src, 16, stride, 9); \
812 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
813 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
814 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
815 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
818 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, \
821 uint8_t full[16 * 9]; \
824 uint8_t halfHV[64]; \
826 copy_block9(full, src, 16, stride, 9); \
827 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
828 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
829 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
830 OPNAME ## pixels8_l4_8(dst, full + 17, halfH + 8, halfV, halfHV, \
831 stride, 16, 8, 8, 8, 8); \
834 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, \
837 uint8_t full[16 * 9]; \
839 uint8_t halfHV[64]; \
841 copy_block9(full, src, 16, stride, 9); \
842 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
843 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
844 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
845 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
848 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, \
852 uint8_t halfHV[64]; \
854 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
855 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
856 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
859 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, \
863 uint8_t halfHV[64]; \
865 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
866 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
867 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
870 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, \
873 uint8_t full[16 * 9]; \
876 uint8_t halfHV[64]; \
878 copy_block9(full, src, 16, stride, 9); \
879 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
880 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
881 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
882 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); \
885 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, \
888 uint8_t full[16 * 9]; \
891 copy_block9(full, src, 16, stride, 9); \
892 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
893 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
894 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
897 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, \
900 uint8_t full[16 * 9]; \
903 uint8_t halfHV[64]; \
905 copy_block9(full, src, 16, stride, 9); \
906 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
907 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
908 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
909 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); \
912 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, \
915 uint8_t full[16 * 9]; \
918 copy_block9(full, src, 16, stride, 9); \
919 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
920 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
921 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
924 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, \
929 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
930 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
933 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, \
938 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
939 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16); \
942 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, \
945 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16); \
948 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, \
953 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
954 OPNAME ## pixels16_l2_8(dst, src + 1, half, stride, stride, 16, 16); \
957 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, \
960 uint8_t full[24 * 17]; \
963 copy_block17(full, src, 24, stride, 17); \
964 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
965 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16); \
968 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, \
971 uint8_t full[24 * 17]; \
973 copy_block17(full, src, 24, stride, 17); \
974 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24); \
977 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, \
980 uint8_t full[24 * 17]; \
983 copy_block17(full, src, 24, stride, 17); \
984 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
985 OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16); \
988 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, \
991 uint8_t full[24 * 17]; \
992 uint8_t halfH[272]; \
993 uint8_t halfV[256]; \
994 uint8_t halfHV[256]; \
996 copy_block17(full, src, 24, stride, 17); \
997 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
998 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
999 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1000 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, \
1001 stride, 24, 16, 16, 16, 16); \
1004 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, \
1007 uint8_t full[24 * 17]; \
1008 uint8_t halfH[272]; \
1009 uint8_t halfHV[256]; \
1011 copy_block17(full, src, 24, stride, 17); \
1012 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1013 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1014 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1015 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1018 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, \
1021 uint8_t full[24 * 17]; \
1022 uint8_t halfH[272]; \
1023 uint8_t halfV[256]; \
1024 uint8_t halfHV[256]; \
1026 copy_block17(full, src, 24, stride, 17); \
1027 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1028 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1029 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1030 OPNAME ## pixels16_l4_8(dst, full + 1, halfH, halfV, halfHV, \
1031 stride, 24, 16, 16, 16, 16); \
1034 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, \
1037 uint8_t full[24 * 17]; \
1038 uint8_t halfH[272]; \
1039 uint8_t halfHV[256]; \
1041 copy_block17(full, src, 24, stride, 17); \
1042 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1043 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1044 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1045 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1048 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, \
1051 uint8_t full[24 * 17]; \
1052 uint8_t halfH[272]; \
1053 uint8_t halfV[256]; \
1054 uint8_t halfHV[256]; \
1056 copy_block17(full, src, 24, stride, 17); \
1057 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1058 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1059 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1060 OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV, \
1061 stride, 24, 16, 16, 16, 16); \
1064 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, \
1067 uint8_t full[24 * 17]; \
1068 uint8_t halfH[272]; \
1069 uint8_t halfHV[256]; \
1071 copy_block17(full, src, 24, stride, 17); \
1072 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1073 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1074 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1075 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1078 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, \
1081 uint8_t full[24 * 17]; \
1082 uint8_t halfH[272]; \
1083 uint8_t halfV[256]; \
1084 uint8_t halfHV[256]; \
1086 copy_block17(full, src, 24, stride, 17); \
1087 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1088 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1089 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1090 OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV, \
1091 stride, 24, 16, 16, 16, 16); \
1094 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, \
1097 uint8_t full[24 * 17]; \
1098 uint8_t halfH[272]; \
1099 uint8_t halfHV[256]; \
1101 copy_block17(full, src, 24, stride, 17); \
1102 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1103 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1104 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1105 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1108 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, \
1111 uint8_t halfH[272]; \
1112 uint8_t halfHV[256]; \
1114 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1115 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1116 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1119 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, \
1122 uint8_t halfH[272]; \
1123 uint8_t halfHV[256]; \
1125 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1126 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1127 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1130 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, \
1133 uint8_t full[24 * 17]; \
1134 uint8_t halfH[272]; \
1135 uint8_t halfV[256]; \
1136 uint8_t halfHV[256]; \
1138 copy_block17(full, src, 24, stride, 17); \
1139 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1140 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1141 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1142 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); \
1145 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, \
1148 uint8_t full[24 * 17]; \
1149 uint8_t halfH[272]; \
1151 copy_block17(full, src, 24, stride, 17); \
1152 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1153 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1154 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
1157 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, \
1160 uint8_t full[24 * 17]; \
1161 uint8_t halfH[272]; \
1162 uint8_t halfV[256]; \
1163 uint8_t halfHV[256]; \
1165 copy_block17(full, src, 24, stride, 17); \
1166 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1167 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1168 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1169 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); \
1172 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, \
1175 uint8_t full[24 * 17]; \
1176 uint8_t halfH[272]; \
1178 copy_block17(full, src, 24, stride, 17); \
1179 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1180 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1181 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
1184 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, \
1187 uint8_t halfH[272]; \
1189 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1190 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
/* Store primitives plugged into QPEL_MC: 'b' is a raw filter accumulator
 * that is scaled down (>> 5 after rounding bias 16, or 15 for the
 * no-rounding variants) and clamped through the crop table 'cm' before
 * being written ('put') or averaged with the destination ('avg'). */
1193 #define op_avg(a, b) a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
1194 #define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5]) >> 1)
1195 #define op_put(a, b) a = cm[((b) + 16) >> 5]
1196 #define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
/* Instantiate the put, put_no_rnd and avg families of qpel MC functions. */
1198 QPEL_MC(0, put_, _, op_put)
1199 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1200 QPEL_MC(0, avg_, _, op_avg)
1204 #undef op_put_no_rnd
/* Copy an 8x8 block: fixed-size wrapper over the height-parameterized
 * hpel primitive. */
1206 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1208 put_pixels8_8_c(dst, src, stride, 8);
/* Average an 8x8 block into dst: fixed-size wrapper over the hpel
 * averaging primitive. */
1211 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1213 avg_pixels8_8_c(dst, src, stride, 8);
/* Copy a 16x16 block: fixed-size wrapper over the hpel primitive. */
1216 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1218 put_pixels16_8_c(dst, src, stride, 16);
/* Average a 16x16 block into dst: fixed-size wrapper over the hpel
 * averaging primitive. */
1221 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1223 avg_pixels16_8_c(dst, src, stride, 16);
/* The (0,0) quarter-pel position needs no filtering, so alias it to the
 * plain block copy/average wrappers above. */
1226 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1227 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1228 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1229 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1230 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1231 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/* WMV2 horizontal lowpass: 4-tap (-1, 9, 9, -1) filter with rounding
 * (+8, >> 4) across an 8-pixel row, clamped through the crop table;
 * repeated for h rows.
 * NOTE(review): the per-row dst/src pointer advance and the closing of
 * the loop are not visible in this excerpt. */
1233 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
1234 int dstStride, int srcStride, int h)
1236 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1239 for (i = 0; i < h; i++) {
1240 dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
1241 dst[1] = cm[(9 * (src[1] + src[2]) - (src[0] + src[3]) + 8) >> 4];
1242 dst[2] = cm[(9 * (src[2] + src[3]) - (src[1] + src[4]) + 8) >> 4];
1243 dst[3] = cm[(9 * (src[3] + src[4]) - (src[2] + src[5]) + 8) >> 4];
1244 dst[4] = cm[(9 * (src[4] + src[5]) - (src[3] + src[6]) + 8) >> 4];
1245 dst[5] = cm[(9 * (src[5] + src[6]) - (src[4] + src[7]) + 8) >> 4];
1246 dst[6] = cm[(9 * (src[6] + src[7]) - (src[5] + src[8]) + 8) >> 4];
1247 dst[7] = cm[(9 * (src[7] + src[8]) - (src[6] + src[9]) + 8) >> 4];
/* RV40 handles the (3,3) quarter-pel position as a plain centered
 * average of the four surrounding full-pel samples (xy2 hpel), instead
 * of running the generic qpel filter chain. */
1253 #if CONFIG_RV40_DECODER
1254 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1256 put_pixels16_xy2_8_c(dst, src, stride, 16);
1259 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1261 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1264 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1266 put_pixels8_xy2_8_c(dst, src, stride, 8);
1269 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1271 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1273 #endif /* CONFIG_RV40_DECODER */
/* WMV2 vertical lowpass: same 4-tap (-1, 9, 9, -1)/16 filter applied
 * down each of w columns, writing 8 output rows per column; samples
 * src[-srcStride..9*srcStride] are read, so the caller must provide one
 * row above and two below the 8-row window.
 * NOTE(review): the per-column dst/src advance is not visible in this
 * excerpt. */
1275 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
1276 int dstStride, int srcStride, int w)
1278 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1281 for (i = 0; i < w; i++) {
1282 const int src_1 = src[-srcStride];
1283 const int src0 = src[0];
1284 const int src1 = src[srcStride];
1285 const int src2 = src[2 * srcStride];
1286 const int src3 = src[3 * srcStride];
1287 const int src4 = src[4 * srcStride];
1288 const int src5 = src[5 * srcStride];
1289 const int src6 = src[6 * srcStride];
1290 const int src7 = src[7 * srcStride];
1291 const int src8 = src[8 * srcStride];
1292 const int src9 = src[9 * srcStride];
1293 dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
1294 dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0 + src3) + 8) >> 4];
1295 dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1 + src4) + 8) >> 4];
1296 dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2 + src5) + 8) >> 4];
1297 dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3 + src6) + 8) >> 4];
1298 dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4 + src7) + 8) >> 4];
1299 dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5 + src8) + 8) >> 4];
1300 dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6 + src9) + 8) >> 4];
/* mspel (1,0): average the source with its horizontal half-pel filter
 * result (the local 'half' buffer declaration is not visible in this
 * excerpt). */
1306 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1310 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1311 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
/* mspel (2,0): pure horizontal half-pel filter, written straight to dst. */
1314 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1316 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* mspel (3,0): average the right-shifted source (src + 1) with the
 * horizontal half-pel result. */
1319 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1323 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1324 put_pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8);
/* mspel (0,2): pure vertical half-pel filter, written straight to dst. */
1327 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1329 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* mspel (1,2): average the vertical half-pel (halfV) with the
 * horizontal-then-vertical result (halfHV). The H pass starts one row
 * above and covers 11 rows so the V pass has its margin samples.
 * NOTE(review): the local halfH/halfV/halfHV buffer declarations are
 * not visible in this excerpt. */
1332 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1338 wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
1339 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1340 wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
1341 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* mspel (3,2): like mc12 but the vertical half-pel is taken one column
 * to the right (src + 1). */
1344 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1350 wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
1351 wmv2_mspel8_v_lowpass(halfV, src + 1, 8, stride, 8);
1352 wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
1353 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* mspel (2,2): horizontal half-pel filter followed by the vertical one
 * (skipping the first margin row of halfH). */
1356 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1360 wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
1361 wmv2_mspel8_v_lowpass(dst, halfH + 8, stride, 8, 8);
/* Sum of absolute differences (SAD) of a 16-pixel-wide block over h
 * rows. Used as the base motion-estimation compare function.
 * NOTE(review): the accumulator declaration, per-row pointer advance
 * and return are not visible in this excerpt. */
1364 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1365 int line_size, int h)
1369 for (i = 0; i < h; i++) {
1370 s += abs(pix1[0] - pix2[0]);
1371 s += abs(pix1[1] - pix2[1]);
1372 s += abs(pix1[2] - pix2[2]);
1373 s += abs(pix1[3] - pix2[3]);
1374 s += abs(pix1[4] - pix2[4]);
1375 s += abs(pix1[5] - pix2[5]);
1376 s += abs(pix1[6] - pix2[6]);
1377 s += abs(pix1[7] - pix2[7]);
1378 s += abs(pix1[8] - pix2[8]);
1379 s += abs(pix1[9] - pix2[9]);
1380 s += abs(pix1[10] - pix2[10]);
1381 s += abs(pix1[11] - pix2[11]);
1382 s += abs(pix1[12] - pix2[12]);
1383 s += abs(pix1[13] - pix2[13]);
1384 s += abs(pix1[14] - pix2[14]);
1385 s += abs(pix1[15] - pix2[15]);
/* SAD of a 16-wide block against pix2 interpolated at horizontal
 * half-pel position (avg2 of each pixel and its right neighbor). */
1392 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1393 int line_size, int h)
1397 for (i = 0; i < h; i++) {
1398 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1399 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1400 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1401 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1402 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1403 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1404 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1405 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1406 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1407 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1408 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1409 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1410 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1411 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1412 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1413 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of a 16-wide block against pix2 interpolated at vertical
 * half-pel position (avg2 of each pixel and the one below, via pix3). */
1420 static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1421 int line_size, int h)
1424 uint8_t *pix3 = pix2 + line_size;
1426 for (i = 0; i < h; i++) {
1427 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1428 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1429 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1430 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1431 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1432 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1433 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1434 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1435 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1436 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1437 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1438 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1439 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1440 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1441 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1442 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of a 16-wide block against pix2 interpolated at diagonal
 * half-pel position (avg4 of the 2x2 neighborhood). */
1450 static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1451 int line_size, int h)
1454 uint8_t *pix3 = pix2 + line_size;
1456 for (i = 0; i < h; i++) {
1457 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1458 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1459 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1460 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1461 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1462 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1463 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1464 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1465 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1466 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1467 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1468 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1469 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1470 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1471 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1472 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* SAD of an 8-pixel-wide block over h rows (8-wide variant of
 * pix_abs16_c). */
1480 static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1481 int line_size, int h)
1485 for (i = 0; i < h; i++) {
1486 s += abs(pix1[0] - pix2[0]);
1487 s += abs(pix1[1] - pix2[1]);
1488 s += abs(pix1[2] - pix2[2]);
1489 s += abs(pix1[3] - pix2[3]);
1490 s += abs(pix1[4] - pix2[4]);
1491 s += abs(pix1[5] - pix2[5]);
1492 s += abs(pix1[6] - pix2[6]);
1493 s += abs(pix1[7] - pix2[7]);
/* SAD of an 8-wide block against horizontal half-pel interpolation. */
1500 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1501 int line_size, int h)
1505 for (i = 0; i < h; i++) {
1506 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1507 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1508 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1509 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1510 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1511 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1512 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1513 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* SAD of an 8-wide block against vertical half-pel interpolation. */
1520 static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1521 int line_size, int h)
1524 uint8_t *pix3 = pix2 + line_size;
1526 for (i = 0; i < h; i++) {
1527 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1528 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1529 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1530 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1531 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1532 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1533 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1534 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* SAD of an 8-wide block against diagonal half-pel interpolation. */
1542 static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1543 int line_size, int h)
1546 uint8_t *pix3 = pix2 + line_size;
1548 for (i = 0; i < h; i++) {
1549 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1550 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1551 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1552 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1553 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1554 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1555 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1556 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16-wide: score1 is the plain squared error,
 * score2 compares the 2x2 gradient structure of the two blocks so that
 * matching noise/texture is penalized less.
 * NOTE(review): the condition choosing between the nsse_weight return
 * and the default weight-8 return (presumably "if (c)") is not visible
 * in this excerpt. */
1564 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1566 int score1 = 0, score2 = 0, x, y;
1568 for (y = 0; y < h; y++) {
1569 for (x = 0; x < 16; x++)
1570 score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1572 for (x = 0; x < 15; x++)
1573 score2 += FFABS(s1[x] - s1[x + stride] -
1574 s1[x + 1] + s1[x + stride + 1]) -
1575 FFABS(s2[x] - s2[x + stride] -
1576 s2[x + 1] + s2[x + stride + 1]);
1583 return score1 + FFABS(score2) * c->avctx->nsse_weight;
1585 return score1 + FFABS(score2) * 8;
/* 8-wide variant of nsse16_c: squared error plus weighted difference of
 * local 2x2 gradients. Same excerpt caveat as nsse16_c regarding the
 * return-selection condition. */
1588 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1590 int score1 = 0, score2 = 0, x, y;
1592 for (y = 0; y < h; y++) {
1593 for (x = 0; x < 8; x++)
1594 score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1596 for (x = 0; x < 7; x++)
1597 score2 += FFABS(s1[x] - s1[x + stride] -
1598 s1[x + 1] + s1[x + stride + 1]) -
1599 FFABS(s2[x] - s2[x + stride] -
1600 s2[x + 1] + s2[x + stride + 1]);
1607 return score1 + FFABS(score2) * c->avctx->nsse_weight;
1609 return score1 + FFABS(score2) * 8;
/* Trellis helper: evaluate the weighted squared cost of adding
 * 'basis' scaled by 'scale' (rounded from BASIS_SHIFT down to
 * RECON_SHIFT precision) onto the residual 'rem'.
 * NOTE(review): the weight factor 'w' is read below but its assignment
 * (presumably w = weight[i]) is not visible in this excerpt. */
1612 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
1613 int16_t basis[64], int scale)
1616 unsigned int sum = 0;
1618 for (i = 0; i < 8 * 8; i++) {
1619 int b = rem[i] + ((basis[i] * scale +
1620 (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1621 (BASIS_SHIFT - RECON_SHIFT));
1624 assert(-512 < b && b < 512);
1626 sum += (w * b) * (w * b) >> 4;
/* Accumulate 'basis' scaled by 'scale' into the residual 'rem', with
 * the same rounding/shift as try_8x8basis_c. */
1631 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
1635 for (i = 0; i < 8 * 8; i++)
1636 rem[i] += (basis[i] * scale +
1637 (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1638 (BASIS_SHIFT - RECON_SHIFT);
/* Dummy compare function; presumably always returns 0 so that a metric
 * slot can be disabled — TODO confirm: the body is not visible in this
 * excerpt. */
1641 static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
/* Fill the six-slot cmp[] table with the compare function selected by
 * the low byte of 'type', taking the implementations from DSPContext.
 * NOTE(review): the switch's case labels (FF_CMP_*) are not visible in
 * this excerpt; only the assignments survive. */
1647 void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
1651 memset(cmp, 0, sizeof(void *) * 6);
1653 for (i = 0; i < 6; i++) {
1654 switch (type & 0xFF) {
1659 cmp[i] = c->hadamard8_diff[i];
1665 cmp[i] = c->dct_sad[i];
1668 cmp[i] = c->dct264_sad[i];
1671 cmp[i] = c->dct_max[i];
1674 cmp[i] = c->quant_psnr[i];
1683 cmp[i] = c->vsad[i];
1686 cmp[i] = c->vsse[i];
1692 cmp[i] = c->nsse[i];
/* Unknown metric: report an internal error. */
1695 av_log(NULL, AV_LOG_ERROR,
1696 "internal error in cmp function selection\n");
/* Byte-wise dst[i] += src[i] done one machine word at a time: the
 * pb_7f/pb_80 masks implement SWAR addition without carries crossing
 * byte lanes; the trailing loop handles the remaining bytes. */
1701 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
1705 for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1706 long a = *(long *) (src + i);
1707 long b = *(long *) (dst + i);
1708 *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
/* Byte-wise tail for the last w % sizeof(long) elements. */
1711 dst[i + 0] += src[i + 0];
/* Byte-wise dst[i] = src1[i] - src2[i]. On targets without fast
 * unaligned loads, a plain unrolled byte loop is used when src2 is
 * misaligned; otherwise the subtraction is done per machine word with
 * SWAR borrow isolation via pb_7f/pb_80. */
1718 #if !HAVE_FAST_UNALIGNED
1719 if ((long) src2 & (sizeof(long) - 1)) {
1720 for (i = 0; i + 7 < w; i += 8) {
1721 dst[i + 0] = src1[i + 0] - src2[i + 0];
1722 dst[i + 1] = src1[i + 1] - src2[i + 1];
1723 dst[i + 2] = src1[i + 2] - src2[i + 2];
1724 dst[i + 3] = src1[i + 3] - src2[i + 3];
1725 dst[i + 4] = src1[i + 4] - src2[i + 4];
1726 dst[i + 5] = src1[i + 5] - src2[i + 5];
1727 dst[i + 6] = src1[i + 6] - src2[i + 6];
1728 dst[i + 7] = src1[i + 7] - src2[i + 7];
/* Word-at-a-time SWAR subtraction path. */
1732 for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1733 long a = *(long *) (src1 + i);
1734 long b = *(long *) (src2 + i);
1735 *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
1736 ((a ^ b ^ pb_80) & pb_80);
/* Byte-wise tail. */
1739 dst[i + 0] = src1[i + 0] - src2[i + 0];
/* HuffYUV median predictor (decode direction): reconstruct each byte as
 * mid_pred(left, top, left + top - topleft) + residual, tracking the
 * running left value 'l' and left-top 'lt'.
 * NOTE(review): initialization of l/lt from *left/*left_top and the
 * write-back at the end are not visible in this excerpt. */
1742 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
1743 const uint8_t *diff, int w,
1744 int *left, int *left_top)
1752 for (i = 0; i < w; i++) {
1753 l = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF) + diff[i];
/* HuffYUV median predictor (encode direction): compute the same
 * mid_pred prediction and store the residual (src2 - pred, not visible
 * in this excerpt) into dst. */
1762 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
1763 const uint8_t *src2, int w,
1764 int *left, int *left_top)
1772 for (i = 0; i < w; i++) {
1773 const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
/* HuffYUV left predictor: running left-neighbor accumulation over w
 * bytes. NOTE(review): nearly the whole body (accumulator setup, loop
 * contents, return value) is missing from this excerpt — do not infer
 * details beyond the two loop bounds shown. */
1783 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
1788 for (i = 0; i < w - 1; i++) {
1796 for (; i < w; i++) {
/* HuffYUV left predictor for packed 32-bit BGRA: accumulate per-channel
 * running sums over w pixels (B/G/R/A are per-channel byte offsets
 * defined elsewhere in the file); the updated channel values are
 * presumably written back through the pointer arguments (not visible
 * in this excerpt). */
1815 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
1816 int w, int *red, int *green,
1817 int *blue, int *alpha)
1819 int i, r = *red, g = *green, b = *blue, a = *alpha;
1821 for (i = 0; i < w; i++) {
1822 b += src[4 * i + B];
1823 g += src[4 * i + G];
1824 r += src[4 * i + R];
1825 a += src[4 * i + A];
/* Butterfly primitives for the 8x8 Hadamard transforms below:
 * BUTTERFLY2 produces sum/difference into two outputs, BUTTERFLY1
 * does it in place, BUTTERFLYA returns |x+y| + |x-y|.
 * NOTE(review): the bodies of BUTTERFLY2/BUTTERFLY1 are not visible in
 * this excerpt (continuation lines were dropped). */
1843 #define BUTTERFLY2(o1, o2, i1, i2) \
1847 #define BUTTERFLY1(x, y) \
1856 #define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
/* SATD: 8x8 Hadamard transform of the src-dst difference — a 3-stage
 * butterfly network over each row, then over each column — returning
 * the sum of absolute transform coefficients. */
1858 static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
1859 uint8_t *src, int stride, int h)
1861 int i, temp[64], sum = 0;
/* Row pass: transform each row of the difference block. */
1865 for (i = 0; i < 8; i++) {
1866 // FIXME: try pointer walks
1867 BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1868 src[stride * i + 0] - dst[stride * i + 0],
1869 src[stride * i + 1] - dst[stride * i + 1]);
1870 BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1871 src[stride * i + 2] - dst[stride * i + 2],
1872 src[stride * i + 3] - dst[stride * i + 3]);
1873 BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1874 src[stride * i + 4] - dst[stride * i + 4],
1875 src[stride * i + 5] - dst[stride * i + 5]);
1876 BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1877 src[stride * i + 6] - dst[stride * i + 6],
1878 src[stride * i + 7] - dst[stride * i + 7]);
1880 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1881 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1882 BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1883 BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1885 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1886 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1887 BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1888 BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
/* Column pass: transform each column, accumulating |coefficients|. */
1891 for (i = 0; i < 8; i++) {
1892 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1893 BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1894 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1895 BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1897 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1898 BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1899 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1900 BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1902 sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
1903 BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
1904 BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
1905 BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
/* Intra SATD: same Hadamard transform as hadamard8_diff8x8_c but on the
 * source block itself (no reference); the DC contribution is removed at
 * the end so only AC energy is measured. */
1910 static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
1911 uint8_t *dummy, int stride, int h)
1913 int i, temp[64], sum = 0;
/* Row pass over the raw source samples. */
1917 for (i = 0; i < 8; i++) {
1918 // FIXME: try pointer walks
1919 BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1920 src[stride * i + 0], src[stride * i + 1]);
1921 BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1922 src[stride * i + 2], src[stride * i + 3]);
1923 BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1924 src[stride * i + 4], src[stride * i + 5]);
1925 BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1926 src[stride * i + 6], src[stride * i + 7]);
1928 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1929 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1930 BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1931 BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1933 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1934 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1935 BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1936 BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
/* Column pass, accumulating |coefficients|. */
1939 for (i = 0; i < 8; i++) {
1940 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1941 BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1942 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1943 BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1945 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1946 BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1947 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1948 BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1951 BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
1952 + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
1953 + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
1954 + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
1957 sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
/* DCT-domain SAD: take the 8x8 pixel difference, then sum the absolute
 * values of its transform coefficients.
 * NOTE(review): the forward-DCT call between diff_pixels and
 * sum_abs_dctelem is not visible in this excerpt. */
1962 static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
1963 uint8_t *src2, int stride, int h)
1965 LOCAL_ALIGNED_16(int16_t, temp, [64]);
1969 s->dsp.diff_pixels(temp, src1, src2, stride);
1971 return s->dsp.sum_abs_dctelem(temp);
/* NOTE(review): continuation of an 8-point 1-D integer transform macro
 * (H.264-style DCT; its #define line is not visible in this excerpt).
 * s* hold the even-part sums, d* the odd-part differences; a0..a7 are
 * the intermediate butterfly values written out through DST(). */
1977 const int s07 = SRC(0) + SRC(7); \
1978 const int s16 = SRC(1) + SRC(6); \
1979 const int s25 = SRC(2) + SRC(5); \
1980 const int s34 = SRC(3) + SRC(4); \
1981 const int a0 = s07 + s34; \
1982 const int a1 = s16 + s25; \
1983 const int a2 = s07 - s34; \
1984 const int a3 = s16 - s25; \
1985 const int d07 = SRC(0) - SRC(7); \
1986 const int d16 = SRC(1) - SRC(6); \
1987 const int d25 = SRC(2) - SRC(5); \
1988 const int d34 = SRC(3) - SRC(4); \
1989 const int a4 = d16 + d25 + (d07 + (d07 >> 1)); \
1990 const int a5 = d07 - d34 - (d25 + (d25 >> 1)); \
1991 const int a6 = d07 + d34 - (d16 + (d16 >> 1)); \
1992 const int a7 = d16 - d25 + (d34 + (d34 >> 1)); \
1994 DST(1, a4 + (a7 >> 2)); \
1995 DST(2, a2 + (a3 >> 1)); \
1996 DST(3, a5 + (a6 >> 2)); \
1998 DST(5, a6 - (a5 >> 2)); \
1999 DST(6, (a2 >> 1) - a3); \
2000 DST(7, (a4 >> 2) - a7); \
/* H.264-transform SAD: run the 1-D integer transform over the rows of
 * the pixel difference, then over the columns; SRC/DST are rebound so
 * the second pass accumulates |coefficient| into 'sum'. */
2003 static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
2004 uint8_t *src2, int stride, int h)
2009 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2011 #define SRC(x) dct[i][x]
2012 #define DST(x, v) dct[i][x] = v
2013 for (i = 0; i < 8; i++)
2018 #define SRC(x) dct[x][i]
2019 #define DST(x, v) sum += FFABS(v)
2020 for (i = 0; i < 8; i++)
/* DCT-max metric: largest absolute transform coefficient of the pixel
 * difference. NOTE(review): the forward-DCT call between diff_pixels
 * and the scan loop is not visible in this excerpt. */
2028 static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
2029 uint8_t *src2, int stride, int h)
2031 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2036 s->dsp.diff_pixels(temp, src1, src2, stride);
2039 for (i = 0; i < 64; i++)
2040 sum = FFMAX(sum, FFABS(temp[i]));
/* Quantization-PSNR metric: quantize + dequantize + IDCT the difference
 * block and measure the squared error against the untouched copy in
 * 'bak', i.e. the distortion introduced by quantization alone. */
2045 static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
2046 uint8_t *src2, int stride, int h)
2048 LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
2049 int16_t *const bak = temp + 64;
2055 s->dsp.diff_pixels(temp, src1, src2, stride);
2057 memcpy(bak, temp, 64 * sizeof(int16_t));
2059 s->block_last_index[0 /* FIXME */] =
2060 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2061 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2062 ff_simple_idct_8(temp); // FIXME
2064 for (i = 0; i < 64; i++)
2065 sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
/* Rate-distortion metric for one 8x8 block: quantize the difference,
 * estimate the VLC bit cost of the resulting run/level pairs (escape
 * cost for levels outside |level| < 128), then dequantize + IDCT and
 * measure the reconstruction distortion; the return combines distortion
 * with lambda-weighted bits. */
2073 const uint8_t *scantable = s->intra_scantable.permutated;
2074 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2075 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2076 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2077 int i, last, run, bits, level, distortion, start_i;
2078 const int esc_length = s->ac_esc_length;
2079 uint8_t *length, *last_length;
2083 copy_block8(lsrc1, src1, 8, stride, 8);
2084 copy_block8(lsrc2, src2, 8, stride, 8);
2086 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2088 s->block_last_index[0 /* FIXME */] =
2090 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
/* Intra blocks use the intra AC tables plus a luma DC cost;
 * otherwise the inter tables apply (selection not fully visible). */
2096 length = s->intra_ac_vlc_length;
2097 last_length = s->intra_ac_vlc_last_length;
2098 bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2101 length = s->inter_ac_vlc_length;
2102 last_length = s->inter_ac_vlc_last_length;
/* Walk the scan order, costing each nonzero coefficient. */
2105 if (last >= start_i) {
2107 for (i = start_i; i < last; i++) {
2108 int j = scantable[i];
2113 if ((level & (~127)) == 0)
2114 bits += length[UNI_AC_ENC_INDEX(run, level)];
/* The final coefficient uses the 'last' VLC table. */
2121 i = scantable[last];
2123 level = temp[i] + 64;
2127 if ((level & (~127)) == 0) {
2128 bits += last_length[UNI_AC_ENC_INDEX(run, level)];
/* Reconstruct and measure the distortion against the original. */
2135 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2137 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2140 s->dsp.idct_add(lsrc2, 8, temp);
2142 distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2144 return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
/* Bit-cost metric: like rd8x8_c but returns only the estimated VLC bit
 * count of the quantized difference block, with no reconstruction or
 * distortion term. */
2150 const uint8_t *scantable = s->intra_scantable.permutated;
2151 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2152 int i, last, run, bits, level, start_i;
2153 const int esc_length = s->ac_esc_length;
2154 uint8_t *length, *last_length;
2158 s->dsp.diff_pixels(temp, src1, src2, stride);
2160 s->block_last_index[0 /* FIXME */] =
2162 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
/* Table selection: intra vs inter (condition not fully visible). */
2168 length = s->intra_ac_vlc_length;
2169 last_length = s->intra_ac_vlc_last_length;
2170 bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2173 length = s->inter_ac_vlc_length;
2174 last_length = s->inter_ac_vlc_last_length;
2177 if (last >= start_i) {
2179 for (i = start_i; i < last; i++) {
2180 int j = scantable[i];
2185 if ((level & (~127)) == 0)
2186 bits += length[UNI_AC_ENC_INDEX(run, level)];
2193 i = scantable[last];
2195 level = temp[i] + 64;
2199 if ((level & (~127)) == 0)
2200 bits += last_length[UNI_AC_ENC_INDEX(run, level)];
/* VSAD_INTRA(size): intra vertical-activity SAD — sums |s[x] - s[x+stride]|
 * between consecutive rows of a single block, 4 pixels per step.
 * (No interior comments: every line below is a macro continuation.) */
2208 #define VSAD_INTRA(size) \
2209 static int vsad_intra ## size ## _c(MpegEncContext *c, \
2210 uint8_t *s, uint8_t *dummy, \
2211 int stride, int h) \
2213 int score = 0, x, y; \
2215 for (y = 1; y < h; y++) { \
2216 for (x = 0; x < size; x += 4) { \
2217 score += FFABS(s[x] - s[x + stride]) + \
2218 FFABS(s[x + 1] - s[x + stride + 1]) + \
2219 FFABS(s[x + 2] - s[x + 2 + stride]) + \
2220 FFABS(s[x + 3] - s[x + 3 + stride]); \
/* Inter vertical SAD: vertical-gradient difference between the s1-s2
 * error signal of consecutive rows, over a 16-wide block. */
2230 static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2233 int score = 0, x, y;
2235 for (y = 1; y < h; y++) {
2236 for (x = 0; x < 16; x++)
2237 score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
/* SQ squares its argument; VSSE_INTRA(size) is the squared-error
 * (variance-style) counterpart of VSAD_INTRA over consecutive rows.
 * (No interior comments: the macro lines below are continuations.) */
2245 #define SQ(a) ((a) * (a))
2246 #define VSSE_INTRA(size) \
2247 static int vsse_intra ## size ## _c(MpegEncContext *c, \
2248 uint8_t *s, uint8_t *dummy, \
2249 int stride, int h) \
2251 int score = 0, x, y; \
2253 for (y = 1; y < h; y++) { \
2254 for (x = 0; x < size; x += 4) { \
2255 score += SQ(s[x] - s[x + stride]) + \
2256 SQ(s[x + 1] - s[x + stride + 1]) + \
2257 SQ(s[x + 2] - s[x + stride + 2]) + \
2258 SQ(s[x + 3] - s[x + stride + 3]); \
/* Inter vertical SSE: squared vertical-gradient difference of the
 * s1-s2 error signal, 16-wide. */
2268 static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2271 int score = 0, x, y;
2273 for (y = 1; y < h; y++) {
2274 for (x = 0; x < 16; x++)
2275 score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
/**
 * Sum of squared differences between an int8 array and an int16 array.
 * Used by the SVQ1 encoder to score codebook candidates.
 *
 * @param pix1 first operand (signed 8-bit samples)
 * @param pix2 second operand (signed 16-bit samples)
 * @param size number of elements to compare
 * @return sum over i of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int i;
    int score = 0;

    for (i = 0; i < size; i++) {
        /* Widen to int before squaring; max |d| fits easily in int. */
        const int d = pix1[i] - pix2[i];
        score += d * d;
    }
    return score;
}
/* WRAPPER8_16_SQ(name8, name16): build a 16x16 compare function from
 * four calls to the 8x8 version (two side-by-side 8x8 halves per
 * 8-row band). (No interior comments: continuation lines.) */
2293 #define WRAPPER8_16_SQ(name8, name16) \
2294 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \
2295 int stride, int h) \
2299 score += name8(s, dst, src, stride, 8); \
2300 score += name8(s, dst + 8, src + 8, stride, 8); \
2302 dst += 8 * stride; \
2303 src += 8 * stride; \
2304 score += name8(s, dst, src, stride, 8); \
2305 score += name8(s, dst + 8, src + 8, stride, 8); \
/* Instantiate the 16x16 wrappers for every 8x8 metric above. */
2310 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2311 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2312 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2314 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2316 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2317 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2318 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2319 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/**
 * Clamp one IEEE-754 float, operating on raw bit patterns with unsigned
 * compares. Intended for the case where min < 0 < max (see
 * vector_clipf_c_opposite_sign): 'mini' is the bit pattern of the
 * negative bound, 'maxi' of the positive bound, and 'maxisign' is maxi
 * with its sign bit flipped.
 *
 * @return the clamped bit pattern.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    /* Negative values compare above 'mini' as unsigned ints; the more
     * negative the float, the larger its bit pattern. */
    if (a > mini)
        return mini;
    /* Flipping the sign bit orders positive values after negatives, so
     * this catches positives above the upper bound. */
    if ((a ^ (1U << 31)) > maxisign)
        return maxi;
    return a;
}
/* Clamp a float vector to [*min, *max] when the bounds straddle zero,
 * by reinterpreting the floats as uint32 bit patterns and delegating to
 * clipf_c_one, 8 elements per iteration (len is presumably a multiple
 * of 8 — not enforced here). */
2332 static void vector_clipf_c_opposite_sign(float *dst, const float *src,
2333 float *min, float *max, int len)
2336 uint32_t mini = *(uint32_t *) min;
2337 uint32_t maxi = *(uint32_t *) max;
2338 uint32_t maxisign = maxi ^ (1U << 31);
2339 uint32_t *dsti = (uint32_t *) dst;
2340 const uint32_t *srci = (const uint32_t *) src;
2342 for (i = 0; i < len; i += 8) {
2343 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2344 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2345 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2346 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2347 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2348 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2349 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2350 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clamp a float vector to [min, max], 8 elements per iteration.
 * When the bounds straddle zero the bit-pattern fast path is used;
 * otherwise av_clipf is applied directly.
 * NOTE(review): the "else" joining the two paths is not visible in
 * this excerpt. */
2354 static void vector_clipf_c(float *dst, const float *src,
2355 float min, float max, int len)
2359 if (min < 0 && max > 0) {
2360 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2362 for (i = 0; i < len; i += 8) {
2363 dst[i] = av_clipf(src[i], min, max);
2364 dst[i + 1] = av_clipf(src[i + 1], min, max);
2365 dst[i + 2] = av_clipf(src[i + 2], min, max);
2366 dst[i + 3] = av_clipf(src[i + 3], min, max);
2367 dst[i + 4] = av_clipf(src[i + 4], min, max);
2368 dst[i + 5] = av_clipf(src[i + 5], min, max);
2369 dst[i + 6] = av_clipf(src[i + 6], min, max);
2370 dst[i + 7] = av_clipf(src[i + 7], min, max);
/**
 * Dot product of two int16 vectors.
 *
 * @param v1    first vector
 * @param v2    second vector
 * @param order number of elements
 * @return sum over i of v1[i] * v2[i] (accumulated in 32 bits)
 */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
{
    int32_t acc = 0;
    int i;

    for (i = 0; i < order; i++)
        acc += v1[i] * v2[i];
    return acc;
}
/**
 * Fused dot product and multiply-accumulate (used by the Opus/CELT and
 * Vorbis-style postfilters): returns the dot product of v1 and v2,
 * while updating v1 in place as v1[i] += mul * v3[i].
 *
 * Note the ordering: each element's contribution to the dot product is
 * taken from v1 *before* it is updated.
 *
 * @param v1    vector, updated in place
 * @param v2    vector multiplied against the original v1
 * @param v3    vector scaled by mul and added into v1
 * @param order number of elements
 * @param mul   scale factor for v3
 * @return sum over i of (original v1[i]) * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    int32_t acc = 0;
    int i;

    for (i = 0; i < order; i++) {
        acc   += v1[i] * v2[i];   /* read v1 first ... */
        v1[i] += mul * v3[i];     /* ... then update it */
    }
    return acc;
}
/* Clamp an int32 vector to [min, max], 8 elements per step via av_clip
 * (len is presumably a nonzero multiple of 8).
 * NOTE(review): the loop construct wrapping these eight statements is
 * not visible in this excerpt. */
2399 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2400 int32_t max, unsigned int len)
2403 *dst++ = av_clip(*src++, min, max);
2404 *dst++ = av_clip(*src++, min, max);
2405 *dst++ = av_clip(*src++, min, max);
2406 *dst++ = av_clip(*src++, min, max);
2407 *dst++ = av_clip(*src++, min, max);
2408 *dst++ = av_clip(*src++, min, max);
2409 *dst++ = av_clip(*src++, min, max);
2410 *dst++ = av_clip(*src++, min, max);
/* JPEG-reference IDCT followed by a clamped store of the result. */
2415 static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2417 ff_j_rev_dct(block);
2418 put_pixels_clamped_c(block, dest, line_size);
/* JPEG-reference IDCT followed by a clamped add onto the prediction. */
2421 static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2423 ff_j_rev_dct(block);
2424 add_pixels_clamped_c(block, dest, line_size);
2427 /* init static data */
/* Build ff_square_tab: squares of i - 256 for i in [0, 512), i.e. a
 * biased square lookup for differences in [-256, 255]. */
2428 av_cold void ff_dsputil_static_init(void)
2432 for (i = 0; i < 512; i++)
2433 ff_square_tab[i] = (i - 256) * (i - 256);
2436 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2438 const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
2441 if (avctx->bits_per_raw_sample == 10) {
2442 c->fdct = ff_jpeg_fdct_islow_10;
2443 c->fdct248 = ff_fdct248_islow_10;
2445 if (avctx->dct_algo == FF_DCT_FASTINT) {
2446 c->fdct = ff_fdct_ifast;
2447 c->fdct248 = ff_fdct_ifast248;
2448 } else if (avctx->dct_algo == FF_DCT_FAAN) {
2449 c->fdct = ff_faandct;
2450 c->fdct248 = ff_faandct248;
2452 c->fdct = ff_jpeg_fdct_islow_8; // slow/accurate/default
2453 c->fdct248 = ff_fdct248_islow_8;
2456 #endif /* CONFIG_ENCODERS */
2458 if (avctx->bits_per_raw_sample == 10) {
2459 c->idct_put = ff_simple_idct_put_10;
2460 c->idct_add = ff_simple_idct_add_10;
2461 c->idct = ff_simple_idct_10;
2462 c->idct_permutation_type = FF_NO_IDCT_PERM;
2464 if (avctx->idct_algo == FF_IDCT_INT) {
2465 c->idct_put = jref_idct_put;
2466 c->idct_add = jref_idct_add;
2467 c->idct = ff_j_rev_dct;
2468 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2469 } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2470 c->idct_put = ff_faanidct_put;
2471 c->idct_add = ff_faanidct_add;
2472 c->idct = ff_faanidct;
2473 c->idct_permutation_type = FF_NO_IDCT_PERM;
2474 } else { // accurate/default
2475 c->idct_put = ff_simple_idct_put_8;
2476 c->idct_add = ff_simple_idct_add_8;
2477 c->idct = ff_simple_idct_8;
2478 c->idct_permutation_type = FF_NO_IDCT_PERM;
2482 c->diff_pixels = diff_pixels_c;
2484 c->put_pixels_clamped = put_pixels_clamped_c;
2485 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2486 c->add_pixels_clamped = add_pixels_clamped_c;
2488 c->sum_abs_dctelem = sum_abs_dctelem_c;
2493 c->pix_sum = pix_sum_c;
2494 c->pix_norm1 = pix_norm1_c;
2496 c->fill_block_tab[0] = fill_block16_c;
2497 c->fill_block_tab[1] = fill_block8_c;
2499 /* TODO [0] 16 [1] 8 */
2500 c->pix_abs[0][0] = pix_abs16_c;
2501 c->pix_abs[0][1] = pix_abs16_x2_c;
2502 c->pix_abs[0][2] = pix_abs16_y2_c;
2503 c->pix_abs[0][3] = pix_abs16_xy2_c;
2504 c->pix_abs[1][0] = pix_abs8_c;
2505 c->pix_abs[1][1] = pix_abs8_x2_c;
2506 c->pix_abs[1][2] = pix_abs8_y2_c;
2507 c->pix_abs[1][3] = pix_abs8_xy2_c;
2509 #define dspfunc(PFX, IDX, NUM) \
2510 c->PFX ## _pixels_tab[IDX][0] = PFX ## NUM ## _mc00_c; \
2511 c->PFX ## _pixels_tab[IDX][1] = PFX ## NUM ## _mc10_c; \
2512 c->PFX ## _pixels_tab[IDX][2] = PFX ## NUM ## _mc20_c; \
2513 c->PFX ## _pixels_tab[IDX][3] = PFX ## NUM ## _mc30_c; \
2514 c->PFX ## _pixels_tab[IDX][4] = PFX ## NUM ## _mc01_c; \
2515 c->PFX ## _pixels_tab[IDX][5] = PFX ## NUM ## _mc11_c; \
2516 c->PFX ## _pixels_tab[IDX][6] = PFX ## NUM ## _mc21_c; \
2517 c->PFX ## _pixels_tab[IDX][7] = PFX ## NUM ## _mc31_c; \
2518 c->PFX ## _pixels_tab[IDX][8] = PFX ## NUM ## _mc02_c; \
2519 c->PFX ## _pixels_tab[IDX][9] = PFX ## NUM ## _mc12_c; \
2520 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2521 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2522 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2523 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2524 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2525 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2527 dspfunc(put_qpel, 0, 16);
2528 dspfunc(put_qpel, 1, 8);
2530 dspfunc(put_no_rnd_qpel, 0, 16);
2531 dspfunc(put_no_rnd_qpel, 1, 8);
2533 dspfunc(avg_qpel, 0, 16);
2534 dspfunc(avg_qpel, 1, 8);
2538 c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
2539 c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
2540 c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
2541 c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
2542 c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
2543 c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
2544 c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
2545 c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
2547 #define SET_CMP_FUNC(name) \
2548 c->name[0] = name ## 16_c; \
2549 c->name[1] = name ## 8x8_c;
2551 SET_CMP_FUNC(hadamard8_diff)
2552 c->hadamard8_diff[4] = hadamard8_intra16_c;
2553 c->hadamard8_diff[5] = hadamard8_intra8x8_c;
2554 SET_CMP_FUNC(dct_sad)
2555 SET_CMP_FUNC(dct_max)
2557 SET_CMP_FUNC(dct264_sad)
2559 c->sad[0] = pix_abs16_c;
2560 c->sad[1] = pix_abs8_c;
2561 c->sse[0] = sse16_c;
2564 SET_CMP_FUNC(quant_psnr)
2567 c->vsad[0] = vsad16_c;
2568 c->vsad[4] = vsad_intra16_c;
2569 c->vsad[5] = vsad_intra8_c;
2570 c->vsse[0] = vsse16_c;
2571 c->vsse[4] = vsse_intra16_c;
2572 c->vsse[5] = vsse_intra8_c;
2573 c->nsse[0] = nsse16_c;
2574 c->nsse[1] = nsse8_c;
2576 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2578 c->add_bytes = add_bytes_c;
2579 c->add_hfyu_median_prediction = add_hfyu_median_prediction_c;
2580 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2581 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2583 c->diff_bytes = diff_bytes_c;
2584 c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
2586 c->bswap_buf = bswap_buf;
2587 c->bswap16_buf = bswap16_buf;
2589 c->try_8x8basis = try_8x8basis_c;
2590 c->add_8x8basis = add_8x8basis_c;
2592 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2594 c->scalarproduct_int16 = scalarproduct_int16_c;
2595 c->vector_clip_int32 = vector_clip_int32_c;
2596 c->vector_clipf = vector_clipf_c;
2598 c->shrink[0] = av_image_copy_plane;
2599 c->shrink[1] = ff_shrink22;
2600 c->shrink[2] = ff_shrink44;
2601 c->shrink[3] = ff_shrink88;
2603 c->add_pixels8 = add_pixels8_c;
2607 #define FUNC(f, depth) f ## _ ## depth
2608 #define FUNCC(f, depth) f ## _ ## depth ## _c
2610 c->draw_edges = FUNCC(draw_edges, 8);
2612 c->clear_block = FUNCC(clear_block, 8);
2613 c->clear_blocks = FUNCC(clear_blocks, 8);
2615 #define BIT_DEPTH_FUNCS(depth) \
2616 c->get_pixels = FUNCC(get_pixels, depth);
2618 switch (avctx->bits_per_raw_sample) {
2621 BIT_DEPTH_FUNCS(16);
2629 ff_dsputil_init_arm(c, avctx, high_bit_depth);
2631 ff_dsputil_init_bfin(c, avctx, high_bit_depth);
2633 ff_dsputil_init_ppc(c, avctx, high_bit_depth);
2635 ff_dsputil_init_x86(c, avctx, high_bit_depth);
2637 ff_init_scantable_permutation(c->idct_permutation,
2638 c->idct_permutation_type);