3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
33 #include "copy_block.h"
36 #include "simple_idct.h"
39 #include "imgconvert.h"
41 #include "mpegvideo.h"
/* Squared-value lookup table, filled at init time; indexed as
 * ff_square_tab[256 + x] == x*x for x in [-256, 255]. */
uint32_t ff_square_tab[512] = { 0, };
47 #include "dsputilenc_template.c"
51 #include "hpel_template.c"
52 #include "tpel_template.c"
53 #include "dsputil_template.c"
54 #include "dsputilenc_template.c"
56 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
57 #define pb_7f (~0UL / 255 * 0x7f)
58 #define pb_80 (~0UL / 255 * 0x80)
60 /* Specific zigzag scan for 248 idct. NOTE that unlike the
61 * specification, we interleave the fields */
/* Specific zigzag scan for 248 idct. NOTE that unlike the
 * specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* Alternate horizontal scan order for interlaced macroblocks. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order for interlaced macroblocks. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Per-row element interleave used by the SSE2 IDCT permutation. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
109 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
110 const uint8_t *src_scantable)
114 st->scantable = src_scantable;
116 for (i = 0; i < 64; i++) {
117 int j = src_scantable[i];
118 st->permutated[i] = permutation[j];
122 for (i = 0; i < 64; i++) {
123 int j = st->permutated[i];
126 st->raster_end[i] = end;
130 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
131 int idct_permutation_type)
135 switch (idct_permutation_type) {
136 case FF_NO_IDCT_PERM:
137 for (i = 0; i < 64; i++)
138 idct_permutation[i] = i;
140 case FF_LIBMPEG2_IDCT_PERM:
141 for (i = 0; i < 64; i++)
142 idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
144 case FF_SIMPLE_IDCT_PERM:
145 for (i = 0; i < 64; i++)
146 idct_permutation[i] = simple_mmx_permutation[i];
148 case FF_TRANSPOSE_IDCT_PERM:
149 for (i = 0; i < 64; i++)
150 idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
152 case FF_PARTTRANS_IDCT_PERM:
153 for (i = 0; i < 64; i++)
154 idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
156 case FF_SSE2_IDCT_PERM:
157 for (i = 0; i < 64; i++)
158 idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
161 av_log(NULL, AV_LOG_ERROR,
162 "Internal error, IDCT permutation not set\n");
/**
 * Sum all 256 pixels of a 16x16 block.
 *
 * @param pix       top-left pixel of the block
 * @param line_size stride between rows in bytes
 * @return sum of the pixel values
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s   += pix[0];
            s   += pix[1];
            s   += pix[2];
            s   += pix[3];
            s   += pix[4];
            s   += pix[5];
            s   += pix[6];
            s   += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
187 static int pix_norm1_c(uint8_t *pix, int line_size)
190 uint32_t *sq = ff_square_tab + 256;
192 for (i = 0; i < 16; i++) {
193 for (j = 0; j < 16; j += 8) {
205 register uint64_t x = *(uint64_t *) pix;
207 s += sq[(x >> 8) & 0xff];
208 s += sq[(x >> 16) & 0xff];
209 s += sq[(x >> 24) & 0xff];
210 s += sq[(x >> 32) & 0xff];
211 s += sq[(x >> 40) & 0xff];
212 s += sq[(x >> 48) & 0xff];
213 s += sq[(x >> 56) & 0xff];
215 register uint32_t x = *(uint32_t *) pix;
217 s += sq[(x >> 8) & 0xff];
218 s += sq[(x >> 16) & 0xff];
219 s += sq[(x >> 24) & 0xff];
220 x = *(uint32_t *) (pix + 4);
222 s += sq[(x >> 8) & 0xff];
223 s += sq[(x >> 16) & 0xff];
224 s += sq[(x >> 24) & 0xff];
229 pix += line_size - 16;
/**
 * Byte-swap a buffer of 32-bit words (src and dst may be the same).
 * Main loop is unrolled by 8; a tail loop handles the remainder.
 *
 * @param dst output buffer
 * @param src input buffer
 * @param w   number of 32-bit words
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int i;

    for (i = 0; i + 8 <= w; i += 8) {
        dst[i + 0] = av_bswap32(src[i + 0]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    for (; i < w; i++)
        dst[i + 0] = av_bswap32(src[i + 0]);
}
/**
 * Byte-swap a buffer of 16-bit words.
 *
 * @param dst output buffer
 * @param src input buffer
 * @param len number of 16-bit words
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
258 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
259 int line_size, int h)
262 uint32_t *sq = ff_square_tab + 256;
264 for (i = 0; i < h; i++) {
265 s += sq[pix1[0] - pix2[0]];
266 s += sq[pix1[1] - pix2[1]];
267 s += sq[pix1[2] - pix2[2]];
268 s += sq[pix1[3] - pix2[3]];
275 static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
276 int line_size, int h)
279 uint32_t *sq = ff_square_tab + 256;
281 for (i = 0; i < h; i++) {
282 s += sq[pix1[0] - pix2[0]];
283 s += sq[pix1[1] - pix2[1]];
284 s += sq[pix1[2] - pix2[2]];
285 s += sq[pix1[3] - pix2[3]];
286 s += sq[pix1[4] - pix2[4]];
287 s += sq[pix1[5] - pix2[5]];
288 s += sq[pix1[6] - pix2[6]];
289 s += sq[pix1[7] - pix2[7]];
296 static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
297 int line_size, int h)
300 uint32_t *sq = ff_square_tab + 256;
302 for (i = 0; i < h; i++) {
303 s += sq[pix1[0] - pix2[0]];
304 s += sq[pix1[1] - pix2[1]];
305 s += sq[pix1[2] - pix2[2]];
306 s += sq[pix1[3] - pix2[3]];
307 s += sq[pix1[4] - pix2[4]];
308 s += sq[pix1[5] - pix2[5]];
309 s += sq[pix1[6] - pix2[6]];
310 s += sq[pix1[7] - pix2[7]];
311 s += sq[pix1[8] - pix2[8]];
312 s += sq[pix1[9] - pix2[9]];
313 s += sq[pix1[10] - pix2[10]];
314 s += sq[pix1[11] - pix2[11]];
315 s += sq[pix1[12] - pix2[12]];
316 s += sq[pix1[13] - pix2[13]];
317 s += sq[pix1[14] - pix2[14]];
318 s += sq[pix1[15] - pix2[15]];
/**
 * Compute the per-pixel difference of two 8x8 pixel blocks into a
 * 64-entry coefficient block (row-major, 8 values per row).
 *
 * @param block  output, 64 int16_t (written contiguously)
 * @param s1     first source block
 * @param s2     second source block
 * @param stride row stride of s1 and s2
 */
static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride)
{
    int i;

    /* read the pixels */
    for (i = 0; i < 8; i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
/**
 * Store a 64-entry coefficient block as 8x8 pixels, clamping each
 * value to the 0..255 range.
 *
 * @param block     input, 64 int16_t (read contiguously)
 * @param pixels    output block
 * @param line_size row stride of the output
 */
static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;

    /* read the pixels */
    for (i = 0; i < 8; i++) {
        pixels[0] = av_clip_uint8(block[0]);
        pixels[1] = av_clip_uint8(block[1]);
        pixels[2] = av_clip_uint8(block[2]);
        pixels[3] = av_clip_uint8(block[3]);
        pixels[4] = av_clip_uint8(block[4]);
        pixels[5] = av_clip_uint8(block[5]);
        pixels[6] = av_clip_uint8(block[6]);
        pixels[7] = av_clip_uint8(block[7]);

        pixels += line_size;
        block  += 8;
    }
}
/**
 * Store a 64-entry signed coefficient block as 8x8 pixels, mapping
 * the value range [-128, 127] to [0, 255] with saturation.
 *
 * @param block     input, 64 int16_t (read contiguously)
 * @param pixels    output block
 * @param line_size row stride of the output
 */
static void put_signed_pixels_clamped_c(const int16_t *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t) (*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}
/**
 * Add a 64-entry coefficient block to an 8x8 pixel block in place.
 * Addition wraps modulo 256 (no clamping).
 *
 * @param pixels    pixel block updated in place
 * @param block     input, 64 int16_t (read contiguously)
 * @param line_size row stride of the pixel block
 */
static void add_pixels8_c(uint8_t *restrict pixels, int16_t *block,
                          int line_size)
{
    int i;

    for (i = 0; i < 8; i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels    += line_size;
        block     += 8;
    }
}
/**
 * Add a 64-entry coefficient block to an 8x8 pixel block in place,
 * clamping each result to the 0..255 range.
 *
 * @param block     input, 64 int16_t (read contiguously)
 * @param pixels    pixel block updated in place
 * @param line_size row stride of the pixel block
 */
static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;

    /* read the pixels */
    for (i = 0; i < 8; i++) {
        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
        pixels[4] = av_clip_uint8(pixels[4] + block[4]);
        pixels[5] = av_clip_uint8(pixels[5] + block[5]);
        pixels[6] = av_clip_uint8(pixels[6] + block[6]);
        pixels[7] = av_clip_uint8(pixels[7] + block[7]);
        pixels   += line_size;
        block    += 8;
    }
}
/**
 * Sum of absolute values of all 64 coefficients of a DCT block.
 *
 * @param block 64 int16_t coefficients
 * @return sum of |block[i]|
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    int sum = 0, i;

    for (i = 0; i < 64; i++)
        sum += FFABS(block[i]);
    return sum;
}
/**
 * Fill a 16-pixel-wide block with a constant value.
 *
 * @param block     top-left pixel of the block
 * @param value     fill value
 * @param line_size row stride in bytes
 * @param h         block height
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/**
 * Fill an 8-pixel-wide block with a constant value.
 *
 * @param block     top-left pixel of the block
 * @param value     fill value
 * @param line_size row stride in bytes
 * @param h         block height
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
457 #define avg2(a, b) ((a + b + 1) >> 1)
458 #define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)
/**
 * One-warp-point global motion compensation: bilinear interpolation of
 * an 8-pixel-wide block with 1/16-pel fractional offsets.
 *
 * @param dst     output block
 * @param src     source block (reads one extra row/column)
 * @param stride  row stride for src and dst
 * @param h       block height
 * @param x16     horizontal fractional offset, 0..15
 * @param y16     vertical fractional offset, 0..15
 * @param rounder rounding constant added before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (x16)      * (16 - y16);
    const int C = (16 - x16) * (y16);
    const int D = (x16)      * (y16);
    int i;

    for (i = 0; i < h; i++) {
        dst[0] = (A * src[0] + B * src[1] + C * src[stride + 0] + D * src[stride + 1] + rounder) >> 8;
        dst[1] = (A * src[1] + B * src[2] + C * src[stride + 1] + D * src[stride + 2] + rounder) >> 8;
        dst[2] = (A * src[2] + B * src[3] + C * src[stride + 2] + D * src[stride + 3] + rounder) >> 8;
        dst[3] = (A * src[3] + B * src[4] + C * src[stride + 3] + D * src[stride + 4] + rounder) >> 8;
        dst[4] = (A * src[4] + B * src[5] + C * src[stride + 4] + D * src[stride + 5] + rounder) >> 8;
        dst[5] = (A * src[5] + B * src[6] + C * src[stride + 5] + D * src[stride + 6] + rounder) >> 8;
        dst[6] = (A * src[6] + B * src[7] + C * src[stride + 6] + D * src[stride + 7] + rounder) >> 8;
        dst[7] = (A * src[7] + B * src[8] + C * src[stride + 7] + D * src[stride + 8] + rounder) >> 8;
        dst   += stride;
        src   += stride;
    }
}
/**
 * Global motion compensation with an affine warp: for each destination
 * pixel of an 8-wide block, compute the warped source coordinate in
 * 16.16 fixed point and bilinearly interpolate, clamping coordinates
 * that fall outside the source to the border.
 *
 * @param dst      destination (written at dst[y*stride + x])
 * @param src      source image
 * @param stride   row stride for src and dst
 * @param h        block height
 * @param ox, oy   warped coordinate of the top-left pixel (16.16)
 * @param dxx, dxy horizontal coordinate increments per x / per y
 * @param dyx, dyy vertical coordinate increments per x / per y
 * @param shift    fractional precision of the interpolation (s = 1<<shift)
 * @param r        rounding constant added before the >> (shift*2)
 * @param width    source width  (exclusive bound before decrement)
 * @param height   source height (exclusive bound before decrement)
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
{
    int y, vx, vy;
    const int s = 1 << shift;

    width--;
    height--;

    for (y = 0; y < h; y++) {
        int x;

        vx = ox;
        vy = oy;
        for (x = 0; x < 8; x++) { // FIXME: optimize
            int index;
            int src_x  = vx >> 16;
            int src_y  = vy >> 16;
            int frac_x = src_x & (s - 1);
            int frac_y = src_y & (s - 1);

            src_x >>= shift;
            src_y >>= shift;

            if ((unsigned) src_x < width) {
                if ((unsigned) src_y < height) {
                    /* fully inside: bilinear interpolation of 4 pixels */
                    index = src_x + src_y * stride;
                    dst[y * stride + x] =
                        ((src[index]              * (s - frac_x) +
                          src[index + 1]          * frac_x) * (s - frac_y) +
                         (src[index + stride]     * (s - frac_x) +
                          src[index + stride + 1] * frac_x) * frac_y +
                         r) >> (shift * 2);
                } else {
                    /* vertical out of range: interpolate horizontally only */
                    index = src_x + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] =
                        ((src[index]     * (s - frac_x) +
                          src[index + 1] * frac_x) * s +
                         r) >> (shift * 2);
                }
            } else {
                if ((unsigned) src_y < height) {
                    /* horizontal out of range: interpolate vertically only */
                    index = av_clip(src_x, 0, width) + src_y * stride;
                    dst[y * stride + x] =
                        ((src[index]          * (s - frac_y) +
                          src[index + stride] * frac_y) * s +
                         r) >> (shift * 2);
                } else {
                    /* both out of range: nearest clamped pixel */
                    index = av_clip(src_x, 0, width) +
                            av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = src[index];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
546 #define QPEL_MC(r, OPNAME, RND, OP) \
547 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, \
548 int dstStride, int srcStride, \
551 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
554 for (i = 0; i < h; i++) { \
555 OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
556 OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
557 OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
558 OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
559 OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
560 OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
561 OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
562 OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
568 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, \
569 int dstStride, int srcStride) \
571 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
575 for (i = 0; i < w; i++) { \
576 const int src0 = src[0 * srcStride]; \
577 const int src1 = src[1 * srcStride]; \
578 const int src2 = src[2 * srcStride]; \
579 const int src3 = src[3 * srcStride]; \
580 const int src4 = src[4 * srcStride]; \
581 const int src5 = src[5 * srcStride]; \
582 const int src6 = src[6 * srcStride]; \
583 const int src7 = src[7 * srcStride]; \
584 const int src8 = src[8 * srcStride]; \
585 OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
586 OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
587 OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
588 OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
589 OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
590 OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
591 OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
592 OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
598 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, \
599 int dstStride, int srcStride, \
602 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
605 for (i = 0; i < h; i++) { \
606 OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
607 OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
608 OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
609 OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
610 OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
611 OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[9])); \
612 OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[9]) * 3 - (src[3] + src[10])); \
613 OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[9]) * 6 + (src[5] + src[10]) * 3 - (src[4] + src[11])); \
614 OP(dst[8], (src[8] + src[9]) * 20 - (src[7] + src[10]) * 6 + (src[6] + src[11]) * 3 - (src[5] + src[12])); \
615 OP(dst[9], (src[9] + src[10]) * 20 - (src[8] + src[11]) * 6 + (src[7] + src[12]) * 3 - (src[6] + src[13])); \
616 OP(dst[10], (src[10] + src[11]) * 20 - (src[9] + src[12]) * 6 + (src[8] + src[13]) * 3 - (src[7] + src[14])); \
617 OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9] + src[14]) * 3 - (src[8] + src[15])); \
618 OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9] + src[16])); \
619 OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
620 OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
621 OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
627 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, \
628 int dstStride, int srcStride) \
630 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
634 for (i = 0; i < w; i++) { \
635 const int src0 = src[0 * srcStride]; \
636 const int src1 = src[1 * srcStride]; \
637 const int src2 = src[2 * srcStride]; \
638 const int src3 = src[3 * srcStride]; \
639 const int src4 = src[4 * srcStride]; \
640 const int src5 = src[5 * srcStride]; \
641 const int src6 = src[6 * srcStride]; \
642 const int src7 = src[7 * srcStride]; \
643 const int src8 = src[8 * srcStride]; \
644 const int src9 = src[9 * srcStride]; \
645 const int src10 = src[10 * srcStride]; \
646 const int src11 = src[11 * srcStride]; \
647 const int src12 = src[12 * srcStride]; \
648 const int src13 = src[13 * srcStride]; \
649 const int src14 = src[14 * srcStride]; \
650 const int src15 = src[15 * srcStride]; \
651 const int src16 = src[16 * srcStride]; \
652 OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
653 OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
654 OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
655 OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
656 OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
657 OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src9)); \
658 OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src9) * 3 - (src3 + src10)); \
659 OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src9) * 6 + (src5 + src10) * 3 - (src4 + src11)); \
660 OP(dst[8 * dstStride], (src8 + src9) * 20 - (src7 + src10) * 6 + (src6 + src11) * 3 - (src5 + src12)); \
661 OP(dst[9 * dstStride], (src9 + src10) * 20 - (src8 + src11) * 6 + (src7 + src12) * 3 - (src6 + src13)); \
662 OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9 + src12) * 6 + (src8 + src13) * 3 - (src7 + src14)); \
663 OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9 + src14) * 3 - (src8 + src15)); \
664 OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9 + src16)); \
665 OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
666 OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
667 OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
673 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, \
678 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
679 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8); \
682 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, \
685 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8); \
688 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, \
693 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
694 OPNAME ## pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8); \
697 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, \
700 uint8_t full[16 * 9]; \
703 copy_block9(full, src, 16, stride, 9); \
704 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
705 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8); \
708 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, \
711 uint8_t full[16 * 9]; \
713 copy_block9(full, src, 16, stride, 9); \
714 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16); \
717 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, \
720 uint8_t full[16 * 9]; \
723 copy_block9(full, src, 16, stride, 9); \
724 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
725 OPNAME ## pixels8_l2_8(dst, full + 16, half, stride, 16, 8, 8); \
728 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, \
731 uint8_t full[16 * 9]; \
734 uint8_t halfHV[64]; \
736 copy_block9(full, src, 16, stride, 9); \
737 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
738 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
739 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
740 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, \
741 stride, 16, 8, 8, 8, 8); \
744 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, \
747 uint8_t full[16 * 9]; \
749 uint8_t halfHV[64]; \
751 copy_block9(full, src, 16, stride, 9); \
752 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
753 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
754 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
755 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
758 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, \
761 uint8_t full[16 * 9]; \
764 uint8_t halfHV[64]; \
766 copy_block9(full, src, 16, stride, 9); \
767 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
768 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
769 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
770 OPNAME ## pixels8_l4_8(dst, full + 1, halfH, halfV, halfHV, \
771 stride, 16, 8, 8, 8, 8); \
774 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, \
777 uint8_t full[16 * 9]; \
779 uint8_t halfHV[64]; \
781 copy_block9(full, src, 16, stride, 9); \
782 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
783 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
784 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
785 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
788 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, \
791 uint8_t full[16 * 9]; \
794 uint8_t halfHV[64]; \
796 copy_block9(full, src, 16, stride, 9); \
797 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
798 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
799 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
800 OPNAME ## pixels8_l4_8(dst, full + 16, halfH + 8, halfV, halfHV, \
801 stride, 16, 8, 8, 8, 8); \
804 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, \
807 uint8_t full[16 * 9]; \
809 uint8_t halfHV[64]; \
811 copy_block9(full, src, 16, stride, 9); \
812 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
813 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
814 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
815 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
818 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, \
821 uint8_t full[16 * 9]; \
824 uint8_t halfHV[64]; \
826 copy_block9(full, src, 16, stride, 9); \
827 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
828 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
829 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
830 OPNAME ## pixels8_l4_8(dst, full + 17, halfH + 8, halfV, halfHV, \
831 stride, 16, 8, 8, 8, 8); \
834 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, \
837 uint8_t full[16 * 9]; \
839 uint8_t halfHV[64]; \
841 copy_block9(full, src, 16, stride, 9); \
842 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
843 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
844 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
845 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
848 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, \
852 uint8_t halfHV[64]; \
854 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
855 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
856 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
859 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, \
863 uint8_t halfHV[64]; \
865 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
866 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
867 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
870 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, \
873 uint8_t full[16 * 9]; \
876 uint8_t halfHV[64]; \
878 copy_block9(full, src, 16, stride, 9); \
879 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
880 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
881 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
882 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); \
885 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, \
888 uint8_t full[16 * 9]; \
891 copy_block9(full, src, 16, stride, 9); \
892 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
893 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
894 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
897 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, \
900 uint8_t full[16 * 9]; \
903 uint8_t halfHV[64]; \
905 copy_block9(full, src, 16, stride, 9); \
906 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
907 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
908 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
909 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); \
912 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, \
915 uint8_t full[16 * 9]; \
918 copy_block9(full, src, 16, stride, 9); \
919 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
920 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
921 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
924 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, \
929 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
930 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
933 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, \
938 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
939 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16); \
942 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, \
945 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16); \
948 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, \
953 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
954 OPNAME ## pixels16_l2_8(dst, src + 1, half, stride, stride, 16, 16); \
957 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, \
960 uint8_t full[24 * 17]; \
963 copy_block17(full, src, 24, stride, 17); \
964 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
965 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16); \
968 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, \
971 uint8_t full[24 * 17]; \
973 copy_block17(full, src, 24, stride, 17); \
974 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24); \
977 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, \
980 uint8_t full[24 * 17]; \
983 copy_block17(full, src, 24, stride, 17); \
984 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
985 OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16); \
988 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, \
991 uint8_t full[24 * 17]; \
992 uint8_t halfH[272]; \
993 uint8_t halfV[256]; \
994 uint8_t halfHV[256]; \
996 copy_block17(full, src, 24, stride, 17); \
997 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
998 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
999 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1000 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, \
1001 stride, 24, 16, 16, 16, 16); \
1004 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, \
1007 uint8_t full[24 * 17]; \
1008 uint8_t halfH[272]; \
1009 uint8_t halfHV[256]; \
1011 copy_block17(full, src, 24, stride, 17); \
1012 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1013 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1014 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1015 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1018 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, \
1021 uint8_t full[24 * 17]; \
1022 uint8_t halfH[272]; \
1023 uint8_t halfV[256]; \
1024 uint8_t halfHV[256]; \
1026 copy_block17(full, src, 24, stride, 17); \
1027 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1028 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1029 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1030 OPNAME ## pixels16_l4_8(dst, full + 1, halfH, halfV, halfHV, \
1031 stride, 24, 16, 16, 16, 16); \
1034 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, \
1037 uint8_t full[24 * 17]; \
1038 uint8_t halfH[272]; \
1039 uint8_t halfHV[256]; \
1041 copy_block17(full, src, 24, stride, 17); \
1042 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1043 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1044 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1045 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1048 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, \
1051 uint8_t full[24 * 17]; \
1052 uint8_t halfH[272]; \
1053 uint8_t halfV[256]; \
1054 uint8_t halfHV[256]; \
1056 copy_block17(full, src, 24, stride, 17); \
1057 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1058 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1059 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1060 OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV, \
1061 stride, 24, 16, 16, 16, 16); \
1064 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, \
1067 uint8_t full[24 * 17]; \
1068 uint8_t halfH[272]; \
1069 uint8_t halfHV[256]; \
1071 copy_block17(full, src, 24, stride, 17); \
1072 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1073 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1074 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1075 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1078 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, \
1081 uint8_t full[24 * 17]; \
1082 uint8_t halfH[272]; \
1083 uint8_t halfV[256]; \
1084 uint8_t halfHV[256]; \
1086 copy_block17(full, src, 24, stride, 17); \
1087 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1088 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1089 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1090 OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV, \
1091 stride, 24, 16, 16, 16, 16); \
1094 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, \
1097 uint8_t full[24 * 17]; \
1098 uint8_t halfH[272]; \
1099 uint8_t halfHV[256]; \
1101 copy_block17(full, src, 24, stride, 17); \
1102 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1103 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1104 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1105 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1108 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, \
1111 uint8_t halfH[272]; \
1112 uint8_t halfHV[256]; \
1114 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1115 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1116 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1119 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, \
1122 uint8_t halfH[272]; \
1123 uint8_t halfHV[256]; \
1125 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1126 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1127 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1130 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, \
1133 uint8_t full[24 * 17]; \
1134 uint8_t halfH[272]; \
1135 uint8_t halfV[256]; \
1136 uint8_t halfHV[256]; \
1138 copy_block17(full, src, 24, stride, 17); \
1139 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1140 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1141 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1142 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); \
1145 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, \
1148 uint8_t full[24 * 17]; \
1149 uint8_t halfH[272]; \
1151 copy_block17(full, src, 24, stride, 17); \
1152 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1153 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1154 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
1157 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, \
1160 uint8_t full[24 * 17]; \
1161 uint8_t halfH[272]; \
1162 uint8_t halfV[256]; \
1163 uint8_t halfHV[256]; \
1165 copy_block17(full, src, 24, stride, 17); \
1166 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1167 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1168 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1169 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); \
1172 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, \
1175 uint8_t full[24 * 17]; \
1176 uint8_t halfH[272]; \
1178 copy_block17(full, src, 24, stride, 17); \
1179 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1180 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1181 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
1184 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, \
1187 uint8_t halfH[272]; \
1189 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1190 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
/* Rounding/averaging primitives plugged into QPEL_MC: 'b' is the raw
 * filter sum, cm[] clips it to 0..255 after the +16 (rounding) or
 * +15 (no-round) bias and the >>5 normalization. */
1193 #define op_avg(a, b) a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
1194 #define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5]) >> 1)
1195 #define op_put(a, b) a = cm[((b) + 16) >> 5]
1196 #define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
/* Instantiate the quarter-pel MC function families for put, put_no_rnd
 * and avg using the operators above. */
1198 QPEL_MC(0, put_, _, op_put)
1199 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1200 QPEL_MC(0, avg_, _, op_avg)
/* NOTE(review): the companion #undef lines for op_avg/op_avg_no_rnd/op_put
 * appear to be missing from this dump. */
1204 #undef op_put_no_rnd
/* Fixed-size 8x8 copy: thin wrapper over the variable-height helper. */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}
/* Fixed-size 8x8 average: thin wrapper over the variable-height helper. */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}
/* Fixed-size 16x16 copy: thin wrapper over the variable-height helper. */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}
/* Fixed-size 16x16 average: thin wrapper over the variable-height helper. */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
/* A (0,0) quarter-pel vector is a plain full-pel copy/average, so the
 * mc00 table entries reuse the pixel copy helpers directly. */
1226 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1227 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1228 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1229 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1230 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1231 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1233 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
1234 int dstStride, int srcStride, int h)
1236 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1239 for (i = 0; i < h; i++) {
1240 dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
1241 dst[1] = cm[(9 * (src[1] + src[2]) - (src[0] + src[3]) + 8) >> 4];
1242 dst[2] = cm[(9 * (src[2] + src[3]) - (src[1] + src[4]) + 8) >> 4];
1243 dst[3] = cm[(9 * (src[3] + src[4]) - (src[2] + src[5]) + 8) >> 4];
1244 dst[4] = cm[(9 * (src[4] + src[5]) - (src[3] + src[6]) + 8) >> 4];
1245 dst[5] = cm[(9 * (src[5] + src[6]) - (src[4] + src[7]) + 8) >> 4];
1246 dst[6] = cm[(9 * (src[6] + src[7]) - (src[5] + src[8]) + 8) >> 4];
1247 dst[7] = cm[(9 * (src[7] + src[8]) - (src[6] + src[9]) + 8) >> 4];
1253 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
1254 int dstStride, int srcStride, int w)
1256 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1259 for (i = 0; i < w; i++) {
1260 const int src_1 = src[-srcStride];
1261 const int src0 = src[0];
1262 const int src1 = src[srcStride];
1263 const int src2 = src[2 * srcStride];
1264 const int src3 = src[3 * srcStride];
1265 const int src4 = src[4 * srcStride];
1266 const int src5 = src[5 * srcStride];
1267 const int src6 = src[6 * srcStride];
1268 const int src7 = src[7 * srcStride];
1269 const int src8 = src[8 * srcStride];
1270 const int src9 = src[9 * srcStride];
1271 dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
1272 dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0 + src3) + 8) >> 4];
1273 dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1 + src4) + 8) >> 4];
1274 dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2 + src5) + 8) >> 4];
1275 dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3 + src6) + 8) >> 4];
1276 dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4 + src7) + 8) >> 4];
1277 dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5 + src8) + 8) >> 4];
1278 dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6 + src9) + 8) >> 4];
/* Quarter-pel left: average the source with its half-pel H filter. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}
/* Half-pel horizontal: the H lowpass writes straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/* Quarter-pel right: average the shifted source with the H filter. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8);
}
/* Half-pel vertical: the V lowpass writes straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/* (1,2): H filter 11 rows (one above through two below), V filter both the
 * source column and the H plane, then average the two V results. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* (3,2): as mc12 but the pure-vertical tap starts one pixel to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* (2,2): separable half-pel in both directions — H filter then V filter. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH + 8, stride, 8, 8);
}
1342 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1343 int line_size, int h)
1347 for (i = 0; i < h; i++) {
1348 s += abs(pix1[0] - pix2[0]);
1349 s += abs(pix1[1] - pix2[1]);
1350 s += abs(pix1[2] - pix2[2]);
1351 s += abs(pix1[3] - pix2[3]);
1352 s += abs(pix1[4] - pix2[4]);
1353 s += abs(pix1[5] - pix2[5]);
1354 s += abs(pix1[6] - pix2[6]);
1355 s += abs(pix1[7] - pix2[7]);
1356 s += abs(pix1[8] - pix2[8]);
1357 s += abs(pix1[9] - pix2[9]);
1358 s += abs(pix1[10] - pix2[10]);
1359 s += abs(pix1[11] - pix2[11]);
1360 s += abs(pix1[12] - pix2[12]);
1361 s += abs(pix1[13] - pix2[13]);
1362 s += abs(pix1[14] - pix2[14]);
1363 s += abs(pix1[15] - pix2[15]);
1370 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1371 int line_size, int h)
1375 for (i = 0; i < h; i++) {
1376 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1377 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1378 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1379 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1380 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1381 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1382 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1383 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1384 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1385 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1386 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1387 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1388 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1389 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1390 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1391 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1398 static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1399 int line_size, int h)
1402 uint8_t *pix3 = pix2 + line_size;
1404 for (i = 0; i < h; i++) {
1405 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1406 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1407 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1408 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1409 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1410 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1411 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1412 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1413 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1414 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1415 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1416 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1417 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1418 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1419 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1420 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1428 static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1429 int line_size, int h)
1432 uint8_t *pix3 = pix2 + line_size;
1434 for (i = 0; i < h; i++) {
1435 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1436 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1437 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1438 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1439 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1440 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1441 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1442 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1443 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1444 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1445 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1446 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1447 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1448 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1449 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1450 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1458 static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1459 int line_size, int h)
1463 for (i = 0; i < h; i++) {
1464 s += abs(pix1[0] - pix2[0]);
1465 s += abs(pix1[1] - pix2[1]);
1466 s += abs(pix1[2] - pix2[2]);
1467 s += abs(pix1[3] - pix2[3]);
1468 s += abs(pix1[4] - pix2[4]);
1469 s += abs(pix1[5] - pix2[5]);
1470 s += abs(pix1[6] - pix2[6]);
1471 s += abs(pix1[7] - pix2[7]);
1478 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1479 int line_size, int h)
1483 for (i = 0; i < h; i++) {
1484 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1485 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1486 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1487 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1488 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1489 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1490 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1491 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1498 static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1499 int line_size, int h)
1502 uint8_t *pix3 = pix2 + line_size;
1504 for (i = 0; i < h; i++) {
1505 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1506 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1507 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1508 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1509 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1510 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1511 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1512 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1520 static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1521 int line_size, int h)
1524 uint8_t *pix3 = pix2 + line_size;
1526 for (i = 0; i < h; i++) {
1527 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1528 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1529 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1530 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1531 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1532 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1533 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1534 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1542 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1544 int score1 = 0, score2 = 0, x, y;
1546 for (y = 0; y < h; y++) {
1547 for (x = 0; x < 16; x++)
1548 score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1550 for (x = 0; x < 15; x++)
1551 score2 += FFABS(s1[x] - s1[x + stride] -
1552 s1[x + 1] + s1[x + stride + 1]) -
1553 FFABS(s2[x] - s2[x + stride] -
1554 s2[x + 1] + s2[x + stride + 1]);
1561 return score1 + FFABS(score2) * c->avctx->nsse_weight;
1563 return score1 + FFABS(score2) * 8;
1566 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1568 int score1 = 0, score2 = 0, x, y;
1570 for (y = 0; y < h; y++) {
1571 for (x = 0; x < 8; x++)
1572 score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1574 for (x = 0; x < 7; x++)
1575 score2 += FFABS(s1[x] - s1[x + stride] -
1576 s1[x + 1] + s1[x + stride + 1]) -
1577 FFABS(s2[x] - s2[x + stride] -
1578 s2[x + 1] + s2[x + stride + 1]);
1585 return score1 + FFABS(score2) * c->avctx->nsse_weight;
1587 return score1 + FFABS(score2) * 8;
1590 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
1591 int16_t basis[64], int scale)
1594 unsigned int sum = 0;
1596 for (i = 0; i < 8 * 8; i++) {
1597 int b = rem[i] + ((basis[i] * scale +
1598 (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1599 (BASIS_SHIFT - RECON_SHIFT));
1602 assert(-512 < b && b < 512);
1604 sum += (w * b) * (w * b) >> 4;
1609 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
1613 for (i = 0; i < 8 * 8; i++)
1614 rem[i] += (basis[i] * scale +
1615 (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1616 (BASIS_SHIFT - RECON_SHIFT);
1619 static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
/* Fill cmp[0..5] with the comparison functions selected by 'type'
 * (an FF_CMP_* value), one entry per block-size/plane index.
 * NOTE(review): the case labels and break statements of this switch are
 * missing from this dump; each visible assignment corresponds to one
 * FF_CMP_* selector mapping to the matching DSPContext table. */
1625 void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
1629 memset(cmp, 0, sizeof(void *) * 6);
1631 for (i = 0; i < 6; i++) {
1632 switch (type & 0xFF) {
1637 cmp[i] = c->hadamard8_diff[i];
1643 cmp[i] = c->dct_sad[i];
1646 cmp[i] = c->dct264_sad[i];
1649 cmp[i] = c->dct_max[i];
1652 cmp[i] = c->quant_psnr[i];
1661 cmp[i] = c->vsad[i];
1664 cmp[i] = c->vsse[i];
1670 cmp[i] = c->nsse[i];
/* Unknown selector: report and leave the zeroed entry in place. */
1673 av_log(NULL, AV_LOG_ERROR,
1674 "internal error in cmp function selection\n");
1679 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
1683 for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1684 long a = *(long *) (src + i);
1685 long b = *(long *) (dst + i);
1686 *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
1689 dst[i + 0] += src[i + 0];
1692 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w)
1696 #if !HAVE_FAST_UNALIGNED
1697 if ((long) src2 & (sizeof(long) - 1)) {
1698 for (i = 0; i + 7 < w; i += 8) {
1699 dst[i + 0] = src1[i + 0] - src2[i + 0];
1700 dst[i + 1] = src1[i + 1] - src2[i + 1];
1701 dst[i + 2] = src1[i + 2] - src2[i + 2];
1702 dst[i + 3] = src1[i + 3] - src2[i + 3];
1703 dst[i + 4] = src1[i + 4] - src2[i + 4];
1704 dst[i + 5] = src1[i + 5] - src2[i + 5];
1705 dst[i + 6] = src1[i + 6] - src2[i + 6];
1706 dst[i + 7] = src1[i + 7] - src2[i + 7];
1710 for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1711 long a = *(long *) (src1 + i);
1712 long b = *(long *) (src2 + i);
1713 *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
1714 ((a ^ b ^ pb_80) & pb_80);
1717 dst[i + 0] = src1[i + 0] - src2[i + 0];
/* HuffYUV median prediction decode: pred = median(left, top,
 * left + top - topleft), then add the stored residual.  The running
 * left / top-left values are carried across calls via *left / *left_top. */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *diff, int w,
                                         int *left, int *left_top)
{
    int i;
    uint8_t l  = *left;
    uint8_t lt = *left_top;

    for (i = 0; i < w; i++) {
        l      = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF) + diff[i];
        lt     = src1[i];
        dst[i] = l;
    }

    *left     = l;
    *left_top = lt;
}
/* HuffYUV median prediction encode: emit src2 minus the median predictor
 * built from the previous row (src1) and the running left/top-left state. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *src2, int w,
                                         int *left, int *left_top)
{
    int i;
    uint8_t l  = *left;
    uint8_t lt = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);

        lt     = src1[i];
        l      = src2[i];
        dst[i] = l - pred;
    }

    *left     = l;
    *left_top = lt;
}
/* Left-neighbour prediction decode: dst[i] = (acc + src[0] + ... + src[i])
 * truncated to 8 bits; returns the final accumulator for the next call. */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
                                      int w, int acc)
{
    int i;

    for (i = 0; i < w; i++) {
        acc   += src[i];
        dst[i] = acc;
    }

    return acc;
}
1793 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
1794 int w, int *red, int *green,
1795 int *blue, int *alpha)
1797 int i, r = *red, g = *green, b = *blue, a = *alpha;
1799 for (i = 0; i < w; i++) {
1800 b += src[4 * i + B];
1801 g += src[4 * i + G];
1802 r += src[4 * i + R];
1803 a += src[4 * i + A];
/* Hadamard butterfly helpers: BUTTERFLY2 writes sum/difference of two
 * inputs into two outputs, BUTTERFLY1 does the same in place, and
 * BUTTERFLYA yields |x + y| + |x - y| without modifying its operands. */
#define BUTTERFLY2(o1, o2, i1, i2) \
    o1 = (i1) + (i2);              \
    o2 = (i1) - (i2);

#define BUTTERFLY1(x, y)  \
    {                     \
        int t0, t1;       \
        t0 = x;           \
        t1 = y;           \
        x  = t0 + t1;     \
        y  = t0 - t1;     \
    }

#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
1836 static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
1837 uint8_t *src, int stride, int h)
1839 int i, temp[64], sum = 0;
1843 for (i = 0; i < 8; i++) {
1844 // FIXME: try pointer walks
1845 BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1846 src[stride * i + 0] - dst[stride * i + 0],
1847 src[stride * i + 1] - dst[stride * i + 1]);
1848 BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1849 src[stride * i + 2] - dst[stride * i + 2],
1850 src[stride * i + 3] - dst[stride * i + 3]);
1851 BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1852 src[stride * i + 4] - dst[stride * i + 4],
1853 src[stride * i + 5] - dst[stride * i + 5]);
1854 BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1855 src[stride * i + 6] - dst[stride * i + 6],
1856 src[stride * i + 7] - dst[stride * i + 7]);
1858 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1859 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1860 BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1861 BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1863 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1864 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1865 BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1866 BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
1869 for (i = 0; i < 8; i++) {
1870 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1871 BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1872 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1873 BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1875 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1876 BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1877 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1878 BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1880 sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
1881 BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
1882 BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
1883 BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
1888 static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
1889 uint8_t *dummy, int stride, int h)
1891 int i, temp[64], sum = 0;
1895 for (i = 0; i < 8; i++) {
1896 // FIXME: try pointer walks
1897 BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1898 src[stride * i + 0], src[stride * i + 1]);
1899 BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1900 src[stride * i + 2], src[stride * i + 3]);
1901 BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1902 src[stride * i + 4], src[stride * i + 5]);
1903 BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1904 src[stride * i + 6], src[stride * i + 7]);
1906 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1907 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1908 BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1909 BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1911 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1912 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1913 BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1914 BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
1917 for (i = 0; i < 8; i++) {
1918 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1919 BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1920 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1921 BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1923 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1924 BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1925 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1926 BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1929 BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
1930 + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
1931 + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
1932 + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
1935 sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
1940 static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
1941 uint8_t *src2, int stride, int h)
1943 LOCAL_ALIGNED_16(int16_t, temp, [64]);
1947 s->dsp.diff_pixels(temp, src1, src2, stride);
1949 return s->dsp.sum_abs_dctelem(temp);
/* One 8-point H.264-style integer DCT pass.  Input/output are abstracted
 * through the SRC()/DST() macros, which the caller redefines to walk the
 * block by rows or by columns. */
#define DCT8_1D                                         \
    {                                                   \
        const int s07 = SRC(0) + SRC(7);                \
        const int s16 = SRC(1) + SRC(6);                \
        const int s25 = SRC(2) + SRC(5);                \
        const int s34 = SRC(3) + SRC(4);                \
        const int a0  = s07 + s34;                      \
        const int a1  = s16 + s25;                      \
        const int a2  = s07 - s34;                      \
        const int a3  = s16 - s25;                      \
        const int d07 = SRC(0) - SRC(7);                \
        const int d16 = SRC(1) - SRC(6);                \
        const int d25 = SRC(2) - SRC(5);                \
        const int d34 = SRC(3) - SRC(4);                \
        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
        DST(0, a0 + a1);                                \
        DST(1, a4 + (a7 >> 2));                         \
        DST(2, a2 + (a3 >> 1));                         \
        DST(3, a5 + (a6 >> 2));                         \
        DST(4, a0 - a1);                                \
        DST(5, a6 - (a5 >> 2));                         \
        DST(6, (a2 >> 1) - a3);                         \
        DST(7, (a4 >> 2) - a7);                         \
    }
1981 static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
1982 uint8_t *src2, int stride, int h)
1987 s->dsp.diff_pixels(dct[0], src1, src2, stride);
1989 #define SRC(x) dct[i][x]
1990 #define DST(x, v) dct[i][x] = v
1991 for (i = 0; i < 8; i++)
1996 #define SRC(x) dct[x][i]
1997 #define DST(x, v) sum += FFABS(v)
1998 for (i = 0; i < 8; i++)
2006 static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
2007 uint8_t *src2, int stride, int h)
2009 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2014 s->dsp.diff_pixels(temp, src1, src2, stride);
2017 for (i = 0; i < 64; i++)
2018 sum = FFMAX(sum, FFABS(temp[i]));
2023 static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
2024 uint8_t *src2, int stride, int h)
2026 LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
2027 int16_t *const bak = temp + 64;
2033 s->dsp.diff_pixels(temp, src1, src2, stride);
2035 memcpy(bak, temp, 64 * sizeof(int16_t));
2037 s->block_last_index[0 /* FIXME */] =
2038 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2039 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2040 ff_simple_idct_8(temp); // FIXME
2042 for (i = 0; i < 64; i++)
2043 sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
2048 static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2051 const uint8_t *scantable = s->intra_scantable.permutated;
2052 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2053 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2054 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2055 int i, last, run, bits, level, distortion, start_i;
2056 const int esc_length = s->ac_esc_length;
2057 uint8_t *length, *last_length;
2061 copy_block8(lsrc1, src1, 8, stride, 8);
2062 copy_block8(lsrc2, src2, 8, stride, 8);
2064 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2066 s->block_last_index[0 /* FIXME */] =
2068 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2074 length = s->intra_ac_vlc_length;
2075 last_length = s->intra_ac_vlc_last_length;
2076 bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2079 length = s->inter_ac_vlc_length;
2080 last_length = s->inter_ac_vlc_last_length;
2083 if (last >= start_i) {
2085 for (i = start_i; i < last; i++) {
2086 int j = scantable[i];
2091 if ((level & (~127)) == 0)
2092 bits += length[UNI_AC_ENC_INDEX(run, level)];
2099 i = scantable[last];
2101 level = temp[i] + 64;
2105 if ((level & (~127)) == 0) {
2106 bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2113 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2115 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2118 s->dsp.idct_add(lsrc2, 8, temp);
2120 distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2122 return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
2125 static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2128 const uint8_t *scantable = s->intra_scantable.permutated;
2129 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2130 int i, last, run, bits, level, start_i;
2131 const int esc_length = s->ac_esc_length;
2132 uint8_t *length, *last_length;
2136 s->dsp.diff_pixels(temp, src1, src2, stride);
2138 s->block_last_index[0 /* FIXME */] =
2140 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2146 length = s->intra_ac_vlc_length;
2147 last_length = s->intra_ac_vlc_last_length;
2148 bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2151 length = s->inter_ac_vlc_length;
2152 last_length = s->inter_ac_vlc_last_length;
2155 if (last >= start_i) {
2157 for (i = start_i; i < last; i++) {
2158 int j = scantable[i];
2163 if ((level & (~127)) == 0)
2164 bits += length[UNI_AC_ENC_INDEX(run, level)];
2171 i = scantable[last];
2173 level = temp[i] + 64;
2177 if ((level & (~127)) == 0)
2178 bits += last_length[UNI_AC_ENC_INDEX(run, level)];
2186 #define VSAD_INTRA(size) \
2187 static int vsad_intra ## size ## _c(MpegEncContext *c, \
2188 uint8_t *s, uint8_t *dummy, \
2189 int stride, int h) \
2191 int score = 0, x, y; \
2193 for (y = 1; y < h; y++) { \
2194 for (x = 0; x < size; x += 4) { \
2195 score += FFABS(s[x] - s[x + stride]) + \
2196 FFABS(s[x + 1] - s[x + stride + 1]) + \
2197 FFABS(s[x + 2] - s[x + 2 + stride]) + \
2198 FFABS(s[x + 3] - s[x + 3 + stride]); \
2208 static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2211 int score = 0, x, y;
2213 for (y = 1; y < h; y++) {
2214 for (x = 0; x < 16; x++)
2215 score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2223 #define SQ(a) ((a) * (a))
2224 #define VSSE_INTRA(size) \
2225 static int vsse_intra ## size ## _c(MpegEncContext *c, \
2226 uint8_t *s, uint8_t *dummy, \
2227 int stride, int h) \
2229 int score = 0, x, y; \
2231 for (y = 1; y < h; y++) { \
2232 for (x = 0; x < size; x += 4) { \
2233 score += SQ(s[x] - s[x + stride]) + \
2234 SQ(s[x + 1] - s[x + stride + 1]) + \
2235 SQ(s[x + 2] - s[x + stride + 2]) + \
2236 SQ(s[x + 3] - s[x + stride + 3]); \
2246 static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2249 int score = 0, x, y;
2251 for (y = 1; y < h; y++) {
2252 for (x = 0; x < 16; x++)
2253 score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
/* Sum of squared differences between an int8 and an int16 vector. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int score = 0, i;

    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];
        score += d * d;
    }

    return score;
}
/* Build a 16x16 comparison function from an 8x8 kernel: score the four
 * 8x8 quadrants (the lower pair only when h == 16). */
#define WRAPPER8_16_SQ(name8, name16)                                \
static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,     \
                  int stride, int h)                                 \
{                                                                    \
    int score = 0;                                                   \
                                                                     \
    score += name8(s, dst, src, stride, 8);                          \
    score += name8(s, dst + 8, src + 8, stride, 8);                  \
    if (h == 16) {                                                   \
        dst   += 8 * stride;                                         \
        src   += 8 * stride;                                         \
        score += name8(s, dst, src, stride, 8);                      \
        score += name8(s, dst + 8, src + 8, stride, 8);              \
    }                                                                \
    return score;                                                    \
}
/* Expand the 8x8 kernels above into their 16x16 variants. */
2288 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2289 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2290 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
/* NOTE(review): the dct264 variants are GPL-only upstream; the
 * surrounding #if CONFIG_GPL guards are not visible in this dump. */
2292 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2294 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2295 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2296 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2297 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Clip one float (passed as its IEEE-754 bit pattern) when min < 0 < max:
 * unsigned comparison against the negative bound, sign-flipped comparison
 * against the positive bound. */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if (a > mini)
        return mini;
    else if ((a ^ (1U << 31)) > maxisign)
        return maxi;
    else
        return a;
}
/* Bitwise float clipping for the min < 0 < max case; len is assumed to
 * be a multiple of 8 (as in the unrolled original). */
static void vector_clipf_c_opposite_sign(float *dst, const float *src,
                                         float *min, float *max, int len)
{
    int i;
    uint32_t mini = *(uint32_t *) min;
    uint32_t maxi = *(uint32_t *) max;
    uint32_t maxisign = maxi ^ (1U << 31);
    uint32_t *dsti = (uint32_t *) dst;
    const uint32_t *srci = (const uint32_t *) src;

    for (i = 0; i < len; i += 8) {
        int k;
        for (k = 0; k < 8; k++)
            dsti[i + k] = clipf_c_one(srci[i + k], mini, maxi, maxisign);
    }
}
/* Clip len floats to [min, max]; len is processed 8 at a time.  When the
 * range straddles zero, delegate to the sign-aware bitwise variant. */
static void vector_clipf_c(float *dst, const float *src,
                           float min, float max, int len)
{
    int i;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for (i = 0; i < len; i += 8) {
            int k;
            for (k = 0; k < 8; k++)
                dst[i + k] = av_clipf(src[i + k], min, max);
        }
    }
}
/* Dot product of two int16 vectors of length 'order'. */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
{
    int32_t res = 0;
    int i;

    for (i = 0; i < order; i++)
        res += v1[i] * v2[i];

    return res;
}
/* Returns v1 . v2 while simultaneously updating v1 += mul * v3
 * (element-wise), as used by the ALAC/CELT decoders. */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    int32_t res = 0;
    int i;

    for (i = 0; i < order; i++) {
        res   += v1[i] * v2[i];
        v1[i] += mul * v3[i];
    }

    return res;
}
/* Clip len int32 values to [min, max]; len is assumed to be a nonzero
 * multiple of 8, matching the unrolled original's do/while structure. */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;
        for (k = 0; k < 8; k++)
            *dst++ = av_clip(*src++, min, max);
        len -= 8;
    } while (len > 0);
}
/* IDCT wrapper: run the integer reference inverse DCT, then store the
 * clamped samples into the destination picture. */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* IDCT wrapper: run the integer reference inverse DCT, then add the
 * clamped samples onto the destination picture. */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2405 /* init static data */
2406 av_cold void ff_dsputil_static_init(void)
2410 for (i = 0; i < 512; i++)
2411 ff_square_tab[i] = (i - 256) * (i - 256);
2414 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2416 const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
2419 if (avctx->bits_per_raw_sample == 10) {
2420 c->fdct = ff_jpeg_fdct_islow_10;
2421 c->fdct248 = ff_fdct248_islow_10;
2423 if (avctx->dct_algo == FF_DCT_FASTINT) {
2424 c->fdct = ff_fdct_ifast;
2425 c->fdct248 = ff_fdct_ifast248;
2426 } else if (avctx->dct_algo == FF_DCT_FAAN) {
2427 c->fdct = ff_faandct;
2428 c->fdct248 = ff_faandct248;
2430 c->fdct = ff_jpeg_fdct_islow_8; // slow/accurate/default
2431 c->fdct248 = ff_fdct248_islow_8;
2434 #endif /* CONFIG_ENCODERS */
2436 if (avctx->bits_per_raw_sample == 10) {
2437 c->idct_put = ff_simple_idct_put_10;
2438 c->idct_add = ff_simple_idct_add_10;
2439 c->idct = ff_simple_idct_10;
2440 c->idct_permutation_type = FF_NO_IDCT_PERM;
2442 if (avctx->idct_algo == FF_IDCT_INT) {
2443 c->idct_put = jref_idct_put;
2444 c->idct_add = jref_idct_add;
2445 c->idct = ff_j_rev_dct;
2446 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2447 } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2448 c->idct_put = ff_faanidct_put;
2449 c->idct_add = ff_faanidct_add;
2450 c->idct = ff_faanidct;
2451 c->idct_permutation_type = FF_NO_IDCT_PERM;
2452 } else { // accurate/default
2453 c->idct_put = ff_simple_idct_put_8;
2454 c->idct_add = ff_simple_idct_add_8;
2455 c->idct = ff_simple_idct_8;
2456 c->idct_permutation_type = FF_NO_IDCT_PERM;
2460 c->diff_pixels = diff_pixels_c;
2462 c->put_pixels_clamped = put_pixels_clamped_c;
2463 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2464 c->add_pixels_clamped = add_pixels_clamped_c;
2466 c->sum_abs_dctelem = sum_abs_dctelem_c;
2471 c->pix_sum = pix_sum_c;
2472 c->pix_norm1 = pix_norm1_c;
2474 c->fill_block_tab[0] = fill_block16_c;
2475 c->fill_block_tab[1] = fill_block8_c;
2477 /* TODO [0] 16 [1] 8 */
2478 c->pix_abs[0][0] = pix_abs16_c;
2479 c->pix_abs[0][1] = pix_abs16_x2_c;
2480 c->pix_abs[0][2] = pix_abs16_y2_c;
2481 c->pix_abs[0][3] = pix_abs16_xy2_c;
2482 c->pix_abs[1][0] = pix_abs8_c;
2483 c->pix_abs[1][1] = pix_abs8_x2_c;
2484 c->pix_abs[1][2] = pix_abs8_y2_c;
2485 c->pix_abs[1][3] = pix_abs8_xy2_c;
2487 #define dspfunc(PFX, IDX, NUM) \
2488 c->PFX ## _pixels_tab[IDX][0] = PFX ## NUM ## _mc00_c; \
2489 c->PFX ## _pixels_tab[IDX][1] = PFX ## NUM ## _mc10_c; \
2490 c->PFX ## _pixels_tab[IDX][2] = PFX ## NUM ## _mc20_c; \
2491 c->PFX ## _pixels_tab[IDX][3] = PFX ## NUM ## _mc30_c; \
2492 c->PFX ## _pixels_tab[IDX][4] = PFX ## NUM ## _mc01_c; \
2493 c->PFX ## _pixels_tab[IDX][5] = PFX ## NUM ## _mc11_c; \
2494 c->PFX ## _pixels_tab[IDX][6] = PFX ## NUM ## _mc21_c; \
2495 c->PFX ## _pixels_tab[IDX][7] = PFX ## NUM ## _mc31_c; \
2496 c->PFX ## _pixels_tab[IDX][8] = PFX ## NUM ## _mc02_c; \
2497 c->PFX ## _pixels_tab[IDX][9] = PFX ## NUM ## _mc12_c; \
2498 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2499 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2500 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2501 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2502 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2503 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2505 dspfunc(put_qpel, 0, 16);
2506 dspfunc(put_qpel, 1, 8);
2508 dspfunc(put_no_rnd_qpel, 0, 16);
2509 dspfunc(put_no_rnd_qpel, 1, 8);
2511 dspfunc(avg_qpel, 0, 16);
2512 dspfunc(avg_qpel, 1, 8);
2516 c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
2517 c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
2518 c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
2519 c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
2520 c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
2521 c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
2522 c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
2523 c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
2525 #define SET_CMP_FUNC(name) \
2526 c->name[0] = name ## 16_c; \
2527 c->name[1] = name ## 8x8_c;
2529 SET_CMP_FUNC(hadamard8_diff)
2530 c->hadamard8_diff[4] = hadamard8_intra16_c;
2531 c->hadamard8_diff[5] = hadamard8_intra8x8_c;
2532 SET_CMP_FUNC(dct_sad)
2533 SET_CMP_FUNC(dct_max)
2535 SET_CMP_FUNC(dct264_sad)
2537 c->sad[0] = pix_abs16_c;
2538 c->sad[1] = pix_abs8_c;
2539 c->sse[0] = sse16_c;
2542 SET_CMP_FUNC(quant_psnr)
2545 c->vsad[0] = vsad16_c;
2546 c->vsad[4] = vsad_intra16_c;
2547 c->vsad[5] = vsad_intra8_c;
2548 c->vsse[0] = vsse16_c;
2549 c->vsse[4] = vsse_intra16_c;
2550 c->vsse[5] = vsse_intra8_c;
2551 c->nsse[0] = nsse16_c;
2552 c->nsse[1] = nsse8_c;
2554 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2556 c->add_bytes = add_bytes_c;
2557 c->add_hfyu_median_prediction = add_hfyu_median_prediction_c;
2558 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2559 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2561 c->diff_bytes = diff_bytes_c;
2562 c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
2564 c->bswap_buf = bswap_buf;
2565 c->bswap16_buf = bswap16_buf;
2567 c->try_8x8basis = try_8x8basis_c;
2568 c->add_8x8basis = add_8x8basis_c;
2570 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2572 c->scalarproduct_int16 = scalarproduct_int16_c;
2573 c->vector_clip_int32 = vector_clip_int32_c;
2574 c->vector_clipf = vector_clipf_c;
2576 c->shrink[0] = av_image_copy_plane;
2577 c->shrink[1] = ff_shrink22;
2578 c->shrink[2] = ff_shrink44;
2579 c->shrink[3] = ff_shrink88;
2581 c->add_pixels8 = add_pixels8_c;
2583 c->draw_edges = draw_edges_8_c;
2585 c->clear_block = clear_block_8_c;
2586 c->clear_blocks = clear_blocks_8_c;
2588 switch (avctx->bits_per_raw_sample) {
2591 c->get_pixels = get_pixels_16_c;
2594 c->get_pixels = get_pixels_8_c;
2599 ff_dsputil_init_arm(c, avctx, high_bit_depth);
2601 ff_dsputil_init_bfin(c, avctx, high_bit_depth);
2603 ff_dsputil_init_ppc(c, avctx, high_bit_depth);
2605 ff_dsputil_init_x86(c, avctx, high_bit_depth);
2607 ff_init_scantable_permutation(c->idct_permutation,
2608 c->idct_permutation_type);