3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "libavutil/internal.h"
34 #include "copy_block.h"
37 #include "simple_idct.h"
40 #include "imgconvert.h"
42 #include "mpegvideo.h"
/* Lookup table of squares; (ff_square_tab + 256)[x] is intended to hold x*x
 * for x in [-256, 255]. NOTE(review): it is zero-initialized here and the
 * runtime init code is not visible in this chunk — confirm it is filled
 * before first use. */
uint32_t ff_square_tab[512] = { 0, };
49 #include "dsputilenc_template.c"
53 #include "hpel_template.c"
54 #include "tpel_template.c"
55 #include "dsputil_template.c"
56 #include "dsputilenc_template.c"
/* Per-byte splat constants: every byte of the native unsigned long is set to
 * 0x7f / 0x80 — i.e. 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f depending on whether
 * unsigned long is 32 or 64 bits. ~0UL / 255 yields 0x0101...01. */
#define pb_7f (~0UL / 255 * 0x7f)
#define pb_80 (~0UL / 255 * 0x80)
/* Specific zigzag scan for 248 idct. NOTE that unlike the
 * specification, we interleave the fields */
/* (reconstructed: the table's closing brace was missing in this chunk) */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* Alternate (horizontal) coefficient scan order for interlaced content.
 * (reconstructed: the table's closing brace was missing in this chunk) */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate (vertical) coefficient scan order for interlaced content.
 * (reconstructed: the table's closing brace was missing in this chunk) */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx
 * (reconstructed: the table's closing brace was missing in this chunk) */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Within-row element permutation used by the SSE2 IDCT input layout:
 * interleaves the low and high halves of each 8-element row (0,4,1,5,...). */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
111 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
112 const uint8_t *src_scantable)
116 st->scantable = src_scantable;
118 for (i = 0; i < 64; i++) {
119 int j = src_scantable[i];
120 st->permutated[i] = permutation[j];
124 for (i = 0; i < 64; i++) {
125 int j = st->permutated[i];
128 st->raster_end[i] = end;
132 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
133 int idct_permutation_type)
137 switch (idct_permutation_type) {
138 case FF_NO_IDCT_PERM:
139 for (i = 0; i < 64; i++)
140 idct_permutation[i] = i;
142 case FF_LIBMPEG2_IDCT_PERM:
143 for (i = 0; i < 64; i++)
144 idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
146 case FF_SIMPLE_IDCT_PERM:
147 for (i = 0; i < 64; i++)
148 idct_permutation[i] = simple_mmx_permutation[i];
150 case FF_TRANSPOSE_IDCT_PERM:
151 for (i = 0; i < 64; i++)
152 idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
154 case FF_PARTTRANS_IDCT_PERM:
155 for (i = 0; i < 64; i++)
156 idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
158 case FF_SSE2_IDCT_PERM:
159 for (i = 0; i < 64; i++)
160 idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
163 av_log(NULL, AV_LOG_ERROR,
164 "Internal error, IDCT permutation not set\n");
/* Sum of all 256 pixel values of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size distance in bytes between rows
 * @return the pixel sum (max 16*16*255, fits an int)
 * (reconstructed: accumulator, inner adds and return were missing) */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }

    return s;
}
189 static int pix_norm1_c(uint8_t *pix, int line_size)
192 uint32_t *sq = ff_square_tab + 256;
194 for (i = 0; i < 16; i++) {
195 for (j = 0; j < 16; j += 8) {
207 register uint64_t x = *(uint64_t *) pix;
209 s += sq[(x >> 8) & 0xff];
210 s += sq[(x >> 16) & 0xff];
211 s += sq[(x >> 24) & 0xff];
212 s += sq[(x >> 32) & 0xff];
213 s += sq[(x >> 40) & 0xff];
214 s += sq[(x >> 48) & 0xff];
215 s += sq[(x >> 56) & 0xff];
217 register uint32_t x = *(uint32_t *) pix;
219 s += sq[(x >> 8) & 0xff];
220 s += sq[(x >> 16) & 0xff];
221 s += sq[(x >> 24) & 0xff];
222 x = *(uint32_t *) (pix + 4);
224 s += sq[(x >> 8) & 0xff];
225 s += sq[(x >> 16) & 0xff];
226 s += sq[(x >> 24) & 0xff];
231 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst (8x unrolled, scalar tail).
 * (reconstructed: the local declaration, the tail-loop header and closing
 * braces were missing in this chunk) */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int i;

    for (i = 0; i + 8 <= w; i += 8) {
        dst[i + 0] = av_bswap32(src[i + 0]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    for (; i < w; i++)
        dst[i + 0] = av_bswap32(src[i + 0]);
}
/* Byte-swap len 16-bit words from src into dst.
 * (reconstructed: the loop header was missing in this chunk) */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
260 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
261 int line_size, int h)
264 uint32_t *sq = ff_square_tab + 256;
266 for (i = 0; i < h; i++) {
267 s += sq[pix1[0] - pix2[0]];
268 s += sq[pix1[1] - pix2[1]];
269 s += sq[pix1[2] - pix2[2]];
270 s += sq[pix1[3] - pix2[3]];
277 static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
278 int line_size, int h)
281 uint32_t *sq = ff_square_tab + 256;
283 for (i = 0; i < h; i++) {
284 s += sq[pix1[0] - pix2[0]];
285 s += sq[pix1[1] - pix2[1]];
286 s += sq[pix1[2] - pix2[2]];
287 s += sq[pix1[3] - pix2[3]];
288 s += sq[pix1[4] - pix2[4]];
289 s += sq[pix1[5] - pix2[5]];
290 s += sq[pix1[6] - pix2[6]];
291 s += sq[pix1[7] - pix2[7]];
298 static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
299 int line_size, int h)
302 uint32_t *sq = ff_square_tab + 256;
304 for (i = 0; i < h; i++) {
305 s += sq[pix1[0] - pix2[0]];
306 s += sq[pix1[1] - pix2[1]];
307 s += sq[pix1[2] - pix2[2]];
308 s += sq[pix1[3] - pix2[3]];
309 s += sq[pix1[4] - pix2[4]];
310 s += sq[pix1[5] - pix2[5]];
311 s += sq[pix1[6] - pix2[6]];
312 s += sq[pix1[7] - pix2[7]];
313 s += sq[pix1[8] - pix2[8]];
314 s += sq[pix1[9] - pix2[9]];
315 s += sq[pix1[10] - pix2[10]];
316 s += sq[pix1[11] - pix2[11]];
317 s += sq[pix1[12] - pix2[12]];
318 s += sq[pix1[13] - pix2[13]];
319 s += sq[pix1[14] - pix2[14]];
320 s += sq[pix1[15] - pix2[15]];
328 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
329 const uint8_t *s2, int stride)
333 /* read the pixels */
334 for (i = 0; i < 8; i++) {
335 block[0] = s1[0] - s2[0];
336 block[1] = s1[1] - s2[1];
337 block[2] = s1[2] - s2[2];
338 block[3] = s1[3] - s2[3];
339 block[4] = s1[4] - s2[4];
340 block[5] = s1[5] - s2[5];
341 block[6] = s1[6] - s2[6];
342 block[7] = s1[7] - s2[7];
349 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
354 /* read the pixels */
355 for (i = 0; i < 8; i++) {
356 pixels[0] = av_clip_uint8(block[0]);
357 pixels[1] = av_clip_uint8(block[1]);
358 pixels[2] = av_clip_uint8(block[2]);
359 pixels[3] = av_clip_uint8(block[3]);
360 pixels[4] = av_clip_uint8(block[4]);
361 pixels[5] = av_clip_uint8(block[5]);
362 pixels[6] = av_clip_uint8(block[6]);
363 pixels[7] = av_clip_uint8(block[7]);
370 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
375 /* read the pixels */
377 pixels[0] = av_clip_uint8(block[0]);
378 pixels[1] = av_clip_uint8(block[1]);
379 pixels[2] = av_clip_uint8(block[2]);
380 pixels[3] = av_clip_uint8(block[3]);
387 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
392 /* read the pixels */
394 pixels[0] = av_clip_uint8(block[0]);
395 pixels[1] = av_clip_uint8(block[1]);
402 static void put_signed_pixels_clamped_c(const int16_t *block,
403 uint8_t *av_restrict pixels,
408 for (i = 0; i < 8; i++) {
409 for (j = 0; j < 8; j++) {
412 else if (*block > 127)
415 *pixels = (uint8_t) (*block + 128);
419 pixels += (line_size - 8);
423 static void add_pixels8_c(uint8_t *av_restrict pixels, int16_t *block,
428 for (i = 0; i < 8; i++) {
429 pixels[0] += block[0];
430 pixels[1] += block[1];
431 pixels[2] += block[2];
432 pixels[3] += block[3];
433 pixels[4] += block[4];
434 pixels[5] += block[5];
435 pixels[6] += block[6];
436 pixels[7] += block[7];
442 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
447 /* read the pixels */
448 for (i = 0; i < 8; i++) {
449 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
450 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
451 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
452 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
453 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
454 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
455 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
456 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
462 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
467 /* read the pixels */
469 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
470 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
471 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
472 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
478 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
483 /* read the pixels */
485 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
486 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
/* Sum of absolute values of all 64 coefficients of a DCT block.
 * (reconstructed: accumulator declaration and return were missing) */
static int sum_abs_dctelem_c(int16_t *block)
{
    int sum = 0, i;

    for (i = 0; i < 64; i++)
        sum += FFABS(block[i]);
    return sum;
}
/* Fill a 16-pixel-wide block of height h with a constant value.
 * @param line_size distance in bytes between rows
 * (reconstructed: local declaration and the row advance were missing) */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/* Fill an 8-pixel-wide block of height h with a constant value.
 * (reconstructed: local declaration and the row advance were missing) */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* Rounded 2- and 4-way pixel averages used by the pel interpolation code.
 * NOTE: function-like macros evaluate each argument as written — do not pass
 * expressions with side effects. Arguments are assumed to be small
 * non-negative pixel values (the unparenthesized expansion relies on that). */
#define avg2(a, b) ((a + b + 1) >> 1)
#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)
/* One-warp-point global motion compensation: bilinear interpolation of an
 * 8-pixel-wide block with 1/16-pel fractional offsets (x16, y16 in [0,15]).
 * Weights A..D sum to 256, so the result is renormalized with >> 8 after
 * adding the caller-supplied rounder.
 * (reconstructed: local declaration and per-row advances were missing) */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (x16) * (16 - y16);
    const int C = (16 - x16) * (y16);
    const int D = (x16) * (y16);
    int i;

    for (i = 0; i < h; i++) {
        dst[0] = (A * src[0] + B * src[1] + C * src[stride + 0] + D * src[stride + 1] + rounder) >> 8;
        dst[1] = (A * src[1] + B * src[2] + C * src[stride + 1] + D * src[stride + 2] + rounder) >> 8;
        dst[2] = (A * src[2] + B * src[3] + C * src[stride + 2] + D * src[stride + 3] + rounder) >> 8;
        dst[3] = (A * src[3] + B * src[4] + C * src[stride + 3] + D * src[stride + 4] + rounder) >> 8;
        dst[4] = (A * src[4] + B * src[5] + C * src[stride + 4] + D * src[stride + 5] + rounder) >> 8;
        dst[5] = (A * src[5] + B * src[6] + C * src[stride + 5] + D * src[stride + 6] + rounder) >> 8;
        dst[6] = (A * src[6] + B * src[7] + C * src[stride + 6] + D * src[stride + 7] + rounder) >> 8;
        dst[7] = (A * src[7] + B * src[8] + C * src[stride + 7] + D * src[stride + 8] + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
/* Global motion compensation with a full affine motion field: each output
 * pixel of an 8-wide column is sampled from (vx, vy) fixed-point source
 * coordinates and bilinearly interpolated, with clamping at the picture
 * borders.
 * NOTE(review): this function appears truncated in this extraction — the
 * opening brace, local declarations (y, x, vx, vy, index), the `else`
 * keywords between the border branches, the per-pixel/per-row coordinate
 * accumulator updates and several closing braces are missing. The comments
 * below document only what the visible lines show. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
    /* s = 1 << shift: denominator of the fractional sample coordinates */
    const int s = 1 << shift;
    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++) { // FIXME: optimize
            /* integer and fractional parts of the source coordinate;
             * vx/vy are 16.16 fixed point per the >> 16 below */
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            int frac_x = src_x & (s - 1);
            int frac_y = src_y & (s - 1);
            /* fully inside the picture: bilinear blend of the 2x2
             * neighbourhood around (src_x, src_y) */
            if ((unsigned) src_x < width) {
                if ((unsigned) src_y < height) {
                    index = src_x + src_y * stride;
                    dst[y * stride + x] =
                        ((src[index] * (s - frac_x) +
                          src[index + 1] * frac_x) * (s - frac_y) +
                         (src[index + stride] * (s - frac_x) +
                          src[index + stride + 1] * frac_x) * frac_y +
                    /* x inside, y outside: clamp y, interpolate in x only */
                    index = src_x + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] =
                        ((src[index] * (s - frac_x) +
                          src[index + 1] * frac_x) * s +
            /* x outside: clamp x; interpolate in y if y is inside */
            if ((unsigned) src_y < height) {
                index = av_clip(src_x, 0, width) + src_y * stride;
                dst[y * stride + x] =
                    ((src[index] * (s - frac_y) +
                      src[index + stride] * frac_y) * s +
                /* both coordinates outside: nearest clamped source pixel */
                index = av_clip(src_x, 0, width) +
                    av_clip(src_y, 0, height) * stride;
                dst[y * stride + x] = src[index];
610 #define QPEL_MC(r, OPNAME, RND, OP) \
611 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, \
612 int dstStride, int srcStride, \
615 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
618 for (i = 0; i < h; i++) { \
619 OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
620 OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
621 OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
622 OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
623 OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
624 OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
625 OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
626 OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
632 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, \
633 int dstStride, int srcStride) \
635 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
639 for (i = 0; i < w; i++) { \
640 const int src0 = src[0 * srcStride]; \
641 const int src1 = src[1 * srcStride]; \
642 const int src2 = src[2 * srcStride]; \
643 const int src3 = src[3 * srcStride]; \
644 const int src4 = src[4 * srcStride]; \
645 const int src5 = src[5 * srcStride]; \
646 const int src6 = src[6 * srcStride]; \
647 const int src7 = src[7 * srcStride]; \
648 const int src8 = src[8 * srcStride]; \
649 OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
650 OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
651 OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
652 OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
653 OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
654 OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
655 OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
656 OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
662 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, \
663 int dstStride, int srcStride, \
666 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
669 for (i = 0; i < h; i++) { \
670 OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
671 OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
672 OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
673 OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
674 OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
675 OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[9])); \
676 OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[9]) * 3 - (src[3] + src[10])); \
677 OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[9]) * 6 + (src[5] + src[10]) * 3 - (src[4] + src[11])); \
678 OP(dst[8], (src[8] + src[9]) * 20 - (src[7] + src[10]) * 6 + (src[6] + src[11]) * 3 - (src[5] + src[12])); \
679 OP(dst[9], (src[9] + src[10]) * 20 - (src[8] + src[11]) * 6 + (src[7] + src[12]) * 3 - (src[6] + src[13])); \
680 OP(dst[10], (src[10] + src[11]) * 20 - (src[9] + src[12]) * 6 + (src[8] + src[13]) * 3 - (src[7] + src[14])); \
681 OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9] + src[14]) * 3 - (src[8] + src[15])); \
682 OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9] + src[16])); \
683 OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
684 OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
685 OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
691 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, \
692 int dstStride, int srcStride) \
694 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
698 for (i = 0; i < w; i++) { \
699 const int src0 = src[0 * srcStride]; \
700 const int src1 = src[1 * srcStride]; \
701 const int src2 = src[2 * srcStride]; \
702 const int src3 = src[3 * srcStride]; \
703 const int src4 = src[4 * srcStride]; \
704 const int src5 = src[5 * srcStride]; \
705 const int src6 = src[6 * srcStride]; \
706 const int src7 = src[7 * srcStride]; \
707 const int src8 = src[8 * srcStride]; \
708 const int src9 = src[9 * srcStride]; \
709 const int src10 = src[10 * srcStride]; \
710 const int src11 = src[11 * srcStride]; \
711 const int src12 = src[12 * srcStride]; \
712 const int src13 = src[13 * srcStride]; \
713 const int src14 = src[14 * srcStride]; \
714 const int src15 = src[15 * srcStride]; \
715 const int src16 = src[16 * srcStride]; \
716 OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
717 OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
718 OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
719 OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
720 OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
721 OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src9)); \
722 OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src9) * 3 - (src3 + src10)); \
723 OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src9) * 6 + (src5 + src10) * 3 - (src4 + src11)); \
724 OP(dst[8 * dstStride], (src8 + src9) * 20 - (src7 + src10) * 6 + (src6 + src11) * 3 - (src5 + src12)); \
725 OP(dst[9 * dstStride], (src9 + src10) * 20 - (src8 + src11) * 6 + (src7 + src12) * 3 - (src6 + src13)); \
726 OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9 + src12) * 6 + (src8 + src13) * 3 - (src7 + src14)); \
727 OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9 + src14) * 3 - (src8 + src15)); \
728 OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9 + src16)); \
729 OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
730 OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
731 OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
737 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, \
742 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
743 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8); \
746 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, \
749 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8); \
752 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, \
757 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
758 OPNAME ## pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8); \
761 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, \
764 uint8_t full[16 * 9]; \
767 copy_block9(full, src, 16, stride, 9); \
768 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
769 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8); \
772 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, \
775 uint8_t full[16 * 9]; \
777 copy_block9(full, src, 16, stride, 9); \
778 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16); \
781 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, \
784 uint8_t full[16 * 9]; \
787 copy_block9(full, src, 16, stride, 9); \
788 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
789 OPNAME ## pixels8_l2_8(dst, full + 16, half, stride, 16, 8, 8); \
792 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, \
795 uint8_t full[16 * 9]; \
798 uint8_t halfHV[64]; \
800 copy_block9(full, src, 16, stride, 9); \
801 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
802 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
803 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
804 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, \
805 stride, 16, 8, 8, 8, 8); \
808 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, \
811 uint8_t full[16 * 9]; \
813 uint8_t halfHV[64]; \
815 copy_block9(full, src, 16, stride, 9); \
816 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
817 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
818 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
819 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
822 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, \
825 uint8_t full[16 * 9]; \
828 uint8_t halfHV[64]; \
830 copy_block9(full, src, 16, stride, 9); \
831 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
832 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
833 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
834 OPNAME ## pixels8_l4_8(dst, full + 1, halfH, halfV, halfHV, \
835 stride, 16, 8, 8, 8, 8); \
838 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, \
841 uint8_t full[16 * 9]; \
843 uint8_t halfHV[64]; \
845 copy_block9(full, src, 16, stride, 9); \
846 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
847 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
848 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
849 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
852 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, \
855 uint8_t full[16 * 9]; \
858 uint8_t halfHV[64]; \
860 copy_block9(full, src, 16, stride, 9); \
861 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
862 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
863 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
864 OPNAME ## pixels8_l4_8(dst, full + 16, halfH + 8, halfV, halfHV, \
865 stride, 16, 8, 8, 8, 8); \
868 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, \
871 uint8_t full[16 * 9]; \
873 uint8_t halfHV[64]; \
875 copy_block9(full, src, 16, stride, 9); \
876 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
877 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
878 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
879 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
882 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, \
885 uint8_t full[16 * 9]; \
888 uint8_t halfHV[64]; \
890 copy_block9(full, src, 16, stride, 9); \
891 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
892 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
893 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
894 OPNAME ## pixels8_l4_8(dst, full + 17, halfH + 8, halfV, halfHV, \
895 stride, 16, 8, 8, 8, 8); \
898 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, \
901 uint8_t full[16 * 9]; \
903 uint8_t halfHV[64]; \
905 copy_block9(full, src, 16, stride, 9); \
906 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
907 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
908 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
909 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
912 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, \
916 uint8_t halfHV[64]; \
918 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
919 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
920 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
923 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, \
927 uint8_t halfHV[64]; \
929 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
930 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
931 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
934 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, \
937 uint8_t full[16 * 9]; \
940 uint8_t halfHV[64]; \
942 copy_block9(full, src, 16, stride, 9); \
943 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
944 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
945 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
946 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); \
949 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, \
952 uint8_t full[16 * 9]; \
955 copy_block9(full, src, 16, stride, 9); \
956 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
957 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
958 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
961 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, \
964 uint8_t full[16 * 9]; \
967 uint8_t halfHV[64]; \
969 copy_block9(full, src, 16, stride, 9); \
970 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
971 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
972 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
973 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); \
976 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, \
979 uint8_t full[16 * 9]; \
982 copy_block9(full, src, 16, stride, 9); \
983 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
984 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
985 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
988 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, \
993 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
994 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
997 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, \
1000 uint8_t half[256]; \
1002 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
1003 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16); \
1006 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, \
1009 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16); \
1012 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, \
1015 uint8_t half[256]; \
1017 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
1018 OPNAME ## pixels16_l2_8(dst, src + 1, half, stride, stride, 16, 16); \
1021 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, \
1024 uint8_t full[24 * 17]; \
1025 uint8_t half[256]; \
1027 copy_block17(full, src, 24, stride, 17); \
1028 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
1029 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16); \
1032 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, \
1035 uint8_t full[24 * 17]; \
1037 copy_block17(full, src, 24, stride, 17); \
1038 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24); \
1041 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, \
1044 uint8_t full[24 * 17]; \
1045 uint8_t half[256]; \
1047 copy_block17(full, src, 24, stride, 17); \
1048 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
1049 OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16); \
1052 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, \
1055 uint8_t full[24 * 17]; \
1056 uint8_t halfH[272]; \
1057 uint8_t halfV[256]; \
1058 uint8_t halfHV[256]; \
1060 copy_block17(full, src, 24, stride, 17); \
1061 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1062 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1063 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1064 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, \
1065 stride, 24, 16, 16, 16, 16); \
1068 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, \
1071 uint8_t full[24 * 17]; \
1072 uint8_t halfH[272]; \
1073 uint8_t halfHV[256]; \
1075 copy_block17(full, src, 24, stride, 17); \
1076 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1077 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1078 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1079 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1082 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, \
1085 uint8_t full[24 * 17]; \
1086 uint8_t halfH[272]; \
1087 uint8_t halfV[256]; \
1088 uint8_t halfHV[256]; \
1090 copy_block17(full, src, 24, stride, 17); \
1091 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1092 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1093 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1094 OPNAME ## pixels16_l4_8(dst, full + 1, halfH, halfV, halfHV, \
1095 stride, 24, 16, 16, 16, 16); \
1098 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, \
1101 uint8_t full[24 * 17]; \
1102 uint8_t halfH[272]; \
1103 uint8_t halfHV[256]; \
1105 copy_block17(full, src, 24, stride, 17); \
1106 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1107 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1108 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1109 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1112 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, \
1115 uint8_t full[24 * 17]; \
1116 uint8_t halfH[272]; \
1117 uint8_t halfV[256]; \
1118 uint8_t halfHV[256]; \
1120 copy_block17(full, src, 24, stride, 17); \
1121 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1122 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1123 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1124 OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV, \
1125 stride, 24, 16, 16, 16, 16); \
1128 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, \
1131 uint8_t full[24 * 17]; \
1132 uint8_t halfH[272]; \
1133 uint8_t halfHV[256]; \
1135 copy_block17(full, src, 24, stride, 17); \
1136 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1137 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1138 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1139 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1142 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, \
1145 uint8_t full[24 * 17]; \
1146 uint8_t halfH[272]; \
1147 uint8_t halfV[256]; \
1148 uint8_t halfHV[256]; \
1150 copy_block17(full, src, 24, stride, 17); \
1151 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1152 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1153 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1154 OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV, \
1155 stride, 24, 16, 16, 16, 16); \
1158 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, \
1161 uint8_t full[24 * 17]; \
1162 uint8_t halfH[272]; \
1163 uint8_t halfHV[256]; \
1165 copy_block17(full, src, 24, stride, 17); \
1166 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1167 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1168 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1169 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1172 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, \
1175 uint8_t halfH[272]; \
1176 uint8_t halfHV[256]; \
1178 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1179 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1180 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1183 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, \
1186 uint8_t halfH[272]; \
1187 uint8_t halfHV[256]; \
1189 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1190 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1191 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1194 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, \
1197 uint8_t full[24 * 17]; \
1198 uint8_t halfH[272]; \
1199 uint8_t halfV[256]; \
1200 uint8_t halfHV[256]; \
1202 copy_block17(full, src, 24, stride, 17); \
1203 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1204 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1205 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1206 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); \
1209 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, \
1212 uint8_t full[24 * 17]; \
1213 uint8_t halfH[272]; \
1215 copy_block17(full, src, 24, stride, 17); \
1216 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1217 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1218 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
1221 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, \
1224 uint8_t full[24 * 17]; \
1225 uint8_t halfH[272]; \
1226 uint8_t halfV[256]; \
1227 uint8_t halfHV[256]; \
1229 copy_block17(full, src, 24, stride, 17); \
1230 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1231 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1232 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1233 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); \
1236 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, \
1239 uint8_t full[24 * 17]; \
1240 uint8_t halfH[272]; \
1242 copy_block17(full, src, 24, stride, 17); \
1243 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1244 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1245 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
1248 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, \
1251 uint8_t halfH[272]; \
1253 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1254 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
/* Rounding primitives plugged into QPEL_MC: each writes one output pixel
 * from an un-normalized filter accumulator 'b' ('cm' is the clip table).
 * "+16 >> 5" rounds to nearest, "+15 >> 5" is the no-rounding variant. */
1257 #define op_avg(a, b) a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
1258 #define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5]) >> 1)
1259 #define op_put(a, b) a = cm[((b) + 16) >> 5]
1260 #define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
/* Instantiate the full put / put_no_rnd / avg qpel function families. */
1262 QPEL_MC(0, put_, _, op_put)
1263 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1264 QPEL_MC(0, avg_, _, op_avg)
1268 #undef op_put_no_rnd
/**
 * Copy an 8x8 pixel block from src to dst (both strided by 'stride').
 * Public wrapper around the 8-bit hpel template copy.
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}
/**
 * Average an 8x8 pixel block from src into dst (rounded average).
 * Public wrapper around the 8-bit hpel template averager.
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}
/**
 * Copy a 16x16 pixel block from src to dst.
 * Public wrapper around the 8-bit hpel template copy.
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}
/**
 * Average a 16x16 pixel block from src into dst (rounded average).
 * Public wrapper around the 8-bit hpel template averager.
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
/* The mc00 (integer-pel) qpel cases are plain block copies/averages, so
 * alias them to the pixel-copy wrappers above instead of generating them. */
1290 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1291 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1292 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1293 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
/* no-rounding put is identical to normal put for a pure copy */
1294 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1295 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1297 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
1298 int dstStride, int srcStride, int h)
1300 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1303 for (i = 0; i < h; i++) {
1304 dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
1305 dst[1] = cm[(9 * (src[1] + src[2]) - (src[0] + src[3]) + 8) >> 4];
1306 dst[2] = cm[(9 * (src[2] + src[3]) - (src[1] + src[4]) + 8) >> 4];
1307 dst[3] = cm[(9 * (src[3] + src[4]) - (src[2] + src[5]) + 8) >> 4];
1308 dst[4] = cm[(9 * (src[4] + src[5]) - (src[3] + src[6]) + 8) >> 4];
1309 dst[5] = cm[(9 * (src[5] + src[6]) - (src[4] + src[7]) + 8) >> 4];
1310 dst[6] = cm[(9 * (src[6] + src[7]) - (src[5] + src[8]) + 8) >> 4];
1311 dst[7] = cm[(9 * (src[7] + src[8]) - (src[6] + src[9]) + 8) >> 4];
/* Dirac motion-compensation wrappers: forward the 8/16/32-wide put/avg and
 * the 2- and 4-source linear-blend variants to the 8-bit pixel templates.
 * 32-wide cases are built from two 16-wide calls.  NOTE(review): the macro
 * tail and the DIRAC_MC(...) instantiations are not visible in this
 * excerpt; comments are kept outside the backslash continuations. */
1317 #if CONFIG_DIRAC_DECODER
1318 #define DIRAC_MC(OPNAME)\
1319 void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1321 OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
1323 void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1325 OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
1327 void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1329 OPNAME ## _pixels16_8_c(dst , src[0] , stride, h);\
1330 OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
1332 void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1334 OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1336 void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1338 OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1340 void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1342 OPNAME ## _pixels16_l2_8(dst , src[0] , src[1] , stride, stride, stride, h);\
1343 OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
1345 void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1347 OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1349 void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1351 OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1353 void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1355 OPNAME ## _pixels16_l4_8(dst , src[0] , src[1] , src[2] , src[3] , stride, stride, stride, stride, stride, h);\
1356 OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
1362 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
1363 int dstStride, int srcStride, int w)
1365 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1368 for (i = 0; i < w; i++) {
1369 const int src_1 = src[-srcStride];
1370 const int src0 = src[0];
1371 const int src1 = src[srcStride];
1372 const int src2 = src[2 * srcStride];
1373 const int src3 = src[3 * srcStride];
1374 const int src4 = src[4 * srcStride];
1375 const int src5 = src[5 * srcStride];
1376 const int src6 = src[6 * srcStride];
1377 const int src7 = src[7 * srcStride];
1378 const int src8 = src[8 * srcStride];
1379 const int src9 = src[9 * srcStride];
1380 dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
1381 dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0 + src3) + 8) >> 4];
1382 dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1 + src4) + 8) >> 4];
1383 dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2 + src5) + 8) >> 4];
1384 dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3 + src6) + 8) >> 4];
1385 dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4 + src7) + 8) >> 4];
1386 dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5 + src8) + 8) >> 4];
1387 dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6 + src9) + 8) >> 4];
/* mspel (1,0): blend the source with its horizontally filtered half-pel. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}
/* mspel (2,0): pure horizontal half-pel filter, written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/* mspel (3,0): blend src+1 with the horizontally filtered half-pel. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8);
}
/* mspel (0,2): pure vertical half-pel filter, written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/* mspel (1,2): average of the vertical half-pel and the HV half-pel
 * (vertical filter applied to the horizontally filtered plane). */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88], halfV[64], halfHV[64];

    /* 11 filtered rows starting one row above, so the vertical pass has
     * its required context rows */
    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel (3,2): like mc12 but the vertical half-pel is taken at src+1. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88], halfV[64], halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel (2,2): horizontal then vertical half-pel filter (center sample). */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH + 8, stride, 8, 8);
}
1451 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1452 int line_size, int h)
1456 for (i = 0; i < h; i++) {
1457 s += abs(pix1[0] - pix2[0]);
1458 s += abs(pix1[1] - pix2[1]);
1459 s += abs(pix1[2] - pix2[2]);
1460 s += abs(pix1[3] - pix2[3]);
1461 s += abs(pix1[4] - pix2[4]);
1462 s += abs(pix1[5] - pix2[5]);
1463 s += abs(pix1[6] - pix2[6]);
1464 s += abs(pix1[7] - pix2[7]);
1465 s += abs(pix1[8] - pix2[8]);
1466 s += abs(pix1[9] - pix2[9]);
1467 s += abs(pix1[10] - pix2[10]);
1468 s += abs(pix1[11] - pix2[11]);
1469 s += abs(pix1[12] - pix2[12]);
1470 s += abs(pix1[13] - pix2[13]);
1471 s += abs(pix1[14] - pix2[14]);
1472 s += abs(pix1[15] - pix2[15]);
1479 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1480 int line_size, int h)
1484 for (i = 0; i < h; i++) {
1485 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1486 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1487 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1488 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1489 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1490 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1491 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1492 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1493 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1494 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1495 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1496 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1497 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1498 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1499 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1500 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1507 static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1508 int line_size, int h)
1511 uint8_t *pix3 = pix2 + line_size;
1513 for (i = 0; i < h; i++) {
1514 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1515 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1516 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1517 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1518 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1519 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1520 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1521 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1522 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1523 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1524 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1525 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1526 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1527 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1528 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1529 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1537 static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1538 int line_size, int h)
1541 uint8_t *pix3 = pix2 + line_size;
1543 for (i = 0; i < h; i++) {
1544 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1545 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1546 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1547 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1548 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1549 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1550 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1551 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1552 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1553 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1554 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1555 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1556 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1557 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1558 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1559 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1567 static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1568 int line_size, int h)
1572 for (i = 0; i < h; i++) {
1573 s += abs(pix1[0] - pix2[0]);
1574 s += abs(pix1[1] - pix2[1]);
1575 s += abs(pix1[2] - pix2[2]);
1576 s += abs(pix1[3] - pix2[3]);
1577 s += abs(pix1[4] - pix2[4]);
1578 s += abs(pix1[5] - pix2[5]);
1579 s += abs(pix1[6] - pix2[6]);
1580 s += abs(pix1[7] - pix2[7]);
1587 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1588 int line_size, int h)
1592 for (i = 0; i < h; i++) {
1593 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1594 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1595 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1596 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1597 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1598 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1599 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1600 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1607 static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1608 int line_size, int h)
1611 uint8_t *pix3 = pix2 + line_size;
1613 for (i = 0; i < h; i++) {
1614 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1615 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1616 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1617 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1618 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1619 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1620 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1621 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1629 static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1630 int line_size, int h)
1633 uint8_t *pix3 = pix2 + line_size;
1635 for (i = 0; i < h; i++) {
1636 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1637 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1638 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1639 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1640 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1641 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1642 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1643 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1651 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1653 int score1 = 0, score2 = 0, x, y;
1655 for (y = 0; y < h; y++) {
1656 for (x = 0; x < 16; x++)
1657 score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1659 for (x = 0; x < 15; x++)
1660 score2 += FFABS(s1[x] - s1[x + stride] -
1661 s1[x + 1] + s1[x + stride + 1]) -
1662 FFABS(s2[x] - s2[x + stride] -
1663 s2[x + 1] + s2[x + stride + 1]);
1670 return score1 + FFABS(score2) * c->avctx->nsse_weight;
1672 return score1 + FFABS(score2) * 8;
1675 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1677 int score1 = 0, score2 = 0, x, y;
1679 for (y = 0; y < h; y++) {
1680 for (x = 0; x < 8; x++)
1681 score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1683 for (x = 0; x < 7; x++)
1684 score2 += FFABS(s1[x] - s1[x + stride] -
1685 s1[x + 1] + s1[x + stride + 1]) -
1686 FFABS(s2[x] - s2[x + stride] -
1687 s2[x + 1] + s2[x + stride + 1]);
1694 return score1 + FFABS(score2) * c->avctx->nsse_weight;
1696 return score1 + FFABS(score2) * 8;
1699 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
1700 int16_t basis[64], int scale)
1703 unsigned int sum = 0;
1705 for (i = 0; i < 8 * 8; i++) {
1706 int b = rem[i] + ((basis[i] * scale +
1707 (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1708 (BASIS_SHIFT - RECON_SHIFT));
1711 av_assert2(-512 < b && b < 512);
1713 sum += (w * b) * (w * b) >> 4;
1718 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
1722 for (i = 0; i < 8 * 8; i++)
1723 rem[i] += (basis[i] * scale +
1724 (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
1725 (BASIS_SHIFT - RECON_SHIFT);
1728 static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
/* ff_set_cmp(): fill cmp[0..5] with the compare functions selected by the
 * FF_CMP_* id in the low byte of 'type'; index i is the block-size class.
 * NOTE(review): the case labels, several branches and the closing braces
 * of the switch are not visible in this excerpt. */
1734 void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
/* clear all six slots so unselected sizes stay NULL */
1738     memset(cmp, 0, sizeof(void *) * 6);
1740     for (i = 0; i < 6; i++) {
1741         switch (type & 0xFF) {
1746             cmp[i] = c->hadamard8_diff[i];
1752             cmp[i] = c->dct_sad[i];
1755             cmp[i] = c->dct264_sad[i];
1758             cmp[i] = c->dct_max[i];
1761             cmp[i] = c->quant_psnr[i];
1770             cmp[i] = c->vsad[i];
1773             cmp[i] = c->vsse[i];
1779             cmp[i] = c->nsse[i];
/* unknown id: report and leave the slot NULL */
1790             av_log(NULL, AV_LOG_ERROR,
1791                    "internal error in cmp function selection\n");
1796 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
1800 for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1801 long a = *(long *) (src + i);
1802 long b = *(long *) (dst + i);
1803 *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
1806 dst[i + 0] += src[i + 0];
1809 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w)
1813 #if !HAVE_FAST_UNALIGNED
1814 if ((long) src2 & (sizeof(long) - 1)) {
1815 for (i = 0; i + 7 < w; i += 8) {
1816 dst[i + 0] = src1[i + 0] - src2[i + 0];
1817 dst[i + 1] = src1[i + 1] - src2[i + 1];
1818 dst[i + 2] = src1[i + 2] - src2[i + 2];
1819 dst[i + 3] = src1[i + 3] - src2[i + 3];
1820 dst[i + 4] = src1[i + 4] - src2[i + 4];
1821 dst[i + 5] = src1[i + 5] - src2[i + 5];
1822 dst[i + 6] = src1[i + 6] - src2[i + 6];
1823 dst[i + 7] = src1[i + 7] - src2[i + 7];
1827 for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
1828 long a = *(long *) (src1 + i);
1829 long b = *(long *) (src2 + i);
1830 *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
1831 ((a ^ b ^ pb_80) & pb_80);
1834 dst[i + 0] = src1[i + 0] - src2[i + 0];
/**
 * HuffYUV median-prediction decoder step: reconstruct each pixel as
 * median(left, top, left + top - topleft) plus the coded residual.
 * src1 is the previous (top) line; *left / *left_top carry the running
 * state across calls and are updated on return.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *diff, int w,
                                         int *left, int *left_top)
{
    int i;
    uint8_t l = *left, lt = *left_top;

    for (i = 0; i < w; i++) {
        l      = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF) + diff[i];
        lt     = src1[i];
        dst[i] = l;
    }

    *left     = l;
    *left_top = lt;
}
/**
 * HuffYUV median-prediction encoder step: emit the residual of each pixel
 * of src2 (current line) against median(left, top, left + top - topleft),
 * where src1 is the previous line.  *left / *left_top carry the running
 * state across calls and are updated on return.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *src2, int w,
                                         int *left, int *left_top)
{
    int i;
    uint8_t l = *left, lt = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);

        lt     = src1[i];
        l      = src2[i];
        dst[i] = l - pred;
    }

    *left     = l;
    *left_top = lt;
}
/* add_hfyu_left_prediction_c(): accumulate left-neighbour prediction over
 * one line.  NOTE(review): the second parameter line, the loop bodies and
 * the return statement are not visible in this excerpt; the structure
 * suggests a 2x-unrolled main loop plus a scalar tail — TODO confirm. */
1878 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
/* main (presumably unrolled) loop */
1883 for (i = 0; i < w - 1; i++) {
/* tail loop for the remaining element(s) */
1891 for (; i < w; i++) {
1910 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
1911 int w, int *red, int *green,
1912 int *blue, int *alpha)
1914 int i, r = *red, g = *green, b = *blue, a = *alpha;
1916 for (i = 0; i < w; i++) {
1917 b += src[4 * i + B];
1918 g += src[4 * i + G];
1919 r += src[4 * i + R];
1920 a += src[4 * i + A];
/* Hadamard butterfly helpers: BUTTERFLY2 writes sum/difference of two
 * inputs to two outputs, BUTTERFLY1 does it in place, and BUTTERFLYA
 * evaluates |x + y| + |x - y|.  NOTE(review): the BUTTERFLY2/BUTTERFLY1
 * macro bodies are not visible in this excerpt. */
1938 #define BUTTERFLY2(o1, o2, i1, i2) \
1942 #define BUTTERFLY1(x, y) \
1951 #define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
/* hadamard8_diff8x8_c(): 8x8 Hadamard transform (SATD) of the difference
 * between 'src' and 'dst'; rows are transformed first, then columns, and
 * the sum of absolute transform coefficients is returned.  NOTE(review):
 * the opening brace and the final return are not visible in this excerpt. */
1953 static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
1954 uint8_t *src, int stride, int h)
1956 int i, temp[64], sum = 0;
/* horizontal pass: butterfly each row of the pixel difference */
1960 for (i = 0; i < 8; i++) {
1961 // FIXME: try pointer walks
1962 BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
1963 src[stride * i + 0] - dst[stride * i + 0],
1964 src[stride * i + 1] - dst[stride * i + 1]);
1965 BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
1966 src[stride * i + 2] - dst[stride * i + 2],
1967 src[stride * i + 3] - dst[stride * i + 3]);
1968 BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
1969 src[stride * i + 4] - dst[stride * i + 4],
1970 src[stride * i + 5] - dst[stride * i + 5]);
1971 BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
1972 src[stride * i + 6] - dst[stride * i + 6],
1973 src[stride * i + 7] - dst[stride * i + 7]);
1975 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
1976 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
1977 BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
1978 BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
1980 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
1981 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
1982 BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
1983 BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
/* vertical pass plus accumulation of absolute coefficients */
1986 for (i = 0; i < 8; i++) {
1987 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
1988 BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
1989 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
1990 BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
1992 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
1993 BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
1994 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
1995 BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
1997 sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
1998 BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
1999 BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
2000 BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
/* hadamard8_intra8x8_c(): 8x8 Hadamard transform of 'src' itself (intra
 * variant — 'dummy' is ignored); the DC contribution is subtracted at the
 * end so the score measures AC energy only.  NOTE(review): the opening
 * brace and final return are not visible in this excerpt. */
2005 static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
2006 uint8_t *dummy, int stride, int h)
2008 int i, temp[64], sum = 0;
/* horizontal pass over the source rows */
2012 for (i = 0; i < 8; i++) {
2013 // FIXME: try pointer walks
2014 BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
2015 src[stride * i + 0], src[stride * i + 1]);
2016 BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
2017 src[stride * i + 2], src[stride * i + 3]);
2018 BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
2019 src[stride * i + 4], src[stride * i + 5]);
2020 BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
2021 src[stride * i + 6], src[stride * i + 7]);
2023 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
2024 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
2025 BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
2026 BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);
2028 BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
2029 BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
2030 BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
2031 BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
/* vertical pass plus accumulation */
2034 for (i = 0; i < 8; i++) {
2035 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
2036 BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
2037 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
2038 BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);
2040 BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
2041 BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
2042 BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
2043 BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);
2046 BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
2047 + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
2048 + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
2049 + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
/* remove the DC term so only AC energy is scored */
2052 sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
2057 static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
2058 uint8_t *src2, int stride, int h)
2060 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2064 s->dsp.diff_pixels(temp, src1, src2, stride);
2066 return s->dsp.sum_abs_dctelem(temp);
2072 const int s07 = SRC(0) + SRC(7); \
2073 const int s16 = SRC(1) + SRC(6); \
2074 const int s25 = SRC(2) + SRC(5); \
2075 const int s34 = SRC(3) + SRC(4); \
2076 const int a0 = s07 + s34; \
2077 const int a1 = s16 + s25; \
2078 const int a2 = s07 - s34; \
2079 const int a3 = s16 - s25; \
2080 const int d07 = SRC(0) - SRC(7); \
2081 const int d16 = SRC(1) - SRC(6); \
2082 const int d25 = SRC(2) - SRC(5); \
2083 const int d34 = SRC(3) - SRC(4); \
2084 const int a4 = d16 + d25 + (d07 + (d07 >> 1)); \
2085 const int a5 = d07 - d34 - (d25 + (d25 >> 1)); \
2086 const int a6 = d07 + d34 - (d16 + (d16 >> 1)); \
2087 const int a7 = d16 - d25 + (d34 + (d34 >> 1)); \
2089 DST(1, a4 + (a7 >> 2)); \
2090 DST(2, a2 + (a3 >> 1)); \
2091 DST(3, a5 + (a6 >> 2)); \
2093 DST(5, a6 - (a5 >> 2)); \
2094 DST(6, (a2 >> 1) - a3); \
2095 DST(7, (a4 >> 2) - a7); \
/* dct264_sad8x8_c(): SATD using the H.264 8x8 integer transform — the
 * DCT8_1D macro is applied to rows (SRC/DST index within a row) and then
 * to columns, where DST accumulates absolute values.  NOTE(review): the
 * declarations, the DCT8_1D invocations and the #undef/return lines are
 * not visible in this excerpt. */
2098 static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
2099 uint8_t *src2, int stride, int h)
2104 s->dsp.diff_pixels(dct[0], src1, src2, stride);
/* row pass: transform each row of the residual in place */
2106 #define SRC(x) dct[i][x]
2107 #define DST(x, v) dct[i][x] = v
2108 for (i = 0; i < 8; i++)
/* column pass: re-bind SRC/DST so the transform output is accumulated */
2113 #define SRC(x) dct[x][i]
2114 #define DST(x, v) sum += FFABS(v)
2115 for (i = 0; i < 8; i++)
2123 static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
2124 uint8_t *src2, int stride, int h)
2126 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2131 s->dsp.diff_pixels(temp, src1, src2, stride);
2134 for (i = 0; i < 64; i++)
2135 sum = FFMAX(sum, FFABS(temp[i]));
2140 static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
2141 uint8_t *src2, int stride, int h)
2143 LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
2144 int16_t *const bak = temp + 64;
2150 s->dsp.diff_pixels(temp, src1, src2, stride);
2152 memcpy(bak, temp, 64 * sizeof(int16_t));
2154 s->block_last_index[0 /* FIXME */] =
2155 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2156 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2157 ff_simple_idct_8(temp); // FIXME
2159 for (i = 0; i < 64; i++)
2160 sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
/* rd8x8_c(): rate-distortion compare — quantizes the 8x8 residual, counts
 * the VLC bits needed to code it, reconstructs the block and measures the
 * SSE distortion, returning distortion + lambda-scaled rate.
 * NOTE(review): several lines (intra/inter branch heads, run/level
 * updates, loop closings) are not visible in this excerpt. */
2165 static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2168 const uint8_t *scantable = s->intra_scantable.permutated;
2169 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2170 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2171 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2172 int i, last, run, bits, level, distortion, start_i;
2173 const int esc_length = s->ac_esc_length;
2174 uint8_t *length, *last_length;
/* work on local copies so the reconstruction does not touch the caller's data */
2178 copy_block8(lsrc1, src1, 8, stride, 8);
2179 copy_block8(lsrc2, src2, 8, stride, 8);
2181 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2183 s->block_last_index[0 /* FIXME */] =
2185 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
/* pick intra or inter VLC tables for the rate estimate */
2191 length = s->intra_ac_vlc_length;
2192 last_length = s->intra_ac_vlc_last_length;
2193 bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2196 length = s->inter_ac_vlc_length;
2197 last_length = s->inter_ac_vlc_last_length;
/* count the bits of every (run, level) pair in scan order */
2200 if (last >= start_i) {
2202 for (i = start_i; i < last; i++) {
2203 int j = scantable[i];
2208 if ((level & (~127)) == 0)
2209 bits += length[UNI_AC_ENC_INDEX(run, level)];
/* the final coefficient uses the "last" VLC table */
2216 i = scantable[last];
2218 level = temp[i] + 64;
2220 av_assert2(level - 64);
2222 if ((level & (~127)) == 0) {
2223 bits += last_length[UNI_AC_ENC_INDEX(run, level)];
/* reconstruct and measure the distortion against the original block */
2230 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2232 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2235 s->dsp.idct_add(lsrc2, 8, temp);
2237 distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2239 return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
/* bit8x8_c(): rate-only compare — quantizes the 8x8 residual and returns
 * the number of VLC bits needed to code it (FF_CMP_BIT).  NOTE(review):
 * several lines (branch heads, run/level updates, the final return) are
 * not visible in this excerpt. */
2242 static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
2245 const uint8_t *scantable = s->intra_scantable.permutated;
2246 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2247 int i, last, run, bits, level, start_i;
2248 const int esc_length = s->ac_esc_length;
2249 uint8_t *length, *last_length;
2253 s->dsp.diff_pixels(temp, src1, src2, stride);
2255 s->block_last_index[0 /* FIXME */] =
2257 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
/* pick intra or inter VLC tables */
2263 length = s->intra_ac_vlc_length;
2264 last_length = s->intra_ac_vlc_last_length;
2265 bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2268 length = s->inter_ac_vlc_length;
2269 last_length = s->inter_ac_vlc_last_length;
/* count the bits of every (run, level) pair in scan order */
2272 if (last >= start_i) {
2274 for (i = start_i; i < last; i++) {
2275 int j = scantable[i];
2280 if ((level & (~127)) == 0)
2281 bits += length[UNI_AC_ENC_INDEX(run, level)];
/* final coefficient uses the "last" VLC table */
2288 i = scantable[last];
2290 level = temp[i] + 64;
2292 av_assert2(level - 64);
2294 if ((level & (~127)) == 0)
2295 bits += last_length[UNI_AC_ENC_INDEX(run, level)];
/* VSAD_INTRA(size): vertical-gradient SAD of one block — sums the
 * absolute differences between vertically adjacent rows of 's', four
 * columns per step.  NOTE(review): the macro tail (row advance, return)
 * and the VSAD_INTRA(8/16) instantiations are not visible here; comments
 * are kept outside the backslash continuation. */
2303 #define VSAD_INTRA(size) \
2304 static int vsad_intra ## size ## _c(MpegEncContext *c, \
2305 uint8_t *s, uint8_t *dummy, \
2306 int stride, int h) \
2308 int score = 0, x, y; \
2310 for (y = 1; y < h; y++) { \
2311 for (x = 0; x < size; x += 4) { \
2312 score += FFABS(s[x] - s[x + stride]) + \
2313 FFABS(s[x + 1] - s[x + stride + 1]) + \
2314 FFABS(s[x + 2] - s[x + 2 + stride]) + \
2315 FFABS(s[x + 3] - s[x + 3 + stride]); \
2325 static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2328 int score = 0, x, y;
2330 for (y = 1; y < h; y++) {
2331 for (x = 0; x < 16; x++)
2332 score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
/* SQ(a): squared-value helper for the vertical SSE metrics below. */
2340 #define SQ(a) ((a) * (a))
/* VSSE_INTRA(size): vertical-gradient SSE of one block — like VSAD_INTRA
 * but squaring the row differences.  NOTE(review): the macro tail and the
 * VSSE_INTRA(8/16) instantiations are not visible here; comments are kept
 * outside the backslash continuation. */
2341 #define VSSE_INTRA(size) \
2342 static int vsse_intra ## size ## _c(MpegEncContext *c, \
2343 uint8_t *s, uint8_t *dummy, \
2344 int stride, int h) \
2346 int score = 0, x, y; \
2348 for (y = 1; y < h; y++) { \
2349 for (x = 0; x < size; x += 4) { \
2350 score += SQ(s[x] - s[x + stride]) + \
2351 SQ(s[x + 1] - s[x + stride + 1]) + \
2352 SQ(s[x + 2] - s[x + stride + 2]) + \
2353 SQ(s[x + 3] - s[x + stride + 3]); \
2363 static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2366 int score = 0, x, y;
2368 for (y = 1; y < h; y++) {
2369 for (x = 0; x < 16; x++)
2370 score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
/**
 * Sum of squared differences between an int8 vector and an int16 vector
 * of 'size' elements.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int i, score = 0;

    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];

        score += d * d;
    }

    return score;
}
/* WRAPPER8_16_SQ(name8, name16): build a 16x16 compare function from an
 * 8x8 one by summing the four 8x8 quadrant scores.  NOTE(review): the
 * macro tail (final return) is not visible here; comments are kept
 * outside the backslash continuation. */
2388 #define WRAPPER8_16_SQ(name8, name16) \
2389 static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \
2390 int stride, int h) \
2394 score += name8(s, dst, src, stride, 8); \
2395 score += name8(s, dst + 8, src + 8, stride, 8); \
2397 dst += 8 * stride; \
2398 src += 8 * stride; \
2399 score += name8(s, dst, src, stride, 8); \
2400 score += name8(s, dst + 8, src + 8, stride, 8); \
2405 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2406 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2407 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2409 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2411 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2412 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2413 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2414 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/**
 * Clip one float value, carried as its IEEE-754 bit pattern, for the
 * opposite-sign (min < 0 < max) case: positive patterns above 'mini'
 * clamp to the max bound's pattern, negative ones beyond 'maxisign'
 * (the min bound with the sign bit flipped) clamp to the min bound.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if (a > mini)
        return mini;
    if ((a ^ (1U << 31)) > maxisign)
        return maxi;
    return a;
}
/* Clip a float vector to [*min, *max] for the case min < 0 && max > 0,
 * operating on the raw uint32 bit patterns via clipf_c_one(). Processes len
 * in unrolled chunks of 8, so len is presumably a multiple of 8 — confirm
 * against callers.
 * NOTE(review): the pointer casts from float* to uint32_t* are a strict-
 * aliasing violation by modern standards; FFmpeg builds with flags that
 * tolerate this, but do not imitate in new code (use a union or memcpy).
 * The declaration of `i` and closing braces are elided from this listing. */
2427 static void vector_clipf_c_opposite_sign(float *dst, const float *src,
2428                                          float *min, float *max, int len)
2431     uint32_t mini = *(uint32_t *) min;
2432     uint32_t maxi = *(uint32_t *) max;
     /* maxi with the sign bit flipped, precomputed for clipf_c_one() */
2433     uint32_t maxisign = maxi ^ (1U << 31);
2434     uint32_t *dsti = (uint32_t *) dst;
2435     const uint32_t *srci = (const uint32_t *) src;
2437     for (i = 0; i < len; i += 8) {
2438         dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2439         dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2440         dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2441         dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2442         dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2443         dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2444         dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2445         dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip a float vector to [min, max], unrolled by 8 (len presumably a
 * multiple of 8 — confirm against callers). When the range spans zero it
 * delegates to the bit-pattern fast path above; otherwise (an `else` branch
 * elided from this listing, by the look of the numbering) it uses av_clipf
 * per element.
 * NOTE(review): the declaration of `i`, the else keyword and the closing
 * braces are not visible in this listing. */
2449 static void vector_clipf_c(float *dst, const float *src,
2450                            float min, float max, int len)
2454     if (min < 0 && max > 0) {
2455         vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2457         for (i = 0; i < len; i += 8) {
2458             dst[i]     = av_clipf(src[i], min, max);
2459             dst[i + 1] = av_clipf(src[i + 1], min, max);
2460             dst[i + 2] = av_clipf(src[i + 2], min, max);
2461             dst[i + 3] = av_clipf(src[i + 3], min, max);
2462             dst[i + 4] = av_clipf(src[i + 4], min, max);
2463             dst[i + 5] = av_clipf(src[i + 5], min, max);
2464             dst[i + 6] = av_clipf(src[i + 6], min, max);
2465             dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Dot product of two int16 vectors, accumulated into int32.
 * NOTE(review): most of this function (order/length parameter, loop header,
 * declaration of `res`, return) is elided from this listing; only the
 * accumulation statement is visible. `**` here is `* (*v2++)`, i.e.
 * multiply-by-dereference, not exponentiation. */
2470 static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
2476         res += *v1++ **v2++;
/* Dot product of v1 and v2 combined with an in-place multiply-add of
 * mul * v3 into v1 (the visible statement). Used by ALS/TTA style decoders.
 * NOTE(review): the remaining parameters (v3, order, mul), the accumulation
 * of the dot product, the loop and the return are elided from this listing. */
2481 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
2489         *v1++ += mul * *v3++;
/* Clip an int32 vector to [min, max], manually unrolled by 8 elements per
 * iteration — so len is presumably a multiple of 8; confirm against callers.
 * NOTE(review): the loop header (`do { ... } while (len -= 8)` or similar)
 * and closing braces are elided from this listing. */
2494 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2495                                 int32_t max, unsigned int len)
2498         *dst++ = av_clip(*src++, min, max);
2499         *dst++ = av_clip(*src++, min, max);
2500         *dst++ = av_clip(*src++, min, max);
2501         *dst++ = av_clip(*src++, min, max);
2502         *dst++ = av_clip(*src++, min, max);
2503         *dst++ = av_clip(*src++, min, max);
2504         *dst++ = av_clip(*src++, min, max);
2505         *dst++ = av_clip(*src++, min, max);
/* IDCT wrappers using the libjpeg-reference reverse DCT (ff_j_rev_dct):
 * _put overwrites the destination with the clamped result, _add adds the
 * clamped result to the existing destination pixels.
 * (Closing braces elided from this listing.) */
2510 static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2512     ff_j_rev_dct(block);
2513     put_pixels_clamped_c(block, dest, line_size);
2516 static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2518     ff_j_rev_dct(block);
2519     add_pixels_clamped_c(block, dest, line_size);
/* 4x4 reduced-resolution variants of the reference IDCT wrappers,
 * used for lowres==1 decoding (see ff_dsputil_init below).
 * (Closing braces elided from this listing.) */
2522 static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
2524     ff_j_rev_dct4 (block);
2525     put_pixels_clamped4_c(block, dest, line_size);
2527 static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
2529     ff_j_rev_dct4 (block);
2530     add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 reduced-resolution variants, used for lowres==2 decoding.
 * (Closing braces elided from this listing.) */
2533 static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
2535     ff_j_rev_dct2 (block);
2536     put_pixels_clamped2_c(block, dest, line_size);
2538 static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
2540     ff_j_rev_dct2 (block);
2541     add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 (DC-only) variants for lowres==3: the IDCT of a single DC coefficient
 * is (block[0] + 4) >> 3, rounded and clipped to uint8 range.
 * (Closing braces elided from this listing.) */
2544 static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
2546     dest[0] = av_clip_uint8((block[0] + 4)>>3);
2548 static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
2550     dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2553 /* draw the edges of width 'w' of an image of size width, height */
2554 // FIXME: Check that this is OK for MPEG-4 interlaced.
/* Pads an image by replicating its border pixels outward: each row's first
 * and last pixel are smeared left/right by w, then the top/bottom rows
 * (including the freshly written corners, hence width + 2*w) are copied
 * upward/downward h times, gated by the EDGE_TOP / EDGE_BOTTOM bits of
 * `sides`. NOTE(review): the `ptr += wrap` row advance, declaration of `i`
 * and closing braces are elided from this listing. */
2555 static void draw_edges_8_c(uint8_t *buf, int wrap, int width, int height,
2556                            int w, int h, int sides)
2558     uint8_t *ptr = buf, *last_line;
2561     /* left and right */
2562     for (i = 0; i < height; i++) {
2563         memset(ptr - w, ptr[0], w);
2564         memset(ptr + width, ptr[width - 1], w);
2568     /* top and bottom + corners */
2570     last_line = buf + (height - 1) * wrap;
2571     if (sides & EDGE_TOP)
2572         for (i = 0; i < h; i++)
2574             memcpy(buf - (i + 1) * wrap, buf, width + w + w);
2575     if (sides & EDGE_BOTTOM)
2576         for (i = 0; i < h; i++)
2578             memcpy(last_line + (i + 1) * wrap, last_line, width + w + w);
/* Zero one 8x8 block of int16 coefficients, and all six blocks of a
 * macroblock (4 luma + 2 chroma), respectively.
 * (Closing braces elided from this listing.) */
2581 static void clear_block_8_c(int16_t *block)
2583     memset(block, 0, sizeof(int16_t) * 64);
2586 static void clear_blocks_8_c(int16_t *blocks)
2588     memset(blocks, 0, sizeof(int16_t) * 6 * 64);
2591 /* init static data */
/* Fill ff_square_tab so that ff_square_tab[i + 256] == i*i for
 * i in [-256, 255]; used by the SSE comparison functions.
 * (Declaration of `i` and closing brace elided from this listing.) */
2592 av_cold void ff_dsputil_static_init(void)
2596     for (i = 0; i < 512; i++)
2597         ff_square_tab[i] = (i - 256) * (i - 256);
/* Sanity check that the compiler honors 16-byte stack alignment for
 * LOCAL_ALIGNED_16 variables; on MMX/AltiVec builds a miscompiled stack
 * would crash the SIMD code, so log a loud one-time warning (did_fail
 * presumably gates repetition — the logic using it is elided here).
 * NOTE(review): the return statements and closing braces are elided from
 * this listing. */
2600 int ff_check_alignment(void)
2602     static int did_fail = 0;
2603     LOCAL_ALIGNED_16(int, aligned, [4]);
2605     if ((intptr_t)aligned & 15) {
2607 #if HAVE_MMX || HAVE_ALTIVEC
2608         av_log(NULL, AV_LOG_ERROR,
2609             "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2610             "and may be very slow or crash. This is not a bug in libavcodec,\n"
2611             "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2612             "Do not report crashes to FFmpeg developers.\n");
/* Populate a DSPContext with the C reference implementations, selecting
 * FDCT/IDCT variants by bit depth, dct_algo/idct_algo and lowres, then let
 * per-architecture init functions override entries with SIMD versions.
 * NOTE(review): this listing elides many lines (the embedded numbering
 * jumps): #if CONFIG_ENCODERS guards, else branches, closing braces and
 * several function-pointer assignments are not visible — treat the visible
 * structure as partial. */
2621 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2623     const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
2625     ff_check_alignment();
     /* forward DCT selection (encoder side): 10-bit gets the islow_10
      * pair; otherwise dct_algo picks ifast / FAAN / islow_8 (default) */
2628     if (avctx->bits_per_raw_sample == 10) {
2629         c->fdct    = ff_jpeg_fdct_islow_10;
2630         c->fdct248 = ff_fdct248_islow_10;
2632     if (avctx->dct_algo == FF_DCT_FASTINT) {
2633         c->fdct    = ff_fdct_ifast;
2634         c->fdct248 = ff_fdct_ifast248;
2635     } else if (avctx->dct_algo == FF_DCT_FAAN) {
2636         c->fdct    = ff_faandct;
2637         c->fdct248 = ff_faandct248;
2639         c->fdct    = ff_jpeg_fdct_islow_8; // slow/accurate/default
2640         c->fdct248 = ff_fdct248_islow_8;
2643 #endif /* CONFIG_ENCODERS */
     /* inverse DCT selection: lowres 1/2/3 use the reduced 4x4/2x2/1x1
      * reference IDCTs defined above */
2645     if (avctx->lowres==1) {
2646         c->idct_put              = ff_jref_idct4_put;
2647         c->idct_add              = ff_jref_idct4_add;
2648         c->idct                  = ff_j_rev_dct4;
2649         c->idct_permutation_type = FF_NO_IDCT_PERM;
2650     } else if (avctx->lowres==2) {
2651         c->idct_put              = ff_jref_idct2_put;
2652         c->idct_add              = ff_jref_idct2_add;
2653         c->idct                  = ff_j_rev_dct2;
2654         c->idct_permutation_type = FF_NO_IDCT_PERM;
2655     } else if (avctx->lowres==3) {
2656         c->idct_put              = ff_jref_idct1_put;
2657         c->idct_add              = ff_jref_idct1_add;
2658         c->idct                  = ff_j_rev_dct1;
2659         c->idct_permutation_type = FF_NO_IDCT_PERM;
     /* full resolution: 10/12-bit use simple_idct variants, 8-bit picks
      * by idct_algo (jref int / FAAN / simple default) */
2661     if (avctx->bits_per_raw_sample == 10) {
2662         c->idct_put              = ff_simple_idct_put_10;
2663         c->idct_add              = ff_simple_idct_add_10;
2664         c->idct                  = ff_simple_idct_10;
2665         c->idct_permutation_type = FF_NO_IDCT_PERM;
2666     } else if (avctx->bits_per_raw_sample == 12) {
2667         c->idct_put              = ff_simple_idct_put_12;
2668         c->idct_add              = ff_simple_idct_add_12;
2669         c->idct                  = ff_simple_idct_12;
2670         c->idct_permutation_type = FF_NO_IDCT_PERM;
2672     if (avctx->idct_algo == FF_IDCT_INT) {
2673         c->idct_put              = jref_idct_put;
2674         c->idct_add              = jref_idct_add;
2675         c->idct                  = ff_j_rev_dct;
2676         c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2677     } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2678         c->idct_put              = ff_faanidct_put;
2679         c->idct_add              = ff_faanidct_add;
2680         c->idct                  = ff_faanidct;
2681         c->idct_permutation_type = FF_NO_IDCT_PERM;
2682     } else { // accurate/default
2683         c->idct_put              = ff_simple_idct_put_8;
2684         c->idct_add              = ff_simple_idct_add_8;
2685         c->idct                  = ff_simple_idct_8;
2686         c->idct_permutation_type = FF_NO_IDCT_PERM;
     /* pixel-block helpers */
2691     c->diff_pixels = diff_pixels_c;
2693     c->put_pixels_clamped        = put_pixels_clamped_c;
2694     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2695     c->add_pixels_clamped        = add_pixels_clamped_c;
2697     c->sum_abs_dctelem = sum_abs_dctelem_c;
2702     c->pix_sum   = pix_sum_c;
2703     c->pix_norm1 = pix_norm1_c;
2705     c->fill_block_tab[0] = fill_block16_c;
2706     c->fill_block_tab[1] = fill_block8_c;
2708     /* TODO [0] 16 [1] 8 */
     /* SAD tables: [0] = 16-wide, [1] = 8-wide; columns are the four
      * half-pel interpolation variants (none, x2, y2, xy2) */
2709     c->pix_abs[0][0] = pix_abs16_c;
2710     c->pix_abs[0][1] = pix_abs16_x2_c;
2711     c->pix_abs[0][2] = pix_abs16_y2_c;
2712     c->pix_abs[0][3] = pix_abs16_xy2_c;
2713     c->pix_abs[1][0] = pix_abs8_c;
2714     c->pix_abs[1][1] = pix_abs8_x2_c;
2715     c->pix_abs[1][2] = pix_abs8_y2_c;
2716     c->pix_abs[1][3] = pix_abs8_xy2_c;
/* dspfunc(PFX, IDX, NUM): fill one 16-entry quarter-pel motion-compensation
 * table; entry index encodes the (x,y) quarter-pel phase as mcXY. */
2718 #define dspfunc(PFX, IDX, NUM)                              \
2719     c->PFX ## _pixels_tab[IDX][0]  = PFX ## NUM ## _mc00_c; \
2720     c->PFX ## _pixels_tab[IDX][1]  = PFX ## NUM ## _mc10_c; \
2721     c->PFX ## _pixels_tab[IDX][2]  = PFX ## NUM ## _mc20_c; \
2722     c->PFX ## _pixels_tab[IDX][3]  = PFX ## NUM ## _mc30_c; \
2723     c->PFX ## _pixels_tab[IDX][4]  = PFX ## NUM ## _mc01_c; \
2724     c->PFX ## _pixels_tab[IDX][5]  = PFX ## NUM ## _mc11_c; \
2725     c->PFX ## _pixels_tab[IDX][6]  = PFX ## NUM ## _mc21_c; \
2726     c->PFX ## _pixels_tab[IDX][7]  = PFX ## NUM ## _mc31_c; \
2727     c->PFX ## _pixels_tab[IDX][8]  = PFX ## NUM ## _mc02_c; \
2728     c->PFX ## _pixels_tab[IDX][9]  = PFX ## NUM ## _mc12_c; \
2729     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2730     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2731     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2732     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2733     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2734     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2736     dspfunc(put_qpel, 0, 16);
2737     dspfunc(put_qpel, 1, 8);
2739     dspfunc(put_no_rnd_qpel, 0, 16);
2740     dspfunc(put_no_rnd_qpel, 1, 8);
2742     dspfunc(avg_qpel, 0, 16);
2743     dspfunc(avg_qpel, 1, 8);
     /* WMV2 mspel (half-pel with special rounding) table */
2747     c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
2748     c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
2749     c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
2750     c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
2751     c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
2752     c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
2753     c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
2754     c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
/* SET_CMP_FUNC(name): wire the 16x16 and 8x8 variants of a comparison
 * metric into slots [0] and [1] of its function table. */
2756 #define SET_CMP_FUNC(name)          \
2757     c->name[0] = name ## 16_c;      \
2758     c->name[1] = name ## 8x8_c;
2760     SET_CMP_FUNC(hadamard8_diff)
2761     c->hadamard8_diff[4] = hadamard8_intra16_c;
2762     c->hadamard8_diff[5] = hadamard8_intra8x8_c;
2763     SET_CMP_FUNC(dct_sad)
2764     SET_CMP_FUNC(dct_max)
2766     SET_CMP_FUNC(dct264_sad)
2768     c->sad[0] = pix_abs16_c;
2769     c->sad[1] = pix_abs8_c;
2770     c->sse[0] = sse16_c;
2773     SET_CMP_FUNC(quant_psnr)
2776     c->vsad[0] = vsad16_c;
2777     c->vsad[4] = vsad_intra16_c;
2778     c->vsad[5] = vsad_intra8_c;
2779     c->vsse[0] = vsse16_c;
2780     c->vsse[4] = vsse_intra16_c;
2781     c->vsse[5] = vsse_intra8_c;
2782     c->nsse[0] = nsse16_c;
2783     c->nsse[1] = nsse8_c;
2784 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2785     ff_dsputil_init_dwt(c);
2788     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
     /* HuffYUV prediction helpers */
2790     c->add_bytes                      = add_bytes_c;
2791     c->add_hfyu_median_prediction     = add_hfyu_median_prediction_c;
2792     c->add_hfyu_left_prediction       = add_hfyu_left_prediction_c;
2793     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2795     c->diff_bytes                 = diff_bytes_c;
2796     c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
2798     c->bswap_buf   = bswap_buf;
2799     c->bswap16_buf = bswap16_buf;
2801     c->try_8x8basis = try_8x8basis_c;
2802     c->add_8x8basis = add_8x8basis_c;
2804     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2806     c->scalarproduct_int16 = scalarproduct_int16_c;
2807     c->vector_clip_int32   = vector_clip_int32_c;
2808     c->vector_clipf        = vector_clipf_c;
     /* shrink[n] halves each dimension n times; [0] is a plain copy */
2810     c->shrink[0] = av_image_copy_plane;
2811     c->shrink[1] = ff_shrink22;
2812     c->shrink[2] = ff_shrink44;
2813     c->shrink[3] = ff_shrink88;
2815     c->add_pixels8 = add_pixels8_c;
2817     c->draw_edges = draw_edges_8_c;
2819     c->clear_block  = clear_block_8_c;
2820     c->clear_blocks = clear_blocks_8_c;
     /* bit-depth dependent get_pixels: 16-bit storage for >8-bit video */
2822     switch (avctx->bits_per_raw_sample) {
2827         c->get_pixels = get_pixels_16_c;
2830     if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
2831         c->get_pixels = get_pixels_8_c;
     /* architecture-specific overrides (no-ops on other platforms) */
2838     if (ARCH_ALPHA)
         ff_dsputil_init_alpha(c, avctx);
2840         ff_dsputil_init_arm(c, avctx, high_bit_depth);
2842         ff_dsputil_init_bfin(c, avctx, high_bit_depth);
2844         ff_dsputil_init_ppc(c, avctx, high_bit_depth);
2846         ff_dsputil_init_x86(c, avctx, high_bit_depth);
2848     ff_init_scantable_permutation(c->idct_permutation,
2849                                   c->idct_permutation_type);
/* Deprecated public alias kept for ABI compatibility; forwards to
 * ff_dsputil_init. (Closing brace elided from this listing.) */
2852 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2854     ff_dsputil_init(c, avctx);
2857 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2859 ff_dsputil_init(c, avctx);