/*
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "libavutil/internal.h"
34 #include "copy_block.h"
37 #include "simple_idct.h"
40 #include "imgconvert.h"
42 #include "mpegvideo.h"
/* Table of squares, filled at runtime by the DSP init code; index is
 * biased by 256 so that sq[-255..255] (via a +256 base pointer) is valid. */
uint32_t ff_square_tab[512] = { 0, };
49 #include "dsputil_template.c"
53 #include "tpel_template.c"
54 #include "dsputil_template.c"
/* 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's
 * native arithmetic size: every byte of the word holds the constant. */
#define pb_7f (~0UL / 255 * 0x7f)
#define pb_80 (~0UL / 255 * 0x80)
/* Specific zigzag scan for 248 idct. NOTE that unlike the
 * specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* Alternate horizontal scan order (MPEG-2 style), 8x8 coefficient indices. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (MPEG-2 style), 8x8 coefficient indices. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Row permutation used by the SSE2 IDCT: interleaves low/high half rows. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
109 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
110 const uint8_t *src_scantable)
114 st->scantable = src_scantable;
116 for (i = 0; i < 64; i++) {
117 int j = src_scantable[i];
118 st->permutated[i] = permutation[j];
122 for (i = 0; i < 64; i++) {
123 int j = st->permutated[i];
126 st->raster_end[i] = end;
130 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
131 int idct_permutation_type)
135 switch (idct_permutation_type) {
136 case FF_NO_IDCT_PERM:
137 for (i = 0; i < 64; i++)
138 idct_permutation[i] = i;
140 case FF_LIBMPEG2_IDCT_PERM:
141 for (i = 0; i < 64; i++)
142 idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
144 case FF_SIMPLE_IDCT_PERM:
145 for (i = 0; i < 64; i++)
146 idct_permutation[i] = simple_mmx_permutation[i];
148 case FF_TRANSPOSE_IDCT_PERM:
149 for (i = 0; i < 64; i++)
150 idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
152 case FF_PARTTRANS_IDCT_PERM:
153 for (i = 0; i < 64; i++)
154 idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
156 case FF_SSE2_IDCT_PERM:
157 for (i = 0; i < 64; i++)
158 idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
161 av_log(NULL, AV_LOG_ERROR,
162 "Internal error, IDCT permutation not set\n");
/**
 * Sum all 256 pixel values of a 16x16 block.
 *
 * @param pix       top-left pixel of the block
 * @param line_size stride in bytes between rows
 * @return sum of the pixels (max 16*16*255, fits in int)
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s   += pix[0];
            s   += pix[1];
            s   += pix[2];
            s   += pix[3];
            s   += pix[4];
            s   += pix[5];
            s   += pix[6];
            s   += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
187 static int pix_norm1_c(uint8_t *pix, int line_size)
190 uint32_t *sq = ff_square_tab + 256;
192 for (i = 0; i < 16; i++) {
193 for (j = 0; j < 16; j += 8) {
205 register uint64_t x = *(uint64_t *) pix;
207 s += sq[(x >> 8) & 0xff];
208 s += sq[(x >> 16) & 0xff];
209 s += sq[(x >> 24) & 0xff];
210 s += sq[(x >> 32) & 0xff];
211 s += sq[(x >> 40) & 0xff];
212 s += sq[(x >> 48) & 0xff];
213 s += sq[(x >> 56) & 0xff];
215 register uint32_t x = *(uint32_t *) pix;
217 s += sq[(x >> 8) & 0xff];
218 s += sq[(x >> 16) & 0xff];
219 s += sq[(x >> 24) & 0xff];
220 x = *(uint32_t *) (pix + 4);
222 s += sq[(x >> 8) & 0xff];
223 s += sq[(x >> 16) & 0xff];
224 s += sq[(x >> 24) & 0xff];
229 pix += line_size - 16;
/**
 * Byte-swap a buffer of 32-bit words (unrolled by 8, with scalar tail).
 *
 * @param dst destination (may equal src)
 * @param src source words
 * @param w   number of 32-bit words
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int i;

    for (i = 0; i + 8 <= w; i += 8) {
        dst[i + 0] = av_bswap32(src[i + 0]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    for (; i < w; i++)
        dst[i + 0] = av_bswap32(src[i + 0]);
}
/**
 * Byte-swap a buffer of 16-bit words.
 *
 * @param dst destination (may equal src)
 * @param src source words
 * @param len number of 16-bit words
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
258 static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
259 int line_size, int h)
262 uint32_t *sq = ff_square_tab + 256;
264 for (i = 0; i < h; i++) {
265 s += sq[pix1[0] - pix2[0]];
266 s += sq[pix1[1] - pix2[1]];
267 s += sq[pix1[2] - pix2[2]];
268 s += sq[pix1[3] - pix2[3]];
275 static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
276 int line_size, int h)
279 uint32_t *sq = ff_square_tab + 256;
281 for (i = 0; i < h; i++) {
282 s += sq[pix1[0] - pix2[0]];
283 s += sq[pix1[1] - pix2[1]];
284 s += sq[pix1[2] - pix2[2]];
285 s += sq[pix1[3] - pix2[3]];
286 s += sq[pix1[4] - pix2[4]];
287 s += sq[pix1[5] - pix2[5]];
288 s += sq[pix1[6] - pix2[6]];
289 s += sq[pix1[7] - pix2[7]];
296 static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
297 int line_size, int h)
300 uint32_t *sq = ff_square_tab + 256;
302 for (i = 0; i < h; i++) {
303 s += sq[pix1[0] - pix2[0]];
304 s += sq[pix1[1] - pix2[1]];
305 s += sq[pix1[2] - pix2[2]];
306 s += sq[pix1[3] - pix2[3]];
307 s += sq[pix1[4] - pix2[4]];
308 s += sq[pix1[5] - pix2[5]];
309 s += sq[pix1[6] - pix2[6]];
310 s += sq[pix1[7] - pix2[7]];
311 s += sq[pix1[8] - pix2[8]];
312 s += sq[pix1[9] - pix2[9]];
313 s += sq[pix1[10] - pix2[10]];
314 s += sq[pix1[11] - pix2[11]];
315 s += sq[pix1[12] - pix2[12]];
316 s += sq[pix1[13] - pix2[13]];
317 s += sq[pix1[14] - pix2[14]];
318 s += sq[pix1[15] - pix2[15]];
326 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
327 const uint8_t *s2, int stride)
331 /* read the pixels */
332 for (i = 0; i < 8; i++) {
333 block[0] = s1[0] - s2[0];
334 block[1] = s1[1] - s2[1];
335 block[2] = s1[2] - s2[2];
336 block[3] = s1[3] - s2[3];
337 block[4] = s1[4] - s2[4];
338 block[5] = s1[5] - s2[5];
339 block[6] = s1[6] - s2[6];
340 block[7] = s1[7] - s2[7];
347 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
352 /* read the pixels */
353 for (i = 0; i < 8; i++) {
354 pixels[0] = av_clip_uint8(block[0]);
355 pixels[1] = av_clip_uint8(block[1]);
356 pixels[2] = av_clip_uint8(block[2]);
357 pixels[3] = av_clip_uint8(block[3]);
358 pixels[4] = av_clip_uint8(block[4]);
359 pixels[5] = av_clip_uint8(block[5]);
360 pixels[6] = av_clip_uint8(block[6]);
361 pixels[7] = av_clip_uint8(block[7]);
368 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
373 /* read the pixels */
375 pixels[0] = av_clip_uint8(block[0]);
376 pixels[1] = av_clip_uint8(block[1]);
377 pixels[2] = av_clip_uint8(block[2]);
378 pixels[3] = av_clip_uint8(block[3]);
385 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
390 /* read the pixels */
392 pixels[0] = av_clip_uint8(block[0]);
393 pixels[1] = av_clip_uint8(block[1]);
400 static void put_signed_pixels_clamped_c(const int16_t *block,
401 uint8_t *av_restrict pixels,
406 for (i = 0; i < 8; i++) {
407 for (j = 0; j < 8; j++) {
410 else if (*block > 127)
413 *pixels = (uint8_t) (*block + 128);
417 pixels += (line_size - 8);
421 static void add_pixels8_c(uint8_t *av_restrict pixels, int16_t *block,
426 for (i = 0; i < 8; i++) {
427 pixels[0] += block[0];
428 pixels[1] += block[1];
429 pixels[2] += block[2];
430 pixels[3] += block[3];
431 pixels[4] += block[4];
432 pixels[5] += block[5];
433 pixels[6] += block[6];
434 pixels[7] += block[7];
440 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
445 /* read the pixels */
446 for (i = 0; i < 8; i++) {
447 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
448 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
449 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
450 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
451 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
452 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
453 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
454 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
460 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
465 /* read the pixels */
467 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
468 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
469 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
470 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
476 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
481 /* read the pixels */
483 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
484 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
/**
 * Sum of absolute values of all 64 coefficients of a DCT block.
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    int sum = 0, i;

    for (i = 0; i < 64; i++)
        sum += FFABS(block[i]);
    return sum;
}
/**
 * Fill a 16-pixel-wide block of height h with a constant byte value.
 *
 * @param block     top-left of the destination block
 * @param value     byte to store
 * @param line_size destination row stride
 * @param h         number of rows
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/**
 * Fill an 8-pixel-wide block of height h with a constant byte value.
 * See fill_block16_c for parameter semantics.
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* Rounded averages used by the pel interpolation code below.
 * NOTE: arguments are deliberately unparenthesized here (historical style);
 * callers pass simple lvalues/array accesses only. */
#define avg2(a, b)       ((a + b + 1) >> 1)
#define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)
/**
 * 1-warp-point global motion compensation: bilinear interpolation of an
 * 8-pixel-wide block with 1/16-pel fractional offsets.
 *
 * @param x16, y16 fractional position in 1/16 pel units (0..16)
 * @param rounder  rounding constant added before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (x16)      * (16 - y16);
    const int C = (16 - x16) * (y16);
    const int D = (x16)      * (y16);
    int i;

    for (i = 0; i < h; i++) {
        dst[0] = (A * src[0] + B * src[1] + C * src[stride + 0] + D * src[stride + 1] + rounder) >> 8;
        dst[1] = (A * src[1] + B * src[2] + C * src[stride + 1] + D * src[stride + 2] + rounder) >> 8;
        dst[2] = (A * src[2] + B * src[3] + C * src[stride + 2] + D * src[stride + 3] + rounder) >> 8;
        dst[3] = (A * src[3] + B * src[4] + C * src[stride + 3] + D * src[stride + 4] + rounder) >> 8;
        dst[4] = (A * src[4] + B * src[5] + C * src[stride + 4] + D * src[stride + 5] + rounder) >> 8;
        dst[5] = (A * src[5] + B * src[6] + C * src[stride + 5] + D * src[stride + 6] + rounder) >> 8;
        dst[6] = (A * src[6] + B * src[7] + C * src[stride + 6] + D * src[stride + 7] + rounder) >> 8;
        dst[7] = (A * src[7] + B * src[8] + C * src[stride + 7] + D * src[stride + 8] + rounder) >> 8;
        dst   += stride;
        src   += stride;
    }
}
/**
 * Affine global motion compensation for one 8-pixel-wide block.
 * The source position advances per pixel by (dxx, dyx) and per row by
 * (dxy, dyy) in 1/2^16 units; each sample is bilinearly interpolated with
 * 1/2^shift fractional precision and clamped to the picture edges.
 *
 * @param ox, oy  16.16 fixed-point position of the first sample
 * @param shift   fractional precision of the interpolation
 * @param r       rounding constant added before >> (shift * 2)
 * @param width, height  picture dimensions used for edge clamping
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
{
    int y, vx, vy;
    const int s = 1 << shift;

    width--;
    height--;

    for (y = 0; y < h; y++) {
        int x;

        vx = ox;
        vy = oy;
        for (x = 0; x < 8; x++) { // FIXME: optimize
            int index;
            int src_x  = vx >> 16;
            int src_y  = vy >> 16;
            int frac_x = src_x & (s - 1);
            int frac_y = src_y & (s - 1);
            src_x    >>= shift;
            src_y    >>= shift;

            if ((unsigned) src_x < width) {
                if ((unsigned) src_y < height) {
                    /* fully inside: bilinear in both directions */
                    index = src_x + src_y * stride;
                    dst[y * stride + x] =
                        ((src[index]                  * (s - frac_x) +
                          src[index + 1]              * frac_x)       * (s - frac_y) +
                         (src[index + stride]         * (s - frac_x) +
                          src[index + stride + 1]     * frac_x)       * frac_y +
                         r) >> (shift * 2);
                } else {
                    /* vertically outside: clamp row, interpolate horizontally */
                    index = src_x + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] =
                        ((src[index]     * (s - frac_x) +
                          src[index + 1] * frac_x) * s +
                         r) >> (shift * 2);
                }
            } else {
                if ((unsigned) src_y < height) {
                    /* horizontally outside: clamp column, interpolate vertically */
                    index = av_clip(src_x, 0, width) + src_y * stride;
                    dst[y * stride + x] =
                        ((src[index]          * (s - frac_y) +
                          src[index + stride] * frac_y) * s +
                         r) >> (shift * 2);
                } else {
                    /* outside both: nearest clamped sample */
                    index = av_clip(src_x, 0, width) +
                            av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = src[index];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
608 #define QPEL_MC(r, OPNAME, RND, OP) \
609 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, \
610 int dstStride, int srcStride, \
613 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
616 for (i = 0; i < h; i++) { \
617 OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
618 OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
619 OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
620 OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
621 OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
622 OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
623 OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
624 OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
630 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, \
631 int dstStride, int srcStride) \
633 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
637 for (i = 0; i < w; i++) { \
638 const int src0 = src[0 * srcStride]; \
639 const int src1 = src[1 * srcStride]; \
640 const int src2 = src[2 * srcStride]; \
641 const int src3 = src[3 * srcStride]; \
642 const int src4 = src[4 * srcStride]; \
643 const int src5 = src[5 * srcStride]; \
644 const int src6 = src[6 * srcStride]; \
645 const int src7 = src[7 * srcStride]; \
646 const int src8 = src[8 * srcStride]; \
647 OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
648 OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
649 OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
650 OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
651 OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
652 OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
653 OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
654 OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
660 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, \
661 int dstStride, int srcStride, \
664 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
667 for (i = 0; i < h; i++) { \
668 OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
669 OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
670 OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
671 OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
672 OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
673 OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[9])); \
674 OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[9]) * 3 - (src[3] + src[10])); \
675 OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[9]) * 6 + (src[5] + src[10]) * 3 - (src[4] + src[11])); \
676 OP(dst[8], (src[8] + src[9]) * 20 - (src[7] + src[10]) * 6 + (src[6] + src[11]) * 3 - (src[5] + src[12])); \
677 OP(dst[9], (src[9] + src[10]) * 20 - (src[8] + src[11]) * 6 + (src[7] + src[12]) * 3 - (src[6] + src[13])); \
678 OP(dst[10], (src[10] + src[11]) * 20 - (src[9] + src[12]) * 6 + (src[8] + src[13]) * 3 - (src[7] + src[14])); \
679 OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9] + src[14]) * 3 - (src[8] + src[15])); \
680 OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9] + src[16])); \
681 OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
682 OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
683 OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
689 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, \
690 int dstStride, int srcStride) \
692 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
696 for (i = 0; i < w; i++) { \
697 const int src0 = src[0 * srcStride]; \
698 const int src1 = src[1 * srcStride]; \
699 const int src2 = src[2 * srcStride]; \
700 const int src3 = src[3 * srcStride]; \
701 const int src4 = src[4 * srcStride]; \
702 const int src5 = src[5 * srcStride]; \
703 const int src6 = src[6 * srcStride]; \
704 const int src7 = src[7 * srcStride]; \
705 const int src8 = src[8 * srcStride]; \
706 const int src9 = src[9 * srcStride]; \
707 const int src10 = src[10 * srcStride]; \
708 const int src11 = src[11 * srcStride]; \
709 const int src12 = src[12 * srcStride]; \
710 const int src13 = src[13 * srcStride]; \
711 const int src14 = src[14 * srcStride]; \
712 const int src15 = src[15 * srcStride]; \
713 const int src16 = src[16 * srcStride]; \
714 OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
715 OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
716 OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
717 OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
718 OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
719 OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src9)); \
720 OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src9) * 3 - (src3 + src10)); \
721 OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src9) * 6 + (src5 + src10) * 3 - (src4 + src11)); \
722 OP(dst[8 * dstStride], (src8 + src9) * 20 - (src7 + src10) * 6 + (src6 + src11) * 3 - (src5 + src12)); \
723 OP(dst[9 * dstStride], (src9 + src10) * 20 - (src8 + src11) * 6 + (src7 + src12) * 3 - (src6 + src13)); \
724 OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9 + src12) * 6 + (src8 + src13) * 3 - (src7 + src14)); \
725 OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9 + src14) * 3 - (src8 + src15)); \
726 OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9 + src16)); \
727 OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
728 OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
729 OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
735 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, \
740 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
741 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8); \
744 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, \
747 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8); \
750 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, \
755 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
756 OPNAME ## pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8); \
759 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, \
762 uint8_t full[16 * 9]; \
765 copy_block9(full, src, 16, stride, 9); \
766 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
767 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8); \
770 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, \
773 uint8_t full[16 * 9]; \
775 copy_block9(full, src, 16, stride, 9); \
776 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16); \
779 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, \
782 uint8_t full[16 * 9]; \
785 copy_block9(full, src, 16, stride, 9); \
786 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
787 OPNAME ## pixels8_l2_8(dst, full + 16, half, stride, 16, 8, 8); \
790 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, \
793 uint8_t full[16 * 9]; \
796 uint8_t halfHV[64]; \
798 copy_block9(full, src, 16, stride, 9); \
799 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
800 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
801 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
802 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, \
803 stride, 16, 8, 8, 8, 8); \
806 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, \
809 uint8_t full[16 * 9]; \
811 uint8_t halfHV[64]; \
813 copy_block9(full, src, 16, stride, 9); \
814 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
815 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
816 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
817 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
820 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, \
823 uint8_t full[16 * 9]; \
826 uint8_t halfHV[64]; \
828 copy_block9(full, src, 16, stride, 9); \
829 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
830 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
831 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
832 OPNAME ## pixels8_l4_8(dst, full + 1, halfH, halfV, halfHV, \
833 stride, 16, 8, 8, 8, 8); \
836 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, \
839 uint8_t full[16 * 9]; \
841 uint8_t halfHV[64]; \
843 copy_block9(full, src, 16, stride, 9); \
844 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
845 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
846 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
847 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
850 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, \
853 uint8_t full[16 * 9]; \
856 uint8_t halfHV[64]; \
858 copy_block9(full, src, 16, stride, 9); \
859 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
860 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
861 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
862 OPNAME ## pixels8_l4_8(dst, full + 16, halfH + 8, halfV, halfHV, \
863 stride, 16, 8, 8, 8, 8); \
866 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, \
869 uint8_t full[16 * 9]; \
871 uint8_t halfHV[64]; \
873 copy_block9(full, src, 16, stride, 9); \
874 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
875 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
876 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
877 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
880 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, \
883 uint8_t full[16 * 9]; \
886 uint8_t halfHV[64]; \
888 copy_block9(full, src, 16, stride, 9); \
889 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
890 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
891 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
892 OPNAME ## pixels8_l4_8(dst, full + 17, halfH + 8, halfV, halfHV, \
893 stride, 16, 8, 8, 8, 8); \
896 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, \
899 uint8_t full[16 * 9]; \
901 uint8_t halfHV[64]; \
903 copy_block9(full, src, 16, stride, 9); \
904 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
905 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
906 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
907 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
910 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, \
914 uint8_t halfHV[64]; \
916 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
917 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
918 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
921 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, \
925 uint8_t halfHV[64]; \
927 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
928 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
929 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
932 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, \
935 uint8_t full[16 * 9]; \
938 uint8_t halfHV[64]; \
940 copy_block9(full, src, 16, stride, 9); \
941 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
942 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
943 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
944 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); \
947 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, \
950 uint8_t full[16 * 9]; \
953 copy_block9(full, src, 16, stride, 9); \
954 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
955 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
956 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
959 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, \
962 uint8_t full[16 * 9]; \
965 uint8_t halfHV[64]; \
967 copy_block9(full, src, 16, stride, 9); \
968 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
969 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
970 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
971 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); \
974 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, \
977 uint8_t full[16 * 9]; \
980 copy_block9(full, src, 16, stride, 9); \
981 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
982 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
983 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
986 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, \
991 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
992 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
995 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, \
1000 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
1001 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16); \
1004 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, \
1007 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16); \
1010 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, \
1013 uint8_t half[256]; \
1015 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
1016 OPNAME ## pixels16_l2_8(dst, src + 1, half, stride, stride, 16, 16); \
1019 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, \
1022 uint8_t full[24 * 17]; \
1023 uint8_t half[256]; \
1025 copy_block17(full, src, 24, stride, 17); \
1026 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
1027 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16); \
1030 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, \
1033 uint8_t full[24 * 17]; \
1035 copy_block17(full, src, 24, stride, 17); \
1036 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24); \
1039 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, \
1042 uint8_t full[24 * 17]; \
1043 uint8_t half[256]; \
1045 copy_block17(full, src, 24, stride, 17); \
1046 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
1047 OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16); \
1050 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, \
1053 uint8_t full[24 * 17]; \
1054 uint8_t halfH[272]; \
1055 uint8_t halfV[256]; \
1056 uint8_t halfHV[256]; \
1058 copy_block17(full, src, 24, stride, 17); \
1059 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1060 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1061 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1062 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, \
1063 stride, 24, 16, 16, 16, 16); \
1066 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, \
1069 uint8_t full[24 * 17]; \
1070 uint8_t halfH[272]; \
1071 uint8_t halfHV[256]; \
1073 copy_block17(full, src, 24, stride, 17); \
1074 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1075 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1076 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1077 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1080 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, \
1083 uint8_t full[24 * 17]; \
1084 uint8_t halfH[272]; \
1085 uint8_t halfV[256]; \
1086 uint8_t halfHV[256]; \
1088 copy_block17(full, src, 24, stride, 17); \
1089 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1090 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1091 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1092 OPNAME ## pixels16_l4_8(dst, full + 1, halfH, halfV, halfHV, \
1093 stride, 24, 16, 16, 16, 16); \
1096 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, \
1099 uint8_t full[24 * 17]; \
1100 uint8_t halfH[272]; \
1101 uint8_t halfHV[256]; \
1103 copy_block17(full, src, 24, stride, 17); \
1104 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1105 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1106 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1107 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1110 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, \
1113 uint8_t full[24 * 17]; \
1114 uint8_t halfH[272]; \
1115 uint8_t halfV[256]; \
1116 uint8_t halfHV[256]; \
1118 copy_block17(full, src, 24, stride, 17); \
1119 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1120 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1121 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1122 OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV, \
1123 stride, 24, 16, 16, 16, 16); \
1126 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, \
1129 uint8_t full[24 * 17]; \
1130 uint8_t halfH[272]; \
1131 uint8_t halfHV[256]; \
1133 copy_block17(full, src, 24, stride, 17); \
1134 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1135 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1136 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1137 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1140 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, \
1143 uint8_t full[24 * 17]; \
1144 uint8_t halfH[272]; \
1145 uint8_t halfV[256]; \
1146 uint8_t halfHV[256]; \
1148 copy_block17(full, src, 24, stride, 17); \
1149 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1150 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1151 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1152 OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV, \
1153 stride, 24, 16, 16, 16, 16); \
1156 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, \
1159 uint8_t full[24 * 17]; \
1160 uint8_t halfH[272]; \
1161 uint8_t halfHV[256]; \
1163 copy_block17(full, src, 24, stride, 17); \
1164 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1165 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1166 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1167 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1170 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, \
1173 uint8_t halfH[272]; \
1174 uint8_t halfHV[256]; \
1176 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1177 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1178 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1181 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, \
1184 uint8_t halfH[272]; \
1185 uint8_t halfHV[256]; \
1187 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1188 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1189 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1192 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, \
1195 uint8_t full[24 * 17]; \
1196 uint8_t halfH[272]; \
1197 uint8_t halfV[256]; \
1198 uint8_t halfHV[256]; \
1200 copy_block17(full, src, 24, stride, 17); \
1201 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1202 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1203 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1204 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); \
1207 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, \
1210 uint8_t full[24 * 17]; \
1211 uint8_t halfH[272]; \
1213 copy_block17(full, src, 24, stride, 17); \
1214 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1215 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1216 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
1219 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, \
1222 uint8_t full[24 * 17]; \
1223 uint8_t halfH[272]; \
1224 uint8_t halfV[256]; \
1225 uint8_t halfHV[256]; \
1227 copy_block17(full, src, 24, stride, 17); \
1228 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1229 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1230 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1231 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); \
1234 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, \
1237 uint8_t full[24 * 17]; \
1238 uint8_t halfH[272]; \
1240 copy_block17(full, src, 24, stride, 17); \
1241 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1242 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1243 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
1246 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, \
1249 uint8_t halfH[272]; \
1251 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1252 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
1255 #define op_avg(a, b) a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
1256 #define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5]) >> 1)
1257 #define op_put(a, b) a = cm[((b) + 16) >> 5]
1258 #define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]
1260 QPEL_MC(0, put_, _, op_put)
1261 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1262 QPEL_MC(0, avg_, _, op_avg)
1266 #undef op_put_no_rnd
/**
 * Copy an 8x8 block from src to dst.
 * Named wrapper around put_pixels8_8_c with the height fixed to 8 so it
 * can be stored in fixed-block-size function pointer tables.
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}
/**
 * Average an 8x8 block of src into dst (dst = rounded avg(dst, src)).
 * Named wrapper around avg_pixels8_8_c with the height fixed to 8.
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}
/**
 * Copy a 16x16 block from src to dst.
 * Named wrapper around put_pixels16_8_c with the height fixed to 16.
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}
/**
 * Average a 16x16 block of src into dst (dst = rounded avg(dst, src)).
 * Named wrapper around avg_pixels16_8_c with the height fixed to 16.
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
1288 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1289 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1290 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1291 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1292 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1293 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/**
 * WMV2 horizontal "mspel" lowpass filter.
 * Each output pixel is the 4-tap filter (-1, 9, 9, -1)/16 of its horizontal
 * neighbours, with +8 rounding, clipped to 0..255 via the ff_crop_tab LUT.
 * Reads src[-1] .. src[8] on every row, so one pixel of margin is required
 * on each side of the 8-wide block.
 *
 * @param dst       destination (8 pixels wide)
 * @param src       source
 * @param dstStride destination line size
 * @param srcStride source line size
 * @param h         number of rows to filter
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
                                  int dstStride, int srcStride, int h)
{
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int i;

    for (i = 0; i < h; i++) {
        dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
        dst[1] = cm[(9 * (src[1] + src[2]) - (src[0] + src[3]) + 8) >> 4];
        dst[2] = cm[(9 * (src[2] + src[3]) - (src[1] + src[4]) + 8) >> 4];
        dst[3] = cm[(9 * (src[3] + src[4]) - (src[2] + src[5]) + 8) >> 4];
        dst[4] = cm[(9 * (src[4] + src[5]) - (src[3] + src[6]) + 8) >> 4];
        dst[5] = cm[(9 * (src[5] + src[6]) - (src[4] + src[7]) + 8) >> 4];
        dst[6] = cm[(9 * (src[6] + src[7]) - (src[5] + src[8]) + 8) >> 4];
        dst[7] = cm[(9 * (src[7] + src[8]) - (src[6] + src[9]) + 8) >> 4];
        dst   += dstStride;
        src   += srcStride;
    }
}
#if CONFIG_RV40_DECODER
/* RV40 (3,3) quarter-pel position: handled with the generic half-pel xy2
 * averagers; these are named wrappers with the block size baked in so the
 * RV40 decoder can store them in its MC function tables. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1337 #if CONFIG_DIRAC_DECODER
1338 #define DIRAC_MC(OPNAME)\
1339 void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1341 OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
1343 void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1345 OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
1347 void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1349 OPNAME ## _pixels16_8_c(dst , src[0] , stride, h);\
1350 OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
1352 void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1354 OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1356 void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1358 OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1360 void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1362 OPNAME ## _pixels16_l2_8(dst , src[0] , src[1] , stride, stride, stride, h);\
1363 OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
1365 void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1367 OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1369 void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1371 OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1373 void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1375 OPNAME ## _pixels16_l4_8(dst , src[0] , src[1] , src[2] , src[3] , stride, stride, stride, stride, stride, h);\
1376 OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
/**
 * WMV2 vertical "mspel" lowpass filter (column-wise counterpart of
 * wmv2_mspel8_h_lowpass): 4-tap (-1, 9, 9, -1)/16 with +8 rounding,
 * clipped through ff_crop_tab.
 * Reads src[-srcStride] .. src[9 * srcStride] in each column, so one row
 * above and two rows below the 8-row block must be valid.
 *
 * @param dst       destination (8 rows high)
 * @param src       source
 * @param dstStride destination line size
 * @param srcStride source line size
 * @param w         number of columns to filter
 */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
                                  int dstStride, int srcStride, int w)
{
    const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
    int i;

    for (i = 0; i < w; i++) {
        /* load one column; src_1 is the row above the block */
        const int src_1 = src[-srcStride];
        const int src0  = src[0];
        const int src1  = src[srcStride];
        const int src2  = src[2 * srcStride];
        const int src3  = src[3 * srcStride];
        const int src4  = src[4 * srcStride];
        const int src5  = src[5 * srcStride];
        const int src6  = src[6 * srcStride];
        const int src7  = src[7 * srcStride];
        const int src8  = src[8 * srcStride];
        const int src9  = src[9 * srcStride];
        dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
        dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0 + src3) + 8) >> 4];
        dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1 + src4) + 8) >> 4];
        dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2 + src5) + 8) >> 4];
        dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3 + src6) + 8) >> 4];
        dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4 + src7) + 8) >> 4];
        dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5 + src8) + 8) >> 4];
        dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6 + src9) + 8) >> 4];
        src++;
        dst++;
    }
}
/**
 * WMV2 mspel MC, horizontal sub-pel position 1: average of the source and
 * its horizontally lowpass-filtered version.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}
/**
 * WMV2 mspel MC, horizontal sub-pel position 2: plain horizontal lowpass
 * filter written straight to dst.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/**
 * WMV2 mspel MC, horizontal sub-pel position 3: average of the source
 * shifted one pixel right and its lowpass-filtered version.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8);
}
/**
 * WMV2 mspel MC, vertical sub-pel position 2: plain vertical lowpass
 * filter written straight to dst.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/**
 * WMV2 mspel MC, combined horizontal/vertical sub-pel position (1,2):
 * average of a vertically filtered plane and a horizontally-then-vertically
 * filtered plane.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];   /* 8 x 11: extra rows needed by the vertical pass */
    uint8_t halfV[64];
    uint8_t halfHV[64];

    /* start one row above the block so the vertical filter has margin */
    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/**
 * WMV2 mspel MC, combined sub-pel position (3,2): like mc12 but the
 * vertical-only plane is taken one pixel to the right.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];   /* 8 x 11: extra rows needed by the vertical pass */
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src + 1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/**
 * WMV2 mspel MC, combined sub-pel position (2,2): horizontal lowpass
 * followed by vertical lowpass.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];   /* 8 x 11: extra rows needed by the vertical pass */

    wmv2_mspel8_h_lowpass(halfH, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH + 8, stride, 8, 8);
}
1471 static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1472 int line_size, int h)
1476 for (i = 0; i < h; i++) {
1477 s += abs(pix1[0] - pix2[0]);
1478 s += abs(pix1[1] - pix2[1]);
1479 s += abs(pix1[2] - pix2[2]);
1480 s += abs(pix1[3] - pix2[3]);
1481 s += abs(pix1[4] - pix2[4]);
1482 s += abs(pix1[5] - pix2[5]);
1483 s += abs(pix1[6] - pix2[6]);
1484 s += abs(pix1[7] - pix2[7]);
1485 s += abs(pix1[8] - pix2[8]);
1486 s += abs(pix1[9] - pix2[9]);
1487 s += abs(pix1[10] - pix2[10]);
1488 s += abs(pix1[11] - pix2[11]);
1489 s += abs(pix1[12] - pix2[12]);
1490 s += abs(pix1[13] - pix2[13]);
1491 s += abs(pix1[14] - pix2[14]);
1492 s += abs(pix1[15] - pix2[15]);
1499 static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1500 int line_size, int h)
1504 for (i = 0; i < h; i++) {
1505 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1506 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1507 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1508 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1509 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1510 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1511 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1512 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1513 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1514 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1515 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1516 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1517 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1518 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1519 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1520 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1527 static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1528 int line_size, int h)
1531 uint8_t *pix3 = pix2 + line_size;
1533 for (i = 0; i < h; i++) {
1534 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1535 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1536 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1537 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1538 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1539 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1540 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1541 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1542 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1543 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1544 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1545 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1546 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1547 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1548 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1549 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1557 static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1558 int line_size, int h)
1561 uint8_t *pix3 = pix2 + line_size;
1563 for (i = 0; i < h; i++) {
1564 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1565 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1566 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1567 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1568 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1569 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1570 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1571 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1572 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1573 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1574 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1575 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1576 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1577 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1578 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1579 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1587 static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1588 int line_size, int h)
1592 for (i = 0; i < h; i++) {
1593 s += abs(pix1[0] - pix2[0]);
1594 s += abs(pix1[1] - pix2[1]);
1595 s += abs(pix1[2] - pix2[2]);
1596 s += abs(pix1[3] - pix2[3]);
1597 s += abs(pix1[4] - pix2[4]);
1598 s += abs(pix1[5] - pix2[5]);
1599 s += abs(pix1[6] - pix2[6]);
1600 s += abs(pix1[7] - pix2[7]);
1607 static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1608 int line_size, int h)
1612 for (i = 0; i < h; i++) {
1613 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1614 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1615 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1616 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1617 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1618 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1619 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1620 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1627 static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1628 int line_size, int h)
1631 uint8_t *pix3 = pix2 + line_size;
1633 for (i = 0; i < h; i++) {
1634 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1635 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1636 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1637 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1638 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1639 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1640 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1641 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1649 static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
1650 int line_size, int h)
1653 uint8_t *pix3 = pix2 + line_size;
1655 for (i = 0; i < h; i++) {
1656 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1657 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1658 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1659 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1660 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1661 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1662 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1663 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1671 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1673 int score1 = 0, score2 = 0, x, y;
1675 for (y = 0; y < h; y++) {
1676 for (x = 0; x < 16; x++)
1677 score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1679 for (x = 0; x < 15; x++)
1680 score2 += FFABS(s1[x] - s1[x + stride] -
1681 s1[x + 1] + s1[x + stride + 1]) -
1682 FFABS(s2[x] - s2[x + stride] -
1683 s2[x + 1] + s2[x + stride + 1]);
1690 return score1 + FFABS(score2) * c->avctx->nsse_weight;
1692 return score1 + FFABS(score2) * 8;
1695 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
1697 int score1 = 0, score2 = 0, x, y;
1699 for (y = 0; y < h; y++) {
1700 for (x = 0; x < 8; x++)
1701 score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
1703 for (x = 0; x < 7; x++)
1704 score2 += FFABS(s1[x] - s1[x + stride] -
1705 s1[x + 1] + s1[x + stride + 1]) -
1706 FFABS(s2[x] - s2[x + stride] -
1707 s2[x + 1] + s2[x + stride + 1]);
1714 return score1 + FFABS(score2) * c->avctx->nsse_weight;
1716 return score1 + FFABS(score2) * 8;
/**
 * Estimate the weighted squared error of the residual @p rem after adding
 * the basis function @p basis scaled by @p scale (used by the trellis /
 * rate-distortion coefficient optimization).
 * The basis term is rounded down from BASIS_SHIFT to RECON_SHIFT precision
 * before being combined with the residual.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
                          int16_t basis[64], int scale)
{
    int i;
    unsigned int sum = 0;

    for (i = 0; i < 8 * 8; i++) {
        /* candidate residual = rem + rounded scaled basis coefficient */
        int b = rem[i] + ((basis[i] * scale +
                           (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
                          (BASIS_SHIFT - RECON_SHIFT));
        int w = weight[i];
        b >>= RECON_SHIFT;
        av_assert2(-512 < b && b < 512);

        /* per-coefficient weighted squared error */
        sum += (w * b) * (w * b) >> 4;
    }
    return sum >> 2;
}
/**
 * Add the basis function @p basis scaled by @p scale into the residual
 * @p rem, with the same BASIS_SHIFT -> RECON_SHIFT rounding as
 * try_8x8basis_c (the "commit" counterpart of that probe function).
 */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
{
    int i;

    for (i = 0; i < 8 * 8; i++)
        rem[i] += (basis[i] * scale +
                   (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
                  (BASIS_SHIFT - RECON_SHIFT);
}
/**
 * Dummy comparison function that always returns 0, so every candidate
 * scores the same (presumably selected by the FF_CMP_ZERO compare type —
 * confirm against ff_set_cmp).
 */
static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
                    int stride, int h)
{
    return 0;
}
/**
 * Fill the 6-entry comparison-function table @p cmp with the functions
 * selected by @p type (an FF_CMP_* value; only the low 8 bits are
 * examined, higher bits may carry flags).
 * Unknown types leave the table zeroed and log an error.
 */
void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
{
    int i;

    memset(cmp, 0, sizeof(void *) * 6);

    for (i = 0; i < 6; i++) {
        switch (type & 0xFF) {
        case FF_CMP_SAD:
            cmp[i] = c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i] = c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i] = c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i] = c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i] = c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i] = c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i] = c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i] = c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i] = c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i] = c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i] = c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i] = zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i] = c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i] = c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i] = c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,
                   "internal error in cmp function selection\n");
        }
    }
}
/**
 * Bytewise dst[i] += src[i] (mod 256) over @p w bytes.
 * The main loop adds sizeof(long) bytes at a time using a SWAR trick:
 * the low 7 bits of every byte are added carry-free via the pb_7f mask,
 * and the top bit of each byte is fixed up with XOR so carries never
 * propagate across byte boundaries. The tail loop handles the remainder.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
{
    long i;

    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
        long a = *(long *) (src + i);
        long b = *(long *) (dst + i);
        *(long *) (dst + i) = ((a & pb_7f) + (b & pb_7f)) ^ ((a ^ b) & pb_80);
    }
    for (; i < w; i++)
        dst[i + 0] += src[i + 0];
}
/**
 * Bytewise dst[i] = src1[i] - src2[i] (mod 256) over @p w bytes.
 * Uses the same per-byte SWAR technique as add_bytes_c (subtract low
 * 7 bits borrow-free, patch the sign bits with XOR), one long at a time.
 * On targets without fast unaligned access, an unaligned src2 falls back
 * to a simple unrolled byte loop.
 */
static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w)
{
    long i;

#if !HAVE_FAST_UNALIGNED
    if ((long) src2 & (sizeof(long) - 1)) {
        for (i = 0; i + 7 < w; i += 8) {
            dst[i + 0] = src1[i + 0] - src2[i + 0];
            dst[i + 1] = src1[i + 1] - src2[i + 1];
            dst[i + 2] = src1[i + 2] - src2[i + 2];
            dst[i + 3] = src1[i + 3] - src2[i + 3];
            dst[i + 4] = src1[i + 4] - src2[i + 4];
            dst[i + 5] = src1[i + 5] - src2[i + 5];
            dst[i + 6] = src1[i + 6] - src2[i + 6];
            dst[i + 7] = src1[i + 7] - src2[i + 7];
        }
    } else
#endif
    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
        long a = *(long *) (src1 + i);
        long b = *(long *) (src2 + i);
        *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
                              ((a ^ b ^ pb_80) & pb_80);
    }
    for (; i < w; i++)
        dst[i + 0] = src1[i + 0] - src2[i + 0];
}
/**
 * HuffYUV median prediction, decode direction: reconstruct a row by adding
 * the decoded differences @p diff to the median predictor
 * mid_pred(left, top, left + top - topleft), where @p src1 is the row
 * above. The running left/left-top values are carried across calls via
 * @p left and @p left_top.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *diff, int w,
                                         int *left, int *left_top)
{
    int i;
    uint8_t l, lt;

    l  = *left;
    lt = *left_top;

    for (i = 0; i < w; i++) {
        l      = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF) + diff[i];
        lt     = src1[i];
        dst[i] = l;
    }

    *left     = l;
    *left_top = lt;
}
/**
 * HuffYUV median prediction, encode direction: emit the difference between
 * each pixel of @p src2 and the median predictor formed from the previous
 * pixel, the pixel above (@p src1) and the pixel above-left.
 * Counterpart of add_hfyu_median_prediction_c; left/left-top state is
 * carried across calls via @p left and @p left_top.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *src2, int w,
                                         int *left, int *left_top)
{
    int i;
    uint8_t l, lt;

    l  = *left;
    lt = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
        lt     = src1[i];
        l      = src2[i];
        dst[i] = l - pred;
    }

    *left     = l;
    *left_top = lt;
}
/**
 * HuffYUV left prediction, decode direction: running prefix sum of @p src
 * into @p dst, seeded with @p acc (the "left" value from the previous
 * call). The main loop is unrolled two samples per iteration; the second
 * loop mops up an odd trailing sample.
 *
 * @return the final accumulator, to be passed as @p acc for the next row
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
                                      int w, int acc)
{
    int i;

    for (i = 0; i < w - 1; i++) {
        acc   += src[i];
        dst[i] = acc;
        i++;
        acc   += src[i];
        dst[i] = acc;
    }

    for (; i < w; i++) {
        acc   += src[i];
        dst[i] = acc;
    }

    return acc;
}
/**
 * HuffYUV left prediction for packed 32-bit BGRA pixels: per-channel
 * running prefix sum over @p w pixels. The B/G/R/A byte offsets come from
 * the endian-dependent macros defined above this function; channel state
 * is carried across calls through the *red/*green/*blue/*alpha pointers.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
                                             int w, int *red, int *green,
                                             int *blue, int *alpha)
{
    int i, r = *red, g = *green, b = *blue, a = *alpha;

    for (i = 0; i < w; i++) {
        b += src[4 * i + B];
        g += src[4 * i + G];
        r += src[4 * i + R];
        a += src[4 * i + A];

        dst[4 * i + B] = b;
        dst[4 * i + G] = g;
        dst[4 * i + R] = r;
        dst[4 * i + A] = a;
    }

    *red   = r;
    *green = g;
    *blue  = b;
    *alpha = a;
}
1958 #define BUTTERFLY2(o1, o2, i1, i2) \
1962 #define BUTTERFLY1(x, y) \
1971 #define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
/**
 * 8x8 SATD: sum of absolute values of the 2-D Hadamard transform of the
 * difference src - dst. The BUTTERFLY2/BUTTERFLY1 macros perform the
 * radix-2 butterfly stages; BUTTERFLYA fuses the final stage with the
 * absolute-value accumulation.
 */
static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
                               uint8_t *src, int stride, int h)
{
    int i, temp[64], sum = 0;

    av_assert2(h == 8);

    /* horizontal pass: 8-point Hadamard transform of each difference row */
    for (i = 0; i < 8; i++) {
        // FIXME: try pointer walks
        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
                   src[stride * i + 0] - dst[stride * i + 0],
                   src[stride * i + 1] - dst[stride * i + 1]);
        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
                   src[stride * i + 2] - dst[stride * i + 2],
                   src[stride * i + 3] - dst[stride * i + 3]);
        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
                   src[stride * i + 4] - dst[stride * i + 4],
                   src[stride * i + 5] - dst[stride * i + 5]);
        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
                   src[stride * i + 6] - dst[stride * i + 6],
                   src[stride * i + 7] - dst[stride * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
    }

    /* vertical pass over the columns, summing |coefficient| at the end */
    for (i = 0; i < 8; i++) {
        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);

        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);

        sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
               BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
               BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
               BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
    }

    return sum;
}
/**
 * Intra 8x8 SATD: like hadamard8_diff8x8_c but transforms the source
 * pixels directly (no prediction) and subtracts the DC term at the end so
 * the score measures AC energy only ("-mean").
 */
static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
                                uint8_t *dummy, int stride, int h)
{
    int i, temp[64], sum = 0;

    av_assert2(h == 8);

    /* horizontal pass: 8-point Hadamard transform of each source row */
    for (i = 0; i < 8; i++) {
        // FIXME: try pointer walks
        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
                   src[stride * i + 0], src[stride * i + 1]);
        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
                   src[stride * i + 2], src[stride * i + 3]);
        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
                   src[stride * i + 4], src[stride * i + 5]);
        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
                   src[stride * i + 6], src[stride * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
    }

    /* vertical pass over the columns, summing |coefficient| at the end */
    for (i = 0; i < 8; i++) {
        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);

        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);

        sum +=
            BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
            + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
            + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
            + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
    }

    sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean

    return sum;
}
/**
 * 8x8 DCT-domain SAD: forward-DCT the pixel difference of the two blocks
 * and return the sum of absolute transform coefficients.
 */
static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                        uint8_t *src2, int stride, int h)
{
    LOCAL_ALIGNED_16(int16_t, temp, [64]);

    av_assert2(h == 8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
2092 const int s07 = SRC(0) + SRC(7); \
2093 const int s16 = SRC(1) + SRC(6); \
2094 const int s25 = SRC(2) + SRC(5); \
2095 const int s34 = SRC(3) + SRC(4); \
2096 const int a0 = s07 + s34; \
2097 const int a1 = s16 + s25; \
2098 const int a2 = s07 - s34; \
2099 const int a3 = s16 - s25; \
2100 const int d07 = SRC(0) - SRC(7); \
2101 const int d16 = SRC(1) - SRC(6); \
2102 const int d25 = SRC(2) - SRC(5); \
2103 const int d34 = SRC(3) - SRC(4); \
2104 const int a4 = d16 + d25 + (d07 + (d07 >> 1)); \
2105 const int a5 = d07 - d34 - (d25 + (d25 >> 1)); \
2106 const int a6 = d07 + d34 - (d16 + (d16 >> 1)); \
2107 const int a7 = d16 - d25 + (d34 + (d34 >> 1)); \
2109 DST(1, a4 + (a7 >> 2)); \
2110 DST(2, a2 + (a3 >> 1)); \
2111 DST(3, a5 + (a6 >> 2)); \
2113 DST(5, a6 - (a5 >> 2)); \
2114 DST(6, (a2 >> 1) - a3); \
2115 DST(7, (a4 >> 2) - a7); \
/**
 * 8x8 H.264-style transform SAD: apply the separable integer 8x8
 * transform (DCT8_1D) to the pixel difference — first over rows, then over
 * columns — and sum the absolute values of the resulting coefficients.
 * SRC/DST are redefined around each pass so DCT8_1D reads rows first and
 * columns second, accumulating |v| directly in the second pass.
 */
static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    int16_t dct[8][8];
    int i, sum = 0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x, v) dct[i][x] = v
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x, v) sum += FFABS(v)
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
/**
 * 8x8 DCT-domain maximum: forward-DCT the pixel difference and return the
 * largest absolute transform coefficient (peak error measure).
 */
static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
                        uint8_t *src2, int stride, int h)
{
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    int sum = 0, i;

    av_assert2(h == 8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for (i = 0; i < 64; i++)
        sum = FFMAX(sum, FFABS(temp[i]));

    return sum;
}
/**
 * Quantization-noise measure: DCT-quantize-dequantize-IDCT the pixel
 * difference and return the squared error between the round-tripped
 * coefficients and the originals (i.e. the distortion introduced by
 * quantization at the current qscale).
 */
static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
    int16_t *const bak = temp + 64;   /* untouched copy for the comparison */
    int sum = 0, i;

    av_assert2(h == 8);

    s->mb_intra = 0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64 * sizeof(int16_t));

    s->block_last_index[0 /* FIXME */] =
        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct_8(temp); // FIXME

    for (i = 0; i < 64; i++)
        sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);

    return sum;
}
/**
 * Rate-distortion score of an 8x8 block: quantizes the DCT of the pixel
 * difference, estimates the VLC bit cost of the quantized coefficients
 * (run/level coding along the intra scantable, with escapes), reconstructs
 * the block, and returns distortion + lambda-weighted bits.
 */
static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
                   int stride, int h)
{
    const uint8_t *scantable = s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length = s->ac_esc_length;
    uint8_t *length, *last_length;

    av_assert2(h == 8);

    /* local copies so the reconstruction does not disturb the sources */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0 /* FIXME */] =
    last                               =
        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);

    bits = 0;

    if (s->mb_intra) {
        start_i     = 1;
        length      = s->intra_ac_vlc_length;
        last_length = s->intra_ac_vlc_last_length;
        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
    } else {
        start_i     = 0;
        length      = s->inter_ac_vlc_length;
        last_length = s->inter_ac_vlc_last_length;
    }

    if (last >= start_i) {
        run = 0;
        /* bit cost of all coefficients before the last nonzero one */
        for (i = start_i; i < last; i++) {
            int j = scantable[i];
            level = temp[j];

            if (level) {
                level += 64;
                if ((level & (~127)) == 0)
                    bits += length[UNI_AC_ENC_INDEX(run, level)];
                else
                    bits += esc_length;
                run = 0;
            } else
                run++;
        }
        i = scantable[last];

        level = temp[i] + 64;

        av_assert2(level - 64);

        if ((level & (~127)) == 0) {
            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
        } else
            bits += esc_length;
    }

    /* reconstruct and measure the distortion against the local copy */
    if (last >= 0) {
        if (s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
}
/**
 * Bit-cost score of an 8x8 block: quantizes the DCT of the pixel
 * difference and returns only the estimated VLC bit count (the rate half
 * of rd8x8_c, without reconstruction or distortion).
 */
static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
                    int stride, int h)
{
    const uint8_t *scantable = s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length = s->ac_esc_length;
    uint8_t *length, *last_length;

    av_assert2(h == 8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0 /* FIXME */] =
    last                               =
        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);

    bits = 0;

    if (s->mb_intra) {
        start_i     = 1;
        length      = s->intra_ac_vlc_length;
        last_length = s->intra_ac_vlc_last_length;
        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
    } else {
        start_i     = 0;
        length      = s->inter_ac_vlc_length;
        last_length = s->inter_ac_vlc_last_length;
    }

    if (last >= start_i) {
        run = 0;
        /* bit cost of all coefficients before the last nonzero one */
        for (i = start_i; i < last; i++) {
            int j = scantable[i];
            level = temp[j];

            if (level) {
                level += 64;
                if ((level & (~127)) == 0)
                    bits += length[UNI_AC_ENC_INDEX(run, level)];
                else
                    bits += esc_length;
                run = 0;
            } else
                run++;
        }
        i = scantable[last];

        level = temp[i] + 64;

        av_assert2(level - 64);

        if ((level & (~127)) == 0)
            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
        else
            bits += esc_length;
    }

    return bits;
}
2323 #define VSAD_INTRA(size) \
2324 static int vsad_intra ## size ## _c(MpegEncContext *c, \
2325 uint8_t *s, uint8_t *dummy, \
2326 int stride, int h) \
2328 int score = 0, x, y; \
2330 for (y = 1; y < h; y++) { \
2331 for (x = 0; x < size; x += 4) { \
2332 score += FFABS(s[x] - s[x + stride]) + \
2333 FFABS(s[x + 1] - s[x + stride + 1]) + \
2334 FFABS(s[x + 2] - s[x + 2 + stride]) + \
2335 FFABS(s[x + 3] - s[x + 3 + stride]); \
2345 static int vsad16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2348 int score = 0, x, y;
2350 for (y = 1; y < h; y++) {
2351 for (x = 0; x < 16; x++)
2352 score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2360 #define SQ(a) ((a) * (a))
2361 #define VSSE_INTRA(size) \
2362 static int vsse_intra ## size ## _c(MpegEncContext *c, \
2363 uint8_t *s, uint8_t *dummy, \
2364 int stride, int h) \
2366 int score = 0, x, y; \
2368 for (y = 1; y < h; y++) { \
2369 for (x = 0; x < size; x += 4) { \
2370 score += SQ(s[x] - s[x + stride]) + \
2371 SQ(s[x + 1] - s[x + stride + 1]) + \
2372 SQ(s[x + 2] - s[x + stride + 2]) + \
2373 SQ(s[x + 3] - s[x + stride + 3]); \
2383 static int vsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,
2386 int score = 0, x, y;
2388 for (y = 1; y < h; y++) {
2389 for (x = 0; x < 16; x++)
2390 score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
/**
 * Sum of squared differences between an int8 array and an int16 array of
 * the same length.
 *
 * @param pix1 first operand (8-bit signed)
 * @param pix2 second operand (16-bit signed)
 * @param size number of elements to compare
 * @return sum over i of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int i, err = 0;

    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];
        err += d * d;
    }

    return err;
}
/**
 * Build a 16-wide compare function from an 8x8 one: sums the 8x8 result
 * over the left/right halves of the first 8 rows, and — when h == 16 —
 * over the two halves of the lower 8 rows as well.
 *
 * NOTE(review): the opening brace, score declaration, the "if (h == 16)"
 * guard and the return fell in an extraction gap and were restored from
 * canonical dsputil.c; stray number tokens removed.
 */
#define WRAPPER8_16_SQ(name8, name16)                            \
static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \
                  int stride, int h)                             \
{                                                                \
    int score = 0;                                               \
                                                                 \
    score += name8(s, dst, src, stride, 8);                      \
    score += name8(s, dst + 8, src + 8, stride, 8);              \
    if (h == 16) {                                               \
        dst += 8 * stride;                                       \
        src += 8 * stride;                                       \
        score += name8(s, dst, src, stride, 8);                  \
        score += name8(s, dst + 8, src + 8, stride, 8);          \
    }                                                            \
    return score;                                                \
}
2425 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2426 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2427 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2429 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2431 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2432 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2433 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2434 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/**
 * Clip one float, operating on its IEEE-754 bit pattern as uint32_t.
 * Precondition (established by the caller vector_clipf_c): min < 0 < max,
 * so `mini` has the sign bit set and compares greater (as unsigned) than
 * every non-negative pattern; "more negative" floats have larger unsigned
 * patterns, hence `a > mini` detects underflow past min.  Overflow past
 * max is detected by flipping the sign bit and comparing against
 * maxi ^ 0x80000000.
 *
 * NOTE(review): the `if (a > mini)` branch and the two returns fell in an
 * extraction gap and were restored from canonical dsputil.c.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if (a > mini)
        return mini;
    else if ((a ^ (1U << 31)) > maxisign)
        return maxi;
    else
        return a;
}

/**
 * Clip `len` floats into [*min, *max] for the min < 0 < max case, using
 * integer comparisons on the bit patterns.  len must be a multiple of 8
 * (loop is unrolled by 8).
 *
 * NOTE(review): the type-punning pointer casts below technically violate
 * strict aliasing; kept byte-for-byte as in the original since the whole
 * file relies on this idiom.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src,
                                         float *min, float *max, int len)
{
    int i;
    uint32_t mini = *(uint32_t *) min;
    uint32_t maxi = *(uint32_t *) max;
    uint32_t maxisign = maxi ^ (1U << 31);
    uint32_t *dsti = (uint32_t *) dst;
    const uint32_t *srci = (const uint32_t *) src;

    for (i = 0; i < len; i += 8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip `len` floats into [min, max].  len must be a multiple of 8.
 * When the range straddles zero, dispatch to the bit-pattern fast path;
 * otherwise clip with av_clipf, unrolled by 8.
 *
 * NOTE(review): the opening scaffolding and the "} else {" around the
 * scalar loop fell in an extraction gap and were restored from canonical
 * dsputil.c; stray number tokens removed.
 */
static void vector_clipf_c(float *dst, const float *src,
                           float min, float max, int len)
{
    int i;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for (i = 0; i < len; i += 8) {
            dst[i]     = av_clipf(src[i],     min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
/**
 * Dot product of two int16 vectors of `order` elements.
 * Accumulates in a plain int; callers are responsible for keeping the
 * result within range.
 *
 * NOTE(review): the second signature line, accumulator and return fell in
 * an extraction gap and were restored from canonical dsputil.c; the dense
 * "*v1++ **v2++" spelling was spaced out for readability.
 */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
                                     int order)
{
    int res = 0;

    while (order--)
        res += *v1++ * *v2++;

    return res;
}
/**
 * Fused dot product and multiply-add: returns dot(v1, v2) computed on the
 * ORIGINAL v1 values, while simultaneously updating v1[i] += mul * v3[i].
 *
 * NOTE(review): everything except the first signature line and the madd
 * statement fell in an extraction gap and was restored from canonical
 * dsputil.c — confirm against pristine source.
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul)
{
    int res = 0;

    while (order--) {
        res   += *v1 * *v2++;      /* dot uses v1 before the update below */
        *v1++ += mul * *v3++;
    }

    return res;
}
/**
 * Clip `len` int32 values into [min, max], unrolled by 8.
 * Caller contract: len is a nonzero multiple of 8 (do/while executes at
 * least once) — NOTE(review): restored structure, confirm against pristine
 * dsputil.c.
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        len -= 8;
    } while (len > 0);
}
/* IDCT wrappers using the "jref" (libjpeg reference) integer IDCT:
 * transform the coefficient block in place, then store (put) or
 * accumulate (add) the clamped result into the destination plane. */

static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}

static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/* 4x4 jref IDCT wrappers, used for lowres == 1 decoding (see
 * ff_dsputil_init): quarter-resolution transform, then clamped put/add. */

static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4(block);
    put_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4(block);
    add_pixels_clamped4_c(block, dest, line_size);
}
/* 2x2 jref IDCT wrappers, used for lowres == 2 decoding. */

static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2(block);
    put_pixels_clamped2_c(block, dest, line_size);
}

static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2(block);
    add_pixels_clamped2_c(block, dest, line_size);
}
/* 1x1 "IDCT" wrappers, used for lowres == 3 decoding: only the DC
 * coefficient survives, so the pixel is just (DC + 4) >> 3, clipped to
 * [0,255].  line_size is unused at 1x1 but kept for the idct_put/add
 * function-pointer signature. */

static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
{
    dest[0] = av_clip_uint8((block[0] + 4) >> 3);
}

static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
{
    dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4) >> 3));
}
2573 /* init static data */
2574 av_cold void ff_dsputil_static_init(void)
2578 for (i = 0; i < 512; i++)
2579 ff_square_tab[i] = (i - 256) * (i - 256);
2582 int ff_check_alignment(void)
2584 static int did_fail = 0;
2585 LOCAL_ALIGNED_16(int, aligned, [4]);
2587 if ((intptr_t)aligned & 15) {
2589 #if HAVE_MMX || HAVE_ALTIVEC
2590 av_log(NULL, AV_LOG_ERROR,
2591 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2592 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2593 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2594 "Do not report crashes to FFmpeg developers.\n");
/**
 * ff_dsputil_init(): fill a DSPContext with the portable C implementations
 * and select forward-DCT / inverse-DCT routines according to avctx
 * (dct_algo, idct_algo, bits_per_raw_sample, lowres).  The
 * architecture-specific initializers invoked near the end may override
 * individual function pointers with optimized versions; the scantable
 * permutation computed last must match the chosen IDCT.
 *
 * NOTE(review): this extraction has dropped a number of physical lines —
 * most preprocessor conditionals (only the closing "#endif CONFIG_ENCODERS"
 * survives below), several closing braces, and the function's own closing
 * brace.  The code is left byte-identical here; consult the pristine
 * dsputil.c before changing control flow.
 */
2603 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2605 const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;
/* Warn (once) if the compiler miscompiled stack alignment. */
2607 ff_check_alignment();
/* Forward DCT selection — encoder only, per the "#endif CONFIG_ENCODERS"
 * closing this section below.  10-bit streams force the accurate islow
 * fDCT; otherwise dct_algo picks fast-int, FAAN, or the accurate default. */
2610 if (avctx->bits_per_raw_sample == 10) {
2611 c->fdct = ff_jpeg_fdct_islow_10;
2612 c->fdct248 = ff_fdct248_islow_10;
2614 if (avctx->dct_algo == FF_DCT_FASTINT) {
2615 c->fdct = ff_fdct_ifast;
2616 c->fdct248 = ff_fdct_ifast248;
2617 } else if (avctx->dct_algo == FF_DCT_FAAN) {
2618 c->fdct = ff_faandct;
2619 c->fdct248 = ff_faandct248;
2621 c->fdct = ff_jpeg_fdct_islow_8; // slow/accurate/default
2622 c->fdct248 = ff_fdct248_islow_8;
2625 #endif /* CONFIG_ENCODERS */
/* Low-resolution decoding: lowres 1/2/3 use the 4x4 / 2x2 / 1x1 jref
 * IDCT wrappers defined above, all with identity scan permutation. */
2627 if (avctx->lowres==1) {
2628 c->idct_put = ff_jref_idct4_put;
2629 c->idct_add = ff_jref_idct4_add;
2630 c->idct = ff_j_rev_dct4;
2631 c->idct_permutation_type = FF_NO_IDCT_PERM;
2632 } else if (avctx->lowres==2) {
2633 c->idct_put = ff_jref_idct2_put;
2634 c->idct_add = ff_jref_idct2_add;
2635 c->idct = ff_j_rev_dct2;
2636 c->idct_permutation_type = FF_NO_IDCT_PERM;
2637 } else if (avctx->lowres==3) {
2638 c->idct_put = ff_jref_idct1_put;
2639 c->idct_add = ff_jref_idct1_add;
2640 c->idct = ff_j_rev_dct1;
2641 c->idct_permutation_type = FF_NO_IDCT_PERM;
/* Full-resolution IDCT: bit depths > 8 get the simple IDCT of matching
 * depth; 8-bit content honors idct_algo (libjpeg-style int, FAAN, or the
 * accurate simple IDCT as default). */
2643 if (avctx->bits_per_raw_sample == 10) {
2644 c->idct_put = ff_simple_idct_put_10;
2645 c->idct_add = ff_simple_idct_add_10;
2646 c->idct = ff_simple_idct_10;
2647 c->idct_permutation_type = FF_NO_IDCT_PERM;
2648 } else if (avctx->bits_per_raw_sample == 12) {
2649 c->idct_put = ff_simple_idct_put_12;
2650 c->idct_add = ff_simple_idct_add_12;
2651 c->idct = ff_simple_idct_12;
2652 c->idct_permutation_type = FF_NO_IDCT_PERM;
2654 if (avctx->idct_algo == FF_IDCT_INT) {
2655 c->idct_put = jref_idct_put;
2656 c->idct_add = jref_idct_add;
2657 c->idct = ff_j_rev_dct;
/* jref IDCT expects libmpeg2-ordered coefficients, hence the
 * non-identity permutation type here. */
2658 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2659 } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2660 c->idct_put = ff_faanidct_put;
2661 c->idct_add = ff_faanidct_add;
2662 c->idct = ff_faanidct;
2663 c->idct_permutation_type = FF_NO_IDCT_PERM;
2664 } else { // accurate/default
2665 c->idct_put = ff_simple_idct_put_8;
2666 c->idct_add = ff_simple_idct_add_8;
2667 c->idct = ff_simple_idct_8;
2668 c->idct_permutation_type = FF_NO_IDCT_PERM;
/* Plain pixel-block helpers (C reference implementations). */
2673 c->diff_pixels = diff_pixels_c;
2675 c->put_pixels_clamped = put_pixels_clamped_c;
2676 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2677 c->add_pixels_clamped = add_pixels_clamped_c;
2679 c->sum_abs_dctelem = sum_abs_dctelem_c;
2684 c->pix_sum = pix_sum_c;
2685 c->pix_norm1 = pix_norm1_c;
/* Block fill: index 0 = 16-wide, index 1 = 8-wide. */
2687 c->fill_block_tab[0] = fill_block16_c;
2688 c->fill_block_tab[1] = fill_block8_c;
2690 /* TODO [0] 16 [1] 8 */
/* SAD with optional half-pel interpolation: [size][0..3] =
 * full / x-half / y-half / xy-half pel. */
2691 c->pix_abs[0][0] = pix_abs16_c;
2692 c->pix_abs[0][1] = pix_abs16_x2_c;
2693 c->pix_abs[0][2] = pix_abs16_y2_c;
2694 c->pix_abs[0][3] = pix_abs16_xy2_c;
2695 c->pix_abs[1][0] = pix_abs8_c;
2696 c->pix_abs[1][1] = pix_abs8_x2_c;
2697 c->pix_abs[1][2] = pix_abs8_y2_c;
2698 c->pix_abs[1][3] = pix_abs8_xy2_c;
/* Fill one 16-entry quarter-pel MC table: index = (dy << 2) | dx,
 * mapped to the _mc<dx><dy>_c functions. */
2700 #define dspfunc(PFX, IDX, NUM) \
2701 c->PFX ## _pixels_tab[IDX][0] = PFX ## NUM ## _mc00_c; \
2702 c->PFX ## _pixels_tab[IDX][1] = PFX ## NUM ## _mc10_c; \
2703 c->PFX ## _pixels_tab[IDX][2] = PFX ## NUM ## _mc20_c; \
2704 c->PFX ## _pixels_tab[IDX][3] = PFX ## NUM ## _mc30_c; \
2705 c->PFX ## _pixels_tab[IDX][4] = PFX ## NUM ## _mc01_c; \
2706 c->PFX ## _pixels_tab[IDX][5] = PFX ## NUM ## _mc11_c; \
2707 c->PFX ## _pixels_tab[IDX][6] = PFX ## NUM ## _mc21_c; \
2708 c->PFX ## _pixels_tab[IDX][7] = PFX ## NUM ## _mc31_c; \
2709 c->PFX ## _pixels_tab[IDX][8] = PFX ## NUM ## _mc02_c; \
2710 c->PFX ## _pixels_tab[IDX][9] = PFX ## NUM ## _mc12_c; \
2711 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2712 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2713 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2714 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2715 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2716 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
/* Quarter-pel MC tables: [0] = 16x16, [1] = 8x8. */
2718 dspfunc(put_qpel, 0, 16);
2719 dspfunc(put_qpel, 1, 8);
2721 dspfunc(put_no_rnd_qpel, 0, 16);
2722 dspfunc(put_no_rnd_qpel, 1, 8);
2724 dspfunc(avg_qpel, 0, 16);
2725 dspfunc(avg_qpel, 1, 8);
/* WMV2 mspel motion compensation table. */
2729 c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
2730 c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
2731 c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
2732 c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
2733 c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
2734 c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
2735 c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
2736 c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
/* Compare-function tables: [0] = 16-wide, [1] = 8x8; slots 4/5 hold the
 * intra variants where they exist. */
2738 #define SET_CMP_FUNC(name) \
2739 c->name[0] = name ## 16_c; \
2740 c->name[1] = name ## 8x8_c;
2742 SET_CMP_FUNC(hadamard8_diff)
2743 c->hadamard8_diff[4] = hadamard8_intra16_c;
2744 c->hadamard8_diff[5] = hadamard8_intra8x8_c;
2745 SET_CMP_FUNC(dct_sad)
2746 SET_CMP_FUNC(dct_max)
2748 SET_CMP_FUNC(dct264_sad)
2750 c->sad[0] = pix_abs16_c;
2751 c->sad[1] = pix_abs8_c;
2752 c->sse[0] = sse16_c;
2755 SET_CMP_FUNC(quant_psnr)
2758 c->vsad[0] = vsad16_c;
2759 c->vsad[4] = vsad_intra16_c;
2760 c->vsad[5] = vsad_intra8_c;
2761 c->vsse[0] = vsse16_c;
2762 c->vsse[4] = vsse_intra16_c;
2763 c->vsse[5] = vsse_intra8_c;
2764 c->nsse[0] = nsse16_c;
2765 c->nsse[1] = nsse8_c;
2766 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2767 ff_dsputil_init_dwt(c);
2770 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* HuffYUV / lossless prediction helpers. */
2772 c->add_bytes = add_bytes_c;
2773 c->add_hfyu_median_prediction = add_hfyu_median_prediction_c;
2774 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2775 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2777 c->diff_bytes = diff_bytes_c;
2778 c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
2780 c->bswap_buf = bswap_buf;
2781 c->bswap16_buf = bswap16_buf;
2783 c->try_8x8basis = try_8x8basis_c;
2784 c->add_8x8basis = add_8x8basis_c;
2786 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2788 c->scalarproduct_int16 = scalarproduct_int16_c;
2789 c->vector_clip_int32 = vector_clip_int32_c;
2790 c->vector_clipf = vector_clipf_c;
/* Image shrink table: [n] halves each dimension n times ([0] = copy). */
2792 c->shrink[0] = av_image_copy_plane;
2793 c->shrink[1] = ff_shrink22;
2794 c->shrink[2] = ff_shrink44;
2795 c->shrink[3] = ff_shrink88;
2797 c->add_pixels8 = add_pixels8_c;
/* Bit-depth-templated functions (from dsputil_template.c inclusions). */
2801 #define FUNC(f, depth) f ## _ ## depth
2802 #define FUNCC(f, depth) f ## _ ## depth ## _c
2804 c->draw_edges = FUNCC(draw_edges, 8);
2806 c->clear_block = FUNCC(clear_block, 8);
2807 c->clear_blocks = FUNCC(clear_blocks, 8);
2809 #define BIT_DEPTH_FUNCS(depth) \
2810 c->get_pixels = FUNCC(get_pixels, depth);
2812 switch (avctx->bits_per_raw_sample) {
2817 BIT_DEPTH_FUNCS(16);
2820 if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
/* Architecture-specific overrides (each guarded by ARCH_* conditionals
 * in the pristine source — dropped by this extraction). */
2828 ff_dsputil_init_alpha(c, avctx);
2830 ff_dsputil_init_arm(c, avctx, high_bit_depth);
2832 ff_dsputil_init_bfin(c, avctx, high_bit_depth);
2834 ff_dsputil_init_ppc(c, avctx, high_bit_depth);
2836 ff_dsputil_init_x86(c, avctx, high_bit_depth);
/* Build the coefficient permutation matching the selected IDCT. */
2838 ff_init_scantable_permutation(c->idct_permutation,
2839 c->idct_permutation_type);
2842 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2844 ff_dsputil_init(c, avctx);
2847 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2849 ff_dsputil_init(c, avctx);