3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "libavutil/internal.h"
34 #include "copy_block.h"
37 #include "simple_idct.h"
40 #include "imgconvert.h"
42 #include "mpegvideo.h"
/* Zero-initialized here; code below indexes it as (ff_square_tab + 256)[d]
 * with d in [-255, 255] — presumably d*d, filled by the DSP init code.
 * NOTE(review): the init site is not visible in this file extract; confirm. */
uint32_t ff_square_tab[512] = { 0, };
49 #include "dsputil_template.c"
53 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// Every byte of a native unsigned long set to 0x7f / 0x80, for SWAR-style
// byte-parallel arithmetic (~0UL/255 yields 0x0101...01).
#define pb_7f (~0UL / 255 * 0x7f)
#define pb_80 (~0UL / 255 * 0x80)
/* Specific zigzag scan order for the 2x4x8 (248) IDCT. Unlike the
 * specification's order, the two fields are interleaved. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* Alternate (horizontal-first) coefficient scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate (vertical-first) coefficient scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input coefficient permutation expected by simple_idct_mmx. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Within-row sample permutation used by the SSE2 IDCT. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
108 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
109 const uint8_t *src_scantable)
113 st->scantable = src_scantable;
115 for (i = 0; i < 64; i++) {
116 int j = src_scantable[i];
117 st->permutated[i] = permutation[j];
121 for (i = 0; i < 64; i++) {
122 int j = st->permutated[i];
125 st->raster_end[i] = end;
129 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
130 int idct_permutation_type)
134 switch (idct_permutation_type) {
135 case FF_NO_IDCT_PERM:
136 for (i = 0; i < 64; i++)
137 idct_permutation[i] = i;
139 case FF_LIBMPEG2_IDCT_PERM:
140 for (i = 0; i < 64; i++)
141 idct_permutation[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
143 case FF_SIMPLE_IDCT_PERM:
144 for (i = 0; i < 64; i++)
145 idct_permutation[i] = simple_mmx_permutation[i];
147 case FF_TRANSPOSE_IDCT_PERM:
148 for (i = 0; i < 64; i++)
149 idct_permutation[i] = ((i & 7) << 3) | (i >> 3);
151 case FF_PARTTRANS_IDCT_PERM:
152 for (i = 0; i < 64; i++)
153 idct_permutation[i] = (i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3);
155 case FF_SSE2_IDCT_PERM:
156 for (i = 0; i < 64; i++)
157 idct_permutation[i] = (i & 0x38) | idct_sse2_row_perm[i & 7];
160 av_log(NULL, AV_LOG_ERROR,
161 "Internal error, IDCT permutation not set\n");
/* Sum of all 256 pixel values of a 16x16 block (rows line_size apart). */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int total = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            total += pix[col];
        pix += line_size;
    }
    return total;
}
186 static int pix_norm1_c(uint8_t *pix, int line_size)
189 uint32_t *sq = ff_square_tab + 256;
191 for (i = 0; i < 16; i++) {
192 for (j = 0; j < 16; j += 8) {
204 register uint64_t x = *(uint64_t *) pix;
206 s += sq[(x >> 8) & 0xff];
207 s += sq[(x >> 16) & 0xff];
208 s += sq[(x >> 24) & 0xff];
209 s += sq[(x >> 32) & 0xff];
210 s += sq[(x >> 40) & 0xff];
211 s += sq[(x >> 48) & 0xff];
212 s += sq[(x >> 56) & 0xff];
214 register uint32_t x = *(uint32_t *) pix;
216 s += sq[(x >> 8) & 0xff];
217 s += sq[(x >> 16) & 0xff];
218 s += sq[(x >> 24) & 0xff];
219 x = *(uint32_t *) (pix + 4);
221 s += sq[(x >> 8) & 0xff];
222 s += sq[(x >> 16) & 0xff];
223 s += sq[(x >> 24) & 0xff];
228 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst; the main loop is unrolled
 * eight-wide, the remainder handled one word at a time. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int i = 0;

    for (; i + 8 <= w; i += 8) {
        dst[i]     = av_bswap32(src[i]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    for (; i < w; i++)
        dst[i] = av_bswap32(src[i]);
}
/* Byte-swap len 16-bit values from src into dst. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--) {
        *dst = av_bswap16(*src);
        dst++;
        src++;
    }
}
257 static int sse4_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
260 uint32_t *sq = ff_square_tab + 256;
262 for (i = 0; i < h; i++) {
263 s += sq[pix1[0] - pix2[0]];
264 s += sq[pix1[1] - pix2[1]];
265 s += sq[pix1[2] - pix2[2]];
266 s += sq[pix1[3] - pix2[3]];
273 static int sse8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
276 uint32_t *sq = ff_square_tab + 256;
278 for (i = 0; i < h; i++) {
279 s += sq[pix1[0] - pix2[0]];
280 s += sq[pix1[1] - pix2[1]];
281 s += sq[pix1[2] - pix2[2]];
282 s += sq[pix1[3] - pix2[3]];
283 s += sq[pix1[4] - pix2[4]];
284 s += sq[pix1[5] - pix2[5]];
285 s += sq[pix1[6] - pix2[6]];
286 s += sq[pix1[7] - pix2[7]];
293 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
296 uint32_t *sq = ff_square_tab + 256;
298 for (i = 0; i < h; i++) {
299 s += sq[pix1[0] - pix2[0]];
300 s += sq[pix1[1] - pix2[1]];
301 s += sq[pix1[2] - pix2[2]];
302 s += sq[pix1[3] - pix2[3]];
303 s += sq[pix1[4] - pix2[4]];
304 s += sq[pix1[5] - pix2[5]];
305 s += sq[pix1[6] - pix2[6]];
306 s += sq[pix1[7] - pix2[7]];
307 s += sq[pix1[8] - pix2[8]];
308 s += sq[pix1[9] - pix2[9]];
309 s += sq[pix1[10] - pix2[10]];
310 s += sq[pix1[11] - pix2[11]];
311 s += sq[pix1[12] - pix2[12]];
312 s += sq[pix1[13] - pix2[13]];
313 s += sq[pix1[14] - pix2[14]];
314 s += sq[pix1[15] - pix2[15]];
322 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
323 const uint8_t *s2, int stride)
327 /* read the pixels */
328 for (i = 0; i < 8; i++) {
329 block[0] = s1[0] - s2[0];
330 block[1] = s1[1] - s2[1];
331 block[2] = s1[2] - s2[2];
332 block[3] = s1[3] - s2[3];
333 block[4] = s1[4] - s2[4];
334 block[5] = s1[5] - s2[5];
335 block[6] = s1[6] - s2[6];
336 block[7] = s1[7] - s2[7];
343 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
348 /* read the pixels */
349 for (i = 0; i < 8; i++) {
350 pixels[0] = av_clip_uint8(block[0]);
351 pixels[1] = av_clip_uint8(block[1]);
352 pixels[2] = av_clip_uint8(block[2]);
353 pixels[3] = av_clip_uint8(block[3]);
354 pixels[4] = av_clip_uint8(block[4]);
355 pixels[5] = av_clip_uint8(block[5]);
356 pixels[6] = av_clip_uint8(block[6]);
357 pixels[7] = av_clip_uint8(block[7]);
364 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
369 /* read the pixels */
371 pixels[0] = av_clip_uint8(block[0]);
372 pixels[1] = av_clip_uint8(block[1]);
373 pixels[2] = av_clip_uint8(block[2]);
374 pixels[3] = av_clip_uint8(block[3]);
381 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
386 /* read the pixels */
388 pixels[0] = av_clip_uint8(block[0]);
389 pixels[1] = av_clip_uint8(block[1]);
396 static void put_signed_pixels_clamped_c(const int16_t *block,
397 uint8_t *av_restrict pixels,
402 for (i = 0; i < 8; i++) {
403 for (j = 0; j < 8; j++) {
406 else if (*block > 127)
409 *pixels = (uint8_t) (*block + 128);
413 pixels += (line_size - 8);
417 static void add_pixels8_c(uint8_t *av_restrict pixels, int16_t *block,
422 for (i = 0; i < 8; i++) {
423 pixels[0] += block[0];
424 pixels[1] += block[1];
425 pixels[2] += block[2];
426 pixels[3] += block[3];
427 pixels[4] += block[4];
428 pixels[5] += block[5];
429 pixels[6] += block[6];
430 pixels[7] += block[7];
436 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
441 /* read the pixels */
442 for (i = 0; i < 8; i++) {
443 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
444 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
445 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
446 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
447 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
448 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
449 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
450 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
456 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
461 /* read the pixels */
463 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
464 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
465 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
466 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
472 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
477 /* read the pixels */
479 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
480 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
/* Sum of absolute values of all 64 coefficients of a DCT block. */
static int sum_abs_dctelem_c(int16_t *block)
{
    int total = 0, i;

    for (i = 0; i < 64; i++) {
        int v = block[i];
        total += v >= 0 ? v : -v;
    }
    return total;
}
/* Fill h rows of 16 bytes each (rows line_size apart) with value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/* Fill h rows of 8 bytes each (rows line_size apart) with value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int row;

    for (row = 0; row < h; row++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* Rounded two- and four-way averages used by the pel interpolation code.
 * Arguments are now parenthesized so expression arguments (e.g. a | b,
 * x + y) cannot be re-associated by operator precedence (CERT PRE01-C);
 * all existing well-formed uses are unaffected. */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/* GMC with one motion vector: bilinear 1/16-pel interpolation of an
 * 8-wide, h-row strip. Each output pixel is the 2x2-neighbourhood sum
 * weighted by the x16/y16 sub-pel fractions, plus rounder, >> 8. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (x16)      * (16 - y16);
    const int C = (16 - x16) * (y16);
    const int D = (x16)      * (y16);
    int row, x;

    for (row = 0; row < h; row++) {
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x]          + B * src[x + 1] +
                      C * src[stride + x] + D * src[stride + x + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
/* Global motion compensation: fills an 8-pixel-wide, h-row strip of dst by
 * sampling src at affine-transformed sub-pel positions (parameters
 * dxx/dxy/dyx/dyy step the position, shift sets the sub-pel scale s,
 * r is the rounding term) with bilinear weighting; coordinates outside
 * [0,width]x[0,height] fall into the edge-clamped branches below.
 * NOTE(review): this extract is missing several lines of the function
 * (the declarations of y/x/vx/vy/index, the width--/height-- adjustment,
 * the "+ r) >> (shift * 2)" rounding tails, the else keywords and the
 * vx/vy/ox/oy stepping); restore from upstream before editing. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
    const int s = 1 << shift;  /* one full pel == s sub-pel units */

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++) { // FIXME: optimize
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            int frac_x = src_x & (s - 1);  /* sub-pel fraction in x */
            int frac_y = src_y & (s - 1);  /* sub-pel fraction in y */
            /* Fully inside the picture: 2x2 bilinear interpolation. */
            if ((unsigned) src_x < width) {
                if ((unsigned) src_y < height) {
                    index = src_x + src_y * stride;
                    dst[y * stride + x] =
                        ((src[index] * (s - frac_x) +
                          src[index + 1] * frac_x) * (s - frac_y) +
                         (src[index + stride] * (s - frac_x) +
                          src[index + stride + 1] * frac_x) * frac_y +
                    /* Vertically outside: clamp y, interpolate in x only. */
                    index = src_x + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] =
                        ((src[index] * (s - frac_x) +
                          src[index + 1] * frac_x) * s +
                /* Horizontally outside: clamp x, interpolate in y only. */
                if ((unsigned) src_y < height) {
                    index = av_clip(src_x, 0, width) + src_y * stride;
                    dst[y * stride + x] =
                        ((src[index] * (s - frac_y) +
                          src[index + stride] * frac_y) * s +
                    /* Both outside: nearest edge pixel, no interpolation. */
                    index = av_clip(src_x, 0, width) +
                            av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = src[index];
/* Third-pel MC, full-pel case: plain copy, dispatched on block width.
 * (switch/break scaffolding reconstructed — verify upstream.) */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    switch (width) {
    case 2:
        put_pixels2_8_c(dst, src, stride, height);
        break;
    case 4:
        put_pixels4_8_c(dst, src, stride, height);
        break;
    case 8:
        put_pixels8_8_c(dst, src, stride, height);
        break;
    case 16:
        put_pixels16_8_c(dst, src, stride, height);
        break;
    }
}
/* Horizontal 1/3-pel: (2*a + b)/3 via fixed-point multiply by 683/2048. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = ((2 * src[x] + src[x + 1] + 1) *
                      683) >> 11;
        dst += stride;
        src += stride;
    }
}
/* Horizontal 2/3-pel: (a + 2*b)/3 via fixed-point multiply by 683/2048. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = ((src[x] + 2 * src[x + 1] + 1) *
                      683) >> 11;
        dst += stride;
        src += stride;
    }
}
/* Vertical 1/3-pel: (2*a + below)/3 via fixed-point multiply by 683/2048. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = ((2 * src[x] + src[x + stride] + 1) *
                      683) >> 11;
        dst += stride;
        src += stride;
    }
}
/* Diagonal (1/3,1/3)-pel: weighted 2x2 average via multiply by 2731/32768. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = ((4 * src[x] + 3 * src[x + 1] +
                       3 * src[x + stride] + 2 * src[x + stride + 1] + 6) *
                      2731) >> 15;
        dst += stride;
        src += stride;
    }
}
/* Diagonal (1/3,2/3)-pel: weighted 2x2 average via multiply by 2731/32768. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = ((3 * src[x] + 2 * src[x + 1] +
                       4 * src[x + stride] + 3 * src[x + stride + 1] + 6) *
                      2731) >> 15;
        dst += stride;
        src += stride;
    }
}
/* Vertical 2/3-pel: (a + 2*below)/3 via fixed-point multiply by 683/2048. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = ((src[x] + 2 * src[x + stride] + 1) *
                      683) >> 11;
        dst += stride;
        src += stride;
    }
}
/* Diagonal (2/3,1/3)-pel: weighted 2x2 average via multiply by 2731/32768. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = ((3 * src[x] + 4 * src[x + 1] +
                       2 * src[x + stride] + 3 * src[x + stride + 1] + 6) *
                      2731) >> 15;
        dst += stride;
        src += stride;
    }
}
/* Diagonal (2/3,2/3)-pel: weighted 2x2 average via multiply by 2731/32768. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = ((2 * src[x] + 3 * src[x + 1] +
                       3 * src[x + stride] + 4 * src[x + stride + 1] + 6) *
                      2731) >> 15;
        dst += stride;
        src += stride;
    }
}
/* Third-pel MC, full-pel averaging case: dispatch on block width.
 * (switch/break scaffolding reconstructed — verify upstream.) */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    switch (width) {
    case 2:
        avg_pixels2_8_c(dst, src, stride, height);
        break;
    case 4:
        avg_pixels4_8_c(dst, src, stride, height);
        break;
    case 8:
        avg_pixels8_8_c(dst, src, stride, height);
        break;
    case 16:
        avg_pixels16_8_c(dst, src, stride, height);
        break;
    }
}
/* As put_tpel_pixels_mc10_c, but rounding-average the result into dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] +
                      (((2 * src[x] + src[x + 1] + 1) *
                        683) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
/* As put_tpel_pixels_mc20_c, but rounding-average the result into dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] +
                      (((src[x] + 2 * src[x + 1] + 1) *
                        683) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
/* As put_tpel_pixels_mc01_c, but rounding-average the result into dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] +
                      (((2 * src[x] + src[x + stride] + 1) *
                        683) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
/* As put_tpel_pixels_mc11_c, but rounding-average the result into dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] +
                      (((4 * src[x] + 3 * src[x + 1] +
                         3 * src[x + stride] + 2 * src[x + stride + 1] + 6) *
                        2731) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
/* As put_tpel_pixels_mc12_c, but rounding-average the result into dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] +
                      (((3 * src[x] + 2 * src[x + 1] +
                         4 * src[x + stride] + 3 * src[x + stride + 1] + 6) *
                        2731) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
/* As put_tpel_pixels_mc02_c, but rounding-average the result into dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] +
                      (((src[x] + 2 * src[x + stride] + 1) *
                        683) >> 11) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
/* As put_tpel_pixels_mc21_c, but rounding-average the result into dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] +
                      (((3 * src[x] + 4 * src[x + 1] +
                         2 * src[x + stride] + 3 * src[x + stride + 1] + 6) *
                        2731) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
/* As put_tpel_pixels_mc22_c, but rounding-average the result into dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] +
                      (((2 * src[x] + 3 * src[x + 1] +
                         3 * src[x + stride] + 4 * src[x + stride + 1] + 6) *
                        2731) >> 15) + 1) >> 1;
        dst += stride;
        src += stride;
    }
}
882 #define QPEL_MC(r, OPNAME, RND, OP) \
883 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, \
884 int dstStride, int srcStride, \
887 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
890 for (i = 0; i < h; i++) { \
891 OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
892 OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
893 OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
894 OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
895 OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
896 OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[8])); \
897 OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[8]) * 3 - (src[3] + src[7])); \
898 OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[8]) * 6 + (src[5] + src[7]) * 3 - (src[4] + src[6])); \
904 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, \
905 int dstStride, int srcStride) \
907 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
911 for (i = 0; i < w; i++) { \
912 const int src0 = src[0 * srcStride]; \
913 const int src1 = src[1 * srcStride]; \
914 const int src2 = src[2 * srcStride]; \
915 const int src3 = src[3 * srcStride]; \
916 const int src4 = src[4 * srcStride]; \
917 const int src5 = src[5 * srcStride]; \
918 const int src6 = src[6 * srcStride]; \
919 const int src7 = src[7 * srcStride]; \
920 const int src8 = src[8 * srcStride]; \
921 OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
922 OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
923 OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
924 OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
925 OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
926 OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src8)); \
927 OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src8) * 3 - (src3 + src7)); \
928 OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src8) * 6 + (src5 + src7) * 3 - (src4 + src6)); \
934 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, \
935 int dstStride, int srcStride, \
938 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
941 for (i = 0; i < h; i++) { \
942 OP(dst[0], (src[0] + src[1]) * 20 - (src[0] + src[2]) * 6 + (src[1] + src[3]) * 3 - (src[2] + src[4])); \
943 OP(dst[1], (src[1] + src[2]) * 20 - (src[0] + src[3]) * 6 + (src[0] + src[4]) * 3 - (src[1] + src[5])); \
944 OP(dst[2], (src[2] + src[3]) * 20 - (src[1] + src[4]) * 6 + (src[0] + src[5]) * 3 - (src[0] + src[6])); \
945 OP(dst[3], (src[3] + src[4]) * 20 - (src[2] + src[5]) * 6 + (src[1] + src[6]) * 3 - (src[0] + src[7])); \
946 OP(dst[4], (src[4] + src[5]) * 20 - (src[3] + src[6]) * 6 + (src[2] + src[7]) * 3 - (src[1] + src[8])); \
947 OP(dst[5], (src[5] + src[6]) * 20 - (src[4] + src[7]) * 6 + (src[3] + src[8]) * 3 - (src[2] + src[9])); \
948 OP(dst[6], (src[6] + src[7]) * 20 - (src[5] + src[8]) * 6 + (src[4] + src[9]) * 3 - (src[3] + src[10])); \
949 OP(dst[7], (src[7] + src[8]) * 20 - (src[6] + src[9]) * 6 + (src[5] + src[10]) * 3 - (src[4] + src[11])); \
950 OP(dst[8], (src[8] + src[9]) * 20 - (src[7] + src[10]) * 6 + (src[6] + src[11]) * 3 - (src[5] + src[12])); \
951 OP(dst[9], (src[9] + src[10]) * 20 - (src[8] + src[11]) * 6 + (src[7] + src[12]) * 3 - (src[6] + src[13])); \
952 OP(dst[10], (src[10] + src[11]) * 20 - (src[9] + src[12]) * 6 + (src[8] + src[13]) * 3 - (src[7] + src[14])); \
953 OP(dst[11], (src[11] + src[12]) * 20 - (src[10] + src[13]) * 6 + (src[9] + src[14]) * 3 - (src[8] + src[15])); \
954 OP(dst[12], (src[12] + src[13]) * 20 - (src[11] + src[14]) * 6 + (src[10] + src[15]) * 3 - (src[9] + src[16])); \
955 OP(dst[13], (src[13] + src[14]) * 20 - (src[12] + src[15]) * 6 + (src[11] + src[16]) * 3 - (src[10] + src[16])); \
956 OP(dst[14], (src[14] + src[15]) * 20 - (src[13] + src[16]) * 6 + (src[12] + src[16]) * 3 - (src[11] + src[15])); \
957 OP(dst[15], (src[15] + src[16]) * 20 - (src[14] + src[16]) * 6 + (src[13] + src[15]) * 3 - (src[12] + src[14])); \
963 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, \
964 int dstStride, int srcStride) \
966 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP; \
970 for (i = 0; i < w; i++) { \
971 const int src0 = src[0 * srcStride]; \
972 const int src1 = src[1 * srcStride]; \
973 const int src2 = src[2 * srcStride]; \
974 const int src3 = src[3 * srcStride]; \
975 const int src4 = src[4 * srcStride]; \
976 const int src5 = src[5 * srcStride]; \
977 const int src6 = src[6 * srcStride]; \
978 const int src7 = src[7 * srcStride]; \
979 const int src8 = src[8 * srcStride]; \
980 const int src9 = src[9 * srcStride]; \
981 const int src10 = src[10 * srcStride]; \
982 const int src11 = src[11 * srcStride]; \
983 const int src12 = src[12 * srcStride]; \
984 const int src13 = src[13 * srcStride]; \
985 const int src14 = src[14 * srcStride]; \
986 const int src15 = src[15 * srcStride]; \
987 const int src16 = src[16 * srcStride]; \
988 OP(dst[0 * dstStride], (src0 + src1) * 20 - (src0 + src2) * 6 + (src1 + src3) * 3 - (src2 + src4)); \
989 OP(dst[1 * dstStride], (src1 + src2) * 20 - (src0 + src3) * 6 + (src0 + src4) * 3 - (src1 + src5)); \
990 OP(dst[2 * dstStride], (src2 + src3) * 20 - (src1 + src4) * 6 + (src0 + src5) * 3 - (src0 + src6)); \
991 OP(dst[3 * dstStride], (src3 + src4) * 20 - (src2 + src5) * 6 + (src1 + src6) * 3 - (src0 + src7)); \
992 OP(dst[4 * dstStride], (src4 + src5) * 20 - (src3 + src6) * 6 + (src2 + src7) * 3 - (src1 + src8)); \
993 OP(dst[5 * dstStride], (src5 + src6) * 20 - (src4 + src7) * 6 + (src3 + src8) * 3 - (src2 + src9)); \
994 OP(dst[6 * dstStride], (src6 + src7) * 20 - (src5 + src8) * 6 + (src4 + src9) * 3 - (src3 + src10)); \
995 OP(dst[7 * dstStride], (src7 + src8) * 20 - (src6 + src9) * 6 + (src5 + src10) * 3 - (src4 + src11)); \
996 OP(dst[8 * dstStride], (src8 + src9) * 20 - (src7 + src10) * 6 + (src6 + src11) * 3 - (src5 + src12)); \
997 OP(dst[9 * dstStride], (src9 + src10) * 20 - (src8 + src11) * 6 + (src7 + src12) * 3 - (src6 + src13)); \
998 OP(dst[10 * dstStride], (src10 + src11) * 20 - (src9 + src12) * 6 + (src8 + src13) * 3 - (src7 + src14)); \
999 OP(dst[11 * dstStride], (src11 + src12) * 20 - (src10 + src13) * 6 + (src9 + src14) * 3 - (src8 + src15)); \
1000 OP(dst[12 * dstStride], (src12 + src13) * 20 - (src11 + src14) * 6 + (src10 + src15) * 3 - (src9 + src16)); \
1001 OP(dst[13 * dstStride], (src13 + src14) * 20 - (src12 + src15) * 6 + (src11 + src16) * 3 - (src10 + src16)); \
1002 OP(dst[14 * dstStride], (src14 + src15) * 20 - (src13 + src16) * 6 + (src12 + src16) * 3 - (src11 + src15)); \
1003 OP(dst[15 * dstStride], (src15 + src16) * 20 - (src14 + src16) * 6 + (src13 + src15) * 3 - (src12 + src14)); \
1009 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, \
1014 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
1015 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8); \
1018 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, \
1021 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8); \
1024 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, \
1029 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8); \
1030 OPNAME ## pixels8_l2_8(dst, src + 1, half, stride, stride, 8, 8); \
1033 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, \
1036 uint8_t full[16 * 9]; \
1039 copy_block9(full, src, 16, stride, 9); \
1040 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
1041 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8); \
1044 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, \
1047 uint8_t full[16 * 9]; \
1049 copy_block9(full, src, 16, stride, 9); \
1050 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16); \
1053 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, \
1056 uint8_t full[16 * 9]; \
1059 copy_block9(full, src, 16, stride, 9); \
1060 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16); \
1061 OPNAME ## pixels8_l2_8(dst, full + 16, half, stride, 16, 8, 8); \
1064 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, \
1067 uint8_t full[16 * 9]; \
1068 uint8_t halfH[72]; \
1069 uint8_t halfV[64]; \
1070 uint8_t halfHV[64]; \
1072 copy_block9(full, src, 16, stride, 9); \
1073 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
1074 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
1075 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
1076 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, \
1077 stride, 16, 8, 8, 8, 8); \
1080 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, \
1083 uint8_t full[16 * 9]; \
1084 uint8_t halfH[72]; \
1085 uint8_t halfHV[64]; \
1087 copy_block9(full, src, 16, stride, 9); \
1088 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
1089 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
1090 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
1091 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
1094 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, \
1097 uint8_t full[16 * 9]; \
1098 uint8_t halfH[72]; \
1099 uint8_t halfV[64]; \
1100 uint8_t halfHV[64]; \
1102 copy_block9(full, src, 16, stride, 9); \
1103 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
1104 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
1105 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
1106 OPNAME ## pixels8_l4_8(dst, full + 1, halfH, halfV, halfHV, \
1107 stride, 16, 8, 8, 8, 8); \
1110 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, \
1113 uint8_t full[16 * 9]; \
1114 uint8_t halfH[72]; \
1115 uint8_t halfHV[64]; \
1117 copy_block9(full, src, 16, stride, 9); \
1118 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
1119 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
1120 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
1121 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
1124 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, \
1127 uint8_t full[16 * 9]; \
1128 uint8_t halfH[72]; \
1129 uint8_t halfV[64]; \
1130 uint8_t halfHV[64]; \
1132 copy_block9(full, src, 16, stride, 9); \
1133 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
1134 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
1135 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
1136 OPNAME ## pixels8_l4_8(dst, full + 16, halfH + 8, halfV, halfHV, \
1137 stride, 16, 8, 8, 8, 8); \
1140 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, \
1143 uint8_t full[16 * 9]; \
1144 uint8_t halfH[72]; \
1145 uint8_t halfHV[64]; \
1147 copy_block9(full, src, 16, stride, 9); \
1148 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
1149 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
1150 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
1151 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
1154 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, \
1157 uint8_t full[16 * 9]; \
1158 uint8_t halfH[72]; \
1159 uint8_t halfV[64]; \
1160 uint8_t halfHV[64]; \
1162 copy_block9(full, src, 16, stride, 9); \
1163 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
1164 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
1165 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
1166 OPNAME ## pixels8_l4_8(dst, full + 17, halfH + 8, halfV, halfHV, \
1167 stride, 16, 8, 8, 8, 8); \
1170 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, \
1173 uint8_t full[16 * 9]; \
1174 uint8_t halfH[72]; \
1175 uint8_t halfHV[64]; \
1177 copy_block9(full, src, 16, stride, 9); \
1178 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
1179 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
1180 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
1181 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
1184 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, \
1187 uint8_t halfH[72]; \
1188 uint8_t halfHV[64]; \
1190 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
1191 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
1192 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8); \
1195 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, \
1198 uint8_t halfH[72]; \
1199 uint8_t halfHV[64]; \
1201 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
1202 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
1203 OPNAME ## pixels8_l2_8(dst, halfH + 8, halfHV, stride, 8, 8, 8); \
1206 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, \
1209 uint8_t full[16 * 9]; \
1210 uint8_t halfH[72]; \
1211 uint8_t halfV[64]; \
1212 uint8_t halfHV[64]; \
1214 copy_block9(full, src, 16, stride, 9); \
1215 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
1216 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16); \
1217 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
1218 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); \
1221 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, \
1224 uint8_t full[16 * 9]; \
1225 uint8_t halfH[72]; \
1227 copy_block9(full, src, 16, stride, 9); \
1228 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
1229 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9); \
1230 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
1233 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, \
1236 uint8_t full[16 * 9]; \
1237 uint8_t halfH[72]; \
1238 uint8_t halfV[64]; \
1239 uint8_t halfHV[64]; \
1241 copy_block9(full, src, 16, stride, 9); \
1242 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
1243 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full + 1, 8, 16); \
1244 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8); \
1245 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8); \
1248 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, \
1251 uint8_t full[16 * 9]; \
1252 uint8_t halfH[72]; \
1254 copy_block9(full, src, 16, stride, 9); \
1255 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9); \
1256 put ## RND ## pixels8_l2_8(halfH, halfH, full + 1, 8, 8, 16, 9); \
1257 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
1260 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, \
1263 uint8_t halfH[72]; \
1265 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9); \
1266 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8); \
1269 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, \
1272 uint8_t half[256]; \
1274 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
1275 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16); \
1278 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, \
1281 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16); \
1284 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, \
1287 uint8_t half[256]; \
1289 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16); \
1290 OPNAME ## pixels16_l2_8(dst, src + 1, half, stride, stride, 16, 16); \
1293 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, \
1296 uint8_t full[24 * 17]; \
1297 uint8_t half[256]; \
1299 copy_block17(full, src, 24, stride, 17); \
1300 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
1301 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16); \
1304 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, \
1307 uint8_t full[24 * 17]; \
1309 copy_block17(full, src, 24, stride, 17); \
1310 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24); \
1313 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, \
1316 uint8_t full[24 * 17]; \
1317 uint8_t half[256]; \
1319 copy_block17(full, src, 24, stride, 17); \
1320 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24); \
1321 OPNAME ## pixels16_l2_8(dst, full + 24, half, stride, 24, 16, 16); \
1324 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, \
1327 uint8_t full[24 * 17]; \
1328 uint8_t halfH[272]; \
1329 uint8_t halfV[256]; \
1330 uint8_t halfHV[256]; \
1332 copy_block17(full, src, 24, stride, 17); \
1333 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1334 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1335 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1336 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, \
1337 stride, 24, 16, 16, 16, 16); \
1340 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, \
1343 uint8_t full[24 * 17]; \
1344 uint8_t halfH[272]; \
1345 uint8_t halfHV[256]; \
1347 copy_block17(full, src, 24, stride, 17); \
1348 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1349 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1350 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1351 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1354 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, \
1357 uint8_t full[24 * 17]; \
1358 uint8_t halfH[272]; \
1359 uint8_t halfV[256]; \
1360 uint8_t halfHV[256]; \
1362 copy_block17(full, src, 24, stride, 17); \
1363 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1364 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1365 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1366 OPNAME ## pixels16_l4_8(dst, full + 1, halfH, halfV, halfHV, \
1367 stride, 24, 16, 16, 16, 16); \
1370 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, \
1373 uint8_t full[24 * 17]; \
1374 uint8_t halfH[272]; \
1375 uint8_t halfHV[256]; \
1377 copy_block17(full, src, 24, stride, 17); \
1378 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1379 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1380 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1381 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1384 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, \
1387 uint8_t full[24 * 17]; \
1388 uint8_t halfH[272]; \
1389 uint8_t halfV[256]; \
1390 uint8_t halfHV[256]; \
1392 copy_block17(full, src, 24, stride, 17); \
1393 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1394 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1395 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1396 OPNAME ## pixels16_l4_8(dst, full + 24, halfH + 16, halfV, halfHV, \
1397 stride, 24, 16, 16, 16, 16); \
1400 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, \
1403 uint8_t full[24 * 17]; \
1404 uint8_t halfH[272]; \
1405 uint8_t halfHV[256]; \
1407 copy_block17(full, src, 24, stride, 17); \
1408 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1409 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1410 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1411 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1414 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, \
1417 uint8_t full[24 * 17]; \
1418 uint8_t halfH[272]; \
1419 uint8_t halfV[256]; \
1420 uint8_t halfHV[256]; \
1422 copy_block17(full, src, 24, stride, 17); \
1423 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1424 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1425 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1426 OPNAME ## pixels16_l4_8(dst, full + 25, halfH + 16, halfV, halfHV, \
1427 stride, 24, 16, 16, 16, 16); \
1430 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, \
1433 uint8_t full[24 * 17]; \
1434 uint8_t halfH[272]; \
1435 uint8_t halfHV[256]; \
1437 copy_block17(full, src, 24, stride, 17); \
1438 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1439 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1440 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1441 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1444 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, \
1447 uint8_t halfH[272]; \
1448 uint8_t halfHV[256]; \
1450 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1451 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1452 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16); \
1455 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, \
1458 uint8_t halfH[272]; \
1459 uint8_t halfHV[256]; \
1461 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1462 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1463 OPNAME ## pixels16_l2_8(dst, halfH + 16, halfHV, stride, 16, 16, 16); \
1466 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, \
1469 uint8_t full[24 * 17]; \
1470 uint8_t halfH[272]; \
1471 uint8_t halfV[256]; \
1472 uint8_t halfHV[256]; \
1474 copy_block17(full, src, 24, stride, 17); \
1475 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1476 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24); \
1477 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1478 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); \
1481 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, \
1484 uint8_t full[24 * 17]; \
1485 uint8_t halfH[272]; \
1487 copy_block17(full, src, 24, stride, 17); \
1488 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1489 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17); \
1490 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
1493 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, \
1496 uint8_t full[24 * 17]; \
1497 uint8_t halfH[272]; \
1498 uint8_t halfV[256]; \
1499 uint8_t halfHV[256]; \
1501 copy_block17(full, src, 24, stride, 17); \
1502 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1503 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full + 1, 16, 24); \
1504 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16); \
1505 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16); \
1508 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, \
1511 uint8_t full[24 * 17]; \
1512 uint8_t halfH[272]; \
1514 copy_block17(full, src, 24, stride, 17); \
1515 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); \
1516 put ## RND ## pixels16_l2_8(halfH, halfH, full + 1, 16, 16, 24, 17); \
1517 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
1520 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, \
1523 uint8_t halfH[272]; \
1525 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); \
1526 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); \
/* Final per-pixel store operations plugged into the QPEL_MC() expansions
 * above.  "b" is the raw filter accumulator: it is rounded (+16) or
 * truncated (+15), scaled down by >>5 and clamped through the cm[] table
 * (presumably ff_crop_tab + MAX_NEG_CROP in the expanded scope — confirm).
 * The avg variants additionally average with the existing dst pixel,
 * with (+1) rounding. */
#define op_avg(a, b) a = (((a) + cm[((b) + 16) >> 5] + 1) >> 1)
#define op_avg_no_rnd(a, b) a = (((a) + cm[((b) + 15) >> 5]) >> 1)
#define op_put(a, b) a = cm[((b) + 16) >> 5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15) >> 5]

/* Instantiate the three families of quarter-pel MC functions. */
QPEL_MC(0, put_, _, op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_, _, op_avg)
/* NOTE(review): matching #undefs for op_avg/op_avg_no_rnd/op_put appear
 * to have been elided in this copy — confirm against upstream. */
#undef op_put_no_rnd
/**
 * Copy one 8x8 block of 8-bit pixels from src to dst.
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}
/**
 * Average one 8x8 block of 8-bit pixels from src into dst.
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}
/**
 * Copy one 16x16 block of 8-bit pixels from src to dst.
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}
/**
 * Average one 16x16 block of 8-bit pixels from src into dst.
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
/* The (0,0) quarter-pel positions are exact integer-pel copies/averages,
 * so alias them to the generic pixel helpers instead of generating code. */
#define put_qpel8_mc00_c ff_put_pixels8x8_c
#define avg_qpel8_mc00_c ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1569 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src,
1570 int dstStride, int srcStride, int h)
1572 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1575 for (i = 0; i < h; i++) {
1576 dst[0] = cm[(9 * (src[0] + src[1]) - (src[-1] + src[2]) + 8) >> 4];
1577 dst[1] = cm[(9 * (src[1] + src[2]) - (src[0] + src[3]) + 8) >> 4];
1578 dst[2] = cm[(9 * (src[2] + src[3]) - (src[1] + src[4]) + 8) >> 4];
1579 dst[3] = cm[(9 * (src[3] + src[4]) - (src[2] + src[5]) + 8) >> 4];
1580 dst[4] = cm[(9 * (src[4] + src[5]) - (src[3] + src[6]) + 8) >> 4];
1581 dst[5] = cm[(9 * (src[5] + src[6]) - (src[4] + src[7]) + 8) >> 4];
1582 dst[6] = cm[(9 * (src[6] + src[7]) - (src[5] + src[8]) + 8) >> 4];
1583 dst[7] = cm[(9 * (src[7] + src[8]) - (src[6] + src[9]) + 8) >> 4];
#if CONFIG_RV40_DECODER
/* RV40 maps its (3,3) quarter-pel position onto the plain centre
 * half-pel (xy2) average, so these wrappers forward to the pixel
 * helpers directly. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
    put_pixels16_xy2_8_c(dst, src, stride, 16);
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
    put_pixels8_xy2_8_c(dst, src, stride, 8);
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
#endif /* CONFIG_RV40_DECODER */
#if CONFIG_DIRAC_DECODER
/* Dirac motion-compensation wrappers: thin forwards to the 8-bit pixel
 * helpers.  src[] carries up to four source block pointers; the _l2/_l4
 * variants average two/four of them, and the 32-wide cases are built
 * from two 16-wide calls.
 * NOTE(review): braces, the DIRAC_MC(...) instantiations and the
 * closing #endif appear to have been elided in this copy — confirm. */
#define DIRAC_MC(OPNAME)\
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
    OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
    OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
    OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
    OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
    OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
    OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
1656 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src,
1657 int dstStride, int srcStride, int w)
1659 const uint8_t *cm = ff_crop_tab + MAX_NEG_CROP;
1662 for (i = 0; i < w; i++) {
1663 const int src_1 = src[-srcStride];
1664 const int src0 = src[0];
1665 const int src1 = src[srcStride];
1666 const int src2 = src[2 * srcStride];
1667 const int src3 = src[3 * srcStride];
1668 const int src4 = src[4 * srcStride];
1669 const int src5 = src[5 * srcStride];
1670 const int src6 = src[6 * srcStride];
1671 const int src7 = src[7 * srcStride];
1672 const int src8 = src[8 * srcStride];
1673 const int src9 = src[9 * srcStride];
1674 dst[0 * dstStride] = cm[(9 * (src0 + src1) - (src_1 + src2) + 8) >> 4];
1675 dst[1 * dstStride] = cm[(9 * (src1 + src2) - (src0 + src3) + 8) >> 4];
1676 dst[2 * dstStride] = cm[(9 * (src2 + src3) - (src1 + src4) + 8) >> 4];
1677 dst[3 * dstStride] = cm[(9 * (src3 + src4) - (src2 + src5) + 8) >> 4];
1678 dst[4 * dstStride] = cm[(9 * (src4 + src5) - (src3 + src6) + 8) >> 4];
1679 dst[5 * dstStride] = cm[(9 * (src5 + src6) - (src4 + src7) + 8) >> 4];
1680 dst[6 * dstStride] = cm[(9 * (src6 + src7) - (src5 + src8) + 8) >> 4];
1681 dst[7 * dstStride] = cm[(9 * (src7 + src8) - (src6 + src9) + 8) >> 4];
/**
 * WMV2 mspel (1,0): quarter-pel left — average of the source and the
 * horizontal half-pel filter output.
 */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hbuf[64]; /* 8x8 horizontally filtered block */

    wmv2_mspel8_h_lowpass(hbuf, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, hbuf, stride, stride, 8, 8);
}
/**
 * WMV2 mspel (2,0): exact horizontal half-pel — filter straight into dst.
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/**
 * WMV2 mspel (3,0): quarter-pel right — average of src+1 and the
 * horizontal half-pel filter output.
 */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hbuf[64]; /* 8x8 horizontally filtered block */

    wmv2_mspel8_h_lowpass(hbuf, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src + 1, hbuf, stride, stride, 8, 8);
}
/**
 * WMV2 mspel (0,2): exact vertical half-pel — filter straight into dst.
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/**
 * WMV2 mspel (1,2): average of the vertical half-pel block and the
 * vertically filtered horizontal half-pel strip.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hmid[88];  /* 8x11 horizontal half-pel strip (starts one row above) */
    uint8_t vmid[64];  /* vertical half-pel of the source */
    uint8_t hvmid[64]; /* vertical filter applied to the horizontal strip */

    wmv2_mspel8_v_lowpass(vmid, src, 8, stride, 8);
    wmv2_mspel8_h_lowpass(hmid, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(hvmid, hmid + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vmid, hvmid, stride, 8, 8, 8);
}
/**
 * WMV2 mspel (3,2): like (1,2) but the vertical half-pel block is taken
 * one column to the right (src + 1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hmid[88];  /* 8x11 horizontal half-pel strip (starts one row above) */
    uint8_t vmid[64];  /* vertical half-pel of src+1 */
    uint8_t hvmid[64]; /* vertical filter applied to the horizontal strip */

    wmv2_mspel8_v_lowpass(vmid, src + 1, 8, stride, 8);
    wmv2_mspel8_h_lowpass(hmid, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(hvmid, hmid + 8, 8, 8, 8);
    put_pixels8_l2_8(dst, vmid, hvmid, stride, 8, 8, 8);
}
/**
 * WMV2 mspel (2,2): horizontal then vertical half-pel filtering.
 */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t hmid[88]; /* 8x11 horizontally filtered strip */

    wmv2_mspel8_h_lowpass(hmid, src - stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, hmid + 8, stride, 8, 8);
}
/**
 * Sum of absolute differences over a 16-wide block of h rows.
 * The first argument (motion-estimation context) is unused here.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2,
                              int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of pix1 against the horizontal half-pel interpolation of pix2,
 * 16 wide: reference pixel is avg2(a, b) = (a + b + 1) >> 1 of two
 * horizontally adjacent pix2 samples.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] -
                       ((pix2[col] + pix2[col + 1] + 1) >> 1));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of pix1 against the vertical half-pel interpolation of pix2,
 * 16 wide: reference pixel is avg2(a, b) = (a + b + 1) >> 1 of two
 * vertically adjacent pix2 samples.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] -
                       ((pix2[col] + below[col] + 1) >> 1));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * SAD of pix1 against the centre (diagonal) half-pel interpolation of
 * pix2, 16 wide: reference pixel is
 * avg4(a, b, c, d) = (a + b + c + d + 2) >> 2 of a 2x2 pix2 neighbourhood.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                           int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] -
                       ((pix2[col] + pix2[col + 1] +
                         below[col] + below[col + 1] + 2) >> 2));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * Sum of absolute differences over an 8-wide block of h rows.
 * The first argument (motion-estimation context) is unused here.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2,
                             int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of pix1 against the horizontal half-pel interpolation of pix2,
 * 8 wide: reference pixel is avg2(a, b) = (a + b + 1) >> 1.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] -
                       ((pix2[col] + pix2[col + 1] + 1) >> 1));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of pix1 against the vertical half-pel interpolation of pix2,
 * 8 wide: reference pixel is avg2(a, b) = (a + b + 1) >> 1.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                         int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] -
                       ((pix2[col] + below[col] + 1) >> 1));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * SAD of pix1 against the centre (diagonal) half-pel interpolation of
 * pix2, 8 wide: reference pixel is
 * avg4(a, b, c, d) = (a + b + c + d + 2) >> 2 of a 2x2 pix2 neighbourhood.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] -
                       ((pix2[col] + pix2[col + 1] +
                         below[col] + below[col + 1] + 2) >> 2));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * Noise-preserving SSE, 16 wide: plain SSE (score1) plus a weighted
 * difference between the cross-gradient texture of the two blocks
 * (score2), so candidates with similar noise structure are preferred.
 * NOTE(review): the per-row pointer advance and the branch selecting
 * between the two return statements (context present vs. default
 * weight 8) appear to have been elided in this copy — confirm upstream.
 */
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h)
    MpegEncContext *c = v;
    int score1 = 0, score2 = 0, x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++)
            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);

        /* second-order (2x2 cross) gradient difference between blocks */
        for (x = 0; x < 15; x++)
            score2 += FFABS(s1[x] - s1[x + stride] -
                            s1[x + 1] + s1[x + stride + 1]) -
                      FFABS(s2[x] - s2[x + stride] -
                            s2[x + 1] + s2[x + stride + 1]);

    /* weight from the codec context when available... */
    return score1 + FFABS(score2) * c->avctx->nsse_weight;
    /* ...default weight of 8 otherwise */
    return score1 + FFABS(score2) * 8;
/**
 * Noise-preserving SSE, 8 wide — see nsse16_c for the metric.
 * NOTE(review): the per-row pointer advance and the branch selecting
 * between the two return statements appear to have been elided in this
 * copy — confirm upstream.
 */
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h)
    MpegEncContext *c = v;
    int score1 = 0, score2 = 0, x, y;

    for (y = 0; y < h; y++) {
        for (x = 0; x < 8; x++)
            score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);

        /* second-order (2x2 cross) gradient difference between blocks */
        for (x = 0; x < 7; x++)
            score2 += FFABS(s1[x] - s1[x + stride] -
                            s1[x + 1] + s1[x + stride + 1]) -
                      FFABS(s2[x] - s2[x + stride] -
                            s2[x + 1] + s2[x + stride + 1]);

    /* weight from the codec context when available... */
    return score1 + FFABS(score2) * c->avctx->nsse_weight;
    /* ...default weight of 8 otherwise */
    return score1 + FFABS(score2) * 8;
/**
 * Estimate the weighted squared error remaining after adding 'basis'
 * scaled by 'scale' (with rounding) to the residual 'rem'.
 * NOTE(review): the loop counter and the declaration of 'w' (per-
 * coefficient factor, presumably taken from weight[]) appear to have
 * been elided in this copy — confirm against upstream before changing.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64],
                          int16_t basis[64], int scale)
    unsigned int sum = 0;

    for (i = 0; i < 8 * 8; i++) {
        /* candidate coefficient after the rounded basis update */
        int b = rem[i] + ((basis[i] * scale +
                           (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
                          (BASIS_SHIFT - RECON_SHIFT));
        av_assert2(-512 < b && b < 512);
        sum += (w * b) * (w * b) >> 4;
2014 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale)
2018 for (i = 0; i < 8 * 8; i++)
2019 rem[i] += (basis[i] * scale +
2020 (1 << (BASIS_SHIFT - RECON_SHIFT - 1))) >>
2021 (BASIS_SHIFT - RECON_SHIFT);
/**
 * Null comparison metric: reports every candidate as a perfect match.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
/**
 * Fill cmp[0..5] with the comparison functions selected by 'type'
 * (one slot per block size the caller indexes).
 * NOTE(review): the 'case FF_CMP_*' labels, break statements and part
 * of the switch appear to have been elided in this copy; only the
 * handler assignments remain visible — confirm against upstream.
 */
void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
    memset(cmp, 0, sizeof(void *) * 6);

    for (i = 0; i < 6; i++) {
        switch (type & 0xFF) {
            cmp[i] = c->hadamard8_diff[i];
            cmp[i] = c->dct_sad[i];
            cmp[i] = c->dct264_sad[i];
            cmp[i] = c->dct_max[i];
            cmp[i] = c->quant_psnr[i];
            cmp[i] = c->vsad[i];
            cmp[i] = c->vsse[i];
            cmp[i] = c->nsse[i];
            /* unknown metric id: report and leave the slot zeroed */
            av_log(NULL, AV_LOG_ERROR,
                   "internal error in cmp function selection\n");
/**
 * Bytewise dst[i] += src[i] (modulo 256) over w bytes, processed one
 * machine word at a time with a SWAR trick that stops carries from
 * crossing byte lanes; a scalar loop handles the tail.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w)
{
    const unsigned long low7 = ~0UL / 255 * 0x7f; /* 0x7f in every lane */
    const unsigned long top1 = ~0UL / 255 * 0x80; /* 0x80 in every lane */
    int i = 0;

    for (; i <= w - (int) sizeof(long); i += sizeof(long)) {
        long a = *(long *) (src + i);
        long b = *(long *) (dst + i);

        /* add low 7 bits normally, fix up the top bit with xor */
        *(long *) (dst + i) = ((a & low7) + (b & low7)) ^ ((a ^ b) & top1);
    }
    for (; i < w; i++)
        dst[i] += src[i];
}
/**
 * Bytewise dst[i] = src1[i] - src2[i] (modulo 256) over w bytes,
 * vectorised one machine word at a time with a SWAR borrow trick.
 * NOTE(review): the loop-variable declaration, else-branch bracing and
 * #endif appear to have been elided in this copy — confirm upstream.
 */
static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w)
#if !HAVE_FAST_UNALIGNED
    /* unrolled byte path when src2 is not word aligned */
    if ((long) src2 & (sizeof(long) - 1)) {
        for (i = 0; i + 7 < w; i += 8) {
            dst[i + 0] = src1[i + 0] - src2[i + 0];
            dst[i + 1] = src1[i + 1] - src2[i + 1];
            dst[i + 2] = src1[i + 2] - src2[i + 2];
            dst[i + 3] = src1[i + 3] - src2[i + 3];
            dst[i + 4] = src1[i + 4] - src2[i + 4];
            dst[i + 5] = src1[i + 5] - src2[i + 5];
            dst[i + 6] = src1[i + 6] - src2[i + 6];
            dst[i + 7] = src1[i + 7] - src2[i + 7];
    /* word-wide SWAR path: subtract every byte lane without letting
     * borrows propagate across lanes */
    for (i = 0; i <= w - (int) sizeof(long); i += sizeof(long)) {
        long a = *(long *) (src1 + i);
        long b = *(long *) (src2 + i);
        *(long *) (dst + i) = ((a | pb_80) - (b & pb_7f)) ^
                              ((a ^ b ^ pb_80) & pb_80);
    /* scalar tail for the remaining bytes */
        dst[i + 0] = src1[i + 0] - src2[i + 0];
/**
 * HuffYUV median-prediction decode: each output byte is the median of
 * (left, above, left + above - above_left) plus the coded diff[].
 * NOTE(review): initialisation of l/lt from *left/*left_top, the dst[]
 * store, and the final write-back of the running state appear to have
 * been elided in this copy — confirm against upstream.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *diff, int w,
                                         int *left, int *left_top)
    for (i = 0; i < w; i++) {
        l = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF) + diff[i];
/**
 * HuffYUV median-prediction encode: emits the difference between each
 * byte and the median predictor of (left, above, left + above -
 * above_left).
 * NOTE(review): initialisation of l/lt, the dst[] store and the state
 * write-back appear to have been elided in this copy — confirm upstream.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1,
                                         const uint8_t *src2, int w,
                                         int *left, int *left_top)
    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt) & 0xFF);
/**
 * HuffYUV left-prediction decode over w bytes; the two loops suggest a
 * main body plus a tail.
 * NOTE(review): the remaining parameters, loop bodies and return value
 * appear to have been elided in this copy — confirm against upstream
 * before modifying.
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src,
    for (i = 0; i < w - 1; i++) {
    for (; i < w; i++) {
/**
 * HuffYUV left-prediction decode for packed 32-bit BGRA: accumulates a
 * running value per channel across the row.  B/G/R/A are byte-offset
 * macros selecting the channel within each 4-byte pixel (defined
 * elsewhere in the file).
 * NOTE(review): the dst[] stores and the write-back of r/g/b/a to the
 * output pointers appear to have been elided in this copy — confirm.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src,
                                             int w, int *red, int *green,
                                             int *blue, int *alpha)
    int i, r = *red, g = *green, b = *blue, a = *alpha;

    for (i = 0; i < w; i++) {
        b += src[4 * i + B];
        g += src[4 * i + G];
        r += src[4 * i + R];
        a += src[4 * i + A];
/* Butterfly primitives for the 8x8 Hadamard (SATD) transforms below.
 * Based on BUTTERFLYA, the convention is sum/difference pairs:
 * BUTTERFLY2 presumably writes (i1+i2, i1-i2) to (o1, o2) and
 * BUTTERFLY1 does the same in place — their bodies appear to have been
 * elided in this copy, confirm upstream.  BUTTERFLYA accumulates
 * |x+y| + |x-y|, i.e. the absolute values of a final butterfly stage. */
#define BUTTERFLY2(o1, o2, i1, i2) \
#define BUTTERFLY1(x, y) \
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
/**
 * 8x8 Hadamard SATD of the residual (src - dst): forward 2-D
 * Walsh-Hadamard transform of the difference block, then the sum of the
 * absolute values of all 64 coefficients.  The butterfly schedule
 * (distances 1, 2, 4 per dimension) matches the original unrolled
 * BUTTERFLY2/BUTTERFLY1 sequence, so the result is bit-identical.
 */
static int hadamard8_diff8x8_c(/* MpegEncContext */ void *s, uint8_t *dst,
                               uint8_t *src, int stride, int h)
{
    int wht[64];
    int i, j, step, sum = 0;

    /* load the 8x8 residual */
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            wht[8 * i + j] = src[stride * i + j] - dst[stride * i + j];

    /* in-place 8-point WHT of every row */
    for (i = 0; i < 8; i++)
        for (step = 1; step < 8; step <<= 1)
            for (j = 0; j < 8; j++)
                if (!(j & step)) {
                    int a = wht[8 * i + j];
                    int b = wht[8 * i + j + step];

                    wht[8 * i + j]        = a + b;
                    wht[8 * i + j + step] = a - b;
                }

    /* same transform down every column */
    for (i = 0; i < 8; i++)
        for (step = 1; step < 8; step <<= 1)
            for (j = 0; j < 8; j++)
                if (!(j & step)) {
                    int a = wht[8 * j + i];
                    int b = wht[8 * (j + step) + i];

                    wht[8 * j + i]          = a + b;
                    wht[8 * (j + step) + i] = a - b;
                }

    /* SATD: sum of absolute transform coefficients */
    for (i = 0; i < 64; i++)
        sum += wht[i] < 0 ? -wht[i] : wht[i];

    return sum;
}
/**
 * 8x8 intra Hadamard SATD of src itself (no reference block): forward
 * 2-D Walsh-Hadamard transform, sum of absolute coefficients, minus the
 * absolute DC term so the block mean does not contribute.  The butterfly
 * schedule matches the original unrolled sequence; subtracting |wht[0]|
 * after the full transform equals the original's
 * "sum -= FFABS(temp[0] + temp[32])" on the pre-final-stage values.
 */
static int hadamard8_intra8x8_c(/* MpegEncContext */ void *s, uint8_t *src,
                                uint8_t *dummy, int stride, int h)
{
    int wht[64];
    int i, j, step, sum = 0, dc;

    /* load the 8x8 source block */
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            wht[8 * i + j] = src[stride * i + j];

    /* in-place 8-point WHT of every row */
    for (i = 0; i < 8; i++)
        for (step = 1; step < 8; step <<= 1)
            for (j = 0; j < 8; j++)
                if (!(j & step)) {
                    int a = wht[8 * i + j];
                    int b = wht[8 * i + j + step];

                    wht[8 * i + j]        = a + b;
                    wht[8 * i + j + step] = a - b;
                }

    /* same transform down every column */
    for (i = 0; i < 8; i++)
        for (step = 1; step < 8; step <<= 1)
            for (j = 0; j < 8; j++)
                if (!(j & step)) {
                    int a = wht[8 * j + i];
                    int b = wht[8 * (j + step) + i];

                    wht[8 * j + i]          = a + b;
                    wht[8 * (j + step) + i] = a - b;
                }

    /* sum of absolute coefficients ... */
    for (i = 0; i < 64; i++)
        sum += wht[i] < 0 ? -wht[i] : wht[i];

    /* ... minus the DC term (block mean) */
    dc = wht[0] < 0 ? -wht[0] : wht[0];
    return sum - dc;
}
/**
 * DCT-domain SAD: forward-transform the 8x8 residual (src1 - src2) and
 * return the sum of absolute DCT coefficients, using the context's
 * diff_pixels/fdct/sum_abs_dctelem hooks.
 * NOTE(review): the fdct call between diff_pixels and sum_abs_dctelem
 * appears to have been elided in this copy — confirm upstream.
 */
static int dct_sad8x8_c(/* MpegEncContext */ void *c, uint8_t *src1,
                        uint8_t *src2, int stride, int h)
    MpegEncContext *const s = (MpegEncContext *) c;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    return s->dsp.sum_abs_dctelem(temp);
/* Body of an 8-point H.264-style integer DCT (the #define line itself
 * appears elided in this copy — confirm upstream).  SRC()/DST() are
 * remapped around each use in dct264_sad8x8_c() below to run the same
 * code over rows and then columns.  s* are even-part sums, d* odd-part
 * differences, a* the butterflied intermediates. */
    const int s07 = SRC(0) + SRC(7); \
    const int s16 = SRC(1) + SRC(6); \
    const int s25 = SRC(2) + SRC(5); \
    const int s34 = SRC(3) + SRC(4); \
    const int a0 = s07 + s34; \
    const int a1 = s16 + s25; \
    const int a2 = s07 - s34; \
    const int a3 = s16 - s25; \
    const int d07 = SRC(0) - SRC(7); \
    const int d16 = SRC(1) - SRC(6); \
    const int d25 = SRC(2) - SRC(5); \
    const int d34 = SRC(3) - SRC(4); \
    const int a4 = d16 + d25 + (d07 + (d07 >> 1)); \
    const int a5 = d07 - d34 - (d25 + (d25 >> 1)); \
    const int a6 = d07 + d34 - (d16 + (d16 >> 1)); \
    const int a7 = d16 - d25 + (d34 + (d34 >> 1)); \
    DST(1, a4 + (a7 >> 2)); \
    DST(2, a2 + (a3 >> 1)); \
    DST(3, a5 + (a6 >> 2)); \
    DST(5, a6 - (a5 >> 2)); \
    DST(6, (a2 >> 1) - a3); \
    DST(7, (a4 >> 2) - a7); \
/* SAD in the domain of the H.264-style 8x8 integer transform: row pass
 * in place, then column pass accumulating |coefficient| into sum. */
2394 static int dct264_sad8x8_c(/* MpegEncContext */ void *c, uint8_t *src1,
2395 uint8_t *src2, int stride, int h)
2397 MpegEncContext *const s = (MpegEncContext *) c;
2401 s->dsp.diff_pixels(dct[0], src1, src2, stride); /* dct = src1 - src2, 8x8 */
/* first pass: transform each row of the difference block in place */
2403 #define SRC(x) dct[i][x]
2404 #define DST(x, v) dct[i][x] = v
2405 for (i = 0; i < 8; i++)
/* second pass: transform columns, summing absolute output values */
2410 #define SRC(x) dct[x][i]
2411 #define DST(x, v) sum += FFABS(v)
2412 for (i = 0; i < 8; i++)
/* Peak-error metric: returns the largest absolute DCT coefficient of the
 * 8x8 difference block between src1 and src2. */
2420 static int dct_max8x8_c(/* MpegEncContext */ void *c, uint8_t *src1,
2421 uint8_t *src2, int stride, int h)
2423 MpegEncContext *const s = (MpegEncContext *) c;
2424 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2429 s->dsp.diff_pixels(temp, src1, src2, stride); /* difference block; fdct call elided in this extract */
/* scan all 64 coefficients for the maximum magnitude */
2432 for (i = 0; i < 64; i++)
2433 sum = FFMAX(sum, FFABS(temp[i]));
/* Measures the squared error introduced by the encoder's quantize ->
 * dequantize -> IDCT round trip at the current qscale on the 8x8
 * difference block between src1 and src2. */
2438 static int quant_psnr8x8_c(/* MpegEncContext */ void *c, uint8_t *src1,
2439 uint8_t *src2, int stride, int h)
2441 MpegEncContext *const s = c;
2442 LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]); /* [0..63] work area, [64..127] reference copy */
2443 int16_t *const bak = temp + 64;
2449 s->dsp.diff_pixels(temp, src1, src2, stride);
2451 memcpy(bak, temp, 64 * sizeof(int16_t)); /* keep the pre-quantization block */
2453 s->block_last_index[0 /* FIXME */] =
2454 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
2455 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2456 ff_simple_idct_8(temp); // FIXME
/* sum of squared differences against the saved reference block */
2458 for (i = 0; i < 64; i++)
2459 sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);
/* Rate-distortion metric for one 8x8 block: estimates the VLC bit cost of
 * the quantized coefficients (via the codec's AC length tables) plus the
 * SSE distortion after dequantization and IDCT, weighted by qscale^2. */
2464 static int rd8x8_c(/* MpegEncContext */ void *c, uint8_t *src1, uint8_t *src2,
2467 MpegEncContext *const s = (MpegEncContext *) c;
2468 const uint8_t *scantable = s->intra_scantable.permutated;
2469 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2470 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]); /* local 8x8 copies with stride 8 */
2471 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2472 int i, last, run, bits, level, distortion, start_i;
2473 const int esc_length = s->ac_esc_length;
2474 uint8_t *length, *last_length;
2478 copy_block8(lsrc1, src1, 8, stride, 8);
2479 copy_block8(lsrc2, src2, 8, stride, 8);
2481 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2483 s->block_last_index[0 /* FIXME */] =
2485 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
/* pick intra or inter VLC tables; intra additionally pays the DC cost */
2491 length = s->intra_ac_vlc_length;
2492 last_length = s->intra_ac_vlc_last_length;
2493 bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2496 length = s->inter_ac_vlc_length;
2497 last_length = s->inter_ac_vlc_last_length;
2500 if (last >= start_i) {
/* bit cost of every coefficient before the last nonzero one */
2502 for (i = start_i; i < last; i++) {
2503 int j = scantable[i];
2508 if ((level & (~127)) == 0) /* level (biased by +64) fits the VLC table */
2509 bits += length[UNI_AC_ENC_INDEX(run, level)];
/* the last nonzero coefficient is coded with the "last" VLC table */
2516 i = scantable[last];
2518 level = temp[i] + 64; /* bias so the table index is non-negative */
2520 av_assert2(level - 64); /* last coefficient must be nonzero */
2522 if ((level & (~127)) == 0) {
2523 bits += last_length[UNI_AC_ENC_INDEX(run, level)];
/* reconstruct the block and measure SSE against the original */
2530 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2532 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2535 s->dsp.idct_add(lsrc2, 8, temp);
2537 distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2539 return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7); /* rate term weighted by qscale^2 */
/* Rate-only metric: estimates the VLC bit cost of coding the quantized
 * 8x8 difference block (companion of rd8x8_c, which also adds distortion). */
2542 static int bit8x8_c(/* MpegEncContext */ void *c, uint8_t *src1, uint8_t *src2,
2545 MpegEncContext *const s = (MpegEncContext *) c;
2546 const uint8_t *scantable = s->intra_scantable.permutated;
2547 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2548 int i, last, run, bits, level, start_i;
2549 const int esc_length = s->ac_esc_length;
2550 uint8_t *length, *last_length;
2554 s->dsp.diff_pixels(temp, src1, src2, stride);
2556 s->block_last_index[0 /* FIXME */] =
2558 s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
/* pick intra or inter VLC tables; intra additionally pays the luma DC cost */
2564 length = s->intra_ac_vlc_length;
2565 last_length = s->intra_ac_vlc_last_length;
2566 bits += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
2569 length = s->inter_ac_vlc_length;
2570 last_length = s->inter_ac_vlc_last_length;
2573 if (last >= start_i) {
/* bit cost of every coefficient before the last nonzero one */
2575 for (i = start_i; i < last; i++) {
2576 int j = scantable[i];
2581 if ((level & (~127)) == 0) /* biased level fits the VLC table */
2582 bits += length[UNI_AC_ENC_INDEX(run, level)];
/* the last nonzero coefficient is coded with the "last" VLC table */
2589 i = scantable[last];
2591 level = temp[i] + 64;
2593 av_assert2(level - 64); /* last coefficient must be nonzero */
2595 if ((level & (~127)) == 0)
2596 bits += last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-activity SAD of a single plane: sums |s[x,y] - s[x,y+1]| over
 * the block (the second source argument is unused, hence "intra"). */
2604 #define VSAD_INTRA(size) \
2605 static int vsad_intra ## size ## _c(/* MpegEncContext */ void *c, \
2606 uint8_t *s, uint8_t *dummy, \
2607 int stride, int h) \
2609 int score = 0, x, y; \
2611 for (y = 1; y < h; y++) { \
2612 for (x = 0; x < size; x += 4) { /* inner loop unrolled by 4 */ \
2613 score += FFABS(s[x] - s[x + stride]) + \
2614 FFABS(s[x + 1] - s[x + stride + 1]) + \
2615 FFABS(s[x + 2] - s[x + 2 + stride]) + \
2616 FFABS(s[x + 3] - s[x + 3 + stride]); \
/* Vertical SAD of the difference signal: measures how much the per-pixel
 * difference between s1 and s2 changes from one line to the next. */
2626 static int vsad16_c(/* MpegEncContext */ void *c, uint8_t *s1, uint8_t *s2,
2629 int score = 0, x, y;
2631 for (y = 1; y < h; y++) {
2632 for (x = 0; x < 16; x++)
2633 score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
2641 #define SQ(a) ((a) * (a)) /* square helper for the vsse metrics */
/* Squared-error counterpart of VSAD_INTRA: sums (s[x,y] - s[x,y+1])^2
 * over a single plane (second source argument unused). */
2642 #define VSSE_INTRA(size) \
2643 static int vsse_intra ## size ## _c(/* MpegEncContext */ void *c, \
2644 uint8_t *s, uint8_t *dummy, \
2645 int stride, int h) \
2647 int score = 0, x, y; \
2649 for (y = 1; y < h; y++) { \
2650 for (x = 0; x < size; x += 4) { /* inner loop unrolled by 4 */ \
2651 score += SQ(s[x] - s[x + stride]) + \
2652 SQ(s[x + 1] - s[x + stride + 1]) + \
2653 SQ(s[x + 2] - s[x + stride + 2]) + \
2654 SQ(s[x + 3] - s[x + stride + 3]); \
/* Vertical SSE of the difference signal: squared version of vsad16_c. */
2664 static int vsse16_c(/* MpegEncContext */ void *c, uint8_t *s1, uint8_t *s2,
2667 int score = 0, x, y;
2669 for (y = 1; y < h; y++) {
2670 for (x = 0; x < 16; x++)
2671 score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);
/* Sum of squared differences between an int8 vector and an int16 vector
 * of the same length. */
2679 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2684 for (i = 0; i < size; i++)
2685 score += (pix1[i] - pix2[i]) * (pix1[i] - pix2[i]);
/* Builds a 16x16 comparison function from an 8x8 one by summing the
 * scores of the four 8x8 quadrants. */
2689 #define WRAPPER8_16_SQ(name8, name16) \
2690 static int name16(void /*MpegEncContext*/ *s, \
2691 uint8_t *dst, uint8_t *src, \
2692 int stride, int h) \
2696 score += name8(s, dst, src, stride, 8); \
2697 score += name8(s, dst + 8, src + 8, stride, 8); \
2699 dst += 8 * stride; /* advance to the bottom half */ \
2700 src += 8 * stride; \
2701 score += name8(s, dst, src, stride, 8); \
2702 score += name8(s, dst + 8, src + 8, stride, 8); \
/* Instantiate a 16x16 variant of each 8x8 comparison metric above. */
2707 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2708 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2709 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2711 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2713 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2714 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2715 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2716 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Clamp one float, handled as its raw IEEE-754 bit pattern, to the range
 * [mini, maxi]; used by vector_clipf_c_opposite_sign for a mixed-sign
 * (min < 0 < max) clip range. */
2718 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2719 uint32_t maxi, uint32_t maxisign)
2723 else if ((a ^ (1U << 31)) > maxisign) /* negative values: compare with sign bit flipped */
/* Clip a float vector to [*min, *max] for the mixed-sign case
 * (min < 0 < max), operating on the raw IEEE-754 bit patterns; len is
 * processed in groups of 8, so it is assumed to be a multiple of 8.
 * NOTE(review): the pointer casts type-pun float as uint32_t, which
 * violates strict aliasing -- presumably the project is built with
 * -fno-strict-aliasing; confirm before reusing elsewhere. */
2729 static void vector_clipf_c_opposite_sign(float *dst, const float *src,
2730 float *min, float *max, int len)
2733 uint32_t mini = *(uint32_t *) min;
2734 uint32_t maxi = *(uint32_t *) max;
2735 uint32_t maxisign = maxi ^ (1U << 31); /* max's bit pattern with the sign flipped */
2736 uint32_t *dsti = (uint32_t *) dst;
2737 const uint32_t *srci = (const uint32_t *) src;
2739 for (i = 0; i < len; i += 8) { /* manually unrolled by 8 */
2740 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2741 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2742 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2743 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2744 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2745 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2746 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2747 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip each float in src to [min, max]; len is processed in groups of 8,
 * so it is assumed to be a multiple of 8. The mixed-sign range
 * (min < 0 < max) takes the bit-pattern fast path. */
2751 static void vector_clipf_c(float *dst, const float *src,
2752 float min, float max, int len)
2756 if (min < 0 && max > 0) {
2757 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2759 for (i = 0; i < len; i += 8) { /* manually unrolled by 8 */
2760 dst[i] = av_clipf(src[i], min, max);
2761 dst[i + 1] = av_clipf(src[i + 1], min, max);
2762 dst[i + 2] = av_clipf(src[i + 2], min, max);
2763 dst[i + 3] = av_clipf(src[i + 3], min, max);
2764 dst[i + 4] = av_clipf(src[i + 4], min, max);
2765 dst[i + 5] = av_clipf(src[i + 5], min, max);
2766 dst[i + 6] = av_clipf(src[i + 6], min, max);
2767 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Dot product of two int16 vectors, accumulated into a 32-bit result. */
2772 static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2,
2778 res += *v1++ **v2++; /* res += (*v1) * (*v2), advancing both pointers */
/* Dot product of v1 and v2, while simultaneously updating v1 in place
 * with v1[i] += mul * v3[i]. */
2783 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2,
2791 *v1++ += mul * *v3++; /* multiply-accumulate into v1 */
/* Clip each int32 in src to [min, max]; the body is unrolled to handle
 * 8 elements per iteration, so len is assumed to be a multiple of 8. */
2796 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2797 int32_t max, unsigned int len)
2800 *dst++ = av_clip(*src++, min, max);
2801 *dst++ = av_clip(*src++, min, max);
2802 *dst++ = av_clip(*src++, min, max);
2803 *dst++ = av_clip(*src++, min, max);
2804 *dst++ = av_clip(*src++, min, max);
2805 *dst++ = av_clip(*src++, min, max);
2806 *dst++ = av_clip(*src++, min, max);
2807 *dst++ = av_clip(*src++, min, max);
/* 8x8 jpeg reference IDCT, then store the clamped result into dest. */
2812 static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2814 ff_j_rev_dct(block);
2815 put_pixels_clamped_c(block, dest, line_size);
/* 8x8 jpeg reference IDCT, then add the clamped result onto dest. */
2818 static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2820 ff_j_rev_dct(block);
2821 add_pixels_clamped_c(block, dest, line_size);
/* 4x4 reduced-resolution IDCT + clamped store (used for lowres==1). */
2824 static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
2826 ff_j_rev_dct4 (block);
2827 put_pixels_clamped4_c(block, dest, line_size);
/* 4x4 reduced-resolution IDCT + clamped add (used for lowres==1). */
2829 static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
2831 ff_j_rev_dct4 (block);
2832 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 reduced-resolution IDCT + clamped store (used for lowres==2). */
2835 static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
2837 ff_j_rev_dct2 (block);
2838 put_pixels_clamped2_c(block, dest, line_size);
/* 2x2 reduced-resolution IDCT + clamped add (used for lowres==2). */
2840 static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
2842 ff_j_rev_dct2 (block);
2843 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 "IDCT": reconstruct one pixel from the DC coefficient with
 * rounding ((DC + 4) >> 3), clamped to 8 bits (used for lowres==3). */
2846 static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
2848 dest[0] = av_clip_uint8((block[0] + 4)>>3);
/* 1x1 "IDCT" add variant: adds the rounded DC value onto the existing
 * pixel, clamped to 8 bits (used for lowres==3). */
2850 static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
2852 dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2855 /* init static data */
2856 av_cold void ff_dsputil_static_init(void)
/* ff_square_tab[i] = (i - 256)^2, a lookup table of squared differences
 * in the range [-256, 255] used by SSE-style metrics */
2860 for (i = 0; i < 512; i++)
2861 ff_square_tab[i] = (i - 256) * (i - 256);
/* Verify that the compiler honors the 16-byte stack alignment requested
 * by LOCAL_ALIGNED_16; logs an error on miscompiled builds where SIMD
 * code relying on that alignment would be slow or crash. */
2864 int ff_check_alignment(void)
2866 static int did_fail = 0; /* presumably used to log only once -- the check logic is elided in this extract */
2867 LOCAL_ALIGNED_16(int, aligned, [4]);
2869 if ((intptr_t)aligned & 15) { /* low 4 bits set => not 16-byte aligned */
2871 #if HAVE_MMX || HAVE_ALTIVEC
2872 av_log(NULL, AV_LOG_ERROR,
2873 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2874 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2875 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2876 "Do not report crashes to FFmpeg developers.\n");
/* Fill a DSPContext with the C reference implementations, selected by the
 * codec context's bit depth, lowres and dct/idct algorithm settings, then
 * let per-architecture init functions override entries with optimized
 * versions. */
2885 av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2887 ff_check_alignment();
/* encoder forward DCT selection: 10-bit first, otherwise by dct_algo */
2890 if (avctx->bits_per_raw_sample == 10) {
2891 c->fdct = ff_jpeg_fdct_islow_10;
2892 c->fdct248 = ff_fdct248_islow_10;
2894 if (avctx->dct_algo == FF_DCT_FASTINT) {
2895 c->fdct = ff_fdct_ifast;
2896 c->fdct248 = ff_fdct_ifast248;
2897 } else if (avctx->dct_algo == FF_DCT_FAAN) {
2898 c->fdct = ff_faandct;
2899 c->fdct248 = ff_faandct248;
2901 c->fdct = ff_jpeg_fdct_islow_8; // slow/accurate/default
2902 c->fdct248 = ff_fdct248_islow_8;
2905 #endif /* CONFIG_ENCODERS */
/* IDCT selection: reduced-resolution decoding (lowres) takes priority,
 * then bit depth, then the requested idct_algo */
2907 if (avctx->lowres==1) {
2908 c->idct_put = ff_jref_idct4_put;
2909 c->idct_add = ff_jref_idct4_add;
2910 c->idct = ff_j_rev_dct4;
2911 c->idct_permutation_type = FF_NO_IDCT_PERM;
2912 } else if (avctx->lowres==2) {
2913 c->idct_put = ff_jref_idct2_put;
2914 c->idct_add = ff_jref_idct2_add;
2915 c->idct = ff_j_rev_dct2;
2916 c->idct_permutation_type = FF_NO_IDCT_PERM;
2917 } else if (avctx->lowres==3) {
2918 c->idct_put = ff_jref_idct1_put;
2919 c->idct_add = ff_jref_idct1_add;
2920 c->idct = ff_j_rev_dct1;
2921 c->idct_permutation_type = FF_NO_IDCT_PERM;
2923 if (avctx->bits_per_raw_sample == 10) {
2924 c->idct_put = ff_simple_idct_put_10;
2925 c->idct_add = ff_simple_idct_add_10;
2926 c->idct = ff_simple_idct_10;
2927 c->idct_permutation_type = FF_NO_IDCT_PERM;
2928 } else if (avctx->bits_per_raw_sample == 12) {
2929 c->idct_put = ff_simple_idct_put_12;
2930 c->idct_add = ff_simple_idct_add_12;
2931 c->idct = ff_simple_idct_12;
2932 c->idct_permutation_type = FF_NO_IDCT_PERM;
2934 if (avctx->idct_algo == FF_IDCT_INT) {
2935 c->idct_put = jref_idct_put;
2936 c->idct_add = jref_idct_add;
2937 c->idct = ff_j_rev_dct;
2938 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
2939 } else if (avctx->idct_algo == FF_IDCT_FAAN) {
2940 c->idct_put = ff_faanidct_put;
2941 c->idct_add = ff_faanidct_add;
2942 c->idct = ff_faanidct;
2943 c->idct_permutation_type = FF_NO_IDCT_PERM;
2944 } else { // accurate/default
2945 c->idct_put = ff_simple_idct_put_8;
2946 c->idct_add = ff_simple_idct_add_8;
2947 c->idct = ff_simple_idct_8;
2948 c->idct_permutation_type = FF_NO_IDCT_PERM;
/* scalar pixel helpers */
2953 c->diff_pixels = diff_pixels_c;
2955 c->put_pixels_clamped = put_pixels_clamped_c;
2956 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2957 c->add_pixels_clamped = add_pixels_clamped_c;
2959 c->sum_abs_dctelem = sum_abs_dctelem_c;
2964 c->pix_sum = pix_sum_c;
2965 c->pix_norm1 = pix_norm1_c;
2967 c->fill_block_tab[0] = fill_block16_c;
2968 c->fill_block_tab[1] = fill_block8_c;
2970 /* TODO [0] 16 [1] 8 */
/* SAD tables: [0] = 16x16, [1] = 8x8; second index selects half-pel variant */
2971 c->pix_abs[0][0] = pix_abs16_c;
2972 c->pix_abs[0][1] = pix_abs16_x2_c;
2973 c->pix_abs[0][2] = pix_abs16_y2_c;
2974 c->pix_abs[0][3] = pix_abs16_xy2_c;
2975 c->pix_abs[1][0] = pix_abs8_c;
2976 c->pix_abs[1][1] = pix_abs8_x2_c;
2977 c->pix_abs[1][2] = pix_abs8_y2_c;
2978 c->pix_abs[1][3] = pix_abs8_xy2_c;
/* thirdpel MC tables; note: indices 3, 7 and 11+ are not set here */
2980 c->put_tpel_pixels_tab[0] = put_tpel_pixels_mc00_c;
2981 c->put_tpel_pixels_tab[1] = put_tpel_pixels_mc10_c;
2982 c->put_tpel_pixels_tab[2] = put_tpel_pixels_mc20_c;
2983 c->put_tpel_pixels_tab[4] = put_tpel_pixels_mc01_c;
2984 c->put_tpel_pixels_tab[5] = put_tpel_pixels_mc11_c;
2985 c->put_tpel_pixels_tab[6] = put_tpel_pixels_mc21_c;
2986 c->put_tpel_pixels_tab[8] = put_tpel_pixels_mc02_c;
2987 c->put_tpel_pixels_tab[9] = put_tpel_pixels_mc12_c;
2988 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2990 c->avg_tpel_pixels_tab[0] = avg_tpel_pixels_mc00_c;
2991 c->avg_tpel_pixels_tab[1] = avg_tpel_pixels_mc10_c;
2992 c->avg_tpel_pixels_tab[2] = avg_tpel_pixels_mc20_c;
2993 c->avg_tpel_pixels_tab[4] = avg_tpel_pixels_mc01_c;
2994 c->avg_tpel_pixels_tab[5] = avg_tpel_pixels_mc11_c;
2995 c->avg_tpel_pixels_tab[6] = avg_tpel_pixels_mc21_c;
2996 c->avg_tpel_pixels_tab[8] = avg_tpel_pixels_mc02_c;
2997 c->avg_tpel_pixels_tab[9] = avg_tpel_pixels_mc12_c;
2998 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* helper filling all 16 quarter-pel MC positions of one table entry */
3000 #define dspfunc(PFX, IDX, NUM) \
3001 c->PFX ## _pixels_tab[IDX][0] = PFX ## NUM ## _mc00_c; \
3002 c->PFX ## _pixels_tab[IDX][1] = PFX ## NUM ## _mc10_c; \
3003 c->PFX ## _pixels_tab[IDX][2] = PFX ## NUM ## _mc20_c; \
3004 c->PFX ## _pixels_tab[IDX][3] = PFX ## NUM ## _mc30_c; \
3005 c->PFX ## _pixels_tab[IDX][4] = PFX ## NUM ## _mc01_c; \
3006 c->PFX ## _pixels_tab[IDX][5] = PFX ## NUM ## _mc11_c; \
3007 c->PFX ## _pixels_tab[IDX][6] = PFX ## NUM ## _mc21_c; \
3008 c->PFX ## _pixels_tab[IDX][7] = PFX ## NUM ## _mc31_c; \
3009 c->PFX ## _pixels_tab[IDX][8] = PFX ## NUM ## _mc02_c; \
3010 c->PFX ## _pixels_tab[IDX][9] = PFX ## NUM ## _mc12_c; \
3011 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3012 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3013 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3014 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3015 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3016 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3018 dspfunc(put_qpel, 0, 16);
3019 dspfunc(put_qpel, 1, 8);
3021 dspfunc(put_no_rnd_qpel, 0, 16);
3022 dspfunc(put_no_rnd_qpel, 1, 8);
3024 dspfunc(avg_qpel, 0, 16);
3025 dspfunc(avg_qpel, 1, 8);
3029 c->put_mspel_pixels_tab[0] = ff_put_pixels8x8_c;
3030 c->put_mspel_pixels_tab[1] = put_mspel8_mc10_c;
3031 c->put_mspel_pixels_tab[2] = put_mspel8_mc20_c;
3032 c->put_mspel_pixels_tab[3] = put_mspel8_mc30_c;
3033 c->put_mspel_pixels_tab[4] = put_mspel8_mc02_c;
3034 c->put_mspel_pixels_tab[5] = put_mspel8_mc12_c;
3035 c->put_mspel_pixels_tab[6] = put_mspel8_mc22_c;
3036 c->put_mspel_pixels_tab[7] = put_mspel8_mc32_c;
/* install the 16x16 ([0]) and 8x8 ([1]) variants of a comparison metric */
3038 #define SET_CMP_FUNC(name) \
3039 c->name[0] = name ## 16_c; \
3040 c->name[1] = name ## 8x8_c;
3042 SET_CMP_FUNC(hadamard8_diff)
3043 c->hadamard8_diff[4] = hadamard8_intra16_c;
3044 c->hadamard8_diff[5] = hadamard8_intra8x8_c;
3045 SET_CMP_FUNC(dct_sad)
3046 SET_CMP_FUNC(dct_max)
3048 SET_CMP_FUNC(dct264_sad)
3050 c->sad[0] = pix_abs16_c;
3051 c->sad[1] = pix_abs8_c;
3052 c->sse[0] = sse16_c;
3055 SET_CMP_FUNC(quant_psnr)
3058 c->vsad[0] = vsad16_c;
3059 c->vsad[4] = vsad_intra16_c;
3060 c->vsad[5] = vsad_intra8_c;
3061 c->vsse[0] = vsse16_c;
3062 c->vsse[4] = vsse_intra16_c;
3063 c->vsse[5] = vsse_intra8_c;
3064 c->nsse[0] = nsse16_c;
3065 c->nsse[1] = nsse8_c;
3066 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
3067 ff_dsputil_init_dwt(c);
3070 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3072 c->add_bytes = add_bytes_c;
3073 c->add_hfyu_median_prediction = add_hfyu_median_prediction_c;
3074 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
3075 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3077 c->diff_bytes = diff_bytes_c;
3078 c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_c;
3080 c->bswap_buf = bswap_buf;
3081 c->bswap16_buf = bswap16_buf;
3083 c->try_8x8basis = try_8x8basis_c;
3084 c->add_8x8basis = add_8x8basis_c;
3086 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3088 c->scalarproduct_int16 = scalarproduct_int16_c;
3089 c->vector_clip_int32 = vector_clip_int32_c;
3090 c->vector_clipf = vector_clipf_c;
3092 c->shrink[0] = av_image_copy_plane;
3093 c->shrink[1] = ff_shrink22;
3094 c->shrink[2] = ff_shrink44;
3095 c->shrink[3] = ff_shrink88;
3097 c->add_pixels8 = add_pixels8_c;
/* bit-depth-templated function selection (see dsputil_template.c) */
3101 #define FUNC(f, depth) f ## _ ## depth
3102 #define FUNCC(f, depth) f ## _ ## depth ## _c
3104 c->draw_edges = FUNCC(draw_edges, 8);
3106 c->clear_block = FUNCC(clear_block, 8);
3107 c->clear_blocks = FUNCC(clear_blocks, 8);
3109 #define BIT_DEPTH_FUNCS(depth) \
3110 c->get_pixels = FUNCC(get_pixels, depth);
3112 switch (avctx->bits_per_raw_sample) {
3117 BIT_DEPTH_FUNCS(16);
3120 if (avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
/* per-architecture optimized overrides (guarded by ARCH_* #ifs, elided
 * in this extract) */
3128 ff_dsputil_init_alpha(c, avctx);
3130 ff_dsputil_init_arm(c, avctx);
3132 ff_dsputil_init_bfin(c, avctx);
3134 ff_dsputil_init_ppc(c, avctx);
3136 ff_dsputil_init_x86(c, avctx);
/* build the coefficient permutation matching the selected IDCT */
3138 ff_init_scantable_permutation(c->idct_permutation,
3139 c->idct_permutation_type);
/* Thin wrapper around ff_dsputil_init() -- presumably kept as a public
 * entry point for compatibility; verify against the public headers. */
3142 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3144 ff_dsputil_init(c, avctx);
3147 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
3149 ff_dsputil_init(c, avctx);