3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
31 #include "libavutil/internal.h"
33 #include "copy_block.h"
36 #include "simple_idct.h"
39 #include "imgconvert.h"
41 #include "mpegvideo.h"
/* Clipping LUT padded by MAX_NEG_CROP on both sides so moderately
 * out-of-range indices stay in bounds; presumably filled during DSP
 * init — the init code is not visible here (NOTE(review): confirm). */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square LUT, used via (ff_squareTbl + 256) so it can be indexed with
 * signed differences in [-256,255]; filled elsewhere (not visible here). */
uint32_t ff_squareTbl[512] = {0, };
49 #include "dsputil_template.c"
53 #include "dsputil_template.c"
57 #include "dsputil_template.c"
61 #include "dsputil_template.c"
65 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
/* Same replication trick: byte 0x80 copied into every byte of an unsigned long. */
#define pb_80 (~0UL/255 * 0x80)
/* Standard JPEG/MPEG zigzag scan order: maps scan position -> raster
 * coefficient index within an 8x8 block. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): defined uninitialized here; presumably filled at init time
 * from ff_zigzag_direct — the filling code is not visible in this chunk. */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (MPEG-2 style), scan position -> raster index. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (MPEG-2 interlace style), scan position -> raster index. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Permutation of the low 3 bits of a coefficient index, applied for the
 * SSE2 IDCT (see the FF_SSE2_IDCT_PERM case below). */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
134 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
138 st->scantable= src_scantable;
142 j = src_scantable[i];
143 st->permutated[i] = permutation[j];
149 j = st->permutated[i];
151 st->raster_end[i]= end;
155 void ff_init_scantable_permutation(uint8_t *idct_permutation,
156 int idct_permutation_type)
160 switch(idct_permutation_type){
161 case FF_NO_IDCT_PERM:
163 idct_permutation[i]= i;
165 case FF_LIBMPEG2_IDCT_PERM:
167 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
169 case FF_SIMPLE_IDCT_PERM:
171 idct_permutation[i]= simple_mmx_permutation[i];
173 case FF_TRANSPOSE_IDCT_PERM:
175 idct_permutation[i]= ((i&7)<<3) | (i>>3);
177 case FF_PARTTRANS_IDCT_PERM:
179 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
181 case FF_SSE2_IDCT_PERM:
183 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
186 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/* Sum of all 256 pixels of a 16x16 block; line_size is the row stride. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;  /* advance to the next row */
    }
    return s;
}
212 static int pix_norm1_c(uint8_t * pix, int line_size)
215 uint32_t *sq = ff_squareTbl + 256;
218 for (i = 0; i < 16; i++) {
219 for (j = 0; j < 16; j += 8) {
231 register uint64_t x=*(uint64_t*)pix;
233 s += sq[(x>>8)&0xff];
234 s += sq[(x>>16)&0xff];
235 s += sq[(x>>24)&0xff];
236 s += sq[(x>>32)&0xff];
237 s += sq[(x>>40)&0xff];
238 s += sq[(x>>48)&0xff];
239 s += sq[(x>>56)&0xff];
241 register uint32_t x=*(uint32_t*)pix;
243 s += sq[(x>>8)&0xff];
244 s += sq[(x>>16)&0xff];
245 s += sq[(x>>24)&0xff];
246 x=*(uint32_t*)(pix+4);
248 s += sq[(x>>8)&0xff];
249 s += sq[(x>>16)&0xff];
250 s += sq[(x>>24)&0xff];
255 pix += line_size - 16;
/* Byte-swap w 32-bit words from src to dst; main loop unrolled by 8,
 * remainder handled one word at a time. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= av_bswap32(src[i+0]);
    }
}
/* Byte-swap len 16-bit words from src to dst. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
284 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
287 uint32_t *sq = ff_squareTbl + 256;
290 for (i = 0; i < h; i++) {
291 s += sq[pix1[0] - pix2[0]];
292 s += sq[pix1[1] - pix2[1]];
293 s += sq[pix1[2] - pix2[2]];
294 s += sq[pix1[3] - pix2[3]];
301 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
304 uint32_t *sq = ff_squareTbl + 256;
307 for (i = 0; i < h; i++) {
308 s += sq[pix1[0] - pix2[0]];
309 s += sq[pix1[1] - pix2[1]];
310 s += sq[pix1[2] - pix2[2]];
311 s += sq[pix1[3] - pix2[3]];
312 s += sq[pix1[4] - pix2[4]];
313 s += sq[pix1[5] - pix2[5]];
314 s += sq[pix1[6] - pix2[6]];
315 s += sq[pix1[7] - pix2[7]];
322 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
325 uint32_t *sq = ff_squareTbl + 256;
328 for (i = 0; i < h; i++) {
329 s += sq[pix1[ 0] - pix2[ 0]];
330 s += sq[pix1[ 1] - pix2[ 1]];
331 s += sq[pix1[ 2] - pix2[ 2]];
332 s += sq[pix1[ 3] - pix2[ 3]];
333 s += sq[pix1[ 4] - pix2[ 4]];
334 s += sq[pix1[ 5] - pix2[ 5]];
335 s += sq[pix1[ 6] - pix2[ 6]];
336 s += sq[pix1[ 7] - pix2[ 7]];
337 s += sq[pix1[ 8] - pix2[ 8]];
338 s += sq[pix1[ 9] - pix2[ 9]];
339 s += sq[pix1[10] - pix2[10]];
340 s += sq[pix1[11] - pix2[11]];
341 s += sq[pix1[12] - pix2[12]];
342 s += sq[pix1[13] - pix2[13]];
343 s += sq[pix1[14] - pix2[14]];
344 s += sq[pix1[15] - pix2[15]];
352 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
353 const uint8_t *s2, int stride){
356 /* read the pixels */
358 block[0] = s1[0] - s2[0];
359 block[1] = s1[1] - s2[1];
360 block[2] = s1[2] - s2[2];
361 block[3] = s1[3] - s2[3];
362 block[4] = s1[4] - s2[4];
363 block[5] = s1[5] - s2[5];
364 block[6] = s1[6] - s2[6];
365 block[7] = s1[7] - s2[7];
372 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
377 /* read the pixels */
379 pixels[0] = av_clip_uint8(block[0]);
380 pixels[1] = av_clip_uint8(block[1]);
381 pixels[2] = av_clip_uint8(block[2]);
382 pixels[3] = av_clip_uint8(block[3]);
383 pixels[4] = av_clip_uint8(block[4]);
384 pixels[5] = av_clip_uint8(block[5]);
385 pixels[6] = av_clip_uint8(block[6]);
386 pixels[7] = av_clip_uint8(block[7]);
393 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
398 /* read the pixels */
400 pixels[0] = av_clip_uint8(block[0]);
401 pixels[1] = av_clip_uint8(block[1]);
402 pixels[2] = av_clip_uint8(block[2]);
403 pixels[3] = av_clip_uint8(block[3]);
410 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
415 /* read the pixels */
417 pixels[0] = av_clip_uint8(block[0]);
418 pixels[1] = av_clip_uint8(block[1]);
425 static void put_signed_pixels_clamped_c(const int16_t *block,
426 uint8_t *av_restrict pixels,
431 for (i = 0; i < 8; i++) {
432 for (j = 0; j < 8; j++) {
435 else if (*block > 127)
438 *pixels = (uint8_t)(*block + 128);
442 pixels += (line_size - 8);
446 static void add_pixels8_c(uint8_t *av_restrict pixels,
453 pixels[0] += block[0];
454 pixels[1] += block[1];
455 pixels[2] += block[2];
456 pixels[3] += block[3];
457 pixels[4] += block[4];
458 pixels[5] += block[5];
459 pixels[6] += block[6];
460 pixels[7] += block[7];
466 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
471 /* read the pixels */
473 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
474 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
475 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
476 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
477 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
478 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
479 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
480 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
486 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
491 /* read the pixels */
493 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
494 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
495 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
496 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
502 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
507 /* read the pixels */
509 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
510 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
/* Sum of absolute values of all 64 coefficients of a DCT block. */
static int sum_abs_dctelem_c(int16_t *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}
/* Fill a 16-wide block of height h with a constant byte value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/* Fill an 8-wide block of height h with a constant byte value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* Rounding averages of 2 and 4 values; arguments fully parenthesized to
 * avoid mis-parsing when called with compound expressions. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* One-warp-point GMC: bilinear interpolation of an 8-wide block with
 * 1/16-pel fractional offsets (x16, y16 in [0,16)); the four corner
 * weights sum to 256, and rounder is added before the >>8. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/**
 * Global motion compensation for an 8-wide column of height h.
 * (ox,oy) is the 16.16 fixed-point start position; (dxx,dxy,dyx,dyy)
 * are the affine deltas. Samples outside the picture are clamped to
 * the nearest edge pixel; r is the rounding constant for the bilinear
 * interpolation which uses s = 1<<shift fractional steps.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear interpolation of 4 samples */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( ( src[index         ]*(s-frac_x)
                                         + src[index       +1]*   frac_x )*(s-frac_y)
                                       + ( src[index+stride  ]*(s-frac_x)
                                         + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate horizontally only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( ( src[index         ]*(s-frac_x)
                                         + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate vertically only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( ( src[index         ]*(s-frac_y)
                                         + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* both outside: nearest edge pixel */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* Third-pel MC, no fractional offset: plain copy, dispatched on width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_8_c (dst, src, stride, height); break;
    case 4: put_pixels4_8_c (dst, src, stride, height); break;
    case 8: put_pixels8_8_c (dst, src, stride, height); break;
    case 16:put_pixels16_8_c(dst, src, stride, height); break;
    }
}
/* Third-pel MC, 1/3-pel horizontal offset: 683/2048 approximates 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, 2/3-pel horizontal offset. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, 1/3-pel vertical offset. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, (1/3,1/3) offset: 4-tap bilinear, 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, (1/3,2/3) offset. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, 2/3-pel vertical offset. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, (2/3,1/3) offset. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, (2/3,2/3) offset. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC with averaging, no fractional offset: dispatched on width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_8_c (dst, src, stride, height); break;
    case 4: avg_pixels4_8_c (dst, src, stride, height); break;
    case 8: avg_pixels8_8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_8_c(dst, src, stride, height); break;
    }
}
/* Averaging variant of put_tpel_pixels_mc10_c: rounding average with dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc20_c. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc01_c. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc11_c. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc12_c. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc02_c. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc21_c. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc22_c. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
822 #define QPEL_MC(r, OPNAME, RND, OP) \
823 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
824 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
828 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
829 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
830 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
831 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
832 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
833 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
834 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
835 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
841 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
843 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
847 const int src0= src[0*srcStride];\
848 const int src1= src[1*srcStride];\
849 const int src2= src[2*srcStride];\
850 const int src3= src[3*srcStride];\
851 const int src4= src[4*srcStride];\
852 const int src5= src[5*srcStride];\
853 const int src6= src[6*srcStride];\
854 const int src7= src[7*srcStride];\
855 const int src8= src[8*srcStride];\
856 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
857 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
858 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
859 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
860 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
861 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
862 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
863 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
869 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
870 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
875 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
876 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
877 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
878 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
879 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
880 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
881 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
882 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
883 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
884 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
885 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
886 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
887 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
888 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
889 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
890 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
896 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
897 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
902 const int src0= src[0*srcStride];\
903 const int src1= src[1*srcStride];\
904 const int src2= src[2*srcStride];\
905 const int src3= src[3*srcStride];\
906 const int src4= src[4*srcStride];\
907 const int src5= src[5*srcStride];\
908 const int src6= src[6*srcStride];\
909 const int src7= src[7*srcStride];\
910 const int src8= src[8*srcStride];\
911 const int src9= src[9*srcStride];\
912 const int src10= src[10*srcStride];\
913 const int src11= src[11*srcStride];\
914 const int src12= src[12*srcStride];\
915 const int src13= src[13*srcStride];\
916 const int src14= src[14*srcStride];\
917 const int src15= src[15*srcStride];\
918 const int src16= src[16*srcStride];\
919 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
920 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
921 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
922 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
923 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
924 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
925 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
926 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
927 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
928 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
929 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
930 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
931 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
932 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
933 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
934 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
940 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
942 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
943 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
946 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
947 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
950 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
952 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
953 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
956 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
959 copy_block9(full, src, 16, stride, 9);\
960 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
961 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
964 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
966 copy_block9(full, src, 16, stride, 9);\
967 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
970 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
973 copy_block9(full, src, 16, stride, 9);\
974 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
975 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
977 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
982 copy_block9(full, src, 16, stride, 9);\
983 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
984 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
985 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
986 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
988 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
992 copy_block9(full, src, 16, stride, 9);\
993 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
994 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
995 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
996 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
998 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1002 uint8_t halfHV[64];\
1003 copy_block9(full, src, 16, stride, 9);\
1004 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1005 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1006 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1007 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1009 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1010 uint8_t full[16*9];\
1012 uint8_t halfHV[64];\
1013 copy_block9(full, src, 16, stride, 9);\
1014 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1015 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1016 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1017 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1019 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1020 uint8_t full[16*9];\
1023 uint8_t halfHV[64];\
1024 copy_block9(full, src, 16, stride, 9);\
1025 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1026 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1028 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1030 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1031 uint8_t full[16*9];\
1033 uint8_t halfHV[64];\
1034 copy_block9(full, src, 16, stride, 9);\
1035 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1036 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1037 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1038 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1040 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1041 uint8_t full[16*9];\
1044 uint8_t halfHV[64];\
1045 copy_block9(full, src, 16, stride, 9);\
1046 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1047 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1048 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1049 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1051 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1052 uint8_t full[16*9];\
1054 uint8_t halfHV[64];\
1055 copy_block9(full, src, 16, stride, 9);\
1056 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1057 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1058 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1059 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1061 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1063 uint8_t halfHV[64];\
1064 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1065 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1066 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1068 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1070 uint8_t halfHV[64];\
1071 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1072 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1073 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1075 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1076 uint8_t full[16*9];\
1079 uint8_t halfHV[64];\
1080 copy_block9(full, src, 16, stride, 9);\
1081 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1082 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1083 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1084 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1086 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1087 uint8_t full[16*9];\
1089 copy_block9(full, src, 16, stride, 9);\
1090 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1091 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1092 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1094 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1095 uint8_t full[16*9];\
1098 uint8_t halfHV[64];\
1099 copy_block9(full, src, 16, stride, 9);\
1100 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1101 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1102 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1103 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1105 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1106 uint8_t full[16*9];\
1108 copy_block9(full, src, 16, stride, 9);\
1109 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1110 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1111 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1113 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1115 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1116 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1119 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1121 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1122 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1125 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1126 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1129 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1131 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1132 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1135 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1136 uint8_t full[24*17];\
1138 copy_block17(full, src, 24, stride, 17);\
1139 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1140 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1143 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1144 uint8_t full[24*17];\
1145 copy_block17(full, src, 24, stride, 17);\
1146 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1149 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1150 uint8_t full[24*17];\
1152 copy_block17(full, src, 24, stride, 17);\
1153 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1154 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1156 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1157 uint8_t full[24*17];\
1158 uint8_t halfH[272];\
1159 uint8_t halfV[256];\
1160 uint8_t halfHV[256];\
1161 copy_block17(full, src, 24, stride, 17);\
1162 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1163 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1164 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1165 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1167 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1168 uint8_t full[24*17];\
1169 uint8_t halfH[272];\
1170 uint8_t halfHV[256];\
1171 copy_block17(full, src, 24, stride, 17);\
1172 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1173 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1174 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1175 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1177 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1178 uint8_t full[24*17];\
1179 uint8_t halfH[272];\
1180 uint8_t halfV[256];\
1181 uint8_t halfHV[256];\
1182 copy_block17(full, src, 24, stride, 17);\
1183 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1184 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1185 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1186 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1188 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1189 uint8_t full[24*17];\
1190 uint8_t halfH[272];\
1191 uint8_t halfHV[256];\
1192 copy_block17(full, src, 24, stride, 17);\
1193 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1194 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1195 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1196 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1198 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1199 uint8_t full[24*17];\
1200 uint8_t halfH[272];\
1201 uint8_t halfV[256];\
1202 uint8_t halfHV[256];\
1203 copy_block17(full, src, 24, stride, 17);\
1204 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1205 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1207 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1209 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1210 uint8_t full[24*17];\
1211 uint8_t halfH[272];\
1212 uint8_t halfHV[256];\
1213 copy_block17(full, src, 24, stride, 17);\
1214 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1215 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1216 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1217 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1219 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1220 uint8_t full[24*17];\
1221 uint8_t halfH[272];\
1222 uint8_t halfV[256];\
1223 uint8_t halfHV[256];\
1224 copy_block17(full, src, 24, stride, 17);\
1225 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1226 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1227 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1228 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1230 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1231 uint8_t full[24*17];\
1232 uint8_t halfH[272];\
1233 uint8_t halfHV[256];\
1234 copy_block17(full, src, 24, stride, 17);\
1235 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1236 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1237 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1238 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1240 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1241 uint8_t halfH[272];\
1242 uint8_t halfHV[256];\
1243 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1244 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1245 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1247 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1248 uint8_t halfH[272];\
1249 uint8_t halfHV[256];\
1250 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1251 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1252 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1254 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1255 uint8_t full[24*17];\
1256 uint8_t halfH[272];\
1257 uint8_t halfV[256];\
1258 uint8_t halfHV[256];\
1259 copy_block17(full, src, 24, stride, 17);\
1260 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1261 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1262 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1263 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1265 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1266 uint8_t full[24*17];\
1267 uint8_t halfH[272];\
1268 copy_block17(full, src, 24, stride, 17);\
1269 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1270 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1271 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1273 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1274 uint8_t full[24*17];\
1275 uint8_t halfH[272];\
1276 uint8_t halfV[256];\
1277 uint8_t halfHV[256];\
1278 copy_block17(full, src, 24, stride, 17);\
1279 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1280 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1281 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1282 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1284 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1285 uint8_t full[24*17];\
1286 uint8_t halfH[272];\
1287 copy_block17(full, src, 24, stride, 17);\
1288 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1289 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1290 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1292 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1293 uint8_t halfH[272];\
1294 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1295 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel store/average primitives plugged into QPEL_MC.  The filter sum b is
 * scaled by 32; "+16>>5" rounds to nearest, while the _no_rnd variants use
 * "+15" (round toward zero) for MPEG-4 no-rounding mode.  cm is the clipping
 * table (ff_cropTbl offset by MAX_NEG_CROP). */
1298 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1299 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1300 #define op_put(a, b) a = cm[((b) + 16)>>5]
1301 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the qpel function families: rounded put, no-rounding put, and
 * rounded avg.  avg_no_rnd is intentionally left disabled (unused). */
1303 QPEL_MC(0, put_ , _ , op_put)
1304 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1305 QPEL_MC(0, avg_ , _ , op_avg)
1306 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* Clean up the op_* helper macros now that all variants are expanded. */
1308 #undef op_avg_no_rnd
1310 #undef op_put_no_rnd
/* Full-pel (mc00) motion compensation needs no filtering, so alias those
 * cases directly to the plain block copy / average helpers.  Rounding mode
 * is irrelevant at full-pel positions, hence the no_rnd aliases map to the
 * exact same functions as the rounded ones (fixed: the 16x16 no_rnd alias
 * previously pointed at a differently-named symbol, ff_put_pixels16x16_8_c,
 * inconsistent with the rounded alias two lines above and with the 8x8
 * pattern). */
1312 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1313 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1314 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1315 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1316 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1317 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/* WMV2/MSMPEG4 half-pel horizontal lowpass: 4-tap filter (-1, 9, 9, -1)/16
 * with rounding (+8 >> 4), clipped through cm.  Reads one pixel to the left
 * (src[-1]) and two to the right (src[9]) of the 8-pixel row.
 * NOTE(review): the per-row loop and pointer advances are elided in this
 * excerpt; only the 8 per-pixel taps are visible. */
1319 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1320 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1324 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1325 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1326 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1327 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1328 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1329 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1330 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1331 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1337 #if CONFIG_RV40_DECODER
/* RV40 (3/4, 3/4) qpel positions are defined as the plain half-pel xy2
 * average, so these wrappers just forward to the pixelsN_xy2 helpers. */
1338 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1339 put_pixels16_xy2_8_c(dst, src, stride, 16);
1341 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1342 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1344 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1345 put_pixels8_xy2_8_c(dst, src, stride, 8);
1347 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1348 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1350 #endif /* CONFIG_RV40_DECODER */
1352 #if CONFIG_DIRAC_DECODER
/* Dirac MC wrappers: src[] holds up to 4 reference planes; plain, 2-plane
 * (l2) and 4-plane (l4) averaging variants for 8/16/32-wide blocks.  The
 * 32-wide versions are composed from two 16-wide calls. */
1353 #define DIRAC_MC(OPNAME)\
1354 void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1356 OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
1358 void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1360 OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
1362 void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1364 OPNAME ## _pixels16_8_c(dst , src[0] , stride, h);\
1365 OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
1367 void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1369 OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1371 void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1373 OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1375 void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1377 OPNAME ## _pixels16_l2_8(dst , src[0] , src[1] , stride, stride, stride, h);\
1378 OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
1380 void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1382 OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1384 void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1386 OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1388 void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1390 OPNAME ## _pixels16_l4_8(dst , src[0] , src[1] , src[2] , src[3] , stride, stride, stride, stride, stride, h);\
1391 OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
/* WMV2/MSMPEG4 half-pel vertical lowpass: same (-1, 9, 9, -1)/16 taps as the
 * horizontal variant, applied down a column of w pixels; reads one row above
 * (src[-srcStride]) and up to src[9*srcStride] below.
 * NOTE(review): the per-column loop and pointer advances are elided here. */
1397 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1398 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1402 const int src_1= src[ -srcStride];
1403 const int src0 = src[0 ];
1404 const int src1 = src[ srcStride];
1405 const int src2 = src[2*srcStride];
1406 const int src3 = src[3*srcStride];
1407 const int src4 = src[4*srcStride];
1408 const int src5 = src[5*srcStride];
1409 const int src6 = src[6*srcStride];
1410 const int src7 = src[7*srcStride];
1411 const int src8 = src[8*srcStride];
1412 const int src9 = src[9*srcStride];
1413 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1414 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1415 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1416 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1417 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1418 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1419 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1420 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel position functions (mcXY = half-pel offset X,Y): built from the
 * h/v lowpass filters above plus pixels8_l2 averaging, mirroring the qpel
 * structure.  For the *_mc?2 diagonal cases halfH is computed over 11 rows
 * starting one row above the block (src-stride) so the vertical pass has its
 * margin rows. */
1426 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1428 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1429 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1432 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1433 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1436 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1438 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1439 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1442 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1443 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1446 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1450 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1451 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1452 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1453 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1455 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1459 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1460 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1461 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1462 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1464 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1466 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1467 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 loop filter across a horizontal block edge: for each column x, the
 * four pixels p0..p3 straddling the edge (two above, two below) are smoothed
 * by a strength-clipped delta d1, with a secondary half-strength correction
 * d2 on the outer pair.  strength comes from the qscale-indexed table.
 * The "if(pN&256)" lines clip values that left 0..255 back into range
 * (negative -> 0, >255 -> 255 via ~(p>>31)). */
1470 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1471 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1473 const int strength= ff_h263_loop_filter_strength[qscale];
1477 int p0= src[x-2*stride];
1478 int p1= src[x-1*stride];
1479 int p2= src[x+0*stride];
1480 int p3= src[x+1*stride];
1481 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* Piecewise-linear clipping of d against the filter strength. */
1483 if (d<-2*strength) d1= 0;
1484 else if(d<- strength) d1=-2*strength - d;
1485 else if(d< strength) d1= d;
1486 else if(d< 2*strength) d1= 2*strength - d;
1491 if(p1&256) p1= ~(p1>>31);
1492 if(p2&256) p2= ~(p2>>31);
1494 src[x-1*stride] = p1;
1495 src[x+0*stride] = p2;
/* Secondary correction on the outer pixels, limited to |d1|/2 (ad1). */
1499 d2= av_clip((p0-p3)/4, -ad1, ad1);
1501 src[x-2*stride] = p0 - d2;
1502 src[x+ stride] = p3 + d2;
/* H.263 loop filter across a vertical block edge: identical arithmetic to
 * h263_v_loop_filter_c but iterating over rows y and addressing the four
 * pixels left/right of the edge. */
1507 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1508 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1510 const int strength= ff_h263_loop_filter_strength[qscale];
1514 int p0= src[y*stride-2];
1515 int p1= src[y*stride-1];
1516 int p2= src[y*stride+0];
1517 int p3= src[y*stride+1];
1518 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* Piecewise-linear clipping of d against the filter strength. */
1520 if (d<-2*strength) d1= 0;
1521 else if(d<- strength) d1=-2*strength - d;
1522 else if(d< strength) d1= d;
1523 else if(d< 2*strength) d1= 2*strength - d;
/* Clip out-of-range results back to 0..255. */
1528 if(p1&256) p1= ~(p1>>31);
1529 if(p2&256) p2= ~(p2>>31);
1531 src[y*stride-1] = p1;
1532 src[y*stride+0] = p2;
1536 d2= av_clip((p0-p3)/4, -ad1, ad1);
1538 src[y*stride-2] = p0 - d2;
1539 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter on an 8x8 block: separable (1,2,1)/4 smoothing.
 * A vertical (1,2,1) pass fills temp[] (edge rows scaled by 4 instead of
 * filtered), then a horizontal (1,2,1) pass with +8>>4 rounding writes back,
 * copying the edge columns with only +2>>2 normalization. */
1544 static void h261_loop_filter_c(uint8_t *src, int stride){
1549 temp[x ] = 4*src[x ];
1550 temp[x + 7*8] = 4*src[x + 7*stride];
1554 xy = y * stride + x;
1556 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1561 src[ y*stride] = (temp[ y*8] + 2)>>2;
1562 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1564 xy = y * stride + x;
1566 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* SAD (sum of absolute differences) of a 16-wide block, unrolled across the
 * row; the per-row loop over h and the pointer advances are elided in this
 * excerpt.  Used as motion-estimation cost function. */
1571 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1577 s += abs(pix1[0] - pix2[0]);
1578 s += abs(pix1[1] - pix2[1]);
1579 s += abs(pix1[2] - pix2[2]);
1580 s += abs(pix1[3] - pix2[3]);
1581 s += abs(pix1[4] - pix2[4]);
1582 s += abs(pix1[5] - pix2[5]);
1583 s += abs(pix1[6] - pix2[6]);
1584 s += abs(pix1[7] - pix2[7]);
1585 s += abs(pix1[8] - pix2[8]);
1586 s += abs(pix1[9] - pix2[9]);
1587 s += abs(pix1[10] - pix2[10]);
1588 s += abs(pix1[11] - pix2[11]);
1589 s += abs(pix1[12] - pix2[12]);
1590 s += abs(pix1[13] - pix2[13]);
1591 s += abs(pix1[14] - pix2[14]);
1592 s += abs(pix1[15] - pix2[15]);
/* SAD against the horizontal half-pel interpolation of pix2 (avg of each
 * pixel and its right neighbour; reads pix2[16]). */
1599 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1605 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1606 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1607 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1608 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1609 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1610 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1611 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1612 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1613 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1614 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1615 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1616 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1617 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1618 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1619 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1620 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD against the vertical half-pel interpolation (avg of the pixel and the
 * one a line below, via pix3 = pix2 + line_size). */
1627 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1630 uint8_t *pix3 = pix2 + line_size;
1634 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1635 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1636 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1637 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1638 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1639 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1640 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1641 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1642 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1643 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1644 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1645 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1646 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1647 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1648 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1649 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD against the diagonal half-pel interpolation (4-pixel average of the
 * 2x2 neighbourhood; reads column 16 and the next line). */
1657 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1660 uint8_t *pix3 = pix2 + line_size;
1664 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1665 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1666 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1667 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1668 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1669 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1670 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1671 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1672 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1673 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1674 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1675 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1676 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1677 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1678 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1679 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD variants, mirroring the 16-wide family above (full-pel,
 * half-pel x, half-pel y, half-pel xy).  Per-row loops elided in this
 * excerpt. */
1687 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1693 s += abs(pix1[0] - pix2[0]);
1694 s += abs(pix1[1] - pix2[1]);
1695 s += abs(pix1[2] - pix2[2]);
1696 s += abs(pix1[3] - pix2[3]);
1697 s += abs(pix1[4] - pix2[4]);
1698 s += abs(pix1[5] - pix2[5]);
1699 s += abs(pix1[6] - pix2[6]);
1700 s += abs(pix1[7] - pix2[7]);
/* Horizontal half-pel reference (reads pix2[8]). */
1707 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1713 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1714 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1715 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1716 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1717 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1718 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1719 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1720 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* Vertical half-pel reference. */
1727 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1730 uint8_t *pix3 = pix2 + line_size;
1734 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1735 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1736 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1737 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1738 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1739 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1740 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1741 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* Diagonal half-pel reference (2x2 average). */
1749 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1752 uint8_t *pix3 = pix2 + line_size;
1756 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1757 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1758 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1759 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1760 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1761 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1762 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1763 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE (16-wide): score1 is the plain SSE, score2 compares
 * the 2x2 gradient structure of the two blocks so pure noise differences are
 * penalized less; weighted by avctx->nsse_weight (8 if no context). */
1771 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1772 MpegEncContext *c = v;
1778 for(x=0; x<16; x++){
1779 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1782 for(x=0; x<15; x++){
1783 score2+= FFABS( s1[x ] - s1[x +stride]
1784 - s1[x+1] + s1[x+1+stride])
1785 -FFABS( s2[x ] - s2[x +stride]
1786 - s2[x+1] + s2[x+1+stride]);
1793 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1794 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c (loop bounds elided in this excerpt). */
1797 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1798 MpegEncContext *c = v;
1805 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1809 score2+= FFABS( s1[x ] - s1[x +stride]
1810 - s1[x+1] + s1[x+1+stride])
1811 -FFABS( s2[x ] - s2[x +stride]
1812 - s2[x+1] + s2[x+1+stride]);
1819 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1820 else return score1 + FFABS(score2)*8;
/* Trellis helpers: try_8x8basis_c evaluates the weighted squared error of
 * adding scale*basis to the residual rem (BASIS_SHIFT/RECON_SHIFT fixed
 * point, result >>4 to keep the sum in range); add_8x8basis_c applies the
 * same rounded update in place. */
1823 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1827 for(i=0; i<8*8; i++){
1828 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1831 av_assert2(-512<b && b<512);
1833 sum += (w*b)*(w*b)>>4;
1838 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1841 for(i=0; i<8*8; i++){
1842 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
/* zero_cmp: dummy compare function that ranks everything equal. */
1846 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* ff_set_cmp: fill the 6-entry cmp[] function table from the FF_CMP_* type
 * selector, dispatching to the matching DSPContext member.  Unknown types
 * fall through to the error log below. */
1850 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1853 memset(cmp, 0, sizeof(void*)*6);
1861 cmp[i]= c->hadamard8_diff[i];
1867 cmp[i]= c->dct_sad[i];
1870 cmp[i]= c->dct264_sad[i];
1873 cmp[i]= c->dct_max[i];
1876 cmp[i]= c->quant_psnr[i];
1905 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Byte-wise dst[i] += src[i] over w bytes, done one machine word at a time:
 * the pb_7f/pb_80 masks implement per-byte addition without carry bleeding
 * between lanes; the tail loop (partially elided) handles the remainder. */
1910 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1912 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1913 long a = *(long*)(src+i);
1914 long b = *(long*)(dst+i);
1915 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1918 dst[i+0] += src[i+0];
/* Byte-wise dst[i] = src1[i] - src2[i] over w bytes.  On targets without
 * fast unaligned loads a plain unrolled byte loop is used when src2 is
 * misaligned; otherwise a SWAR word loop subtracts all bytes of a machine
 * word at once (pb_7f/pb_80 masks prevent inter-byte borrow).  Tail loop
 * partially elided in this excerpt. */
1921 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
1923 #if !HAVE_FAST_UNALIGNED
1924 if((long)src2 & (sizeof(long)-1)){
1925 for(i=0; i+7<w; i+=8){
1926 dst[i+0] = src1[i+0]-src2[i+0];
1927 dst[i+1] = src1[i+1]-src2[i+1];
1928 dst[i+2] = src1[i+2]-src2[i+2];
1929 dst[i+3] = src1[i+3]-src2[i+3];
1930 dst[i+4] = src1[i+4]-src2[i+4];
1931 dst[i+5] = src1[i+5]-src2[i+5];
1932 dst[i+6] = src1[i+6]-src2[i+6];
1933 dst[i+7] = src1[i+7]-src2[i+7];
1937 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1938 long a = *(long*)(src1+i);
1939 long b = *(long*)(src2+i);
1940 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1943 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV predictors (bodies heavily elided in this excerpt): the median
 * predictor uses mid_pred(left, top, left+top-topleft), added to / subtracted
 * from the coded difference; the left-prediction variants accumulate a
 * running sum across the row. */
1946 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1954 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1963 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1971 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
1981 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1984 for(i=0; i<w-1; i++){
2011 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterflies: BUTTERFLY2 writes sum/difference of two inputs into
 * two outputs, BUTTERFLY1 does it in place, BUTTERFLYA returns
 * |x+y| + |x-y| for the final accumulation. */
2041 #define BUTTERFLY2(o1,o2,i1,i2) \
2045 #define BUTTERFLY1(x,y) \
2054 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of an 8x8 block: 8x8 Hadamard transform of (src - dst), summing
 * absolute transform coefficients.  Rows are transformed first (three
 * butterfly stages), then columns, with BUTTERFLYA folding the last column
 * stage into the absolute-value sum. */
2056 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2064 //FIXME try pointer walks
2065 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2066 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2067 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2068 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2070 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2071 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2072 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2073 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2075 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2076 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2077 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2078 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column pass. */
2082 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2083 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2084 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2085 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2087 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2088 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2089 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2090 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2093 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2094 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2095 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2096 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but on the source
 * pixels directly; the DC term (mean) is subtracted at the end so the score
 * measures only AC energy. */
2101 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2109 //FIXME try pointer walks
2110 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2111 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2112 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2113 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2115 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2116 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2117 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2118 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2120 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2121 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2122 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2123 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column pass. */
2127 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2128 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2129 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2130 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2132 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2133 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2134 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2135 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2138 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2139 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2140 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2141 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2144 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-DCT the pixel difference and sum the absolute
 * coefficients (via the DSPContext sum_abs_dctelem hook). */
2149 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2150 MpegEncContext * const s= (MpegEncContext *)c;
2151 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2155 s->dsp.diff_pixels(temp, src1, src2, stride);
2157 return s->dsp.sum_abs_dctelem(temp);
/* Body of the DCT8_1D macro (definition line elided): one 8-point pass of  */\
/* the H.264 high-profile 8x8 integer transform.  sNM/dNM are the symmetric */\
/* sums/differences of mirrored inputs; aN are the butterfly intermediates  */\
/* combined into outputs via DST() (even rows from a0..a3, odd rows from    */\
/* a4..a7 with >>2 scaling).                                                */\
2162 const int s07 = SRC(0) + SRC(7);\
2163 const int s16 = SRC(1) + SRC(6);\
2164 const int s25 = SRC(2) + SRC(5);\
2165 const int s34 = SRC(3) + SRC(4);\
2166 const int a0 = s07 + s34;\
2167 const int a1 = s16 + s25;\
2168 const int a2 = s07 - s34;\
2169 const int a3 = s16 - s25;\
2170 const int d07 = SRC(0) - SRC(7);\
2171 const int d16 = SRC(1) - SRC(6);\
2172 const int d25 = SRC(2) - SRC(5);\
2173 const int d34 = SRC(3) - SRC(4);\
2174 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2175 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2176 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2177 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2179 DST(1, a4 + (a7>>2)) ;\
2180 DST(2, a2 + (a3>>1)) ;\
2181 DST(3, a5 + (a6>>2)) ;\
2183 DST(5, a6 - (a5>>2)) ;\
2184 DST(6, (a2>>1) - a3 ) ;\
2185 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: runs DCT8_1D over rows then columns of the pixel
 * difference; the column pass redefines DST() to accumulate |coefficient|
 * directly instead of storing. */
2188 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2189 MpegEncContext * const s= (MpegEncContext *)c;
2194 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2196 #define SRC(x) dct[i][x]
2197 #define DST(x,v) dct[i][x]= v
2198 for( i = 0; i < 8; i++ )
/* Column pass: DST() now sums absolute values into the score. */
2203 #define SRC(x) dct[x][i]
2204 #define DST(x,v) sum += FFABS(v)
2205 for( i = 0; i < 8; i++ )
/* DCT-max metric: forward-DCT the pixel difference and return the largest
 * absolute coefficient (loop over the 64 coefficients partially elided). */
2213 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2214 MpegEncContext * const s= (MpegEncContext *)c;
2215 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2220 s->dsp.diff_pixels(temp, src1, src2, stride);
2224 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-noise metric: DCT the difference, keep a copy in bak[],
 * quantize + dequantize + IDCT it, then return the squared error between
 * the round-tripped coefficients and the originals. */
2229 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2230 MpegEncContext * const s= (MpegEncContext *)c;
2231 LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2232 int16_t * const bak = temp+64;
2238 s->dsp.diff_pixels(temp, src1, src2, stride);
2240 memcpy(bak, temp, 64*sizeof(int16_t));
2242 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2243 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2244 ff_simple_idct_8(temp); //FIXME
2247 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric: quantize the DCT of (src1 - src2), count the VLC
 * bits of the resulting run/level pairs (esc_length for out-of-table
 * levels), reconstruct via dequantize + idct_add, and combine the SSE
 * distortion with a lambda-scaled bit cost. */
2252 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2253 MpegEncContext * const s= (MpegEncContext *)c;
2254 const uint8_t *scantable= s->intra_scantable.permutated;
2255 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2256 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2257 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2258 int i, last, run, bits, level, distortion, start_i;
2259 const int esc_length= s->ac_esc_length;
2261 uint8_t * last_length;
2265 copy_block8(lsrc1, src1, 8, stride, 8);
2266 copy_block8(lsrc2, src2, 8, stride, 8);
2268 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2270 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* Select intra or inter VLC tables; intra also pays for the DC coeff. */
2276 length = s->intra_ac_vlc_length;
2277 last_length= s->intra_ac_vlc_last_length;
2278 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2281 length = s->inter_ac_vlc_length;
2282 last_length= s->inter_ac_vlc_last_length;
/* Bit-count the run/level pairs in scan order; levels outside the
 * unified table (|level| > 63 after +64 bias) cost esc_length bits. */
2287 for(i=start_i; i<last; i++){
2288 int j= scantable[i];
2293 if((level&(~127)) == 0){
2294 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2303 level= temp[i] + 64;
2305 av_assert2(level - 64);
2307 if((level&(~127)) == 0){
2308 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Reconstruct and measure distortion against the original block. */
2316 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2318 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2321 s->dsp.idct_add(lsrc2, 8, temp);
2323 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2325 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-count metric: same quantize + VLC bit counting as rd8x8_c but without
 * the reconstruction/distortion step — returns only the coded bit cost. */
2328 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2329 MpegEncContext * const s= (MpegEncContext *)c;
2330 const uint8_t *scantable= s->intra_scantable.permutated;
2331 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2332 int i, last, run, bits, level, start_i;
2333 const int esc_length= s->ac_esc_length;
2335 uint8_t * last_length;
2339 s->dsp.diff_pixels(temp, src1, src2, stride);
2341 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* Select intra or inter VLC tables; intra also pays for the DC coeff. */
2347 length = s->intra_ac_vlc_length;
2348 last_length= s->intra_ac_vlc_last_length;
2349 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2352 length = s->inter_ac_vlc_length;
2353 last_length= s->inter_ac_vlc_last_length;
/* Bit-count run/level pairs in scan order (escape for big levels). */
2358 for(i=start_i; i<last; i++){
2359 int j= scantable[i];
2364 if((level&(~127)) == 0){
2365 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2374 level= temp[i] + 64;
2376 av_assert2(level - 64);
2378 if((level&(~127)) == 0){
2379 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/*
 * VSAD_INTRA(size): generates vsad_intra<size>_c, the vertical sum of
 * absolute differences between each row and the row above it within a
 * single block — a cheap measure of vertical activity.  Processes 'size'
 * pixels per row, 4 at a time.  (Epilogue lines are elided in this excerpt.)
 */
2387 #define VSAD_INTRA(size) \
2388 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2392     for(y=1; y<h; y++){ \
2393         for(x=0; x<size; x+=4){ \
2394             score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2395                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/*
 * Vertical SAD of the difference of two 16-wide blocks: sums the absolute
 * row-to-row variation of (s1 - s2).  (Loop epilogue and return are in
 * elided lines of this excerpt.)
 */
2405 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2410         for(x=0; x<16; x++){
2411             score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
/* Squared-value helper used by the VSSE functions below. */
2420 #define SQ(a) ((a)*(a))
/*
 * VSSE_INTRA(size): generates vsse_intra<size>_c, the vertical sum of
 * squared differences between adjacent rows within one block — like
 * VSAD_INTRA but with squared error.  (Epilogue lines are elided.)
 */
2421 #define VSSE_INTRA(size) \
2422 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2426     for(y=1; y<h; y++){ \
2427         for(x=0; x<size; x+=4){ \
2428             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2429                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/*
 * Vertical SSE of the difference of two 16-wide blocks: sums the squared
 * row-to-row variation of (s1 - s2).  (Epilogue in elided lines.)
 */
2439 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2444         for(x=0; x<16; x++){
2445             score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
/*
 * Sum of squared differences between an int8 array and an int16 array of
 * 'size' elements.  (Prologue/return are in elided lines of this excerpt.)
 */
2454 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2458     for(i=0; i<size; i++)
2459         score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/*
 * WRAPPER8_16_SQ(name8, name16): builds a 16x16 compare function from an
 * 8x8 one by summing the 8x8 function over the four quadrants (the two
 * visible pairs cover the top half; the bottom-half calls and the dst/src
 * advance are in elided lines of this excerpt).
 */
2463 #define WRAPPER8_16_SQ(name8, name16)\
2464 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
2466     score +=name8(s, dst           , src           , stride, 8);\
2467     score +=name8(s, dst+8         , src+8         , stride, 8);\
2471     score +=name8(s, dst           , src           , stride, 8);\
2472     score +=name8(s, dst+8         , src+8         , stride, 8);\
/* Instantiate the 16x16 variants of the 8x8 comparison functions above. */
2477 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2478 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2479 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2481 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2483 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2484 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2485 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2486 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/*
 * Clip one float (handled through its IEEE-754 bit pattern as uint32_t)
 * to [min, max] when min < 0 < max: positive values compare directly
 * against 'mini'; negative values have their sign bit flipped so they can
 * be compared against 'maxisign'.  (The pass-through branch is in an
 * elided line.)
 */
2488 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2489                         uint32_t maxi, uint32_t maxisign)
2492     if(a > mini) return mini;
2493     else if((a^(1U<<31)) > maxisign) return maxi;
/*
 * Clip a float vector to [*min, *max] for the min<0<max case using integer
 * bit-pattern comparisons (see clipf_c_one).  Floats are reinterpreted as
 * uint32_t through pointer casts.
 * NOTE(review): these casts violate strict aliasing; presumably FFmpeg
 * builds with -fno-strict-aliasing — confirm before reusing elsewhere.
 * 'len' is assumed to be a multiple of 8 (loop is unrolled by 8) — TODO
 * confirm against callers.
 */
2497 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2499     uint32_t mini = *(uint32_t*)min;
2500     uint32_t maxi = *(uint32_t*)max;
/* Pre-flip the sign bit of the max bound for the negative-value compare. */
2501     uint32_t maxisign = maxi ^ (1U<<31);
2502     uint32_t *dsti = (uint32_t*)dst;
2503     const uint32_t *srci = (const uint32_t*)src;
2504     for(i=0; i<len; i+=8) {
2505         dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2506         dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2507         dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2508         dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2509         dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2510         dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2511         dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2512         dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/*
 * Clip every element of a float vector to [min, max], unrolled by 8.
 * When the range straddles zero the bit-trick fast path is used; otherwise
 * the plain av_clipf path below applies (the else joining the two branches
 * is in an elided line).  'len' is assumed to be a multiple of 8 — TODO
 * confirm against callers.
 */
2515 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2517     if(min < 0 && max > 0) {
2518         vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2520         for(i=0; i < len; i+=8) {
2521             dst[i    ] = av_clipf(src[i    ], min, max);
2522             dst[i + 1] = av_clipf(src[i + 1], min, max);
2523             dst[i + 2] = av_clipf(src[i + 2], min, max);
2524             dst[i + 3] = av_clipf(src[i + 3], min, max);
2525             dst[i + 4] = av_clipf(src[i + 4], min, max);
2526             dst[i + 5] = av_clipf(src[i + 5], min, max);
2527             dst[i + 6] = av_clipf(src[i + 6], min, max);
2528             dst[i + 7] = av_clipf(src[i + 7], min, max);
/*
 * Dot product of two int16 vectors of 'order' elements, accumulated in
 * 32 bits.  (Prologue/loop header/return are in elided lines.)
 */
2533 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2538         res += *v1++ * *v2++;
/*
 * Fused op: v1 += mul * v3 element-wise while (per the name, presumably)
 * also accumulating the scalar product of v1 and v2 — the accumulation
 * statement and return are in elided lines of this excerpt.
 */
2543 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2548         *v1++ += mul * *v3++;
/*
 * Apply a symmetric int16 window to an int16 signal: only the first half
 * of the window is stored; element i windows both input[i] and its mirror
 * input[len-i-1].  Products are Q15 with round-to-nearest (+1<<14, >>15).
 */
2553 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2554                                  const int16_t *window, unsigned int len)
2557     int len2 = len >> 1;
2559     for (i = 0; i < len2; i++) {
2560         int16_t w  = window[i];
2561         output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
2562         output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/*
 * Clip every element of an int32 vector to [min, max], unrolled by 8
 * (the enclosing loop header is in an elided line).  'len' is presumably
 * a multiple of 8 — confirm against callers.
 */
2566 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2567                                 int32_t max, unsigned int len)
2570         *dst++ = av_clip(*src++, min, max);
2571         *dst++ = av_clip(*src++, min, max);
2572         *dst++ = av_clip(*src++, min, max);
2573         *dst++ = av_clip(*src++, min, max);
2574         *dst++ = av_clip(*src++, min, max);
2575         *dst++ = av_clip(*src++, min, max);
2576         *dst++ = av_clip(*src++, min, max);
2577         *dst++ = av_clip(*src++, min, max);
/*
 * Wrappers pairing the jpeg-reference IDCTs (full 8x8, reduced 4x4/2x2/1x1
 * variants for lowres decoding) with clamped put/add pixel output.
 * "put" overwrites dest; "add" accumulates into dest.  (Some braces fall
 * in elided lines of this excerpt.)
 */
2582 static void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2584     ff_j_rev_dct (block);
2585     put_pixels_clamped_c(block, dest, line_size);
2587 static void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2589     ff_j_rev_dct (block);
2590     add_pixels_clamped_c(block, dest, line_size);
/* 4x4 reduced-resolution variants (lowres==1). */
2593 static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
2595     ff_j_rev_dct4 (block);
2596     put_pixels_clamped4_c(block, dest, line_size);
2598 static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
2600     ff_j_rev_dct4 (block);
2601     add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 reduced-resolution variants (lowres==2). */
2604 static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
2606     ff_j_rev_dct2 (block);
2607     put_pixels_clamped2_c(block, dest, line_size);
2609 static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
2611     ff_j_rev_dct2 (block);
2612     add_pixels_clamped2_c(block, dest, line_size);
/* 1x1: only the DC term survives; descale by 8 with rounding and clamp. */
2615 static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
2617     dest[0] = av_clip_uint8((block[0] + 4)>>3);
2619 static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
2621     dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2624 /* init static data */
/*
 * One-time initialization of the static lookup tables declared at the top
 * of this file: ff_cropTbl (clamp-to-[0,255] with MAX_NEG_CROP guard bands
 * on both sides), ff_squareTbl ((i-256)^2 for SSE computations), and the
 * inverse zigzag permutation.  (Some loop-body lines are elided.)
 */
2625 av_cold void ff_dsputil_static_init(void)
/* Identity mapping for in-range values... */
2629     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
/* ...and saturate the guard bands (the low-side 0 fill is in an elided line). */
2630     for(i=0;i<MAX_NEG_CROP;i++) {
2632         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2635     for(i=0;i<512;i++) {
2636         ff_squareTbl[i] = (i - 256) * (i - 256);
/* Inverse zigzag, stored 1-based. */
2639     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/*
 * Verify that the compiler actually 16-byte-aligns stack variables declared
 * with LOCAL_ALIGNED_16; if not, warn once (SIMD code would otherwise crash
 * or crawl).  The did_fail latch and return are in elided lines.
 */
2642 int ff_check_alignment(void){
2643     static int did_fail=0;
2644     LOCAL_ALIGNED_16(int, aligned, [4]);
2646     if((intptr_t)aligned & 15){
/* Only complain on targets where misalignment actually matters. */
2648 #if HAVE_MMX || HAVE_ALTIVEC
2649         av_log(NULL, AV_LOG_ERROR,
2650             "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2651             "and may be very slow or crash. This is not a bug in libavcodec,\n"
2652             "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2653             "Do not report crashes to FFmpeg developers.\n");
/*
 * Populate a DSPContext with the C reference implementations of every DSP
 * function pointer, selecting FDCT/IDCT variants from the codec context
 * (dct_algo, idct_algo, lowres, bits_per_raw_sample), then let the
 * architecture-specific init functions override entries with optimized
 * versions.
 * NOTE(review): many original source lines (conditions, braces, #if guards)
 * are elided in this excerpt; section comments below follow the visible
 * structure only.
 */
2662 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2664     ff_check_alignment();
/* --- forward DCT selection (encoder side) --- */
2667     if (avctx->bits_per_raw_sample == 10) {
2668         c->fdct    = ff_jpeg_fdct_islow_10;
2669         c->fdct248 = ff_fdct248_islow_10;
2671         if(avctx->dct_algo==FF_DCT_FASTINT) {
2672             c->fdct    = ff_fdct_ifast;
2673             c->fdct248 = ff_fdct_ifast248;
2675         else if(avctx->dct_algo==FF_DCT_FAAN) {
2676             c->fdct    = ff_faandct;
2677             c->fdct248 = ff_faandct248;
2680             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2681             c->fdct248 = ff_fdct248_islow_8;
2684 #endif //CONFIG_ENCODERS
/* --- IDCT selection: lowres uses the reduced j_rev_dct variants --- */
2686     if(avctx->lowres==1){
2687         c->idct_put= ff_jref_idct4_put;
2688         c->idct_add= ff_jref_idct4_add;
2689         c->idct    = ff_j_rev_dct4;
2690         c->idct_permutation_type= FF_NO_IDCT_PERM;
2691     }else if(avctx->lowres==2){
2692         c->idct_put= ff_jref_idct2_put;
2693         c->idct_add= ff_jref_idct2_add;
2694         c->idct    = ff_j_rev_dct2;
2695         c->idct_permutation_type= FF_NO_IDCT_PERM;
2696     }else if(avctx->lowres==3){
2697         c->idct_put= ff_jref_idct1_put;
2698         c->idct_add= ff_jref_idct1_add;
2699         c->idct    = ff_j_rev_dct1;
2700         c->idct_permutation_type= FF_NO_IDCT_PERM;
/* 10-bit content gets the 10-bit simple IDCT regardless of idct_algo. */
2702         if (avctx->bits_per_raw_sample == 10) {
2703             c->idct_put              = ff_simple_idct_put_10;
2704             c->idct_add              = ff_simple_idct_add_10;
2705             c->idct                  = ff_simple_idct_10;
2706             c->idct_permutation_type = FF_NO_IDCT_PERM;
2708         if(avctx->idct_algo==FF_IDCT_INT){
2709             c->idct_put= ff_jref_idct_put;
2710             c->idct_add= ff_jref_idct_add;
2711             c->idct    = ff_j_rev_dct;
2712             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2713         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2714             c->idct_put= ff_faanidct_put;
2715             c->idct_add= ff_faanidct_add;
2716             c->idct    = ff_faanidct;
2717             c->idct_permutation_type= FF_NO_IDCT_PERM;
2718         }else{ //accurate/default
2719             c->idct_put = ff_simple_idct_put_8;
2720             c->idct_add = ff_simple_idct_add_8;
2721             c->idct     = ff_simple_idct_8;
2722             c->idct_permutation_type= FF_NO_IDCT_PERM;
/* --- basic pixel block operations --- */
2727     c->diff_pixels = diff_pixels_c;
2728     c->put_pixels_clamped = put_pixels_clamped_c;
2729     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2730     c->add_pixels_clamped = add_pixels_clamped_c;
2731     c->sum_abs_dctelem = sum_abs_dctelem_c;
2734     c->pix_sum = pix_sum_c;
2735     c->pix_norm1 = pix_norm1_c;
2737     c->fill_block_tab[0] = fill_block16_c;
2738     c->fill_block_tab[1] = fill_block8_c;
/* --- SAD variants: [0]=16x16, [1]=8x8; second index = half-pel phase --- */
2740     /* TODO [0] 16  [1] 8 */
2741     c->pix_abs[0][0] = pix_abs16_c;
2742     c->pix_abs[0][1] = pix_abs16_x2_c;
2743     c->pix_abs[0][2] = pix_abs16_y2_c;
2744     c->pix_abs[0][3] = pix_abs16_xy2_c;
2745     c->pix_abs[1][0] = pix_abs8_c;
2746     c->pix_abs[1][1] = pix_abs8_x2_c;
2747     c->pix_abs[1][2] = pix_abs8_y2_c;
2748     c->pix_abs[1][3] = pix_abs8_xy2_c;
/* --- third-pel motion compensation tables (index encodes the 1/3-pel
 *     x/y phase) --- */
2750     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2751     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2752     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2753     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2754     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2755     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2756     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2757     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2758     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2760     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2761     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2762     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2763     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2764     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2765     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2766     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2767     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2768     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* --- quarter-pel MC: fill all 16 sub-pel positions for a PFX/size pair --- */
2770 #define dspfunc(PFX, IDX, NUM) \
2771     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2772     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2773     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2774     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2775     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2776     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2777     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2778     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2779     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2780     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2781     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2782     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2783     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2784     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2785     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2786     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2788     dspfunc(put_qpel, 0, 16);
2789     dspfunc(put_no_rnd_qpel, 0, 16);
2791     dspfunc(avg_qpel, 0, 16);
2792     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2794     dspfunc(put_qpel, 1, 8);
2795     dspfunc(put_no_rnd_qpel, 1, 8);
2797     dspfunc(avg_qpel, 1, 8);
2798     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* --- WMV2 mspel MC table --- */
2802     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2803     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2804     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2805     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2806     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2807     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2808     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2809     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* --- comparison functions: [0]=16x16, [1]=8x8 --- */
2811 #define SET_CMP_FUNC(name) \
2812     c->name[0]= name ## 16_c;\
2813     c->name[1]= name ## 8x8_c;
2815     SET_CMP_FUNC(hadamard8_diff)
2816     c->hadamard8_diff[4]= hadamard8_intra16_c;
2817     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2818     SET_CMP_FUNC(dct_sad)
2819     SET_CMP_FUNC(dct_max)
2821     SET_CMP_FUNC(dct264_sad)
2823     c->sad[0]= pix_abs16_c;
2824     c->sad[1]= pix_abs8_c;
2828     SET_CMP_FUNC(quant_psnr)
2831     c->vsad[0]= vsad16_c;
2832     c->vsad[4]= vsad_intra16_c;
2833     c->vsad[5]= vsad_intra8_c;
2834     c->vsse[0]= vsse16_c;
2835     c->vsse[4]= vsse_intra16_c;
2836     c->vsse[5]= vsse_intra8_c;
2837     c->nsse[0]= nsse16_c;
2838     c->nsse[1]= nsse8_c;
2839 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2840     ff_dsputil_init_dwt(c);
2843     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* --- lossless/HuffYUV helpers and byte-swapping --- */
2845     c->add_bytes= add_bytes_c;
2846     c->diff_bytes= diff_bytes_c;
2847     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2848     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2849     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2850     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2851     c->bswap_buf= bswap_buf;
2852     c->bswap16_buf = bswap16_buf;
/* --- codec-specific loop filters --- */
2854     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2855         c->h263_h_loop_filter= h263_h_loop_filter_c;
2856         c->h263_v_loop_filter= h263_v_loop_filter_c;
2859     c->h261_loop_filter= h261_loop_filter_c;
2861     c->try_8x8basis= try_8x8basis_c;
2862     c->add_8x8basis= add_8x8basis_c;
/* --- generic vector ops (defined earlier in this file) --- */
2864     c->vector_clipf = vector_clipf_c;
2865     c->scalarproduct_int16 = scalarproduct_int16_c;
2866     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2867     c->apply_window_int16 = apply_window_int16_c;
2868     c->vector_clip_int32 = vector_clip_int32_c;
/* --- image shrink by 1x/2x/4x/8x --- */
2870     c->shrink[0]= av_image_copy_plane;
2871     c->shrink[1]= ff_shrink22;
2872     c->shrink[2]= ff_shrink44;
2873     c->shrink[3]= ff_shrink88;
2875     c->add_pixels8 = add_pixels8_c;
/* --- half-pel MC tables: idx selects block size 16/8/4/2 --- */
2877 #define hpel_funcs(prefix, idx, num) \
2878     c->prefix ## _pixels_tab idx [0] = prefix ## _pixels ## num ## _8_c; \
2879     c->prefix ## _pixels_tab idx [1] = prefix ## _pixels ## num ## _x2_8_c; \
2880     c->prefix ## _pixels_tab idx [2] = prefix ## _pixels ## num ## _y2_8_c; \
2881     c->prefix ## _pixels_tab idx [3] = prefix ## _pixels ## num ## _xy2_8_c
2883     hpel_funcs(put, [0], 16);
2884     hpel_funcs(put, [1],  8);
2885     hpel_funcs(put, [2],  4);
2886     hpel_funcs(put, [3],  2);
2887     hpel_funcs(put_no_rnd, [0], 16);
2888     hpel_funcs(put_no_rnd, [1],  8);
2889     hpel_funcs(avg, [0], 16);
2890     hpel_funcs(avg, [1],  8);
2891     hpel_funcs(avg, [2],  4);
2892     hpel_funcs(avg, [3],  2);
2893     hpel_funcs(avg_no_rnd,, 16);
/* --- bit-depth-dependent functions, dispatched on bits_per_raw_sample
 *     and the DCT coefficient width (16- or 32-bit) --- */
2897 #define FUNC(f, depth) f ## _ ## depth
2898 #define FUNCC(f, depth) f ## _ ## depth ## _c
2900 #define BIT_DEPTH_FUNCS(depth, dct)\
2901     c->get_pixels                    = FUNCC(get_pixels   ## dct , depth);\
2902     c->draw_edges                    = FUNCC(draw_edges            , depth);\
2903     c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
2904     c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
2906     switch (avctx->bits_per_raw_sample) {
2908         if (c->dct_bits == 32) {
2909             BIT_DEPTH_FUNCS(9, _32);
2911             BIT_DEPTH_FUNCS(9, _16);
2915         if (c->dct_bits == 32) {
2916             BIT_DEPTH_FUNCS(10, _32);
2918             BIT_DEPTH_FUNCS(10, _16);
2922         if (c->dct_bits == 32) {
2923             BIT_DEPTH_FUNCS(12, _32);
2925             BIT_DEPTH_FUNCS(12, _16);
2929         if (c->dct_bits == 32) {
2930             BIT_DEPTH_FUNCS(14, _32);
2932             BIT_DEPTH_FUNCS(14, _16);
/* Default/8-bit fallback (also for non-video codec types). */
2936         if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
2937             BIT_DEPTH_FUNCS(8, _16);
/* --- architecture-specific overrides (replace C pointers where faster
 *     implementations exist) --- */
2943     if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
2944     if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
2945     if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
2946     if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
2947     if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
2948     if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
2949     if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
/* Build the scantable permutation matching the chosen IDCT. */
2951     ff_init_scantable_permutation(c->idct_permutation,
2952                                   c->idct_permutation_type);
2955 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2957 ff_dsputil_init(c, avctx);