3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
31 #include "libavutil/internal.h"
33 #include "copy_block.h"
36 #include "simple_idct.h"
39 #include "imgconvert.h"
41 #include "mpegvideo.h"
45 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
46 uint32_t ff_squareTbl[512] = {0, };
49 #include "dsputil_template.c"
53 #include "dsputil_template.c"
57 #include "dsputil_template.c"
61 #include "dsputil_template.c"
65 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL / 255 * 0x7f)
#define pb_80 (~0UL / 255 * 0x80)
71 const uint8_t ff_zigzag_direct[64] = {
72 0, 1, 8, 16, 9, 2, 3, 10,
73 17, 24, 32, 25, 18, 11, 4, 5,
74 12, 19, 26, 33, 40, 48, 41, 34,
75 27, 20, 13, 6, 7, 14, 21, 28,
76 35, 42, 49, 56, 57, 50, 43, 36,
77 29, 22, 15, 23, 30, 37, 44, 51,
78 58, 59, 52, 45, 38, 31, 39, 46,
79 53, 60, 61, 54, 47, 55, 62, 63
82 /* Specific zigzag scan for 248 idct. NOTE that unlike the
83 specification, we interleave the fields */
84 const uint8_t ff_zigzag248_direct[64] = {
85 0, 8, 1, 9, 16, 24, 2, 10,
86 17, 25, 32, 40, 48, 56, 33, 41,
87 18, 26, 3, 11, 4, 12, 19, 27,
88 34, 42, 49, 57, 50, 58, 35, 43,
89 20, 28, 5, 13, 6, 14, 21, 29,
90 36, 44, 51, 59, 52, 60, 37, 45,
91 22, 30, 7, 15, 23, 31, 38, 46,
92 53, 61, 54, 62, 39, 47, 55, 63,
95 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
96 DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
98 const uint8_t ff_alternate_horizontal_scan[64] = {
99 0, 1, 2, 3, 8, 9, 16, 17,
100 10, 11, 4, 5, 6, 7, 15, 14,
101 13, 12, 19, 18, 24, 25, 32, 33,
102 26, 27, 20, 21, 22, 23, 28, 29,
103 30, 31, 34, 35, 40, 41, 48, 49,
104 42, 43, 36, 37, 38, 39, 44, 45,
105 46, 47, 50, 51, 56, 57, 58, 59,
106 52, 53, 54, 55, 60, 61, 62, 63,
109 const uint8_t ff_alternate_vertical_scan[64] = {
110 0, 8, 16, 24, 1, 9, 2, 10,
111 17, 25, 32, 40, 48, 56, 57, 49,
112 41, 33, 26, 18, 3, 11, 4, 12,
113 19, 27, 34, 42, 50, 58, 35, 43,
114 51, 59, 20, 28, 5, 13, 6, 14,
115 21, 29, 36, 44, 52, 60, 37, 45,
116 53, 61, 22, 30, 7, 15, 23, 31,
117 38, 46, 54, 62, 39, 47, 55, 63,
120 /* Input permutation for the simple_idct_mmx */
121 static const uint8_t simple_mmx_permutation[64]={
122 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
123 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
124 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
125 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
126 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
127 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
128 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
129 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
132 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
134 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
138 st->scantable= src_scantable;
142 j = src_scantable[i];
143 st->permutated[i] = permutation[j];
149 j = st->permutated[i];
151 st->raster_end[i]= end;
155 void ff_init_scantable_permutation(uint8_t *idct_permutation,
156 int idct_permutation_type)
160 switch(idct_permutation_type){
161 case FF_NO_IDCT_PERM:
163 idct_permutation[i]= i;
165 case FF_LIBMPEG2_IDCT_PERM:
167 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
169 case FF_SIMPLE_IDCT_PERM:
171 idct_permutation[i]= simple_mmx_permutation[i];
173 case FF_TRANSPOSE_IDCT_PERM:
175 idct_permutation[i]= ((i&7)<<3) | (i>>3);
177 case FF_PARTTRANS_IDCT_PERM:
179 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
181 case FF_SSE2_IDCT_PERM:
183 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
186 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/* Sum of all 256 pixel values of a 16x16 block (stride = line_size). */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
212 static int pix_norm1_c(uint8_t * pix, int line_size)
215 uint32_t *sq = ff_squareTbl + 256;
218 for (i = 0; i < 16; i++) {
219 for (j = 0; j < 16; j += 8) {
231 register uint64_t x=*(uint64_t*)pix;
233 s += sq[(x>>8)&0xff];
234 s += sq[(x>>16)&0xff];
235 s += sq[(x>>24)&0xff];
236 s += sq[(x>>32)&0xff];
237 s += sq[(x>>40)&0xff];
238 s += sq[(x>>48)&0xff];
239 s += sq[(x>>56)&0xff];
241 register uint32_t x=*(uint32_t*)pix;
243 s += sq[(x>>8)&0xff];
244 s += sq[(x>>16)&0xff];
245 s += sq[(x>>24)&0xff];
246 x=*(uint32_t*)(pix+4);
248 s += sq[(x>>8)&0xff];
249 s += sq[(x>>16)&0xff];
250 s += sq[(x>>24)&0xff];
255 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst (may alias), 8 at a time. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int i;

    for (i = 0; i + 8 <= w; i += 8) {
        dst[i + 0] = av_bswap32(src[i + 0]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    /* remaining 0..7 words */
    for (; i < w; i++)
        dst[i + 0] = av_bswap32(src[i + 0]);
}
/* Byte-swap len 16-bit words from src into dst. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
284 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
287 uint32_t *sq = ff_squareTbl + 256;
290 for (i = 0; i < h; i++) {
291 s += sq[pix1[0] - pix2[0]];
292 s += sq[pix1[1] - pix2[1]];
293 s += sq[pix1[2] - pix2[2]];
294 s += sq[pix1[3] - pix2[3]];
301 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
304 uint32_t *sq = ff_squareTbl + 256;
307 for (i = 0; i < h; i++) {
308 s += sq[pix1[0] - pix2[0]];
309 s += sq[pix1[1] - pix2[1]];
310 s += sq[pix1[2] - pix2[2]];
311 s += sq[pix1[3] - pix2[3]];
312 s += sq[pix1[4] - pix2[4]];
313 s += sq[pix1[5] - pix2[5]];
314 s += sq[pix1[6] - pix2[6]];
315 s += sq[pix1[7] - pix2[7]];
322 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
325 uint32_t *sq = ff_squareTbl + 256;
328 for (i = 0; i < h; i++) {
329 s += sq[pix1[ 0] - pix2[ 0]];
330 s += sq[pix1[ 1] - pix2[ 1]];
331 s += sq[pix1[ 2] - pix2[ 2]];
332 s += sq[pix1[ 3] - pix2[ 3]];
333 s += sq[pix1[ 4] - pix2[ 4]];
334 s += sq[pix1[ 5] - pix2[ 5]];
335 s += sq[pix1[ 6] - pix2[ 6]];
336 s += sq[pix1[ 7] - pix2[ 7]];
337 s += sq[pix1[ 8] - pix2[ 8]];
338 s += sq[pix1[ 9] - pix2[ 9]];
339 s += sq[pix1[10] - pix2[10]];
340 s += sq[pix1[11] - pix2[11]];
341 s += sq[pix1[12] - pix2[12]];
342 s += sq[pix1[13] - pix2[13]];
343 s += sq[pix1[14] - pix2[14]];
344 s += sq[pix1[15] - pix2[15]];
352 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
353 const uint8_t *s2, int stride){
356 /* read the pixels */
358 block[0] = s1[0] - s2[0];
359 block[1] = s1[1] - s2[1];
360 block[2] = s1[2] - s2[2];
361 block[3] = s1[3] - s2[3];
362 block[4] = s1[4] - s2[4];
363 block[5] = s1[5] - s2[5];
364 block[6] = s1[6] - s2[6];
365 block[7] = s1[7] - s2[7];
372 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
377 /* read the pixels */
379 pixels[0] = av_clip_uint8(block[0]);
380 pixels[1] = av_clip_uint8(block[1]);
381 pixels[2] = av_clip_uint8(block[2]);
382 pixels[3] = av_clip_uint8(block[3]);
383 pixels[4] = av_clip_uint8(block[4]);
384 pixels[5] = av_clip_uint8(block[5]);
385 pixels[6] = av_clip_uint8(block[6]);
386 pixels[7] = av_clip_uint8(block[7]);
393 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
398 /* read the pixels */
400 pixels[0] = av_clip_uint8(block[0]);
401 pixels[1] = av_clip_uint8(block[1]);
402 pixels[2] = av_clip_uint8(block[2]);
403 pixels[3] = av_clip_uint8(block[3]);
410 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
415 /* read the pixels */
417 pixels[0] = av_clip_uint8(block[0]);
418 pixels[1] = av_clip_uint8(block[1]);
425 static void put_signed_pixels_clamped_c(const int16_t *block,
426 uint8_t *av_restrict pixels,
431 for (i = 0; i < 8; i++) {
432 for (j = 0; j < 8; j++) {
435 else if (*block > 127)
438 *pixels = (uint8_t)(*block + 128);
442 pixels += (line_size - 8);
446 static void add_pixels8_c(uint8_t *av_restrict pixels,
453 pixels[0] += block[0];
454 pixels[1] += block[1];
455 pixels[2] += block[2];
456 pixels[3] += block[3];
457 pixels[4] += block[4];
458 pixels[5] += block[5];
459 pixels[6] += block[6];
460 pixels[7] += block[7];
467 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
472 /* read the pixels */
474 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
475 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
476 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
477 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
478 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
479 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
480 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
481 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
487 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
492 /* read the pixels */
494 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
495 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
496 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
497 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
503 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
508 /* read the pixels */
510 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
511 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
/* Sum of absolute values of all 64 coefficients of an 8x8 block. */
static int sum_abs_dctelem_c(int16_t *block)
{
    int sum = 0, i;

    for (i = 0; i < 64; i++)
        sum += FFABS(block[i]);
    return sum;
}
/* Fill a 16-wide block of height h with a constant byte value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/* Fill an 8-wide block of height h with a constant byte value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* Rounded averages of 2 and 4 values; arguments are fully parenthesized
 * to avoid precedence surprises (each argument is evaluated once). */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/* One-point GMC: bilinear interpolation of an 8-wide block of height h
 * with 1/16-pel fractional offsets (x16, y16 in 0..15) and a rounding
 * constant (weights A..D sum to 256, hence the >>8). */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int i;

    for (i = 0; i < h; i++) {
        dst[0] = (A * src[0] + B * src[1] + C * src[stride + 0] + D * src[stride + 1] + rounder) >> 8;
        dst[1] = (A * src[1] + B * src[2] + C * src[stride + 1] + D * src[stride + 2] + rounder) >> 8;
        dst[2] = (A * src[2] + B * src[3] + C * src[stride + 2] + D * src[stride + 3] + rounder) >> 8;
        dst[3] = (A * src[3] + B * src[4] + C * src[stride + 3] + D * src[stride + 4] + rounder) >> 8;
        dst[4] = (A * src[4] + B * src[5] + C * src[stride + 4] + D * src[stride + 5] + rounder) >> 8;
        dst[5] = (A * src[5] + B * src[6] + C * src[stride + 5] + D * src[stride + 6] + rounder) >> 8;
        dst[6] = (A * src[6] + B * src[7] + C * src[stride + 6] + D * src[stride + 7] + rounder) >> 8;
        dst[7] = (A * src[7] + B * src[8] + C * src[stride + 7] + D * src[stride + 8] + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
/**
 * Global motion compensation of one 8-wide block of height h.
 * (ox, oy) is the 16.16 fixed-point start position, (dxx, dxy, dyx, dyy)
 * the affine deltas, shift the fractional precision, r the rounder and
 * (width, height) the valid source area; samples outside are edge-clamped.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r,
              int width, int height)
{
    int y, vx, vy;
    const int s = 1 << shift;

    /* convert to the last addressable coordinate for the clamp tests */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        int x;

        vx = ox;
        vy = oy;
        for (x = 0; x < 8; x++) { // XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x = vx >> 16;
            src_y = vy >> 16;
            frac_x = src_x & (s - 1);
            frac_y = src_y & (s - 1);
            src_x >>= shift;
            src_y >>= shift;

            if ((unsigned) src_x < width) {
                if ((unsigned) src_y < height) {
                    /* fully inside: bilinear interpolation */
                    index = src_x + src_y * stride;
                    dst[y * stride + x] =
                        (( src[index         ] * (s - frac_x)
                         + src[index + 1     ] *      frac_x ) * (s - frac_y)
                       + ( src[index + stride    ] * (s - frac_x)
                         + src[index + stride + 1] *      frac_x ) *      frac_y
                         + r) >> (shift * 2);
                } else {
                    /* vertically outside: clamp y, interpolate in x only */
                    index = src_x + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] =
                        (( src[index    ] * (s - frac_x)
                         + src[index + 1] *      frac_x ) * s
                         + r) >> (shift * 2);
                }
            } else {
                if ((unsigned) src_y < height) {
                    /* horizontally outside: clamp x, interpolate in y only */
                    index = av_clip(src_x, 0, width) + src_y * stride;
                    dst[y * stride + x] =
                        (( src[index         ] * (s - frac_y)
                         + src[index + stride] *      frac_y ) * s
                         + r) >> (shift * 2);
                } else {
                    /* fully outside: nearest clamped sample */
                    index = av_clip(src_x, 0, width) +
                            av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = src[index];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* Thirdpel MC, no fractional offset: plain copy, dispatched on width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch (width) {
    case 2:  put_pixels2_8_c (dst, src, stride, height); break;
    case 4:  put_pixels4_8_c (dst, src, stride, height); break;
    case 8:  put_pixels8_8_c (dst, src, stride, height); break;
    case 16: put_pixels16_8_c(dst, src, stride, height); break;
    }
}
/* Thirdpel MC, x=1/3: horizontal blend weighting left 2:1 over right
 * (683/2^11 approximates a rounded /3). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683 * (2 * src[j] + src[j + 1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, x=2/3: horizontal blend weighting right 2:1 over left. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683 * (src[j] + 2 * src[j + 1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, y=1/3: vertical blend weighting top 2:1 over bottom. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683 * (2 * src[j] + src[j + stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, x=y=1/3: bilinear blend of the four neighbours with
 * weights 4:3:3:2 (2731/2^15 approximates a rounded /12). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731 * (4 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 2 * src[j + stride + 1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, x=1/3, y=2/3: bilinear blend with weights 3:2:4:3. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731 * (3 * src[j] + 2 * src[j + 1] + 4 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, y=2/3: vertical blend weighting bottom 2:1 over top. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683 * (src[j] + 2 * src[j + stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, x=2/3, y=1/3: bilinear blend with weights 3:4:2:3. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731 * (3 * src[j] + 4 * src[j + 1] + 2 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, x=y=2/3: bilinear blend with weights 2:3:3:4. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731 * (2 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 4 * src[j + stride + 1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, no fractional offset: rounded average with dst,
 * dispatched on width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch (width) {
    case 2:  avg_pixels2_8_c (dst, src, stride, height); break;
    case 4:  avg_pixels4_8_c (dst, src, stride, height); break;
    case 8:  avg_pixels8_8_c (dst, src, stride, height); break;
    case 16: avg_pixels16_8_c(dst, src, stride, height); break;
    }
}
/* Thirdpel MC, x=1/3, averaging variant: the filtered value is rounded
 * and averaged with the existing dst pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683 * (2 * src[j] + src[j + 1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, x=2/3, averaging variant (see avg_tpel_pixels_mc10_c). */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683 * (src[j] + 2 * src[j + 1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, y=1/3, averaging variant (see avg_tpel_pixels_mc10_c). */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683 * (2 * src[j] + src[j + stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, x=y=1/3, averaging variant (weights 4:3:3:2). */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731 * (4 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 2 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, x=1/3, y=2/3, averaging variant (weights 3:2:4:3). */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731 * (3 * src[j] + 2 * src[j + 1] + 4 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, y=2/3, averaging variant (see avg_tpel_pixels_mc10_c). */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683 * (src[j] + 2 * src[j + stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, x=2/3, y=1/3, averaging variant (weights 3:4:2:3). */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731 * (3 * src[j] + 4 * src[j + 1] + 2 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, x=y=2/3, averaging variant (weights 2:3:3:4). */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731 * (2 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 4 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
823 #define QPEL_MC(r, OPNAME, RND, OP) \
824 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
825 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
829 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
830 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
831 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
832 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
833 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
834 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
835 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
836 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
842 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
844 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
848 const int src0= src[0*srcStride];\
849 const int src1= src[1*srcStride];\
850 const int src2= src[2*srcStride];\
851 const int src3= src[3*srcStride];\
852 const int src4= src[4*srcStride];\
853 const int src5= src[5*srcStride];\
854 const int src6= src[6*srcStride];\
855 const int src7= src[7*srcStride];\
856 const int src8= src[8*srcStride];\
857 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
858 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
859 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
860 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
861 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
862 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
863 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
864 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
870 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
871 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
876 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
877 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
878 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
879 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
880 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
881 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
882 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
883 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
884 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
885 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
886 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
887 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
888 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
889 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
890 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
891 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
897 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
898 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
903 const int src0= src[0*srcStride];\
904 const int src1= src[1*srcStride];\
905 const int src2= src[2*srcStride];\
906 const int src3= src[3*srcStride];\
907 const int src4= src[4*srcStride];\
908 const int src5= src[5*srcStride];\
909 const int src6= src[6*srcStride];\
910 const int src7= src[7*srcStride];\
911 const int src8= src[8*srcStride];\
912 const int src9= src[9*srcStride];\
913 const int src10= src[10*srcStride];\
914 const int src11= src[11*srcStride];\
915 const int src12= src[12*srcStride];\
916 const int src13= src[13*srcStride];\
917 const int src14= src[14*srcStride];\
918 const int src15= src[15*srcStride];\
919 const int src16= src[16*srcStride];\
920 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
921 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
922 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
923 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
924 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
925 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
926 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
927 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
928 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
929 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
930 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
931 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
932 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
933 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
934 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
935 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
941 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
943 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
944 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
947 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
948 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
951 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
953 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
954 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
957 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
960 copy_block9(full, src, 16, stride, 9);\
961 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
962 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
965 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
967 copy_block9(full, src, 16, stride, 9);\
968 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
971 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
974 copy_block9(full, src, 16, stride, 9);\
975 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
976 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
978 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
983 copy_block9(full, src, 16, stride, 9);\
984 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
985 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
986 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
987 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
989 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
993 copy_block9(full, src, 16, stride, 9);\
994 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
995 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
996 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
997 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
999 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1000 uint8_t full[16*9];\
1003 uint8_t halfHV[64];\
1004 copy_block9(full, src, 16, stride, 9);\
1005 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1006 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1007 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1008 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1010 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1011 uint8_t full[16*9];\
1013 uint8_t halfHV[64];\
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1020 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021 uint8_t full[16*9];\
1024 uint8_t halfHV[64];\
1025 copy_block9(full, src, 16, stride, 9);\
1026 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1028 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1031 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1032 uint8_t full[16*9];\
1034 uint8_t halfHV[64];\
1035 copy_block9(full, src, 16, stride, 9);\
1036 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1038 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1041 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1042 uint8_t full[16*9];\
1045 uint8_t halfHV[64];\
1046 copy_block9(full, src, 16, stride, 9);\
1047 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1048 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1049 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1050 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1052 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1053 uint8_t full[16*9];\
1055 uint8_t halfHV[64];\
1056 copy_block9(full, src, 16, stride, 9);\
1057 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1058 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1059 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1060 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1062 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1064 uint8_t halfHV[64];\
1065 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1066 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1067 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1069 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1071 uint8_t halfHV[64];\
1072 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1073 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1074 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1076 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1077 uint8_t full[16*9];\
1080 uint8_t halfHV[64];\
1081 copy_block9(full, src, 16, stride, 9);\
1082 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1083 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1084 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1085 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1087 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1088 uint8_t full[16*9];\
1090 copy_block9(full, src, 16, stride, 9);\
1091 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1092 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1093 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1095 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1096 uint8_t full[16*9];\
1099 uint8_t halfHV[64];\
1100 copy_block9(full, src, 16, stride, 9);\
1101 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1102 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1103 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1104 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1106 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1107 uint8_t full[16*9];\
1109 copy_block9(full, src, 16, stride, 9);\
1110 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1111 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1112 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1114 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1116 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1117 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1120 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1122 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1123 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1126 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1127 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1130 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1132 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1133 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1136 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1137 uint8_t full[24*17];\
1139 copy_block17(full, src, 24, stride, 17);\
1140 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1141 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1144 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1145 uint8_t full[24*17];\
1146 copy_block17(full, src, 24, stride, 17);\
1147 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1150 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1151 uint8_t full[24*17];\
1153 copy_block17(full, src, 24, stride, 17);\
1154 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1155 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1157 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1158 uint8_t full[24*17];\
1159 uint8_t halfH[272];\
1160 uint8_t halfV[256];\
1161 uint8_t halfHV[256];\
1162 copy_block17(full, src, 24, stride, 17);\
1163 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1164 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1165 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1166 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1168 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1169 uint8_t full[24*17];\
1170 uint8_t halfH[272];\
1171 uint8_t halfHV[256];\
1172 copy_block17(full, src, 24, stride, 17);\
1173 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1174 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1175 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1176 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1178 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1179 uint8_t full[24*17];\
1180 uint8_t halfH[272];\
1181 uint8_t halfV[256];\
1182 uint8_t halfHV[256];\
1183 copy_block17(full, src, 24, stride, 17);\
1184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1186 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1189 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1190 uint8_t full[24*17];\
1191 uint8_t halfH[272];\
1192 uint8_t halfHV[256];\
1193 copy_block17(full, src, 24, stride, 17);\
1194 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1196 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1199 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200 uint8_t full[24*17];\
1201 uint8_t halfH[272];\
1202 uint8_t halfV[256];\
1203 uint8_t halfHV[256];\
1204 copy_block17(full, src, 24, stride, 17);\
1205 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1207 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1210 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1211 uint8_t full[24*17];\
1212 uint8_t halfH[272];\
1213 uint8_t halfHV[256];\
1214 copy_block17(full, src, 24, stride, 17);\
1215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1217 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1220 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1221 uint8_t full[24*17];\
1222 uint8_t halfH[272];\
1223 uint8_t halfV[256];\
1224 uint8_t halfHV[256];\
1225 copy_block17(full, src, 24, stride, 17);\
1226 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1227 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1228 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1229 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1231 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1232 uint8_t full[24*17];\
1233 uint8_t halfH[272];\
1234 uint8_t halfHV[256];\
1235 copy_block17(full, src, 24, stride, 17);\
1236 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1237 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1238 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1239 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1241 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1242 uint8_t halfH[272];\
1243 uint8_t halfHV[256];\
1244 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1245 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1246 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1248 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1249 uint8_t halfH[272];\
1250 uint8_t halfHV[256];\
1251 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1252 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1253 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1255 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1256 uint8_t full[24*17];\
1257 uint8_t halfH[272];\
1258 uint8_t halfV[256];\
1259 uint8_t halfHV[256];\
1260 copy_block17(full, src, 24, stride, 17);\
1261 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1262 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1263 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1264 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1266 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1267 uint8_t full[24*17];\
1268 uint8_t halfH[272];\
1269 copy_block17(full, src, 24, stride, 17);\
1270 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1271 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1272 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1274 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1275 uint8_t full[24*17];\
1276 uint8_t halfH[272];\
1277 uint8_t halfV[256];\
1278 uint8_t halfHV[256];\
1279 copy_block17(full, src, 24, stride, 17);\
1280 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1281 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1282 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1283 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1285 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1286 uint8_t full[24*17];\
1287 uint8_t halfH[272];\
1288 copy_block17(full, src, 24, stride, 17);\
1289 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1290 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1291 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1293 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1294 uint8_t halfH[272];\
1295 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1296 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel-write primitives for the QPEL_MC() instantiations below.
 * 'b' is a filtered value scaled by 32; cm (ff_cropTbl + MAX_NEG_CROP in the
 * expanded functions) clips it to 0..255. The *_no_rnd variants add 15
 * instead of 16 before the shift, i.e. round toward zero. */
1299 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1300 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1301 #define op_put(a, b) a = cm[((b) + 16)>>5]
1302 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the quarter-pel MC function families (put, no-rnd put, avg). */
1304 QPEL_MC(0, put_ , _ , op_put)
1305 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1306 QPEL_MC(0, avg_ , _ , op_avg)
1307 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1309 #undef op_avg_no_rnd
1311 #undef op_put_no_rnd
/* The mc00 (no sub-pel offset) cases need no filtering: alias them to the
 * plain block copy/average helpers. */
1313 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1314 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1315 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1316 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1317 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
/* NOTE(review): this alias uses the _8_-suffixed helper name while the ones
 * above do not — verify the target symbol exists under both spellings. */
1318 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
/* WMV2 "mspel" horizontal lowpass for one 8-pixel row:
 *   dst[i] = clip((9*(s[i]+s[i+1]) - (s[i-1]+s[i+2]) + 8) >> 4)
 * i.e. a 4-tap (-1, 9, 9, -1)/16 half-pel filter, clipped via the crop table.
 * NOTE(review): chunk appears truncated — the per-row loop over h and the
 * dst/src stride advance are not visible here. */
1320 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1321 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1325 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1326 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1327 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1328 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1329 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1330 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1331 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1332 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* RV40 mc33 (half-pel offset in both x and y) reduces to the generic
 * diagonally-interpolating (xy2) copy/average helpers.
 * NOTE(review): closing braces of these wrappers are not visible in this
 * chunk — presumably truncated. */
1338 #if CONFIG_RV40_DECODER
1339 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1340 put_pixels16_xy2_8_c(dst, src, stride, 16);
1342 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1343 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1345 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1346 put_pixels8_xy2_8_c(dst, src, stride, 8);
1348 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1349 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1351 #endif /* CONFIG_RV40_DECODER */
1353 #if CONFIG_DIRAC_DECODER
/* DIRAC_MC(OPNAME): expands to the Dirac pixel copy/average helpers.
 * src[] carries up to 5 source pointers; the plain variants use src[0],
 * _l2 blends src[0..1], _l4 blends src[0..3]. The 32-wide cases are built
 * from two adjacent 16-wide calls.
 * NOTE(review): some macro body lines (braces) are not visible here. */
1354 #define DIRAC_MC(OPNAME)\
1355 void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1357 OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
1359 void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1361 OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
1363 void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1365 OPNAME ## _pixels16_8_c(dst , src[0] , stride, h);\
1366 OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
1368 void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1370 OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1372 void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1374 OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1376 void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1378 OPNAME ## _pixels16_l2_8(dst , src[0] , src[1] , stride, stride, stride, h);\
1379 OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
1381 void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1383 OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1385 void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1387 OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1389 void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1391 OPNAME ## _pixels16_l4_8(dst , src[0] , src[1] , src[2] , src[3] , stride, stride, stride, stride, stride, h);\
1392 OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
/* WMV2 "mspel" vertical lowpass for one 8-pixel column: same (-1,9,9,-1)/16
 * half-pel filter as the horizontal version, applied along the stride.
 * All ten source rows are loaded up front so the writes (when dst aliases
 * src) cannot disturb later reads.
 * NOTE(review): chunk appears truncated — the loop over w and the pointer
 * advance are not visible here. */
1398 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1399 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1403 const int src_1= src[ -srcStride];
1404 const int src0 = src[0 ];
1405 const int src1 = src[ srcStride];
1406 const int src2 = src[2*srcStride];
1407 const int src3 = src[3*srcStride];
1408 const int src4 = src[4*srcStride];
1409 const int src5 = src[5*srcStride];
1410 const int src6 = src[6*srcStride];
1411 const int src7 = src[7*srcStride];
1412 const int src8 = src[8*srcStride];
1413 const int src9 = src[9*srcStride];
1414 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1415 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1416 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1417 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1418 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1419 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1420 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1421 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* put_mspel8_mcXY: WMV2 sub-pel motion-compensation wrappers. X/Y encode the
 * horizontal/vertical phase: 1 and 3 blend the lowpass result with the
 * (shifted) source, 2 uses the lowpass alone; mixed cases cascade H then V.
 * NOTE(review): several local buffer declarations (half/halfH/halfV/halfHV)
 * and closing braces are not visible in this chunk — presumably truncated. */
1427 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1429 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1430 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1433 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1434 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1437 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1439 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1440 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1443 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1444 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1447 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
/* H filter needs one extra row above: start at src-stride, height 11. */
1451 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1452 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1453 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1454 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1456 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1460 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1461 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1462 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1463 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1465 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1467 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1468 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking across a horizontal block edge (filters the two pixel
 * rows on each side of the edge). d is the edge gradient; d1 is the
 * strength-dependent correction applied to p1/p2.
 * NOTE(review): chunk appears truncated — the x loop header, the d1/d2/ad1
 * declarations and some clamp lines are not visible here. */
1471 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1472 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1474 const int strength= ff_h263_loop_filter_strength[qscale];
1478 int p0= src[x-2*stride];
1479 int p1= src[x-1*stride];
1480 int p2= src[x+0*stride];
1481 int p3= src[x+1*stride];
1482 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* Piecewise-linear ramp: correction grows with |d| up to 'strength', then
 * decays back to 0 at 2*strength (no filtering of real edges). */
1484 if (d<-2*strength) d1= 0;
1485 else if(d<- strength) d1=-2*strength - d;
1486 else if(d< strength) d1= d;
1487 else if(d< 2*strength) d1= 2*strength - d;
/* Branchless clip of the corrected values to 0..255: if bit 8 is set the
 * value over/underflowed, and ~(p>>31) yields 255 or 0 accordingly. */
1492 if(p1&256) p1= ~(p1>>31);
1493 if(p2&256) p2= ~(p2>>31);
1495 src[x-1*stride] = p1;
1496 src[x+0*stride] = p2;
/* Secondary, smaller correction for the outer pixels p0/p3. */
1500 d2= av_clip((p0-p3)/4, -ad1, ad1);
1502 src[x-2*stride] = p0 - d2;
1503 src[x+ stride] = p3 + d2;
/* H.263 deblocking across a vertical block edge — same algorithm as
 * h263_v_loop_filter_c but iterating over rows and addressing the two
 * pixel columns on each side of the edge.
 * NOTE(review): chunk appears truncated — the y loop header and the
 * d1/d2/ad1 declarations are not visible here. */
1508 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1509 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1511 const int strength= ff_h263_loop_filter_strength[qscale];
1515 int p0= src[y*stride-2];
1516 int p1= src[y*stride-1];
1517 int p2= src[y*stride+0];
1518 int p3= src[y*stride+1];
1519 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* Strength-dependent piecewise-linear correction, as in the vertical case. */
1521 if (d<-2*strength) d1= 0;
1522 else if(d<- strength) d1=-2*strength - d;
1523 else if(d< strength) d1= d;
1524 else if(d< 2*strength) d1= 2*strength - d;
/* Branchless clip to 0..255 via the overflow bit. */
1529 if(p1&256) p1= ~(p1>>31);
1530 if(p2&256) p2= ~(p2>>31);
1532 src[y*stride-1] = p1;
1533 src[y*stride+0] = p2;
1537 d2= av_clip((p0-p3)/4, -ad1, ad1);
1539 src[y*stride-2] = p0 - d2;
1540 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable 1-2-1 smoothing over an 8x8 block.
 * Vertical pass fills temp[] (scaled by 4; first/last rows copied through),
 * horizontal pass reads temp[] back with border rows handled separately.
 * NOTE(review): chunk appears truncated — temp[] declaration, loop headers
 * and the yz index updates are not visible here. */
1545 static void h261_loop_filter_c(uint8_t *src, int stride){
1550 temp[x ] = 4*src[x ];
1551 temp[x + 7*8] = 4*src[x + 7*stride];
1555 xy = y * stride + x;
1557 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1562 src[ y*stride] = (temp[ y*8] + 2)>>2;
1563 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1565 xy = y * stride + x;
1567 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* Sum of absolute differences (SAD) of a 16-wide block, unrolled per row.
 * NOTE(review): chunk appears truncated — the accumulator init, the row loop
 * over h with line_size advances, and the return are not visible here. */
1572 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1578 s += abs(pix1[0] - pix2[0]);
1579 s += abs(pix1[1] - pix2[1]);
1580 s += abs(pix1[2] - pix2[2]);
1581 s += abs(pix1[3] - pix2[3]);
1582 s += abs(pix1[4] - pix2[4]);
1583 s += abs(pix1[5] - pix2[5]);
1584 s += abs(pix1[6] - pix2[6]);
1585 s += abs(pix1[7] - pix2[7]);
1586 s += abs(pix1[8] - pix2[8]);
1587 s += abs(pix1[9] - pix2[9]);
1588 s += abs(pix1[10] - pix2[10]);
1589 s += abs(pix1[11] - pix2[11]);
1590 s += abs(pix1[12] - pix2[12]);
1591 s += abs(pix1[13] - pix2[13]);
1592 s += abs(pix1[14] - pix2[14]);
1593 s += abs(pix1[15] - pix2[15]);
/* SAD of a 16-wide block against the reference interpolated half a pixel
 * to the right (avg2 of horizontally adjacent reference pixels).
 * NOTE(review): loop header/return not visible in this chunk. */
1600 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1606 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1607 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1608 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1609 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1610 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1611 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1612 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1613 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1614 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1615 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1616 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1617 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1618 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1619 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1620 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1621 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of a 16-wide block against the reference interpolated half a pixel
 * down (avg2 of vertically adjacent rows, pix2 and pix3 = pix2+line_size).
 * NOTE(review): loop header/return not visible in this chunk. */
1628 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1631 uint8_t *pix3 = pix2 + line_size;
1635 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1636 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1637 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1638 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1639 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1640 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1641 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1642 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1643 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1644 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1645 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1646 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1647 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1648 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1649 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1650 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of a 16-wide block against the reference interpolated half a pixel
 * both right and down (avg4 of the 2x2 neighborhood).
 * NOTE(review): loop header/return not visible in this chunk. */
1658 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1661 uint8_t *pix3 = pix2 + line_size;
1665 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1666 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1667 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1668 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1669 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1670 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1671 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1672 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1673 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1674 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1675 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1676 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1677 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1678 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1679 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1680 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD, full-pel — see pix_abs16_c for the 16-wide counterpart.
 * NOTE(review): loop header/return not visible in this chunk. */
1688 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1694 s += abs(pix1[0] - pix2[0]);
1695 s += abs(pix1[1] - pix2[1]);
1696 s += abs(pix1[2] - pix2[2]);
1697 s += abs(pix1[3] - pix2[3]);
1698 s += abs(pix1[4] - pix2[4]);
1699 s += abs(pix1[5] - pix2[5]);
1700 s += abs(pix1[6] - pix2[6]);
1701 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD vs. horizontally half-pel interpolated reference.
 * NOTE(review): loop header/return not visible in this chunk. */
1708 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1714 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1715 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1716 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1717 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1718 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1719 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1720 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1721 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD vs. vertically half-pel interpolated reference.
 * NOTE(review): loop header/return not visible in this chunk. */
1728 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1731 uint8_t *pix3 = pix2 + line_size;
1735 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1736 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1737 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1738 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1739 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1740 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1741 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1742 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD vs. diagonally half-pel interpolated reference (2x2 avg4).
 * NOTE(review): loop header/return not visible in this chunk. */
1750 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1753 uint8_t *pix3 = pix2 + line_size;
1757 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1758 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1759 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1760 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1761 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1762 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1763 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1764 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16-wide: score1 is plain squared error; score2
 * measures the difference in local 2x2 gradient "texture" between the two
 * blocks, weighted by avctx->nsse_weight (8 when no context is given).
 * NOTE(review): x/y loop headers and accumulator declarations are not
 * visible in this chunk. */
1772 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1773 MpegEncContext *c = v;
1779 for(x=0; x<16; x++){
1780 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1783 for(x=0; x<15; x++){
1784 score2+= FFABS( s1[x ] - s1[x +stride]
1785 - s1[x+1] + s1[x+1+stride])
1786 -FFABS( s2[x ] - s2[x +stride]
1787 - s2[x+1] + s2[x+1+stride]);
1794 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1795 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c — same squared-error plus texture-difference
 * metric. NOTE(review): loop headers not visible in this chunk. */
1798 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1799 MpegEncContext *c = v;
1806 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1810 score2+= FFABS( s1[x ] - s1[x +stride]
1811 - s1[x+1] + s1[x+1+stride])
1812 -FFABS( s2[x ] - s2[x +stride]
1813 - s2[x+1] + s2[x+1+stride]);
1820 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1821 else return score1 + FFABS(score2)*8;
/* Evaluate adding 'scale' times a basis function to the residual: returns
 * the perceptually weighted squared error of rem + scaled basis.
 * The shift re-aligns basis from BASIS_SHIFT to RECON_SHIFT precision with
 * rounding. NOTE(review): the weight read and sum declaration/return are
 * not visible in this chunk. */
1824 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1828 for(i=0; i<8*8; i++){
1829 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1832 av_assert2(-512<b && b<512);
1834 sum += (w*b)*(w*b)>>4;
/* Permanently add 'scale' times the basis function to the residual, using
 * the same rounding/shift as try_8x8basis_c. */
1839 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1842 for(i=0; i<8*8; i++){
1843 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
/* zero_cmp: trivial comparator (always the same score), used as a stub.
 * NOTE(review): its body is not visible in this chunk. */
1847 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* ff_set_cmp: fill cmp[0..5] (16x16, 8x8, ... levels) with the compare
 * functions selected by 'type'. Only fragments of the selection switch are
 * visible in this chunk; unknown types hit the error log below. */
1851 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1854 memset(cmp, 0, sizeof(void*)*6);
1862 cmp[i]= c->hadamard8_diff[i];
1868 cmp[i]= c->dct_sad[i];
1871 cmp[i]= c->dct264_sad[i];
1874 cmp[i]= c->dct_max[i];
1877 cmp[i]= c->quant_psnr[i];
1906 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Byte-wise dst[i] += src[i] using SWAR: pb_7f masks the low 7 bits of each
 * byte so per-byte carries cannot cross lanes; the high bits are fixed up
 * with XOR against pb_80. Remaining tail bytes are added one at a time.
 * NOTE(review): the tail loop header is not visible in this chunk. */
1911 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1913 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1914 long a = *(long*)(src+i);
1915 long b = *(long*)(dst+i);
1916 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1919 dst[i+0] += src[i+0];
/* Byte-wise dst[i] = src1[i] - src2[i]. On targets without fast unaligned
 * loads, a misaligned src2 falls back to an unrolled scalar loop; otherwise
 * a SWAR word-at-a-time subtraction (borrow confined per byte lane via
 * pb_7f/pb_80) is used, with a scalar tail.
 * NOTE(review): the #endif and tail loop header are not visible here. */
1922 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
1924 #if !HAVE_FAST_UNALIGNED
1925 if((long)src2 & (sizeof(long)-1)){
1926 for(i=0; i+7<w; i+=8){
1927 dst[i+0] = src1[i+0]-src2[i+0];
1928 dst[i+1] = src1[i+1]-src2[i+1];
1929 dst[i+2] = src1[i+2]-src2[i+2];
1930 dst[i+3] = src1[i+3]-src2[i+3];
1931 dst[i+4] = src1[i+4]-src2[i+4];
1932 dst[i+5] = src1[i+5]-src2[i+5];
1933 dst[i+6] = src1[i+6]-src2[i+6];
1934 dst[i+7] = src1[i+7]-src2[i+7];
1938 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1939 long a = *(long*)(src1+i);
1940 long b = *(long*)(src2+i);
1941 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1944 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction decode: reconstruct each byte as the median of
 * left, above, and (left+above-aboveleft), plus the stored difference.
 * NOTE(review): only the core statement is visible — the loop, the l/lt
 * initialisation from *left/*left_top and the write-back are truncated. */
1947 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1955 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* HuffYUV median prediction encode: the inverse of the add_ variant — store
 * src2[i] minus the median predictor.
 * NOTE(review): loop and l/lt bookkeeping are not visible in this chunk. */
1964 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1972 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* HuffYUV left (horizontal) prediction: running sum of src into dst with
 * carry-in 'acc'; the bgr32 variant below does the same per RGBA channel.
 * NOTE(review): both bodies are almost entirely truncated in this chunk. */
1982 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1985 for(i=0; i<w-1; i++){
2012 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers: BUTTERFLY2 writes sum/difference of two
 * inputs to two outputs, BUTTERFLY1 does it in place, BUTTERFLYA yields
 * |x+y| + |x-y|. NOTE(review): the bodies of the first two macros are not
 * visible in this chunk. */
2042 #define BUTTERFLY2(o1,o2,i1,i2) \
2046 #define BUTTERFLY1(x,y) \
2055 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the src-dst residual, returning the sum
 * of absolute transform coefficients. Row passes first (three butterfly
 * stages per row), then column passes, accumulating |.| via BUTTERFLYA.
 * NOTE(review): temp[] declaration, loop headers and sum accumulation/return
 * are not visible in this chunk. */
2057 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2065 //FIXME try pointer walks
2066 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2067 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2068 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2069 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2071 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2072 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2073 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2074 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2076 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2077 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2078 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2079 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2083 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2084 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2085 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2086 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2088 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2089 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2090 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2091 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2094 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2095 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2096 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2097 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but operating on the
 * source pixels alone; the DC term (mean) is subtracted at the end so the
 * score reflects AC energy only.
 * NOTE(review): temp[] declaration, loop headers and the final return are
 * not visible in this chunk. */
2102 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2110 //FIXME try pointer walks
2111 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2112 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2113 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2114 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2116 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2117 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2118 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2119 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2121 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2122 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2123 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2124 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2128 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2129 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2130 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2131 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2133 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2134 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2135 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2136 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2139 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2140 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2141 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2142 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2145 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-transform the 8x8 residual and return the sum of
 * absolute DCT coefficients.
 * NOTE(review): the s->dsp.fdct(temp) call between these lines is not
 * visible in this chunk — presumably truncated. */
2150 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2151 MpegEncContext * const s= (MpegEncContext *)c;
2152 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2156 s->dsp.diff_pixels(temp, src1, src2, stride);
2158 return s->dsp.sum_abs_dctelem(temp);
/* Body of an H.264-style 1-D 8-point integer DCT macro (used by
 * dct264_sad8x8_c below via the SRC/DST macros). a0..a3 come from the
 * even (sum) half, a4..a7 from the odd (difference) half with the
 * characteristic x + (x>>1) scalings.
 * NOTE(review): the #define head line and the DST(0,..)/DST(4,..) even
 * outputs are not visible in this chunk — presumably truncated. */
2163 const int s07 = SRC(0) + SRC(7);\
2164 const int s16 = SRC(1) + SRC(6);\
2165 const int s25 = SRC(2) + SRC(5);\
2166 const int s34 = SRC(3) + SRC(4);\
2167 const int a0 = s07 + s34;\
2168 const int a1 = s16 + s25;\
2169 const int a2 = s07 - s34;\
2170 const int a3 = s16 - s25;\
2171 const int d07 = SRC(0) - SRC(7);\
2172 const int d16 = SRC(1) - SRC(6);\
2173 const int d25 = SRC(2) - SRC(5);\
2174 const int d34 = SRC(3) - SRC(4);\
2175 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2176 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2177 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2178 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2180 DST(1, a4 + (a7>>2)) ;\
2181 DST(2, a2 + (a3>>1)) ;\
2182 DST(3, a5 + (a6>>2)) ;\
2184 DST(5, a6 - (a5>>2)) ;\
2185 DST(6, (a2>>1) - a3 ) ;\
2186 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform-domain SAD: run the integer DCT over rows (in place),
 * then over columns while summing |coefficient| via the redefined DST().
 * NOTE(review): the dct[][] declaration, #undef lines and return are not
 * visible in this chunk. */
2189 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2190 MpegEncContext * const s= (MpegEncContext *)c;
2195 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2197 #define SRC(x) dct[i][x]
2198 #define DST(x,v) dct[i][x]= v
2199 for( i = 0; i < 8; i++ )
2204 #define SRC(x) dct[x][i]
2205 #define DST(x,v) sum += FFABS(v)
2206 for( i = 0; i < 8; i++ )
/* DCT-domain max: forward-transform the residual and return the largest
 * absolute coefficient.
 * NOTE(review): the fdct call, the i loop header and the return are not
 * visible in this chunk. */
2214 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2215 MpegEncContext * const s= (MpegEncContext *)c;
2216 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2221 s->dsp.diff_pixels(temp, src1, src2, stride);
2225 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-PSNR metric: DCT the residual, keep a copy, quantize +
 * dequantize + IDCT it, and return the squared error introduced by the
 * quantizer (transform-domain comparison against the saved copy).
 * NOTE(review): the fdct call, sum init/loop header and return are not
 * visible in this chunk. */
2230 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2231 MpegEncContext * const s= (MpegEncContext *)c;
2232 LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2233 int16_t * const bak = temp+64;
2239 s->dsp.diff_pixels(temp, src1, src2, stride);
2241 memcpy(bak, temp, 64*sizeof(int16_t));
2243 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2244 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2245 ff_simple_idct_8(temp); //FIXME
2248 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for one 8x8 block: quantize the residual, count
 * the VLC bits of the resulting run/level pairs (esc_length for values
 * outside the short-code range), then dequantize + IDCT back and measure
 * the reconstruction SSE. Returns distortion + lambda-scaled rate.
 * NOTE(review): several lines (bits/run init, intra/inter branch headers,
 * level reads, escape-bit accumulation, loop closes) are not visible in
 * this chunk — presumably truncated. */
2253 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2254 MpegEncContext * const s= (MpegEncContext *)c;
2255 const uint8_t *scantable= s->intra_scantable.permutated;
2256 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2257 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2258 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2259 int i, last, run, bits, level, distortion, start_i;
2260 const int esc_length= s->ac_esc_length;
2262 uint8_t * last_length;
2266 copy_block8(lsrc1, src1, 8, stride, 8);
2267 copy_block8(lsrc2, src2, 8, stride, 8);
2269 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2271 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2277 length = s->intra_ac_vlc_length;
2278 last_length= s->intra_ac_vlc_last_length;
2279 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2282 length = s->inter_ac_vlc_length;
2283 last_length= s->inter_ac_vlc_last_length;
2288 for(i=start_i; i<last; i++){
2289 int j= scantable[i];
2294 if((level&(~127)) == 0){
2295 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2304 level= temp[i] + 64;
2306 av_assert2(level - 64);
2308 if((level&(~127)) == 0){
2309 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2317 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2319 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2322 s->dsp.idct_add(lsrc2, 8, temp);
2324 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2326 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Rate-only metric: like rd8x8_c but returns just the VLC bit count of the
 * quantized residual (no reconstruction/distortion pass).
 * NOTE(review): bits/run init, branch headers, level reads, loop closes and
 * the return are not visible in this chunk — presumably truncated. */
2329 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2330 MpegEncContext * const s= (MpegEncContext *)c;
2331 const uint8_t *scantable= s->intra_scantable.permutated;
2332 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2333 int i, last, run, bits, level, start_i;
2334 const int esc_length= s->ac_esc_length;
2336 uint8_t * last_length;
2340 s->dsp.diff_pixels(temp, src1, src2, stride);
2342 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2348 length = s->intra_ac_vlc_length;
2349 last_length= s->intra_ac_vlc_last_length;
2350 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2353 length = s->inter_ac_vlc_length;
2354 last_length= s->inter_ac_vlc_last_length;
2359 for(i=start_i; i<last; i++){
2360 int j= scantable[i];
2365 if((level&(~127)) == 0){
2366 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2375 level= temp[i] + 64;
2377 av_assert2(level - 64);
2379 if((level&(~127)) == 0){
2380 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2388 #define VSAD_INTRA(size) \
2389 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2393 for(y=1; y<h; y++){ \
2394 for(x=0; x<size; x+=4){ \
2395 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2396 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
2406 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2411 for(x=0; x<16; x++){
2412 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2421 #define SQ(a) ((a)*(a))
2422 #define VSSE_INTRA(size) \
2423 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2427 for(y=1; y<h; y++){ \
2428 for(x=0; x<size; x+=4){ \
2429 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2430 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
2440 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2445 for(x=0; x<16; x++){
2446 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2455 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2459 for(i=0; i<size; i++)
2460 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
2464 #define WRAPPER8_16_SQ(name8, name16)\
2465 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
2467 score +=name8(s, dst , src , stride, 8);\
2468 score +=name8(s, dst+8 , src+8 , stride, 8);\
2472 score +=name8(s, dst , src , stride, 8);\
2473 score +=name8(s, dst+8 , src+8 , stride, 8);\
2478 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2479 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2480 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2482 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2484 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2485 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2486 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2487 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2489 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2490 uint32_t maxi, uint32_t maxisign)
2493 if(a > mini) return mini;
2494 else if((a^(1U<<31)) > maxisign) return maxi;
2498 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2500 uint32_t mini = *(uint32_t*)min;
2501 uint32_t maxi = *(uint32_t*)max;
2502 uint32_t maxisign = maxi ^ (1U<<31);
2503 uint32_t *dsti = (uint32_t*)dst;
2504 const uint32_t *srci = (const uint32_t*)src;
2505 for(i=0; i<len; i+=8) {
2506 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2507 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2508 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2509 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2510 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2511 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2512 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2513 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
2516 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2518 if(min < 0 && max > 0) {
2519 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2521 for(i=0; i < len; i+=8) {
2522 dst[i ] = av_clipf(src[i ], min, max);
2523 dst[i + 1] = av_clipf(src[i + 1], min, max);
2524 dst[i + 2] = av_clipf(src[i + 2], min, max);
2525 dst[i + 3] = av_clipf(src[i + 3], min, max);
2526 dst[i + 4] = av_clipf(src[i + 4], min, max);
2527 dst[i + 5] = av_clipf(src[i + 5], min, max);
2528 dst[i + 6] = av_clipf(src[i + 6], min, max);
2529 dst[i + 7] = av_clipf(src[i + 7], min, max);
2534 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2539 res += *v1++ * *v2++;
2544 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2549 *v1++ += mul * *v3++;
2554 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2555 const int16_t *window, unsigned int len)
2558 int len2 = len >> 1;
2560 for (i = 0; i < len2; i++) {
2561 int16_t w = window[i];
2562 output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2563 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
2567 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2568 int32_t max, unsigned int len)
2571 *dst++ = av_clip(*src++, min, max);
2572 *dst++ = av_clip(*src++, min, max);
2573 *dst++ = av_clip(*src++, min, max);
2574 *dst++ = av_clip(*src++, min, max);
2575 *dst++ = av_clip(*src++, min, max);
2576 *dst++ = av_clip(*src++, min, max);
2577 *dst++ = av_clip(*src++, min, max);
2578 *dst++ = av_clip(*src++, min, max);
2583 static void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2585 ff_j_rev_dct (block);
2586 put_pixels_clamped_c(block, dest, line_size);
2588 static void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2590 ff_j_rev_dct (block);
2591 add_pixels_clamped_c(block, dest, line_size);
2594 static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
2596 ff_j_rev_dct4 (block);
2597 put_pixels_clamped4_c(block, dest, line_size);
2599 static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
2601 ff_j_rev_dct4 (block);
2602 add_pixels_clamped4_c(block, dest, line_size);
2605 static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
2607 ff_j_rev_dct2 (block);
2608 put_pixels_clamped2_c(block, dest, line_size);
2610 static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
2612 ff_j_rev_dct2 (block);
2613 add_pixels_clamped2_c(block, dest, line_size);
2616 static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
2618 dest[0] = av_clip_uint8((block[0] + 4)>>3);
2620 static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
2622 dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2625 /* init static data */
2626 av_cold void ff_dsputil_static_init(void)
2630 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2631 for(i=0;i<MAX_NEG_CROP;i++) {
2633 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2636 for(i=0;i<512;i++) {
2637 ff_squareTbl[i] = (i - 256) * (i - 256);
2640 for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2643 int ff_check_alignment(void){
2644 static int did_fail=0;
2645 LOCAL_ALIGNED_16(int, aligned, [4]);
2647 if((intptr_t)aligned & 15){
2649 #if HAVE_MMX || HAVE_ALTIVEC
2650 av_log(NULL, AV_LOG_ERROR,
2651 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2652 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2653 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2654 "Do not report crashes to FFmpeg developers.\n");
2663 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2665 ff_check_alignment();
2668 if (avctx->bits_per_raw_sample == 10) {
2669 c->fdct = ff_jpeg_fdct_islow_10;
2670 c->fdct248 = ff_fdct248_islow_10;
2672 if(avctx->dct_algo==FF_DCT_FASTINT) {
2673 c->fdct = ff_fdct_ifast;
2674 c->fdct248 = ff_fdct_ifast248;
2676 else if(avctx->dct_algo==FF_DCT_FAAN) {
2677 c->fdct = ff_faandct;
2678 c->fdct248 = ff_faandct248;
2681 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2682 c->fdct248 = ff_fdct248_islow_8;
2685 #endif //CONFIG_ENCODERS
2687 if(avctx->lowres==1){
2688 c->idct_put= ff_jref_idct4_put;
2689 c->idct_add= ff_jref_idct4_add;
2690 c->idct = ff_j_rev_dct4;
2691 c->idct_permutation_type= FF_NO_IDCT_PERM;
2692 }else if(avctx->lowres==2){
2693 c->idct_put= ff_jref_idct2_put;
2694 c->idct_add= ff_jref_idct2_add;
2695 c->idct = ff_j_rev_dct2;
2696 c->idct_permutation_type= FF_NO_IDCT_PERM;
2697 }else if(avctx->lowres==3){
2698 c->idct_put= ff_jref_idct1_put;
2699 c->idct_add= ff_jref_idct1_add;
2700 c->idct = ff_j_rev_dct1;
2701 c->idct_permutation_type= FF_NO_IDCT_PERM;
2703 if (avctx->bits_per_raw_sample == 10) {
2704 c->idct_put = ff_simple_idct_put_10;
2705 c->idct_add = ff_simple_idct_add_10;
2706 c->idct = ff_simple_idct_10;
2707 c->idct_permutation_type = FF_NO_IDCT_PERM;
2709 if(avctx->idct_algo==FF_IDCT_INT){
2710 c->idct_put= ff_jref_idct_put;
2711 c->idct_add= ff_jref_idct_add;
2712 c->idct = ff_j_rev_dct;
2713 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2714 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2715 c->idct_put= ff_faanidct_put;
2716 c->idct_add= ff_faanidct_add;
2717 c->idct = ff_faanidct;
2718 c->idct_permutation_type= FF_NO_IDCT_PERM;
2719 }else{ //accurate/default
2720 c->idct_put = ff_simple_idct_put_8;
2721 c->idct_add = ff_simple_idct_add_8;
2722 c->idct = ff_simple_idct_8;
2723 c->idct_permutation_type= FF_NO_IDCT_PERM;
2728 c->diff_pixels = diff_pixels_c;
2729 c->put_pixels_clamped = put_pixels_clamped_c;
2730 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2731 c->add_pixels_clamped = add_pixels_clamped_c;
2732 c->sum_abs_dctelem = sum_abs_dctelem_c;
2735 c->pix_sum = pix_sum_c;
2736 c->pix_norm1 = pix_norm1_c;
2738 c->fill_block_tab[0] = fill_block16_c;
2739 c->fill_block_tab[1] = fill_block8_c;
2741 /* TODO [0] 16 [1] 8 */
2742 c->pix_abs[0][0] = pix_abs16_c;
2743 c->pix_abs[0][1] = pix_abs16_x2_c;
2744 c->pix_abs[0][2] = pix_abs16_y2_c;
2745 c->pix_abs[0][3] = pix_abs16_xy2_c;
2746 c->pix_abs[1][0] = pix_abs8_c;
2747 c->pix_abs[1][1] = pix_abs8_x2_c;
2748 c->pix_abs[1][2] = pix_abs8_y2_c;
2749 c->pix_abs[1][3] = pix_abs8_xy2_c;
2751 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2752 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2753 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2754 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2755 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2756 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2757 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2758 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2759 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2761 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2762 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2763 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2764 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2765 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2766 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2767 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2768 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2769 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2771 #define dspfunc(PFX, IDX, NUM) \
2772 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2773 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2774 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2775 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2776 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2777 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2778 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2779 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2780 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2781 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2782 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2783 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2784 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2785 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2786 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2787 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2789 dspfunc(put_qpel, 0, 16);
2790 dspfunc(put_no_rnd_qpel, 0, 16);
2792 dspfunc(avg_qpel, 0, 16);
2793 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2795 dspfunc(put_qpel, 1, 8);
2796 dspfunc(put_no_rnd_qpel, 1, 8);
2798 dspfunc(avg_qpel, 1, 8);
2799 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2803 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2804 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2805 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2806 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2807 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2808 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2809 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2810 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2812 #define SET_CMP_FUNC(name) \
2813 c->name[0]= name ## 16_c;\
2814 c->name[1]= name ## 8x8_c;
2816 SET_CMP_FUNC(hadamard8_diff)
2817 c->hadamard8_diff[4]= hadamard8_intra16_c;
2818 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2819 SET_CMP_FUNC(dct_sad)
2820 SET_CMP_FUNC(dct_max)
2822 SET_CMP_FUNC(dct264_sad)
2824 c->sad[0]= pix_abs16_c;
2825 c->sad[1]= pix_abs8_c;
2829 SET_CMP_FUNC(quant_psnr)
2832 c->vsad[0]= vsad16_c;
2833 c->vsad[4]= vsad_intra16_c;
2834 c->vsad[5]= vsad_intra8_c;
2835 c->vsse[0]= vsse16_c;
2836 c->vsse[4]= vsse_intra16_c;
2837 c->vsse[5]= vsse_intra8_c;
2838 c->nsse[0]= nsse16_c;
2839 c->nsse[1]= nsse8_c;
2840 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2841 ff_dsputil_init_dwt(c);
2844 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2846 c->add_bytes= add_bytes_c;
2847 c->diff_bytes= diff_bytes_c;
2848 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2849 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2850 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2851 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2852 c->bswap_buf= bswap_buf;
2853 c->bswap16_buf = bswap16_buf;
2855 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2856 c->h263_h_loop_filter= h263_h_loop_filter_c;
2857 c->h263_v_loop_filter= h263_v_loop_filter_c;
2860 c->h261_loop_filter= h261_loop_filter_c;
2862 c->try_8x8basis= try_8x8basis_c;
2863 c->add_8x8basis= add_8x8basis_c;
2865 c->vector_clipf = vector_clipf_c;
2866 c->scalarproduct_int16 = scalarproduct_int16_c;
2867 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2868 c->apply_window_int16 = apply_window_int16_c;
2869 c->vector_clip_int32 = vector_clip_int32_c;
2871 c->shrink[0]= av_image_copy_plane;
2872 c->shrink[1]= ff_shrink22;
2873 c->shrink[2]= ff_shrink44;
2874 c->shrink[3]= ff_shrink88;
2876 c->add_pixels8 = add_pixels8_c;
2878 #define hpel_funcs(prefix, idx, num) \
2879 c->prefix ## _pixels_tab idx [0] = prefix ## _pixels ## num ## _8_c; \
2880 c->prefix ## _pixels_tab idx [1] = prefix ## _pixels ## num ## _x2_8_c; \
2881 c->prefix ## _pixels_tab idx [2] = prefix ## _pixels ## num ## _y2_8_c; \
2882 c->prefix ## _pixels_tab idx [3] = prefix ## _pixels ## num ## _xy2_8_c
2884 hpel_funcs(put, [0], 16);
2885 hpel_funcs(put, [1], 8);
2886 hpel_funcs(put, [2], 4);
2887 hpel_funcs(put, [3], 2);
2888 hpel_funcs(put_no_rnd, [0], 16);
2889 hpel_funcs(put_no_rnd, [1], 8);
2890 hpel_funcs(avg, [0], 16);
2891 hpel_funcs(avg, [1], 8);
2892 hpel_funcs(avg, [2], 4);
2893 hpel_funcs(avg, [3], 2);
2894 hpel_funcs(avg_no_rnd,, 16);
2898 #define FUNC(f, depth) f ## _ ## depth
2899 #define FUNCC(f, depth) f ## _ ## depth ## _c
2901 #define BIT_DEPTH_FUNCS(depth, dct)\
2902 c->get_pixels = FUNCC(get_pixels ## dct , depth);\
2903 c->draw_edges = FUNCC(draw_edges , depth);\
2904 c->clear_block = FUNCC(clear_block ## dct , depth);\
2905 c->clear_blocks = FUNCC(clear_blocks ## dct , depth)
2907 switch (avctx->bits_per_raw_sample) {
2909 if (c->dct_bits == 32) {
2910 BIT_DEPTH_FUNCS(9, _32);
2912 BIT_DEPTH_FUNCS(9, _16);
2916 if (c->dct_bits == 32) {
2917 BIT_DEPTH_FUNCS(10, _32);
2919 BIT_DEPTH_FUNCS(10, _16);
2923 if (c->dct_bits == 32) {
2924 BIT_DEPTH_FUNCS(12, _32);
2926 BIT_DEPTH_FUNCS(12, _16);
2930 if (c->dct_bits == 32) {
2931 BIT_DEPTH_FUNCS(14, _32);
2933 BIT_DEPTH_FUNCS(14, _16);
2937 if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
2938 BIT_DEPTH_FUNCS(8, _16);
2944 if (HAVE_MMX) ff_dsputil_init_mmx (c, avctx);
2945 if (ARCH_ARM) ff_dsputil_init_arm (c, avctx);
2946 if (HAVE_VIS) ff_dsputil_init_vis (c, avctx);
2947 if (ARCH_ALPHA) ff_dsputil_init_alpha (c, avctx);
2948 if (ARCH_PPC) ff_dsputil_init_ppc (c, avctx);
2949 if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
2950 if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
2952 ff_init_scantable_permutation(c->idct_permutation,
2953 c->idct_permutation_type);
2956 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2958 ff_dsputil_init(c, avctx);