3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "libavutil/internal.h"
34 #include "copy_block.h"
37 #include "simple_idct.h"
40 #include "imgconvert.h"
42 #include "mpegvideo.h"
/* Square lookup table. Code below indexes it as ff_squareTbl + 256 with
 * signed byte differences in [-255, 255], so entry 256+x presumably holds
 * x*x; zeroed here and filled at init time — TODO confirm in the DSP init
 * code (not visible in this chunk). */
uint32_t ff_squareTbl[512] = {0, };
48 #include "dsputil_template.c"
52 #include "dsputil_template.c"
/* Replicate one byte across every byte of the native word:
 * 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f, depending on sizeof(unsigned long).
 * ~0UL/255 expands to 0x0101...01, so multiplying by a byte value
 * broadcasts it into every lane. */
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
/* Specific zigzag scan for 248 idct. NOTE that unlike the specification,
 * we interleave the fields. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* Not permuted inverse zigzag_direct + 1, for the MMX quantizer.
 * NOTE(review): only declared here; presumably filled during DSP init —
 * verify against the init code (not visible in this chunk). */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (MPEG-2 style). */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (MPEG-2 style). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Per-row column order used by the FF_SSE2_IDCT_PERM case below:
 * interleaves columns 0-3 with columns 4-7 within each row. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
110 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
111 const uint8_t *src_scantable)
116 st->scantable= src_scantable;
120 j = src_scantable[i];
121 st->permutated[i] = permutation[j];
127 j = st->permutated[i];
129 st->raster_end[i]= end;
133 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
134 int idct_permutation_type)
138 switch(idct_permutation_type){
139 case FF_NO_IDCT_PERM:
141 idct_permutation[i]= i;
143 case FF_LIBMPEG2_IDCT_PERM:
145 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
147 case FF_SIMPLE_IDCT_PERM:
149 idct_permutation[i]= simple_mmx_permutation[i];
151 case FF_TRANSPOSE_IDCT_PERM:
153 idct_permutation[i]= ((i&7)<<3) | (i>>3);
155 case FF_PARTTRANS_IDCT_PERM:
157 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
159 case FF_SSE2_IDCT_PERM:
161 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
164 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/**
 * Sum all 256 pixels of a 16x16 block.
 *
 * @param pix       top-left pixel of the block
 * @param line_size stride between rows in bytes
 * @return sum of the pixel values (max 255*256, fits in int)
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j];
        pix += line_size;
    }
    return s;
}
/**
 * Sum of squares of all 256 pixels of a 16x16 block.
 *
 * Computes x*x directly instead of reading ff_squareTbl through casted
 * 32/64-bit loads: equivalent result, no strict-aliasing/alignment UB,
 * and no dependency on the table being initialized.
 *
 * @param pix       top-left pixel of the block
 * @param line_size stride between rows in bytes
 * @return sum of squared pixel values
 */
static int pix_norm1_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j] * pix[j];
        pix += line_size;
    }
    return s;
}
238 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
241 for(i=0; i+8<=w; i+=8){
242 dst[i+0]= av_bswap32(src[i+0]);
243 dst[i+1]= av_bswap32(src[i+1]);
244 dst[i+2]= av_bswap32(src[i+2]);
245 dst[i+3]= av_bswap32(src[i+3]);
246 dst[i+4]= av_bswap32(src[i+4]);
247 dst[i+5]= av_bswap32(src[i+5]);
248 dst[i+6]= av_bswap32(src[i+6]);
249 dst[i+7]= av_bswap32(src[i+7]);
252 dst[i+0]= av_bswap32(src[i+0]);
256 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
259 *dst++ = av_bswap16(*src++);
/**
 * Sum of squared errors over a 4-pixel-wide block.
 *
 * Uses d*d directly instead of the ff_squareTbl lookup (identical result,
 * no dependency on table initialization).
 *
 * @param v         unused context pointer (me_cmp function signature)
 * @param line_size stride for both inputs
 * @param h         number of rows
 */
static int sse4_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 4; j++) {
            int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * Sum of squared errors over an 8-pixel-wide block.
 * See sse4_c for parameter semantics; d*d replaces the table lookup.
 */
static int sse8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++) {
            int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * Sum of squared errors over a 16-pixel-wide block.
 * See sse4_c for parameter semantics; d*d replaces the table lookup.
 */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++) {
            int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * Store the per-pixel difference of two 8x8 blocks as 8x8 int16 DCT input.
 *
 * @param block  output, 64 contiguous coefficients (row-major)
 * @param s1     first source block
 * @param s2     second source block
 * @param stride row stride of both sources
 */
static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            block[j] = s1[j] - s2[j];
        block += 8;
        s1    += stride;
        s2    += stride;
    }
}
351 static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
356 /* read the pixels */
358 pixels[0] = av_clip_uint8(block[0]);
359 pixels[1] = av_clip_uint8(block[1]);
360 pixels[2] = av_clip_uint8(block[2]);
361 pixels[3] = av_clip_uint8(block[3]);
362 pixels[4] = av_clip_uint8(block[4]);
363 pixels[5] = av_clip_uint8(block[5]);
364 pixels[6] = av_clip_uint8(block[6]);
365 pixels[7] = av_clip_uint8(block[7]);
/**
 * Store an 8x8 block of signed int16 coefficients as pixels, offset by
 * +128 and clamped to [0, 255] (values < -128 -> 0, > 127 -> 255).
 *
 * @param block     64 contiguous coefficients (row-major)
 * @param pixels    destination, line_size stride
 * @param line_size destination row stride
 */
static void put_signed_pixels_clamped_c(const int16_t *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}
/**
 * Add an 8x8 block of int16 values to 8x8 pixels in place.
 * NOTE: no clamping — results wrap modulo 256 (uint8_t arithmetic).
 *
 * @param pixels    destination/accumulator, line_size stride
 * @param block     64 contiguous values to add (row-major)
 * @param line_size destination row stride
 */
static void add_pixels8_c(uint8_t *restrict pixels, int16_t *block,
                          int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            pixels[j] += block[j];
        pixels += line_size;
        block  += 8;
    }
}
413 static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
418 /* read the pixels */
420 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
421 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
422 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
423 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
424 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
425 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
426 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
427 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
433 static int sum_abs_dctelem_c(int16_t *block)
437 sum+= FFABS(block[i]);
/**
 * Fill a 16-pixel-wide block of h rows with a constant value.
 *
 * @param block     top-left of destination
 * @param value     byte value to store
 * @param line_size destination row stride
 * @param h         number of rows
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/**
 * Fill an 8-pixel-wide block of h rows with a constant value.
 * See fill_block16_c for parameter semantics.
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* Rounded averages. Arguments are fully parenthesized so operators of any
 * precedence (e.g. ?:, |) can be passed safely — the originals expanded
 * unparenthesized arguments, a classic macro precedence hazard. */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/**
 * GMC with one motion vector: bilinear interpolation of an 8-pixel-wide
 * block at 1/16-pel position (x16, y16).
 *
 * @param x16, y16 fractional position in 1/16 pel, 0..16
 * @param rounder  rounding constant added before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h,
                   int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = x16        * (16 - y16);
    const int C = (16 - x16) * y16;
    const int D = x16        * y16;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[stride + j] + D * src[stride + j + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
487 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
488 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
491 const int s= 1<<shift;
501 for(x=0; x<8; x++){ //XXX FIXME optimize
502 int src_x, src_y, frac_x, frac_y, index;
511 if((unsigned)src_x < width){
512 if((unsigned)src_y < height){
513 index= src_x + src_y*stride;
514 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
515 + src[index +1]* frac_x )*(s-frac_y)
516 + ( src[index+stride ]*(s-frac_x)
517 + src[index+stride+1]* frac_x )* frac_y
520 index= src_x + av_clip(src_y, 0, height)*stride;
521 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
522 + src[index +1]* frac_x )*s
526 if((unsigned)src_y < height){
527 index= av_clip(src_x, 0, width) + src_y*stride;
528 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
529 + src[index+stride ]* frac_y )*s
532 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
533 dst[y*stride + x]= src[index ];
545 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
547 case 2: put_pixels2_8_c (dst, src, stride, height); break;
548 case 4: put_pixels4_8_c (dst, src, stride, height); break;
549 case 8: put_pixels8_8_c (dst, src, stride, height); break;
550 case 16:put_pixels16_8_c(dst, src, stride, height); break;
/* 1/3-pel horizontal interpolation: dst = round((2*a + b) / 3).
 * 683 = round(2048/3), so the multiply+shift is a rounded divide by 3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683 * (2 * src[j] + src[j + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
/* 2/3-pel horizontal interpolation: dst = round((a + 2*b) / 3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683 * (src[j] + 2 * src[j + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
/* 1/3-pel vertical interpolation: dst = round((2*top + bottom) / 3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683 * (2 * src[j] + src[j + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
/* (1/3, 1/3)-pel 2-D interpolation with weights 4/3/3/2 (sum 12);
 * 2731 = round(32768/12) gives a rounded divide by 12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (2731 * (4 * src[j] + 3 * src[j + 1] +
                              3 * src[j + stride] + 2 * src[j + stride + 1] +
                              6)) >> 15;
        src += stride;
        dst += stride;
    }
}
/* (1/3, 2/3)-pel 2-D interpolation with weights 3/2/4/3 (sum 12). */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (2731 * (3 * src[j] + 2 * src[j + 1] +
                              4 * src[j + stride] + 3 * src[j + stride + 1] +
                              6)) >> 15;
        src += stride;
        dst += stride;
    }
}
/* 2/3-pel vertical interpolation: dst = round((top + 2*bottom) / 3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683 * (src[j] + 2 * src[j + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
/* (2/3, 1/3)-pel 2-D interpolation with weights 3/4/2/3 (sum 12). */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (2731 * (3 * src[j] + 4 * src[j + 1] +
                              2 * src[j + stride] + 3 * src[j + stride + 1] +
                              6)) >> 15;
        src += stride;
        dst += stride;
    }
}
/* (2/3, 2/3)-pel 2-D interpolation with weights 2/3/3/4 (sum 12). */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (2731 * (2 * src[j] + 3 * src[j + 1] +
                              3 * src[j + stride] + 4 * src[j + stride + 1] +
                              6)) >> 15;
        src += stride;
        dst += stride;
    }
}
642 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
644 case 2: avg_pixels2_8_c (dst, src, stride, height); break;
645 case 4: avg_pixels4_8_c (dst, src, stride, height); break;
646 case 8: avg_pixels8_8_c (dst, src, stride, height); break;
647 case 16:avg_pixels16_8_c(dst, src, stride, height); break;
/* 1/3-pel horizontal interpolation, rounded-averaged with dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((683 * (2 * src[j] + src[j + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* 2/3-pel horizontal interpolation, rounded-averaged with dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((683 * (src[j] + 2 * src[j + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* 1/3-pel vertical interpolation, rounded-averaged with dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((683 * (2 * src[j] + src[j + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* (1/3, 1/3)-pel 2-D interpolation (weights 4/3/3/2), averaged with dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((2731 * (4 * src[j] + 3 * src[j + 1] +
                                3 * src[j + stride] + 2 * src[j + stride + 1] +
                                6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* (1/3, 2/3)-pel 2-D interpolation (weights 3/2/4/3), averaged with dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((2731 * (3 * src[j] + 2 * src[j + 1] +
                                4 * src[j + stride] + 3 * src[j + stride + 1] +
                                6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* 2/3-pel vertical interpolation, rounded-averaged with dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((683 * (src[j] + 2 * src[j + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* (2/3, 1/3)-pel 2-D interpolation (weights 3/4/2/3), averaged with dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((2731 * (3 * src[j] + 4 * src[j + 1] +
                                2 * src[j + stride] + 3 * src[j + stride + 1] +
                                6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* (2/3, 2/3)-pel 2-D interpolation (weights 2/3/3/4), averaged with dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src,
                                          int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] +
                      ((2731 * (2 * src[j] + 3 * src[j + 1] +
                                3 * src[j + stride] + 4 * src[j + stride + 1] +
                                6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
739 #define QPEL_MC(r, OPNAME, RND, OP) \
740 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
741 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
745 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
746 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
747 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
748 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
749 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
750 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
751 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
752 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
758 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
760 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
764 const int src0= src[0*srcStride];\
765 const int src1= src[1*srcStride];\
766 const int src2= src[2*srcStride];\
767 const int src3= src[3*srcStride];\
768 const int src4= src[4*srcStride];\
769 const int src5= src[5*srcStride];\
770 const int src6= src[6*srcStride];\
771 const int src7= src[7*srcStride];\
772 const int src8= src[8*srcStride];\
773 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
774 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
775 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
776 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
777 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
778 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
779 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
780 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
786 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
787 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
792 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
793 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
794 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
795 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
796 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
797 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
798 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
799 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
800 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
801 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
802 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
803 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
804 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
805 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
806 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
807 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
813 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
814 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
819 const int src0= src[0*srcStride];\
820 const int src1= src[1*srcStride];\
821 const int src2= src[2*srcStride];\
822 const int src3= src[3*srcStride];\
823 const int src4= src[4*srcStride];\
824 const int src5= src[5*srcStride];\
825 const int src6= src[6*srcStride];\
826 const int src7= src[7*srcStride];\
827 const int src8= src[8*srcStride];\
828 const int src9= src[9*srcStride];\
829 const int src10= src[10*srcStride];\
830 const int src11= src[11*srcStride];\
831 const int src12= src[12*srcStride];\
832 const int src13= src[13*srcStride];\
833 const int src14= src[14*srcStride];\
834 const int src15= src[15*srcStride];\
835 const int src16= src[16*srcStride];\
836 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
837 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
838 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
839 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
840 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
841 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
842 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
843 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
844 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
845 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
846 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
847 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
848 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
849 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
850 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
851 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
857 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
860 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
861 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
864 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
866 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
869 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
872 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
873 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
876 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
880 copy_block9(full, src, 16, stride, 9);\
881 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
882 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
885 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
888 copy_block9(full, src, 16, stride, 9);\
889 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
892 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
896 copy_block9(full, src, 16, stride, 9);\
897 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
898 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
900 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
906 copy_block9(full, src, 16, stride, 9);\
907 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
908 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
909 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
910 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
912 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
917 copy_block9(full, src, 16, stride, 9);\
918 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
919 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
920 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
921 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
923 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
929 copy_block9(full, src, 16, stride, 9);\
930 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
931 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
932 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
933 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
935 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
940 copy_block9(full, src, 16, stride, 9);\
941 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
942 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
943 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
944 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
946 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
952 copy_block9(full, src, 16, stride, 9);\
953 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
954 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
955 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
956 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
958 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
963 copy_block9(full, src, 16, stride, 9);\
964 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
965 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
966 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
967 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
969 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
975 copy_block9(full, src, 16, stride, 9);\
976 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
977 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
978 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
979 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
981 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
986 copy_block9(full, src, 16, stride, 9);\
987 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
988 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
989 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
990 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
992 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
997 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
998 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1000 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1003 uint8_t halfHV[64];\
1004 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1005 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1006 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1008 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1010 uint8_t full[16*9];\
1013 uint8_t halfHV[64];\
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1020 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1022 uint8_t full[16*9];\
1024 copy_block9(full, src, 16, stride, 9);\
1025 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1026 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1027 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1029 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1031 uint8_t full[16*9];\
1034 uint8_t halfHV[64];\
1035 copy_block9(full, src, 16, stride, 9);\
1036 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1038 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1041 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1043 uint8_t full[16*9];\
1045 copy_block9(full, src, 16, stride, 9);\
1046 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1047 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1048 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1050 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1053 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1054 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1057 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1060 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1061 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1064 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1066 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1069 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1072 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1073 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1076 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1078 uint8_t full[24*17];\
1080 copy_block17(full, src, 24, stride, 17);\
1081 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1082 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1085 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1087 uint8_t full[24*17];\
1088 copy_block17(full, src, 24, stride, 17);\
1089 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1092 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1094 uint8_t full[24*17];\
1096 copy_block17(full, src, 24, stride, 17);\
1097 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1098 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1100 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1102 uint8_t full[24*17];\
1103 uint8_t halfH[272];\
1104 uint8_t halfV[256];\
1105 uint8_t halfHV[256];\
1106 copy_block17(full, src, 24, stride, 17);\
1107 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1108 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1109 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1110 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1112 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1114 uint8_t full[24*17];\
1115 uint8_t halfH[272];\
1116 uint8_t halfHV[256];\
1117 copy_block17(full, src, 24, stride, 17);\
1118 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1119 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1120 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1121 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1123 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1125 uint8_t full[24*17];\
1126 uint8_t halfH[272];\
1127 uint8_t halfV[256];\
1128 uint8_t halfHV[256];\
1129 copy_block17(full, src, 24, stride, 17);\
1130 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1131 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1132 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1133 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1135 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1137 uint8_t full[24*17];\
1138 uint8_t halfH[272];\
1139 uint8_t halfHV[256];\
1140 copy_block17(full, src, 24, stride, 17);\
1141 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1142 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1143 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1144 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1146 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1148 uint8_t full[24*17];\
1149 uint8_t halfH[272];\
1150 uint8_t halfV[256];\
1151 uint8_t halfHV[256];\
1152 copy_block17(full, src, 24, stride, 17);\
1153 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1154 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1155 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1156 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1158 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1160 uint8_t full[24*17];\
1161 uint8_t halfH[272];\
1162 uint8_t halfHV[256];\
1163 copy_block17(full, src, 24, stride, 17);\
1164 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1165 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1166 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1167 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1169 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1171 uint8_t full[24*17];\
1172 uint8_t halfH[272];\
1173 uint8_t halfV[256];\
1174 uint8_t halfHV[256];\
1175 copy_block17(full, src, 24, stride, 17);\
1176 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1177 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1178 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1179 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1181 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1183 uint8_t full[24*17];\
1184 uint8_t halfH[272];\
1185 uint8_t halfHV[256];\
1186 copy_block17(full, src, 24, stride, 17);\
1187 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1188 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1189 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1190 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1192 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1194 uint8_t halfH[272];\
1195 uint8_t halfHV[256];\
1196 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1197 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1198 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1200 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1202 uint8_t halfH[272];\
1203 uint8_t halfHV[256];\
1204 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1205 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1206 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1208 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1210 uint8_t full[24*17];\
1211 uint8_t halfH[272];\
1212 uint8_t halfV[256];\
1213 uint8_t halfHV[256];\
1214 copy_block17(full, src, 24, stride, 17);\
1215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1217 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1220 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1222 uint8_t full[24*17];\
1223 uint8_t halfH[272];\
1224 copy_block17(full, src, 24, stride, 17);\
1225 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1226 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1227 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1229 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1231 uint8_t full[24*17];\
1232 uint8_t halfH[272];\
1233 uint8_t halfV[256];\
1234 uint8_t halfHV[256];\
1235 copy_block17(full, src, 24, stride, 17);\
1236 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1237 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1238 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1239 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1241 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1243 uint8_t full[24*17];\
1244 uint8_t halfH[272];\
1245 copy_block17(full, src, 24, stride, 17);\
1246 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1247 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1248 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1250 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1252 uint8_t halfH[272];\
1253 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1254 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1257 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1258 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1259 #define op_put(a, b) a = cm[((b) + 16)>>5]
1260 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1262 QPEL_MC(0, put_ , _ , op_put)
1263 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1264 QPEL_MC(0, avg_ , _ , op_avg)
1265 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1267 #undef op_avg_no_rnd
1269 #undef op_put_no_rnd
/* 8x8 block copy with the fixed-size signature used by function tables. */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}
/* 8x8 block average (rounded) with the fixed-size table signature. */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}
/* 16x16 block copy with the fixed-size table signature. */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}
/* 16x16 block average (rounded) with the fixed-size table signature. */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
/* The _mc00 (full-pel) qpel cases are plain copies/averages. */
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1295 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1296 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1300 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1301 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1302 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1303 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1304 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1305 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1306 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1307 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
#if CONFIG_RV40_DECODER
/* RV40 (3,3) luma MC degenerates to the plain xy2 half-pel average. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1332 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1333 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1337 const int src_1= src[ -srcStride];
1338 const int src0 = src[0 ];
1339 const int src1 = src[ srcStride];
1340 const int src2 = src[2*srcStride];
1341 const int src3 = src[3*srcStride];
1342 const int src4 = src[4*srcStride];
1343 const int src5 = src[5*srcStride];
1344 const int src6 = src[6*srcStride];
1345 const int src7 = src[7*srcStride];
1346 const int src8 = src[8*srcStride];
1347 const int src9 = src[9*srcStride];
1348 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1349 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1350 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1351 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1352 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1353 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1354 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1355 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel (1,0): average of source and horizontal half-pel filter. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}
/* WMV2 mspel (2,0): pure horizontal half-pel filter straight into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/* WMV2 mspel (3,0): average of src+1 and the horizontal half-pel filter. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
}
/* WMV2 mspel (0,2): pure vertical half-pel filter straight into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/* WMV2 mspel (1,2): average of the vertical filter and the H-then-V
 * filtered block.  halfH holds 11 filtered rows (8x11) so the vertical
 * pass has its one-line top margin. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 mspel (3,2): like mc12 but the vertical-only pass starts at src+1. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 mspel (2,2): horizontal filter (with top margin) then vertical. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];

    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
1412 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1413 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1415 const int strength= ff_h263_loop_filter_strength[qscale];
1419 int p0= src[x-2*stride];
1420 int p1= src[x-1*stride];
1421 int p2= src[x+0*stride];
1422 int p3= src[x+1*stride];
1423 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1425 if (d<-2*strength) d1= 0;
1426 else if(d<- strength) d1=-2*strength - d;
1427 else if(d< strength) d1= d;
1428 else if(d< 2*strength) d1= 2*strength - d;
1433 if(p1&256) p1= ~(p1>>31);
1434 if(p2&256) p2= ~(p2>>31);
1436 src[x-1*stride] = p1;
1437 src[x+0*stride] = p2;
1441 d2= av_clip((p0-p3)/4, -ad1, ad1);
1443 src[x-2*stride] = p0 - d2;
1444 src[x+ stride] = p3 + d2;
1449 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1450 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1452 const int strength= ff_h263_loop_filter_strength[qscale];
1456 int p0= src[y*stride-2];
1457 int p1= src[y*stride-1];
1458 int p2= src[y*stride+0];
1459 int p3= src[y*stride+1];
1460 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1462 if (d<-2*strength) d1= 0;
1463 else if(d<- strength) d1=-2*strength - d;
1464 else if(d< strength) d1= d;
1465 else if(d< 2*strength) d1= 2*strength - d;
1470 if(p1&256) p1= ~(p1>>31);
1471 if(p2&256) p2= ~(p2>>31);
1473 src[y*stride-1] = p1;
1474 src[y*stride+0] = p2;
1478 d2= av_clip((p0-p3)/4, -ad1, ad1);
1480 src[y*stride-2] = p0 - d2;
1481 src[y*stride+1] = p3 + d2;
/* 16-wide SAD: sum of absolute differences over h rows of 16 pixels.
 * The unused first argument keeps the me_cmp_func table signature. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0; i<h; i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* 16-wide SAD against the horizontal half-pel interpolation of pix2
 * (rounded average of each pixel and its right neighbour). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0; i<h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* 16-wide SAD against the vertical half-pel interpolation of pix2
 * (rounded average of each pixel and the one below it). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0; i<h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* 16-wide SAD against the diagonal half-pel interpolation of pix2
 * (rounded average of the 2x2 neighbourhood). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0; i<h; i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* 8-wide SAD over h rows; signature matches the me_cmp_func tables. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0; i<h; i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* 8-wide SAD against the horizontal half-pel interpolation of pix2. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0; i<h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* 8-wide SAD against the vertical half-pel interpolation of pix2. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0; i<h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* 8-wide SAD against the diagonal half-pel interpolation of pix2. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0; i<h; i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
1686 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1687 MpegEncContext *c = v;
1693 for(x=0; x<16; x++){
1694 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1697 for(x=0; x<15; x++){
1698 score2+= FFABS( s1[x ] - s1[x +stride]
1699 - s1[x+1] + s1[x+1+stride])
1700 -FFABS( s2[x ] - s2[x +stride]
1701 - s2[x+1] + s2[x+1+stride]);
1708 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1709 else return score1 + FFABS(score2)*8;
1712 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1713 MpegEncContext *c = v;
1720 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1724 score2+= FFABS( s1[x ] - s1[x +stride]
1725 - s1[x+1] + s1[x+1+stride])
1726 -FFABS( s2[x ] - s2[x +stride]
1727 - s2[x+1] + s2[x+1+stride]);
1734 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1735 else return score1 + FFABS(score2)*8;
1738 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1742 for(i=0; i<8*8; i++){
1743 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1746 assert(-512<b && b<512);
1748 sum += (w*b)*(w*b)>>4;
1753 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1756 for(i=0; i<8*8; i++){
1757 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1761 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fill the six-entry cmp[] table (16x16, 8x8, ... block sizes) with the
 * comparison functions selected by 'type' from the DSPContext.
 * NOTE(review): this extract is heavily truncated — the surrounding switch
 * statement, most of its cases, loop headers and braces are missing from
 * the dump; only representative case bodies survive below. */
1765 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
/* clear all six block-size slots before populating them */
1768     memset(cmp, 0, sizeof(void*)*6);
1776             cmp[i]= c->hadamard8_diff[i];
1782             cmp[i]= c->dct_sad[i];
1785             cmp[i]= c->dct264_sad[i];
1788             cmp[i]= c->dct_max[i];
1791             cmp[i]= c->quant_psnr[i];
/* fallback when 'type' matches no known comparison function */
1812                 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1817 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1819 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1820 long a = *(long*)(src+i);
1821 long b = *(long*)(dst+i);
1822 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1825 dst[i+0] += src[i+0];
1828 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1830 #if !HAVE_FAST_UNALIGNED
1831 if((long)src2 & (sizeof(long)-1)){
1832 for(i=0; i+7<w; i+=8){
1833 dst[i+0] = src1[i+0]-src2[i+0];
1834 dst[i+1] = src1[i+1]-src2[i+1];
1835 dst[i+2] = src1[i+2]-src2[i+2];
1836 dst[i+3] = src1[i+3]-src2[i+3];
1837 dst[i+4] = src1[i+4]-src2[i+4];
1838 dst[i+5] = src1[i+5]-src2[i+5];
1839 dst[i+6] = src1[i+6]-src2[i+6];
1840 dst[i+7] = src1[i+7]-src2[i+7];
1844 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1845 long a = *(long*)(src1+i);
1846 long b = *(long*)(src2+i);
1847 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1850 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction decode: reconstruct each byte as
 * median(left, top, left+top-topleft) + residual, tracking the running
 * left/topleft values across calls via *left / *left_top. */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l  = *left;
    lt = *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left     = l;
    *left_top = lt;
}
/* HuffYUV median prediction encode: emit the residual of src2 against the
 * median predictor built from src1 (the previous row) and the running
 * left/topleft state. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l  = *left;
    lt = *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l = src2[i];
        dst[i]= l - pred;
    }

    *left     = l;
    *left_top = lt;
}
/* HuffYUV left prediction decode: running byte-wise prefix sum of src into
 * dst, starting from acc; the main loop is unrolled two-at-a-time with a
 * scalar tail.  Returns the final accumulator for the next call. */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}
1918 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers: BUTTERFLY2 writes sum/difference of two
 * inputs to two outputs, BUTTERFLY1 does it in place, and BUTTERFLYA
 * returns |x+y| + |x-y| for the final accumulation stage. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of (src - dst), then sum of absolute
 * transform coefficients.  Rows are transformed first, then columns, with
 * the last column stage folded into the BUTTERFLYA accumulation. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but on src alone
 * (dummy is unused), with the DC term subtracted at the end so the score
 * reflects AC energy only. */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2056 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2057 MpegEncContext * const s= (MpegEncContext *)c;
2058 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2062 s->dsp.diff_pixels(temp, src1, src2, stride);
2064 return s->dsp.sum_abs_dctelem(temp);
/* One-dimensional 8-point H.264-style integer DCT butterfly, parameterized
 * over SRC()/DST() macros so it can run over rows or columns. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
2095 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2096 MpegEncContext * const s= (MpegEncContext *)c;
2101 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2103 #define SRC(x) dct[i][x]
2104 #define DST(x,v) dct[i][x]= v
2105 for( i = 0; i < 8; i++ )
2110 #define SRC(x) dct[x][i]
2111 #define DST(x,v) sum += FFABS(v)
2112 for( i = 0; i < 8; i++ )
2120 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2121 MpegEncContext * const s= (MpegEncContext *)c;
2122 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2127 s->dsp.diff_pixels(temp, src1, src2, stride);
2131 sum= FFMAX(sum, FFABS(temp[i]));
2136 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2137 MpegEncContext * const s= (MpegEncContext *)c;
2138 LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2139 int16_t * const bak = temp+64;
2145 s->dsp.diff_pixels(temp, src1, src2, stride);
2147 memcpy(bak, temp, 64*sizeof(int16_t));
2149 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2150 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2151 ff_simple_idct_8(temp); //FIXME
2154 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2159 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2160 MpegEncContext * const s= (MpegEncContext *)c;
2161 const uint8_t *scantable= s->intra_scantable.permutated;
2162 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2163 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2164 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2165 int i, last, run, bits, level, distortion, start_i;
2166 const int esc_length= s->ac_esc_length;
2168 uint8_t * last_length;
2172 copy_block8(lsrc1, src1, 8, stride, 8);
2173 copy_block8(lsrc2, src2, 8, stride, 8);
2175 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2177 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2183 length = s->intra_ac_vlc_length;
2184 last_length= s->intra_ac_vlc_last_length;
2185 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2188 length = s->inter_ac_vlc_length;
2189 last_length= s->inter_ac_vlc_last_length;
2194 for(i=start_i; i<last; i++){
2195 int j= scantable[i];
2200 if((level&(~127)) == 0){
2201 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2210 level= temp[i] + 64;
2214 if((level&(~127)) == 0){
2215 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2223 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2225 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2228 s->dsp.idct_add(lsrc2, 8, temp);
2230 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2232 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2235 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2236 MpegEncContext * const s= (MpegEncContext *)c;
2237 const uint8_t *scantable= s->intra_scantable.permutated;
2238 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2239 int i, last, run, bits, level, start_i;
2240 const int esc_length= s->ac_esc_length;
2242 uint8_t * last_length;
2246 s->dsp.diff_pixels(temp, src1, src2, stride);
2248 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2254 length = s->intra_ac_vlc_length;
2255 last_length= s->intra_ac_vlc_last_length;
2256 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2259 length = s->inter_ac_vlc_length;
2260 last_length= s->inter_ac_vlc_last_length;
2265 for(i=start_i; i<last; i++){
2266 int j= scantable[i];
2271 if((level&(~127)) == 0){
2272 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2281 level= temp[i] + 64;
2285 if((level&(~127)) == 0){
2286 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2294 #define VSAD_INTRA(size) \
2295 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2299 for(y=1; y<h; y++){ \
2300 for(x=0; x<size; x+=4){ \
2301 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2302 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* Inter vertical-SAD: vertical activity of the difference s1 - s2. */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
2327 #define SQ(a) ((a)*(a))
2328 #define VSSE_INTRA(size) \
2329 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2333 for(y=1; y<h; y++){ \
2334 for(x=0; x<size; x+=4){ \
2335 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2336 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* Inter vertical-SSE: squared vertical gradient energy of s1 - s2. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/* Sum of squared differences between an int8 and an int16 vector of the
 * given length. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int score = 0, i;

    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);

    return score;
}
/* Build a 16x16 (or 16x8) comparison function from an 8x8 one by summing
 * the scores of the four (or two) 8x8 quadrants. */
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst           , src           , stride, 8);\
    score +=name8(s, dst+8         , src+8         , stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst           , src           , stride, 8);\
        score +=name8(s, dst+8         , src+8         , stride, 8);\
    }\
    return score;\
}
2384 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2385 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2386 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2388 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2390 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2391 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2392 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2393 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Clip one float (as its IEEE-754 bit pattern) to [min, max] when min < 0
 * and max > 0: mini holds the bits of the (negative) minimum, so unsigned
 * "a > mini" catches values below min; flipping the sign bit makes the
 * positive range comparable against maxisign. */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
/* Clip a float vector to [*min, *max] when the bounds straddle zero,
 * operating on the raw IEEE-754 bit patterns via clipf_c_one; len is
 * assumed to be a multiple of 8 (unrolled by 8). */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip each element of src to [min, max] and store the result in dst.
 * len must be a multiple of 8.  When the range straddles zero, a
 * bit-twiddling fast path is taken; otherwise av_clipf is applied
 * elementwise, unrolled by 8.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
/**
 * Dot product of two int16 vectors.
 *
 * @param v1 first input vector
 * @param v2 second input vector
 * @param order number of elements
 * @return sum over i of v1[i] * v2[i]
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int res = 0;

    while (order--)
        res += *v1++ * *v2++;

    return res;
}
/**
 * Dot product of v1 and v2, while simultaneously updating v1 in place:
 * v1[i] += mul * v3[i].  The returned product uses the PRE-update v1 values.
 *
 * @param v1 first input vector, modified in place
 * @param v2 second input vector
 * @param v3 vector added (scaled by mul) into v1
 * @param order number of elements
 * @param mul scale factor applied to v3
 * @return sum over i of (original v1[i]) * v2[i]
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;

    while (order--) {
        res   += *v1 * *v2++;   /* accumulate with the old v1 value */
        *v1++ += mul * *v3++;   /* then update v1 in place */
    }

    return res;
}
/**
 * Apply a symmetric Q15 window to a block of int16 samples with rounding.
 * Only the first half of the window is stored; element i and element
 * len-1-i share window[i].  The product is scaled back by >> 15 with
 * +0.5 rounding (the 1 << 14 term).
 *
 * @param output destination samples (may not overlap input mid-buffer)
 * @param input  source samples
 * @param window first len/2 window coefficients in Q15
 * @param len    total number of samples (assumed even)
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w = window[i];
        /* MUL16 is the platform-optimized 16x16->32 multiply */
        output[i]           = (MUL16(input[i],           w) + (1 << 14)) >> 15;
        output[len - i - 1] = (MUL16(input[len - i - 1], w) + (1 << 14)) >> 15;
    }
}
/**
 * Clip each int32 element of src to [min, max] and store it in dst.
 * len must be a nonzero multiple of 8 — the loop is unrolled by 8 with
 * no scalar tail handling.
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        len -= 8;
    } while (len > 0);
}
/* Run the JPEG reference IDCT on block, then store the result into dest
 * clamped to the 0..255 pixel range. */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* Run the JPEG reference IDCT on block, then add the result to the existing
 * pixels in dest, clamped to the 0..255 range. */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}
2500 /* init static data */
2501 av_cold void ff_dsputil_static_init(void)
2505 for(i=0;i<512;i++) {
2506 ff_squareTbl[i] = (i - 256) * (i - 256);
2509 for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2512 int ff_check_alignment(void){
2513 static int did_fail=0;
2514 LOCAL_ALIGNED_16(int, aligned, [4]);
2516 if((intptr_t)aligned & 15){
2518 #if HAVE_MMX || HAVE_ALTIVEC
2519 av_log(NULL, AV_LOG_ERROR,
2520 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2521 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2522 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2523 "Do not report crashes to Libav developers.\n");
2532 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2534 ff_check_alignment();
2537 if (avctx->bits_per_raw_sample == 10) {
2538 c->fdct = ff_jpeg_fdct_islow_10;
2539 c->fdct248 = ff_fdct248_islow_10;
2541 if(avctx->dct_algo==FF_DCT_FASTINT) {
2542 c->fdct = ff_fdct_ifast;
2543 c->fdct248 = ff_fdct_ifast248;
2545 else if(avctx->dct_algo==FF_DCT_FAAN) {
2546 c->fdct = ff_faandct;
2547 c->fdct248 = ff_faandct248;
2550 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2551 c->fdct248 = ff_fdct248_islow_8;
2554 #endif //CONFIG_ENCODERS
2556 if (avctx->bits_per_raw_sample == 10) {
2557 c->idct_put = ff_simple_idct_put_10;
2558 c->idct_add = ff_simple_idct_add_10;
2559 c->idct = ff_simple_idct_10;
2560 c->idct_permutation_type = FF_NO_IDCT_PERM;
2562 if(avctx->idct_algo==FF_IDCT_INT){
2563 c->idct_put= jref_idct_put;
2564 c->idct_add= jref_idct_add;
2565 c->idct = ff_j_rev_dct;
2566 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2567 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2568 c->idct_put= ff_faanidct_put;
2569 c->idct_add= ff_faanidct_add;
2570 c->idct = ff_faanidct;
2571 c->idct_permutation_type= FF_NO_IDCT_PERM;
2572 }else{ //accurate/default
2573 c->idct_put = ff_simple_idct_put_8;
2574 c->idct_add = ff_simple_idct_add_8;
2575 c->idct = ff_simple_idct_8;
2576 c->idct_permutation_type= FF_NO_IDCT_PERM;
2580 c->diff_pixels = diff_pixels_c;
2581 c->put_pixels_clamped = put_pixels_clamped_c;
2582 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2583 c->add_pixels_clamped = add_pixels_clamped_c;
2584 c->sum_abs_dctelem = sum_abs_dctelem_c;
2587 c->pix_sum = pix_sum_c;
2588 c->pix_norm1 = pix_norm1_c;
2590 c->fill_block_tab[0] = fill_block16_c;
2591 c->fill_block_tab[1] = fill_block8_c;
2593 /* TODO [0] 16 [1] 8 */
2594 c->pix_abs[0][0] = pix_abs16_c;
2595 c->pix_abs[0][1] = pix_abs16_x2_c;
2596 c->pix_abs[0][2] = pix_abs16_y2_c;
2597 c->pix_abs[0][3] = pix_abs16_xy2_c;
2598 c->pix_abs[1][0] = pix_abs8_c;
2599 c->pix_abs[1][1] = pix_abs8_x2_c;
2600 c->pix_abs[1][2] = pix_abs8_y2_c;
2601 c->pix_abs[1][3] = pix_abs8_xy2_c;
2603 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2604 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2605 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2606 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2607 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2608 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2609 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2610 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2611 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2613 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2614 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2615 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2616 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2617 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2618 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2619 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2620 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2621 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2623 #define dspfunc(PFX, IDX, NUM) \
2624 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2625 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2626 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2627 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2628 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2629 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2630 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2631 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2632 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2633 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2634 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2635 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2636 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2637 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2638 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2639 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2641 dspfunc(put_qpel, 0, 16);
2642 dspfunc(put_no_rnd_qpel, 0, 16);
2644 dspfunc(avg_qpel, 0, 16);
2645 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2647 dspfunc(put_qpel, 1, 8);
2648 dspfunc(put_no_rnd_qpel, 1, 8);
2650 dspfunc(avg_qpel, 1, 8);
2651 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2655 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2656 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2657 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2658 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2659 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2660 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2661 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2662 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2664 #define SET_CMP_FUNC(name) \
2665 c->name[0]= name ## 16_c;\
2666 c->name[1]= name ## 8x8_c;
2668 SET_CMP_FUNC(hadamard8_diff)
2669 c->hadamard8_diff[4]= hadamard8_intra16_c;
2670 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2671 SET_CMP_FUNC(dct_sad)
2672 SET_CMP_FUNC(dct_max)
2674 SET_CMP_FUNC(dct264_sad)
2676 c->sad[0]= pix_abs16_c;
2677 c->sad[1]= pix_abs8_c;
2681 SET_CMP_FUNC(quant_psnr)
2684 c->vsad[0]= vsad16_c;
2685 c->vsad[4]= vsad_intra16_c;
2686 c->vsad[5]= vsad_intra8_c;
2687 c->vsse[0]= vsse16_c;
2688 c->vsse[4]= vsse_intra16_c;
2689 c->vsse[5]= vsse_intra8_c;
2690 c->nsse[0]= nsse16_c;
2691 c->nsse[1]= nsse8_c;
2693 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2695 c->add_bytes= add_bytes_c;
2696 c->diff_bytes= diff_bytes_c;
2697 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2698 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2699 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2700 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2701 c->bswap_buf= bswap_buf;
2702 c->bswap16_buf = bswap16_buf;
2704 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2705 c->h263_h_loop_filter= h263_h_loop_filter_c;
2706 c->h263_v_loop_filter= h263_v_loop_filter_c;
2709 c->try_8x8basis= try_8x8basis_c;
2710 c->add_8x8basis= add_8x8basis_c;
2712 c->vector_clipf = vector_clipf_c;
2713 c->scalarproduct_int16 = scalarproduct_int16_c;
2714 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2715 c->apply_window_int16 = apply_window_int16_c;
2716 c->vector_clip_int32 = vector_clip_int32_c;
2718 c->shrink[0]= av_image_copy_plane;
2719 c->shrink[1]= ff_shrink22;
2720 c->shrink[2]= ff_shrink44;
2721 c->shrink[3]= ff_shrink88;
2723 c->add_pixels8 = add_pixels8_c;
2727 #define FUNC(f, depth) f ## _ ## depth
2728 #define FUNCC(f, depth) f ## _ ## depth ## _c
2730 c->draw_edges = FUNCC(draw_edges, 8);
2731 c->clear_block = FUNCC(clear_block, 8);
2732 c->clear_blocks = FUNCC(clear_blocks, 8);
2734 #define BIT_DEPTH_FUNCS(depth) \
2735 c->get_pixels = FUNCC(get_pixels, depth);
2737 switch (avctx->bits_per_raw_sample) {
2740 BIT_DEPTH_FUNCS(16);
2748 if (HAVE_MMX) ff_dsputil_init_mmx (c, avctx);
2749 if (ARCH_ARM) ff_dsputil_init_arm (c, avctx);
2750 if (HAVE_VIS) ff_dsputil_init_vis (c, avctx);
2751 if (ARCH_ALPHA) ff_dsputil_init_alpha (c, avctx);
2752 if (ARCH_PPC) ff_dsputil_init_ppc (c, avctx);
2753 if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
2754 if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
2756 ff_init_scantable_permutation(c->idct_permutation,
2757 c->idct_permutation_type);