3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "libavutil/internal.h"
34 #include "copy_block.h"
37 #include "simple_idct.h"
40 #include "imgconvert.h"
42 #include "mpegvideo.h"
/* Squared-value lookup table. Consumers index it with a +256 bias
 * (uint32_t *sq = ff_squareTbl + 256, see sse*_c/pix_norm1_c below) so
 * signed byte differences in [-255, 255] are valid indices.
 * Zero-initialized here; presumably filled during DSP init — verify. */
45 uint32_t ff_squareTbl[512] = {0, };
48 #include "dsputil_template.c"
52 #include "dsputil_template.c"
54 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 is 0x01 repeated in every byte of an unsigned long, so
 * multiplying it by a byte value replicates that byte into all lanes. */
55 #define pb_7f (~0UL/255 * 0x7f)
56 #define pb_80 (~0UL/255 * 0x80)
58 /* Specific zigzag scan for 248 idct. NOTE that unlike the
59 specification, we interleave the fields */
/* Maps scan position -> raster index inside the 8x8 coefficient block. */
60 const uint8_t ff_zigzag248_direct[64] = {
61 0, 8, 1, 9, 16, 24, 2, 10,
62 17, 25, 32, 40, 48, 56, 33, 41,
63 18, 26, 3, 11, 4, 12, 19, 27,
64 34, 42, 49, 57, 50, 58, 35, 43,
65 20, 28, 5, 13, 6, 14, 21, 29,
66 36, 44, 51, 59, 52, 60, 37, 45,
67 22, 30, 7, 15, 23, 31, 38, 46,
68 53, 61, 54, 62, 39, 47, 55, 63,
71 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* No initializer here: filled at runtime elsewhere. 16-byte aligned. */
72 DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
/* Alternate (horizontal-first) scan order: scan position -> raster index. */
74 const uint8_t ff_alternate_horizontal_scan[64] = {
75 0, 1, 2, 3, 8, 9, 16, 17,
76 10, 11, 4, 5, 6, 7, 15, 14,
77 13, 12, 19, 18, 24, 25, 32, 33,
78 26, 27, 20, 21, 22, 23, 28, 29,
79 30, 31, 34, 35, 40, 41, 48, 49,
80 42, 43, 36, 37, 38, 39, 44, 45,
81 46, 47, 50, 51, 56, 57, 58, 59,
82 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate (vertical-first) scan order: scan position -> raster index. */
85 const uint8_t ff_alternate_vertical_scan[64] = {
86 0, 8, 16, 24, 1, 9, 2, 10,
87 17, 25, 32, 40, 48, 56, 57, 49,
88 41, 33, 26, 18, 3, 11, 4, 12,
89 19, 27, 34, 42, 50, 58, 35, 43,
90 51, 59, 20, 28, 5, 13, 6, 14,
91 21, 29, 36, 44, 52, 60, 37, 45,
92 53, 61, 22, 30, 7, 15, 23, 31,
93 38, 46, 54, 62, 39, 47, 55, 63,
96 /* Input permutation for the simple_idct_mmx */
/* Used by FF_SIMPLE_IDCT_PERM in ff_init_scantable_permutation() below. */
97 static const uint8_t simple_mmx_permutation[64]={
98 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
99 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
100 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
101 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
102 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
103 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
104 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
105 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Per-row column ordering applied by FF_SSE2_IDCT_PERM below. */
108 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Initialize a ScanTable: store the source scan order, build
 * st->permutated by mapping each scan entry through the IDCT coefficient
 * permutation, and record raster-order end markers in st->raster_end. */
110 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
111 const uint8_t *src_scantable)
116 st->scantable= src_scantable;
/* permuted scan: permutation[] remaps the raster index j */
120 j = src_scantable[i];
121 st->permutated[i] = permutation[j];
127 j = st->permutated[i];
129 st->raster_end[i]= end;
/* Fill idct_permutation[0..63] according to the requested permutation
 * type; an unknown type is reported via av_log(). */
133 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
134 int idct_permutation_type)
138 switch(idct_permutation_type){
139 case FF_NO_IDCT_PERM:
/* identity mapping */
141 idct_permutation[i]= i;
143 case FF_LIBMPEG2_IDCT_PERM:
/* keep the row (bits 3-5), swizzle the column bits */
145 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
147 case FF_SIMPLE_IDCT_PERM:
149 idct_permutation[i]= simple_mmx_permutation[i];
151 case FF_TRANSPOSE_IDCT_PERM:
/* swap the 3-bit row and column fields (matrix transpose) */
153 idct_permutation[i]= ((i&7)<<3) | (i>>3);
155 case FF_PARTTRANS_IDCT_PERM:
157 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
159 case FF_SSE2_IDCT_PERM:
/* keep the row, reorder columns per idct_sse2_row_perm */
161 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
164 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/* Sum of pixel values over a 16x16 block; the inner loop advances 8
 * pixels per step, and the row pointer steps by line_size - 16. */
168 static int pix_sum_c(uint8_t * pix, int line_size)
173 for (i = 0; i < 16; i++) {
174 for (j = 0; j < 16; j += 8) {
185 pix += line_size - 16;
/* Sum of squared pixel values over a 16x16 block, using ff_squareTbl
 * (+256 bias). Pixels are read 8 at a time as one 64-bit word or two
 * 32-bit words and each byte is looked up in the square table.
 * NOTE(review): *(uint64_t*)pix / *(uint32_t*)pix violate strict
 * aliasing and assume suitable alignment — presumably guaranteed by
 * callers; confirm. */
190 static int pix_norm1_c(uint8_t * pix, int line_size)
193 uint32_t *sq = ff_squareTbl + 256;
196 for (i = 0; i < 16; i++) {
197 for (j = 0; j < 16; j += 8) {
/* 64-bit path: square all 8 bytes of the word */
209 register uint64_t x=*(uint64_t*)pix;
211 s += sq[(x>>8)&0xff];
212 s += sq[(x>>16)&0xff];
213 s += sq[(x>>24)&0xff];
214 s += sq[(x>>32)&0xff];
215 s += sq[(x>>40)&0xff];
216 s += sq[(x>>48)&0xff];
217 s += sq[(x>>56)&0xff];
/* 32-bit path: two 4-byte words per step */
219 register uint32_t x=*(uint32_t*)pix;
221 s += sq[(x>>8)&0xff];
222 s += sq[(x>>16)&0xff];
223 s += sq[(x>>24)&0xff];
224 x=*(uint32_t*)(pix+4);
226 s += sq[(x>>8)&0xff];
227 s += sq[(x>>16)&0xff];
228 s += sq[(x>>24)&0xff];
233 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst; the main loop is
 * unrolled 8x, with a one-at-a-time tail for the remainder. */
238 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
241 for(i=0; i+8<=w; i+=8){
242 dst[i+0]= av_bswap32(src[i+0]);
243 dst[i+1]= av_bswap32(src[i+1]);
244 dst[i+2]= av_bswap32(src[i+2]);
245 dst[i+3]= av_bswap32(src[i+3]);
246 dst[i+4]= av_bswap32(src[i+4]);
247 dst[i+5]= av_bswap32(src[i+5]);
248 dst[i+6]= av_bswap32(src[i+6]);
249 dst[i+7]= av_bswap32(src[i+7]);
/* tail: remaining (w % 8) elements */
252 dst[i+0]= av_bswap32(src[i+0]);
/* Copy len 16-bit values from src to dst, byte-swapping each one. */
256 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
259 *dst++ = av_bswap16(*src++);
/* Sum of squared errors over a 4-wide column strip of height h.
 * Negative differences index ff_squareTbl via the +256 bias.
 * v: opaque context pointer, unused in the visible body — presumably
 * kept to match the comparison-function signature; confirm. */
262 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
265 uint32_t *sq = ff_squareTbl + 256;
268 for (i = 0; i < h; i++) {
269 s += sq[pix1[0] - pix2[0]];
270 s += sq[pix1[1] - pix2[1]];
271 s += sq[pix1[2] - pix2[2]];
272 s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors over an 8-wide strip of height h (see sse4_c). */
279 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
282 uint32_t *sq = ff_squareTbl + 256;
285 for (i = 0; i < h; i++) {
286 s += sq[pix1[0] - pix2[0]];
287 s += sq[pix1[1] - pix2[1]];
288 s += sq[pix1[2] - pix2[2]];
289 s += sq[pix1[3] - pix2[3]];
290 s += sq[pix1[4] - pix2[4]];
291 s += sq[pix1[5] - pix2[5]];
292 s += sq[pix1[6] - pix2[6]];
293 s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors over a 16-wide strip of height h (see sse4_c). */
300 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
303 uint32_t *sq = ff_squareTbl + 256;
306 for (i = 0; i < h; i++) {
307 s += sq[pix1[ 0] - pix2[ 0]];
308 s += sq[pix1[ 1] - pix2[ 1]];
309 s += sq[pix1[ 2] - pix2[ 2]];
310 s += sq[pix1[ 3] - pix2[ 3]];
311 s += sq[pix1[ 4] - pix2[ 4]];
312 s += sq[pix1[ 5] - pix2[ 5]];
313 s += sq[pix1[ 6] - pix2[ 6]];
314 s += sq[pix1[ 7] - pix2[ 7]];
315 s += sq[pix1[ 8] - pix2[ 8]];
316 s += sq[pix1[ 9] - pix2[ 9]];
317 s += sq[pix1[10] - pix2[10]];
318 s += sq[pix1[11] - pix2[11]];
319 s += sq[pix1[12] - pix2[12]];
320 s += sq[pix1[13] - pix2[13]];
321 s += sq[pix1[14] - pix2[14]];
322 s += sq[pix1[15] - pix2[15]];
/* Store the per-pixel difference s1 - s2 of two 8-pixel rows into an
 * int16 coefficient block (8 values per iteration). */
330 static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
331 const uint8_t *s2, int stride){
334 /* read the pixels */
336 block[0] = s1[0] - s2[0];
337 block[1] = s1[1] - s2[1];
338 block[2] = s1[2] - s2[2];
339 block[3] = s1[3] - s2[3];
340 block[4] = s1[4] - s2[4];
341 block[5] = s1[5] - s2[5];
342 block[6] = s1[6] - s2[6];
343 block[7] = s1[7] - s2[7];
/* Clamp int16 coefficients to [0, 255] and store them as pixels,
 * 8 values per row. */
351 static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
356 /* read the pixels */
358 pixels[0] = av_clip_uint8(block[0]);
359 pixels[1] = av_clip_uint8(block[1]);
360 pixels[2] = av_clip_uint8(block[2]);
361 pixels[3] = av_clip_uint8(block[3]);
362 pixels[4] = av_clip_uint8(block[4]);
363 pixels[5] = av_clip_uint8(block[5]);
364 pixels[6] = av_clip_uint8(block[6]);
365 pixels[7] = av_clip_uint8(block[7]);
/* Store signed coefficients as pixels with a +128 bias over an 8x8
 * block: values outside [-128, 127] are saturated, in-range values
 * become (uint8_t)(*block + 128). */
372 static void put_signed_pixels_clamped_c(const int16_t *block,
373 uint8_t *restrict pixels,
378 for (i = 0; i < 8; i++) {
379 for (j = 0; j < 8; j++) {
382 else if (*block > 127)
385 *pixels = (uint8_t)(*block + 128);
389 pixels += (line_size - 8);
/* Add 8 coefficients to 8 pixels per row. No saturation: the result is
 * stored modulo 256 (pixels are uint8_t). */
393 static void add_pixels8_c(uint8_t *restrict pixels,
400 pixels[0] += block[0];
401 pixels[1] += block[1];
402 pixels[2] += block[2];
403 pixels[3] += block[3];
404 pixels[4] += block[4];
405 pixels[5] += block[5];
406 pixels[6] += block[6];
407 pixels[7] += block[7];
/* Add int16 coefficients to existing pixels with saturation to
 * [0, 255], 8 values per row. */
413 static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
418 /* read the pixels */
420 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
421 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
422 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
423 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
424 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
425 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
426 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
427 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
/* Sum of absolute values of the block's coefficients. */
433 static int sum_abs_dctelem_c(int16_t *block)
437 sum+= FFABS(block[i]);
/* Fill h rows of 16 bytes each with a constant value; rows are
 * line_size bytes apart. */
441 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
445 for (i = 0; i < h; i++) {
446 memset(block, value, 16);
/* Fill h rows of 8 bytes each with a constant value (cf. fill_block16_c). */
451 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
455 for (i = 0; i < h; i++) {
456 memset(block, value, 8);
/* Rounded 2- and 4-way pixel averages used by the MC helpers.
 * Arguments are fully parenthesized so that expansions with
 * lower-precedence operators (e.g. avg2(x>>1, y)) group correctly;
 * the unparenthesized form would parse x>>1+y+1 as x >> (1+y+1). */
#define avg2(a,b)     (((a) + (b) + 1) >> 1)
#define avg4(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
/* 1/16-pel bilinear interpolation for GMC: weights A..D sum to 256
 * (16*16), so after adding `rounder` the >>8 normalizes the result.
 * Processes 8 pixels per row for h rows. */
464 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
466 const int A=(16-x16)*(16-y16);
467 const int B=( x16)*(16-y16);
468 const int C=(16-x16)*( y16);
469 const int D=( x16)*( y16);
474 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
475 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
476 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
477 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
478 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
479 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
480 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
481 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* Global motion compensation: for each destination pixel an affine
 * transform (dxx/dxy/dyx/dyy, offset ox/oy) yields a sub-pel source
 * position; in-bounds positions are bilinearly interpolated with
 * fractional weights frac_x/frac_y (scale s = 1<<shift), while
 * out-of-bounds coordinates are clamped to the nearest edge, falling
 * back to 1-D interpolation or plain edge replication. */
487 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
488 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
491 const int s= 1<<shift;
501 for(x=0; x<8; x++){ //XXX FIXME optimize
502 int src_x, src_y, frac_x, frac_y, index;
/* unsigned compare doubles as a >=0 && <limit bounds check */
511 if((unsigned)src_x < width){
512 if((unsigned)src_y < height){
/* fully inside: 2-D bilinear interpolation */
513 index= src_x + src_y*stride;
514 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
515 + src[index +1]* frac_x )*(s-frac_y)
516 + ( src[index+stride ]*(s-frac_x)
517 + src[index+stride+1]* frac_x )* frac_y
/* y out of range: clamp y, interpolate horizontally only */
520 index= src_x + av_clip(src_y, 0, height)*stride;
521 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
522 + src[index +1]* frac_x )*s
526 if((unsigned)src_y < height){
/* x out of range: clamp x, interpolate vertically only */
527 index= av_clip(src_x, 0, width) + src_y*stride;
528 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
529 + src[index+stride ]* frac_y )*s
/* both out of range: replicate the clamped edge pixel */
532 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
533 dst[y*stride + x]= src[index ];
/* Third-pel MC, no offset: dispatch to the plain copy routine for the
 * requested block width (2/4/8/16). */
545 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
547 case 2: put_pixels2_8_c (dst, src, stride, height); break;
548 case 4: put_pixels4_8_c (dst, src, stride, height); break;
549 case 8: put_pixels8_8_c (dst, src, stride, height); break;
550 case 16:put_pixels16_8_c(dst, src, stride, height); break;
/* Third-pel MC, horizontal offset 1/3: out ~= (2*a + b)/3 rounded,
 * using 683 ~= 2^11 / 3. */
554 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
556 for (i=0; i < height; i++) {
557 for (j=0; j < width; j++) {
558 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Third-pel MC, horizontal offset 2/3: out ~= (a + 2*b)/3 rounded. */
565 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
567 for (i=0; i < height; i++) {
568 for (j=0; j < width; j++) {
569 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Third-pel MC, vertical offset 1/3: blends the current row with the
 * row one stride below. */
576 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
578 for (i=0; i < height; i++) {
579 for (j=0; j < width; j++) {
580 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Third-pel MC, offset (1/3, 1/3): 2-D blend with weights 4:3:3:2 /12,
 * using 2731 ~= 2^15 / 12. */
587 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
589 for (i=0; i < height; i++) {
590 for (j=0; j < width; j++) {
591 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, offset (1/3, 2/3): 2-D blend, weights 3:2:4:3 /12. */
598 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
600 for (i=0; i < height; i++) {
601 for (j=0; j < width; j++) {
602 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, vertical offset 2/3. */
609 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
611 for (i=0; i < height; i++) {
612 for (j=0; j < width; j++) {
613 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Third-pel MC, offset (2/3, 1/3): 2-D blend, weights 3:4:2:3 /12. */
620 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
622 for (i=0; i < height; i++) {
623 for (j=0; j < width; j++) {
624 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, offset (2/3, 2/3): 2-D blend, weights 2:3:3:4 /12. */
631 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
633 for (i=0; i < height; i++) {
634 for (j=0; j < width; j++) {
635 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Averaging variant of mc00: dispatch to the width-specific averaging
 * copy (result is averaged with the existing dst contents). */
642 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
644 case 2: avg_pixels2_8_c (dst, src, stride, height); break;
645 case 4: avg_pixels4_8_c (dst, src, stride, height); break;
646 case 8: avg_pixels8_8_c (dst, src, stride, height); break;
647 case 16:avg_pixels16_8_c(dst, src, stride, height); break;
/* Averaging variant of mc10: interpolate as in put_tpel_pixels_mc10_c,
 * then round-average with the existing dst pixel. */
651 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
653 for (i=0; i < height; i++) {
654 for (j=0; j < width; j++) {
655 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc20 (horizontal offset 2/3). */
662 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
664 for (i=0; i < height; i++) {
665 for (j=0; j < width; j++) {
666 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc01 (vertical offset 1/3). */
673 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
675 for (i=0; i < height; i++) {
676 for (j=0; j < width; j++) {
677 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc11 (offset (1/3, 1/3), weights 4:3:3:2 /12). */
684 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
686 for (i=0; i < height; i++) {
687 for (j=0; j < width; j++) {
688 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc12 (offset (1/3, 2/3), weights 3:2:4:3 /12). */
695 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
697 for (i=0; i < height; i++) {
698 for (j=0; j < width; j++) {
699 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc02 (vertical offset 2/3). */
706 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
708 for (i=0; i < height; i++) {
709 for (j=0; j < width; j++) {
710 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc21 (offset (2/3, 1/3), weights 3:4:2:3 /12). */
717 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
719 for (i=0; i < height; i++) {
720 for (j=0; j < width; j++) {
721 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc22 (offset (2/3, 2/3), weights 2:3:3:4 /12). */
728 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
730 for (i=0; i < height; i++) {
731 for (j=0; j < width; j++) {
732 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
739 #define QPEL_MC(r, OPNAME, RND, OP) \
740 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
741 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
745 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
746 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
747 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
748 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
749 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
750 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
751 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
752 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
758 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
760 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
764 const int src0= src[0*srcStride];\
765 const int src1= src[1*srcStride];\
766 const int src2= src[2*srcStride];\
767 const int src3= src[3*srcStride];\
768 const int src4= src[4*srcStride];\
769 const int src5= src[5*srcStride];\
770 const int src6= src[6*srcStride];\
771 const int src7= src[7*srcStride];\
772 const int src8= src[8*srcStride];\
773 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
774 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
775 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
776 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
777 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
778 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
779 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
780 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
786 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
787 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
792 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
793 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
794 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
795 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
796 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
797 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
798 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
799 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
800 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
801 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
802 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
803 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
804 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
805 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
806 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
807 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
813 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
814 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
819 const int src0= src[0*srcStride];\
820 const int src1= src[1*srcStride];\
821 const int src2= src[2*srcStride];\
822 const int src3= src[3*srcStride];\
823 const int src4= src[4*srcStride];\
824 const int src5= src[5*srcStride];\
825 const int src6= src[6*srcStride];\
826 const int src7= src[7*srcStride];\
827 const int src8= src[8*srcStride];\
828 const int src9= src[9*srcStride];\
829 const int src10= src[10*srcStride];\
830 const int src11= src[11*srcStride];\
831 const int src12= src[12*srcStride];\
832 const int src13= src[13*srcStride];\
833 const int src14= src[14*srcStride];\
834 const int src15= src[15*srcStride];\
835 const int src16= src[16*srcStride];\
836 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
837 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
838 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
839 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
840 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
841 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
842 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
843 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
844 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
845 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
846 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
847 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
848 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
849 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
850 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
851 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
857 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
860 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
861 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
864 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
866 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
869 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
872 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
873 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
876 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
880 copy_block9(full, src, 16, stride, 9);\
881 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
882 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
885 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
888 copy_block9(full, src, 16, stride, 9);\
889 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
892 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
896 copy_block9(full, src, 16, stride, 9);\
897 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
898 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
900 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
906 copy_block9(full, src, 16, stride, 9);\
907 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
908 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
909 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
910 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
912 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
917 copy_block9(full, src, 16, stride, 9);\
918 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
919 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
920 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
921 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
923 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
929 copy_block9(full, src, 16, stride, 9);\
930 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
931 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
932 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
933 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
935 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
940 copy_block9(full, src, 16, stride, 9);\
941 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
942 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
943 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
944 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
946 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
952 copy_block9(full, src, 16, stride, 9);\
953 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
954 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
955 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
956 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
958 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
963 copy_block9(full, src, 16, stride, 9);\
964 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
965 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
966 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
967 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
969 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
975 copy_block9(full, src, 16, stride, 9);\
976 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
977 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
978 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
979 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
981 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
986 copy_block9(full, src, 16, stride, 9);\
987 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
988 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
989 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
990 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
992 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
997 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
998 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1000 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1003 uint8_t halfHV[64];\
1004 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1005 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1006 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1008 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1010 uint8_t full[16*9];\
1013 uint8_t halfHV[64];\
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1020 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1022 uint8_t full[16*9];\
1024 copy_block9(full, src, 16, stride, 9);\
1025 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1026 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1027 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1029 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1031 uint8_t full[16*9];\
1034 uint8_t halfHV[64];\
1035 copy_block9(full, src, 16, stride, 9);\
1036 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1038 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1041 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1043 uint8_t full[16*9];\
1045 copy_block9(full, src, 16, stride, 9);\
1046 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1047 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1048 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1050 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1053 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1054 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1057 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1060 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1061 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1064 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1066 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1069 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1072 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1073 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1076 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1078 uint8_t full[24*17];\
1080 copy_block17(full, src, 24, stride, 17);\
1081 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1082 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1085 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1087 uint8_t full[24*17];\
1088 copy_block17(full, src, 24, stride, 17);\
1089 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1092 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1094 uint8_t full[24*17];\
1096 copy_block17(full, src, 24, stride, 17);\
1097 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1098 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1100 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1102 uint8_t full[24*17];\
1103 uint8_t halfH[272];\
1104 uint8_t halfV[256];\
1105 uint8_t halfHV[256];\
1106 copy_block17(full, src, 24, stride, 17);\
1107 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1108 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1109 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1110 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1112 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1114 uint8_t full[24*17];\
1115 uint8_t halfH[272];\
1116 uint8_t halfHV[256];\
1117 copy_block17(full, src, 24, stride, 17);\
1118 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1119 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1120 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1121 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1123 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1125 uint8_t full[24*17];\
1126 uint8_t halfH[272];\
1127 uint8_t halfV[256];\
1128 uint8_t halfHV[256];\
1129 copy_block17(full, src, 24, stride, 17);\
1130 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1131 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1132 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1133 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1135 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1137 uint8_t full[24*17];\
1138 uint8_t halfH[272];\
1139 uint8_t halfHV[256];\
1140 copy_block17(full, src, 24, stride, 17);\
1141 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1142 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1143 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1144 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1146 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1148 uint8_t full[24*17];\
1149 uint8_t halfH[272];\
1150 uint8_t halfV[256];\
1151 uint8_t halfHV[256];\
1152 copy_block17(full, src, 24, stride, 17);\
1153 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1154 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1155 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1156 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1158 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1160 uint8_t full[24*17];\
1161 uint8_t halfH[272];\
1162 uint8_t halfHV[256];\
1163 copy_block17(full, src, 24, stride, 17);\
1164 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1165 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1166 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1167 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1169 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1171 uint8_t full[24*17];\
1172 uint8_t halfH[272];\
1173 uint8_t halfV[256];\
1174 uint8_t halfHV[256];\
1175 copy_block17(full, src, 24, stride, 17);\
1176 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1177 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1178 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1179 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1181 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1183 uint8_t full[24*17];\
1184 uint8_t halfH[272];\
1185 uint8_t halfHV[256];\
1186 copy_block17(full, src, 24, stride, 17);\
1187 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1188 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1189 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1190 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1192 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1194 uint8_t halfH[272];\
1195 uint8_t halfHV[256];\
1196 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1197 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1198 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1200 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1202 uint8_t halfH[272];\
1203 uint8_t halfHV[256];\
1204 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1205 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1206 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1208 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1210 uint8_t full[24*17];\
1211 uint8_t halfH[272];\
1212 uint8_t halfV[256];\
1213 uint8_t halfHV[256];\
1214 copy_block17(full, src, 24, stride, 17);\
1215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1217 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1220 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1222 uint8_t full[24*17];\
1223 uint8_t halfH[272];\
1224 copy_block17(full, src, 24, stride, 17);\
1225 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1226 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1227 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1229 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1231 uint8_t full[24*17];\
1232 uint8_t halfH[272];\
1233 uint8_t halfV[256];\
1234 uint8_t halfHV[256];\
1235 copy_block17(full, src, 24, stride, 17);\
1236 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1237 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1238 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1239 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1241 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1243 uint8_t full[24*17];\
1244 uint8_t halfH[272];\
1245 copy_block17(full, src, 24, stride, 17);\
1246 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1247 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1248 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1250 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1252 uint8_t halfH[272];\
1253 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1254 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Per-pixel store operators plugged into the QPEL_MC() template above.
 * "put" overwrites the destination, "avg" averages with it (rounding up);
 * the "_no_rnd" variants bias by +15 instead of +16 so the >>5 rounds down.
 * cm is the clip-to-[0,255] table provided by the template bodies. */
1257 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1258 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1259 #define op_put(a, b) a = cm[((b) + 16)>>5]
1260 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the complete quarter-pel MC function sets. The avg_no_rnd
 * variant is intentionally left disabled (commented out). */
1262 QPEL_MC(0, put_ , _ , op_put)
1263 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1264 QPEL_MC(0, avg_ , _ , op_avg)
1265 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1267 #undef op_avg_no_rnd
1269 #undef op_put_no_rnd
/* Copy an 8x8 pixel block: thin fixed-height wrapper over the 8-bit template. */
1271 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1273 put_pixels8_8_c(dst, src, stride, 8);
/* Average an 8x8 pixel block into dst: fixed-height wrapper over the template. */
1275 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1277 avg_pixels8_8_c(dst, src, stride, 8);
/* Copy a 16x16 pixel block: fixed-height wrapper over the 8-bit template. */
1279 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1281 put_pixels16_8_c(dst, src, stride, 16);
/* Average a 16x16 pixel block into dst: fixed-height wrapper over the template. */
1283 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1285 avg_pixels16_8_c(dst, src, stride, 16);
/* The mc00 (full-pel, no interpolation) qpel cases are plain block copies
 * or averages, so alias them to the pixel-copy wrappers above. */
1288 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1289 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1290 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1291 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1292 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1293 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/* WMV2 horizontal half-pel lowpass: 4-tap (-1,9,9,-1)/16 filter with
 * rounding (+8 before >>4), clipped to [0,255] via the crop table.
 * Produces one 8-pixel row per iteration for h rows.
 * NOTE(review): the row loop and stride advance are elided in this dump. */
1295 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1296 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1300 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1301 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1302 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1303 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1304 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1305 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1306 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1307 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1313 #if CONFIG_RV40_DECODER
/* RV40's (3,3) qpel position is defined as the plain xy2 half-pel average,
 * so these four wrappers just forward to the xy2 pixel templates. */
1314 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1316 put_pixels16_xy2_8_c(dst, src, stride, 16);
1318 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1320 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1322 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1324 put_pixels8_xy2_8_c(dst, src, stride, 8);
1326 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1328 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1330 #endif /* CONFIG_RV40_DECODER */
/* WMV2 vertical half-pel lowpass: same 4-tap (-1,9,9,-1)/16 filter as the
 * horizontal variant but applied down a column; emits 8 output rows per
 * column for w columns. src_1..src9 cache the 11 source samples needed.
 * NOTE(review): the column loop and pointer advance are elided in this dump. */
1332 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1333 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1337 const int src_1= src[ -srcStride];
1338 const int src0 = src[0 ];
1339 const int src1 = src[ srcStride];
1340 const int src2 = src[2*srcStride];
1341 const int src3 = src[3*srcStride];
1342 const int src4 = src[4*srcStride];
1343 const int src5 = src[5*srcStride];
1344 const int src6 = src[6*srcStride];
1345 const int src7 = src[7*srcStride];
1346 const int src8 = src[8*srcStride];
1347 const int src9 = src[9*srcStride];
1348 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1349 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1350 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1351 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1352 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1353 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1354 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1355 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* mspel (1,0): horizontal quarter-pel left — average of the source and the
 * horizontally lowpassed half-pel plane. */
1361 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1364 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1365 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
/* mspel (2,0): horizontal half-pel — the lowpass output is used directly. */
1368 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1370 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* mspel (3,0): horizontal quarter-pel right — average of src+1 and the
 * horizontally lowpassed half-pel plane. */
1373 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1376 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1377 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
/* mspel (0,2): vertical half-pel — vertical lowpass written straight to dst. */
1380 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1382 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* mspel (1,2): average of the vertical half-pel plane and the HV plane
 * (horizontal lowpass over 11 rows starting one row above, then vertical
 * lowpass of that intermediate, skipping its first row via halfH+8). */
1385 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1390 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1391 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1392 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1393 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* mspel (3,2): like mc12 but the vertical half-pel plane is taken one
 * pixel to the right (src+1). */
1395 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1400 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1401 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1402 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1403 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* mspel (2,2): full half-pel in both directions — horizontal lowpass over
 * 11 rows, then vertical lowpass of rows 1..8 of the intermediate. */
1405 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1408 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1409 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* Sum of absolute differences (SAD) between two 16-pixel-wide blocks,
 * unrolled across the row; v is an unused context pointer kept for the
 * me_cmp_func signature. NOTE(review): the per-row loop over h, the
 * line_size pointer advance and the return of s are elided in this dump. */
1412 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1418 s += abs(pix1[0] - pix2[0]);
1419 s += abs(pix1[1] - pix2[1]);
1420 s += abs(pix1[2] - pix2[2]);
1421 s += abs(pix1[3] - pix2[3]);
1422 s += abs(pix1[4] - pix2[4]);
1423 s += abs(pix1[5] - pix2[5]);
1424 s += abs(pix1[6] - pix2[6]);
1425 s += abs(pix1[7] - pix2[7]);
1426 s += abs(pix1[8] - pix2[8]);
1427 s += abs(pix1[9] - pix2[9]);
1428 s += abs(pix1[10] - pix2[10]);
1429 s += abs(pix1[11] - pix2[11]);
1430 s += abs(pix1[12] - pix2[12]);
1431 s += abs(pix1[13] - pix2[13]);
1432 s += abs(pix1[14] - pix2[14]);
1433 s += abs(pix1[15] - pix2[15]);
/* SAD of pix1 against the horizontal half-pel interpolation of pix2
 * (avg2 of each pixel and its right neighbour, defined elsewhere in
 * this file). Reads pix2[16], i.e. one pixel past the block width. */
1440 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1446 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1447 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1448 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1449 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1450 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1451 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1452 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1453 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1454 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1455 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1456 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1457 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1458 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1459 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1460 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1461 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of pix1 against the vertical half-pel interpolation of pix2
 * (avg2 of each pixel and the pixel one line below, via pix3). */
1468 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1471 uint8_t *pix3 = pix2 + line_size;
1475 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1476 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1477 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1478 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1479 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1480 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1481 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1482 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1483 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1484 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1485 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1486 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1487 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1488 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1489 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1490 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of pix1 against the diagonal half-pel interpolation of pix2
 * (avg4 over the 2x2 neighbourhood; avg4 is defined elsewhere in this
 * file). Reads one pixel past the right edge (index 16). */
1498 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1501 uint8_t *pix3 = pix2 + line_size;
1505 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1506 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1507 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1508 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1509 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1510 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1511 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1512 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1513 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1514 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1515 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1516 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1517 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1518 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1519 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1520 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD, same structure as pix_abs16_c but half the row width. */
1528 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1534 s += abs(pix1[0] - pix2[0]);
1535 s += abs(pix1[1] - pix2[1]);
1536 s += abs(pix1[2] - pix2[2]);
1537 s += abs(pix1[3] - pix2[3]);
1538 s += abs(pix1[4] - pix2[4]);
1539 s += abs(pix1[5] - pix2[5]);
1540 s += abs(pix1[6] - pix2[6]);
1541 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD against horizontal half-pel interpolation (reads pix2[8]). */
1548 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1554 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1555 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1556 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1557 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1558 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1559 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1560 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1561 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD against vertical half-pel interpolation (pix3 = next line). */
1568 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1571 uint8_t *pix3 = pix2 + line_size;
1575 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1576 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1577 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1578 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1579 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1580 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1581 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1582 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD against diagonal half-pel interpolation (2x2 avg4, reads
 * one pixel past the right edge). */
1590 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1593 uint8_t *pix3 = pix2 + line_size;
1597 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1598 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1599 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1600 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1601 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1602 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1603 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1604 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16 wide: score1 is the plain squared error,
 * score2 the difference in 2x2 gradient activity between the two blocks
 * (so matching noise texture is penalized less). The final weight comes
 * from avctx->nsse_weight, falling back to 8 when no context is given. */
1612 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1613 MpegEncContext *c = v;
1619 for(x=0; x<16; x++){
1620 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1623 for(x=0; x<15; x++){
1624 score2+= FFABS( s1[x ] - s1[x +stride]
1625 - s1[x+1] + s1[x+1+stride])
1626 -FFABS( s2[x ] - s2[x +stride]
1627 - s2[x+1] + s2[x+1+stride]);
1634 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1635 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c; identical scoring, half the block width.
 * NOTE(review): the x-loop bounds for this width are elided in this dump. */
1638 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1639 MpegEncContext *c = v;
1646 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1650 score2+= FFABS( s1[x ] - s1[x +stride]
1651 - s1[x+1] + s1[x+1+stride])
1652 -FFABS( s2[x ] - s2[x +stride]
1653 - s2[x+1] + s2[x+1+stride]);
1660 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1661 else return score1 + FFABS(score2)*8;
/* Evaluate the weighted squared error that would remain after adding a
 * scaled basis function to the residual: b is the candidate residual
 * sample (rounded shift from BASIS_SHIFT down to RECON_SHIFT precision),
 * w its weight; returns sum of (w*b)^2 >> 4. */
1664 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1668 for(i=0; i<8*8; i++){
1669 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1672 assert(-512<b && b<512);
1674 sum += (w*b)*(w*b)>>4;
/* Commit the scaled basis function into the residual, using the same
 * rounded precision reduction as try_8x8basis_c. */
1679 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1682 for(i=0; i<8*8; i++){
1683 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1687 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fill the 6-entry cmp[] function table from the DSPContext according to
 * the requested comparison type (SAD/SSE/Hadamard/DCT-based/...).
 * NOTE(review): the loop over table entries and the switch on the type
 * bits are elided in this dump; only representative cases are visible.
 * Unknown types fall through to the error log below. */
1691 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1694 memset(cmp, 0, sizeof(void*)*6);
1702 cmp[i]= c->hadamard8_diff[i];
1708 cmp[i]= c->dct_sad[i];
1711 cmp[i]= c->dct264_sad[i];
1714 cmp[i]= c->dct_max[i];
1717 cmp[i]= c->quant_psnr[i];
1738 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Byte-wise dst[i] += src[i] using word-at-a-time SWAR: pb_7f masks off
 * each byte's MSB so the low 7 bits add without inter-byte carries, and
 * the MSBs are fixed up with XOR. A scalar tail handles the remainder. */
1743 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1745 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1746 long a = *(long*)(src+i);
1747 long b = *(long*)(dst+i);
1748 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1751 dst[i+0] += src[i+0];
/* Byte-wise dst[i] = src1[i] - src2[i]. On targets without fast unaligned
 * loads, a plain unrolled byte loop is used when src2 is misaligned;
 * otherwise a SWAR word loop (borrow-free subtraction via pb_7f/pb_80
 * masks) does most of the work, with a scalar tail for the remainder. */
1754 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1756 #if !HAVE_FAST_UNALIGNED
1757 if((long)src2 & (sizeof(long)-1)){
1758 for(i=0; i+7<w; i+=8){
1759 dst[i+0] = src1[i+0]-src2[i+0];
1760 dst[i+1] = src1[i+1]-src2[i+1];
1761 dst[i+2] = src1[i+2]-src2[i+2];
1762 dst[i+3] = src1[i+3]-src2[i+3];
1763 dst[i+4] = src1[i+4]-src2[i+4];
1764 dst[i+5] = src1[i+5]-src2[i+5];
1765 dst[i+6] = src1[i+6]-src2[i+6];
1766 dst[i+7] = src1[i+7]-src2[i+7];
1770 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1771 long a = *(long*)(src1+i);
1772 long b = *(long*)(src2+i);
1773 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1776 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction decode: reconstruct each byte as
 * median(left, top, left+top-topleft) plus the coded difference.
 * left/left_top carry prediction state across calls. */
1779 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1787 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* HuffYUV median prediction encode: emit the difference between each
 * byte and its median prediction (inverse of add_hfyu_median_prediction_c). */
1796 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1804 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* HuffYUV left-prediction decode: running byte accumulator acc is added
 * across the row; returns the final accumulator for the next call. */
1814 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1817 for(i=0; i<w-1; i++){
/* BGR32 variant: per-channel left prediction with separate r/g/b/a state. */
1844 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Butterfly helpers for the 8x8 Hadamard transforms below:
 * BUTTERFLY2 writes sum/difference of two inputs into two outputs,
 * BUTTERFLY1 does the same in place, and BUTTERFLYA yields
 * |x+y| + |x-y| for the final accumulation stage. */
1874 #define BUTTERFLY2(o1,o2,i1,i2) \
1878 #define BUTTERFLY1(x,y) \
1887 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD metric: 8x8 Hadamard transform of the src-dst difference, then the
 * sum of absolute transform coefficients. First the rows are transformed
 * (three butterfly stages per row), then the columns, with BUTTERFLYA
 * folding the last column stage directly into the accumulator. */
1889 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1897 //FIXME try pointer walks
1898 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
1899 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
1900 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
1901 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
1903 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
1904 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
1905 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
1906 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
1908 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
1909 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
1910 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
1911 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
1915 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
1916 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
1917 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
1918 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
1920 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
1921 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
1922 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
1923 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
1926 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
1927 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
1928 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
1929 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but applied to the
 * raw source block (dummy is ignored), and with |DC| subtracted at the
 * end so the mean does not dominate the score. */
1934 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
1942 //FIXME try pointer walks
1943 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
1944 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
1945 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
1946 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
1948 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
1949 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
1950 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
1951 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
1953 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
1954 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
1955 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
1956 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
1960 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
1961 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
1962 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
1963 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
1965 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
1966 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
1967 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
1968 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
1971 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
1972 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
1973 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
1974 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
1977 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-transform the pixel difference and sum the
 * absolute DCT coefficients via the context's sum_abs_dctelem. */
1982 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
1983 MpegEncContext * const s= (MpegEncContext *)c;
1984 LOCAL_ALIGNED_16(int16_t, temp, [64]);
1988 s->dsp.diff_pixels(temp, src1, src2, stride);
1990 return s->dsp.sum_abs_dctelem(temp);
/* One-dimensional 8-point H.264-style integer DCT butterfly used by     */\
/* dct264_sad8x8_c: even part from the symmetric sums s07..s34, odd part */\
/* from the differences d07..d34 with the >>1 scaled taps.               */\
1995 const int s07 = SRC(0) + SRC(7);\
1996 const int s16 = SRC(1) + SRC(6);\
1997 const int s25 = SRC(2) + SRC(5);\
1998 const int s34 = SRC(3) + SRC(4);\
1999 const int a0 = s07 + s34;\
2000 const int a1 = s16 + s25;\
2001 const int a2 = s07 - s34;\
2002 const int a3 = s16 - s25;\
2003 const int d07 = SRC(0) - SRC(7);\
2004 const int d16 = SRC(1) - SRC(6);\
2005 const int d25 = SRC(2) - SRC(5);\
2006 const int d34 = SRC(3) - SRC(4);\
2007 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2008 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2009 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2010 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2012 DST(1, a4 + (a7>>2)) ;\
2013 DST(2, a2 + (a3>>1)) ;\
2014 DST(3, a5 + (a6>>2)) ;\
2016 DST(5, a6 - (a5>>2)) ;\
2017 DST(6, (a2>>1) - a3 ) ;\
2018 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: apply the 1-D DCT8_1D over rows (SRC/DST mapped to
 * dct[i][x]), then over columns with DST redefined to accumulate |v|. */
2021 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2022 MpegEncContext * const s= (MpegEncContext *)c;
2027 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2029 #define SRC(x) dct[i][x]
2030 #define DST(x,v) dct[i][x]= v
2031 for( i = 0; i < 8; i++ )
2036 #define SRC(x) dct[x][i]
2037 #define DST(x,v) sum += FFABS(v)
2038 for( i = 0; i < 8; i++ )
/* DCT-max metric: forward-transform the difference and return the largest
 * absolute DCT coefficient rather than the sum. */
2046 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2047 MpegEncContext * const s= (MpegEncContext *)c;
2048 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2053 s->dsp.diff_pixels(temp, src1, src2, stride);
2057 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-noise metric: DCT the difference, keep a copy (bak),
 * quantize + dequantize + IDCT it, and return the squared error between
 * the round-tripped block and the original coefficients. */
2062 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2063 MpegEncContext * const s= (MpegEncContext *)c;
2064 LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2065 int16_t * const bak = temp+64;
2071 s->dsp.diff_pixels(temp, src1, src2, stride);
2073 memcpy(bak, temp, 64*sizeof(int16_t));
2075 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2076 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2077 ff_simple_idct_8(temp); //FIXME
2080 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for one 8x8 block: quantize the DCT of the
 * difference, count the coded bits by walking the scan order and summing
 * VLC lengths (escape length for out-of-table levels), then dequantize,
 * IDCT back onto a copy of src2 and measure the real SSE distortion.
 * Returns distortion + an approximate lambda * bits term.
 * NOTE(review): branch structure (intra vs inter, run/level handling)
 * is partially elided in this dump. */
2085 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2086 MpegEncContext * const s= (MpegEncContext *)c;
2087 const uint8_t *scantable= s->intra_scantable.permutated;
2088 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2089 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2090 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2091 int i, last, run, bits, level, distortion, start_i;
2092 const int esc_length= s->ac_esc_length;
2094 uint8_t * last_length;
2098 copy_block8(lsrc1, src1, 8, stride, 8);
2099 copy_block8(lsrc2, src2, 8, stride, 8);
2101 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2103 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2109 length = s->intra_ac_vlc_length;
2110 last_length= s->intra_ac_vlc_last_length;
2111 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2114 length = s->inter_ac_vlc_length;
2115 last_length= s->inter_ac_vlc_last_length;
2120 for(i=start_i; i<last; i++){
2121 int j= scantable[i];
2126 if((level&(~127)) == 0){
2127 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2136 level= temp[i] + 64;
2140 if((level&(~127)) == 0){
2141 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2149 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2151 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2154 s->dsp.idct_add(lsrc2, 8, temp);
2156 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2158 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-count metric: same quantize-and-count-VLC-bits walk as rd8x8_c but
 * without the reconstruction/distortion stage — returns the estimated
 * number of bits needed to code the block. */
2161 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2162 MpegEncContext * const s= (MpegEncContext *)c;
2163 const uint8_t *scantable= s->intra_scantable.permutated;
2164 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2165 int i, last, run, bits, level, start_i;
2166 const int esc_length= s->ac_esc_length;
2168 uint8_t * last_length;
2172 s->dsp.diff_pixels(temp, src1, src2, stride);
2174 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2180 length = s->intra_ac_vlc_length;
2181 last_length= s->intra_ac_vlc_last_length;
2182 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2185 length = s->inter_ac_vlc_length;
2186 last_length= s->inter_ac_vlc_last_length;
2191 for(i=start_i; i<last; i++){
2192 int j= scantable[i];
2197 if((level&(~127)) == 0){
2198 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2207 level= temp[i] + 64;
2211 if((level&(~127)) == 0){
2212 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-SAD intra metric template: sums |s[x] - s[x+stride]| over the
 * block, i.e. the vertical gradient energy of a single block (the second
 * source argument is ignored). Instantiated for 8- and 16-wide blocks. */
2220 #define VSAD_INTRA(size) \
2221 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2225 for(y=1; y<h; y++){ \
2226 for(x=0; x<size; x+=4){ \
2227 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2228 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* Inter vertical SAD: gradient difference between the two blocks —
 * |(s1 vertical delta) - (s2 vertical delta)| summed over the block. */
2238 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2243 for(x=0; x<16; x++){
2244 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Squared-value helper and the squared-error counterpart of VSAD_INTRA:
 * sums squared vertical deltas instead of absolute ones. */
2253 #define SQ(a) ((a)*(a))
2254 #define VSSE_INTRA(size) \
2255 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2259 for(y=1; y<h; y++){ \
2260 for(x=0; x<size; x+=4){ \
2261 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2262 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* Inter vertical SSE: squared version of vsad16_c. */
2272 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2277 for(x=0; x<16; x++){
2278 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 and an int16 array of the
 * same length (used for trellis/approximation error measurement). */
2287 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2291 for(i=0; i<size; i++)
2292 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Build a 16x16 comparison function from an 8x8 one by summing the four
 * 8x8 quadrants (two side-by-side calls per half, advancing 8 rows
 * between halves). */
2296 #define WRAPPER8_16_SQ(name8, name16)\
2297 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
2299 score +=name8(s, dst , src , stride, 8);\
2300 score +=name8(s, dst+8 , src+8 , stride, 8);\
2304 score +=name8(s, dst , src , stride, 8);\
2305 score +=name8(s, dst+8 , src+8 , stride, 8);\
/* Instantiate the 16x16 versions of all 8x8 block metrics defined above. */
2310 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2311 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2312 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2314 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2316 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2317 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2318 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2319 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Clamp one float, operating on its IEEE-754 bit pattern: valid only when
 * min < 0 < max (callers guarantee this). Negative inputs (sign bit set)
 * compare directly against mini; positive ones are sign-flipped and
 * compared against maxisign. Falls through returning a unchanged. */
2321 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2322 uint32_t maxi, uint32_t maxisign)
2325 if(a > mini) return mini;
2326 else if((a^(1U<<31)) > maxisign) return maxi;
/* Clamp a float vector to [*min, *max] when min < 0 < max, unrolled 8x,
 * by reinterpreting the floats as uint32 bit patterns (see clipf_c_one).
 * NOTE(review): the uint32_t* casts of float storage rely on the
 * compiler tolerating this type pun — standard C would use memcpy. */
2330 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2332 uint32_t mini = *(uint32_t*)min;
2333 uint32_t maxi = *(uint32_t*)max;
2334 uint32_t maxisign = maxi ^ (1U<<31);
2335 uint32_t *dsti = (uint32_t*)dst;
2336 const uint32_t *srci = (const uint32_t*)src;
2337 for(i=0; i<len; i+=8) {
2338 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2339 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2340 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2341 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2342 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2343 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2344 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2345 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clamp a float vector to [min, max]: fast bit-pattern path when the
 * bounds straddle zero, otherwise a plain av_clipf loop unrolled 8x
 * (len is expected to be a multiple of 8). */
2348 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2350 if(min < 0 && max > 0) {
2351 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2353 for(i=0; i < len; i+=8) {
2354 dst[i ] = av_clipf(src[i ], min, max);
2355 dst[i + 1] = av_clipf(src[i + 1], min, max);
2356 dst[i + 2] = av_clipf(src[i + 2], min, max);
2357 dst[i + 3] = av_clipf(src[i + 3], min, max);
2358 dst[i + 4] = av_clipf(src[i + 4], min, max);
2359 dst[i + 5] = av_clipf(src[i + 5], min, max);
2360 dst[i + 6] = av_clipf(src[i + 6], min, max);
2361 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Dot product of two int16 vectors of the given order (length). */
2366 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2371 res += *v1++ * *v2++;
/* Combined op used by ATRAC-style codecs: accumulate v1·v2 while also
 * doing v1 += mul * v3 in the same pass. Returns the dot product.
 * NOTE(review): the accumulation of the product term is elided in this
 * dump; only the madd statement is visible. */
2376 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2381 *v1++ += mul * *v3++;
/* Apply a symmetric int16 window: the first half of window[] multiplies
 * both ends of the input (i and len-1-i), with Q15 rounding
 * ((x*w + 2^14) >> 15). */
2386 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2387 const int16_t *window, unsigned int len)
2390 int len2 = len >> 1;
2392 for (i = 0; i < len2; i++) {
2393 int16_t w = window[i];
2394 output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2395 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/**
 * Clip each int32 in src to the range [min, max] and store into dst.
 *
 * Processes eight elements per pass and always runs at least one pass;
 * len must therefore be a positive multiple of 8.
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;
        for (k = 0; k < 8; k++) {
            int32_t v = *src++;
            if (v < min)
                v = min;
            else if (v > max)
                v = max;
            *dst++ = v;
        }
        len -= 8;
    } while (len > 0);
}
/* Run the accurate jrevdct IDCT on block, then write the result to dest
 * clamped to the 0..255 pixel range. */
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    put_pixels_clamped_c(block, dest, line_size);
}
/* Run the accurate jrevdct IDCT on block, then add the result to dest
 * with clamping to the 0..255 pixel range. */
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    add_pixels_clamped_c(block, dest, line_size);
}
2426 /* init static data */
2427 av_cold void ff_dsputil_static_init(void)
2431 for(i=0;i<512;i++) {
2432 ff_squareTbl[i] = (i - 256) * (i - 256);
2435 for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2438 int ff_check_alignment(void){
2439 static int did_fail=0;
2440 LOCAL_ALIGNED_16(int, aligned, [4]);
2442 if((intptr_t)aligned & 15){
2444 #if HAVE_MMX || HAVE_ALTIVEC
2445 av_log(NULL, AV_LOG_ERROR,
2446 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2447 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2448 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2449 "Do not report crashes to Libav developers.\n");
2458 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2460 ff_check_alignment();
2463 if (avctx->bits_per_raw_sample == 10) {
2464 c->fdct = ff_jpeg_fdct_islow_10;
2465 c->fdct248 = ff_fdct248_islow_10;
2467 if(avctx->dct_algo==FF_DCT_FASTINT) {
2468 c->fdct = ff_fdct_ifast;
2469 c->fdct248 = ff_fdct_ifast248;
2471 else if(avctx->dct_algo==FF_DCT_FAAN) {
2472 c->fdct = ff_faandct;
2473 c->fdct248 = ff_faandct248;
2476 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2477 c->fdct248 = ff_fdct248_islow_8;
2480 #endif //CONFIG_ENCODERS
2482 if (avctx->bits_per_raw_sample == 10) {
2483 c->idct_put = ff_simple_idct_put_10;
2484 c->idct_add = ff_simple_idct_add_10;
2485 c->idct = ff_simple_idct_10;
2486 c->idct_permutation_type = FF_NO_IDCT_PERM;
2488 if(avctx->idct_algo==FF_IDCT_INT){
2489 c->idct_put= jref_idct_put;
2490 c->idct_add= jref_idct_add;
2491 c->idct = ff_j_rev_dct;
2492 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2493 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2494 c->idct_put= ff_faanidct_put;
2495 c->idct_add= ff_faanidct_add;
2496 c->idct = ff_faanidct;
2497 c->idct_permutation_type= FF_NO_IDCT_PERM;
2498 }else{ //accurate/default
2499 c->idct_put = ff_simple_idct_put_8;
2500 c->idct_add = ff_simple_idct_add_8;
2501 c->idct = ff_simple_idct_8;
2502 c->idct_permutation_type= FF_NO_IDCT_PERM;
2506 c->diff_pixels = diff_pixels_c;
2507 c->put_pixels_clamped = put_pixels_clamped_c;
2508 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2509 c->add_pixels_clamped = add_pixels_clamped_c;
2510 c->sum_abs_dctelem = sum_abs_dctelem_c;
2513 c->pix_sum = pix_sum_c;
2514 c->pix_norm1 = pix_norm1_c;
2516 c->fill_block_tab[0] = fill_block16_c;
2517 c->fill_block_tab[1] = fill_block8_c;
2519 /* TODO [0] 16 [1] 8 */
2520 c->pix_abs[0][0] = pix_abs16_c;
2521 c->pix_abs[0][1] = pix_abs16_x2_c;
2522 c->pix_abs[0][2] = pix_abs16_y2_c;
2523 c->pix_abs[0][3] = pix_abs16_xy2_c;
2524 c->pix_abs[1][0] = pix_abs8_c;
2525 c->pix_abs[1][1] = pix_abs8_x2_c;
2526 c->pix_abs[1][2] = pix_abs8_y2_c;
2527 c->pix_abs[1][3] = pix_abs8_xy2_c;
2529 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2530 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2531 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2532 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2533 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2534 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2535 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2536 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2537 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2539 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2540 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2541 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2542 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2543 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2544 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2545 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2546 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2547 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2549 #define dspfunc(PFX, IDX, NUM) \
2550 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2551 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2552 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2553 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2554 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2555 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2556 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2557 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2558 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2559 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2560 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2561 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2562 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2563 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2564 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2565 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2567 dspfunc(put_qpel, 0, 16);
2568 dspfunc(put_no_rnd_qpel, 0, 16);
2570 dspfunc(avg_qpel, 0, 16);
2571 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2573 dspfunc(put_qpel, 1, 8);
2574 dspfunc(put_no_rnd_qpel, 1, 8);
2576 dspfunc(avg_qpel, 1, 8);
2577 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2581 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2582 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2583 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2584 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2585 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2586 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2587 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2588 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2590 #define SET_CMP_FUNC(name) \
2591 c->name[0]= name ## 16_c;\
2592 c->name[1]= name ## 8x8_c;
2594 SET_CMP_FUNC(hadamard8_diff)
2595 c->hadamard8_diff[4]= hadamard8_intra16_c;
2596 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2597 SET_CMP_FUNC(dct_sad)
2598 SET_CMP_FUNC(dct_max)
2600 SET_CMP_FUNC(dct264_sad)
2602 c->sad[0]= pix_abs16_c;
2603 c->sad[1]= pix_abs8_c;
2607 SET_CMP_FUNC(quant_psnr)
2610 c->vsad[0]= vsad16_c;
2611 c->vsad[4]= vsad_intra16_c;
2612 c->vsad[5]= vsad_intra8_c;
2613 c->vsse[0]= vsse16_c;
2614 c->vsse[4]= vsse_intra16_c;
2615 c->vsse[5]= vsse_intra8_c;
2616 c->nsse[0]= nsse16_c;
2617 c->nsse[1]= nsse8_c;
2619 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2621 c->add_bytes= add_bytes_c;
2622 c->diff_bytes= diff_bytes_c;
2623 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2624 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2625 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2626 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2627 c->bswap_buf= bswap_buf;
2628 c->bswap16_buf = bswap16_buf;
2630 c->try_8x8basis= try_8x8basis_c;
2631 c->add_8x8basis= add_8x8basis_c;
2633 c->vector_clipf = vector_clipf_c;
2634 c->scalarproduct_int16 = scalarproduct_int16_c;
2635 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2636 c->apply_window_int16 = apply_window_int16_c;
2637 c->vector_clip_int32 = vector_clip_int32_c;
2639 c->shrink[0]= av_image_copy_plane;
2640 c->shrink[1]= ff_shrink22;
2641 c->shrink[2]= ff_shrink44;
2642 c->shrink[3]= ff_shrink88;
2644 c->add_pixels8 = add_pixels8_c;
2648 #define FUNC(f, depth) f ## _ ## depth
2649 #define FUNCC(f, depth) f ## _ ## depth ## _c
2651 c->draw_edges = FUNCC(draw_edges, 8);
2652 c->clear_block = FUNCC(clear_block, 8);
2653 c->clear_blocks = FUNCC(clear_blocks, 8);
2655 #define BIT_DEPTH_FUNCS(depth) \
2656 c->get_pixels = FUNCC(get_pixels, depth);
2658 switch (avctx->bits_per_raw_sample) {
2661 BIT_DEPTH_FUNCS(16);
2670 ff_dsputil_init_arm(c, avctx);
2672 ff_dsputil_init_bfin(c, avctx);
2674 ff_dsputil_init_ppc(c, avctx);
2676 ff_dsputil_init_sh4(c, avctx);
2678 ff_dsputil_init_vis(c, avctx);
2680 ff_dsputil_init_x86(c, avctx);
2682 ff_init_scantable_permutation(c->idct_permutation,
2683 c->idct_permutation_type);