3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Clipping LUT (indexed via +MAX_NEG_CROP offset) and a squares LUT used as
 * sq = ff_squareTbl + 256 by the SSE/norm functions below.
 * NOTE(review): both are only zero-initialized here; presumably filled at
 * runtime init elsewhere — confirm against the init code (not in this chunk). */
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44 uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_template.c"
51 #include "dsputil_template.c"
55 #include "dsputil_template.c"
57 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 is 0x0101...01 for the native word width, so multiplying by a
 * byte value broadcasts that byte into every byte of the word. */
58 #define pb_7f (~0UL/255 * 0x7f)
59 #define pb_80 (~0UL/255 * 0x80)
/* Standard zigzag scan order: entry i is the raster-order index of the
 * i-th coefficient in scan order. */
61 const uint8_t ff_zigzag_direct[64] = {
62 0, 1, 8, 16, 9, 2, 3, 10,
63 17, 24, 32, 25, 18, 11, 4, 5,
64 12, 19, 26, 33, 40, 48, 41, 34,
65 27, 20, 13, 6, 7, 14, 21, 28,
66 35, 42, 49, 56, 57, 50, 43, 36,
67 29, 22, 15, 23, 30, 37, 44, 51,
68 58, 59, 52, 45, 38, 31, 39, 46,
69 53, 60, 61, 54, 47, 55, 62, 63
/* NOTE(review): closing "};" not visible — source appears truncated here. */
72 /* Specific zigzag scan for 248 idct. NOTE that unlike the
73 specification, we interleave the fields */
74 const uint8_t ff_zigzag248_direct[64] = {
75 0, 8, 1, 9, 16, 24, 2, 10,
76 17, 25, 32, 40, 48, 56, 33, 41,
77 18, 26, 3, 11, 4, 12, 19, 27,
78 34, 42, 49, 57, 50, 58, 35, 43,
79 20, 28, 5, 13, 6, 14, 21, 29,
80 36, 44, 51, 59, 52, 60, 37, 45,
81 22, 30, 7, 15, 23, 31, 38, 46,
82 53, 61, 54, 62, 39, 47, 55, 63,
/* NOTE(review): closing "};" not visible — source appears truncated here. */
85 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Filled at runtime (not visible in this chunk) — per the comment above,
 * holds the non-permutated inverse of zigzag_direct, +1, for the MMX quantizer. */
86 DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (raster indices in scan order). */
88 const uint8_t ff_alternate_horizontal_scan[64] = {
89 0, 1, 2, 3, 8, 9, 16, 17,
90 10, 11, 4, 5, 6, 7, 15, 14,
91 13, 12, 19, 18, 24, 25, 32, 33,
92 26, 27, 20, 21, 22, 23, 28, 29,
93 30, 31, 34, 35, 40, 41, 48, 49,
94 42, 43, 36, 37, 38, 39, 44, 45,
95 46, 47, 50, 51, 56, 57, 58, 59,
96 52, 53, 54, 55, 60, 61, 62, 63,
/* NOTE(review): closing "};" not visible — source appears truncated here. */
/* Alternate vertical scan order (raster indices in scan order). */
99 const uint8_t ff_alternate_vertical_scan[64] = {
100 0, 8, 16, 24, 1, 9, 2, 10,
101 17, 25, 32, 40, 48, 56, 57, 49,
102 41, 33, 26, 18, 3, 11, 4, 12,
103 19, 27, 34, 42, 50, 58, 35, 43,
104 51, 59, 20, 28, 5, 13, 6, 14,
105 21, 29, 36, 44, 52, 60, 37, 45,
106 53, 61, 22, 30, 7, 15, 23, 31,
107 38, 46, 54, 62, 39, 47, 55, 63,
/* NOTE(review): closing "};" not visible — source appears truncated here. */
110 /* Input permutation for the simple_idct_mmx */
111 static const uint8_t simple_mmx_permutation[64]={
112 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
113 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
114 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
115 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
116 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
117 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
118 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
119 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* NOTE(review): closing "};" not visible — source appears truncated here. */
122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Build a ScanTable from a raw scan order: stores the source table, fills
 * st->permutated[] by applying the IDCT permutation to each scan entry, and
 * fills st->raster_end[] (last significant position per scan prefix).
 * NOTE(review): loop headers, braces and the "end" bookkeeping are missing
 * here — the source appears truncated; intent inferred from visible lines. */
124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
128 st->scantable= src_scantable;
132 j = src_scantable[i];
133 st->permutated[i] = permutation[j];
142 j = st->permutated[i];
144 st->raster_end[i]= end;
/* Fill idct_permutation[0..63] according to the selected permutation type.
 * Each case maps coefficient index i to the position the IDCT implementation
 * expects; the default case flags an unset/unknown permutation type.
 * NOTE(review): per-case loop headers, break statements and braces are not
 * visible here — source appears truncated. */
148 void ff_init_scantable_permutation(uint8_t *idct_permutation,
149 int idct_permutation_type)
153 switch(idct_permutation_type){
154 case FF_NO_IDCT_PERM:
/* identity mapping */
156 idct_permutation[i]= i;
158 case FF_LIBMPEG2_IDCT_PERM:
/* swap bit0 and bits1-2 within each row of 8 */
160 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
162 case FF_SIMPLE_IDCT_PERM:
164 idct_permutation[i]= simple_mmx_permutation[i];
166 case FF_TRANSPOSE_IDCT_PERM:
/* full 8x8 transpose: swap row and column */
168 idct_permutation[i]= ((i&7)<<3) | (i>>3);
170 case FF_PARTTRANS_IDCT_PERM:
/* partial transpose: swap only the low 2 bits of row/column */
172 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
174 case FF_SSE2_IDCT_PERM:
176 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
179 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/* Sum of all pixel values of a 16x16 block; line_size is the row stride
 * (pix advances by line_size - 16 after consuming 16 pixels per row).
 * NOTE(review): accumulator declarations, the inner 8-pixel additions and the
 * return are not visible — source appears truncated. */
183 static int pix_sum_c(uint8_t * pix, int line_size)
188 for (i = 0; i < 16; i++) {
189 for (j = 0; j < 16; j += 8) {
200 pix += line_size - 16;
/* Sum of squared pixel values of a 16x16 block, using the squares LUT
 * (sq = ff_squareTbl + 256 so signed indices would also resolve).
 * Two inner paths: one reads 8 pixels as a single 64-bit word, the other as
 * two 32-bit words — presumably selected by a fast-64-bit #if (the
 * conditional itself is not visible; source appears truncated). */
205 static int pix_norm1_c(uint8_t * pix, int line_size)
208 uint32_t *sq = ff_squareTbl + 256;
211 for (i = 0; i < 16; i++) {
212 for (j = 0; j < 16; j += 8) {
/* 64-bit path: extract 8 bytes from one load and accumulate their squares */
224 register uint64_t x=*(uint64_t*)pix;
226 s += sq[(x>>8)&0xff];
227 s += sq[(x>>16)&0xff];
228 s += sq[(x>>24)&0xff];
229 s += sq[(x>>32)&0xff];
230 s += sq[(x>>40)&0xff];
231 s += sq[(x>>48)&0xff];
232 s += sq[(x>>56)&0xff];
/* 32-bit path: two 4-byte loads per group of 8 pixels */
234 register uint32_t x=*(uint32_t*)pix;
236 s += sq[(x>>8)&0xff];
237 s += sq[(x>>16)&0xff];
238 s += sq[(x>>24)&0xff];
239 x=*(uint32_t*)(pix+4);
241 s += sq[(x>>8)&0xff];
242 s += sq[(x>>16)&0xff];
243 s += sq[(x>>24)&0xff];
248 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst (may alias), unrolled by 8;
 * the trailing line handles the remainder, presumably inside a cleanup loop
 * whose header is not visible — source appears truncated. */
253 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
256 for(i=0; i+8<=w; i+=8){
257 dst[i+0]= av_bswap32(src[i+0]);
258 dst[i+1]= av_bswap32(src[i+1]);
259 dst[i+2]= av_bswap32(src[i+2]);
260 dst[i+3]= av_bswap32(src[i+3]);
261 dst[i+4]= av_bswap32(src[i+4]);
262 dst[i+5]= av_bswap32(src[i+5]);
263 dst[i+6]= av_bswap32(src[i+6]);
264 dst[i+7]= av_bswap32(src[i+7]);
267 dst[i+0]= av_bswap32(src[i+0]);
/* Byte-swap len 16-bit values from src into dst.
 * NOTE(review): the surrounding loop (presumably over len) is not visible —
 * source appears truncated. */
271 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
274 *dst++ = av_bswap16(*src++);
/* Sum of squared errors between two 4-pixel-wide blocks over h rows.
 * sq is centered at +256 so negative byte differences index correctly.
 * NOTE(review): the per-row pointer advances by line_size and the return of
 * the accumulator are not visible — source appears truncated. */
277 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
280 uint32_t *sq = ff_squareTbl + 256;
283 for (i = 0; i < h; i++) {
284 s += sq[pix1[0] - pix2[0]];
285 s += sq[pix1[1] - pix2[1]];
286 s += sq[pix1[2] - pix2[2]];
287 s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors between two 8-pixel-wide blocks over h rows;
 * same structure as sse4_c above, unrolled to 8 pixels per row. */
294 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
297 uint32_t *sq = ff_squareTbl + 256;
300 for (i = 0; i < h; i++) {
301 s += sq[pix1[0] - pix2[0]];
302 s += sq[pix1[1] - pix2[1]];
303 s += sq[pix1[2] - pix2[2]];
304 s += sq[pix1[3] - pix2[3]];
305 s += sq[pix1[4] - pix2[4]];
306 s += sq[pix1[5] - pix2[5]];
307 s += sq[pix1[6] - pix2[6]];
308 s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors between two 16-pixel-wide blocks over h rows;
 * same structure as sse4_c/sse8_c, unrolled to 16 pixels per row. */
315 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
318 uint32_t *sq = ff_squareTbl + 256;
321 for (i = 0; i < h; i++) {
322 s += sq[pix1[ 0] - pix2[ 0]];
323 s += sq[pix1[ 1] - pix2[ 1]];
324 s += sq[pix1[ 2] - pix2[ 2]];
325 s += sq[pix1[ 3] - pix2[ 3]];
326 s += sq[pix1[ 4] - pix2[ 4]];
327 s += sq[pix1[ 5] - pix2[ 5]];
328 s += sq[pix1[ 6] - pix2[ 6]];
329 s += sq[pix1[ 7] - pix2[ 7]];
330 s += sq[pix1[ 8] - pix2[ 8]];
331 s += sq[pix1[ 9] - pix2[ 9]];
332 s += sq[pix1[10] - pix2[10]];
333 s += sq[pix1[11] - pix2[11]];
334 s += sq[pix1[12] - pix2[12]];
335 s += sq[pix1[13] - pix2[13]];
336 s += sq[pix1[14] - pix2[14]];
337 s += sq[pix1[15] - pix2[15]];
/* Store per-pixel differences s1 - s2 of an 8-wide block into DCT
 * coefficients. NOTE(review): the row loop and the per-row advances of
 * block/s1/s2 by stride are not visible — source appears truncated. */
345 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
346 const uint8_t *s2, int stride){
349 /* read the pixels */
351 block[0] = s1[0] - s2[0];
352 block[1] = s1[1] - s2[1];
353 block[2] = s1[2] - s2[2];
354 block[3] = s1[3] - s2[3];
355 block[4] = s1[4] - s2[4];
356 block[5] = s1[5] - s2[5];
357 block[6] = s1[6] - s2[6];
358 block[7] = s1[7] - s2[7];
/* Clamp 8-wide rows of DCT coefficients to [0,255] and store them as pixels.
 * NOTE(review): the row loop and per-row pointer advances are not visible —
 * source appears truncated. */
366 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
371 /* read the pixels */
373 pixels[0] = av_clip_uint8(block[0]);
374 pixels[1] = av_clip_uint8(block[1]);
375 pixels[2] = av_clip_uint8(block[2]);
376 pixels[3] = av_clip_uint8(block[3]);
377 pixels[4] = av_clip_uint8(block[4]);
378 pixels[5] = av_clip_uint8(block[5]);
379 pixels[6] = av_clip_uint8(block[6]);
380 pixels[7] = av_clip_uint8(block[7]);
/* 4-wide variant of ff_put_pixels_clamped_c: clamp and store 4 coefficients
 * per row. (Row loop not visible — source appears truncated.) */
387 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
392 /* read the pixels */
394 pixels[0] = av_clip_uint8(block[0]);
395 pixels[1] = av_clip_uint8(block[1]);
396 pixels[2] = av_clip_uint8(block[2]);
397 pixels[3] = av_clip_uint8(block[3]);
/* 2-wide variant: clamp and store 2 coefficients per row. */
404 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
409 /* read the pixels */
411 pixels[0] = av_clip_uint8(block[0]);
412 pixels[1] = av_clip_uint8(block[1]);
/* Store an 8x8 block of signed coefficients as pixels, mapping the signed
 * range to unsigned by adding 128; values below -128 / above 127 are clamped
 * (the clamp branches around line 429 are only partially visible —
 * source appears truncated). */
419 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
420 uint8_t *restrict pixels,
425 for (i = 0; i < 8; i++) {
426 for (j = 0; j < 8; j++) {
429 else if (*block > 127)
432 *pixels = (uint8_t)(*block + 128);
436 pixels += (line_size - 8);
/* Add 8-wide rows of DCT coefficients onto existing pixels with clamping to
 * [0,255]. (Row loop and pointer advances not visible — source truncated.) */
440 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
445 /* read the pixels */
447 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
448 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
449 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
450 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
451 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
452 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
453 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
454 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
/* 4-wide variant of ff_add_pixels_clamped_c. */
460 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
465 /* read the pixels */
467 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
468 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
469 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
470 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
/* 2-wide variant of ff_add_pixels_clamped_c. */
476 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
481 /* read the pixels */
483 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
484 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
/* Sum of absolute values of the block's coefficients.
 * NOTE(review): the loop over i and the return are not visible —
 * source appears truncated. */
490 static int sum_abs_dctelem_c(DCTELEM *block)
494 sum+= FFABS(block[i]);
/* Fill a 16-wide block with a constant byte, h rows.
 * (The per-row "block += line_size" is not visible — source truncated.) */
498 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
502 for (i = 0; i < h; i++) {
503 memset(block, value, 16);
/* 8-wide variant of fill_block16_c. */
508 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
512 for (i = 0; i < h; i++) {
513 memset(block, value, 8);
/*
 * Rounded pixel averages used by the motion-compensation code.
 *   avg2: rounded mean of two values,  (a + b + 1) >> 1
 *   avg4: rounded mean of four values, (a + b + c + d + 2) >> 2
 *
 * Arguments are fully parenthesized so expression arguments expand with the
 * intended precedence (e.g. avg2(x << 1, y) — unparenthesized, the '+' would
 * bind into the shift). Arguments are still evaluated once per appearance,
 * so avoid side-effecting arguments.
 */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/* Single-parameter GMC: bilinear interpolation with 1/16-pel fractional
 * offsets (x16, y16 in 0..16). A..D are the four corner weights summing to
 * 256; each output is the weighted sum of a 2x2 neighbourhood, plus the
 * rounder, shifted down by 8. (The h-row loop and pointer advances are not
 * visible — source appears truncated.) */
521 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
523 const int A=(16-x16)*(16-y16);
524 const int B=( x16)*(16-y16);
525 const int C=(16-x16)*( y16);
526 const int D=( x16)*( y16);
531 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
532 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
533 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
534 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
535 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
536 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
537 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
538 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General (affine) global motion compensation: for each destination pixel,
 * computes a sub-pel source coordinate from the affine parameters and
 * bilinearly interpolates; coordinates outside [0,width]x[0,height] are
 * clamped to the nearest valid sample. s = 1<<shift is the sub-pel scale.
 * NOTE(review): the y loop, the per-pixel coordinate/fraction computation and
 * several closing braces are not visible — source appears truncated. */
544 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
545 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
548 const int s= 1<<shift;
558 for(x=0; x<8; x++){ //XXX FIXME optimize
559 int src_x, src_y, frac_x, frac_y, index;
/* fully inside the source: plain bilinear interpolation */
568 if((unsigned)src_x < width){
569 if((unsigned)src_y < height){
570 index= src_x + src_y*stride;
571 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
572 + src[index +1]* frac_x )*(s-frac_y)
573 + ( src[index+stride ]*(s-frac_x)
574 + src[index+stride+1]* frac_x )* frac_y
/* y clamped: interpolate horizontally only */
577 index= src_x + av_clip(src_y, 0, height)*stride;
578 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
579 + src[index +1]* frac_x )*s
/* x clamped: interpolate vertically only */
583 if((unsigned)src_y < height){
584 index= av_clip(src_x, 0, width) + src_y*stride;
585 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
586 + src[index+stride ]* frac_y )*s
/* both clamped: nearest sample */
589 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
590 dst[y*stride + x]= src[index ];
/* Third-pel ("tpel") put interpolators. The mcXY suffix encodes the
 * fractional position in thirds (X horizontal, Y vertical; 0/1/2 = 0, 1/3,
 * 2/3). 683 and 2731 are multiply-shift approximations of division by 3 and
 * by 12 (683*3 = 2049 ~ 2^11; 2731*12 = 32772 ~ 2^15).
 * NOTE(review): loop braces and the per-row src/dst stride advances are not
 * visible throughout this family — source appears truncated. */
602 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
604 case 2: put_pixels2_8_c (dst, src, stride, height); break;
605 case 4: put_pixels4_8_c (dst, src, stride, height); break;
606 case 8: put_pixels8_8_c (dst, src, stride, height); break;
607 case 16:put_pixels16_8_c(dst, src, stride, height); break;
/* 1/3-pel horizontal: weights 2:1 on src[j], src[j+1] */
611 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
613 for (i=0; i < height; i++) {
614 for (j=0; j < width; j++) {
615 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* 2/3-pel horizontal: weights 1:2 */
622 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
624 for (i=0; i < height; i++) {
625 for (j=0; j < width; j++) {
626 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* 1/3-pel vertical: weights 2:1 on src[j], src[j+stride] */
633 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
635 for (i=0; i < height; i++) {
636 for (j=0; j < width; j++) {
637 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* 1/3 horizontal + 1/3 vertical: 2x2 weights 4:3:3:2 */
644 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
646 for (i=0; i < height; i++) {
647 for (j=0; j < width; j++) {
648 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* 1/3 horizontal + 2/3 vertical: weights 3:2:4:3 */
655 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
657 for (i=0; i < height; i++) {
658 for (j=0; j < width; j++) {
659 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* 2/3-pel vertical: weights 1:2 */
666 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
668 for (i=0; i < height; i++) {
669 for (j=0; j < width; j++) {
670 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* 2/3 horizontal + 1/3 vertical: weights 3:4:2:3 */
677 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
679 for (i=0; i < height; i++) {
680 for (j=0; j < width; j++) {
681 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* 2/3 horizontal + 2/3 vertical: weights 2:3:3:4 */
688 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
690 for (i=0; i < height; i++) {
691 for (j=0; j < width; j++) {
692 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Third-pel averaging interpolators: identical interpolation to the
 * put_tpel_* family above, but the result is averaged (with rounding) into
 * the existing dst pixels. Same truncation caveats as above. */
699 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
701 case 2: avg_pixels2_8_c (dst, src, stride, height); break;
702 case 4: avg_pixels4_8_c (dst, src, stride, height); break;
703 case 8: avg_pixels8_8_c (dst, src, stride, height); break;
704 case 16:avg_pixels16_8_c(dst, src, stride, height); break;
708 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
710 for (i=0; i < height; i++) {
711 for (j=0; j < width; j++) {
712 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
719 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
721 for (i=0; i < height; i++) {
722 for (j=0; j < width; j++) {
723 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
730 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
732 for (i=0; i < height; i++) {
733 for (j=0; j < width; j++) {
734 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
741 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
743 for (i=0; i < height; i++) {
744 for (j=0; j < width; j++) {
745 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
752 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
754 for (i=0; i < height; i++) {
755 for (j=0; j < width; j++) {
756 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
763 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
765 for (i=0; i < height; i++) {
766 for (j=0; j < width; j++) {
767 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
774 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
776 for (i=0; i < height; i++) {
777 for (j=0; j < width; j++) {
778 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
785 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
787 for (i=0; i < height; i++) {
788 for (j=0; j < width; j++) {
789 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
796 #define QPEL_MC(r, OPNAME, RND, OP) \
797 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
798 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
802 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
803 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
804 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
805 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
806 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
807 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
808 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
809 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
815 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
817 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
821 const int src0= src[0*srcStride];\
822 const int src1= src[1*srcStride];\
823 const int src2= src[2*srcStride];\
824 const int src3= src[3*srcStride];\
825 const int src4= src[4*srcStride];\
826 const int src5= src[5*srcStride];\
827 const int src6= src[6*srcStride];\
828 const int src7= src[7*srcStride];\
829 const int src8= src[8*srcStride];\
830 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
831 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
832 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
833 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
834 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
835 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
836 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
837 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
843 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
844 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
849 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
850 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
851 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
852 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
853 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
854 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
855 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
856 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
857 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
858 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
859 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
860 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
861 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
862 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
863 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
864 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
870 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
871 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
876 const int src0= src[0*srcStride];\
877 const int src1= src[1*srcStride];\
878 const int src2= src[2*srcStride];\
879 const int src3= src[3*srcStride];\
880 const int src4= src[4*srcStride];\
881 const int src5= src[5*srcStride];\
882 const int src6= src[6*srcStride];\
883 const int src7= src[7*srcStride];\
884 const int src8= src[8*srcStride];\
885 const int src9= src[9*srcStride];\
886 const int src10= src[10*srcStride];\
887 const int src11= src[11*srcStride];\
888 const int src12= src[12*srcStride];\
889 const int src13= src[13*srcStride];\
890 const int src14= src[14*srcStride];\
891 const int src15= src[15*srcStride];\
892 const int src16= src[16*srcStride];\
893 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
894 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
895 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
896 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
897 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
898 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
899 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
900 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
901 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
902 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
903 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
904 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
905 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
906 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
907 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
908 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
914 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
916 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
917 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
920 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
921 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
924 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
926 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
927 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
930 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
933 copy_block9(full, src, 16, stride, 9);\
934 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
935 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
938 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
940 copy_block9(full, src, 16, stride, 9);\
941 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
944 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
947 copy_block9(full, src, 16, stride, 9);\
948 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
949 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
951 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
956 copy_block9(full, src, 16, stride, 9);\
957 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
958 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
959 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
960 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
962 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
966 copy_block9(full, src, 16, stride, 9);\
967 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
968 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
969 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
970 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
972 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
977 copy_block9(full, src, 16, stride, 9);\
978 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
979 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
980 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
981 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
983 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
987 copy_block9(full, src, 16, stride, 9);\
988 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
989 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
990 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
991 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
993 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
998 copy_block9(full, src, 16, stride, 9);\
999 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1000 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1001 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1002 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1004 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1005 uint8_t full[16*9];\
1007 uint8_t halfHV[64];\
1008 copy_block9(full, src, 16, stride, 9);\
1009 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1010 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1011 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1012 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1014 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1015 uint8_t full[16*9];\
1018 uint8_t halfHV[64];\
1019 copy_block9(full, src, 16, stride, 9);\
1020 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1021 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1022 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1023 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1025 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1026 uint8_t full[16*9];\
1028 uint8_t halfHV[64];\
1029 copy_block9(full, src, 16, stride, 9);\
1030 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1031 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1032 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1033 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1035 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1037 uint8_t halfHV[64];\
1038 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1039 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1040 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1042 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1044 uint8_t halfHV[64];\
1045 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1046 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1047 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1049 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1050 uint8_t full[16*9];\
1053 uint8_t halfHV[64];\
1054 copy_block9(full, src, 16, stride, 9);\
1055 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1056 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1057 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1058 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1060 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1061 uint8_t full[16*9];\
1063 copy_block9(full, src, 16, stride, 9);\
1064 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1065 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1066 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1068 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1069 uint8_t full[16*9];\
1072 uint8_t halfHV[64];\
1073 copy_block9(full, src, 16, stride, 9);\
1074 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1075 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1076 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1077 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1079 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1080 uint8_t full[16*9];\
1082 copy_block9(full, src, 16, stride, 9);\
1083 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1084 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1085 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1087 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1089 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1090 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1093 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1095 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1096 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1099 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1100 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1103 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1105 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1106 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1109 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1110 uint8_t full[24*17];\
1112 copy_block17(full, src, 24, stride, 17);\
1113 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1114 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1117 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1118 uint8_t full[24*17];\
1119 copy_block17(full, src, 24, stride, 17);\
1120 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1123 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1124 uint8_t full[24*17];\
1126 copy_block17(full, src, 24, stride, 17);\
1127 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1128 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1130 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1131 uint8_t full[24*17];\
1132 uint8_t halfH[272];\
1133 uint8_t halfV[256];\
1134 uint8_t halfHV[256];\
1135 copy_block17(full, src, 24, stride, 17);\
1136 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1137 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1138 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1139 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1141 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1142 uint8_t full[24*17];\
1143 uint8_t halfH[272];\
1144 uint8_t halfHV[256];\
1145 copy_block17(full, src, 24, stride, 17);\
1146 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1147 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1148 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1149 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1151 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1152 uint8_t full[24*17];\
1153 uint8_t halfH[272];\
1154 uint8_t halfV[256];\
1155 uint8_t halfHV[256];\
1156 copy_block17(full, src, 24, stride, 17);\
1157 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1158 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1159 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1160 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1162 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1163 uint8_t full[24*17];\
1164 uint8_t halfH[272];\
1165 uint8_t halfHV[256];\
1166 copy_block17(full, src, 24, stride, 17);\
1167 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1168 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1169 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1170 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1172 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1173 uint8_t full[24*17];\
1174 uint8_t halfH[272];\
1175 uint8_t halfV[256];\
1176 uint8_t halfHV[256];\
1177 copy_block17(full, src, 24, stride, 17);\
1178 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1179 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1180 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1181 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1183 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1184 uint8_t full[24*17];\
1185 uint8_t halfH[272];\
1186 uint8_t halfHV[256];\
1187 copy_block17(full, src, 24, stride, 17);\
1188 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1189 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1190 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1191 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1193 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1194 uint8_t full[24*17];\
1195 uint8_t halfH[272];\
1196 uint8_t halfV[256];\
1197 uint8_t halfHV[256];\
1198 copy_block17(full, src, 24, stride, 17);\
1199 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1200 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1201 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1202 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1204 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1205 uint8_t full[24*17];\
1206 uint8_t halfH[272];\
1207 uint8_t halfHV[256];\
1208 copy_block17(full, src, 24, stride, 17);\
1209 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1210 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1211 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1212 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1214 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1215 uint8_t halfH[272];\
1216 uint8_t halfHV[256];\
1217 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1218 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1219 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1221 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1222 uint8_t halfH[272];\
1223 uint8_t halfHV[256];\
1224 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1225 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1226 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1228 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1229 uint8_t full[24*17];\
1230 uint8_t halfH[272];\
1231 uint8_t halfV[256];\
1232 uint8_t halfHV[256];\
1233 copy_block17(full, src, 24, stride, 17);\
1234 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1235 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1236 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1237 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1239 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1240 uint8_t full[24*17];\
1241 uint8_t halfH[272];\
1242 copy_block17(full, src, 24, stride, 17);\
1243 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1244 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1245 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1247 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1248 uint8_t full[24*17];\
1249 uint8_t halfH[272];\
1250 uint8_t halfV[256];\
1251 uint8_t halfHV[256];\
1252 copy_block17(full, src, 24, stride, 17);\
1253 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1254 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1255 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1256 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1258 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1259 uint8_t full[24*17];\
1260 uint8_t halfH[272];\
1261 copy_block17(full, src, 24, stride, 17);\
1262 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1263 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1264 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1266 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1267 uint8_t halfH[272];\
1268 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1269 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store/average operators plugged into the QPEL_MC macro: (b) is an
   un-normalized filter sum, scaled by >>5 and clipped via the crop table cm.
   The "_no_rnd" variants bias by 15 instead of 16 (truncating rounding). */
1272 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1273 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1274 #define op_put(a, b) a = cm[((b) + 16)>>5]
1275 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the put, put_no_rnd and avg families of qpel MC functions. */
1277 QPEL_MC(0, put_ , _ , op_put)
1278 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1279 QPEL_MC(0, avg_ , _ , op_avg)
1280 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1282 #undef op_avg_no_rnd
1284 #undef op_put_no_rnd
/* The full-pel (mc00) cases need no filtering: alias them to plain copies/averages. */
1286 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1287 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1288 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1289 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1290 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1291 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
/* WMV2 horizontal half-pel lowpass: dst[x] = clip((9*(s[x]+s[x+1]) -
   (s[x-1]+s[x+2]) + 8) >> 4) for 8 output pixels per row.
   NOTE(review): the per-row loop over h and the dst/src stride advances are
   not visible in this extract — only one row's body is shown; confirm against
   the original file. */
1293 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1294 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1298 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1299 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1300 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1301 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1302 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1303 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1304 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1305 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* RV40 uses the plain half-pel xy2 averaging kernels for its (3,3)
   quarter-pel position; these thin wrappers just forward to them. */
1311 #if CONFIG_RV40_DECODER
1312 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1313 put_pixels16_xy2_8_c(dst, src, stride, 16);
1315 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1316 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1318 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1319 put_pixels8_xy2_8_c(dst, src, stride, 8);
1321 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1322 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1324 #endif /* CONFIG_RV40_DECODER */
/* Dirac pixel ops: wrappers around the generic 8-bit pixel copy/l2/l4
   kernels. The 32-pixel-wide variants are built from two 16-wide calls.
   NOTE(review): the closing brace lines of each generated function and the
   DIRAC_MC(...) instantiation lines are not visible in this extract. */
1326 #if CONFIG_DIRAC_DECODER
1327 #define DIRAC_MC(OPNAME)\
1328 void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1330 OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
1332 void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1334 OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
1336 void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1338 OPNAME ## _pixels16_8_c(dst , src[0] , stride, h);\
1339 OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
1341 void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1343 OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1345 void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1347 OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1349 void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1351 OPNAME ## _pixels16_l2_8(dst , src[0] , src[1] , stride, stride, stride, h);\
1352 OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
1354 void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1356 OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1358 void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1360 OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1362 void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1364 OPNAME ## _pixels16_l4_8(dst , src[0] , src[1] , src[2] , src[3] , stride, stride, stride, stride, stride, h);\
1365 OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
/* WMV2 vertical half-pel lowpass: same 4-tap (-1,9,9,-1)/16 kernel as the
   horizontal version, applied down a column of 8 output pixels.
   NOTE(review): the loop over w columns and the per-column pointer advance
   are not visible in this extract — only one column's body is shown. */
1371 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1372 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1376 const int src_1= src[ -srcStride];
1377 const int src0 = src[0 ];
1378 const int src1 = src[ srcStride];
1379 const int src2 = src[2*srcStride];
1380 const int src3 = src[3*srcStride];
1381 const int src4 = src[4*srcStride];
1382 const int src5 = src[5*srcStride];
1383 const int src6 = src[6*srcStride];
1384 const int src7 = src[7*srcStride];
1385 const int src8 = src[8*srcStride];
1386 const int src9 = src[9*srcStride];
1387 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1388 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1389 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1390 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1391 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1392 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1393 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1394 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation positions, built from the h/v lowpass
   kernels above: mcXY names encode the sub-pel phase (X horizontal,
   Y vertical, in half-pel-pair units). The mc12/mc32/mc22 cases filter an
   11-row region starting one row above src so the vertical pass has
   context. NOTE(review): local half/halfH/halfV/halfHV buffer declarations
   and closing braces are not visible in this extract. */
1400 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1402 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1403 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1406 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1407 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1410 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1412 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1413 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1416 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1417 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1420 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1424 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1425 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1426 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1427 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1429 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1433 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1434 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1435 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1436 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1438 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1440 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1441 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 Annex J deblocking, vertical filter (works on a horizontal block
   edge): p0..p3 span the edge, d is the gradient estimate, d1 the
   strength-limited correction applied across p1/p2, d2 a secondary
   correction on p0/p3. The (p&256) tests clip values that under/overflowed
   the 0..255 range after applying d1. NOTE(review): loop header over x,
   the d1 computation of p1/p2, the ad1 definition and closing braces are
   not visible in this extract. */
1444 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1445 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1447 const int strength= ff_h263_loop_filter_strength[qscale];
1451 int p0= src[x-2*stride];
1452 int p1= src[x-1*stride];
1453 int p2= src[x+0*stride];
1454 int p3= src[x+1*stride];
1455 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1457 if (d<-2*strength) d1= 0;
1458 else if(d<- strength) d1=-2*strength - d;
1459 else if(d< strength) d1= d;
1460 else if(d< 2*strength) d1= 2*strength - d;
1465 if(p1&256) p1= ~(p1>>31);
1466 if(p2&256) p2= ~(p2>>31);
1468 src[x-1*stride] = p1;
1469 src[x+0*stride] = p2;
1473 d2= av_clip((p0-p3)/4, -ad1, ad1);
1475 src[x-2*stride] = p0 - d2;
1476 src[x+ stride] = p3 + d2;
/* H.263 Annex J deblocking, horizontal filter (vertical block edge):
   identical math to h263_v_loop_filter_c but indexing along a row.
   NOTE(review): loop header over y, the d1 application and ad1 definition
   are not visible in this extract. */
1481 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1482 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1484 const int strength= ff_h263_loop_filter_strength[qscale];
1488 int p0= src[y*stride-2];
1489 int p1= src[y*stride-1];
1490 int p2= src[y*stride+0];
1491 int p3= src[y*stride+1];
1492 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1494 if (d<-2*strength) d1= 0;
1495 else if(d<- strength) d1=-2*strength - d;
1496 else if(d< strength) d1= d;
1497 else if(d< 2*strength) d1= 2*strength - d;
1502 if(p1&256) p1= ~(p1>>31);
1503 if(p2&256) p2= ~(p2>>31);
1505 src[y*stride-1] = p1;
1506 src[y*stride+0] = p2;
1510 d2= av_clip((p0-p3)/4, -ad1, ad1);
1512 src[y*stride-2] = p0 - d2;
1513 src[y*stride+1] = p3 + d2;
/* H.261 loop filter: separable [1 2 1]/4 smoothing over an 8x8 block.
   A temp[] plane holds 4x-scaled intermediates (edges copied with weight 4,
   interior rows vertically filtered), then the horizontal pass normalizes
   by >>4. NOTE(review): temp declaration, loop headers and yz computation
   are not visible in this extract. */
1518 static void h261_loop_filter_c(uint8_t *src, int stride){
1523 temp[x ] = 4*src[x ];
1524 temp[x + 7*8] = 4*src[x + 7*stride];
1528 xy = y * stride + x;
1530 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1535 src[ y*stride] = (temp[ y*8] + 2)>>2;
1536 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1538 xy = y * stride + x;
1540 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/**
 * Sum of absolute differences (SAD) over a 16-pixel-wide block.
 *
 * The extract of this function was garbled (embedded line-number prefixes,
 * missing accumulator, row loop and return); this restores the canonical
 * unrolled form: for each of h rows, accumulate |pix1[i] - pix2[i]| for
 * i = 0..15, then advance both pointers by line_size.
 *
 * @param v         unused context pointer (me_cmp_func signature)
 * @param pix1      first block (reference)
 * @param pix2      second block (candidate)
 * @param line_size stride in bytes of both blocks
 * @param h         number of rows to compare
 * @return          the SAD of the two blocks
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0]  - pix2[0]);
        s += abs(pix1[1]  - pix2[1]);
        s += abs(pix1[2]  - pix2[2]);
        s += abs(pix1[3]  - pix2[3]);
        s += abs(pix1[4]  - pix2[4]);
        s += abs(pix1[5]  - pix2[5]);
        s += abs(pix1[6]  - pix2[6]);
        s += abs(pix1[7]  - pix2[7]);
        s += abs(pix1[8]  - pix2[8]);
        s += abs(pix1[9]  - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* SAD of pix1 against the horizontal half-pel interpolation of pix2
   (avg2 of each pixel and its right neighbour; avg2 is defined elsewhere
   in this file). NOTE(review): accumulator init, row loop and return are
   not visible in this extract. */
1573 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1579 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1580 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1581 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1582 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1583 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1584 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1585 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1586 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1587 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1588 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1589 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1590 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1591 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1592 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1593 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1594 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of pix1 against the vertical half-pel interpolation of pix2
   (avg2 of each pixel and the one a row below, via pix3 = pix2+line_size).
   NOTE(review): accumulator init, row loop (which also advances pix3) and
   return are not visible in this extract. */
1601 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1604 uint8_t *pix3 = pix2 + line_size;
1608 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1609 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1610 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1611 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1612 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1613 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1614 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1615 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1616 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1617 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1618 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1619 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1620 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1621 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1622 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1623 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of pix1 against the diagonal half-pel interpolation of pix2
   (avg4 of a 2x2 neighbourhood; avg4 is defined elsewhere in this file).
   NOTE(review): accumulator init, row loop and return are not visible in
   this extract. */
1631 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1634 uint8_t *pix3 = pix2 + line_size;
1638 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1639 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1640 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1641 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1642 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1643 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1644 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1645 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1646 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1647 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1648 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1649 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1650 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1651 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1652 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1653 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/**
 * Sum of absolute differences (SAD) over an 8-pixel-wide block.
 *
 * The extract of this function was garbled (embedded line-number prefixes,
 * missing accumulator, row loop and return); this restores the canonical
 * unrolled form, the 8-wide counterpart of pix_abs16_c.
 *
 * @param v         unused context pointer (me_cmp_func signature)
 * @param pix1      first block (reference)
 * @param pix2      second block (candidate)
 * @param line_size stride in bytes of both blocks
 * @param h         number of rows to compare
 * @return          the SAD of the two blocks
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* 8-wide SAD against horizontal half-pel interpolation (avg2, defined
   elsewhere in this file). NOTE(review): accumulator init, row loop and
   return are not visible in this extract. */
1681 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1687 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1688 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1689 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1690 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1691 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1692 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1693 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1694 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD against vertical half-pel interpolation (pix3 = row below).
   NOTE(review): accumulator init, row loop and return are not visible in
   this extract. */
1701 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1704 uint8_t *pix3 = pix2 + line_size;
1708 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1709 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1710 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1711 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1712 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1713 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1714 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1715 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD against diagonal half-pel interpolation (avg4 of the 2x2
   neighbourhood). NOTE(review): accumulator init, row loop and return are
   not visible in this extract. */
1723 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1726 uint8_t *pix3 = pix2 + line_size;
1730 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1731 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1732 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1733 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1734 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1735 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1736 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1737 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-shaped SSE, 16-wide: score1 is plain SSE, score2 is the difference
   in local gradient energy between the two blocks; the weight comes from
   avctx->nsse_weight (8 when no context is supplied).
   NOTE(review): score1/score2 init, the row loops and closing braces are
   not visible in this extract. */
1745 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1746 MpegEncContext *c = v;
1752 for(x=0; x<16; x++){
1753 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1756 for(x=0; x<15; x++){
1757 score2+= FFABS( s1[x ] - s1[x +stride]
1758 - s1[x+1] + s1[x+1+stride])
1759 -FFABS( s2[x ] - s2[x +stride]
1760 - s2[x+1] + s2[x+1+stride]);
1767 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1768 else return score1 + FFABS(score2)*8;
/* Noise-shaped SSE, 8-wide counterpart of nsse16_c.
   NOTE(review): score init, the x-loop bounds and row loops are not
   visible in this extract. */
1771 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1772 MpegEncContext *c = v;
1779 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1783 score2+= FFABS( s1[x ] - s1[x +stride]
1784 - s1[x+1] + s1[x+1+stride])
1785 -FFABS( s2[x ] - s2[x +stride]
1786 - s2[x+1] + s2[x+1+stride]);
1793 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1794 else return score1 + FFABS(score2)*8;
/* try_8x8basis: weighted squared error of rem after adding a scaled basis
   vector (rounded from BASIS_SHIFT to RECON_SHIFT precision); used by the
   trellis/basis search in the encoder. add_8x8basis applies that same
   scaled basis to rem in place.
   NOTE(review): the w declaration/assignment and the sum init/return of
   try_8x8basis_c are not visible in this extract. */
1797 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1801 for(i=0; i<8*8; i++){
1802 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1805 assert(-512<b && b<512);
1807 sum += (w*b)*(w*b)>>4;
1812 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1815 for(i=0; i<8*8; i++){
1816 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
/* (doc fragment continues an existing Doxygen comment for ff_block_permute) */
1821 * Permute an 8x8 block.
1822 * @param block the block which will be permuted according to the given permutation vector
1823 * @param permutation the permutation vector
1824 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1825 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1826 * (inverse) permutated to scantable order!
1828 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1834 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* First pass: gather the nonzero coefficients into temp[] (the copy-out
   lines are not visible in this extract), then scatter them back through
   the permutation vector. */
1836 for(i=0; i<=last; i++){
1837 const int j= scantable[i];
1842 for(i=0; i<=last; i++){
1843 const int j= scantable[i];
1844 const int perm_j= permutation[j];
1845 block[perm_j]= temp[j];
/* zero_cmp: trivial me_cmp_func that always returns 0; used to disable a
   comparison stage (body not visible in this extract). */
1849 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Select the me_cmp_func table entries matching the requested comparison
   type (SAD/SSE/hadamard/dct/...), copying function pointers from the
   DSPContext. NOTE(review): the switch statement over type and most case
   labels are not visible in this extract; only a few case bodies remain. */
1853 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1856 memset(cmp, 0, sizeof(void*)*6);
1864 cmp[i]= c->hadamard8_diff[i];
1870 cmp[i]= c->dct_sad[i];
1873 cmp[i]= c->dct264_sad[i];
1876 cmp[i]= c->dct_max[i];
1879 cmp[i]= c->quant_psnr[i];
1908 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* SIMD-within-a-register byte-wise add: processes sizeof(long) bytes per
   iteration using the pb_7f/pb_80 masks (carry-less add of the low 7 bits,
   XOR recovers the top bit), then a scalar tail loop.
   NOTE(review): the tail-loop header is not visible in this extract. */
1913 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1915 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1916 long a = *(long*)(src+i);
1917 long b = *(long*)(dst+i);
1918 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1921 dst[i+0] += src[i+0];
/* Byte-wise subtraction dst = src1 - src2 (mod 256). On targets without
   fast unaligned access an unrolled byte loop handles misaligned src2;
   otherwise a word-at-a-time borrow-less subtract using pb_7f/pb_80 masks,
   followed by a scalar tail. NOTE(review): the #endif/else structure and
   tail-loop header are not visible in this extract. */
1924 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
1926 #if !HAVE_FAST_UNALIGNED
1927 if((long)src2 & (sizeof(long)-1)){
1928 for(i=0; i+7<w; i+=8){
1929 dst[i+0] = src1[i+0]-src2[i+0];
1930 dst[i+1] = src1[i+1]-src2[i+1];
1931 dst[i+2] = src1[i+2]-src2[i+2];
1932 dst[i+3] = src1[i+3]-src2[i+3];
1933 dst[i+4] = src1[i+4]-src2[i+4];
1934 dst[i+5] = src1[i+5]-src2[i+5];
1935 dst[i+6] = src1[i+6]-src2[i+6];
1936 dst[i+7] = src1[i+7]-src2[i+7];
1940 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1941 long a = *(long*)(src1+i);
1942 long b = *(long*)(src2+i);
1943 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1946 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction: reconstruct (add) or compute (sub) residuals
   against mid_pred(left, top, left+top-topleft). left/left_top are
   in/out state carried across calls. The left-prediction variants below
   carry an accumulator instead. NOTE(review): l/lt initialization, loop
   headers and the state write-back are not visible in this extract. */
1949 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1957 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1966 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1974 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
1984 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1987 for(i=0; i<w-1; i++){
2014 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers: BUTTERFLY2 writes sum/difference of two
   inputs to two outputs, BUTTERFLY1 does it in place, BUTTERFLYA yields
   |x+y| + |x-y|. NOTE(review): the macro bodies of BUTTERFLY2/BUTTERFLY1
   are not visible in this extract. */
2044 #define BUTTERFLY2(o1,o2,i1,i2) \
2048 #define BUTTERFLY1(x,y) \
2057 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* 8x8 Hadamard transform of the pixel difference (src - dst), summed as
   SATD: horizontal butterflies per row, then vertical butterflies per
   column, with the last stage folded into BUTTERFLYA absolute sums.
   NOTE(review): temp[] declaration, loop headers, sum accumulation into
   a variable and the return are not visible in this extract. */
2059 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2067 //FIXME try pointer walks
2068 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2069 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2070 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2071 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2073 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2074 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2075 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2076 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2078 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2079 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2080 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2081 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2085 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2086 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2087 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2088 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2090 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2091 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2092 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2093 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2096 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2097 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2098 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2099 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra variant of hadamard8_diff8x8_c: transforms src directly (no
   reference subtraction) and removes the DC term from the score at the end
   ("-mean"). NOTE(review): temp[] declaration, loop headers, sum
   accumulation and the return are not visible in this extract. */
2104 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2112 //FIXME try pointer walks
2113 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2114 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2115 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2116 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2118 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2119 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2120 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2121 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2123 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2124 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2125 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2126 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2130 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2131 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2132 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2133 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2135 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2136 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2137 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2138 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2141 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2142 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2143 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2144 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2147 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-SAD: forward-DCT the pixel difference and return the sum of absolute
   transform coefficients. NOTE(review): the s->dsp.fdct(temp) call between
   diff_pixels and sum_abs_dctelem is not visible in this extract. */
2152 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2153 MpegEncContext * const s= (MpegEncContext *)c;
2154 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2158 s->dsp.diff_pixels(temp, src1, src2, stride);
2160 return s->dsp.sum_abs_dctelem(temp);
/* 1-D 8-point H.264-style integer transform butterfly (body of the DCT8_1D
   macro; its #define line and the DST(0,...)/DST(4,...) even outputs are
   not visible in this extract). sNM/dNM are sums/differences of mirrored
   input pairs; a0..a7 the intermediate butterfly values. */\
2165 const int s07 = SRC(0) + SRC(7);\
2166 const int s16 = SRC(1) + SRC(6);\
2167 const int s25 = SRC(2) + SRC(5);\
2168 const int s34 = SRC(3) + SRC(4);\
2169 const int a0 = s07 + s34;\
2170 const int a1 = s16 + s25;\
2171 const int a2 = s07 - s34;\
2172 const int a3 = s16 - s25;\
2173 const int d07 = SRC(0) - SRC(7);\
2174 const int d16 = SRC(1) - SRC(6);\
2175 const int d25 = SRC(2) - SRC(5);\
2176 const int d34 = SRC(3) - SRC(4);\
2177 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2178 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2179 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2180 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2182 DST(1, a4 + (a7>>2)) ;\
2183 DST(2, a2 + (a3>>1)) ;\
2184 DST(3, a5 + (a6>>2)) ;\
2186 DST(5, a6 - (a5>>2)) ;\
2187 DST(6, (a2>>1) - a3 ) ;\
2188 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: apply DCT8_1D along rows (SRC/DST redefined per
   pass), then along columns while summing |coefficient| into the score.
   NOTE(review): the dct[][] declaration, #undef lines and return are not
   visible in this extract. */
2191 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2192 MpegEncContext * const s= (MpegEncContext *)c;
2197 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2199 #define SRC(x) dct[i][x]
2200 #define DST(x,v) dct[i][x]= v
2201 for( i = 0; i < 8; i++ )
2206 #define SRC(x) dct[x][i]
2207 #define DST(x,v) sum += FFABS(v)
2208 for( i = 0; i < 8; i++ )
/* DCT-max metric: forward-DCT the difference block and return the largest
   absolute coefficient. NOTE(review): the fdct call, the loop over i and
   the return are not visible in this extract. */
2216 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2217 MpegEncContext * const s= (MpegEncContext *)c;
2218 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2223 s->dsp.diff_pixels(temp, src1, src2, stride);
2227 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-PSNR metric: DCT+quantize+dequantize+IDCT the difference
   block and return the squared error against the unquantized coefficients
   (bak holds the pre-quantization copy).
   NOTE(review): the fdct call, loop header and return are not visible in
   this extract. */
2232 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2233 MpegEncContext * const s= (MpegEncContext *)c;
2234 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2235 DCTELEM * const bak = temp+64;
2241 s->dsp.diff_pixels(temp, src1, src2, stride);
2243 memcpy(bak, temp, 64*sizeof(DCTELEM));
2245 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2246 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2247 ff_simple_idct_8(temp); //FIXME
2250 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric: quantize the difference block, estimate the bit
   cost via the AC VLC length tables (escape cost for levels outside
   [-64,63]), reconstruct, and return distortion + lambda*bits.
   NOTE(review): dct/quant calls, intra/inter branch structure, the
   run/level loop bodies and several level-clamping lines are not visible
   in this extract. */
2255 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2256 MpegEncContext * const s= (MpegEncContext *)c;
2257 const uint8_t *scantable= s->intra_scantable.permutated;
2258 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2259 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2260 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2261 int i, last, run, bits, level, distortion, start_i;
2262 const int esc_length= s->ac_esc_length;
2264 uint8_t * last_length;
2268 copy_block8(lsrc1, src1, 8, stride, 8);
2269 copy_block8(lsrc2, src2, 8, stride, 8);
2271 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2273 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2279 length = s->intra_ac_vlc_length;
2280 last_length= s->intra_ac_vlc_last_length;
2281 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2284 length = s->inter_ac_vlc_length;
2285 last_length= s->inter_ac_vlc_last_length;
2290 for(i=start_i; i<last; i++){
2291 int j= scantable[i];
2296 if((level&(~127)) == 0){
2297 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2306 level= temp[i] + 64;
2310 if((level&(~127)) == 0){
2311 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2319 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2321 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2324 s->dsp.idct_add(lsrc2, 8, temp);
2326 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2328 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Compare metric: pure bit-cost estimate for one 8x8 block.
 * Same VLC accounting as rd8x8_c but without reconstruction or
 * distortion — returns (elided here) only the estimated bit count.
 * NOTE(review): bits init, run/level extraction, escape branches and
 * the return statement are elided in this excerpt. */
2331 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2332 MpegEncContext * const s= (MpegEncContext *)c;
2333 const uint8_t *scantable= s->intra_scantable.permutated;
2334 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2335 int i, last, run, bits, level, start_i;
2336 const int esc_length= s->ac_esc_length;
2338 uint8_t * last_length;
2342 s->dsp.diff_pixels(temp, src1, src2, stride);
2344 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: AC tables plus DC cost (level biased by +256) */
2350 length = s->intra_ac_vlc_length;
2351 last_length= s->intra_ac_vlc_last_length;
2352 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter path */
2355 length = s->inter_ac_vlc_length;
2356 last_length= s->inter_ac_vlc_last_length;
/* sum VLC lengths of all coded (run,level) pairs in scan order */
2361 for(i=start_i; i<last; i++){
2362 int j= scantable[i];
/* level in table range -> regular entry, else escape cost (elided) */
2367 if((level&(~127)) == 0){
2368 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* +64 bias into the table's non-negative level range */
2377 level= temp[i] + 64;
2381 if((level&(~127)) == 0){
/* last coded coefficient uses the "last" length table */
2382 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Intra vertical-SAD metric generator (size = 8 or 16): sums the
 * absolute differences between vertically adjacent pixels of a single
 * block — a measure of vertical activity, no reference block needed.
 * Inner loop is unrolled 4 pixels at a time. */
2390 #define VSAD_INTRA(size) \
2391 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2395     for(y=1; y<h; y++){ \
2396         for(x=0; x<size; x+=4){ \
2397             score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2398                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* Inter vertical-SAD: like vsad_intra16 but applied to the difference
 * s1 - s2, i.e. sums |d[x] - d[x+stride]| of the residual rows.
 * NOTE(review): score init, outer loop and return are elided here. */
2408 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2413 for(x=0; x<16; x++){
2414 score+= FFABS(s1[x  ] - s2[x  ] - s1[x  +stride] + s2[x  +stride]);
/* SQ(a): square of a — evaluates its argument twice, so only pass
 * side-effect-free expressions. */
2423 #define SQ(a) ((a)*(a))
/* Intra vertical-SSE metric generator: same traversal as VSAD_INTRA
 * but accumulates squared vertical differences instead of absolute. */
2424 #define VSSE_INTRA(size) \
2425 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2429     for(y=1; y<h; y++){ \
2430         for(x=0; x<size; x+=4){ \
2431             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2432                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* Inter vertical-SSE: squared vertical gradient of the residual
 * s1 - s2.  NOTE(review): score init, outer loop and return elided. */
2442 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2447 for(x=0; x<16; x++){
2448 score+= SQ(s1[x  ] - s2[x  ] - s1[x  +stride] + s2[x  +stride]);
/* Sum of squared differences between an int8 and an int16 vector of
 * `size` elements (signature continuation and return are elided). */
2457 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2461 for(i=0; i<size; i++)
2462 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Instantiate 16x16 variants of the 8x8 compare metrics.
 * WRAPPER8_16_SQ is defined elsewhere in this file; presumably it
 * applies the 8x8 function to the four quadrants of a 16x16 block and
 * sums the results — confirm against the macro definition. */
2466 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2467 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2468 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2470 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2472 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2473 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2474 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2475 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* dst[i] = src0[i] * src1[i] — element-wise float multiply. */
2477 static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
2479 for(i=0; i<len; i++)
2480 dst[i] = src0[i] * src1[i];
/* dst[i] = src0[i] * src1[-i] — src1 is read backwards, so it must
 * point at the END of its data (one past, index 0..-(len-1)); verify
 * against callers. */
2483 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
2486 for(i=0; i<len; i++)
2487 dst[i] = src0[i] * src1[-i];
/* dst[i] = src0[i] * src1[i] + src2[i] — fused multiply-add shape. */
2490 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
2492 for(i=0; i<len; i++)
2493 dst[i] = src0[i] * src1[i] + src2[i];
/* Windowed overlap-add butterfly (MDCT-style): processes mirrored
 * index pairs (i from -len, j from len-1 inward), so dst must point to
 * the MIDDLE of a 2*len output region (dst[-len..len-1] is written).
 * s0/s1/wi/wj are loaded from src0/src1/win in the elided lines —
 * confirm the exact operand offsets against the full source. */
2496 static void vector_fmul_window_c(float *dst, const float *src0,
2497 const float *src1, const float *win, int len)
2503 for(i=-len, j=len-1; i<0; i++, j--) {
2508 dst[i] = s0*wj - s1*wi;
2509 dst[j] = s0*wi + s1*wj;
/* dst[i] = src[i] * mul — scale a float vector by a scalar. */
2513 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
2517 for (i = 0; i < len; i++)
2518 dst[i] = src[i] * mul;
/* dst[i] += src[i] * mul — scalar multiply-accumulate into dst. */
2521 static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
2525 for (i = 0; i < len; i++)
2526 dst[i] += src[i] * mul;
/* In-place sum/difference butterfly on two non-aliasing vectors.
 * Only the difference computation is visible here; presumably the
 * elided lines store v1[i]+v2[i] into v1 and t into v2 — confirm
 * against the full source. */
2529 static void butterflies_float_c(float *restrict v1, float *restrict v2,
2533 for (i = 0; i < len; i++) {
2534 float t = v1[i] - v2[i];
/* Butterfly with interleaved output: dst[2i] = f1+f2, dst[2i+1] =
 * f1-f2, where f1/f2 are loaded from src0/src1 in the elided lines.
 * dst must hold 2*len floats. */
2540 static void butterflies_float_interleave_c(float *dst, const float *src0,
2541 const float *src1, int len)
2544 for (i = 0; i < len; i++) {
2547 dst[2*i  ] = f1 + f2;
2548 dst[2*i + 1] = f1 - f2;
/* Dot product of two float vectors of `len` elements (accumulation
 * statement and return are elided in this excerpt). */
2552 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2557 for (i = 0; i < len; i++)
/* Clamp one float, operating on its raw IEEE-754 bit pattern as
 * uint32_t.  Only valid for the opposite-sign case (min < 0 < max,
 * see vector_clipf_c): for negative floats the unsigned bit pattern
 * grows with magnitude, so `a > mini` catches values below min, and
 * flipping the sign bit (a^(1U<<31)) lets positive values be compared
 * against maxisign.  The pass-through return is elided here. */
2563 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2564 uint32_t maxi, uint32_t maxisign)
2567 if(a > mini) return mini;
2568 else if((a^(1U<<31)) > maxisign) return maxi;
/* Bit-pattern fast path for vector_clipf when min < 0 < max.
 * Reinterprets the float buffers as uint32_t and clamps 8 elements per
 * iteration via clipf_c_one; len is assumed to be a multiple of 8
 * (loop steps by 8 with full unrolling).
 * NOTE(review): the pointer casts type-pun float as uint32_t, which
 * formally violates strict aliasing — kept as-is, matches the
 * surrounding code style. */
2572 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2574 uint32_t mini = *(uint32_t*)min;
2575 uint32_t maxi = *(uint32_t*)max;
/* sign-flipped max, precomputed once for the positive-side compare */
2576 uint32_t maxisign = maxi ^ (1U<<31);
2577 uint32_t *dsti = (uint32_t*)dst;
2578 const uint32_t *srci = (const uint32_t*)src;
2579 for(i=0; i<len; i+=8) {
2580 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2581 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2582 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2583 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2584 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2585 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2586 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2587 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clamp each float in src to [min, max] and write to dst.
 * Dispatches to the bit-pattern fast path when the bounds straddle
 * zero, otherwise (elided else-branch) clamps with av_clipf, 8
 * elements per iteration — len presumably a multiple of 8. */
2590 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2592 if(min < 0 && max > 0) {
2593 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2595 for(i=0; i < len; i+=8) {
2596 dst[i    ] = av_clipf(src[i    ], min, max);
2597 dst[i + 1] = av_clipf(src[i + 1], min, max);
2598 dst[i + 2] = av_clipf(src[i + 2], min, max);
2599 dst[i + 3] = av_clipf(src[i + 3], min, max);
2600 dst[i + 4] = av_clipf(src[i + 4], min, max);
2601 dst[i + 5] = av_clipf(src[i + 5], min, max);
2602 dst[i + 6] = av_clipf(src[i + 6], min, max);
2603 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Dot product of two int16 vectors, accumulated in int32.
 * Loop header and return are elided in this excerpt. */
2608 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2613 res += *v1++ * *v2++;
/* Combined op: accumulates v1.v2 (elided) while updating v1 in place
 * with v1[i] += mul * v3[i].  Loop header and return are elided. */
2618 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2623 *v1++ += mul * *v3++;
/* Apply a symmetric Q15 window to an int16 signal: element i and its
 * mirror len-1-i share the same window coefficient.  MUL16 widens the
 * 16x16 product; + (1<<14) >> 15 rounds back to Q0. */
2628 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2629 const int16_t *window, unsigned int len)
2632 int len2 = len >> 1;
2634 for (i = 0; i < len2; i++) {
2635 int16_t w  = window[i];
2636 output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
2637 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
2641 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2642 int32_t max, unsigned int len)
2645 *dst++ = av_clip(*src++, min, max);
2646 *dst++ = av_clip(*src++, min, max);
2647 *dst++ = av_clip(*src++, min, max);
2648 *dst++ = av_clip(*src++, min, max);
2649 *dst++ = av_clip(*src++, min, max);
2650 *dst++ = av_clip(*src++, min, max);
2651 *dst++ = av_clip(*src++, min, max);
2652 *dst++ = av_clip(*src++, min, max);
/* Fixed-point cosine constants for the WMV2 IDCT below:
 * Wi = round(2048 * sqrt(2) * cos(i*pi/16)), i.e. 11-bit-scaled DCT
 * basis weights.  W0 is used by the IDCT but defined outside this
 * excerpt. */
2658 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2659 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2660 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2661 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2662 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2663 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2664 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
/* One row of the WMV2 8-point IDCT.  Standard even/odd butterfly
 * decomposition: a1/a7 and a5/a3 combine the odd coefficients,
 * a2/a6 and a0/a4 the even ones; s1/s2 mix the odd terms with
 * 181/256 ~= 1/sqrt(2).  Results are rounded ((1<<7)) and scaled
 * back by >>8. */
2666 static void wmv2_idct_row(short * b)
2669 int a0,a1,a2,a3,a4,a5,a6,a7;
/* odd-coefficient rotations */
2671 a1 = W1*b[1]+W7*b[7];
2672 a7 = W7*b[1]-W1*b[7];
2673 a5 = W5*b[5]+W3*b[3];
2674 a3 = W3*b[5]-W5*b[3];
/* even-coefficient rotations and DC sum/difference */
2675 a2 = W2*b[2]+W6*b[6];
2676 a6 = W6*b[2]-W2*b[6];
2677 a0 = W0*b[0]+W0*b[4];
2678 a4 = W0*b[0]-W0*b[4];
/* 181/256 ~= sqrt(2)/2 cross terms of the odd part */
2680 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2681 s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* final butterfly: symmetric output pairs (0,7), (1,6), (2,5), (3,4) */
2683 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2684 b[1] = (a4+a6 +s1   + (1<<7))>>8;
2685 b[2] = (a4-a6 +s2   + (1<<7))>>8;
2686 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2687 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2688 b[5] = (a4-a6 -s2   + (1<<7))>>8;
2689 b[6] = (a4+a6 -s1   + (1<<7))>>8;
2690 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One column of the WMV2 8-point IDCT (stride 8 between elements).
 * Same butterfly structure as wmv2_idct_row but each stage keeps
 * extra precision (>>3 with rounding) and the final outputs are
 * normalized by >>14. */
2692 static void wmv2_idct_col(short * b)
2695 int a0,a1,a2,a3,a4,a5,a6,a7;
2696 /*step 1, with extended precision*/
2697 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2698 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2699 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2700 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2701 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2702 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
/* DC sum/difference: no rounding term here, matches the original */
2703 a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2704 a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
/* 181/256 ~= sqrt(2)/2 cross terms of the odd part */
2706 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2707 s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* final butterfly with 14-bit renormalization */
2709 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2710 b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2711 b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2712 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2714 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2715 b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2716 b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2717 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 2-D WMV2 IDCT on an 8x8 block: all rows first, then all
 * columns (loop headers elided in this excerpt). */
2719 void ff_wmv2_idct_c(short * block){
2723 wmv2_idct_row(block+i);
2726 wmv2_idct_col(block+i);
2729 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* put: IDCT the coefficients, then clamp-store into the destination */
2731 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2733 ff_wmv2_idct_c(block);
2734 ff_put_pixels_clamped_c(block, dest, line_size);
/* add: IDCT the coefficients, then clamp-add onto the prediction */
2736 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2738 ff_wmv2_idct_c(block);
2739 ff_add_pixels_clamped_c(block, dest, line_size);
/* JPEG-reference (j_rev_dct) 8x8 IDCT wrappers: transform then
 * clamp-store (put) or clamp-accumulate (add). */
2741 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2743 ff_j_rev_dct (block);
2744 ff_put_pixels_clamped_c(block, dest, line_size);
2746 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2748 ff_j_rev_dct (block);
2749 ff_add_pixels_clamped_c(block, dest, line_size);
/* 4x4 reduced-resolution IDCT wrappers (used for lowres==1). */
2752 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2754 ff_j_rev_dct4 (block);
2755 put_pixels_clamped4_c(block, dest, line_size);
2757 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2759 ff_j_rev_dct4 (block);
2760 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 reduced-resolution IDCT wrappers (used for lowres==2). */
2763 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2765 ff_j_rev_dct2 (block);
2766 put_pixels_clamped2_c(block, dest, line_size);
2768 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2770 ff_j_rev_dct2 (block);
2771 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 "IDCT" (DC only, lowres==3): scale DC by 1/8 with rounding,
 * clamp to [0,255], store or accumulate a single pixel. */
2774 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2776 dest[0] = av_clip_uint8((block[0] + 4)>>3);
2778 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2780 dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2783 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2785 /* init static data */
/* Fill the global lookup tables once at startup:
 *  - ff_cropTbl: identity over [0,255] with saturating margins on both
 *    sides (the negative-side fill is elided in this excerpt, the
 *    positive side saturates to 255);
 *  - ff_squareTbl: (i-256)^2, i.e. square of a signed difference
 *    biased by +256;
 *  - ff_inv_zigzag_direct16: inverse of the zigzag scan, 1-based. */
2786 av_cold void ff_dsputil_static_init(void)
2790 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2791 for(i=0;i<MAX_NEG_CROP;i++) {
2793 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2796 for(i=0;i<512;i++) {
2797 ff_squareTbl[i] = (i - 256) * (i - 256);
2800 for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Runtime sanity check that the compiler honors 16-byte stack
 * alignment for LOCAL_ALIGNED_16 variables.  On MMX/AltiVec builds a
 * misaligned stack means SIMD code will crash or crawl, so a loud
 * diagnostic is emitted (did_fail presumably rate-limits it — the
 * logic around it and the return are elided in this excerpt). */
2803 int ff_check_alignment(void){
2804 static int did_fail=0;
2805 LOCAL_ALIGNED_16(int, aligned, [4]);
2807 if((intptr_t)aligned & 15){
2809 #if HAVE_MMX || HAVE_ALTIVEC
2810 av_log(NULL, AV_LOG_ERROR,
2811 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2812 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2813 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2814 "Do not report crashes to FFmpeg developers.\n");
/* Populate a DSPContext with the C reference implementations and then
 * let per-architecture init functions override entries with optimized
 * versions.  Selection of fdct/idct depends on avctx (dct_algo,
 * idct_algo, lowres, bits_per_raw_sample).
 * NOTE(review): many lines of this large function are elided in this
 * excerpt (else-branches, braces, several assignments); comments below
 * describe only the visible statements. */
2823 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2827 ff_check_alignment();
/* ---- encoder forward DCT selection (inside CONFIG_ENCODERS) ---- */
2830 if (avctx->bits_per_raw_sample == 10) {
2831 c->fdct    = ff_jpeg_fdct_islow_10;
2832 c->fdct248 = ff_fdct248_islow_10;
2834 if(avctx->dct_algo==FF_DCT_FASTINT) {
2835 c->fdct    = ff_fdct_ifast;
2836 c->fdct248 = ff_fdct_ifast248;
2838 else if(avctx->dct_algo==FF_DCT_FAAN) {
2839 c->fdct    = ff_faandct;
2840 c->fdct248 = ff_faandct248;
2843 c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2844 c->fdct248 = ff_fdct248_islow_8;
2847 #endif //CONFIG_ENCODERS
/* ---- IDCT selection: lowres uses reduced-size jref IDCTs ---- */
2849 if(avctx->lowres==1){
2850 c->idct_put= ff_jref_idct4_put;
2851 c->idct_add= ff_jref_idct4_add;
2852 c->idct    = ff_j_rev_dct4;
2853 c->idct_permutation_type= FF_NO_IDCT_PERM;
2854 }else if(avctx->lowres==2){
2855 c->idct_put= ff_jref_idct2_put;
2856 c->idct_add= ff_jref_idct2_add;
2857 c->idct    = ff_j_rev_dct2;
2858 c->idct_permutation_type= FF_NO_IDCT_PERM;
2859 }else if(avctx->lowres==3){
2860 c->idct_put= ff_jref_idct1_put;
2861 c->idct_add= ff_jref_idct1_add;
2862 c->idct    = ff_j_rev_dct1;
2863 c->idct_permutation_type= FF_NO_IDCT_PERM;
/* full-resolution path: pick by bit depth, then by idct_algo */
2865 if (avctx->bits_per_raw_sample == 10) {
2866 c->idct_put              = ff_simple_idct_put_10;
2867 c->idct_add              = ff_simple_idct_add_10;
2868 c->idct                  = ff_simple_idct_10;
2869 c->idct_permutation_type = FF_NO_IDCT_PERM;
2871 if(avctx->idct_algo==FF_IDCT_INT){
2872 c->idct_put= ff_jref_idct_put;
2873 c->idct_add= ff_jref_idct_add;
2874 c->idct    = ff_j_rev_dct;
2875 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2876 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2877 avctx->idct_algo==FF_IDCT_VP3){
2878 c->idct_put= ff_vp3_idct_put_c;
2879 c->idct_add= ff_vp3_idct_add_c;
2880 c->idct    = ff_vp3_idct_c;
2881 c->idct_permutation_type= FF_NO_IDCT_PERM;
2882 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2883 c->idct_put= ff_wmv2_idct_put_c;
2884 c->idct_add= ff_wmv2_idct_add_c;
2885 c->idct    = ff_wmv2_idct_c;
2886 c->idct_permutation_type= FF_NO_IDCT_PERM;
2887 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2888 c->idct_put= ff_faanidct_put;
2889 c->idct_add= ff_faanidct_add;
2890 c->idct    = ff_faanidct;
2891 c->idct_permutation_type= FF_NO_IDCT_PERM;
2892 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2893 c->idct_put= ff_ea_idct_put_c;
2894 c->idct_permutation_type= FF_NO_IDCT_PERM;
2895 }else{ //accurate/default
2896 c->idct_put = ff_simple_idct_put_8;
2897 c->idct_add = ff_simple_idct_add_8;
2898 c->idct     = ff_simple_idct_8;
2899 c->idct_permutation_type= FF_NO_IDCT_PERM;
/* ---- basic pixel/block helpers ---- */
2904 c->diff_pixels = diff_pixels_c;
2905 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2906 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2907 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2908 c->sum_abs_dctelem = sum_abs_dctelem_c;
2911 c->pix_sum = pix_sum_c;
2912 c->pix_norm1 = pix_norm1_c;
2914 c->fill_block_tab[0] = fill_block16_c;
2915 c->fill_block_tab[1] = fill_block8_c;
/* ---- SAD tables: [0] = 16x16, [1] = 8x8; sub-indices are
 * half-pel variants (x2/y2/xy2) ---- */
2917 /* TODO [0] 16  [1] 8 */
2918 c->pix_abs[0][0] = pix_abs16_c;
2919 c->pix_abs[0][1] = pix_abs16_x2_c;
2920 c->pix_abs[0][2] = pix_abs16_y2_c;
2921 c->pix_abs[0][3] = pix_abs16_xy2_c;
2922 c->pix_abs[1][0] = pix_abs8_c;
2923 c->pix_abs[1][1] = pix_abs8_x2_c;
2924 c->pix_abs[1][2] = pix_abs8_y2_c;
2925 c->pix_abs[1][3] = pix_abs8_xy2_c;
/* ---- third-pel (tpel) MC tables; indices encode the subpel
 * position as x + 4*y ---- */
2927 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2928 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2929 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2930 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2931 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2932 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2933 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2934 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2935 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2937 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2938 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2939 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2940 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2941 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2942 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2943 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2944 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2945 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* helper macro: wire all 16 quarter-pel MC positions of one table row */
2947 #define dspfunc(PFX, IDX, NUM) \
2948     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2949     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2950     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2951     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2952     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2953     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2954     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2955     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2956     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2957     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2958     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2959     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2960     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2961     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2962     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2963     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2965 dspfunc(put_qpel, 0, 16);
2966 dspfunc(put_no_rnd_qpel, 0, 16);
2968 dspfunc(avg_qpel, 0, 16);
2969 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2971 dspfunc(put_qpel, 1, 8);
2972 dspfunc(put_no_rnd_qpel, 1, 8);
2974 dspfunc(avg_qpel, 1, 8);
2975 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* ---- codec-specific sub-inits ---- */
2979 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2980 ff_mlp_init(c, avctx);
2982 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2983 ff_intrax8dsp_init(c,avctx);
/* WMV2 mspel MC table */
2986 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2987 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2988 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2989 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2990 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2991 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2992 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2993 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* wire both sizes (16x16 at [0], 8x8 at [1]) of a compare metric */
2995 #define SET_CMP_FUNC(name) \
2996     c->name[0]= name ## 16_c;\
2997     c->name[1]= name ## 8x8_c;
2999 SET_CMP_FUNC(hadamard8_diff)
3000 c->hadamard8_diff[4]= hadamard8_intra16_c;
3001 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3002 SET_CMP_FUNC(dct_sad)
3003 SET_CMP_FUNC(dct_max)
3005 SET_CMP_FUNC(dct264_sad)
3007 c->sad[0]= pix_abs16_c;
3008 c->sad[1]= pix_abs8_c;
3012 SET_CMP_FUNC(quant_psnr)
3015 c->vsad[0]= vsad16_c;
3016 c->vsad[4]= vsad_intra16_c;
3017 c->vsad[5]= vsad_intra8_c;
3018 c->vsse[0]= vsse16_c;
3019 c->vsse[4]= vsse_intra16_c;
3020 c->vsse[5]= vsse_intra8_c;
3021 c->nsse[0]= nsse16_c;
3022 c->nsse[1]= nsse8_c;
3024 ff_dsputil_init_dwt(c);
3027 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* ---- byte-wise / huffyuv prediction helpers ---- */
3029 c->add_bytes= add_bytes_c;
3030 c->diff_bytes= diff_bytes_c;
3031 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3032 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3033 c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
3034 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3035 c->bswap_buf= bswap_buf;
3036 c->bswap16_buf = bswap16_buf;
/* ---- loop filters ---- */
3038 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3039 c->h263_h_loop_filter= h263_h_loop_filter_c;
3040 c->h263_v_loop_filter= h263_v_loop_filter_c;
3043 if (CONFIG_VP3_DECODER) {
3044 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3045 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3046 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3049 c->h261_loop_filter= h261_loop_filter_c;
3051 c->try_8x8basis= try_8x8basis_c;
3052 c->add_8x8basis= add_8x8basis_c;
/* ---- audio float/int vector ops ---- */
3054 #if CONFIG_VORBIS_DECODER
3055 c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling;
3057 #if CONFIG_AC3_DECODER
3058 c->ac3_downmix = ff_ac3_downmix_c;
3060 c->vector_fmul = vector_fmul_c;
3061 c->vector_fmul_reverse = vector_fmul_reverse_c;
3062 c->vector_fmul_add = vector_fmul_add_c;
3063 c->vector_fmul_window = vector_fmul_window_c;
3064 c->vector_clipf = vector_clipf_c;
3065 c->scalarproduct_int16 = scalarproduct_int16_c;
3066 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3067 c->apply_window_int16 = apply_window_int16_c;
3068 c->vector_clip_int32 = vector_clip_int32_c;
3069 c->scalarproduct_float = scalarproduct_float_c;
3070 c->butterflies_float = butterflies_float_c;
3071 c->butterflies_float_interleave = butterflies_float_interleave_c;
3072 c->vector_fmul_scalar = vector_fmul_scalar_c;
3073 c->vector_fmac_scalar = vector_fmac_scalar_c;
/* shrink[n]: downscale a plane by 2^n per dimension (0 = plain copy) */
3075 c->shrink[0]= av_image_copy_plane;
3076 c->shrink[1]= ff_shrink22;
3077 c->shrink[2]= ff_shrink44;
3078 c->shrink[3]= ff_shrink88;
3080 c->prefetch= just_return;
/* cleared here so arch inits may fill entries; remaining NULL slots
 * fall back to h264 qpel at the end of this function */
3082 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3083 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
/* ---- bit-depth templated function wiring ---- */
3087 #define FUNC(f, depth) f ## _ ## depth
3088 #define FUNCC(f, depth) f ## _ ## depth ## _c
/* half-pel table: [0]=full, [1]=x half, [2]=y half, [3]=xy half */
3090 #define dspfunc1(PFX, IDX, NUM, depth)\
3091     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
3092     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3093     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3094     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
/* quarter-pel table: all 16 mcXY positions */
3096 #define dspfunc2(PFX, IDX, NUM, depth)\
3097     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3098     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3099     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3100     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3101     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3102     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3103     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3104     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3105     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3106     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3107     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3108     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3109     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3110     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3111     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3112     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
/* wire every depth-templated entry of the context for one bit depth */
3115 #define BIT_DEPTH_FUNCS(depth, dct)\
3116     c->get_pixels                    = FUNCC(get_pixels   ## dct   , depth);\
3117     c->draw_edges                    = FUNCC(draw_edges            , depth);\
3118     c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
3119     c->clear_block                   = FUNCC(clear_block  ## dct   , depth);\
3120     c->clear_blocks                  = FUNCC(clear_blocks ## dct   , depth);\
3121     c->add_pixels8                   = FUNCC(add_pixels8  ## dct   , depth);\
3122     c->add_pixels4                   = FUNCC(add_pixels4  ## dct   , depth);\
3123     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
3124     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3126     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
3127     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
3128     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
3129     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
3130     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
3131     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
3133     dspfunc1(put       , 0, 16, depth);\
3134     dspfunc1(put       , 1,  8, depth);\
3135     dspfunc1(put       , 2,  4, depth);\
3136     dspfunc1(put       , 3,  2, depth);\
3137     dspfunc1(put_no_rnd, 0, 16, depth);\
3138     dspfunc1(put_no_rnd, 1,  8, depth);\
3139     dspfunc1(avg       , 0, 16, depth);\
3140     dspfunc1(avg       , 1,  8, depth);\
3141     dspfunc1(avg       , 2,  4, depth);\
3142     dspfunc1(avg       , 3,  2, depth);\
3143     dspfunc1(avg_no_rnd, 0, 16, depth);\
3144     dspfunc1(avg_no_rnd, 1,  8, depth);\
3146     dspfunc2(put_h264_qpel, 0, 16, depth);\
3147     dspfunc2(put_h264_qpel, 1,  8, depth);\
3148     dspfunc2(put_h264_qpel, 2,  4, depth);\
3149     dspfunc2(put_h264_qpel, 3,  2, depth);\
3150     dspfunc2(avg_h264_qpel, 0, 16, depth);\
3151     dspfunc2(avg_h264_qpel, 1,  8, depth);\
3152     dspfunc2(avg_h264_qpel, 2,  4, depth);
/* select the template instantiation by sample bit depth; dct_bits
 * chooses between 16- and 32-bit DCTELEM variants */
3154 switch (avctx->bits_per_raw_sample) {
3156 if (c->dct_bits == 32) {
3157 BIT_DEPTH_FUNCS(9, _32);
3159 BIT_DEPTH_FUNCS(9, _16);
3163 if (c->dct_bits == 32) {
3164 BIT_DEPTH_FUNCS(10, _32);
3166 BIT_DEPTH_FUNCS(10, _16);
3170 BIT_DEPTH_FUNCS(8, _16);
/* ---- per-architecture optimized overrides ---- */
3175 if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
3176 if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
3177 if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
3178 if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
3179 if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
3180 if (HAVE_MMI)        ff_dsputil_init_mmi   (c, avctx);
3181 if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
3182 if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
/* any 2-tap qpel slot an arch init left empty falls back to h264 qpel */
3184 for(i=0; i<64; i++){
3185 if(!c->put_2tap_qpel_pixels_tab[0][i])
3186 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3187 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3188 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
/* build the coefficient permutation matching the chosen IDCT */
3191 ff_init_scantable_permutation(c->idct_permutation,
3192 c->idct_permutation_type);
/* Deprecated public entry point; forwards to ff_dsputil_init. */
3195 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3197 ff_dsputil_init(c, avctx);