3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Shared lookup tables. ff_cropTbl is indexed as ff_cropTbl + MAX_NEG_CROP
 * (a clip-to-[0,255] table with guard bands); ff_squareTbl is indexed as
 * ff_squareTbl + 256, i.e. sq[x] == x*x for x in [-256,255] (see the SSE
 * routines below). NOTE(review): both are zeroed here and presumably
 * filled during DSP init -- the filling code is not in this excerpt. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };
/* Instantiate the bit-depth-templated DSP functions: dsputil_template.c is
 * included once per supported bit depth, with pixeltmp selecting an
 * intermediate accumulator type wide enough for that depth.
 * NOTE(review): the BIT_DEPTH defines and the matching #undef pixeltmp
 * lines that normally sit between these includes are not visible in this
 * excerpt. */
#define pixeltmp int16_t
#include "dsputil_template.c"
#include "dsputil_template.c"
#define pixeltmp int32_t
#include "dsputil_template.c"
#include "dsputil_template.c"
#define pixeltmp int16_t
#include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 evaluates to 0x0101...01, so multiplying by a byte value
 * replicates that byte into every byte lane of an unsigned long. */
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
/* Classic JPEG/MPEG zigzag scan order: entry n is the raster-order index
 * of the n-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
/* Companion of ff_zigzag_direct for the 2-4-8 split (interlaced) IDCT. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): declared here, filled elsewhere (not visible in this
 * excerpt); 16-byte aligned for SIMD loads. */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (raster index per scan position). */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order (raster index per scan position). */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
/* Input permutation for the simple_idct_mmx */
/* Every entry is a 6-bit coefficient index (0x00-0x3F): the source
 * position feeding the corresponding destination slot. */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
136 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Build a ScanTable: store the raw scan order, apply the IDCT coefficient
 * permutation to it, and record per scan position the end of the
 * corresponding raster-ordered run.
 * NOTE(review): the loop headers and the computation of 'end' are not
 * visible in this excerpt; comments cover only the visible statements. */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    st->scantable= src_scantable;
        /* permute each scan position through the IDCT permutation */
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
            j = st->permutated[i];
        st->raster_end[i]= end;
/* Fill idct_permutation[0..63] with the coefficient reordering required
 * by the selected IDCT implementation.
 * NOTE(review): the per-case loop headers and break statements are not
 * visible in this excerpt. */
void ff_init_scantable_permutation(uint8_t *idct_permutation,
                                   int idct_permutation_type)
    switch(idct_permutation_type){
    case FF_NO_IDCT_PERM:
        idct_permutation[i]= i; /* identity mapping */
    case FF_LIBMPEG2_IDCT_PERM:
        idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2); /* keep row bits, rotate column bits */
    case FF_SIMPLE_IDCT_PERM:
        idct_permutation[i]= simple_mmx_permutation[i]; /* table-driven */
    case FF_TRANSPOSE_IDCT_PERM:
        idct_permutation[i]= ((i&7)<<3) | (i>>3); /* full row/column transpose */
    case FF_PARTTRANS_IDCT_PERM:
        idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3); /* partial transpose */
    case FF_SSE2_IDCT_PERM:
        idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7]; /* keep row, permute column */
    av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/* Sum of all 256 pixel values of a 16x16 block.
 * NOTE(review): the 8-pixel accumulation in the inner loop and the
 * pix += 8 advance are not visible in this excerpt. */
static int pix_sum_c(uint8_t * pix, int line_size)
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
    pix += line_size - 16; /* advance to next row; 16 bytes already consumed */
/* Sum of squared pixel values over a 16x16 block, using the x*x table
 * centered at ff_squareTbl + 256.
 * NOTE(review): the wide loads below access the byte buffer through
 * uint64_t/uint32_t pointers -- this technically violates strict
 * aliasing/alignment rules; a long-standing pattern here, but worth
 * confirming the build uses compatible compiler flags. */
static int pix_norm1_c(uint8_t * pix, int line_size)
    uint32_t *sq = ff_squareTbl + 256;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            /* 64-bit path: square all 8 bytes from one load */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
            /* 32-bit path: two 4-byte loads */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
    pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst; main loop is unrolled by
 * eight, the trailing visible statement handles leftover words. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
    /* remainder loop body (its header is not visible in this excerpt) */
    dst[i+0]= av_bswap32(src[i+0]);

/* Byte-swap len 16-bit values (loop header not visible in this excerpt). */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
        *dst++ = av_bswap16(*src++);
/* Sum of squared errors between two 4-pixel-wide blocks over h rows.
 * pix1[x]-pix2[x] lies in [-255,255]; sq points at the middle of
 * ff_squareTbl so negative differences index valid entries. */
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
    uint32_t *sq = ff_squareTbl + 256;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];

/* Same as sse4_c, for an 8-pixel-wide block. */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
    uint32_t *sq = ff_squareTbl + 256;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];

/* Same as sse4_c, for a 16-pixel-wide block. */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    uint32_t *sq = ff_squareTbl + 256;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];
/* Per-row pixel difference: block[i] = s1[i] - s2[i] for one 8-pixel row.
 * NOTE(review): the enclosing row loop and stride advances are not
 * visible in this excerpt. */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    /* read the pixels */
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
/* Store DCT coefficients as pixels, clamped to [0,255] (one 8-wide row
 * shown; row loop and stride advances are not visible in this excerpt). */
void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
    /* read the pixels */
        pixels[0] = av_clip_uint8(block[0]);
        pixels[1] = av_clip_uint8(block[1]);
        pixels[2] = av_clip_uint8(block[2]);
        pixels[3] = av_clip_uint8(block[3]);
        pixels[4] = av_clip_uint8(block[4]);
        pixels[5] = av_clip_uint8(block[5]);
        pixels[6] = av_clip_uint8(block[6]);
        pixels[7] = av_clip_uint8(block[7]);

/* 4-pixel-wide variant of ff_put_pixels_clamped_c. */
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
    /* read the pixels */
        pixels[0] = av_clip_uint8(block[0]);
        pixels[1] = av_clip_uint8(block[1]);
        pixels[2] = av_clip_uint8(block[2]);
        pixels[3] = av_clip_uint8(block[3]);

/* 2-pixel-wide variant of ff_put_pixels_clamped_c. */
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
    /* read the pixels */
        pixels[0] = av_clip_uint8(block[0]);
        pixels[1] = av_clip_uint8(block[1]);

/* Store signed coefficients biased by +128 and clamped to [0,255]
 * (the clamping branches for <-128 / >127 are partly outside this
 * excerpt). */
void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
                                    uint8_t *restrict pixels,
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            else if (*block > 127)
            /* in-range value: shift from signed to unsigned range */
            *pixels = (uint8_t)(*block + 128);
        pixels += (line_size - 8);

/* Add DCT coefficients to existing pixels, clamped to [0,255] (one
 * 8-wide row shown; row loop not visible in this excerpt). */
void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
    /* read the pixels */
        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
        pixels[4] = av_clip_uint8(pixels[4] + block[4]);
        pixels[5] = av_clip_uint8(pixels[5] + block[5]);
        pixels[6] = av_clip_uint8(pixels[6] + block[6]);
        pixels[7] = av_clip_uint8(pixels[7] + block[7]);

/* 4-pixel-wide variant of ff_add_pixels_clamped_c. */
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
    /* read the pixels */
        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
        pixels[3] = av_clip_uint8(pixels[3] + block[3]);

/* 2-pixel-wide variant of ff_add_pixels_clamped_c. */
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
    /* read the pixels */
        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
/* Sum of absolute values of the block's DCT coefficients (loop header
 * and return are not visible in this excerpt). */
static int sum_abs_dctelem_c(DCTELEM *block)
        sum+= FFABS(block[i]);
/* Fill a 16-byte-wide block with a constant value for h rows
 * (per-row stride advance not visible in this excerpt). */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
    for (i = 0; i < h; i++) {
        memset(block, value, 16);

/* 8-byte-wide variant of fill_block16_c. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
    for (i = 0; i < h; i++) {
        memset(block, value, 8);
/* Rounded averaging helpers for motion compensation:
 * avg2: rounded mean of two values; avg4: rounded mean of four.
 * Arguments are fully parenthesized so that operands containing
 * lower-precedence operators (e.g. avg2(a | b, c)) expand correctly;
 * the original expansion relied on callers passing simple operands. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* One-vector GMC: bilinear interpolation with 1/16-pel fractional
 * offsets (x16, y16 in [0,16)). A..D are the four corner weights and
 * sum to 256, so >>8 renormalizes after adding 'rounder'.
 * NOTE(review): the row loop and pointer advances are not visible in
 * this excerpt. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General GMC with an affine motion field: for each destination pixel,
 * derive the source coordinate (integer part src_x/src_y, fractional
 * part frac_x/frac_y with 'shift'-bit precision) and bilinearly
 * interpolate. Coordinates outside the picture fall back to
 * edge-clamped 1-D interpolation or a single clamped sample.
 * NOTE(review): the outer y loop, the coordinate update expressions and
 * the rounding/shift tail of each interpolation are not visible in this
 * excerpt. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
    const int s= 1<<shift;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;
            if((unsigned)src_x < width){           /* x inside picture */
                if((unsigned)src_y < height){      /* fully inside: 2-D bilinear */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                    /* y outside: clamp vertically, interpolate in x only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                /* x outside picture */
                if((unsigned)src_y < height){
                    /* clamp horizontally, interpolate in y only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                    /* both outside: nearest clamped sample */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
/* Third-pel motion compensation, "put" variants. The 683/2048 factor
 * approximates 1/3 and 2731/32768 approximates 1/12; mcXY encodes the
 * horizontal (X) and vertical (Y) third-pel position.
 * NOTE(review): per-function loop indices, pointer advances and closing
 * braces are not visible in this excerpt. */
/* mc00: plain copy, dispatched on block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    case 2: put_pixels2_8_c (dst, src, stride, height); break;
    case 4: put_pixels4_8_c (dst, src, stride, height); break;
    case 8: put_pixels8_8_c (dst, src, stride, height); break;
    case 16:put_pixels16_8_c(dst, src, stride, height); break;

/* mc10: horizontal 1/3 position (weights 2:1). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;

/* mc20: horizontal 2/3 position (weights 1:2). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;

/* mc01: vertical 1/3 position (weights 2:1). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;

/* mc11: (1/3, 1/3) position, 2-D weights 4:3:3:2. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;

/* mc12: (1/3, 2/3) position, 2-D weights 3:2:4:3. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;

/* mc02: vertical 2/3 position (weights 1:2). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;

/* mc21: (2/3, 1/3) position, 2-D weights 3:4:2:3. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;

/* mc22: (2/3, 2/3) position, 2-D weights 2:3:3:4. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Third-pel motion compensation, "avg" variants: same interpolation as
 * the put_tpel_* functions above, but the result is rounded-averaged
 * with the existing dst pixel ((dst + interp + 1) >> 1).
 * NOTE(review): per-function loop indices, pointer advances and closing
 * braces are not visible in this excerpt. */
/* mc00: average with a plain copy, dispatched on block width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    case 2: avg_pixels2_8_c (dst, src, stride, height); break;
    case 4: avg_pixels4_8_c (dst, src, stride, height); break;
    case 8: avg_pixels8_8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_8_c(dst, src, stride, height); break;

/* mc10: horizontal 1/3 position. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;

/* mc20: horizontal 2/3 position. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;

/* mc01: vertical 1/3 position. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;

/* mc11: (1/3, 1/3) position. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;

/* mc12: (1/3, 2/3) position. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;

/* mc02: vertical 2/3 position. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;

/* mc21: (2/3, 1/3) position. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;

/* mc22: (2/3, 2/3) position. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
807 #define QPEL_MC(r, OPNAME, RND, OP) \
808 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
809 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
813 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
814 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
815 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
816 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
817 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
818 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
819 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
820 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
826 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
828 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
832 const int src0= src[0*srcStride];\
833 const int src1= src[1*srcStride];\
834 const int src2= src[2*srcStride];\
835 const int src3= src[3*srcStride];\
836 const int src4= src[4*srcStride];\
837 const int src5= src[5*srcStride];\
838 const int src6= src[6*srcStride];\
839 const int src7= src[7*srcStride];\
840 const int src8= src[8*srcStride];\
841 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
842 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
843 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
844 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
845 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
846 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
847 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
848 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
854 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
855 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
860 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
861 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
862 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
863 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
864 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
865 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
866 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
867 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
868 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
869 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
870 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
871 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
872 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
873 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
874 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
875 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
881 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
882 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
887 const int src0= src[0*srcStride];\
888 const int src1= src[1*srcStride];\
889 const int src2= src[2*srcStride];\
890 const int src3= src[3*srcStride];\
891 const int src4= src[4*srcStride];\
892 const int src5= src[5*srcStride];\
893 const int src6= src[6*srcStride];\
894 const int src7= src[7*srcStride];\
895 const int src8= src[8*srcStride];\
896 const int src9= src[9*srcStride];\
897 const int src10= src[10*srcStride];\
898 const int src11= src[11*srcStride];\
899 const int src12= src[12*srcStride];\
900 const int src13= src[13*srcStride];\
901 const int src14= src[14*srcStride];\
902 const int src15= src[15*srcStride];\
903 const int src16= src[16*srcStride];\
904 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
905 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
906 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
907 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
908 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
909 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
910 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
911 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
912 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
913 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
914 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
915 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
916 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
917 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
918 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
919 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
925 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
927 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
928 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
931 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
932 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
935 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
937 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
938 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
941 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
944 copy_block9(full, src, 16, stride, 9);\
945 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
946 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
949 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
951 copy_block9(full, src, 16, stride, 9);\
952 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
955 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
958 copy_block9(full, src, 16, stride, 9);\
959 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
960 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
962 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
967 copy_block9(full, src, 16, stride, 9);\
968 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
969 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
970 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
971 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
973 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
977 copy_block9(full, src, 16, stride, 9);\
978 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
979 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
980 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
981 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
983 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
988 copy_block9(full, src, 16, stride, 9);\
989 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
990 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
991 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
992 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
994 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
998 copy_block9(full, src, 16, stride, 9);\
999 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1000 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1001 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1002 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1004 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1005 uint8_t full[16*9];\
1008 uint8_t halfHV[64];\
1009 copy_block9(full, src, 16, stride, 9);\
1010 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1011 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1012 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1013 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1015 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1016 uint8_t full[16*9];\
1018 uint8_t halfHV[64];\
1019 copy_block9(full, src, 16, stride, 9);\
1020 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1021 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1022 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1023 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1025 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1026 uint8_t full[16*9];\
1029 uint8_t halfHV[64];\
1030 copy_block9(full, src, 16, stride, 9);\
1031 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1032 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1033 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1034 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1036 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1037 uint8_t full[16*9];\
1039 uint8_t halfHV[64];\
1040 copy_block9(full, src, 16, stride, 9);\
1041 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1042 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1043 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1044 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1046 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1048 uint8_t halfHV[64];\
1049 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1050 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1051 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1053 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1055 uint8_t halfHV[64];\
1056 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1057 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1058 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1060 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1061 uint8_t full[16*9];\
1064 uint8_t halfHV[64];\
1065 copy_block9(full, src, 16, stride, 9);\
1066 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1067 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1068 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1069 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1071 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1072 uint8_t full[16*9];\
1074 copy_block9(full, src, 16, stride, 9);\
1075 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1076 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1077 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1079 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1080 uint8_t full[16*9];\
1083 uint8_t halfHV[64];\
1084 copy_block9(full, src, 16, stride, 9);\
1085 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1086 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1087 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1088 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1090 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1091 uint8_t full[16*9];\
1093 copy_block9(full, src, 16, stride, 9);\
1094 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1095 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1096 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1098 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1100 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1101 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1104 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1106 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1107 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1110 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1111 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1114 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1116 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1117 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1120 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1121 uint8_t full[24*17];\
1123 copy_block17(full, src, 24, stride, 17);\
1124 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1125 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1128 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1129 uint8_t full[24*17];\
1130 copy_block17(full, src, 24, stride, 17);\
1131 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1134 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1135 uint8_t full[24*17];\
1137 copy_block17(full, src, 24, stride, 17);\
1138 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1139 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1141 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1142 uint8_t full[24*17];\
1143 uint8_t halfH[272];\
1144 uint8_t halfV[256];\
1145 uint8_t halfHV[256];\
1146 copy_block17(full, src, 24, stride, 17);\
1147 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1148 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1149 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1150 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1152 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1153 uint8_t full[24*17];\
1154 uint8_t halfH[272];\
1155 uint8_t halfHV[256];\
1156 copy_block17(full, src, 24, stride, 17);\
1157 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1158 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1159 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1160 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1162 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1163 uint8_t full[24*17];\
1164 uint8_t halfH[272];\
1165 uint8_t halfV[256];\
1166 uint8_t halfHV[256];\
1167 copy_block17(full, src, 24, stride, 17);\
1168 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1169 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1170 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1171 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1173 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1174 uint8_t full[24*17];\
1175 uint8_t halfH[272];\
1176 uint8_t halfHV[256];\
1177 copy_block17(full, src, 24, stride, 17);\
1178 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1179 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1180 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1181 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1183 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1184 uint8_t full[24*17];\
1185 uint8_t halfH[272];\
1186 uint8_t halfV[256];\
1187 uint8_t halfHV[256];\
1188 copy_block17(full, src, 24, stride, 17);\
1189 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1190 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1191 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1192 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1194 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1195 uint8_t full[24*17];\
1196 uint8_t halfH[272];\
1197 uint8_t halfHV[256];\
1198 copy_block17(full, src, 24, stride, 17);\
1199 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1200 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1201 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1202 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1204 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1205 uint8_t full[24*17];\
1206 uint8_t halfH[272];\
1207 uint8_t halfV[256];\
1208 uint8_t halfHV[256];\
1209 copy_block17(full, src, 24, stride, 17);\
1210 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1211 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1212 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1213 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1215 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1216 uint8_t full[24*17];\
1217 uint8_t halfH[272];\
1218 uint8_t halfHV[256];\
1219 copy_block17(full, src, 24, stride, 17);\
1220 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1221 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1222 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1223 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1225 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1226 uint8_t halfH[272];\
1227 uint8_t halfHV[256];\
1228 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1229 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1230 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1232 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1233 uint8_t halfH[272];\
1234 uint8_t halfHV[256];\
1235 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1236 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1237 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1239 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1240 uint8_t full[24*17];\
1241 uint8_t halfH[272];\
1242 uint8_t halfV[256];\
1243 uint8_t halfHV[256];\
1244 copy_block17(full, src, 24, stride, 17);\
1245 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1246 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1247 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1248 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1250 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1251 uint8_t full[24*17];\
1252 uint8_t halfH[272];\
1253 copy_block17(full, src, 24, stride, 17);\
1254 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1255 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1256 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1258 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1259 uint8_t full[24*17];\
1260 uint8_t halfH[272];\
1261 uint8_t halfV[256];\
1262 uint8_t halfHV[256];\
1263 copy_block17(full, src, 24, stride, 17);\
1264 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1265 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1266 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1267 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1269 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1270 uint8_t full[24*17];\
1271 uint8_t halfH[272];\
1272 copy_block17(full, src, 24, stride, 17);\
1273 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1274 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1275 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1277 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1278 uint8_t halfH[272];\
1279 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1280 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel-write primitives plugged into QPEL_MC: 'b' is the 6-tap filter sum
 * scaled by 32; (b + 16) >> 5 rounds to nearest and cm[] clips to 0..255.
 * The *_no_rnd variants add 15 so ties round down (MPEG-4 "no rounding"). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the quarter-pel MC families: put, no-rounding put, and avg. */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg_no_rnd
#undef op_put_no_rnd
/* The mc00 (integer-pel) cases need no filtering: alias them to plain copies. */
#define put_qpel8_mc00_c ff_put_pixels8x8_c
#define avg_qpel8_mc00_c ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c /* NOTE(review): the _8_ infix differs from every other mc00 alias above -- confirm this symbol exists */
/*
 * WMV2 horizontal half-pel low-pass: for each of the 8 output pixels,
 * dst[i] = clip((9*(src[i]+src[i+1]) - (src[i-1]+src[i+2]) + 8) >> 4),
 * i.e. a 4-tap (-1, 9, 9, -1)/16 interpolation filter.
 * Reads src[-1]..src[9]; cm clips the result to 0..255.
 */
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
#if CONFIG_RV40_DECODER
/* RV40 treats the (3/4, 3/4) quarter-pel position as a plain diagonal
 * half-pel average, so these wrappers just forward to the xy2 copies. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
put_pixels16_xy2_8_c(dst, src, stride, 16);
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
avg_pixels16_xy2_8_c(dst, src, stride, 16);
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
put_pixels8_xy2_8_c(dst, src, stride, 8);
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
avg_pixels8_xy2_8_c(dst, src, stride, 8);
#endif /* CONFIG_RV40_DECODER */
#if CONFIG_DIRAC_DECODER
/*
 * Dirac motion-compensation wrappers. src[] carries up to four reference
 * pointers; plain variants use src[0], _l2 averages src[0]/src[1], _l4
 * averages src[0..3]. 32-pixel-wide variants are implemented as two
 * adjacent 16-wide calls into the shared 8-bit pixel primitives.
 */
#define DIRAC_MC(OPNAME)\
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
OPNAME ## _pixels16_8_c(dst , src[0] , stride, h);\
OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
OPNAME ## _pixels16_l2_8(dst , src[0] , src[1] , stride, stride, stride, h);\
OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
OPNAME ## _pixels16_l4_8(dst , src[0] , src[1] , src[2] , src[3] , stride, stride, stride, stride, stride, h);\
OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
/*
 * WMV2 vertical half-pel low-pass: the column counterpart of
 * wmv2_mspel8_h_lowpass, applying the same (-1, 9, 9, -1)/16 filter down
 * a column. Reads rows -1..9 of the current column and writes 8 rows.
 */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
const int src_1= src[ -srcStride];
const int src0 = src[0 ];
const int src1 = src[ srcStride];
const int src2 = src[2*srcStride];
const int src3 = src[3*srcStride];
const int src4 = src[4*srcStride];
const int src5 = src[5*srcStride];
const int src6 = src[6*srcStride];
const int src7 = src[7*srcStride];
const int src8 = src[8*srcStride];
const int src9 = src[9*srcStride];
dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* mc10: quarter-pel left -- average source with the horizontal half-pel filter. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
/* mc20: horizontal half-pel -- the h filter written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* mc30: quarter-pel right -- average src+1 with the horizontal filter. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
/* mc02: vertical half-pel only. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* mc12: h-filter 11 rows starting at src-stride, v-filter both the source
 * column and the h-filtered block (skipping its first row), then average. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* mc32: as mc12 but the plain vertical pass starts one pixel to the right. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* mc22: diagonal half-pel -- h filter then v filter, no averaging. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/*
 * H.263 deblocking across a horizontal block edge: for each column it
 * filters the four pixels p0..p3 (rows -2..+1 relative to the edge),
 * with filter strength looked up from the quantizer.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
const int strength= ff_h263_loop_filter_strength[qscale];
int p0= src[x-2*stride];
int p1= src[x-1*stride];
int p2= src[x+0*stride];
int p3= src[x+1*stride];
int d = (p0 - p3 + 4*(p2 - p1)) / 8; /* signed edge-activity measure */
/* d1 ramps: full correction for small |d|, tapering to zero beyond 2*strength */
if (d<-2*strength) d1= 0;
else if(d<- strength) d1=-2*strength - d;
else if(d< strength) d1= d;
else if(d< 2*strength) d1= 2*strength - d;
/* branchless clip to 0..255: for p<0, ~(p>>31)==0; for p>255 it yields
 * all-ones, which the uint8_t store truncates to 255 */
if(p1&256) p1= ~(p1>>31);
if(p2&256) p2= ~(p2>>31);
src[x-1*stride] = p1;
src[x+0*stride] = p2;
/* outer pixels get a smaller correction, clamped to the magnitude of d1 */
d2= av_clip((p0-p3)/4, -ad1, ad1);
src[x-2*stride] = p0 - d2;
src[x+ stride] = p3 + d2;
/*
 * H.263 deblocking across a vertical block edge: same filter as
 * h263_v_loop_filter_c, applied per row to the four pixels p0..p3 at
 * columns -2..+1 relative to the edge.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
const int strength= ff_h263_loop_filter_strength[qscale];
int p0= src[y*stride-2];
int p1= src[y*stride-1];
int p2= src[y*stride+0];
int p3= src[y*stride+1];
int d = (p0 - p3 + 4*(p2 - p1)) / 8; /* signed edge-activity measure */
/* d1 ramps: full correction for small |d|, tapering to zero beyond 2*strength */
if (d<-2*strength) d1= 0;
else if(d<- strength) d1=-2*strength - d;
else if(d< strength) d1= d;
else if(d< 2*strength) d1= 2*strength - d;
/* branchless clip to 0..255 (see h263_v_loop_filter_c) */
if(p1&256) p1= ~(p1>>31);
if(p2&256) p2= ~(p2>>31);
src[y*stride-1] = p1;
src[y*stride+0] = p2;
/* outer pixels: smaller correction clamped to |d1| */
d2= av_clip((p0-p3)/4, -ad1, ad1);
src[y*stride-2] = p0 - d2;
src[y*stride+1] = p3 + d2;
/*
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block.
 * Border rows/columns are passed through unfiltered (only scaled so the
 * final shifts normalise everything consistently).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
temp[x ] = 4*src[x ]; /* top/bottom rows: no vertical filter, pre-scaled by 4 */
temp[x + 7*8] = 4*src[x + 7*stride];
xy = y * stride + x;
temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride]; /* vertical (1,2,1) */
src[ y*stride] = (temp[ y*8] + 2)>>2; /* left/right columns: no horizontal filter */
src[7+y*stride] = (temp[7+y*8] + 2)>>2;
xy = y * stride + x;
src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4; /* horizontal (1,2,1), total /16 */
/*
 * Sum of absolute differences (SAD) over a 16-pixel-wide block, h rows,
 * fully unrolled across the row. Used as a motion-estimation metric.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
s += abs(pix1[0] - pix2[0]);
s += abs(pix1[1] - pix2[1]);
s += abs(pix1[2] - pix2[2]);
s += abs(pix1[3] - pix2[3]);
s += abs(pix1[4] - pix2[4]);
s += abs(pix1[5] - pix2[5]);
s += abs(pix1[6] - pix2[6]);
s += abs(pix1[7] - pix2[7]);
s += abs(pix1[8] - pix2[8]);
s += abs(pix1[9] - pix2[9]);
s += abs(pix1[10] - pix2[10]);
s += abs(pix1[11] - pix2[11]);
s += abs(pix1[12] - pix2[12]);
s += abs(pix1[13] - pix2[13]);
s += abs(pix1[14] - pix2[14]);
s += abs(pix1[15] - pix2[15]);
/*
 * SAD of pix1 against the horizontal half-pel interpolation of pix2
 * (avg2 of each pixel and its right neighbour). Note pix2[16] is read:
 * one pixel past the 16-wide block, needed for the last average.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/*
 * SAD of pix1 against the vertical half-pel interpolation of pix2
 * (avg2 of each pixel and the one directly below, via pix3).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
uint8_t *pix3 = pix2 + line_size; /* next row of the reference */
s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/*
 * SAD of pix1 against the diagonal half-pel interpolation of pix2
 * (avg4 of a 2x2 neighbourhood). Reads one pixel past column 15.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
uint8_t *pix3 = pix2 + line_size; /* next row of the reference */
s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-pixel-wide SAD: full-pel comparison. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
s += abs(pix1[0] - pix2[0]);
s += abs(pix1[1] - pix2[1]);
s += abs(pix1[2] - pix2[2]);
s += abs(pix1[3] - pix2[3]);
s += abs(pix1[4] - pix2[4]);
s += abs(pix1[5] - pix2[5]);
s += abs(pix1[6] - pix2[6]);
s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD against horizontal half-pel reference (reads pix2[8]). */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD against vertical half-pel reference. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
uint8_t *pix3 = pix2 + line_size; /* next row of the reference */
s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD against diagonal half-pel reference (2x2 avg4). */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
uint8_t *pix3 = pix2 + line_size; /* next row of the reference */
s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/*
 * "Noise-preserving" SSE, 16-wide: score1 is the plain squared error,
 * score2 measures how differently the two blocks' 2x2 gradients behave
 * (so matching texture/noise is penalised less than blurring it away).
 * The gradient term is weighted by avctx->nsse_weight, defaulting to 8
 * when no context is available.
 */
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
MpegEncContext *c = v;
for(x=0; x<16; x++){
score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
for(x=0; x<15; x++){
score2+= FFABS( s1[x ] - s1[x +stride]
- s1[x+1] + s1[x+1+stride])
-FFABS( s2[x ] - s2[x +stride]
- s2[x+1] + s2[x+1+stride]);
if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c. */
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
MpegEncContext *c = v;
score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
score2+= FFABS( s1[x ] - s1[x +stride]
- s1[x+1] + s1[x+1+stride])
-FFABS( s2[x ] - s2[x +stride]
- s2[x+1] + s2[x+1+stride]);
if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
else return score1 + FFABS(score2)*8;
/*
 * Evaluate the weighted squared error that would remain if 'basis' scaled
 * by 'scale' were added to the residual 'rem'. The basis contribution is
 * rounded down from BASIS_SHIFT to RECON_SHIFT precision before adding.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
for(i=0; i<8*8; i++){
int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
assert(-512<b && b<512); /* residual must stay within 10 bits */
sum += (w*b)*(w*b)>>4;
/* Commit the scaled basis function into the residual (same rounding as above). */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
for(i=0; i<8*8; i++){
rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
* Permute an 8x8 block of coefficients in place.
* @param block the block to be permuted according to the permutation vector
* @param permutation the permutation vector (source index -> destination index)
* @param last index of the last nonzero coefficient in scantable order; only
*        coefficients up to and including it are touched, which speeds this up
* @param scantable the scan order in use; used only to limit the work to the
*        nonzero range -- the block is NOT (inverse-)permuted into scan order!
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
//if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* first pass: stash the affected coefficients (via scantable) ... */
for(i=0; i<=last; i++){
const int j= scantable[i];
/* ... second pass: write each one back at its permuted position */
for(i=0; i<=last; i++){
const int j= scantable[i];
const int perm_j= permutation[j];
block[perm_j]= temp[j];
/* Dummy comparison function that always scores zero (used for FF_CMP_ZERO). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/*
 * Fill cmp[0..5] with the comparison functions from DSPContext matching
 * the requested metric 'type'; logs an error for unknown types.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
memset(cmp, 0, sizeof(void*)*6);
cmp[i]= c->hadamard8_diff[i];
cmp[i]= c->dct_sad[i];
cmp[i]= c->dct264_sad[i];
cmp[i]= c->dct_max[i];
cmp[i]= c->quant_psnr[i];
av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/*
 * dst[i] += src[i] for w bytes. Fast path packs sizeof(long) bytes into a
 * machine word and adds them SWAR-style: low 7 bits are added with pb_7f
 * masks, the top bit of each byte is fixed up via XOR with pb_80 so carries
 * never cross byte boundaries. Remaining tail bytes are added one by one.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
long a = *(long*)(src+i);
long b = *(long*)(dst+i);
*(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
dst[i+0] += src[i+0]; /* scalar tail */
/*
 * dst[i] = src1[i] - src2[i] for w bytes. Uses a SWAR word-at-a-time
 * subtraction (borrow confined per byte with pb_7f/pb_80 masks) plus a
 * scalar tail. When unaligned loads are slow and src2 is misaligned, an
 * unrolled byte loop is used instead.
 */
static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
#if !HAVE_FAST_UNALIGNED
if((long)src2 & (sizeof(long)-1)){
for(i=0; i+7<w; i+=8){
dst[i+0] = src1[i+0]-src2[i+0];
dst[i+1] = src1[i+1]-src2[i+1];
dst[i+2] = src1[i+2]-src2[i+2];
dst[i+3] = src1[i+3]-src2[i+3];
dst[i+4] = src1[i+4]-src2[i+4];
dst[i+5] = src1[i+5]-src2[i+5];
dst[i+6] = src1[i+6]-src2[i+6];
dst[i+7] = src1[i+7]-src2[i+7];
for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
long a = *(long*)(src1+i);
long b = *(long*)(src2+i);
*(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
dst[i+0] = src1[i+0]-src2[i+0]; /* scalar tail */
/* HuffYUV median predictor (decode side): reconstruct each byte as
 * mid_pred(left, above, left+above-above_left) + residual. */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* HuffYUV median predictor (encode side): emit residual = pixel - prediction. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Left prediction: running byte-wise accumulation across the row,
 * seeded with and returning the accumulator 'acc'. */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
for(i=0; i<w-1; i++){
/* BGR32 variant of left prediction: per-channel accumulators for b/g/r/a. */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterflies: BUTTERFLY2 writes sum/difference of two inputs to
 * two outputs, BUTTERFLY1 does it in place on (x, y), and BUTTERFLYA
 * accumulates |x+y| + |x-y| for the final absolute-coefficient sum. */
#define BUTTERFLY2(o1,o2,i1,i2) \
#define BUTTERFLY1(x,y) \
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/*
 * 8x8 Hadamard transform (SATD) of the difference src - dst:
 * three butterfly stages along each row, then along each column,
 * finishing with BUTTERFLYA to sum absolute transform coefficients.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
//FIXME try pointer walks
/* horizontal pass: stage 1 on the pixel differences of row i */
BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
/* stage 2 */
BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
/* stage 3 */
BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical pass over column i */
BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* last vertical stage fused with |.| accumulation */
BUTTERFLYA(temp[8*0+i], temp[8*4+i])
+BUTTERFLYA(temp[8*1+i], temp[8*5+i])
+BUTTERFLYA(temp[8*2+i], temp[8*6+i])
+BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/*
 * Intra variant of hadamard8_diff8x8_c: same 8x8 Hadamard/SATD but on the
 * source pixels themselves ('dummy' is unused), and the DC term is
 * subtracted at the end so the score reflects AC energy only.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
//FIXME try pointer walks
/* horizontal pass: stage 1 on row i */
BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
/* stage 2 */
BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
/* stage 3 */
BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical pass over column i */
BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* last vertical stage fused with |.| accumulation */
BUTTERFLYA(temp[8*0+i], temp[8*4+i])
+BUTTERFLYA(temp[8*1+i], temp[8*5+i])
+BUTTERFLYA(temp[8*2+i], temp[8*6+i])
+BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/*
 * DCT-domain SAD: take the pixel difference of the two 8x8 blocks and
 * return the sum of absolute DCT coefficients via sum_abs_dctelem.
 * (The forward-transform call sits between these lines in the full file.)
 */
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
MpegEncContext * const s= (MpegEncContext *)c;
LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
s->dsp.diff_pixels(temp, src1, src2, stride);
return s->dsp.sum_abs_dctelem(temp);
/* Body of the DCT8_1D macro (its #define line precedes this in the full
 * file): one 8-point 1-D transform over the SRC/DST accessors. Presumably
 * the H.264-style integer DCT, given its use by dct264_sad8x8_c below --
 * NOTE(review): confirm against the macro header. */
const int s07 = SRC(0) + SRC(7);\
const int s16 = SRC(1) + SRC(6);\
const int s25 = SRC(2) + SRC(5);\
const int s34 = SRC(3) + SRC(4);\
const int a0 = s07 + s34; /* even part */\
const int a1 = s16 + s25;\
const int a2 = s07 - s34;\
const int a3 = s16 - s25;\
const int d07 = SRC(0) - SRC(7); /* odd part */\
const int d16 = SRC(1) - SRC(6);\
const int d25 = SRC(2) - SRC(5);\
const int d34 = SRC(3) - SRC(4);\
const int a4 = d16 + d25 + (d07 + (d07>>1)); /* x + (x>>1) == 1.5x, multiplier-free */\
const int a5 = d07 - d34 - (d25 + (d25>>1));\
const int a6 = d07 + d34 - (d16 + (d16>>1));\
const int a7 = d16 - d25 + (d34 + (d34>>1));\
DST(1, a4 + (a7>>2)) ;\
DST(2, a2 + (a3>>1)) ;\
DST(3, a5 + (a6>>2)) ;\
DST(5, a6 - (a5>>2)) ;\
DST(6, (a2>>1) - a3 ) ;\
DST(7, (a4>>2) - a7 ) ;\
/*
 * H.264-transform SAD: apply DCT8_1D to the rows of the pixel difference
 * (SRC/DST read and write dct[][] in place), then down the columns where
 * DST simply accumulates the absolute value of each coefficient.
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
MpegEncContext * const s= (MpegEncContext *)c;
s->dsp.diff_pixels(dct[0], src1, src2, stride);
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
for( i = 0; i < 8; i++ )
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
for( i = 0; i < 8; i++ )
/*
 * DCT-domain "max" metric: like dct_sad8x8_c but returns the largest
 * absolute DCT coefficient of the block difference instead of the sum.
 */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
MpegEncContext * const s= (MpegEncContext *)c;
LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
s->dsp.diff_pixels(temp, src1, src2, stride);
sum= FFMAX(sum, FFABS(temp[i]));
/*
 * Quantization-distortion metric: transform+quantize the block difference,
 * dequantize and inverse-transform it, and return the squared error
 * against the saved pre-quantization coefficients.
 */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
MpegEncContext * const s= (MpegEncContext *)c;
LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
DCTELEM * const bak = temp+64; /* second half keeps the unquantized copy */
s->dsp.diff_pixels(temp, src1, src2, stride);
memcpy(bak, temp, 64*sizeof(DCTELEM));
s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
s->dct_unquantize_inter(s, temp, 0, s->qscale);
ff_simple_idct_8(temp); //FIXME
sum+= (temp[i]-bak[i])*(temp[i]-bak[i]); /* coefficient-domain SSE */
/*
 * Rate-distortion metric for an 8x8 block: quantize the difference, count
 * the VLC bits needed to code the run/level pairs, reconstruct the block
 * (dequantize + idct_add) and measure the resulting SSE. Returns
 * distortion + lambda*bits with lambda derived from qscale.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
MpegEncContext * const s= (MpegEncContext *)c;
const uint8_t *scantable= s->intra_scantable.permutated;
LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
int i, last, run, bits, level, distortion, start_i;
const int esc_length= s->ac_esc_length;
uint8_t * last_length;
/* work on local 8x8 copies so lsrc2 can be modified by idct_add below */
copy_block8(lsrc1, src1, 8, stride, 8);
copy_block8(lsrc2, src2, 8, stride, 8);
s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: separate DC table plus intra AC VLC lengths */
length = s->intra_ac_vlc_length;
last_length= s->intra_ac_vlc_last_length;
bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
length = s->inter_ac_vlc_length;
last_length= s->inter_ac_vlc_last_length;
/* count bits for all (run, level) pairs up to the last coefficient */
for(i=start_i; i<last; i++){
int j= scantable[i];
if((level&(~127)) == 0){ /* |level| <= 127: codable with the uni table */
bits+= length[UNI_AC_ENC_INDEX(run, level)];
level= temp[i] + 64; /* bias so the table index is non-negative */
if((level&(~127)) == 0){
bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
s->dct_unquantize_intra(s, temp, 0, s->qscale);
s->dct_unquantize_inter(s, temp, 0, s->qscale);
/* reconstruct into lsrc2 and measure distortion vs. the original */
s->dsp.idct_add(lsrc2, 8, temp);
distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7); /* D + lambda*R */
/* cmp function: estimated VLC bit cost of coding the 8x8 difference block
 * (rate only — no distortion term, unlike rd8x8_c above).
 * NOTE(review): interior lines (branch structure, run/level extraction,
 * return of bits) are elided in this chunk. */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * last_length;
    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    /* intra path: luma DC costed from its own table */
        length = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    /* inter path */
        length = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    /* cost each coefficient in scan order */
    for(i=start_i; i<last; i++){
        int j= scantable[i];
        if((level&(~127)) == 0){
            bits+= length[UNI_AC_ENC_INDEX(run, level)];
        /* last coefficient, biased by 64 for table lookup */
        level= temp[i] + 64;
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Generates vsad_intra<size>_c: vertical SAD within one block — sums
 * |s[x] - s[x+stride]| over adjacent rows, 4 pixels per inner step.
 * NOTE(review): the score declaration, loop tail and return are elided. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* Vertical SAD of the 16-wide difference signal s1-s2: sums
 * |(s1-s2)[x] - (s1-s2)[x+stride]| row by row.
 * NOTE(review): declarations, outer loop and return are elided. */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    for(x=0; x<16; x++){
        score+= FFABS(s1[x  ] - s2[x  ] - s1[x  +stride] + s2[x  +stride]);
/* SQ(a): square of a — argument fully parenthesized. */
#define SQ(a) ((a)*(a))
/* Generates vsse_intra<size>_c: like VSAD_INTRA but accumulates squared
 * vertical differences. NOTE(review): score decl, loop tail and return
 * are elided in this chunk. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* Vertical SSE of the 16-wide difference signal s1-s2 (squared variant of
 * vsad16_c). NOTE(review): declarations, outer loop and return are elided. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    for(x=0; x<16; x++){
        score+= SQ(s1[x  ] - s2[x  ] - s1[x  +stride] + s2[x  +stride]);
/* Sum of squared differences between an int8 and an int16 vector of the
 * same length. NOTE(review): the score declaration/init and return are
 * elided in this chunk. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Build 16x16 cmp variants by summing the 8x8 function over the four
 * quadrants (WRAPPER8_16_SQ is defined earlier in this file). */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* dst[i] = src0[i] * src1 read backwards. NOTE(review): the line that
 * advances src1 to its last element (so that src1[-i] indexes in reverse)
 * is elided in this chunk — confirm against the full file. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
/* Fused multiply-add over vectors: dst[i] = src0[i]*src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
/* Overlap-add windowing: writes 2*len outputs symmetric around dst, using
 * window coefficients wi/wj taken from win. NOTE(review): the pointer
 * re-centering (dst/win += len, src0 += len) and the s0/s1/wi/wj loads are
 * elided in this chunk. */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
    /* i walks the first half from -len, j mirrors it from the top */
    for(i=-len, j=len-1; i<0; i++, j--) {
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
/* Scale a float vector by a scalar: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
/* In-place butterfly: v1[i] becomes the sum, v2[i] the difference.
 * NOTE(review): the lines storing t into v2 and the sum into v1 are elided. */
static void butterflies_float_c(float *av_restrict v1, float *av_restrict v2,
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
/* Butterfly with interleaved output: dst[2i] = src0[i]+src1[i],
 * dst[2i+1] = src0[i]-src1[i]. NOTE(review): the f1/f2 loads are elided. */
static void butterflies_float_interleave_c(float *dst, const float *src0,
                                           const float *src1, int len)
    for (i = 0; i < len; i++) {
        dst[2*i    ] = f1 + f2;
        dst[2*i + 1] = f1 - f2;
/* Dot product of two float vectors. NOTE(review): the accumulator and the
 * per-element multiply-accumulate/return are elided in this chunk. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
    for (i = 0; i < len; i++)
/* Clamp one float, operating on its raw IEEE-754 bits (all arguments are
 * the bit patterns of floats). Used only by vector_clipf_c_opposite_sign,
 * which guarantees min < 0 < max so sign-bit tricks are valid.
 * NOTE(review): the final "else return a;" path is elided in this chunk. */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
    if(a > mini) return mini;                       /* negative side clamp */
    else if((a^(1U<<31)) > maxisign) return maxi;   /* positive side clamp */
/* Clamp a float vector to [*min, *max] where *min < 0 < *max, working on
 * the raw bit patterns (see clipf_c_one). len is processed 8 at a time,
 * so callers must pass a multiple of 8.
 * NOTE(review): the type-punning pointer casts below violate strict
 * aliasing in principle — long-standing in this codebase, but flagged. */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);    /* max with sign bit flipped */
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    /* unrolled x8 */
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clamp a float vector to [min, max], 8 elements per iteration.
 * When the range straddles zero, dispatch to the faster bit-pattern
 * variant; otherwise clip with av_clipf.
 * NOTE(review): the "} else {" joining the two paths is elided here. */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        /* generic path, unrolled x8 */
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Dot product of two int16 vectors, accumulated in int32.
 * NOTE(review): the accumulator declaration, loop and return are elided. */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
    res += *v1++ * *v2++;
/* Combined op: returns dot(v1, v2) while also doing v1 += mul * v3
 * elementwise. NOTE(review): the dot-product accumulation, loop and
 * return are elided in this chunk. */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
    *v1++ += mul * *v3++;
/* Apply a symmetric int16 window: output[i] and output[len-1-i] share the
 * same coefficient window[i]. Q15 multiply with round-to-nearest
 * (+1<<14 before >>15). */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
    int len2 = len >> 1;    /* window is stored for the first half only */
    for (i = 0; i < len2; i++) {
        int16_t w = window[i];
        output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/* Clamp an int32 vector to [min, max], 8 elements per iteration
 * (callers pass len as a multiple of 8). NOTE(review): the enclosing
 * do/while or for loop driving these 8 statements is elided. */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
/* Fixed-point IDCT constants: round(2048*sqrt(2)*cos(k*pi/16)) for k=1..7.
 * The W0 (= 2048) used by the wmv2 IDCT below is defined outside this
 * chunk. */
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
/* One 8-point row pass of the WMV2 inverse DCT, in place.
 * Standard even/odd butterfly decomposition; 181/256 ~ 1/sqrt(2).
 * NOTE(review): the s1/s2 declarations are elided in this chunk. */
static void wmv2_idct_row(short * b)
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /* odd part */
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    /* even part */
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /* rotation by 1/sqrt(2), rounded */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /* output butterflies, rounded back to 8 fractional bits */
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One 8-point column pass of the WMV2 inverse DCT, in place (stride 8).
 * Same butterfly structure as the row pass, with an extra >>3 of headroom
 * in step 1 and a final >>14 normalization.
 * NOTE(review): the s1/s2 declarations are elided in this chunk. */
static void wmv2_idct_col(short * b)
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /* 1/sqrt(2) rotation */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /* output butterflies */
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 8x8 WMV2 IDCT: row pass over all 8 rows, then column pass over all
 * 8 columns. NOTE(review): the loop headers driving i are elided here. */
void ff_wmv2_idct_c(short * block){
        wmv2_idct_row(block+i);
        wmv2_idct_col(block+i);
/* XXX: these wrapper functions should be removed once all IDCTs are
   converted to the adapted API (comment truncated in this chunk). */
/* IDCT the block and store the clamped result into dest. */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
    ff_wmv2_idct_c(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
/* IDCT the block and add the clamped result onto dest. */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
    ff_wmv2_idct_c(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
/* Reference (jrevdct) IDCT wrappers: transform then store / add. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
    ff_j_rev_dct (block);
    ff_put_pixels_clamped_c(block, dest, line_size);
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
    ff_j_rev_dct (block);
    ff_add_pixels_clamped_c(block, dest, line_size);
/* 4x4 reference IDCT wrappers, used for lowres==1 decoding. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
    ff_j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
    ff_j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 reference IDCT wrappers, used for lowres==2 decoding. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
    ff_j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
    ff_j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 "IDCT" for lowres==3: only the DC term survives, scaled by 1/8
 * with rounding. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
    dest[0] = av_clip_uint8((block[0] + 4)>>3);
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
    dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2780 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
/* init static data */
/* Fills the process-wide lookup tables: ff_cropTbl (clamp-to-[0,255] with
 * MAX_NEG_CROP guard bands on both sides), ff_squareTbl ((i-256)^2), and
 * ff_inv_zigzag_direct16. Safe to call once at startup.
 * NOTE(review): the zero-fill of the low guard band and closing braces are
 * elided in this chunk. */
av_cold void ff_dsputil_static_init(void)
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;   /* high guard band saturates */
    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    /* inverse zigzag, stored 1-based */
    for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Verifies that the compiler honors 16-byte stack alignment for
 * LOCAL_ALIGNED_16 variables; logs a one-time error on MMX/AltiVec builds
 * if not. NOTE(review): the did_fail guard use and return paths are elided
 * in this chunk. */
int ff_check_alignment(void){
    static int did_fail=0;
    LOCAL_ALIGNED_16(int, aligned, [4]);
    if((intptr_t)aligned & 15){
#if HAVE_MMX || HAVE_ALTIVEC
        av_log(NULL, AV_LOG_ERROR,
            "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
            "and may be very slow or crash. This is not a bug in libavcodec,\n"
            "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
            "Do not report crashes to FFmpeg developers.\n");
/* Populates a DSPContext with the C reference implementations, selects
 * FDCT/IDCT variants from avctx settings (dct_algo, idct_algo, lowres,
 * bits_per_raw_sample), then lets arch-specific init functions override
 * entries. NOTE(review): many structural lines (braces, else branches,
 * #if/#endif pairs, case labels) are elided throughout this chunk. */
av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
    ff_check_alignment();
    /* ---- encoder FDCT selection (inside CONFIG_ENCODERS) ---- */
    if (avctx->bits_per_raw_sample == 10) {
        c->fdct = ff_jpeg_fdct_islow_10;
        c->fdct248 = ff_fdct248_islow_10;
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = ff_fdct_ifast;
        c->fdct248 = ff_fdct_ifast248;
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
        c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow_8;
#endif //CONFIG_ENCODERS
    /* ---- IDCT selection: lowres uses reduced-size reference IDCTs ---- */
    if(avctx->lowres==1){
        c->idct_put= ff_jref_idct4_put;
        c->idct_add= ff_jref_idct4_add;
        c->idct = ff_j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct = ff_j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct = ff_j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
        /* full-resolution IDCT selection by bit depth / idct_algo */
        if (avctx->bits_per_raw_sample == 10) {
            c->idct_put = ff_simple_idct_put_10;
            c->idct_add = ff_simple_idct_add_10;
            c->idct = ff_simple_idct_10;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct = ff_j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put = ff_simple_idct_put_8;
            c->idct_add = ff_simple_idct_add_8;
            c->idct = ff_simple_idct_8;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
    /* ---- pixel/block helpers ---- */
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;
    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    /* pix_abs[0][] = 16-wide variants, pix_abs[1][] = 8-wide variants */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;
    /* ---- thirdpel MC tables ---- */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* Fills one 16-entry quarter-pel MC table from the PFX##NUM##_mcXY_c
 * function family (index = XY sub-pel position). */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);
    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);
    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
    /* ---- WMV2 mspel MC ---- */
    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* Installs the 16x16 and 8x8 variants of a cmp function pair. */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;
    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
    SET_CMP_FUNC(dct264_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    SET_CMP_FUNC(quant_psnr)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
    ff_dsputil_init_dwt(c);
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
    /* ---- lossless/HuffYUV helpers ---- */
    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;
    /* ---- codec-specific loop filters ---- */
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    c->h261_loop_filter= h261_loop_filter_c;
    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;
#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling;
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
    /* ---- generic float/int vector ops ---- */
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->vector_clip_int32 = vector_clip_int32_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->butterflies_float_interleave = butterflies_float_interleave_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;
    /* plane shrinkers: index n halves each dimension n times */
    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;
    c->prefetch= just_return;
    /* cleared here; defaulted to the h264 qpel functions at the end */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
/* Token-pasting helpers for bit-depth-templated function names. */
#define FUNC(f, depth) f ## _ ## depth
#define FUNCC(f, depth) f ## _ ## depth ## _c
#define dspfunc1(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
    c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
    c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
    c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
#define dspfunc2(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
    c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
    c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
    c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
    c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
    c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
    c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
    c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
    c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
    c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
    c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
    c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
    c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
    c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
    c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
    c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
/* Installs every bit-depth-dependent entry for one (depth, dct-width)
 * combination. */
#define BIT_DEPTH_FUNCS(depth, dct)\
    c->get_pixels                = FUNCC(get_pixels   ## dct , depth);\
    c->draw_edges                = FUNCC(draw_edges          , depth);\
    c->emulated_edge_mc          = FUNC (ff_emulated_edge_mc , depth);\
    c->clear_block               = FUNCC(clear_block  ## dct , depth);\
    c->clear_blocks              = FUNCC(clear_blocks ## dct , depth);\
    c->add_pixels8               = FUNCC(add_pixels8  ## dct , depth);\
    c->add_pixels4               = FUNCC(add_pixels4  ## dct , depth);\
    c->put_no_rnd_pixels_l2[0]   = FUNCC(put_no_rnd_pixels16_l2, depth);\
    c->put_no_rnd_pixels_l2[1]   = FUNCC(put_no_rnd_pixels8_l2 , depth);\
    c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
    c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
    c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
    c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
    c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
    c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
    dspfunc1(put       , 0, 16, depth);\
    dspfunc1(put       , 1,  8, depth);\
    dspfunc1(put       , 2,  4, depth);\
    dspfunc1(put       , 3,  2, depth);\
    dspfunc1(put_no_rnd, 0, 16, depth);\
    dspfunc1(put_no_rnd, 1,  8, depth);\
    dspfunc1(avg       , 0, 16, depth);\
    dspfunc1(avg       , 1,  8, depth);\
    dspfunc1(avg       , 2,  4, depth);\
    dspfunc1(avg       , 3,  2, depth);\
    dspfunc1(avg_no_rnd, 0, 16, depth);\
    dspfunc1(avg_no_rnd, 1,  8, depth);\
    dspfunc2(put_h264_qpel, 0, 16, depth);\
    dspfunc2(put_h264_qpel, 1,  8, depth);\
    dspfunc2(put_h264_qpel, 2,  4, depth);\
    dspfunc2(put_h264_qpel, 3,  2, depth);\
    dspfunc2(avg_h264_qpel, 0, 16, depth);\
    dspfunc2(avg_h264_qpel, 1,  8, depth);\
    dspfunc2(avg_h264_qpel, 2,  4, depth);
    /* pick the template instantiation matching the stream bit depth;
       dct_bits selects 16- vs 32-bit DCTELEM variants */
    switch (avctx->bits_per_raw_sample) {
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(9, _32);
            BIT_DEPTH_FUNCS(9, _16);
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(10, _32);
            BIT_DEPTH_FUNCS(10, _16);
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(12, _32);
            BIT_DEPTH_FUNCS(12, _16);
        if (c->dct_bits == 32) {
            BIT_DEPTH_FUNCS(14, _32);
            BIT_DEPTH_FUNCS(14, _16);
        if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
            BIT_DEPTH_FUNCS(8, _16);
    /* ---- arch-specific overrides of the C defaults ---- */
    if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
    if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        ff_dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
    /* any 2tap qpel slot not set by an arch init falls back to h264 qpel */
    for (i = 0; i < 4; i++) {
        for (j = 0; j < 16; j++) {
            if(!c->put_2tap_qpel_pixels_tab[i][j])
                c->put_2tap_qpel_pixels_tab[i][j] =
                    c->put_h264_qpel_pixels_tab[i][j];
            if(!c->avg_2tap_qpel_pixels_tab[i][j])
                c->avg_2tap_qpel_pixels_tab[i][j] =
                    c->avg_h264_qpel_pixels_tab[i][j];
    ff_init_scantable_permutation(c->idct_permutation,
                                  c->idct_permutation_type);
/* Deprecated public entry point; forwards to ff_dsputil_init(). */
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
    ff_dsputil_init(c, avctx);