3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Clipping lookup table: ff_cropTbl[MAX_NEG_CROP + x] clamps x to 0..255
 * without branches.  Zero-initialized here; presumably filled by an init
 * routine elsewhere in this file — TODO confirm. */
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Squares table: ff_squareTbl[256 + x] == x*x for x in -255..255 (the SSE
 * helpers below index it through a pointer offset by 256).  Zero-initialized
 * here; presumably filled at init — TODO confirm. */
44 uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_template.c"
51 #include "dsputil_template.c"
55 #include "dsputil_template.c"
57 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 yields 0x0101...01 (one byte set per lane), so multiplying by a
 * byte value replicates it into every byte of an unsigned long. */
58 #define pb_7f (~0UL/255 * 0x7f)
59 #define pb_80 (~0UL/255 * 0x80)
/* Classic 8x8 zigzag scan order (JPEG/MPEG style): maps scan position to
 * raster coefficient index, walking anti-diagonals from DC to the
 * highest-frequency coefficient. */
61 const uint8_t ff_zigzag_direct[64] = {
62 0, 1, 8, 16, 9, 2, 3, 10,
63 17, 24, 32, 25, 18, 11, 4, 5,
64 12, 19, 26, 33, 40, 48, 41, 34,
65 27, 20, 13, 6, 7, 14, 21, 28,
66 35, 42, 49, 56, 57, 50, 43, 36,
67 29, 22, 15, 23, 30, 37, 44, 51,
68 58, 59, 52, 45, 38, 31, 39, 46,
69 53, 60, 61, 54, 47, 55, 62, 63
72 /* Specific zigzag scan for 248 idct. NOTE that unlike the
73 specification, we interleave the fields */
/* Zigzag scan for the 2-4-8 (field) IDCT; pairs rows from the two fields
 * together instead of following the spec's field-separated order. */
74 const uint8_t ff_zigzag248_direct[64] = {
75 0, 8, 1, 9, 16, 24, 2, 10,
76 17, 25, 32, 40, 48, 56, 33, 41,
77 18, 26, 3, 11, 4, 12, 19, 27,
78 34, 42, 49, 57, 50, 58, 35, 43,
79 20, 28, 5, 13, 6, 14, 21, 29,
80 36, 44, 51, 59, 52, 60, 37, 45,
81 22, 30, 7, 15, 23, 31, 38, 46,
82 53, 61, 54, 62, 39, 47, 55, 63,
85 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* 16-byte aligned so SIMD code can load it with aligned accesses. */
86 DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
/* Alternate horizontal scan pattern (favors horizontal frequencies first). */
88 const uint8_t ff_alternate_horizontal_scan[64] = {
89 0, 1, 2, 3, 8, 9, 16, 17,
90 10, 11, 4, 5, 6, 7, 15, 14,
91 13, 12, 19, 18, 24, 25, 32, 33,
92 26, 27, 20, 21, 22, 23, 28, 29,
93 30, 31, 34, 35, 40, 41, 48, 49,
94 42, 43, 36, 37, 38, 39, 44, 45,
95 46, 47, 50, 51, 56, 57, 58, 59,
96 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan pattern (favors vertical frequencies first);
 * the transpose-like counterpart of ff_alternate_horizontal_scan. */
99 const uint8_t ff_alternate_vertical_scan[64] = {
100 0, 8, 16, 24, 1, 9, 2, 10,
101 17, 25, 32, 40, 48, 56, 57, 49,
102 41, 33, 26, 18, 3, 11, 4, 12,
103 19, 27, 34, 42, 50, 58, 35, 43,
104 51, 59, 20, 28, 5, 13, 6, 14,
105 21, 29, 36, 44, 52, 60, 37, 45,
106 53, 61, 22, 30, 7, 15, 23, 31,
107 38, 46, 54, 62, 39, 47, 55, 63,
110 /* Input permutation for the simple_idct_mmx */
/* Values are hex so the row (high nibble) / column (low nibble) swizzle of
 * each coefficient is readable at a glance. */
111 static const uint8_t simple_mmx_permutation[64]={
112 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
113 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
114 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
115 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
116 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
117 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
118 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
119 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Per-row column interleave used by the SSE2 IDCT permutation below
 * (applied to the low 3 bits of the coefficient index). */
122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Initialize a ScanTable from a raster scan order: stores the raw table,
 * builds the IDCT-permuted variant, and records per-position raster end
 * markers.  NOTE(review): the surrounding loop headers are not visible in
 * this excerpt — comments below describe only the visible statements. */
124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
128 st->scantable= src_scantable;
/* remap scan position i through the IDCT coefficient permutation */
132 j = src_scantable[i];
133 st->permutated[i] = permutation[j];
/* raster_end[i] tracks how far the raster scan must run to cover the
 * first i+1 permuted coefficients */
139 j = st->permutated[i];
141 st->raster_end[i]= end;
/* Fill idct_permutation[0..63] according to the given permutation type, so
 * quantization tables / scantables can be laid out in the order the chosen
 * IDCT implementation expects.  NOTE(review): the per-case loop headers and
 * break statements are not visible in this excerpt. */
145 void ff_init_scantable_permutation(uint8_t *idct_permutation,
146 int idct_permutation_type)
150 switch(idct_permutation_type){
151 case FF_NO_IDCT_PERM:
/* identity: coefficient order unchanged */
153 idct_permutation[i]= i;
155 case FF_LIBMPEG2_IDCT_PERM:
/* keep the row (bits 3..5), swap bit groups within the column */
157 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
159 case FF_SIMPLE_IDCT_PERM:
161 idct_permutation[i]= simple_mmx_permutation[i];
163 case FF_TRANSPOSE_IDCT_PERM:
/* swap row and column: 8x8 transpose of the index */
165 idct_permutation[i]= ((i&7)<<3) | (i>>3);
167 case FF_PARTTRANS_IDCT_PERM:
/* partial transpose: swap only the low 2 bits of row and column */
169 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
171 case FF_SSE2_IDCT_PERM:
/* keep the row, interleave columns per idct_sse2_row_perm */
173 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
176 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/* Sum of all pixels of a 16x16 block (used e.g. for DC/mean computations).
 * Processes 8 pixels per inner iteration; the accumulation statements are
 * not visible in this excerpt. */
180 static int pix_sum_c(uint8_t * pix, int line_size)
185 for (i = 0; i < 16; i++) {
186 for (j = 0; j < 16; j += 8) {
/* advance to the next row (the j loop already moved pix forward 16) */
197 pix += line_size - 16;
/* Sum of squared pixel values of a 16x16 block.  Squares come from the
 * ff_squareTbl lookup (sq points at its center).  The two variants below
 * read 8 bytes at a time as one 64-bit word or two 32-bit words —
 * presumably selected by a fast-64-bit #if not visible in this excerpt. */
202 static int pix_norm1_c(uint8_t * pix, int line_size)
205 uint32_t *sq = ff_squareTbl + 256;
208 for (i = 0; i < 16; i++) {
209 for (j = 0; j < 16; j += 8) {
/* 64-bit path: extract each of the 8 bytes and square via the table */
221 register uint64_t x=*(uint64_t*)pix;
223 s += sq[(x>>8)&0xff];
224 s += sq[(x>>16)&0xff];
225 s += sq[(x>>24)&0xff];
226 s += sq[(x>>32)&0xff];
227 s += sq[(x>>40)&0xff];
228 s += sq[(x>>48)&0xff];
229 s += sq[(x>>56)&0xff];
/* 32-bit path: same, in two 4-byte loads */
231 register uint32_t x=*(uint32_t*)pix;
233 s += sq[(x>>8)&0xff];
234 s += sq[(x>>16)&0xff];
235 s += sq[(x>>24)&0xff];
236 x=*(uint32_t*)(pix+4);
238 s += sq[(x>>8)&0xff];
239 s += sq[(x>>16)&0xff];
240 s += sq[(x>>24)&0xff];
/* advance to the next row */
245 pix += line_size - 16;
/* Byte-swap a buffer of w 32-bit words from src to dst.  Main loop is
 * unrolled 8x; the tail loop handling w%8 words is only partially visible
 * in this excerpt. */
250 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
253 for(i=0; i+8<=w; i+=8){
254 dst[i+0]= av_bswap32(src[i+0]);
255 dst[i+1]= av_bswap32(src[i+1]);
256 dst[i+2]= av_bswap32(src[i+2]);
257 dst[i+3]= av_bswap32(src[i+3]);
258 dst[i+4]= av_bswap32(src[i+4]);
259 dst[i+5]= av_bswap32(src[i+5]);
260 dst[i+6]= av_bswap32(src[i+6]);
261 dst[i+7]= av_bswap32(src[i+7]);
/* tail: remaining words, one at a time */
264 dst[i+0]= av_bswap32(src[i+0]);
/* Byte-swap len 16-bit values from src to dst (the loop header driving
 * this statement is not visible in this excerpt). */
268 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
271 *dst++ = av_bswap16(*src++);
/* Sum of squared errors over a 4-pixel-wide column of height h.
 * pix1[k]-pix2[k] may be negative, which is why sq points to the center of
 * ff_squareTbl (offset +256).  The unused void* matches the me_cmp
 * function-pointer signature. */
274 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
277 uint32_t *sq = ff_squareTbl + 256;
280 for (i = 0; i < h; i++) {
281 s += sq[pix1[0] - pix2[0]];
282 s += sq[pix1[1] - pix2[1]];
283 s += sq[pix1[2] - pix2[2]];
284 s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors over an 8-pixel-wide column of height h; same
 * centered-square-table technique as sse4_c. */
291 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
294 uint32_t *sq = ff_squareTbl + 256;
297 for (i = 0; i < h; i++) {
298 s += sq[pix1[0] - pix2[0]];
299 s += sq[pix1[1] - pix2[1]];
300 s += sq[pix1[2] - pix2[2]];
301 s += sq[pix1[3] - pix2[3]];
302 s += sq[pix1[4] - pix2[4]];
303 s += sq[pix1[5] - pix2[5]];
304 s += sq[pix1[6] - pix2[6]];
305 s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors over a 16-pixel-wide column of height h; same
 * centered-square-table technique as sse4_c/sse8_c. */
312 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
315 uint32_t *sq = ff_squareTbl + 256;
318 for (i = 0; i < h; i++) {
319 s += sq[pix1[ 0] - pix2[ 0]];
320 s += sq[pix1[ 1] - pix2[ 1]];
321 s += sq[pix1[ 2] - pix2[ 2]];
322 s += sq[pix1[ 3] - pix2[ 3]];
323 s += sq[pix1[ 4] - pix2[ 4]];
324 s += sq[pix1[ 5] - pix2[ 5]];
325 s += sq[pix1[ 6] - pix2[ 6]];
326 s += sq[pix1[ 7] - pix2[ 7]];
327 s += sq[pix1[ 8] - pix2[ 8]];
328 s += sq[pix1[ 9] - pix2[ 9]];
329 s += sq[pix1[10] - pix2[10]];
330 s += sq[pix1[11] - pix2[11]];
331 s += sq[pix1[12] - pix2[12]];
332 s += sq[pix1[13] - pix2[13]];
333 s += sq[pix1[14] - pix2[14]];
334 s += sq[pix1[15] - pix2[15]];
/* Compute the per-pixel difference of two 8-wide pixel rows into a DCTELEM
 * block (input to the forward DCT for inter blocks).  The row loop and
 * pointer advances are not visible in this excerpt. */
342 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
343 const uint8_t *s2, int stride){
346 /* read the pixels */
348 block[0] = s1[0] - s2[0];
349 block[1] = s1[1] - s2[1];
350 block[2] = s1[2] - s2[2];
351 block[3] = s1[3] - s2[3];
352 block[4] = s1[4] - s2[4];
353 block[5] = s1[5] - s2[5];
354 block[6] = s1[6] - s2[6];
355 block[7] = s1[7] - s2[7];
/* Store an 8-wide row of DCT coefficients as pixels, clamping each value to
 * 0..255 (IDCT output → picture).  Row loop and pointer advances are not
 * visible in this excerpt. */
363 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
368 /* read the pixels */
370 pixels[0] = av_clip_uint8(block[0]);
371 pixels[1] = av_clip_uint8(block[1]);
372 pixels[2] = av_clip_uint8(block[2]);
373 pixels[3] = av_clip_uint8(block[3]);
374 pixels[4] = av_clip_uint8(block[4]);
375 pixels[5] = av_clip_uint8(block[5]);
376 pixels[6] = av_clip_uint8(block[6]);
377 pixels[7] = av_clip_uint8(block[7]);
/* 4-wide variant of ff_put_pixels_clamped_c. */
384 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
389 /* read the pixels */
391 pixels[0] = av_clip_uint8(block[0]);
392 pixels[1] = av_clip_uint8(block[1]);
393 pixels[2] = av_clip_uint8(block[2]);
394 pixels[3] = av_clip_uint8(block[3]);
/* 2-wide variant of ff_put_pixels_clamped_c. */
401 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
406 /* read the pixels */
408 pixels[0] = av_clip_uint8(block[0]);
409 pixels[1] = av_clip_uint8(block[1]);
/* Store an 8x8 block of signed DCT values as pixels: values are clamped to
 * -128..127 (clamp statements partially elided in this excerpt) and then
 * re-centered by +128 into the unsigned 0..255 range. */
416 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
417 uint8_t *restrict pixels,
422 for (i = 0; i < 8; i++) {
423 for (j = 0; j < 8; j++) {
426 else if (*block > 127)
/* shift from signed [-128,127] to unsigned [0,255] */
429 *pixels = (uint8_t)(*block + 128);
433 pixels += (line_size - 8);
/* Add an 8-wide row of DCT residuals to existing pixels, clamping to 0..255
 * (IDCT residual + prediction).  Row loop and pointer advances are not
 * visible in this excerpt. */
437 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
442 /* read the pixels */
444 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
445 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
446 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
447 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
448 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
449 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
450 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
451 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
/* 4-wide variant of ff_add_pixels_clamped_c. */
457 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
462 /* read the pixels */
464 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
465 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
466 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
467 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
/* 2-wide variant of ff_add_pixels_clamped_c. */
473 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
478 /* read the pixels */
480 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
481 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
/* Sum of absolute values of the coefficients of a DCT block (the loop
 * header and return are not visible in this excerpt). */
487 static int sum_abs_dctelem_c(DCTELEM *block)
491 sum+= FFABS(block[i]);
/* Fill a 16-wide block of height h with a constant byte value. */
495 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
499 for (i = 0; i < h; i++) {
500 memset(block, value, 16);
/* Fill an 8-wide block of height h with a constant byte value. */
505 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
509 for (i = 0; i < h; i++) {
510 memset(block, value, 8);
/* Rounding average of two values: (a + b + 1) >> 1.
 * Arguments and expansion are fully parenthesized so compound arguments
 * (comparisons, bitwise ops, ternaries) group correctly; each argument is
 * still evaluated exactly once. */
#define avg2(a, b) (((a) + (b) + 1) >> 1)
/* Rounding average of four values: (a + b + c + d + 2) >> 2, parenthesized
 * for the same reason as avg2(). */
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/* One-point GMC: bilinear interpolation of an 8-wide block at 1/16-pel
 * fractional position (x16, y16).  The four weights A..D sum to 256, hence
 * the >>8 after adding the caller-supplied rounder.  The row loop and
 * pointer advances are not visible in this excerpt. */
518 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
520 const int A=(16-x16)*(16-y16);
521 const int B=( x16)*(16-y16);
522 const int C=(16-x16)*( y16);
523 const int D=( x16)*( y16);
528 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
529 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
530 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
531 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
532 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
533 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
534 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
535 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General global motion compensation: for each destination pixel, compute a
 * source position from the affine parameters (ox,oy,dxx,dxy,dyx,dyy) and
 * bilinearly interpolate with 1/(1<<shift)-pel precision.  The four
 * branches handle whether src_x / src_y fall inside the picture; outside
 * coordinates are clamped with av_clip so edge pixels are replicated.
 * NOTE(review): the y loop and the frac_x/frac_y computations are not
 * visible in this excerpt. */
541 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
542 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
545 const int s= 1<<shift;
555 for(x=0; x<8; x++){ //XXX FIXME optimize
556 int src_x, src_y, frac_x, frac_y, index;
/* unsigned compare == "0 <= src_x < width" in one test */
565 if((unsigned)src_x < width){
566 if((unsigned)src_y < height){
/* fully inside: 2x2 bilinear */
567 index= src_x + src_y*stride;
568 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
569 + src[index +1]* frac_x )*(s-frac_y)
570 + ( src[index+stride ]*(s-frac_x)
571 + src[index+stride+1]* frac_x )* frac_y
/* vertically outside: clamp y, interpolate horizontally only */
574 index= src_x + av_clip(src_y, 0, height)*stride;
575 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
576 + src[index +1]* frac_x )*s
580 if((unsigned)src_y < height){
/* horizontally outside: clamp x, interpolate vertically only */
581 index= av_clip(src_x, 0, width) + src_y*stride;
582 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
583 + src[index+stride ]* frac_y )*s
/* outside both: clamp both, plain copy */
586 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
587 dst[y*stride + x]= src[index ];
/* Third-pel MC, integer position: plain copy, dispatched by block width. */
599 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
601 case 2: put_pixels2_8_c (dst, src, stride, height); break;
602 case 4: put_pixels4_8_c (dst, src, stride, height); break;
603 case 8: put_pixels8_8_c (dst, src, stride, height); break;
604 case 16:put_pixels16_8_c(dst, src, stride, height); break;
/* Third-pel MC, horizontal 1/3 phase: dst ~ (2*a + b)/3 via the 683/2048
 * fixed-point approximation of 1/3. */
608 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
610 for (i=0; i < height; i++) {
611 for (j=0; j < width; j++) {
612 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Third-pel MC, horizontal 2/3 phase: dst ~ (a + 2*b)/3. */
619 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
621 for (i=0; i < height; i++) {
622 for (j=0; j < width; j++) {
623 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Third-pel MC, vertical 1/3 phase: dst ~ (2*top + bottom)/3. */
630 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
632 for (i=0; i < height; i++) {
633 for (j=0; j < width; j++) {
634 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Third-pel MC, (1/3, 1/3) phase: 4:3:3:2-weighted 2x2 average, with
 * 2731/32768 ~ 1/12 as the fixed-point normalizer. */
641 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
643 for (i=0; i < height; i++) {
644 for (j=0; j < width; j++) {
645 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, (1/3, 2/3) phase: 3:2:4:3-weighted 2x2 average (~1/12). */
652 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
654 for (i=0; i < height; i++) {
655 for (j=0; j < width; j++) {
656 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, vertical 2/3 phase: dst ~ (top + 2*bottom)/3. */
663 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
665 for (i=0; i < height; i++) {
666 for (j=0; j < width; j++) {
667 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Third-pel MC, (2/3, 1/3) phase: 3:4:2:3-weighted 2x2 average (~1/12). */
674 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
676 for (i=0; i < height; i++) {
677 for (j=0; j < width; j++) {
678 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, (2/3, 2/3) phase: 2:3:3:4-weighted 2x2 average (~1/12). */
685 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
687 for (i=0; i < height; i++) {
688 for (j=0; j < width; j++) {
689 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Averaging third-pel MC, integer position: dst = avg(dst, src),
 * dispatched by block width. */
696 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
698 case 2: avg_pixels2_8_c (dst, src, stride, height); break;
699 case 4: avg_pixels4_8_c (dst, src, stride, height); break;
700 case 8: avg_pixels8_8_c (dst, src, stride, height); break;
701 case 16:avg_pixels16_8_c(dst, src, stride, height); break;
/* Averaging variant of put_tpel_pixels_mc10_c: round-up average of the
 * existing dst pixel and the (2*a + b)/3 interpolation. */
705 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
707 for (i=0; i < height; i++) {
708 for (j=0; j < width; j++) {
709 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc20_c. */
716 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
718 for (i=0; i < height; i++) {
719 for (j=0; j < width; j++) {
720 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc01_c. */
727 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
729 for (i=0; i < height; i++) {
730 for (j=0; j < width; j++) {
731 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc11_c. */
738 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
740 for (i=0; i < height; i++) {
741 for (j=0; j < width; j++) {
742 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc12_c. */
749 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
751 for (i=0; i < height; i++) {
752 for (j=0; j < width; j++) {
753 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc02_c. */
760 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
762 for (i=0; i < height; i++) {
763 for (j=0; j < width; j++) {
764 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc21_c. */
771 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
773 for (i=0; i < height; i++) {
774 for (j=0; j < width; j++) {
775 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc22_c. */
782 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
784 for (i=0; i < height; i++) {
785 for (j=0; j < width; j++) {
786 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
793 #define QPEL_MC(r, OPNAME, RND, OP) \
794 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
795 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
799 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
800 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
801 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
802 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
803 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
804 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
805 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
806 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
812 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
814 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
818 const int src0= src[0*srcStride];\
819 const int src1= src[1*srcStride];\
820 const int src2= src[2*srcStride];\
821 const int src3= src[3*srcStride];\
822 const int src4= src[4*srcStride];\
823 const int src5= src[5*srcStride];\
824 const int src6= src[6*srcStride];\
825 const int src7= src[7*srcStride];\
826 const int src8= src[8*srcStride];\
827 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
828 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
829 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
830 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
831 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
832 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
833 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
834 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
840 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
841 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
846 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
847 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
848 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
849 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
850 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
851 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
852 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
853 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
854 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
855 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
856 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
857 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
858 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
859 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
860 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
861 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
867 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
868 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
873 const int src0= src[0*srcStride];\
874 const int src1= src[1*srcStride];\
875 const int src2= src[2*srcStride];\
876 const int src3= src[3*srcStride];\
877 const int src4= src[4*srcStride];\
878 const int src5= src[5*srcStride];\
879 const int src6= src[6*srcStride];\
880 const int src7= src[7*srcStride];\
881 const int src8= src[8*srcStride];\
882 const int src9= src[9*srcStride];\
883 const int src10= src[10*srcStride];\
884 const int src11= src[11*srcStride];\
885 const int src12= src[12*srcStride];\
886 const int src13= src[13*srcStride];\
887 const int src14= src[14*srcStride];\
888 const int src15= src[15*srcStride];\
889 const int src16= src[16*srcStride];\
890 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
891 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
892 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
893 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
894 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
895 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
896 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
897 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
898 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
899 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
900 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
901 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
902 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
903 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
904 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
905 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
911 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
913 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
914 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
917 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
918 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
921 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
923 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
924 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
927 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
930 copy_block9(full, src, 16, stride, 9);\
931 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
932 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
935 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
937 copy_block9(full, src, 16, stride, 9);\
938 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
941 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
944 copy_block9(full, src, 16, stride, 9);\
945 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
946 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
948 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
953 copy_block9(full, src, 16, stride, 9);\
954 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
955 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
956 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
957 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
959 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
963 copy_block9(full, src, 16, stride, 9);\
964 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
965 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
966 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
967 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
969 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
974 copy_block9(full, src, 16, stride, 9);\
975 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
976 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
977 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
978 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
980 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
984 copy_block9(full, src, 16, stride, 9);\
985 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
986 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
987 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
988 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
990 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
995 copy_block9(full, src, 16, stride, 9);\
996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
997 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
998 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
999 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1001 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1002 uint8_t full[16*9];\
1004 uint8_t halfHV[64];\
1005 copy_block9(full, src, 16, stride, 9);\
1006 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1007 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1008 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1009 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1011 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1012 uint8_t full[16*9];\
1015 uint8_t halfHV[64];\
1016 copy_block9(full, src, 16, stride, 9);\
1017 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1018 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1019 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1020 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1022 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1023 uint8_t full[16*9];\
1025 uint8_t halfHV[64];\
1026 copy_block9(full, src, 16, stride, 9);\
1027 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1028 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1029 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1030 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1032 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1034 uint8_t halfHV[64];\
1035 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1036 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1037 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1039 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1041 uint8_t halfHV[64];\
1042 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1043 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1044 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1046 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1047 uint8_t full[16*9];\
1050 uint8_t halfHV[64];\
1051 copy_block9(full, src, 16, stride, 9);\
1052 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1053 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1054 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1055 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1057 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1058 uint8_t full[16*9];\
1060 copy_block9(full, src, 16, stride, 9);\
1061 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1062 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1063 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1065 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1066 uint8_t full[16*9];\
1069 uint8_t halfHV[64];\
1070 copy_block9(full, src, 16, stride, 9);\
1071 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1072 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1073 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1074 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1076 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1077 uint8_t full[16*9];\
1079 copy_block9(full, src, 16, stride, 9);\
1080 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1081 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1082 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1084 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1086 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1087 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1090 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1092 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1093 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1096 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1097 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1100 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1102 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1103 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1106 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1107 uint8_t full[24*17];\
1109 copy_block17(full, src, 24, stride, 17);\
1110 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1111 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1114 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1115 uint8_t full[24*17];\
1116 copy_block17(full, src, 24, stride, 17);\
1117 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1120 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1121 uint8_t full[24*17];\
1123 copy_block17(full, src, 24, stride, 17);\
1124 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1125 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1127 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1128 uint8_t full[24*17];\
1129 uint8_t halfH[272];\
1130 uint8_t halfV[256];\
1131 uint8_t halfHV[256];\
1132 copy_block17(full, src, 24, stride, 17);\
1133 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1134 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1135 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1136 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1138 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1139 uint8_t full[24*17];\
1140 uint8_t halfH[272];\
1141 uint8_t halfHV[256];\
1142 copy_block17(full, src, 24, stride, 17);\
1143 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1144 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1145 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1146 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1148 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1149 uint8_t full[24*17];\
1150 uint8_t halfH[272];\
1151 uint8_t halfV[256];\
1152 uint8_t halfHV[256];\
1153 copy_block17(full, src, 24, stride, 17);\
1154 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1155 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1156 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1157 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1159 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1160 uint8_t full[24*17];\
1161 uint8_t halfH[272];\
1162 uint8_t halfHV[256];\
1163 copy_block17(full, src, 24, stride, 17);\
1164 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1165 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1166 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1167 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1169 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1170 uint8_t full[24*17];\
1171 uint8_t halfH[272];\
1172 uint8_t halfV[256];\
1173 uint8_t halfHV[256];\
1174 copy_block17(full, src, 24, stride, 17);\
1175 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1176 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1177 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1178 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1180 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1181 uint8_t full[24*17];\
1182 uint8_t halfH[272];\
1183 uint8_t halfHV[256];\
1184 copy_block17(full, src, 24, stride, 17);\
1185 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1186 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1187 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1188 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1190 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1191 uint8_t full[24*17];\
1192 uint8_t halfH[272];\
1193 uint8_t halfV[256];\
1194 uint8_t halfHV[256];\
1195 copy_block17(full, src, 24, stride, 17);\
1196 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1197 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1198 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1199 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1201 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1202 uint8_t full[24*17];\
1203 uint8_t halfH[272];\
1204 uint8_t halfHV[256];\
1205 copy_block17(full, src, 24, stride, 17);\
1206 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1207 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1208 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1209 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1211 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1212 uint8_t halfH[272];\
1213 uint8_t halfHV[256];\
1214 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1215 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1216 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1218 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1219 uint8_t halfH[272];\
1220 uint8_t halfHV[256];\
1221 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1222 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1223 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1225 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1226 uint8_t full[24*17];\
1227 uint8_t halfH[272];\
1228 uint8_t halfV[256];\
1229 uint8_t halfHV[256];\
1230 copy_block17(full, src, 24, stride, 17);\
1231 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1232 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1233 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1234 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1236 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1237 uint8_t full[24*17];\
1238 uint8_t halfH[272];\
1239 copy_block17(full, src, 24, stride, 17);\
1240 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1241 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1242 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1244 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1245 uint8_t full[24*17];\
1246 uint8_t halfH[272];\
1247 uint8_t halfV[256];\
1248 uint8_t halfHV[256];\
1249 copy_block17(full, src, 24, stride, 17);\
1250 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1251 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1252 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1253 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1255 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1256 uint8_t full[24*17];\
1257 uint8_t halfH[272];\
1258 copy_block17(full, src, 24, stride, 17);\
1259 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1260 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1261 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1263 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1264 uint8_t halfH[272];\
1265 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1266 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1269 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1270 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1271 #define op_put(a, b) a = cm[((b) + 16)>>5]
1272 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1274 QPEL_MC(0, put_ , _ , op_put)
1275 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1276 QPEL_MC(0, avg_ , _ , op_avg)
1277 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1279 #undef op_avg_no_rnd
1281 #undef op_put_no_rnd
/* The (0,0) sub-pel cases are plain pixel copies/averages; alias them to the
 * shared pixel helpers. Note the no-rnd put aliases map to the same rounded
 * copy helpers as the plain puts (a full-pel copy needs no rounding). */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
/* was ff_put_pixels16x16_8_c — no such symbol; must match put_qpel16_mc00_c */
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1290 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1291 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1295 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1296 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1297 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1298 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1299 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1300 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1301 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1302 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
#if CONFIG_RV40_DECODER
/* RV40 (3,3) luma sub-pel positions are defined as the plain xy2 half-pel
 * average, so forward to the shared 8-bit pixel helpers. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
#if CONFIG_DIRAC_DECODER
/* Dirac MC wrappers: forward to the 8-bit pixel helpers; src[] carries up to
 * four reference pointers, of which each wrapper uses the first 1/2/4. The
 * 32-wide variants are done as two adjacent 16-wide calls. */
#define DIRAC_MC(OPNAME)\
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
    OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
    OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
}
DIRAC_MC(put)
DIRAC_MC(avg)
#endif /* CONFIG_DIRAC_DECODER */
1368 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1369 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1373 const int src_1= src[ -srcStride];
1374 const int src0 = src[0 ];
1375 const int src1 = src[ srcStride];
1376 const int src2 = src[2*srcStride];
1377 const int src3 = src[3*srcStride];
1378 const int src4 = src[4*srcStride];
1379 const int src5 = src[5*srcStride];
1380 const int src6 = src[6*srcStride];
1381 const int src7 = src[7*srcStride];
1382 const int src8 = src[8*srcStride];
1383 const int src9 = src[9*srcStride];
1384 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1385 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1386 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1387 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1388 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1389 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1390 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1391 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 (1,0) quarter-pel: average of the source and the h-lowpassed block. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}
/* WMV2 (2,0) half-pel: pure horizontal lowpass, written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/* WMV2 (3,0) quarter-pel: average of src shifted by one pixel and the
 * h-lowpassed block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
}
/* WMV2 (0,2) half-pel: pure vertical lowpass, written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/* WMV2 (1,2): average of the v-lowpassed source and the hv-lowpassed block.
 * halfH holds 11 filtered rows (one above, 8 in-block, two below) so the
 * vertical pass at halfH+8 has its -1..+8 row context. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 (3,2): like mc12 but the vertical-only pass runs on src+1. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 (2,2): horizontal lowpass (with one extra row above and two below),
 * then vertical lowpass of the filtered block directly into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
1441 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1442 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1444 const int strength= ff_h263_loop_filter_strength[qscale];
1448 int p0= src[x-2*stride];
1449 int p1= src[x-1*stride];
1450 int p2= src[x+0*stride];
1451 int p3= src[x+1*stride];
1452 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1454 if (d<-2*strength) d1= 0;
1455 else if(d<- strength) d1=-2*strength - d;
1456 else if(d< strength) d1= d;
1457 else if(d< 2*strength) d1= 2*strength - d;
1462 if(p1&256) p1= ~(p1>>31);
1463 if(p2&256) p2= ~(p2>>31);
1465 src[x-1*stride] = p1;
1466 src[x+0*stride] = p2;
1470 d2= av_clip((p0-p3)/4, -ad1, ad1);
1472 src[x-2*stride] = p0 - d2;
1473 src[x+ stride] = p3 + d2;
1478 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1479 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1481 const int strength= ff_h263_loop_filter_strength[qscale];
1485 int p0= src[y*stride-2];
1486 int p1= src[y*stride-1];
1487 int p2= src[y*stride+0];
1488 int p3= src[y*stride+1];
1489 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1491 if (d<-2*strength) d1= 0;
1492 else if(d<- strength) d1=-2*strength - d;
1493 else if(d< strength) d1= d;
1494 else if(d< 2*strength) d1= 2*strength - d;
1499 if(p1&256) p1= ~(p1>>31);
1500 if(p2&256) p2= ~(p2>>31);
1502 src[y*stride-1] = p1;
1503 src[y*stride+0] = p2;
1507 d2= av_clip((p0-p3)/4, -ad1, ad1);
1509 src[y*stride-2] = p0 - d2;
1510 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block.
 * Border rows are copied through (scaled by 4 into temp), interior rows get
 * the vertical (1,2,1); the second pass applies the horizontal (1,2,1) with
 * border columns copied through. */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* vertical pass into temp[], values scaled by 4 */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal pass back into src[], with rounding */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/* Sum of absolute differences over a 16-wide block, h rows. First parameter
 * is the unused me_cmp context pointer. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* fallback if the file-level rounding-average macro was not seen */
#ifndef avg2
#define avg2(a,b) ((a+b+1)>>1)
#endif

/* SAD against the horizontal half-pel interpolation of pix2 (reads one pixel
 * past the 16-wide row). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* fallback if the file-level rounding-average macro was not seen */
#ifndef avg2
#define avg2(a,b) ((a+b+1)>>1)
#endif

/* SAD against the vertical half-pel interpolation of pix2 (averages each row
 * with the row below it). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* fallback if the file-level rounding-average macro was not seen */
#ifndef avg4
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
#endif

/* SAD against the diagonal (x+y) half-pel interpolation of pix2 (4-point
 * rounding average; reads one pixel past each row and one extra row). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* Sum of absolute differences over an 8-wide block, h rows. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* fallback if the file-level rounding-average macro was not seen */
#ifndef avg2
#define avg2(a,b) ((a+b+1)>>1)
#endif

/* 8-wide SAD against horizontal half-pel interpolation (reads pix2[8]). */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* fallback if the file-level rounding-average macro was not seen */
#ifndef avg2
#define avg2(a,b) ((a+b+1)>>1)
#endif

/* 8-wide SAD against vertical half-pel interpolation (reads one extra row). */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* fallback if the file-level rounding-average macro was not seen */
#ifndef avg4
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
#endif

/* 8-wide SAD against the diagonal half-pel interpolation (reads one extra
 * column and one extra row of pix2). */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
1742 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1743 MpegEncContext *c = v;
1749 for(x=0; x<16; x++){
1750 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1753 for(x=0; x<15; x++){
1754 score2+= FFABS( s1[x ] - s1[x +stride]
1755 - s1[x+1] + s1[x+1+stride])
1756 -FFABS( s2[x ] - s2[x +stride]
1757 - s2[x+1] + s2[x+1+stride]);
1764 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1765 else return score1 + FFABS(score2)*8;
1768 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1769 MpegEncContext *c = v;
1776 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1780 score2+= FFABS( s1[x ] - s1[x +stride]
1781 - s1[x+1] + s1[x+1+stride])
1782 -FFABS( s2[x ] - s2[x +stride]
1783 - s2[x+1] + s2[x+1+stride]);
1790 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1791 else return score1 + FFABS(score2)*8;
/* fallback values from mpegvideo.h so the unit stays self-contained —
 * skipped when the real header was included first */
#ifndef BASIS_SHIFT
#define BASIS_SHIFT 16
#define RECON_SHIFT 6
#endif

/* Evaluate the weighted squared error that would remain if the scaled basis
 * function were added to the residual; used by the trellis/basis search. */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
/* fallback values from mpegvideo.h so the unit stays self-contained —
 * skipped when the real header was included first */
#ifndef BASIS_SHIFT
#define BASIS_SHIFT 16
#define RECON_SHIFT 6
#endif

/* Add the scaled basis function to the residual, with rounding. */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
1818 * Permute an 8x8 block.
1819 * @param block the block which will be permuted according to the given permutation vector
1820 * @param permutation the permutation vector
1821 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1822 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1823 * (inverse) permutated to scantable order!
1825 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1831 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1833 for(i=0; i<=last; i++){
1834 const int j= scantable[i];
1839 for(i=0; i<=last; i++){
1840 const int j= scantable[i];
1841 const int perm_j= permutation[j];
1842 block[perm_j]= temp[j];
1846 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
1850 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1853 memset(cmp, 0, sizeof(void*)*6);
1861 cmp[i]= c->hadamard8_diff[i];
1867 cmp[i]= c->dct_sad[i];
1870 cmp[i]= c->dct264_sad[i];
1873 cmp[i]= c->dct_max[i];
1876 cmp[i]= c->quant_psnr[i];
1905 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* fallback if the file-level SWAR constants were not seen */
#ifndef pb_7f
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
#endif

/* Per-byte wrapping addition dst[i] += src[i], done one machine word at a
 * time using the classic SWAR carry-masking trick, with a byte-wise tail. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    /* cast sizeof to long: with int w < sizeof(long) the unsigned promotion
     * of w - sizeof(long) would wrap and overrun the buffers */
    for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
/* fallback if the file-level SWAR constants were not seen */
#ifndef pb_7f
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
#endif

/* Per-byte wrapping subtraction dst[i] = src1[i] - src2[i]; word-at-a-time
 * SWAR on the aligned fast path, byte loops otherwise and for the tail. */
static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    {
        /* cast sizeof to long: avoids unsigned wrap-around for small w */
        for(i=0; i<=w-(long)sizeof(long); i+=sizeof(long)){
            long a = *(long*)(src1+i);
            long b = *(long*)(src2+i);
            *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
        }
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
/* HuffYUV median-prediction decode: reconstruct each pixel as
 * median(left, above, left+above-aboveleft) + residual. left/left_top carry
 * the running left and above-left values across calls. */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}
/* HuffYUV median-prediction encode: the inverse of add_hfyu_median_prediction;
 * emits residual dst[i] = src2[i] - median predictor. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        dst[i]= src2[i] - pred;
        l= src2[i];
        lt= src1[i];
    }

    *left= l;
    *left_top= lt;
}
/* HuffYUV left-prediction decode: running sum of residuals, two pixels per
 * iteration (manual 2x unroll), truncated to 8 bits on store. Returns the
 * final accumulator so the caller can continue on the next slice. */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}
2011 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* 2-point butterfly helpers for the Hadamard transforms below:
 * BUTTERFLY2 computes sum/difference into two outputs, BUTTERFLY1 does it
 * in place, BUTTERFLYA folds the final stage directly into |x+y|+|x-y|. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
    {\
        int a,b;\
        a= x;\
        b= y;\
        x= a+b;\
        y= a-b;\
    }

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* fallbacks so this unit stands alone if the file-level macros were not seen */
#ifndef FFABS
#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
#endif
#ifndef BUTTERFLY2
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);
#define BUTTERFLY1(x,y) \
    {\
        int a,b;\
        a= x;\
        b= y;\
        x= a+b;\
        y= a-b;\
    }
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
#endif

/* SATD: sum of absolute values of the 8x8 Hadamard transform of the
 * difference src - dst. Row transform first, then columns with the last
 * butterfly stage folded into the absolute-value accumulation. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
/* fallbacks so this unit stands alone if the file-level macros were not seen */
#ifndef FFABS
#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
#endif
#ifndef BUTTERFLY2
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);
#define BUTTERFLY1(x,y) \
    {\
        int a,b;\
        a= x;\
        b= y;\
        x= a+b;\
        y= a-b;\
    }
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
#endif

/* Intra SATD: Hadamard transform of the source block itself; the DC term is
 * subtracted at the end so a flat block scores (near) zero. dummy is unused. */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2149 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2150 MpegEncContext * const s= (MpegEncContext *)c;
2151 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2155 s->dsp.diff_pixels(temp, src1, src2, stride);
2157 return s->dsp.sum_abs_dctelem(temp);
/* One 8-point H.264-style integer DCT pass; SRC/DST are macros bound by the
 * caller (dct264_sad8x8_c) to select row or column access and whether the
 * result is stored or folded into an absolute sum. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
2188 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2189 MpegEncContext * const s= (MpegEncContext *)c;
2194 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2196 #define SRC(x) dct[i][x]
2197 #define DST(x,v) dct[i][x]= v
2198 for( i = 0; i < 8; i++ )
2203 #define SRC(x) dct[x][i]
2204 #define DST(x,v) sum += FFABS(v)
2205 for( i = 0; i < 8; i++ )
2213 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2214 MpegEncContext * const s= (MpegEncContext *)c;
2215 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2220 s->dsp.diff_pixels(temp, src1, src2, stride);
2224 sum= FFMAX(sum, FFABS(temp[i]));
2229 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2230 MpegEncContext * const s= (MpegEncContext *)c;
2231 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2232 DCTELEM * const bak = temp+64;
2238 s->dsp.diff_pixels(temp, src1, src2, stride);
2240 memcpy(bak, temp, 64*sizeof(DCTELEM));
2242 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2243 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2244 ff_simple_idct_8(temp); //FIXME
2247 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/**
 * 8x8 rate-distortion comparison metric.
 * Encodes the src1-src2 difference (DCT + quantize), estimates the bit
 * cost from the AC VLC length tables, reconstructs the block, measures
 * SSE distortion, and returns distortion + lambda-scaled rate.
 * NOTE(review): interior lines (bits init, run/level extraction, loop
 * close) are elided in this extract; comments cover visible code only.
 */
2252 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2253 MpegEncContext * const s= (MpegEncContext *)c;
2254 const uint8_t *scantable= s->intra_scantable.permutated;
2255 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
/* Local copies of both blocks so idct_add can reconstruct in place. */
2256 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2257 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2258 int i, last, run, bits, level, distortion, start_i;
2259 const int esc_length= s->ac_esc_length;
2261 uint8_t * last_length;
2265 copy_block8(lsrc1, src1, 8, stride, 8);
2266 copy_block8(lsrc2, src2, 8, stride, 8);
2268 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2270 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* Intra path: AC tables plus luma DC cost (temp[0]+256 biases into table). */
2276 length = s->intra_ac_vlc_length;
2277 last_length= s->intra_ac_vlc_last_length;
2278 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* Inter path: inter AC VLC tables. */
2281 length = s->inter_ac_vlc_length;
2282 last_length= s->inter_ac_vlc_last_length;
/* Walk coefficients in scan order, accumulating VLC bit costs. */
2287 for(i=start_i; i<last; i++){
2288 int j= scantable[i];
/* level fits in the table range (|level| < 128 after +64 bias)? */
2293 if((level&(~127)) == 0){
2294 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* Last (terminating) coefficient uses the "last" VLC table. */
2303 level= temp[i] + 64;
2307 if((level&(~127)) == 0){
2308 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Reconstruct: dequantize (intra or inter branch) and add back to lsrc2. */
2316 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2318 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2321 s->dsp.idct_add(lsrc2, 8, temp);
2323 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
/* RD cost: SSE + rate scaled by qscale^2 (109/128 ~ lambda factor). */
2325 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/**
 * 8x8 bit-cost comparison metric: like rd8x8_c but returns only the
 * estimated VLC bit count of coding the src1-src2 difference, with no
 * reconstruction/distortion term.
 * NOTE(review): interior lines (bits init, run/level handling, return)
 * are elided in this extract; comments cover visible code only.
 */
2328 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2329 MpegEncContext * const s= (MpegEncContext *)c;
2330 const uint8_t *scantable= s->intra_scantable.permutated;
2331 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2332 int i, last, run, bits, level, start_i;
2333 const int esc_length= s->ac_esc_length;
2335 uint8_t * last_length;
2339 s->dsp.diff_pixels(temp, src1, src2, stride);
2341 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* Intra path: AC tables plus luma DC cost. */
2347 length = s->intra_ac_vlc_length;
2348 last_length= s->intra_ac_vlc_last_length;
2349 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* Inter path: inter AC VLC tables. */
2352 length = s->inter_ac_vlc_length;
2353 last_length= s->inter_ac_vlc_last_length;
/* Accumulate VLC lengths over the scan-ordered coefficients. */
2358 for(i=start_i; i<last; i++){
2359 int j= scantable[i];
2364 if((level&(~127)) == 0){
2365 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* Terminating coefficient costed with the "last" table. */
2374 level= temp[i] + 64;
2378 if((level&(~127)) == 0){
2379 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/*
 * Generates vsad_intra{size}_c: vertical SAD within one block — sum of
 * absolute differences between vertically adjacent pixels, a cheap
 * measure of vertical activity (no reference block needed).
 * NOTE(review): score init, loop closes and return are elided here.
 */
2387 #define VSAD_INTRA(size) \
2388 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2392     for(y=1; y<h; y++){ \
2393         for(x=0; x<size; x+=4){ \
/* 4 pixels per iteration: |s[x] - s[x+stride]| for each column. */ \
2394             score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2395                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/*
 * Inter vertical SAD over a 16-wide block: sums the absolute vertical
 * second difference between the source and reference rows.
 * NOTE(review): loop headers/return are elided in this extract.
 */
2405 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2410     for(x=0; x<16; x++){
2411         score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
/* SQ(a): squared value helper used by the vertical-SSE metrics below. */
2420 #define SQ(a) ((a)*(a))
/*
 * Generates vsse_intra{size}_c: like VSAD_INTRA but accumulates squared
 * vertical differences instead of absolute ones.
 * NOTE(review): score init, loop closes and return are elided here.
 */
2421 #define VSSE_INTRA(size) \
2422 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2426     for(y=1; y<h; y++){ \
2427         for(x=0; x<size; x+=4){ \
2428             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2429                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/*
 * Inter vertical SSE over a 16-wide block: squared vertical second
 * difference between source and reference.
 * NOTE(review): loop headers/return are elided in this extract.
 */
2439 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2444     for(x=0; x<16; x++){
2445         score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
/*
 * Sum of squared differences between an int8 and an int16 array of the
 * same length (used e.g. for basis comparison in the encoder).
 */
2454 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2458     for(i=0; i<size; i++)
2459         score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/*
 * Build 16x16 comparison functions from the 8x8 kernels above: each
 * WRAPPER8_16_SQ instantiation sums the 8x8 metric over the four 8x8
 * quadrants of a 16x16 block.
 */
2463 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2464 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2465 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2467 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2469 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2470 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2471 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2472 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/*
 * dst[i] = src0[i] * src1 read in reverse order.
 * NOTE(review): the line advancing src1 to its last element is elided in
 * this extract, which is why src1[-i] indexes backwards safely.
 */
2474 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
2477     for(i=0; i<len; i++)
2478         dst[i] = src0[i] * src1[-i];
/* Fused multiply-add over float vectors: dst[i] = src0[i]*src1[i] + src2[i]. */
2481 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
2483     for(i=0; i<len; i++)
2484         dst[i] = src0[i] * src1[i] + src2[i];
/*
 * Overlap-add windowing (MDCT-style): combines mirrored halves of the
 * input with window coefficients wi/wj, writing symmetric output pairs
 * dst[i]/dst[j]. Pointer-recentering and the s0/s1/wi/wj loads are
 * elided in this extract.
 */
2487 static void vector_fmul_window_c(float *dst, const float *src0,
2488                                  const float *src1, const float *win, int len)
2494     for(i=-len, j=len-1; i<0; i++, j--) {
/* Rotation by the window pair: standard sin/cos overlap-add butterfly. */
2499         dst[i] = s0*wj - s1*wi;
2500         dst[j] = s0*wi + s1*wj;
/* Scale a float vector by a scalar: dst[i] = src[i] * mul. */
2504 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
2508     for (i = 0; i < len; i++)
2509         dst[i] = src[i] * mul;
/*
 * In-place butterfly on two float vectors: per element computes the
 * sum/difference pair (difference visible below; the sum store and the
 * v2 write are elided in this extract).
 */
2512 static void butterflies_float_c(float *av_restrict v1, float *av_restrict v2,
2516     for (i = 0; i < len; i++) {
2517         float t = v1[i] - v2[i];
/*
 * Butterfly with interleaved output: dst[2i] = src0[i]+src1[i],
 * dst[2i+1] = src0[i]-src1[i] (f1/f2 loads elided in this extract).
 */
2523 static void butterflies_float_interleave_c(float *dst, const float *src0,
2524                                            const float *src1, int len)
2527     for (i = 0; i < len; i++) {
2530         dst[2*i    ] = f1 + f2;
2531         dst[2*i + 1] = f1 - f2;
/*
 * Dot product of two float vectors; the accumulator and return are
 * elided in this extract.
 */
2535 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2540     for (i = 0; i < len; i++)
/*
 * Clip one float (handled as its IEEE-754 bit pattern) when min < 0 < max:
 * positive values above mini clamp to mini; negative values whose
 * magnitude (sign bit flipped) exceeds maxisign clamp to maxi.
 * The final pass-through return is elided in this extract.
 */
2546 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2547                                    uint32_t maxi, uint32_t maxisign)
2550     if(a > mini) return mini;
2551     else if((a^(1U<<31)) > maxisign) return maxi;
/*
 * Clip a float vector to [min, max] where min < 0 < max, operating on
 * the raw IEEE-754 bit patterns (branchy float compares avoided).
 * Processes 8 elements per iteration; callers guarantee len % 8 == 0.
 * NOTE(review): the bit-pattern reinterpretation via pointer casts relies
 * on the project's aliasing settings — do not "fix" to memcpy here.
 */
2555 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2557     uint32_t mini = *(uint32_t*)min;
2558     uint32_t maxi = *(uint32_t*)max;
/* maxisign: max with the sign bit flipped, for the negative-side compare. */
2559     uint32_t maxisign = maxi ^ (1U<<31);
2560     uint32_t *dsti = (uint32_t*)dst;
2561     const uint32_t *srci = (const uint32_t*)src;
2562     for(i=0; i<len; i+=8) {
2563         dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2564         dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2565         dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2566         dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2567         dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2568         dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2569         dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2570         dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/*
 * Clip a float vector to [min, max]. When the range straddles zero the
 * faster bit-pattern path is used; otherwise plain av_clipf, unrolled
 * by 8 (callers guarantee len % 8 == 0). The else between the branches
 * is elided in this extract.
 */
2573 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2575     if(min < 0 && max > 0) {
2576         vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2578         for(i=0; i < len; i+=8) {
2579             dst[i    ] = av_clipf(src[i    ], min, max);
2580             dst[i + 1] = av_clipf(src[i + 1], min, max);
2581             dst[i + 2] = av_clipf(src[i + 2], min, max);
2582             dst[i + 3] = av_clipf(src[i + 3], min, max);
2583             dst[i + 4] = av_clipf(src[i + 4], min, max);
2584             dst[i + 5] = av_clipf(src[i + 5], min, max);
2585             dst[i + 6] = av_clipf(src[i + 6], min, max);
2586             dst[i + 7] = av_clipf(src[i + 7], min, max);
/*
 * Dot product of two int16 vectors, accumulated in 32 bits; the
 * accumulator declaration, loop header and return are elided here.
 */
2591 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2596         res += *v1++ * *v2++;
/*
 * Combined dot product v1·v2 with in-place update v1 += mul*v3 (used by
 * ATRAC3+/ape-style filters); the dot-product accumulation line, loop
 * header and return are elided in this extract.
 */
2601 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2606         *v1++ += mul * *v3++;
/*
 * Apply a symmetric int16 window to an int16 signal with Q15 rounding:
 * output[i] = round(input[i] * window[i] >> 15). Only the first half of
 * the window is stored; the second half mirrors it (len-i-1).
 */
2611 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2612                                  const int16_t *window, unsigned int len)
2615     int len2 = len >> 1;
2617     for (i = 0; i < len2; i++) {
2618         int16_t w = window[i];
/* +(1<<14) implements round-to-nearest before the >>15 Q15 scale-down. */
2619         output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
2620         output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/*
 * Clip an int32 vector to [min, max], unrolled 8x; callers guarantee
 * len % 8 == 0. The surrounding do/while driving the unrolled body is
 * elided in this extract.
 */
2624 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2625                                 int32_t max, unsigned int len)
2628         *dst++ = av_clip(*src++, min, max);
2629         *dst++ = av_clip(*src++, min, max);
2630         *dst++ = av_clip(*src++, min, max);
2631         *dst++ = av_clip(*src++, min, max);
2632         *dst++ = av_clip(*src++, min, max);
2633         *dst++ = av_clip(*src++, min, max);
2634         *dst++ = av_clip(*src++, min, max);
2635         *dst++ = av_clip(*src++, min, max);
/*
 * Fixed-point cosine constants (scaled by 2048*sqrt(2)) for the WMV2
 * 8-point IDCT below. NOTE(review): the code also uses W0, whose
 * #define falls on a line elided from this extract.
 */
2641 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2642 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2643 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2644 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2645 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2646 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2647 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
/*
 * One row pass of the WMV2 8-point integer IDCT (even/odd butterfly
 * decomposition). Results are rounded with +(1<<7) and scaled by >>8.
 * The declarations of s1/s2 fall on lines elided from this extract.
 */
2649 static void wmv2_idct_row(short * b)
2652  int a0,a1,a2,a3,a4,a5,a6,a7;
/* Odd-part (a1,a3,a5,a7) and even-part (a0,a2,a4,a6) rotations. */
2654  a1 = W1*b[1]+W7*b[7];
2655  a7 = W7*b[1]-W1*b[7];
2656  a5 = W5*b[5]+W3*b[3];
2657  a3 = W3*b[5]-W5*b[3];
2658  a2 = W2*b[2]+W6*b[6];
2659  a6 = W6*b[2]-W2*b[6];
2660  a0 = W0*b[0]+W0*b[4];
2661  a4 = W0*b[0]-W0*b[4];
/* 181/256 ~ 1/sqrt(2): combine odd terms for the middle outputs. */
2663  s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2664  s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* Final butterflies with round-to-nearest (+(1<<7)) and >>8 descale. */
2666  b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2667  b[1] = (a4+a6 +s1   + (1<<7))>>8;
2668  b[2] = (a4-a6 +s2   + (1<<7))>>8;
2669  b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2670  b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2671  b[5] = (a4-a6 -s2   + (1<<7))>>8;
2672  b[6] = (a4+a6 -s1   + (1<<7))>>8;
2673  b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/*
 * One column pass of the WMV2 IDCT (stride 8 between samples). Inputs
 * are pre-shifted >>3 to keep extended precision; outputs are rounded
 * with +(1<<13) and descaled by >>14. s1/s2 declarations are elided.
 */
2675 static void wmv2_idct_col(short * b)
2678  int a0,a1,a2,a3,a4,a5,a6,a7;
2679  /*step 1, with extended precision*/
2680  a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2681  a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2682  a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2683  a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2684  a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2685  a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2686  a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
2687  a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
/* 181/256 ~ 1/sqrt(2) combination of the odd terms, as in the row pass. */
2689  s1 = (181*(a1-a5+a7-a3)+128)>>8;
2690  s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* Final butterflies with +(1<<13) rounding and >>14 descale. */
2692  b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2693  b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
2694  b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
2695  b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2697  b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2698  b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
2699  b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
2700  b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/*
 * Full 8x8 WMV2 IDCT: row pass over each of the 8 rows, then column
 * pass over each of the 8 columns. Loop headers are elided here.
 */
2702 void ff_wmv2_idct_c(short * block){
2706         wmv2_idct_row(block+i);
2709         wmv2_idct_col(block+i);
2712 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* IDCT then store clamped-to-uint8 result into dest (overwrite). */
2714 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2716     ff_wmv2_idct_c(block);
2717     ff_put_pixels_clamped_c(block, dest, line_size);
/* IDCT then add to dest with clamping (residual path). */
2719 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2721     ff_wmv2_idct_c(block);
2722     ff_add_pixels_clamped_c(block, dest, line_size);
/* JPEG-reference integer IDCT, overwrite variant. */
2724 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2726     ff_j_rev_dct (block);
2727     ff_put_pixels_clamped_c(block, dest, line_size);
/* JPEG-reference integer IDCT, add-to-destination variant. */
2729 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2731     ff_j_rev_dct (block);
2732     ff_add_pixels_clamped_c(block, dest, line_size);
/* 4x4 reduced-resolution (lowres==1) IDCT, overwrite variant. */
2735 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2737     ff_j_rev_dct4 (block);
2738     put_pixels_clamped4_c(block, dest, line_size);
/* 4x4 reduced-resolution (lowres==1) IDCT, add variant. */
2740 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2742     ff_j_rev_dct4 (block);
2743     add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 reduced-resolution (lowres==2) IDCT, overwrite variant. */
2746 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2748     ff_j_rev_dct2 (block);
2749     put_pixels_clamped2_c(block, dest, line_size);
/* 2x2 reduced-resolution (lowres==2) IDCT, add variant. */
2751 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2753     ff_j_rev_dct2 (block);
2754     add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 (DC-only, lowres==3) IDCT: scale DC with rounding and store. */
2757 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2759     dest[0] = av_clip_uint8((block[0] + 4)>>3);
/* 1x1 (DC-only, lowres==3) IDCT: scale DC with rounding and add. */
2761 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2763     dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
/* No-op with the prefetch() signature: default when no arch-specific prefetch exists. */
2766 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2768 /* init static data */
/*
 * One-time initialization of the global lookup tables:
 * ff_cropTbl (clamp-to-uint8 with negative/overflow margins),
 * ff_squareTbl (squares of i-256) and the inverse zigzag permutation.
 * NOTE(review): the ff_cropTbl[i]=0 line for the negative margin is
 * elided in this extract.
 */
2769 av_cold void ff_dsputil_static_init(void)
2773     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2774     for(i=0;i<MAX_NEG_CROP;i++) {
2776         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2779     for(i=0;i<512;i++) {
2780         ff_squareTbl[i] = (i - 256) * (i - 256);
/* Inverse permutation stored 1-based (i+1) so 0 can mean "unset". */
2783     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/*
 * Verify that the compiler 16-byte-aligns stack variables (required by
 * SSE/AltiVec code paths). Logs a one-time warning on miscompiled
 * builds; the did_fail bookkeeping and return are elided here.
 */
2786 int ff_check_alignment(void){
2787     static int did_fail=0;
2788     LOCAL_ALIGNED_16(int, aligned, [4]);
2790     if((intptr_t)aligned & 15){
2792 #if HAVE_MMX || HAVE_ALTIVEC
2793             av_log(NULL, AV_LOG_ERROR,
2794                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2795                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2796                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2797                 "Do not report crashes to FFmpeg developers.\n");
/**
 * Populate a DSPContext with the C reference implementations, then let
 * each architecture override entries with optimized versions.
 * Selection of fdct/idct depends on avctx (dct_algo, idct_algo, lowres,
 * bits_per_raw_sample). NOTE(review): many connective lines (else
 * branches, #if guards, closing braces) are elided from this extract;
 * comments below label the visible sections only.
 */
2806 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2810     ff_check_alignment();
/* ---- forward DCT selection (encoder side) ---- */
2813     if (avctx->bits_per_raw_sample == 10) {
2814         c->fdct    = ff_jpeg_fdct_islow_10;
2815         c->fdct248 = ff_fdct248_islow_10;
2817         if(avctx->dct_algo==FF_DCT_FASTINT) {
2818             c->fdct    = ff_fdct_ifast;
2819             c->fdct248 = ff_fdct_ifast248;
2821         else if(avctx->dct_algo==FF_DCT_FAAN) {
2822             c->fdct    = ff_faandct;
2823             c->fdct248 = ff_faandct248;
2826             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2827             c->fdct248 = ff_fdct248_islow_8;
2830 #endif //CONFIG_ENCODERS
/* ---- inverse DCT selection: lowres shortcuts first, then idct_algo ---- */
2832     if(avctx->lowres==1){
2833         c->idct_put= ff_jref_idct4_put;
2834         c->idct_add= ff_jref_idct4_add;
2835         c->idct    = ff_j_rev_dct4;
2836         c->idct_permutation_type= FF_NO_IDCT_PERM;
2837     }else if(avctx->lowres==2){
2838         c->idct_put= ff_jref_idct2_put;
2839         c->idct_add= ff_jref_idct2_add;
2840         c->idct    = ff_j_rev_dct2;
2841         c->idct_permutation_type= FF_NO_IDCT_PERM;
2842     }else if(avctx->lowres==3){
2843         c->idct_put= ff_jref_idct1_put;
2844         c->idct_add= ff_jref_idct1_add;
2845         c->idct    = ff_j_rev_dct1;
2846         c->idct_permutation_type= FF_NO_IDCT_PERM;
2848         if (avctx->bits_per_raw_sample == 10) {
2849             c->idct_put              = ff_simple_idct_put_10;
2850             c->idct_add              = ff_simple_idct_add_10;
2851             c->idct                  = ff_simple_idct_10;
2852             c->idct_permutation_type = FF_NO_IDCT_PERM;
2854         if(avctx->idct_algo==FF_IDCT_INT){
2855             c->idct_put= ff_jref_idct_put;
2856             c->idct_add= ff_jref_idct_add;
2857             c->idct    = ff_j_rev_dct;
2858             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2859         }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2860                 avctx->idct_algo==FF_IDCT_VP3){
2861             c->idct_put= ff_vp3_idct_put_c;
2862             c->idct_add= ff_vp3_idct_add_c;
2863             c->idct    = ff_vp3_idct_c;
2864             c->idct_permutation_type= FF_NO_IDCT_PERM;
2865         }else if(avctx->idct_algo==FF_IDCT_WMV2){
2866             c->idct_put= ff_wmv2_idct_put_c;
2867             c->idct_add= ff_wmv2_idct_add_c;
2868             c->idct    = ff_wmv2_idct_c;
2869             c->idct_permutation_type= FF_NO_IDCT_PERM;
2870         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2871             c->idct_put= ff_faanidct_put;
2872             c->idct_add= ff_faanidct_add;
2873             c->idct    = ff_faanidct;
2874             c->idct_permutation_type= FF_NO_IDCT_PERM;
2875         }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2876             c->idct_put= ff_ea_idct_put_c;
2877             c->idct_permutation_type= FF_NO_IDCT_PERM;
2878         }else{ //accurate/default
2879             c->idct_put = ff_simple_idct_put_8;
2880             c->idct_add = ff_simple_idct_add_8;
2881             c->idct     = ff_simple_idct_8;
2882             c->idct_permutation_type= FF_NO_IDCT_PERM;
/* ---- basic pixel helpers ---- */
2887     c->diff_pixels = diff_pixels_c;
2888     c->put_pixels_clamped = ff_put_pixels_clamped_c;
2889     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2890     c->add_pixels_clamped = ff_add_pixels_clamped_c;
2891     c->sum_abs_dctelem = sum_abs_dctelem_c;
2894     c->pix_sum = pix_sum_c;
2895     c->pix_norm1 = pix_norm1_c;
2897     c->fill_block_tab[0] = fill_block16_c;
2898     c->fill_block_tab[1] = fill_block8_c;
/* ---- SAD variants: [0] = 16x16, [1] = 8x8; x2/y2/xy2 = half-pel interp ---- */
2900     /* TODO [0] 16  [1] 8 */
2901     c->pix_abs[0][0] = pix_abs16_c;
2902     c->pix_abs[0][1] = pix_abs16_x2_c;
2903     c->pix_abs[0][2] = pix_abs16_y2_c;
2904     c->pix_abs[0][3] = pix_abs16_xy2_c;
2905     c->pix_abs[1][0] = pix_abs8_c;
2906     c->pix_abs[1][1] = pix_abs8_x2_c;
2907     c->pix_abs[1][2] = pix_abs8_y2_c;
2908     c->pix_abs[1][3] = pix_abs8_xy2_c;
/* ---- third-pel MC tables (SVQ3); index encodes the 1/3-pel offset ---- */
2910     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2911     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2912     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2913     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2914     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2915     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2916     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2917     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2918     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2920     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2921     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2922     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2923     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2924     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2925     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2926     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2927     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2928     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* ---- quarter-pel MC tables; index = 4*dy + dx in quarter-pel units ---- */
2930 #define dspfunc(PFX, IDX, NUM) \
2931     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2932     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2933     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2934     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2935     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2936     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2937     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2938     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2939     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2940     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2941     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2942     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2943     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2944     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2945     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2946     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2948     dspfunc(put_qpel, 0, 16);
2949     dspfunc(put_no_rnd_qpel, 0, 16);
2951     dspfunc(avg_qpel, 0, 16);
2952     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2954     dspfunc(put_qpel, 1, 8);
2955     dspfunc(put_no_rnd_qpel, 1, 8);
2957     dspfunc(avg_qpel, 1, 8);
2958     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* ---- codec-specific sub-inits ---- */
2962 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2963     ff_mlp_init(c, avctx);
2965 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2966     ff_intrax8dsp_init(c,avctx);
/* ---- WMV2 mspel MC table ---- */
2969     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2970     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2971     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2972     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2973     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2974     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2975     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2976     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* ---- comparison functions for encoder RD/motion decisions ---- */
2978 #define SET_CMP_FUNC(name) \
2979     c->name[0]= name ## 16_c;\
2980     c->name[1]= name ## 8x8_c;
2982     SET_CMP_FUNC(hadamard8_diff)
2983     c->hadamard8_diff[4]= hadamard8_intra16_c;
2984     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2985     SET_CMP_FUNC(dct_sad)
2986     SET_CMP_FUNC(dct_max)
2988     SET_CMP_FUNC(dct264_sad)
2990     c->sad[0]= pix_abs16_c;
2991     c->sad[1]= pix_abs8_c;
2995     SET_CMP_FUNC(quant_psnr)
2998     c->vsad[0]= vsad16_c;
2999     c->vsad[4]= vsad_intra16_c;
3000     c->vsad[5]= vsad_intra8_c;
3001     c->vsse[0]= vsse16_c;
3002     c->vsse[4]= vsse_intra16_c;
3003     c->vsse[5]= vsse_intra8_c;
3004     c->nsse[0]= nsse16_c;
3005     c->nsse[1]= nsse8_c;
3007     ff_dsputil_init_dwt(c);
3010     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* ---- lossless/HuffYUV helpers and byte utilities ---- */
3012     c->add_bytes= add_bytes_c;
3013     c->diff_bytes= diff_bytes_c;
3014     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3015     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3016     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
3017     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3018     c->bswap_buf= bswap_buf;
3019     c->bswap16_buf = bswap16_buf;
3021     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3022         c->h263_h_loop_filter= h263_h_loop_filter_c;
3023         c->h263_v_loop_filter= h263_v_loop_filter_c;
3026     if (CONFIG_VP3_DECODER) {
3027         c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3028         c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3029         c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3032     c->h261_loop_filter= h261_loop_filter_c;
3034     c->try_8x8basis= try_8x8basis_c;
3035     c->add_8x8basis= add_8x8basis_c;
/* ---- audio float/int vector primitives ---- */
3037 #if CONFIG_VORBIS_DECODER
3038     c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling;
3040 #if CONFIG_AC3_DECODER
3041     c->ac3_downmix = ff_ac3_downmix_c;
3043     c->vector_fmul_reverse = vector_fmul_reverse_c;
3044     c->vector_fmul_add = vector_fmul_add_c;
3045     c->vector_fmul_window = vector_fmul_window_c;
3046     c->vector_clipf = vector_clipf_c;
3047     c->scalarproduct_int16 = scalarproduct_int16_c;
3048     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3049     c->apply_window_int16 = apply_window_int16_c;
3050     c->vector_clip_int32 = vector_clip_int32_c;
3051     c->scalarproduct_float = scalarproduct_float_c;
3052     c->butterflies_float = butterflies_float_c;
3053     c->butterflies_float_interleave = butterflies_float_interleave_c;
3054     c->vector_fmul_scalar = vector_fmul_scalar_c;
/* ---- image shrink (1:1, 2:1, 4:1, 8:1) and prefetch defaults ---- */
3056     c->shrink[0]= av_image_copy_plane;
3057     c->shrink[1]= ff_shrink22;
3058     c->shrink[2]= ff_shrink44;
3059     c->shrink[3]= ff_shrink88;
3061     c->prefetch= just_return;
/* Cleared here; filled from the h264 qpel tables at the end of init. */
3063     memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3064     memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
/* ---- bit-depth parameterized function selection ---- */
3068 #define FUNC(f, depth) f ## _ ## depth
3069 #define FUNCC(f, depth) f ## _ ## depth ## _c
3071 #define dspfunc1(PFX, IDX, NUM, depth)\
3072     c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
3073     c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3074     c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3075     c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3077 #define dspfunc2(PFX, IDX, NUM, depth)\
3078     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3079     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3080     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3081     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3082     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3083     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3084     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3085     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3086     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3087     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3088     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3089     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3090     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3091     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3092     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3093     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3096 #define BIT_DEPTH_FUNCS(depth, dct)\
3097     c->get_pixels                    = FUNCC(get_pixels   ## dct , depth);\
3098     c->draw_edges                    = FUNCC(draw_edges            , depth);\
3099     c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
3100     c->clear_block                   = FUNCC(clear_block  ## dct , depth);\
3101     c->clear_blocks                  = FUNCC(clear_blocks ## dct , depth);\
3102     c->add_pixels8                   = FUNCC(add_pixels8  ## dct , depth);\
3103     c->add_pixels4                   = FUNCC(add_pixels4  ## dct , depth);\
3104     c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
3105     c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3107     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
3108     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
3109     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
3110     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
3111     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
3112     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
3114     dspfunc1(put       , 0, 16, depth);\
3115     dspfunc1(put       , 1,  8, depth);\
3116     dspfunc1(put       , 2,  4, depth);\
3117     dspfunc1(put       , 3,  2, depth);\
3118     dspfunc1(put_no_rnd, 0, 16, depth);\
3119     dspfunc1(put_no_rnd, 1,  8, depth);\
3120     dspfunc1(avg       , 0, 16, depth);\
3121     dspfunc1(avg       , 1,  8, depth);\
3122     dspfunc1(avg       , 2,  4, depth);\
3123     dspfunc1(avg       , 3,  2, depth);\
3124     dspfunc1(avg_no_rnd, 0, 16, depth);\
3125     dspfunc1(avg_no_rnd, 1,  8, depth);\
3127     dspfunc2(put_h264_qpel, 0, 16, depth);\
3128     dspfunc2(put_h264_qpel, 1,  8, depth);\
3129     dspfunc2(put_h264_qpel, 2,  4, depth);\
3130     dspfunc2(put_h264_qpel, 3,  2, depth);\
3131     dspfunc2(avg_h264_qpel, 0, 16, depth);\
3132     dspfunc2(avg_h264_qpel, 1,  8, depth);\
3133     dspfunc2(avg_h264_qpel, 2,  4, depth);
/* Pick 8/9/10-bit variants; 16- vs 32-bit DCT coefficient storage. */
3135     switch (avctx->bits_per_raw_sample) {
3137         if (c->dct_bits == 32) {
3138             BIT_DEPTH_FUNCS(9, _32);
3140             BIT_DEPTH_FUNCS(9, _16);
3144         if (c->dct_bits == 32) {
3145             BIT_DEPTH_FUNCS(10, _32);
3147             BIT_DEPTH_FUNCS(10, _16);
3151         BIT_DEPTH_FUNCS(8, _16);
/* ---- per-architecture optimized overrides ---- */
3156     if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
3157     if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
3158     if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
3159     if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
3160     if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
3161     if (HAVE_MMI)        ff_dsputil_init_mmi   (c, avctx);
3162     if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
3163     if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
/* Fall back to h264 qpel for any 2tap entries the arch code left empty. */
3165     for (i = 0; i < 4; i++) {
3166         for (j = 0; j < 16; j++) {
3167             if(!c->put_2tap_qpel_pixels_tab[i][j])
3168                 c->put_2tap_qpel_pixels_tab[i][j] =
3169                     c->put_h264_qpel_pixels_tab[i][j];
3170             if(!c->avg_2tap_qpel_pixels_tab[i][j])
3171                 c->avg_2tap_qpel_pixels_tab[i][j] =
3172                     c->avg_h264_qpel_pixels_tab[i][j];
3176     ff_init_scantable_permutation(c->idct_permutation,
3177                                   c->idct_permutation_type);
/* Deprecated public alias kept for ABI compatibility; forwards to ff_dsputil_init. */
3180 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3182     ff_dsputil_init(c, avctx);