/*
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* 0..255 clipping table with MAX_NEG_CROP guard bands on each side;
   zero-initialized here — presumably filled at init time elsewhere (TODO confirm) */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* square table, indexed through (ff_squareTbl + 256) so negative differences
   can be looked up directly; assumed filled with x*x at init — verify in init code */
uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_template.c"
51 #include "dsputil_template.c"
55 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
// same byte-replication trick: one 0x80 in every byte of a native word
#define pb_80 (~0UL/255 * 0x80)
/* Classic zigzag scan: position in natural (raster) order of the n-th
   coefficient visited by the scan. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};

/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
85 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
86 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate (horizontal-first) scan order, as used for interlaced material. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

/* Alternate (vertical-first) scan order, the transpose-flavoured counterpart. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Row order used by the SSE2 IDCT: 0,4,1,5,2,6,3,7. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
128 st->scantable= src_scantable;
132 j = src_scantable[i];
133 st->permutated[i] = permutation[j];
142 j = st->permutated[i];
144 st->raster_end[i]= end;
148 void ff_init_scantable_permutation(uint8_t *idct_permutation,
149 int idct_permutation_type)
153 switch(idct_permutation_type){
154 case FF_NO_IDCT_PERM:
156 idct_permutation[i]= i;
158 case FF_LIBMPEG2_IDCT_PERM:
160 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
162 case FF_SIMPLE_IDCT_PERM:
164 idct_permutation[i]= simple_mmx_permutation[i];
166 case FF_TRANSPOSE_IDCT_PERM:
168 idct_permutation[i]= ((i&7)<<3) | (i>>3);
170 case FF_PARTTRANS_IDCT_PERM:
172 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
174 case FF_SSE2_IDCT_PERM:
176 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
179 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/**
 * Sum of all 256 pixels of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size stride between rows in bytes
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j];
        pix += line_size;
    }
    return s;
}
205 static int pix_norm1_c(uint8_t * pix, int line_size)
208 uint32_t *sq = ff_squareTbl + 256;
211 for (i = 0; i < 16; i++) {
212 for (j = 0; j < 16; j += 8) {
224 register uint64_t x=*(uint64_t*)pix;
226 s += sq[(x>>8)&0xff];
227 s += sq[(x>>16)&0xff];
228 s += sq[(x>>24)&0xff];
229 s += sq[(x>>32)&0xff];
230 s += sq[(x>>40)&0xff];
231 s += sq[(x>>48)&0xff];
232 s += sq[(x>>56)&0xff];
234 register uint32_t x=*(uint32_t*)pix;
236 s += sq[(x>>8)&0xff];
237 s += sq[(x>>16)&0xff];
238 s += sq[(x>>24)&0xff];
239 x=*(uint32_t*)(pix+4);
241 s += sq[(x>>8)&0xff];
242 s += sq[(x>>16)&0xff];
243 s += sq[(x>>24)&0xff];
248 pix += line_size - 16;
/**
 * Byte-swap a buffer of w 32-bit words.
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    /* main loop: 8 words per iteration */
    for (i = 0; i + 8 <= w; i += 8) {
        dst[i + 0] = av_bswap32(src[i + 0]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    /* remaining tail words */
    for (; i < w; i++)
        dst[i] = av_bswap32(src[i]);
}

/**
 * Byte-swap a buffer of len 16-bit values.
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;

    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
277 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
280 uint32_t *sq = ff_squareTbl + 256;
283 for (i = 0; i < h; i++) {
284 s += sq[pix1[0] - pix2[0]];
285 s += sq[pix1[1] - pix2[1]];
286 s += sq[pix1[2] - pix2[2]];
287 s += sq[pix1[3] - pix2[3]];
294 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
297 uint32_t *sq = ff_squareTbl + 256;
300 for (i = 0; i < h; i++) {
301 s += sq[pix1[0] - pix2[0]];
302 s += sq[pix1[1] - pix2[1]];
303 s += sq[pix1[2] - pix2[2]];
304 s += sq[pix1[3] - pix2[3]];
305 s += sq[pix1[4] - pix2[4]];
306 s += sq[pix1[5] - pix2[5]];
307 s += sq[pix1[6] - pix2[6]];
308 s += sq[pix1[7] - pix2[7]];
315 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
318 uint32_t *sq = ff_squareTbl + 256;
321 for (i = 0; i < h; i++) {
322 s += sq[pix1[ 0] - pix2[ 0]];
323 s += sq[pix1[ 1] - pix2[ 1]];
324 s += sq[pix1[ 2] - pix2[ 2]];
325 s += sq[pix1[ 3] - pix2[ 3]];
326 s += sq[pix1[ 4] - pix2[ 4]];
327 s += sq[pix1[ 5] - pix2[ 5]];
328 s += sq[pix1[ 6] - pix2[ 6]];
329 s += sq[pix1[ 7] - pix2[ 7]];
330 s += sq[pix1[ 8] - pix2[ 8]];
331 s += sq[pix1[ 9] - pix2[ 9]];
332 s += sq[pix1[10] - pix2[10]];
333 s += sq[pix1[11] - pix2[11]];
334 s += sq[pix1[12] - pix2[12]];
335 s += sq[pix1[13] - pix2[13]];
336 s += sq[pix1[14] - pix2[14]];
337 s += sq[pix1[15] - pix2[15]];
345 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
346 const uint8_t *s2, int stride){
349 /* read the pixels */
351 block[0] = s1[0] - s2[0];
352 block[1] = s1[1] - s2[1];
353 block[2] = s1[2] - s2[2];
354 block[3] = s1[3] - s2[3];
355 block[4] = s1[4] - s2[4];
356 block[5] = s1[5] - s2[5];
357 block[6] = s1[6] - s2[6];
358 block[7] = s1[7] - s2[7];
366 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
370 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
372 /* read the pixels */
374 pixels[0] = cm[block[0]];
375 pixels[1] = cm[block[1]];
376 pixels[2] = cm[block[2]];
377 pixels[3] = cm[block[3]];
378 pixels[4] = cm[block[4]];
379 pixels[5] = cm[block[5]];
380 pixels[6] = cm[block[6]];
381 pixels[7] = cm[block[7]];
388 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
392 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
394 /* read the pixels */
396 pixels[0] = cm[block[0]];
397 pixels[1] = cm[block[1]];
398 pixels[2] = cm[block[2]];
399 pixels[3] = cm[block[3]];
406 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
410 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
412 /* read the pixels */
414 pixels[0] = cm[block[0]];
415 pixels[1] = cm[block[1]];
422 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
423 uint8_t *restrict pixels,
428 for (i = 0; i < 8; i++) {
429 for (j = 0; j < 8; j++) {
432 else if (*block > 127)
435 *pixels = (uint8_t)(*block + 128);
439 pixels += (line_size - 8);
443 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
447 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
449 /* read the pixels */
451 pixels[0] = cm[pixels[0] + block[0]];
452 pixels[1] = cm[pixels[1] + block[1]];
453 pixels[2] = cm[pixels[2] + block[2]];
454 pixels[3] = cm[pixels[3] + block[3]];
455 pixels[4] = cm[pixels[4] + block[4]];
456 pixels[5] = cm[pixels[5] + block[5]];
457 pixels[6] = cm[pixels[6] + block[6]];
458 pixels[7] = cm[pixels[7] + block[7]];
464 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
468 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
470 /* read the pixels */
472 pixels[0] = cm[pixels[0] + block[0]];
473 pixels[1] = cm[pixels[1] + block[1]];
474 pixels[2] = cm[pixels[2] + block[2]];
475 pixels[3] = cm[pixels[3] + block[3]];
481 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
485 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
487 /* read the pixels */
489 pixels[0] = cm[pixels[0] + block[0]];
490 pixels[1] = cm[pixels[1] + block[1]];
496 static int sum_abs_dctelem_c(DCTELEM *block)
500 sum+= FFABS(block[i]);
/**
 * Fill h rows of a 16-pixel-wide block with a constant value.
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++, block += line_size)
        memset(block, value, 16);
}

/**
 * Fill h rows of an 8-pixel-wide block with a constant value.
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++, block += line_size)
        memset(block, value, 8);
}
/* rounded averages of 2 and 4 values, used by the pel interpolation code below */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/**
 * One-warp-point global motion compensation: bilinear interpolation of an
 * 8-pixel-wide block at a 1/16-pel offset (x16, y16), h rows.
 * rounder is added before the final >>8 (weights sum to 256).
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* bilinear weights in 1/256 units */
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[stride + j] + D * src[stride + j + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
/**
 * Global motion compensation, C reference version.
 * (ox, oy) is the 16.16 fixed-point source position of the first pixel;
 * dxx/dyx advance it per column and dxy/dyy per row — assumes the usual
 * MPEG-4 GMC affine parameters, TODO confirm against the callers.
 * Samples that fall outside [0,width)x[0,height) are clamped to the border.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int x, y;
    const int s = 1 << shift;

    /* turn width/height into the largest valid coordinate */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        int vx = ox;
        int vy = oy;

        for (x = 0; x < 8; x++) { //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x = vx >> 16;
            src_y = vy >> 16;
            frac_x = src_x & (s - 1);
            frac_y = src_y & (s - 1);
            src_x >>= shift;
            src_y >>= shift;

            if ((unsigned)src_x < width) {
                if ((unsigned)src_y < height) {
                    /* fully inside: bilinear blend of the 4 neighbours */
                    index = src_x + src_y * stride;
                    dst[y * stride + x] = ((src[index]            * (s - frac_x)
                                          + src[index + 1]        *      frac_x) * (s - frac_y)
                                          + (src[index + stride]     * (s - frac_x)
                                          +  src[index + stride + 1] *      frac_x) *      frac_y
                                          + r) >> (shift * 2);
                } else {
                    /* clipped vertically: horizontal interpolation only */
                    index = src_x + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = ((src[index]     * (s - frac_x)
                                          + src[index + 1] *      frac_x) * s
                                          + r) >> (shift * 2);
                }
            } else {
                if ((unsigned)src_y < height) {
                    /* clipped horizontally: vertical interpolation only */
                    index = av_clip(src_x, 0, width) + src_y * stride;
                    dst[y * stride + x] = ((src[index]          * (s - frac_y)
                                          + src[index + stride] *      frac_y) * s
                                          + r) >> (shift * 2);
                } else {
                    /* clipped both ways: nearest border pixel */
                    index = av_clip(src_x, 0, width) + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = src[index];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* No subpel offset: plain block copy, dispatched on the block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_8_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_8_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_8_c(dst, src, stride, height);
}
/* 1/3-pel interpolators: 683/2048 approximates 1/3 and 2731/32768
   approximates 1/12; the added constants provide rounding. */

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (2 * src[x] + src[x + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (4 * src[x] + 3 * src[x + 1] + 3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 2 * src[x + 1] + 4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683 * (src[x] + 2 * src[x + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (3 * src[x] + 4 * src[x + 1] + 2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731 * (2 * src[x] + 3 * src[x + 1] + 3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
/* No subpel offset: averaging block copy, dispatched on the block width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_8_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_8_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_8_c(dst, src, stride, height);
}
/* Averaging 1/3-pel interpolators: the interpolated value is averaged
   (with rounding) into the existing destination pixel. */

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (2 * src[x] + src[x + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (4 * src[x] + 3 * src[x + 1] + 3 * src[x + stride] + 2 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 2 * src[x + 1] + 4 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683 * (src[x] + 2 * src[x + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (3 * src[x] + 4 * src[x + 1] + 2 * src[x + stride] + 3 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int y, x;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731 * (2 * src[x] + 3 * src[x + 1] + 3 * src[x + stride] + 4 * src[x + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
802 #define QPEL_MC(r, OPNAME, RND, OP) \
803 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
804 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
808 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
809 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
810 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
811 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
812 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
813 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
814 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
815 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
821 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
823 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
827 const int src0= src[0*srcStride];\
828 const int src1= src[1*srcStride];\
829 const int src2= src[2*srcStride];\
830 const int src3= src[3*srcStride];\
831 const int src4= src[4*srcStride];\
832 const int src5= src[5*srcStride];\
833 const int src6= src[6*srcStride];\
834 const int src7= src[7*srcStride];\
835 const int src8= src[8*srcStride];\
836 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
837 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
838 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
839 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
840 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
841 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
842 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
843 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
849 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
850 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
855 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
856 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
857 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
858 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
859 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
860 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
861 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
862 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
863 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
864 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
865 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
866 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
867 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
868 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
869 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
870 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
876 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
877 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
882 const int src0= src[0*srcStride];\
883 const int src1= src[1*srcStride];\
884 const int src2= src[2*srcStride];\
885 const int src3= src[3*srcStride];\
886 const int src4= src[4*srcStride];\
887 const int src5= src[5*srcStride];\
888 const int src6= src[6*srcStride];\
889 const int src7= src[7*srcStride];\
890 const int src8= src[8*srcStride];\
891 const int src9= src[9*srcStride];\
892 const int src10= src[10*srcStride];\
893 const int src11= src[11*srcStride];\
894 const int src12= src[12*srcStride];\
895 const int src13= src[13*srcStride];\
896 const int src14= src[14*srcStride];\
897 const int src15= src[15*srcStride];\
898 const int src16= src[16*srcStride];\
899 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
900 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
901 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
902 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
903 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
904 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
905 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
906 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
907 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
908 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
909 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
910 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
911 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
912 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
913 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
914 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
920 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
922 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
923 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
926 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
927 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
930 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
932 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
933 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
936 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
939 copy_block9(full, src, 16, stride, 9);\
940 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
941 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
944 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
946 copy_block9(full, src, 16, stride, 9);\
947 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
950 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
953 copy_block9(full, src, 16, stride, 9);\
954 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
955 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
957 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
962 copy_block9(full, src, 16, stride, 9);\
963 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
964 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
965 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
966 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
968 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
972 copy_block9(full, src, 16, stride, 9);\
973 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
974 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
975 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
976 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
978 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
983 copy_block9(full, src, 16, stride, 9);\
984 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
985 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
986 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
987 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
989 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
993 copy_block9(full, src, 16, stride, 9);\
994 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
995 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
996 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
997 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
999 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1000 uint8_t full[16*9];\
1003 uint8_t halfHV[64];\
1004 copy_block9(full, src, 16, stride, 9);\
1005 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1006 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1007 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1008 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1010 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1011 uint8_t full[16*9];\
1013 uint8_t halfHV[64];\
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1020 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021 uint8_t full[16*9];\
1024 uint8_t halfHV[64];\
1025 copy_block9(full, src, 16, stride, 9);\
1026 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1028 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1031 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1032 uint8_t full[16*9];\
1034 uint8_t halfHV[64];\
1035 copy_block9(full, src, 16, stride, 9);\
1036 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1038 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1041 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1043 uint8_t halfHV[64];\
1044 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1045 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1046 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1048 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1050 uint8_t halfHV[64];\
1051 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1052 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1053 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1055 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1056 uint8_t full[16*9];\
1059 uint8_t halfHV[64];\
1060 copy_block9(full, src, 16, stride, 9);\
1061 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1062 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1063 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1064 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1066 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1067 uint8_t full[16*9];\
1069 copy_block9(full, src, 16, stride, 9);\
1070 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1071 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1072 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1074 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1075 uint8_t full[16*9];\
1078 uint8_t halfHV[64];\
1079 copy_block9(full, src, 16, stride, 9);\
1080 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1081 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1082 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1083 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1085 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1086 uint8_t full[16*9];\
1088 copy_block9(full, src, 16, stride, 9);\
1089 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1090 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1091 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1093 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1095 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1096 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/*
 * 16x16 quarter-pel MC positions (QPEL_MC macro body continues).
 * Naming: mcXY = (x/4, y/4)-pel offset.  "full" is a 24x17 padded copy of
 * the source (17 rows for the 8-tap-ish vertical filter), halfH/halfV/halfHV
 * hold intermediate half-pel planes.
 * NOTE(review): listing is line-sampled; closing braces and some
 * declarations are absent from view.
 */
/* mc10: 1/4-pel x -- average source with the H-lowpassed half plane. */
1099 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1101 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1102 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
/* mc20: 1/2-pel x -- H-lowpass straight to dst. */
1105 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1106 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
/* mc30: 3/4-pel x -- average src+1 with the half plane. */
1109 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1111 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1112 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
/* mc01: 1/4-pel y -- vertical analogue of mc10, via the padded copy. */
1115 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1116 uint8_t full[24*17];\
1118 copy_block17(full, src, 24, stride, 17);\
1119 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1120 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
/* mc02: 1/2-pel y -- V-lowpass straight to dst. */
1123 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1124 uint8_t full[24*17];\
1125 copy_block17(full, src, 24, stride, 17);\
1126 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
/* mc03: 3/4-pel y -- average the row below (full+24) with the half plane. */
1129 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1130 uint8_t full[24*17];\
1132 copy_block17(full, src, 24, stride, 17);\
1133 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1134 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
/* mc11_old: diagonal (1/4,1/4) via 4-way average of src, halfH, halfV,
 * halfHV.  The "_old" variants are exported (ff_ prefix) reference
 * implementations. */
1136 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1137 uint8_t full[24*17];\
1138 uint8_t halfH[272];\
1139 uint8_t halfV[256];\
1140 uint8_t halfHV[256];\
1141 copy_block17(full, src, 24, stride, 17);\
1142 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1143 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1144 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1145 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
/* mc11: cheaper variant -- fold src into halfH first (pixels16_l2), then
 * only one 2-way average at the end. */
1147 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1148 uint8_t full[24*17];\
1149 uint8_t halfH[272];\
1150 uint8_t halfHV[256];\
1151 copy_block17(full, src, 24, stride, 17);\
1152 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1153 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1154 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1155 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
/* mc31_old / mc31: (3/4,1/4) -- same scheme shifted one column (full+1). */
1157 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1158 uint8_t full[24*17];\
1159 uint8_t halfH[272];\
1160 uint8_t halfV[256];\
1161 uint8_t halfHV[256];\
1162 copy_block17(full, src, 24, stride, 17);\
1163 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1164 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1165 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1166 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1168 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1169 uint8_t full[24*17];\
1170 uint8_t halfH[272];\
1171 uint8_t halfHV[256];\
1172 copy_block17(full, src, 24, stride, 17);\
1173 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1174 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1175 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1176 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
/*
 * Remaining 16x16 quarter-pel positions (QPEL_MC macro body continues).
 * Bottom-row and centre-column cases; halfH+16 / full+24 select the plane
 * shifted one row down for the y = 3/4 positions.
 * NOTE(review): line-sampled listing -- code left byte-identical.
 */
/* mc13_old / mc13: (1/4, 3/4). */
1178 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1179 uint8_t full[24*17];\
1180 uint8_t halfH[272];\
1181 uint8_t halfV[256];\
1182 uint8_t halfHV[256];\
1183 copy_block17(full, src, 24, stride, 17);\
1184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1186 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1189 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1190 uint8_t full[24*17];\
1191 uint8_t halfH[272];\
1192 uint8_t halfHV[256];\
1193 copy_block17(full, src, 24, stride, 17);\
1194 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1196 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* mc33_old / mc33: (3/4, 3/4) -- shifted one column (full+1 / full+25)
 * and one row (halfH+16). */
1199 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200 uint8_t full[24*17];\
1201 uint8_t halfH[272];\
1202 uint8_t halfV[256];\
1203 uint8_t halfHV[256];\
1204 copy_block17(full, src, 24, stride, 17);\
1205 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1207 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1210 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1211 uint8_t full[24*17];\
1212 uint8_t halfH[272];\
1213 uint8_t halfHV[256];\
1214 copy_block17(full, src, 24, stride, 17);\
1215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1217 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* mc21 / mc23: (1/2, 1/4) and (1/2, 3/4) -- no padded copy needed since
 * only the H plane feeds the vertical filter. */
1220 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1221 uint8_t halfH[272];\
1222 uint8_t halfHV[256];\
1223 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1224 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1225 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1227 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1228 uint8_t halfH[272];\
1229 uint8_t halfHV[256];\
1230 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1231 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1232 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* mc12_old / mc12: (1/4, 1/2). */
1234 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1235 uint8_t full[24*17];\
1236 uint8_t halfH[272];\
1237 uint8_t halfV[256];\
1238 uint8_t halfHV[256];\
1239 copy_block17(full, src, 24, stride, 17);\
1240 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1241 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1242 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1243 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1245 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1246 uint8_t full[24*17];\
1247 uint8_t halfH[272];\
1248 copy_block17(full, src, 24, stride, 17);\
1249 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1250 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1251 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* mc32_old / mc32: (3/4, 1/2) -- column-shifted (full+1) variant. */
1253 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1254 uint8_t full[24*17];\
1255 uint8_t halfH[272];\
1256 uint8_t halfV[256];\
1257 uint8_t halfHV[256];\
1258 copy_block17(full, src, 24, stride, 17);\
1259 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1260 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1261 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1262 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1264 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1265 uint8_t full[24*17];\
1266 uint8_t halfH[272];\
1267 copy_block17(full, src, 24, stride, 17);\
1268 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1269 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1270 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* mc22: (1/2, 1/2) -- H then V lowpass, nothing else. */
1272 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1273 uint8_t halfH[272];\
1274 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1275 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/*
 * Pixel-store operators plugged into QPEL_MC.  'b' is a filter accumulator;
 * cm[((b)+16)>>5] rounds and clips it to 0..255 via the crop table, the
 * "_no_rnd" forms use +15 (round toward zero by one).  op_avg additionally
 * averages with the existing dst pixel ("avg" MC).
 */
1278 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1279 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1280 #define op_put(a, b) a = cm[((b) + 16)>>5]
1281 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the qpel function families: put, put_no_rnd, avg. */
1283 QPEL_MC(0, put_ , _ , op_put)
1284 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1285 QPEL_MC(0, avg_ , _ , op_avg)
1286 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1288 #undef op_avg_no_rnd
1290 #undef op_put_no_rnd
/* mc00 (integer-pel) positions are plain pixel copies/averages. */
1292 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1293 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1294 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1295 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1296 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
/* NOTE(review): the 16x16 no_rnd alias below maps to ff_put_pixels16x16_8_c
 * while its siblings use the un-suffixed names -- looks inconsistent,
 * verify against the declarations elsewhere in the project. */
1297 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
/*
 * WMV2 mspel horizontal lowpass: per output pixel a 4-tap
 * (-1, 9, 9, -1)/16 filter with rounding (+8 >> 4), clipped through the
 * crop table.  NOTE(review): the per-row loop header and the
 * dst/src-advance statements are not visible in this sampled listing.
 */
1299 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1300 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1304 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1305 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1306 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1307 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1308 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1309 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1310 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1311 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* RV40 (3/4,3/4) positions are served by the plain xy2 (center half-pel)
 * averaging routines. */
1317 #if CONFIG_RV40_DECODER
1318 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1319 put_pixels16_xy2_8_c(dst, src, stride, 16);
1321 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1322 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1324 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1325 put_pixels8_xy2_8_c(dst, src, stride, 8);
1327 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1328 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1330 #endif /* CONFIG_RV40_DECODER */
/*
 * Dirac pixel ops: thin wrappers around the generic 8-bit pixel
 * copy/average helpers.  src[] carries up to 4 reference planes; _l2 blends
 * two, _l4 blends four, and the 32-wide variants are done as two 16-wide
 * halves.  (Macro body: every line is a '\' continuation.)
 */
1332 #if CONFIG_DIRAC_DECODER
1333 #define DIRAC_MC(OPNAME)\
1334 void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1336 OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
1338 void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1340 OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
1342 void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1344 OPNAME ## _pixels16_8_c(dst , src[0] , stride, h);\
1345 OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
1347 void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1349 OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1351 void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1353 OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1355 void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1357 OPNAME ## _pixels16_l2_8(dst , src[0] , src[1] , stride, stride, stride, h);\
1358 OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
1360 void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1362 OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1364 void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1366 OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1368 void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1370 OPNAME ## _pixels16_l4_8(dst , src[0] , src[1] , src[2] , src[3] , stride, stride, stride, stride, stride, h);\
1371 OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
/*
 * WMV2 mspel vertical lowpass: same (-1, 9, 9, -1)/16 filter as the
 * horizontal version, applied down a column (src_1 is the row above).
 * NOTE(review): the per-column loop header and pointer advances are not
 * visible in this sampled listing.
 */
1377 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1378 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1382 const int src_1= src[ -srcStride];
1383 const int src0 = src[0 ];
1384 const int src1 = src[ srcStride];
1385 const int src2 = src[2*srcStride];
1386 const int src3 = src[3*srcStride];
1387 const int src4 = src[4*srcStride];
1388 const int src5 = src[5*srcStride];
1389 const int src6 = src[6*srcStride];
1390 const int src7 = src[7*srcStride];
1391 const int src8 = src[8*srcStride];
1392 const int src9 = src[9*srcStride];
1393 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1394 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1395 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1396 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1397 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1398 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1399 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1400 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/*
 * WMV2 half-pel (mspel) MC positions, built from the two lowpasses above.
 * mc10/mc30: average src (or src+1) with the H-lowpass; mc20/mc02: single
 * lowpass; mc12/mc32/mc22 chain H then V filters via halfH (the '+8' skips
 * the one-row top border produced by starting the H pass at src-stride).
 */
1406 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1408 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1409 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1412 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1413 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1416 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1418 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1419 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1422 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1423 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1426 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1430 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1431 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1432 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1433 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1435 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1439 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1440 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1441 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1442 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1444 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1446 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1447 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/*
 * H.263 deblocking, vertical edge filter (operates across a horizontal
 * block boundary: p0,p1 above, p2,p3 below).  d is the edge gradient;
 * d1 is the tent-shaped correction limited by 'strength' (per-qscale
 * table), d2 a smaller secondary correction clipped to |d1|.
 * The (p&256) tests clip p1/p2 to 0..255 after adding/subtracting d1
 * without a table lookup (~(p>>31) is 0 for overflow, 255 for underflow).
 * Dead-coded out unless an H.263 codec is enabled.
 * NOTE(review): the loop headers and several assignments are missing from
 * this sampled listing; code left byte-identical.
 */
1450 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1451 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1453 const int strength= ff_h263_loop_filter_strength[qscale];
1457 int p0= src[x-2*stride];
1458 int p1= src[x-1*stride];
1459 int p2= src[x+0*stride];
1460 int p3= src[x+1*stride];
1461 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1463 if (d<-2*strength) d1= 0;
1464 else if(d<- strength) d1=-2*strength - d;
1465 else if(d< strength) d1= d;
1466 else if(d< 2*strength) d1= 2*strength - d;
1471 if(p1&256) p1= ~(p1>>31);
1472 if(p2&256) p2= ~(p2>>31);
1474 src[x-1*stride] = p1;
1475 src[x+0*stride] = p2;
1479 d2= av_clip((p0-p3)/4, -ad1, ad1);
1481 src[x-2*stride] = p0 - d2;
1482 src[x+ stride] = p3 + d2;
/*
 * H.263 deblocking, horizontal edge filter: identical math to the vertical
 * version but pixels are taken left/right of a vertical block boundary
 * (y*stride-2 .. y*stride+1).
 */
1487 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1488 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1490 const int strength= ff_h263_loop_filter_strength[qscale];
1494 int p0= src[y*stride-2];
1495 int p1= src[y*stride-1];
1496 int p2= src[y*stride+0];
1497 int p3= src[y*stride+1];
1498 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1500 if (d<-2*strength) d1= 0;
1501 else if(d<- strength) d1=-2*strength - d;
1502 else if(d< strength) d1= d;
1503 else if(d< 2*strength) d1= 2*strength - d;
1508 if(p1&256) p1= ~(p1>>31);
1509 if(p2&256) p2= ~(p2>>31);
1511 src[y*stride-1] = p1;
1512 src[y*stride+0] = p2;
1516 d2= av_clip((p0-p3)/4, -ad1, ad1);
1518 src[y*stride-2] = p0 - d2;
1519 src[y*stride+1] = p3 + d2;
/*
 * H.261 in-loop filter over an 8x8 block: separable (1,2,1)/4 smoothing.
 * Border rows/cols are passed through scaled by 4 (so the final +8 >> 4
 * normalization leaves them unchanged); interior samples get the vertical
 * (1,2,1) into temp[], then the horizontal (1,2,1) on the way back out.
 * NOTE(review): loop headers, temp[] declaration and closing braces are
 * missing from this sampled listing.
 */
1524 static void h261_loop_filter_c(uint8_t *src, int stride){
1529 temp[x ] = 4*src[x ];
1530 temp[x + 7*8] = 4*src[x + 7*stride];
1534 xy = y * stride + x;
1536 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1541 src[ y*stride] = (temp[ y*8] + 2)>>2;
1542 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1544 xy = y * stride + x;
1546 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/*
 * 16-wide SAD (sum of absolute differences) motion-estimation metrics.
 * All four walk h rows (row loop and pointer advances are not visible in
 * this sampled listing); the first void* parameter is an unused context.
 */
/* Plain SAD: |pix1[i] - pix2[i]| over a 16-pixel row. */
1551 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1557 s += abs(pix1[0] - pix2[0]);
1558 s += abs(pix1[1] - pix2[1]);
1559 s += abs(pix1[2] - pix2[2]);
1560 s += abs(pix1[3] - pix2[3]);
1561 s += abs(pix1[4] - pix2[4]);
1562 s += abs(pix1[5] - pix2[5]);
1563 s += abs(pix1[6] - pix2[6]);
1564 s += abs(pix1[7] - pix2[7]);
1565 s += abs(pix1[8] - pix2[8]);
1566 s += abs(pix1[9] - pix2[9]);
1567 s += abs(pix1[10] - pix2[10]);
1568 s += abs(pix1[11] - pix2[11]);
1569 s += abs(pix1[12] - pix2[12]);
1570 s += abs(pix1[13] - pix2[13]);
1571 s += abs(pix1[14] - pix2[14]);
1572 s += abs(pix1[15] - pix2[15]);
/* SAD against the horizontal half-pel reference: avg2 of adjacent columns. */
1579 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1585 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1586 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1587 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1588 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1589 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1590 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1591 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1592 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1593 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1594 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1595 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1596 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1597 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1598 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1599 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1600 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD against the vertical half-pel reference: avg2 of this row (pix2)
 * and the next (pix3). */
1607 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1610 uint8_t *pix3 = pix2 + line_size;
1614 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1615 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1616 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1617 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1618 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1619 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1620 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1621 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1622 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1623 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1624 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1625 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1626 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1627 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1628 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1629 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD against the diagonal half-pel reference: 4-way average of the
 * 2x2 neighbourhood. */
1637 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1640 uint8_t *pix3 = pix2 + line_size;
1644 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1645 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1646 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1647 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1648 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1649 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1650 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1651 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1652 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1653 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1654 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1655 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1656 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1657 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1658 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1659 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/*
 * 8-wide SAD metrics -- same scheme as the 16-wide family above, unrolled
 * over 8 columns.  Row loops / pointer advances are not visible in this
 * sampled listing.
 */
/* Plain 8-wide SAD. */
1667 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1673 s += abs(pix1[0] - pix2[0]);
1674 s += abs(pix1[1] - pix2[1]);
1675 s += abs(pix1[2] - pix2[2]);
1676 s += abs(pix1[3] - pix2[3]);
1677 s += abs(pix1[4] - pix2[4]);
1678 s += abs(pix1[5] - pix2[5]);
1679 s += abs(pix1[6] - pix2[6]);
1680 s += abs(pix1[7] - pix2[7]);
/* Horizontal half-pel 8-wide SAD. */
1687 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1693 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1694 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1695 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1696 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1697 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1698 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1699 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1700 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* Vertical half-pel 8-wide SAD. */
1707 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1710 uint8_t *pix3 = pix2 + line_size;
1714 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1715 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1716 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1717 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1718 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1719 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1720 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1721 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* Diagonal half-pel 8-wide SAD. */
1729 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1732 uint8_t *pix3 = pix2 + line_size;
1736 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1737 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1738 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1739 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1740 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1741 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1742 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1743 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/*
 * Noise-preserving SSE: score1 is the plain squared error, score2 the
 * difference of the two images' local 2x2 gradients -- penalising loss of
 * texture rather than just pixel error.  The final weight comes from
 * avctx->nsse_weight (8 when no context is supplied).
 * NOTE(review): row loops and pointer advances not visible in this sampled
 * listing.
 */
1751 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1752 MpegEncContext *c = v;
1758 for(x=0; x<16; x++){
1759 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1762 for(x=0; x<15; x++){
1763 score2+= FFABS( s1[x ] - s1[x +stride]
1764 - s1[x+1] + s1[x+1+stride])
1765 -FFABS( s2[x ] - s2[x +stride]
1766 - s2[x+1] + s2[x+1+stride]);
1773 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1774 else return score1 + FFABS(score2)*8;
/* 8-wide variant of the above. */
1777 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1778 MpegEncContext *c = v;
1785 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1789 score2+= FFABS( s1[x ] - s1[x +stride]
1790 - s1[x+1] + s1[x+1+stride])
1791 -FFABS( s2[x ] - s2[x +stride]
1792 - s2[x+1] + s2[x+1+stride]);
1799 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1800 else return score1 + FFABS(score2)*8;
/*
 * try_8x8basis: evaluate the weighted squared error that adding
 * 'basis * scale' (rescaled from BASIS_SHIFT to RECON_SHIFT precision,
 * with rounding) to the residual would produce.  The assert bounds b so
 * the w*b products cannot overflow.
 */
1803 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1807 for(i=0; i<8*8; i++){
1808 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1811 assert(-512<b && b<512);
1813 sum += (w*b)*(w*b)>>4;
/* add_8x8basis: actually apply the same rescaled basis to the residual. */
1818 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1821 for(i=0; i<8*8; i++){
1822 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1827 * Permute an 8x8 block.
1828 * @param block the block which will be permuted according to the given permutation vector
1829 * @param permutation the permutation vector
1830 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1831 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1832 * (inverse) permutated to scantable order!
/* Only coefficients up to 'last' (in scan order) are moved; the first loop
 * presumably stashes them into temp[] before the rewrite -- the stash
 * statement itself is not visible in this sampled listing. */
1834 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1840 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1842 for(i=0; i<=last; i++){
1843 const int j= scantable[i];
1848 for(i=0; i<=last; i++){
1849 const int j= scantable[i];
1850 const int perm_j= permutation[j];
1851 block[perm_j]= temp[j];
/* Trivial comparator: always returns the same score (body not visible). */
1855 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/*
 * Select the 6 compare functions (one per block size/plane slot) for the
 * requested metric 'type' from the DSPContext tables.
 * NOTE(review): the switch/case scaffolding, loop and most metric cases
 * are missing from this sampled listing; only a few table assignments and
 * the error fallback are visible.
 */
1859 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1862 memset(cmp, 0, sizeof(void*)*6);
1870 cmp[i]= c->hadamard8_diff[i];
1876 cmp[i]= c->dct_sad[i];
1879 cmp[i]= c->dct264_sad[i];
1882 cmp[i]= c->dct_max[i];
1885 cmp[i]= c->quant_psnr[i];
1914 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/*
 * add_bytes: dst[i] += src[i] for w bytes, SWAR-style -- one native 'long'
 * at a time using the pb_7f/pb_80 masks to stop carries crossing byte
 * lanes; the visible scalar statement handles the tail.
 */
1919 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1921 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1922 long a = *(long*)(src+i);
1923 long b = *(long*)(dst+i);
1924 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1927 dst[i+0] += src[i+0];
/*
 * diff_bytes: dst[i] = src1[i] - src2[i].  On targets without fast
 * unaligned loads, a misaligned src2 falls back to an 8-at-a-time scalar
 * loop; otherwise the same SWAR trick (borrow-isolating masks) is used.
 */
1930 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1932 #if !HAVE_FAST_UNALIGNED
1933 if((long)src2 & (sizeof(long)-1)){
1934 for(i=0; i+7<w; i+=8){
1935 dst[i+0] = src1[i+0]-src2[i+0];
1936 dst[i+1] = src1[i+1]-src2[i+1];
1937 dst[i+2] = src1[i+2]-src2[i+2];
1938 dst[i+3] = src1[i+3]-src2[i+3];
1939 dst[i+4] = src1[i+4]-src2[i+4];
1940 dst[i+5] = src1[i+5]-src2[i+5];
1941 dst[i+6] = src1[i+6]-src2[i+6];
1942 dst[i+7] = src1[i+7]-src2[i+7];
1946 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1947 long a = *(long*)(src1+i);
1948 long b = *(long*)(src2+i);
1949 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1952 dst[i+0] = src1[i+0]-src2[i+0];
/*
 * HuffYUV median prediction (add: decode, sub: encode): predictor is the
 * median of left, top and left+top-topleft; l/lt carry state across calls.
 * NOTE(review): surrounding loop and state read/write lines are missing
 * from this sampled listing.
 */
1955 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1963 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1972 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1980 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* HuffYUV left prediction; body largely not visible in this listing. */
1990 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1993 for(i=0; i<w-1; i++){
2020 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard-transform building blocks used by the cmp functions below:
 * BUTTERFLY2/1 compute sum/difference pairs (bodies not visible here),
 * BUTTERFLYA the |x+y| + |x-y| absolute form. */
2050 #define BUTTERFLY2(o1,o2,i1,i2) \
2054 #define BUTTERFLY1(x,y) \
2063 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/*
 * SATD: 8x8 Hadamard transform of the src-dst residual, summing absolute
 * transform coefficients.  First pass: horizontal butterflies per row into
 * temp[]; second pass: vertical butterflies per column, with the last
 * stage folded into BUTTERFLYA to accumulate |coeff| directly.
 * NOTE(review): temp[] declaration, loop headers and the return are not
 * visible in this sampled listing.
 */
2065 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2073 //FIXME try pointer walks
2074 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2075 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2076 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2077 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2079 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2080 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2081 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2082 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2084 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2085 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2086 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2087 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2091 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2092 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2093 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2094 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2096 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2097 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2098 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2099 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2102 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2103 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2104 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2105 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/*
 * Intra variant: transforms src directly (no reference subtraction) and
 * finally removes the DC term (|temp[0]+temp[32]| is the block mean after
 * the partial last stage) so the score measures AC energy only.
 */
2110 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2118 //FIXME try pointer walks
2119 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2120 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2121 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2122 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2124 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2125 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2126 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2127 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2129 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2130 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2131 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2132 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2136 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2137 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2138 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2139 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2141 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2142 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2143 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2144 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2147 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2148 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2149 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2150 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2153 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/*
 * DCT-domain SAD: forward-transform the residual (fdct call not visible in
 * this sampled listing) and sum the absolute coefficients.
 */
2158 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2159 MpegEncContext * const s= (MpegEncContext *)c;
2160 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2164 s->dsp.diff_pixels(temp, src1, src2, stride);
2166 return s->dsp.sum_abs_dctelem(temp);
/*
 * One 8-point H.264-style integer DCT stage (macro body; the macro header
 * with its SRC/DST parameters precedes this fragment and is not visible).
 * s* are the symmetric sums, d* the antisymmetric differences; >>1 / >>2
 * implement the 1.5x and 0.25x tap weights.
 */
2171 const int s07 = SRC(0) + SRC(7);\
2172 const int s16 = SRC(1) + SRC(6);\
2173 const int s25 = SRC(2) + SRC(5);\
2174 const int s34 = SRC(3) + SRC(4);\
2175 const int a0 = s07 + s34;\
2176 const int a1 = s16 + s25;\
2177 const int a2 = s07 - s34;\
2178 const int a3 = s16 - s25;\
2179 const int d07 = SRC(0) - SRC(7);\
2180 const int d16 = SRC(1) - SRC(6);\
2181 const int d25 = SRC(2) - SRC(5);\
2182 const int d34 = SRC(3) - SRC(4);\
2183 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2184 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2185 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2186 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2188 DST(1, a4 + (a7>>2)) ;\
2189 DST(2, a2 + (a3>>1)) ;\
2190 DST(3, a5 + (a6>>2)) ;\
2192 DST(5, a6 - (a5>>2)) ;\
2193 DST(6, (a2>>1) - a3 ) ;\
2194 DST(7, (a4>>2) - a7 ) ;\
/*
 * H.264-transform SAD: rows transformed in place via SRC/DST redefinition,
 * then columns transformed with DST redefined to accumulate |v| into sum.
 */
2197 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2198 MpegEncContext * const s= (MpegEncContext *)c;
2203 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2205 #define SRC(x) dct[i][x]
2206 #define DST(x,v) dct[i][x]= v
2207 for( i = 0; i < 8; i++ )
2212 #define SRC(x) dct[x][i]
2213 #define DST(x,v) sum += FFABS(v)
2214 for( i = 0; i < 8; i++ )
/* DCT-max metric: the largest |coefficient| of the transformed residual. */
2222 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2223 MpegEncContext * const s= (MpegEncContext *)c;
2224 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2229 s->dsp.diff_pixels(temp, src1, src2, stride);
2233 sum= FFMAX(sum, FFABS(temp[i]));
2238 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2239 MpegEncContext * const s= (MpegEncContext *)c;
2240 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2241 DCTELEM * const bak = temp+64;
2247 s->dsp.diff_pixels(temp, src1, src2, stride);
2249 memcpy(bak, temp, 64*sizeof(DCTELEM));
2251 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2252 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2253 ff_simple_idct_8(temp); //FIXME
2256 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2261 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2262 MpegEncContext * const s= (MpegEncContext *)c;
2263 const uint8_t *scantable= s->intra_scantable.permutated;
2264 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2265 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2266 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2267 int i, last, run, bits, level, distortion, start_i;
2268 const int esc_length= s->ac_esc_length;
2270 uint8_t * last_length;
2274 copy_block8(lsrc1, src1, 8, stride, 8);
2275 copy_block8(lsrc2, src2, 8, stride, 8);
2277 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2279 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2285 length = s->intra_ac_vlc_length;
2286 last_length= s->intra_ac_vlc_last_length;
2287 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2290 length = s->inter_ac_vlc_length;
2291 last_length= s->inter_ac_vlc_last_length;
2296 for(i=start_i; i<last; i++){
2297 int j= scantable[i];
2302 if((level&(~127)) == 0){
2303 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2312 level= temp[i] + 64;
2316 if((level&(~127)) == 0){
2317 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2325 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2327 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2330 s->dsp.idct_add(lsrc2, 8, temp);
2332 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2334 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* 8x8 bit-cost metric: same VLC cost estimation as rd8x8_c but without
 * the reconstruction/distortion part — returns only the estimated number
 * of bits needed to code the quantized difference block.
 * NOTE(review): bits/start_i/run initialization, the intra/inter branch
 * headers, escape handling and the return are elided from this view;
 * comments cover only the visible lines. */
2337 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2338 MpegEncContext * const s= (MpegEncContext *)c;
2339 const uint8_t *scantable= s->intra_scantable.permutated;
2340 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2341 int i, last, run, bits, level, start_i;
2342 const int esc_length= s->ac_esc_length;
2344 uint8_t * last_length;
/* DCT+quantize the pixel difference of the two blocks */
2348 s->dsp.diff_pixels(temp, src1, src2, stride);
2350 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: AC tables plus luma DC cost (+256 bias for negative DC) */
2356 length = s->intra_ac_vlc_length;
2357 last_length= s->intra_ac_vlc_last_length;
2358 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter path */
2361 length = s->inter_ac_vlc_length;
2362 last_length= s->inter_ac_vlc_last_length;
/* sum per-(run,level) VLC bit lengths over the scanned coefficients */
2367 for(i=start_i; i<last; i++){
2368 int j= scantable[i];
2373 if((level&(~127)) == 0){
2374 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* final coefficient uses the 'last' table, level biased by +64 */
2383 level= temp[i] + 64;
2387 if((level&(~127)) == 0){
2388 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* VSAD_INTRA(size): template generating vsad_intra{8,16}_c — sum of
 * absolute vertical first differences within a single block (each pixel
 * vs. the pixel directly below it), unrolled four pixels per step.
 * NOTE(review): the macro's prologue/epilogue lines are elided in this
 * view; no comments are placed inside the macro body so the '\'
 * continuations stay intact. */
2396 #define VSAD_INTRA(size) \
2397 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2401 for(y=1; y<h; y++){ \
2402 for(x=0; x<size; x+=4){ \
2403 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2404 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* vsad16_c: vertical SAD of the s1-s2 residual — how much the residual
 * changes from one row to the next, a cheap "vertical activity" measure.
 * NOTE(review): the loop prologue and return are elided in this view. */
2414 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2419 for(x=0; x<16; x++){
2420 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ(a): square helper for the sum-of-squared-differences metrics below. */
2429 #define SQ(a) ((a)*(a))
/* VSSE_INTRA(size): template generating vsse_intra{8,16}_c — squared
 * version of VSAD_INTRA: sum of squared vertical first differences within
 * one block, four pixels per step.
 * NOTE(review): macro prologue/epilogue elided in this view; no comments
 * are placed inside the macro body to preserve the '\' continuations. */
2430 #define VSSE_INTRA(size) \
2431 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2435 for(y=1; y<h; y++){ \
2436 for(x=0; x<size; x+=4){ \
2437 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2438 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* vsse16_c: squared-error variant of vsad16_c on the s1-s2 residual.
 * NOTE(review): loop prologue and return elided in this view. */
2448 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2453 for(x=0; x<16; x++){
2454 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 array and an int16 array of
 * 'size' elements.
 * NOTE(review): the parameter list continues on an elided line, and the
 * score declaration/return are also elided in this view. */
2463 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2467 for(i=0; i<size; i++)
2468 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Instantiate the 16x16 comparison functions from their 8x8 kernels:
 * WRAPPER8_16_SQ (defined earlier in this file) applies an 8x8 metric to
 * the four quadrants of a 16x16 block and sums the results. */
2472 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2473 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2474 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2476 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2478 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2479 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2480 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2481 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Element-wise float multiply: dst[i] = src0[i] * src1[i].
 * NOTE(review): the 'int i;' declarations and closing braces of these
 * three functions are elided in this view. */
2483 static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
2485 for(i=0; i<len; i++)
2486 dst[i] = src0[i] * src1[i];
/* Element-wise multiply with src1 read backwards from its start pointer:
 * dst[i] = src0[i] * src1[-i] (caller passes src1 pointing at the end). */
2489 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
2492 for(i=0; i<len; i++)
2493 dst[i] = src0[i] * src1[-i];
/* Fused multiply-add: dst[i] = src0[i] * src1[i] + src2[i]. */
2496 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
2498 for(i=0; i<len; i++)
2499 dst[i] = src0[i] * src1[i] + src2[i];
/* Windowed overlap-add butterfly used by MDCT-based audio codecs:
 * combines two sample streams under the window 'win', writing the result
 * symmetrically outward from the center of dst.
 * NOTE(review): the pointer-recentering statements and the loads of
 * s0/s1/wi/wj inside the loop are elided in this view — the loop indexes
 * i from -len and j from len-1 toward the middle. */
2502 static void vector_fmul_window_c(float *dst, const float *src0,
2503 const float *src1, const float *win, int len)
2509 for(i=-len, j=len-1; i<0; i++, j--) {
/* mirrored pair of outputs per iteration: rotation by the window coeffs */
2514 dst[i] = s0*wj - s1*wi;
2515 dst[j] = s0*wi + s1*wj;
/* Scale a float vector by a scalar: dst[i] = src[i] * mul.
 * NOTE(review): remaining parameters ('int len'), the 'int i;'
 * declarations and closing braces are on elided lines in this view. */
2519 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
2523 for (i = 0; i < len; i++)
2524 dst[i] = src[i] * mul;
/* Multiply-accumulate a scaled vector into dst: dst[i] += src[i] * mul. */
2527 static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
2531 for (i = 0; i < len; i++)
2532 dst[i] += src[i] * mul;
/* In-place butterfly on two vectors: per element, v1 becomes the sum and
 * v2 the difference. 'restrict' promises the caller that v1/v2 don't alias.
 * NOTE(review): the statements that store v1[i]+v2[i] and 't' back into
 * the vectors are elided in this view. */
2535 static void butterflies_float_c(float *restrict v1, float *restrict v2,
2539 for (i = 0; i < len; i++) {
2540 float t = v1[i] - v2[i];
/* Butterfly of two source vectors with interleaved output:
 * dst[2i] = src0[i]+src1[i], dst[2i+1] = src0[i]-src1[i].
 * NOTE(review): the loads of f1/f2 from src0/src1 are on elided lines. */
2546 static void butterflies_float_interleave_c(float *dst, const float *src0,
2547 const float *src1, int len)
2550 for (i = 0; i < len; i++) {
2553 dst[2*i ] = f1 + f2;
2554 dst[2*i + 1] = f1 - f2;
/* Dot product of two float vectors of length 'len'.
 * NOTE(review): the accumulator declaration, the multiply-accumulate
 * statement and the return are elided in this view. */
2558 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2563 for (i = 0; i < len; i++)
/* Clip one float, operating on its IEEE-754 bit pattern (passed around as
 * uint32_t to avoid float compares): used only when min < 0 < max, so the
 * sign bit alone separates the two clip directions.  'maxisign' is the
 * max bound with its sign bit flipped, precomputed by the caller.
 * NOTE(review): the final 'return a;' fall-through is on an elided line. */
2569 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2570 uint32_t maxi, uint32_t maxisign)
/* negative values (bit pattern numerically above 'mini') clip to min;
 * positive values are compared after flipping the sign bit */
2573 if(a > mini) return mini;
2574 else if((a^(1U<<31)) > maxisign) return maxi;
/* Clip a float vector to [min, max] for the min<0<max case, working on
 * raw bit patterns via clipf_c_one.  Unrolled 8x — callers must pass a
 * len that is a multiple of 8.
 * NOTE(review): the pointer casts from float* to uint32_t* rely on
 * type-punning that formally violates strict aliasing; presumably FFmpeg
 * builds with aliasing-safe flags — noted for review, not changed here. */
2578 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2580 uint32_t mini = *(uint32_t*)min;
2581 uint32_t maxi = *(uint32_t*)max;
/* precompute max with the sign bit flipped for clipf_c_one's compare */
2582 uint32_t maxisign = maxi ^ (1U<<31);
2583 uint32_t *dsti = (uint32_t*)dst;
2584 const uint32_t *srci = (const uint32_t*)src;
2585 for(i=0; i<len; i+=8) {
2586 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2587 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2588 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2589 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2590 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2591 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2592 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2593 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip each element of a float vector to [min, max], 8x unrolled.
 * When the range straddles zero, the bit-pattern fast path
 * (vector_clipf_c_opposite_sign) is taken; otherwise av_clipf is used.
 * NOTE(review): the 'else {' between the two paths and the closing braces
 * are on elided lines in this view. */
2596 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2598 if(min < 0 && max > 0) {
2599 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2601 for(i=0; i < len; i+=8) {
2602 dst[i ] = av_clipf(src[i ], min, max);
2603 dst[i + 1] = av_clipf(src[i + 1], min, max);
2604 dst[i + 2] = av_clipf(src[i + 2], min, max);
2605 dst[i + 3] = av_clipf(src[i + 3], min, max);
2606 dst[i + 4] = av_clipf(src[i + 4], min, max);
2607 dst[i + 5] = av_clipf(src[i + 5], min, max);
2608 dst[i + 6] = av_clipf(src[i + 6], min, max);
2609 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Dot product of two int16 vectors, each product right-shifted by 'shift'
 * before accumulation into a 32-bit result.
 * NOTE(review): accumulator declarations, loop headers and returns of
 * both functions are on elided lines in this view. */
2614 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
2619 res += (*v1++ * *v2++) >> shift;
/* Combined op: accumulates v1.v2 while also updating v1 in place with
 * v1[i] += mul * v3[i] (used by e.g. the FLAC/ATRAC-style LPC paths). */
2624 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2629 *v1++ += mul * *v3++;
/* Apply a symmetric int16 window to an int16 signal: the first half of
 * 'window' is reused for the mirrored second half of 'input'.  Products
 * are Q15: rounded with +2^14 then shifted right by 15. */
2634 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2635 const int16_t *window, unsigned int len)
2638 int len2 = len >> 1;
2640 for (i = 0; i < len2; i++) {
2641 int16_t w = window[i];
/* same coefficient applied to the i-th and (len-1-i)-th samples */
2642 output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2643 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/* Clip each element of an int32 vector to [min, max], unrolled 8 elements
 * per iteration — callers must supply len as a multiple of 8.
 * NOTE(review): the surrounding do/while (or for) loop header and closing
 * brace are on elided lines in this view. */
2647 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2648 int32_t max, unsigned int len)
2651 *dst++ = av_clip(*src++, min, max);
2652 *dst++ = av_clip(*src++, min, max);
2653 *dst++ = av_clip(*src++, min, max);
2654 *dst++ = av_clip(*src++, min, max);
2655 *dst++ = av_clip(*src++, min, max);
2656 *dst++ = av_clip(*src++, min, max);
2657 *dst++ = av_clip(*src++, min, max);
2658 *dst++ = av_clip(*src++, min, max);
/* Fixed-point DCT basis constants: round(2048*sqrt(2)*cos(k*pi/16)).
 * NOTE(review): W0 is also used below but its #define is on an elided
 * line (presumably 2048*sqrt(2)*cos(0) scaled the same way — confirm). */
2664 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2665 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2666 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2667 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2668 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2669 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2670 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/* One 8-point row pass of the WMV2 inverse DCT: classic even/odd
 * butterfly decomposition with 8-bit rounding (+128, >>8) on output.
 * NOTE(review): the s1/s2 declarations are among a0..a7 on visible line
 * 2675; intermediate elided lines may hold further statements. */
2672 static void wmv2_idct_row(short * b)
2675 int a0,a1,a2,a3,a4,a5,a6,a7;
/* odd-part rotations (coefficients 1,3,5,7) */
2677 a1 = W1*b[1]+W7*b[7];
2678 a7 = W7*b[1]-W1*b[7];
2679 a5 = W5*b[5]+W3*b[3];
2680 a3 = W3*b[5]-W5*b[3];
/* even-part rotations (coefficients 2,6) and DC/Nyquist sums */
2681 a2 = W2*b[2]+W6*b[6];
2682 a6 = W6*b[2]-W2*b[6];
2683 a0 = W0*b[0]+W0*b[4];
2684 a4 = W0*b[0]-W0*b[4];
/* 181/256 ~= 1/sqrt(2): combine the odd terms for outputs 1,2,5,6 */
2686 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2687 s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* final butterflies, rounded back down by 8 bits */
2689 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2690 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2691 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2692 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2693 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2694 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2695 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2696 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One 8-point column pass of the WMV2 inverse DCT (stride 8 between
 * taps).  Same butterfly structure as the row pass, but inputs are
 * pre-rounded/shifted by 3 bits ("extended precision") and the final
 * outputs are normalized with (+2^13, >>14).
 * NOTE(review): the s1/s2 declarations and any statements on elided
 * lines are not visible in this view. */
2698 static void wmv2_idct_col(short * b)
2701 int a0,a1,a2,a3,a4,a5,a6,a7;
2702 /*step 1, with extended precision*/
/* odd-part rotations on rows 1,3,5,7 of the column */
2703 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2704 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2705 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2706 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
/* even-part rotations and DC/Nyquist sums */
2707 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2708 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2709 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2710 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
/* 181/256 ~= 1/sqrt(2) combination of the odd terms */
2712 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2713 s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* final butterflies, rounded with 14-bit normalization */
2715 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2716 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2717 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2718 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2720 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2721 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2722 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2723 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 8x8 WMV2 inverse DCT: row pass over all 8 rows, then column pass
 * over all 8 columns, in place on 'block'.
 * NOTE(review): the two for-loop headers driving 'i' are on elided lines
 * in this view. */
2725 void ff_wmv2_idct_c(short * block){
2729 wmv2_idct_row(block+i);
2732 wmv2_idct_col(block+i);
2735 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* idct_put glue: WMV2 IDCT then clamp-and-store the block into dest. */
2737 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2739 ff_wmv2_idct_c(block);
2740 ff_put_pixels_clamped_c(block, dest, line_size);
/* idct_add glue: WMV2 IDCT then clamp-and-add the block onto dest. */
2742 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2744 ff_wmv2_idct_c(block);
2745 ff_add_pixels_clamped_c(block, dest, line_size);
/* JREF (reference integer) IDCT glue, plus the downscaled 4x4 / 2x2 / 1x1
 * variants used for lowres decoding.  Each _put stores the clamped result
 * into dest; each _add accumulates it onto dest.
 * NOTE(review): the j_rev_dct* IDCT calls inside these wrappers are on
 * elided lines in this view; only the store/add calls are visible. */
2747 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2750 ff_put_pixels_clamped_c(block, dest, line_size);
2752 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2755 ff_add_pixels_clamped_c(block, dest, line_size);
2758 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2761 put_pixels_clamped4_c(block, dest, line_size);
2763 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2766 add_pixels_clamped4_c(block, dest, line_size);
2769 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2772 put_pixels_clamped2_c(block, dest, line_size);
2774 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2777 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 lowres: only the DC coefficient survives — round it ((+4)>>3) and
 * clamp through the crop table. */
2780 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2782 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2784 dest[0] = cm[(block[0] + 4)>>3];
2786 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2788 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2790 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
/* No-op stand-in for DSPContext.prefetch on platforms without an
 * architecture-specific prefetch implementation. */
2793 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2795 /* init static data */
/* Build the process-wide lookup tables declared at the top of this file:
 * ff_cropTbl (clamp-to-[0,255] with MAX_NEG_CROP guard bands on both
 * sides), ff_squareTbl ((i-256)^2 for SSE metrics), and the inverse
 * zig-zag permutation.
 * NOTE(review): the loop body filling the low guard band (values 0) and
 * some closing braces are on elided lines in this view. */
2796 av_cold void dsputil_static_init(void)
/* identity in the middle: cropTbl[x + MAX_NEG_CROP] == x for 0..255 */
2800 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2801 for(i=0;i<MAX_NEG_CROP;i++) {
/* high guard band saturates to 255 */
2803 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2806 for(i=0;i<512;i++) {
2807 ff_squareTbl[i] = (i - 256) * (i - 256);
/* +1 bias so that 0 can mean "not present" in inv_zigzag_direct16 */
2810 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Verify that the compiler actually honors 16-byte stack alignment for
 * LOCAL_ALIGNED_16 locals (miscompilation would crash SIMD code).  Emits
 * a one-shot error log on MMX/AltiVec builds when misaligned.
 * NOTE(review): the #endif, the did_fail update and the return statement
 * are on elided lines in this view. */
2813 int ff_check_alignment(void){
/* one-shot latch so the warning is not spammed on every call */
2814 static int did_fail=0;
2815 LOCAL_ALIGNED_16(int, aligned, [4]);
2817 if((intptr_t)aligned & 15){
2819 #if HAVE_MMX || HAVE_ALTIVEC
2820 av_log(NULL, AV_LOG_ERROR,
2821 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2822 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2823 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2824 "Do not report crashes to FFmpeg developers.\n");
2833 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2837 ff_check_alignment();
2840 if (avctx->bits_per_raw_sample == 10) {
2841 c->fdct = ff_jpeg_fdct_islow_10;
2842 c->fdct248 = ff_fdct248_islow_10;
2844 if(avctx->dct_algo==FF_DCT_FASTINT) {
2845 c->fdct = fdct_ifast;
2846 c->fdct248 = fdct_ifast248;
2848 else if(avctx->dct_algo==FF_DCT_FAAN) {
2849 c->fdct = ff_faandct;
2850 c->fdct248 = ff_faandct248;
2853 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2854 c->fdct248 = ff_fdct248_islow_8;
2857 #endif //CONFIG_ENCODERS
2859 if(avctx->lowres==1){
2860 c->idct_put= ff_jref_idct4_put;
2861 c->idct_add= ff_jref_idct4_add;
2862 c->idct = j_rev_dct4;
2863 c->idct_permutation_type= FF_NO_IDCT_PERM;
2864 }else if(avctx->lowres==2){
2865 c->idct_put= ff_jref_idct2_put;
2866 c->idct_add= ff_jref_idct2_add;
2867 c->idct = j_rev_dct2;
2868 c->idct_permutation_type= FF_NO_IDCT_PERM;
2869 }else if(avctx->lowres==3){
2870 c->idct_put= ff_jref_idct1_put;
2871 c->idct_add= ff_jref_idct1_add;
2872 c->idct = j_rev_dct1;
2873 c->idct_permutation_type= FF_NO_IDCT_PERM;
2875 if (avctx->bits_per_raw_sample == 10) {
2876 c->idct_put = ff_simple_idct_put_10;
2877 c->idct_add = ff_simple_idct_add_10;
2878 c->idct = ff_simple_idct_10;
2879 c->idct_permutation_type = FF_NO_IDCT_PERM;
2881 if(avctx->idct_algo==FF_IDCT_INT){
2882 c->idct_put= ff_jref_idct_put;
2883 c->idct_add= ff_jref_idct_add;
2884 c->idct = j_rev_dct;
2885 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2886 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2887 avctx->idct_algo==FF_IDCT_VP3){
2888 c->idct_put= ff_vp3_idct_put_c;
2889 c->idct_add= ff_vp3_idct_add_c;
2890 c->idct = ff_vp3_idct_c;
2891 c->idct_permutation_type= FF_NO_IDCT_PERM;
2892 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2893 c->idct_put= ff_wmv2_idct_put_c;
2894 c->idct_add= ff_wmv2_idct_add_c;
2895 c->idct = ff_wmv2_idct_c;
2896 c->idct_permutation_type= FF_NO_IDCT_PERM;
2897 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2898 c->idct_put= ff_faanidct_put;
2899 c->idct_add= ff_faanidct_add;
2900 c->idct = ff_faanidct;
2901 c->idct_permutation_type= FF_NO_IDCT_PERM;
2902 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2903 c->idct_put= ff_ea_idct_put_c;
2904 c->idct_permutation_type= FF_NO_IDCT_PERM;
2905 }else{ //accurate/default
2906 c->idct_put = ff_simple_idct_put_8;
2907 c->idct_add = ff_simple_idct_add_8;
2908 c->idct = ff_simple_idct_8;
2909 c->idct_permutation_type= FF_NO_IDCT_PERM;
2914 c->diff_pixels = diff_pixels_c;
2915 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2916 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2917 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2918 c->sum_abs_dctelem = sum_abs_dctelem_c;
2921 c->pix_sum = pix_sum_c;
2922 c->pix_norm1 = pix_norm1_c;
2924 c->fill_block_tab[0] = fill_block16_c;
2925 c->fill_block_tab[1] = fill_block8_c;
2927 /* TODO [0] 16 [1] 8 */
2928 c->pix_abs[0][0] = pix_abs16_c;
2929 c->pix_abs[0][1] = pix_abs16_x2_c;
2930 c->pix_abs[0][2] = pix_abs16_y2_c;
2931 c->pix_abs[0][3] = pix_abs16_xy2_c;
2932 c->pix_abs[1][0] = pix_abs8_c;
2933 c->pix_abs[1][1] = pix_abs8_x2_c;
2934 c->pix_abs[1][2] = pix_abs8_y2_c;
2935 c->pix_abs[1][3] = pix_abs8_xy2_c;
2937 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2938 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2939 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2940 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2941 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2942 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2943 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2944 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2945 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2947 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2948 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2949 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2950 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2951 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2952 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2953 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2954 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2955 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2957 #define dspfunc(PFX, IDX, NUM) \
2958 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2959 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2960 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2961 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2962 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2963 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2964 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2965 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2966 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2967 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2968 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2969 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2970 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2971 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2972 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2973 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2975 dspfunc(put_qpel, 0, 16);
2976 dspfunc(put_no_rnd_qpel, 0, 16);
2978 dspfunc(avg_qpel, 0, 16);
2979 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2981 dspfunc(put_qpel, 1, 8);
2982 dspfunc(put_no_rnd_qpel, 1, 8);
2984 dspfunc(avg_qpel, 1, 8);
2985 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2989 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2990 ff_mlp_init(c, avctx);
2992 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2993 ff_intrax8dsp_init(c,avctx);
2996 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2997 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2998 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2999 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3000 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3001 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3002 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3003 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3005 #define SET_CMP_FUNC(name) \
3006 c->name[0]= name ## 16_c;\
3007 c->name[1]= name ## 8x8_c;
3009 SET_CMP_FUNC(hadamard8_diff)
3010 c->hadamard8_diff[4]= hadamard8_intra16_c;
3011 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3012 SET_CMP_FUNC(dct_sad)
3013 SET_CMP_FUNC(dct_max)
3015 SET_CMP_FUNC(dct264_sad)
3017 c->sad[0]= pix_abs16_c;
3018 c->sad[1]= pix_abs8_c;
3022 SET_CMP_FUNC(quant_psnr)
3025 c->vsad[0]= vsad16_c;
3026 c->vsad[4]= vsad_intra16_c;
3027 c->vsad[5]= vsad_intra8_c;
3028 c->vsse[0]= vsse16_c;
3029 c->vsse[4]= vsse_intra16_c;
3030 c->vsse[5]= vsse_intra8_c;
3031 c->nsse[0]= nsse16_c;
3032 c->nsse[1]= nsse8_c;
3034 ff_dsputil_init_dwt(c);
3037 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3039 c->add_bytes= add_bytes_c;
3040 c->diff_bytes= diff_bytes_c;
3041 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3042 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3043 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
3044 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3045 c->bswap_buf= bswap_buf;
3046 c->bswap16_buf = bswap16_buf;
3048 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3049 c->h263_h_loop_filter= h263_h_loop_filter_c;
3050 c->h263_v_loop_filter= h263_v_loop_filter_c;
3053 if (CONFIG_VP3_DECODER) {
3054 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3055 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3056 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3059 c->h261_loop_filter= h261_loop_filter_c;
3061 c->try_8x8basis= try_8x8basis_c;
3062 c->add_8x8basis= add_8x8basis_c;
3064 #if CONFIG_VORBIS_DECODER
3065 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3067 #if CONFIG_AC3_DECODER
3068 c->ac3_downmix = ff_ac3_downmix_c;
3070 c->vector_fmul = vector_fmul_c;
3071 c->vector_fmul_reverse = vector_fmul_reverse_c;
3072 c->vector_fmul_add = vector_fmul_add_c;
3073 c->vector_fmul_window = vector_fmul_window_c;
3074 c->vector_clipf = vector_clipf_c;
3075 c->scalarproduct_int16 = scalarproduct_int16_c;
3076 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3077 c->apply_window_int16 = apply_window_int16_c;
3078 c->vector_clip_int32 = vector_clip_int32_c;
3079 c->scalarproduct_float = scalarproduct_float_c;
3080 c->butterflies_float = butterflies_float_c;
3081 c->butterflies_float_interleave = butterflies_float_interleave_c;
3082 c->vector_fmul_scalar = vector_fmul_scalar_c;
3083 c->vector_fmac_scalar = vector_fmac_scalar_c;
3085 c->shrink[0]= av_image_copy_plane;
3086 c->shrink[1]= ff_shrink22;
3087 c->shrink[2]= ff_shrink44;
3088 c->shrink[3]= ff_shrink88;
3090 c->prefetch= just_return;
3092 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3093 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3097 #define FUNC(f, depth) f ## _ ## depth
3098 #define FUNCC(f, depth) f ## _ ## depth ## _c
3100 #define dspfunc1(PFX, IDX, NUM, depth)\
3101 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
3102 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3103 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3104 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3106 #define dspfunc2(PFX, IDX, NUM, depth)\
3107 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3108 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3109 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3110 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3111 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3112 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3113 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3114 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3115 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3116 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3117 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3118 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3119 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3120 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3121 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3122 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3125 #define BIT_DEPTH_FUNCS(depth, dct)\
3126 c->get_pixels = FUNCC(get_pixels ## dct , depth);\
3127 c->draw_edges = FUNCC(draw_edges , depth);\
3128 c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
3129 c->clear_block = FUNCC(clear_block ## dct , depth);\
3130 c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\
3131 c->add_pixels8 = FUNCC(add_pixels8 ## dct , depth);\
3132 c->add_pixels4 = FUNCC(add_pixels4 ## dct , depth);\
3133 c->put_no_rnd_pixels_l2[0] = FUNCC(put_no_rnd_pixels16_l2, depth);\
3134 c->put_no_rnd_pixels_l2[1] = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3136 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3137 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3138 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3139 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3140 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3141 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3143 dspfunc1(put , 0, 16, depth);\
3144 dspfunc1(put , 1, 8, depth);\
3145 dspfunc1(put , 2, 4, depth);\
3146 dspfunc1(put , 3, 2, depth);\
3147 dspfunc1(put_no_rnd, 0, 16, depth);\
3148 dspfunc1(put_no_rnd, 1, 8, depth);\
3149 dspfunc1(avg , 0, 16, depth);\
3150 dspfunc1(avg , 1, 8, depth);\
3151 dspfunc1(avg , 2, 4, depth);\
3152 dspfunc1(avg , 3, 2, depth);\
3153 dspfunc1(avg_no_rnd, 0, 16, depth);\
3154 dspfunc1(avg_no_rnd, 1, 8, depth);\
3156 dspfunc2(put_h264_qpel, 0, 16, depth);\
3157 dspfunc2(put_h264_qpel, 1, 8, depth);\
3158 dspfunc2(put_h264_qpel, 2, 4, depth);\
3159 dspfunc2(put_h264_qpel, 3, 2, depth);\
3160 dspfunc2(avg_h264_qpel, 0, 16, depth);\
3161 dspfunc2(avg_h264_qpel, 1, 8, depth);\
3162 dspfunc2(avg_h264_qpel, 2, 4, depth);
3164 switch (avctx->bits_per_raw_sample) {
3166 if (c->dct_bits == 32) {
3167 BIT_DEPTH_FUNCS(9, _32);
3169 BIT_DEPTH_FUNCS(9, _16);
3173 if (c->dct_bits == 32) {
3174 BIT_DEPTH_FUNCS(10, _32);
3176 BIT_DEPTH_FUNCS(10, _16);
3180 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3182 BIT_DEPTH_FUNCS(8, _16);
3187 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
3188 if (ARCH_ARM) dsputil_init_arm (c, avctx);
3189 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
3190 if (HAVE_VIS) dsputil_init_vis (c, avctx);
3191 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
3192 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
3193 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
3194 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
3195 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
3197 for(i=0; i<64; i++){
3198 if(!c->put_2tap_qpel_pixels_tab[0][i])
3199 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3200 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3201 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3204 ff_init_scantable_permutation(c->idct_permutation,
3205 c->idct_permutation_type);