3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square lookup table for [-256, 255]; indexed via (ff_squareTbl + 256)[x]. Filled at init. */
uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_template.c"
51 #include "dsputil_template.c"
55 #include "dsputil_template.c"
/* Every byte of a native machine word set to 0x7f / 0x80:
 * 0x7f7f7f7f on a 32-bit unsigned long, 0x7f7f7f7f7f7f7f7f on 64-bit. */
#define pb_7f (~0UL / 255 * 0x7f)
#define pb_80 (~0UL / 255 * 0x80)
/* Standard (MPEG/JPEG) zigzag scan order for an 8x8 block. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for the 2-4-8 IDCT. NOTE: unlike the
   specification, the two fields are interleaved. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
85 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
86 DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (used e.g. for interlaced coding). */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (used e.g. for interlaced coding). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx implementation. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Row permutation applied by the SSE2 IDCT within each group of 8 coefficients. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
128 st->scantable= src_scantable;
132 j = src_scantable[i];
133 st->permutated[i] = permutation[j];
139 j = st->permutated[i];
141 st->raster_end[i]= end;
145 void ff_init_scantable_permutation(uint8_t *idct_permutation,
146 int idct_permutation_type)
150 switch(idct_permutation_type){
151 case FF_NO_IDCT_PERM:
153 idct_permutation[i]= i;
155 case FF_LIBMPEG2_IDCT_PERM:
157 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
159 case FF_SIMPLE_IDCT_PERM:
161 idct_permutation[i]= simple_mmx_permutation[i];
163 case FF_TRANSPOSE_IDCT_PERM:
165 idct_permutation[i]= ((i&7)<<3) | (i>>3);
167 case FF_PARTTRANS_IDCT_PERM:
169 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
171 case FF_SSE2_IDCT_PERM:
173 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
176 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/**
 * Sum of all 256 pixels of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size stride between rows, in bytes
 * @return sum of pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;  /* advance to the next row */
    }
    return s;
}
202 static int pix_norm1_c(uint8_t * pix, int line_size)
205 uint32_t *sq = ff_squareTbl + 256;
208 for (i = 0; i < 16; i++) {
209 for (j = 0; j < 16; j += 8) {
221 register uint64_t x=*(uint64_t*)pix;
223 s += sq[(x>>8)&0xff];
224 s += sq[(x>>16)&0xff];
225 s += sq[(x>>24)&0xff];
226 s += sq[(x>>32)&0xff];
227 s += sq[(x>>40)&0xff];
228 s += sq[(x>>48)&0xff];
229 s += sq[(x>>56)&0xff];
231 register uint32_t x=*(uint32_t*)pix;
233 s += sq[(x>>8)&0xff];
234 s += sq[(x>>16)&0xff];
235 s += sq[(x>>24)&0xff];
236 x=*(uint32_t*)(pix+4);
238 s += sq[(x>>8)&0xff];
239 s += sq[(x>>16)&0xff];
240 s += sq[(x>>24)&0xff];
245 pix += line_size - 16;
/**
 * Byte-swap a buffer of 32-bit words, unrolled by 8 with a scalar tail.
 * dst and src may be the same buffer.
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= av_bswap32(src[i+0]);
    }
}
/* Byte-swap a buffer of len 16-bit values; dst and src may alias. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
274 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
277 uint32_t *sq = ff_squareTbl + 256;
280 for (i = 0; i < h; i++) {
281 s += sq[pix1[0] - pix2[0]];
282 s += sq[pix1[1] - pix2[1]];
283 s += sq[pix1[2] - pix2[2]];
284 s += sq[pix1[3] - pix2[3]];
291 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
294 uint32_t *sq = ff_squareTbl + 256;
297 for (i = 0; i < h; i++) {
298 s += sq[pix1[0] - pix2[0]];
299 s += sq[pix1[1] - pix2[1]];
300 s += sq[pix1[2] - pix2[2]];
301 s += sq[pix1[3] - pix2[3]];
302 s += sq[pix1[4] - pix2[4]];
303 s += sq[pix1[5] - pix2[5]];
304 s += sq[pix1[6] - pix2[6]];
305 s += sq[pix1[7] - pix2[7]];
312 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
315 uint32_t *sq = ff_squareTbl + 256;
318 for (i = 0; i < h; i++) {
319 s += sq[pix1[ 0] - pix2[ 0]];
320 s += sq[pix1[ 1] - pix2[ 1]];
321 s += sq[pix1[ 2] - pix2[ 2]];
322 s += sq[pix1[ 3] - pix2[ 3]];
323 s += sq[pix1[ 4] - pix2[ 4]];
324 s += sq[pix1[ 5] - pix2[ 5]];
325 s += sq[pix1[ 6] - pix2[ 6]];
326 s += sq[pix1[ 7] - pix2[ 7]];
327 s += sq[pix1[ 8] - pix2[ 8]];
328 s += sq[pix1[ 9] - pix2[ 9]];
329 s += sq[pix1[10] - pix2[10]];
330 s += sq[pix1[11] - pix2[11]];
331 s += sq[pix1[12] - pix2[12]];
332 s += sq[pix1[13] - pix2[13]];
333 s += sq[pix1[14] - pix2[14]];
334 s += sq[pix1[15] - pix2[15]];
342 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
343 const uint8_t *s2, int stride){
346 /* read the pixels */
348 block[0] = s1[0] - s2[0];
349 block[1] = s1[1] - s2[1];
350 block[2] = s1[2] - s2[2];
351 block[3] = s1[3] - s2[3];
352 block[4] = s1[4] - s2[4];
353 block[5] = s1[5] - s2[5];
354 block[6] = s1[6] - s2[6];
355 block[7] = s1[7] - s2[7];
363 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
368 /* read the pixels */
370 pixels[0] = av_clip_uint8(block[0]);
371 pixels[1] = av_clip_uint8(block[1]);
372 pixels[2] = av_clip_uint8(block[2]);
373 pixels[3] = av_clip_uint8(block[3]);
374 pixels[4] = av_clip_uint8(block[4]);
375 pixels[5] = av_clip_uint8(block[5]);
376 pixels[6] = av_clip_uint8(block[6]);
377 pixels[7] = av_clip_uint8(block[7]);
384 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
389 /* read the pixels */
391 pixels[0] = av_clip_uint8(block[0]);
392 pixels[1] = av_clip_uint8(block[1]);
393 pixels[2] = av_clip_uint8(block[2]);
394 pixels[3] = av_clip_uint8(block[3]);
401 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
406 /* read the pixels */
408 pixels[0] = av_clip_uint8(block[0]);
409 pixels[1] = av_clip_uint8(block[1]);
416 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
417 uint8_t *restrict pixels,
422 for (i = 0; i < 8; i++) {
423 for (j = 0; j < 8; j++) {
426 else if (*block > 127)
429 *pixels = (uint8_t)(*block + 128);
433 pixels += (line_size - 8);
437 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
442 /* read the pixels */
444 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
445 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
446 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
447 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
448 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
449 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
450 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
451 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
457 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
462 /* read the pixels */
464 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
465 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
466 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
467 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
473 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
478 /* read the pixels */
480 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
481 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
487 static int sum_abs_dctelem_c(DCTELEM *block)
491 sum+= FFABS(block[i]);
/* Fill a 16-pixel-wide block of h rows with a constant value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/* Fill an 8-pixel-wide block of h rows with a constant value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* Rounded averages of 2 and 4 values. Arguments are parenthesized so
 * the macros expand correctly for compound expressions like avg2(a|b, c). */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/**
 * 1/16-pel bilinear interpolation for an 8-pixel-wide block (GMC with
 * one motion vector). x16/y16 are the fractional positions in 1/16 pel;
 * the four corner weights A..D sum to 256, hence the >>8 normalization.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/**
 * Global motion compensation for one 8-pixel-wide stripe of h rows.
 * (ox,oy) is the 16.16 fixed-point source position of the first pixel;
 * (dxx,dyx) advance it per output column and (dxy,dyy) per output row.
 * shift selects the sub-pel precision (s = 1<<shift), r is the rounder,
 * width/height bound the valid source area; out-of-bounds coordinates
 * are clamped to the source edge (with 1-D interpolation along the edge).
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;   /* last valid column/row index */
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically out of bounds: clamp y, interpolate in x only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally out of bounds: clamp x, interpolate in y only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* both out of bounds: clamp to the nearest corner pixel */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* Third-pel MC, no sub-pel offset: plain copy, dispatched on block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_8_c (dst, src, stride, height); break;
    case 4: put_pixels4_8_c (dst, src, stride, height); break;
    case 8: put_pixels8_8_c (dst, src, stride, height); break;
    case 16:put_pixels16_8_c(dst, src, stride, height); break;
    }
}
/* Third-pel MC, horizontal 1/3 phase: (2*a + b) / 3, via 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, horizontal 2/3 phase: (a + 2*b) / 3, via 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, vertical 1/3 phase: (2*a + b) / 3, via 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (1/3,1/3) phase: bilinear with weights 4,3,3,2 over 12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (1/3,2/3) phase: bilinear with weights 3,2,4,3 over 12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, vertical 2/3 phase: (a + 2*b) / 3, via 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (2/3,1/3) phase: bilinear with weights 3,4,2,3 over 12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (2/3,2/3) phase: bilinear with weights 2,3,3,4 over 12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC with averaging, no sub-pel offset: dispatched on block width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_8_c (dst, src, stride, height); break;
    case 4: avg_pixels4_8_c (dst, src, stride, height); break;
    case 8: avg_pixels8_8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_8_c(dst, src, stride, height); break;
    }
}
/* Third-pel MC, horizontal 1/3 phase, rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, horizontal 2/3 phase, rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, vertical 1/3 phase, rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (1/3,1/3) phase, rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (1/3,2/3) phase, rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, vertical 2/3 phase, rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (2/3,1/3) phase, rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (2/3,2/3) phase, rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
793 #define QPEL_MC(r, OPNAME, RND, OP) \
794 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
795 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
799 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
800 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
801 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
802 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
803 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
804 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
805 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
806 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
812 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
814 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
818 const int src0= src[0*srcStride];\
819 const int src1= src[1*srcStride];\
820 const int src2= src[2*srcStride];\
821 const int src3= src[3*srcStride];\
822 const int src4= src[4*srcStride];\
823 const int src5= src[5*srcStride];\
824 const int src6= src[6*srcStride];\
825 const int src7= src[7*srcStride];\
826 const int src8= src[8*srcStride];\
827 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
828 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
829 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
830 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
831 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
832 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
833 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
834 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
840 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
841 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
846 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
847 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
848 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
849 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
850 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
851 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
852 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
853 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
854 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
855 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
856 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
857 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
858 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
859 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
860 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
861 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
867 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
868 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
873 const int src0= src[0*srcStride];\
874 const int src1= src[1*srcStride];\
875 const int src2= src[2*srcStride];\
876 const int src3= src[3*srcStride];\
877 const int src4= src[4*srcStride];\
878 const int src5= src[5*srcStride];\
879 const int src6= src[6*srcStride];\
880 const int src7= src[7*srcStride];\
881 const int src8= src[8*srcStride];\
882 const int src9= src[9*srcStride];\
883 const int src10= src[10*srcStride];\
884 const int src11= src[11*srcStride];\
885 const int src12= src[12*srcStride];\
886 const int src13= src[13*srcStride];\
887 const int src14= src[14*srcStride];\
888 const int src15= src[15*srcStride];\
889 const int src16= src[16*srcStride];\
890 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
891 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
892 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
893 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
894 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
895 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
896 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
897 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
898 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
899 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
900 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
901 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
902 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
903 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
904 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
905 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
911 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
913 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
914 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
917 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
918 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
921 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
923 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
924 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
927 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
930 copy_block9(full, src, 16, stride, 9);\
931 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
932 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
935 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
937 copy_block9(full, src, 16, stride, 9);\
938 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
941 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
944 copy_block9(full, src, 16, stride, 9);\
945 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
946 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
948 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
953 copy_block9(full, src, 16, stride, 9);\
954 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
955 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
956 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
957 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
959 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
963 copy_block9(full, src, 16, stride, 9);\
964 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
965 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
966 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
967 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
969 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
974 copy_block9(full, src, 16, stride, 9);\
975 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
976 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
977 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
978 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
980 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
984 copy_block9(full, src, 16, stride, 9);\
985 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
986 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
987 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
988 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
990 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
995 copy_block9(full, src, 16, stride, 9);\
996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
997 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
998 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
999 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1001 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1002 uint8_t full[16*9];\
1004 uint8_t halfHV[64];\
1005 copy_block9(full, src, 16, stride, 9);\
1006 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1007 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1008 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1009 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1011 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1012 uint8_t full[16*9];\
1015 uint8_t halfHV[64];\
1016 copy_block9(full, src, 16, stride, 9);\
1017 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1018 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1019 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1020 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1022 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1023 uint8_t full[16*9];\
1025 uint8_t halfHV[64];\
1026 copy_block9(full, src, 16, stride, 9);\
1027 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1028 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1029 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1030 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1032 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1034 uint8_t halfHV[64];\
1035 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1036 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1037 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1039 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1041 uint8_t halfHV[64];\
1042 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1043 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1044 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1046 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1047 uint8_t full[16*9];\
1050 uint8_t halfHV[64];\
1051 copy_block9(full, src, 16, stride, 9);\
1052 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1053 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1054 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1055 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1057 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1058 uint8_t full[16*9];\
1060 copy_block9(full, src, 16, stride, 9);\
1061 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1062 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1063 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1065 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1066 uint8_t full[16*9];\
1069 uint8_t halfHV[64];\
1070 copy_block9(full, src, 16, stride, 9);\
1071 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1072 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1073 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1074 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1076 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1077 uint8_t full[16*9];\
1079 copy_block9(full, src, 16, stride, 9);\
1080 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1081 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1082 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1084 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1086 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1087 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1090 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1092 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1093 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1096 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1097 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1100 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1102 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1103 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1106 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1107 uint8_t full[24*17];\
1109 copy_block17(full, src, 24, stride, 17);\
1110 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1111 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1114 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1115 uint8_t full[24*17];\
1116 copy_block17(full, src, 24, stride, 17);\
1117 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1120 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1121 uint8_t full[24*17];\
1123 copy_block17(full, src, 24, stride, 17);\
1124 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1125 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1127 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1128 uint8_t full[24*17];\
1129 uint8_t halfH[272];\
1130 uint8_t halfV[256];\
1131 uint8_t halfHV[256];\
1132 copy_block17(full, src, 24, stride, 17);\
1133 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1134 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1135 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1136 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1138 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1139 uint8_t full[24*17];\
1140 uint8_t halfH[272];\
1141 uint8_t halfHV[256];\
1142 copy_block17(full, src, 24, stride, 17);\
1143 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1144 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1145 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1146 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1148 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1149 uint8_t full[24*17];\
1150 uint8_t halfH[272];\
1151 uint8_t halfV[256];\
1152 uint8_t halfHV[256];\
1153 copy_block17(full, src, 24, stride, 17);\
1154 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1155 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1156 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1157 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1159 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1160 uint8_t full[24*17];\
1161 uint8_t halfH[272];\
1162 uint8_t halfHV[256];\
1163 copy_block17(full, src, 24, stride, 17);\
1164 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1165 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1166 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1167 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1169 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1170 uint8_t full[24*17];\
1171 uint8_t halfH[272];\
1172 uint8_t halfV[256];\
1173 uint8_t halfHV[256];\
1174 copy_block17(full, src, 24, stride, 17);\
1175 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1176 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1177 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1178 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1180 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1181 uint8_t full[24*17];\
1182 uint8_t halfH[272];\
1183 uint8_t halfHV[256];\
1184 copy_block17(full, src, 24, stride, 17);\
1185 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1186 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1187 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1188 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1190 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1191 uint8_t full[24*17];\
1192 uint8_t halfH[272];\
1193 uint8_t halfV[256];\
1194 uint8_t halfHV[256];\
1195 copy_block17(full, src, 24, stride, 17);\
1196 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1197 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1198 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1199 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1201 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1202 uint8_t full[24*17];\
1203 uint8_t halfH[272];\
1204 uint8_t halfHV[256];\
1205 copy_block17(full, src, 24, stride, 17);\
1206 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1207 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1208 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1209 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1211 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1212 uint8_t halfH[272];\
1213 uint8_t halfHV[256];\
1214 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1215 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1216 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1218 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1219 uint8_t halfH[272];\
1220 uint8_t halfHV[256];\
1221 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1222 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1223 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1225 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1226 uint8_t full[24*17];\
1227 uint8_t halfH[272];\
1228 uint8_t halfV[256];\
1229 uint8_t halfHV[256];\
1230 copy_block17(full, src, 24, stride, 17);\
1231 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1232 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1233 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1234 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1236 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1237 uint8_t full[24*17];\
1238 uint8_t halfH[272];\
1239 copy_block17(full, src, 24, stride, 17);\
1240 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1241 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1242 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1244 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1245 uint8_t full[24*17];\
1246 uint8_t halfH[272];\
1247 uint8_t halfV[256];\
1248 uint8_t halfHV[256];\
1249 copy_block17(full, src, 24, stride, 17);\
1250 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1251 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1252 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1253 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1255 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1256 uint8_t full[24*17];\
1257 uint8_t halfH[272];\
1258 copy_block17(full, src, 24, stride, 17);\
1259 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1260 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1261 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1263 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1264 uint8_t halfH[272];\
1265 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1266 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel-store operators plugged into the QPEL_MC() template (defined earlier
 * in this file): each maps the raw filter sum b through the clip table cm with
 * (+16)>>5 rounding, or (+15)>>5 for the no-rounding ("no_rnd") variants. */
1269 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1270 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1271 #define op_put(a, b) a = cm[((b) + 16)>>5]
1272 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the quarter-pel MC function families: put, put_no_rnd, avg. */
1274 QPEL_MC(0, put_ , _ , op_put)
1275 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1276 QPEL_MC(0, avg_ , _ , op_avg)
1277 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1279 #undef op_avg_no_rnd
1281 #undef op_put_no_rnd
/* The (0,0) quarter-pel position needs no filtering; alias the mc00 cases to
 * the plain copy/average helpers. */
1283 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1284 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1285 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1286 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1287 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
/* NOTE(review): this alias alone uses the "_8_c" suffixed helper while the
 * five above do not — verify the target name is intentional. */
1288 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
/* WMV2 horizontal half-pel lowpass: 4-tap (-1, 9, 9, -1)/16 filter with +8
 * rounding, clipped through cm. Reads src[-1]..src[9], writes 8 pixels per
 * row for h rows.
 * NOTE(review): the embedded numbering jumps 1291->1295 and stops at 1302 —
 * the per-row loop header, the dst/src stride advance and the closing brace
 * are elided from this listing. */
1290 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1291 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1295 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1296 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1297 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1298 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1299 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1300 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1301 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1302 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* RV40 (3,3) quarter-pel positions: the codec defines these as the plain
 * diagonal half-pel average, so they forward to the generic xy2 helpers.
 * (Closing braces of each wrapper are elided from this listing.) */
1308 #if CONFIG_RV40_DECODER
1309 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1310 put_pixels16_xy2_8_c(dst, src, stride, 16);
1312 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1313 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1315 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1316 put_pixels8_xy2_8_c(dst, src, stride, 8);
1318 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1319 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1321 #endif /* CONFIG_RV40_DECODER */
/* Dirac MC wrappers: expand, per OPNAME, pixel copy / 2-source average (l2) /
 * 4-source average (l4) entry points for 8-, 16- and 32-pixel widths; the
 * 32-wide versions are built from two 16-wide calls. src[] carries up to four
 * reference pointers; only src[0..3] are used here. NOTE(review): interior
 * brace lines of this macro are elided from the listing (numbering gaps), so
 * no comments are inserted inside the continuation body. */
1324 #define DIRAC_MC(OPNAME)\
1325 void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1327 OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
1329 void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1331 OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
1333 void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1335 OPNAME ## _pixels16_8_c(dst , src[0] , stride, h);\
1336 OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
1338 void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1340 OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1342 void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1344 OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1346 void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1348 OPNAME ## _pixels16_l2_8(dst , src[0] , src[1] , stride, stride, stride, h);\
1349 OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
1351 void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1353 OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1355 void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1357 OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1359 void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1361 OPNAME ## _pixels16_l4_8(dst , src[0] , src[1] , src[2] , src[3] , stride, stride, stride, stride, stride, h);\
1362 OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
/* WMV2 vertical half-pel lowpass: same (-1, 9, 9, -1)/16 filter as the
 * horizontal version but applied down a column. Reads one row above
 * (src[-srcStride]) through src[9*srcStride] and writes 8 output rows per
 * column, for w columns.
 * NOTE(review): the column loop header (lines 1370-1372) and the trailing
 * src++/dst++ advance and closing brace are elided from this listing. */
1368 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1369 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1373 const int src_1= src[ -srcStride];
1374 const int src0 = src[0 ];
1375 const int src1 = src[ srcStride];
1376 const int src2 = src[2*srcStride];
1377 const int src3 = src[3*srcStride];
1378 const int src4 = src[4*srcStride];
1379 const int src5 = src[5*srcStride];
1380 const int src6 = src[6*srcStride];
1381 const int src7 = src[7*srcStride];
1382 const int src8 = src[8*srcStride];
1383 const int src9 = src[9*srcStride];
1384 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1385 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1386 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1387 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1388 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1389 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1390 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1391 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 "mspel" 8x8 motion-compensation positions, built from the h/v lowpass
 * filters above. Local temp buffers (half/halfH/halfV/halfHV declarations)
 * and closing braces are elided from this listing (numbering gaps). */
/* mc10: average of source and horizontally filtered block (1/4-pel left). */
1397 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1399 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1400 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
/* mc20: pure horizontal half-pel. */
1403 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1404 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* mc30: average of src+1 and the filtered block (3/4-pel). */
1407 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1409 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1410 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
/* mc02: pure vertical half-pel. */
1413 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1414 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* mc12: H filter (11 rows starting one row above), V filter, then average of
 * V-filtered and HV-filtered blocks; halfH+8 skips the extra top row. */
1417 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1421 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1422 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1423 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1424 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* mc32: as mc12 but the V filter runs on src+1. */
1426 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1430 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1431 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1432 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1433 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* mc22: H filter then V filter of the result (center half-pel). */
1435 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1437 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1438 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking across a horizontal block edge: for each column x, filters
 * the two pixels above (p0, p1) and below (p2, p3) the edge. d1 is the
 * strength-dependent deadzone ramp applied to p1/p2; d2 is a smaller clipped
 * correction applied to p0/p3. The column loop header (line 1447) and the
 * lines applying +/-d1 (between 1457 and 1462) are elided from this listing. */
1441 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1442 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1444 const int strength= ff_h263_loop_filter_strength[qscale];
1448 int p0= src[x-2*stride];
1449 int p1= src[x-1*stride];
1450 int p2= src[x+0*stride];
1451 int p3= src[x+1*stride];
1452 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1454 if (d<-2*strength) d1= 0;
1455 else if(d<- strength) d1=-2*strength - d;
1456 else if(d< strength) d1= d;
1457 else if(d< 2*strength) d1= 2*strength - d;
/* Branch-free clip of p1/p2 to 0..255 after the (elided) +/-d1 update:
 * if bit 8 is set the value over/underflowed, so saturate via sign. */
1462 if(p1&256) p1= ~(p1>>31);
1463 if(p2&256) p2= ~(p2>>31);
1465 src[x-1*stride] = p1;
1466 src[x+0*stride] = p2;
/* ad1 (presumably |d1|/2 per the reference implementation — elided here)
 * bounds the secondary correction d2. */
1470 d2= av_clip((p0-p3)/4, -ad1, ad1);
1472 src[x-2*stride] = p0 - d2;
1473 src[x+ stride] = p3 + d2;
/* H.263 deblocking across a vertical block edge: mirror of
 * h263_v_loop_filter_c with the two pixels left (p0, p1) and right (p2, p3)
 * of the edge, per row y. The row loop header and the +/-d1 application
 * lines are elided from this listing (numbering gaps). */
1478 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1479 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1481 const int strength= ff_h263_loop_filter_strength[qscale];
1485 int p0= src[y*stride-2];
1486 int p1= src[y*stride-1];
1487 int p2= src[y*stride+0];
1488 int p3= src[y*stride+1];
1489 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1491 if (d<-2*strength) d1= 0;
1492 else if(d<- strength) d1=-2*strength - d;
1493 else if(d< strength) d1= d;
1494 else if(d< 2*strength) d1= 2*strength - d;
/* Branch-free 0..255 saturation of p1/p2 (see vertical variant). */
1499 if(p1&256) p1= ~(p1>>31);
1500 if(p2&256) p2= ~(p2>>31);
1502 src[y*stride-1] = p1;
1503 src[y*stride+0] = p2;
1507 d2= av_clip((p0-p3)/4, -ad1, ad1);
1509 src[y*stride-2] = p0 - d2;
1510 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block via a
 * temp[] buffer scaled by 4. First/last rows are passed through (4*src, and
 * the matching (temp+2)>>2 write-back); interior samples get the full
 * (1,2,1) in both directions with (+8)>>4 rounding. Loop headers, temp[]
 * declaration and yz bookkeeping are elided from this listing. */
1515 static void h261_loop_filter_c(uint8_t *src, int stride){
1520 temp[x ] = 4*src[x ];
1521 temp[x + 7*8] = 4*src[x + 7*stride];
1525 xy = y * stride + x;
1527 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1532 src[ y*stride] = (temp[ y*8] + 2)>>2;
1533 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1535 xy = y * stride + x;
1537 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* SAD of a 16-wide block: per-row sum of |pix1[i] - pix2[i]|, unrolled over
 * the 16 columns. The accumulator init, the h-row loop, the per-row pointer
 * advance and the return are elided from this listing (numbering gaps). */
1542 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1548 s += abs(pix1[0] - pix2[0]);
1549 s += abs(pix1[1] - pix2[1]);
1550 s += abs(pix1[2] - pix2[2]);
1551 s += abs(pix1[3] - pix2[3]);
1552 s += abs(pix1[4] - pix2[4]);
1553 s += abs(pix1[5] - pix2[5]);
1554 s += abs(pix1[6] - pix2[6]);
1555 s += abs(pix1[7] - pix2[7]);
1556 s += abs(pix1[8] - pix2[8]);
1557 s += abs(pix1[9] - pix2[9]);
1558 s += abs(pix1[10] - pix2[10]);
1559 s += abs(pix1[11] - pix2[11]);
1560 s += abs(pix1[12] - pix2[12]);
1561 s += abs(pix1[13] - pix2[13]);
1562 s += abs(pix1[14] - pix2[14]);
1563 s += abs(pix1[15] - pix2[15]);
/* SAD vs. the horizontal half-pel interpolation of pix2: reference sample is
 * avg2() of two horizontally adjacent pixels (reads up to pix2[16]). Loop
 * scaffolding and return are elided from this listing. */
1570 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1576 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1577 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1578 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1579 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1580 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1581 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1582 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1583 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1584 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1585 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1586 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1587 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1588 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1589 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1590 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1591 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD vs. the vertical half-pel interpolation of pix2: reference sample is
 * avg2() of the pixel and the one a row below (pix3 = pix2 + line_size).
 * Loop scaffolding and return are elided from this listing. */
1598 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1601 uint8_t *pix3 = pix2 + line_size;
1605 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1606 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1607 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1608 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1609 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1610 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1611 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1612 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1613 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1614 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1615 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1616 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1617 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1618 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1619 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1620 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD vs. the diagonal half-pel interpolation of pix2: reference sample is
 * avg4() of the 2x2 neighborhood (reads up to pix2[16]/pix3[16]). Loop
 * scaffolding and return are elided from this listing. */
1628 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1631 uint8_t *pix3 = pix2 + line_size;
1635 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1636 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1637 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1638 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1639 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1640 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1641 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1642 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1643 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1644 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1645 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1646 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1647 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1648 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1649 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1650 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD; see pix_abs16_c. Loop scaffolding and return are elided from
 * this listing. */
1658 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1664 s += abs(pix1[0] - pix2[0]);
1665 s += abs(pix1[1] - pix2[1]);
1666 s += abs(pix1[2] - pix2[2]);
1667 s += abs(pix1[3] - pix2[3]);
1668 s += abs(pix1[4] - pix2[4]);
1669 s += abs(pix1[5] - pix2[5]);
1670 s += abs(pix1[6] - pix2[6]);
1671 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD vs. horizontal half-pel reference (avg2 of adjacent columns,
 * reads up to pix2[8]); see pix_abs16_x2_c. Loop scaffolding elided. */
1678 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1684 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1685 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1686 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1687 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1688 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1689 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1690 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1691 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD vs. vertical half-pel reference (avg2 with the row below); see
 * pix_abs16_y2_c. Loop scaffolding elided from this listing. */
1698 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1701 uint8_t *pix3 = pix2 + line_size;
1705 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1706 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1707 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1708 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1709 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1710 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1711 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1712 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD vs. diagonal half-pel reference (avg4 of the 2x2 neighborhood,
 * reads up to pix2[8]/pix3[8]); see pix_abs16_xy2_c. Scaffolding elided. */
1720 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1723 uint8_t *pix3 = pix2 + line_size;
1727 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1728 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1729 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1730 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1731 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1732 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1733 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1734 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16 wide: score1 is the plain SSE; score2 is the
 * difference of the 2x2 gradient magnitudes of s1 and s2, so texture/noise
 * that matches in structure is penalized less. Combined as
 * SSE + |score2| * nsse_weight (weight 8 when no context is given).
 * Row loops and pointer advances are elided from this listing. */
1742 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1743 MpegEncContext *c = v;
1749 for(x=0; x<16; x++){
1750 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1753 for(x=0; x<15; x++){
1754 score2+= FFABS( s1[x ] - s1[x +stride]
1755 - s1[x+1] + s1[x+1+stride])
1756 -FFABS( s2[x ] - s2[x +stride]
1757 - s2[x+1] + s2[x+1+stride]);
1764 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1765 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c; the inner x loops (presumably bounded by 8 and
 * 7 — elided here) and the row loop are missing from this listing. */
1768 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1769 MpegEncContext *c = v;
1776 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1780 score2+= FFABS( s1[x ] - s1[x +stride]
1781 - s1[x+1] + s1[x+1+stride])
1782 -FFABS( s2[x ] - s2[x +stride]
1783 - s2[x+1] + s2[x+1+stride]);
1790 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1791 else return score1 + FFABS(score2)*8;
/* Trial cost of adding scale*basis to the residual rem: b is the candidate
 * new residual sample (with BASIS_SHIFT->RECON_SHIFT rounding), squared and
 * weighted by w = weight[i] (declaration elided), accumulated >>4. Used by
 * the trellis/basis search in the encoder. Return statement elided. */
1794 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1798 for(i=0; i<8*8; i++){
1799 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1802 assert(-512<b && b<512);
1804 sum += (w*b)*(w*b)>>4;
/* Commit scale*basis into the residual rem, with the same rounding shift as
 * try_8x8basis_c. Closing braces elided from this listing. */
1809 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1812 for(i=0; i<8*8; i++){
1813 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1818 * Permute an 8x8 block.
1819 * @param block the block which will be permuted according to the given permutation vector
1820 * @param permutation the permutation vector
1821 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1822 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1823 * (inverse) permutated to scantable order!
1825 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1831 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* First pass (body elided in this listing, presumably copying block[j] into a
 * temp[] and clearing block[j]) stashes the coefficients up to 'last'... */
1833 for(i=0; i<=last; i++){
1834 const int j= scantable[i];
/* ...second pass scatters them back through the permutation vector. */
1839 for(i=0; i<=last; i++){
1840 const int j= scantable[i];
1841 const int perm_j= permutation[j];
1842 block[perm_j]= temp[j];
/* zero_cmp: trivial comparator that (per its name) scores everything equal;
 * its body (presumably "return 0" — elided here) is missing from the listing. */
1846 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* ff_set_cmp: fill the 6-entry cmp[] function table from the DSPContext
 * according to the requested comparison 'type'. Only a few switch arms are
 * visible; the loop over i, the remaining FF_CMP_* cases and the switch
 * scaffolding are elided from this listing. */
1850 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1853 memset(cmp, 0, sizeof(void*)*6);
1861 cmp[i]= c->hadamard8_diff[i];
1867 cmp[i]= c->dct_sad[i];
1870 cmp[i]= c->dct264_sad[i];
1873 cmp[i]= c->dct_max[i];
1876 cmp[i]= c->quant_psnr[i];
1905 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* SWAR byte-wise add: processes sizeof(long) bytes at a time; pb_7f masks off
 * the top bit of every byte so per-byte carries cannot propagate, and the
 * pb_80 term restores each byte's own top bit via XOR. The scalar tail loop
 * header (line 1917) is elided from this listing. */
1910 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1912 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1913 long a = *(long*)(src+i);
1914 long b = *(long*)(dst+i);
1915 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1918 dst[i+0] += src[i+0];
/* SWAR byte-wise subtract dst = src1 - src2. On targets without fast
 * unaligned access, a byte-unrolled path handles a misaligned src2; otherwise
 * the word loop borrows within each byte: setting pb_80 in a guarantees no
 * cross-byte borrow, and the XOR term fixes each byte's top bit. Scalar tail
 * loop header is elided from this listing. */
1921 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
1923 #if !HAVE_FAST_UNALIGNED
1924 if((long)src2 & (sizeof(long)-1)){
1925 for(i=0; i+7<w; i+=8){
1926 dst[i+0] = src1[i+0]-src2[i+0];
1927 dst[i+1] = src1[i+1]-src2[i+1];
1928 dst[i+2] = src1[i+2]-src2[i+2];
1929 dst[i+3] = src1[i+3]-src2[i+3];
1930 dst[i+4] = src1[i+4]-src2[i+4];
1931 dst[i+5] = src1[i+5]-src2[i+5];
1932 dst[i+6] = src1[i+6]-src2[i+6];
1933 dst[i+7] = src1[i+7]-src2[i+7];
1937 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1938 long a = *(long*)(src1+i);
1939 long b = *(long*)(src2+i);
1940 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1943 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction, decode side: reconstruct each byte as
 * median(left, top, left+top-topleft) + residual, carrying l/lt across the
 * row (loop scaffolding and *left/*left_top updates elided in this listing). */
1946 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1954 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* Encode-side counterpart: emit the residual against the same median
 * predictor (the dst[i] = src2[i] - pred line is elided here). */
1963 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1971 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* HuffYUV left (previous-pixel) prediction: accumulates src into acc across
 * the row and returns the final accumulator. The loop body and return are
 * elided from this listing. */
1981 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1984 for(i=0; i<w-1; i++){
/* BGR32 variant tracking the four channels separately; the entire body is
 * elided from this listing. */
2011 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers. BUTTERFLYA computes |x+y| + |x-y| directly;
 * the bodies of BUTTERFLY2 (sum/difference into o1/o2) and BUTTERFLY1
 * (in-place butterfly) are elided from this listing, so no comments are
 * inserted inside their continuation lines. */
2041 #define BUTTERFLY2(o1,o2,i1,i2) \
2045 #define BUTTERFLY1(x,y) \
2054 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of the 8x8 residual src-dst: an 8x8 Hadamard transform (row passes
 * with BUTTERFLY2/BUTTERFLY1, then column passes), summing absolute
 * transformed values via BUTTERFLYA. The temp[64] declaration, the two i
 * loops, the sum accumulation and the return are elided from this listing. */
2056 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2064 //FIXME try pointer walks
2065 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2066 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2067 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2068 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2070 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2071 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2072 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2073 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2075 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2076 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2077 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2078 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column (vertical) passes over the row-transformed temp[]. */
2082 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2083 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2084 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2085 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2087 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2088 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2089 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2090 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* Final stage folded into the |.|+|.| accumulation. */
2093 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2094 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2095 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2096 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra variant of hadamard8_diff8x8_c: transforms src directly (no
 * reference subtraction; 'dummy' is unused) and removes the DC contribution
 * at the end so the score reflects AC energy only. Scaffolding (temp[],
 * loops, sum, return) elided as in the diff version. */
2101 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2109 //FIXME try pointer walks
2110 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2111 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2112 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2113 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2115 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2116 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2117 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2118 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2120 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2121 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2122 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2123 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column passes. */
2127 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2128 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2129 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2130 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2132 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2133 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2134 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2135 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2138 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2139 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2140 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2141 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2144 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: residual -> forward DCT (the s->dsp.fdct(temp) call at
 * line 2156 is elided from this listing) -> sum of absolute coefficients. */
2149 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2150 MpegEncContext * const s= (MpegEncContext *)c;
2151 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2155 s->dsp.diff_pixels(temp, src1, src2, stride);
2157 return s->dsp.sum_abs_dctelem(temp);
/* NOTE(review): this is the interior of a 1-D 8-point integer transform macro
 * (presumably DCT8_1D, per its use in dct264_sad8x8_c below); the #define
 * line itself and the DST(0,...)/DST(4,...) even outputs are elided from
 * this listing. Even part: sums s07..s34 -> a0..a3; odd part: differences
 * d07..d34 combined with x+(x>>1) scaling, outputs interleaved via a4..a7
 * with >>2 terms. */
2162 const int s07 = SRC(0) + SRC(7);\
2163 const int s16 = SRC(1) + SRC(6);\
2164 const int s25 = SRC(2) + SRC(5);\
2165 const int s34 = SRC(3) + SRC(4);\
2166 const int a0 = s07 + s34;\
2167 const int a1 = s16 + s25;\
2168 const int a2 = s07 - s34;\
2169 const int a3 = s16 - s25;\
2170 const int d07 = SRC(0) - SRC(7);\
2171 const int d16 = SRC(1) - SRC(6);\
2172 const int d25 = SRC(2) - SRC(5);\
2173 const int d34 = SRC(3) - SRC(4);\
2174 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2175 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2176 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2177 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2179 DST(1, a4 + (a7>>2)) ;\
2180 DST(2, a2 + (a3>>1)) ;\
2181 DST(3, a5 + (a6>>2)) ;\
2183 DST(5, a6 - (a5>>2)) ;\
2184 DST(6, (a2>>1) - a3 ) ;\
2185 DST(7, (a4>>2) - a7 ) ;\
/* H.264-style 8x8 transform SAD: residual into dct[][] (declaration elided),
 * then the 1-D transform macro applied per row (SRC/DST redefined to index
 * rows) and per column (DST redefined to accumulate FFABS into sum). The
 * #undef lines, sum init and return are elided from this listing. */
2188 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2189 MpegEncContext * const s= (MpegEncContext *)c;
2194 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2196 #define SRC(x) dct[i][x]
2197 #define DST(x,v) dct[i][x]= v
2198 for( i = 0; i < 8; i++ )
2203 #define SRC(x) dct[x][i]
2204 #define DST(x,v) sum += FFABS(v)
2205 for( i = 0; i < 8; i++ )
/* Maximum absolute DCT coefficient of the residual: diff -> fdct (the fdct
 * call and the i loop over 64 coefficients are elided) -> FFMAX reduce. */
2213 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2214 MpegEncContext * const s= (MpegEncContext *)c;
2215 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2220 s->dsp.diff_pixels(temp, src1, src2, stride);
2224 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-error metric: residual -> fdct (elided at line 2239) -> save
 * coefficients in bak -> quantize + dequantize temp -> sum of squared
 * differences temp vs bak. The ff_simple_idct_8 call carries an upstream
 * FIXME, so temp is in a different domain than bak at comparison time —
 * flagged, not fixed, since this is a doc-only pass. */
2229 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2230 MpegEncContext * const s= (MpegEncContext *)c;
2231 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2232 DCTELEM * const bak = temp+64;
2238 s->dsp.diff_pixels(temp, src1, src2, stride);
2240 memcpy(bak, temp, 64*sizeof(DCTELEM));
2242 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2243 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2244 ff_simple_idct_8(temp); //FIXME
2247 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for an 8x8 block: copy both blocks to aligned local
 * buffers, take the residual, fdct+quantize (fdct call elided), count the
 * exact VLC bit cost from the run/level length tables, then dequantize,
 * idct_add the reconstruction onto lsrc2 and measure SSE against lsrc1.
 * Final score = distortion + lambda-like bit cost scaled by qscale^2*109/128.
 * Loop/branch scaffolding (run/level updates, intra/inter branch heads,
 * escape handling) is elided throughout this listing. */
2252 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2253 MpegEncContext * const s= (MpegEncContext *)c;
2254 const uint8_t *scantable= s->intra_scantable.permutated;
2255 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2256 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2257 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2258 int i, last, run, bits, level, distortion, start_i;
2259 const int esc_length= s->ac_esc_length;
2261 uint8_t * last_length;
2265 copy_block8(lsrc1, src1, 8, stride, 8);
2266 copy_block8(lsrc2, src2, 8, stride, 8);
2268 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2270 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* Intra path: DC coded separately via luma_dc_vlc_length. */
2276 length = s->intra_ac_vlc_length;
2277 last_length= s->intra_ac_vlc_last_length;
2278 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* Inter path. */
2281 length = s->inter_ac_vlc_length;
2282 last_length= s->inter_ac_vlc_last_length;
/* Bit counting: levels are biased by +64 (elided for this loop); |level|>127
 * falls back to the fixed escape length (elided branches). */
2287 for(i=start_i; i<last; i++){
2288 int j= scantable[i];
2293 if((level&(~127)) == 0){
2294 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* Last coefficient uses the "last" VLC table. */
2303 level= temp[i] + 64;
2307 if((level&(~127)) == 0){
2308 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Reconstruct and measure distortion. */
2316 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2318 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2321 s->dsp.idct_add(lsrc2, 8, temp);
2323 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2325 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/**
 * bit8x8_c(): bit-count metric for one 8x8 block.
 * Same VLC bit estimation as rd8x8_c() but without the reconstruction and
 * distortion term -- returns only the estimated number of bits.
 * NOTE(review): bits/run initialization, the intra/inter branch and the
 * escape/return lines are elided in this copy -- verify against the full
 * file.
 */
2328 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2329 MpegEncContext * const s= (MpegEncContext *)c;
2330 const uint8_t *scantable= s->intra_scantable.permutated;
2331 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2332 int i, last, run, bits, level, start_i;
2333 const int esc_length= s->ac_esc_length;
2335 uint8_t * last_length;
2339 s->dsp.diff_pixels(temp, src1, src2, stride);
2341 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2347 length = s->intra_ac_vlc_length;
2348 last_length= s->intra_ac_vlc_last_length;
2349 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2352 length = s->inter_ac_vlc_length;
2353 last_length= s->inter_ac_vlc_last_length;
2358 for(i=start_i; i<last; i++){ /* all but the final coefficient */
2359 int j= scantable[i];
2364 if((level&(~127)) == 0){ /* level fits the VLC table (below 128) */
2365 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2374 level= temp[i] + 64; /* bias level for the last-coefficient table lookup */
2378 if((level&(~127)) == 0){
2379 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* VSAD_INTRA(size): generates vsad_intra<size>_c(), the sum of absolute
 * differences between vertically adjacent pixels of a single block -- a
 * measure of vertical high-frequency energy (no reference block; the second
 * pointer argument is unused).  NOTE(review): score declaration, loop
 * closings and the return are on lines elided in this copy. */
2387 #define VSAD_INTRA(size) \
2388 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2392 for(y=1; y<h; y++){ \
2393 for(x=0; x<size; x+=4){ \
2394 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2395 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* vsad16_c(): inter variant of the vertical-SAD metric -- sums the absolute
 * vertical second differences of the s1-s2 residual over a 16-wide block.
 * NOTE(review): score declaration, outer loop and return are elided here. */
2405 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2410 for(x=0; x<16; x++){
2411 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ(a): square helper.  VSSE_INTRA(size): same structure as VSAD_INTRA but
 * accumulates *squared* vertical differences (sum of squared errors).
 * NOTE(review): score declaration, loop closings and the return are on lines
 * elided in this copy. */
2420 #define SQ(a) ((a)*(a))
2421 #define VSSE_INTRA(size) \
2422 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2426 for(y=1; y<h; y++){ \
2427 for(x=0; x<size; x+=4){ \
2428 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2429 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* vsse16_c(): inter variant -- sums squared vertical second differences of
 * the s1-s2 residual over a 16-wide block.  NOTE(review): declarations,
 * outer loop and return are elided in this copy. */
2439 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2444 for(x=0; x<16; x++){
2445 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* ssd_int8_vs_int16_c(): sum of squared differences between an int8 array
 * and an int16 array of the same length.  NOTE(review): parameter list
 * continuation, score declaration and return are elided in this copy. */
2454 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2458 for(i=0; i<size; i++)
2459 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Instantiate the 16x16 comparison functions from their 8x8 kernels: each
 * *16_c score is built from the four 8x8 sub-block scores (see the
 * WRAPPER8_16_SQ macro, defined earlier in this file). */
2463 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2464 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2465 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2467 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2469 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2470 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2471 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2472 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* vector_fmul_reverse_c(): dst[i] = src0[i] * src1 read backwards.
 * NOTE(review): the line advancing src1 to its last element (making the
 * negative index valid) is elided in this copy -- verify. */
2474 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
2477 for(i=0; i<len; i++)
2478 dst[i] = src0[i] * src1[-i];
/* vector_fmul_add_c(): element-wise fused multiply-add,
 * dst[i] = src0[i]*src1[i] + src2[i]. */
2481 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
2483 for(i=0; i<len; i++)
2484 dst[i] = src0[i] * src1[i] + src2[i];
/* vector_fmul_window_c(): overlap-add windowing -- multiplies two sources by
 * a symmetric window and writes the mirrored pair dst[i]/dst[j] per the
 * s0*wj -/+ s1*wi butterfly.  NOTE(review): the pointer-recentering lines
 * and the loads of s0/s1/wi/wj are elided in this copy -- verify operand
 * origins against the full file. */
2487 static void vector_fmul_window_c(float *dst, const float *src0,
2488 const float *src1, const float *win, int len)
2494 for(i=-len, j=len-1; i<0; i++, j--) { /* walk inward from both ends */
2499 dst[i] = s0*wj - s1*wi;
2500 dst[j] = s0*wi + s1*wj;
/* vector_fmul_scalar_c(): dst[i] = src[i] * mul for len elements. */
2504 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
2508 for (i = 0; i < len; i++)
2509 dst[i] = src[i] * mul;
/* vector_fmac_scalar_c(): dst[i] += src[i] * mul (multiply-accumulate). */
2512 static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
2516 for (i = 0; i < len; i++)
2517 dst[i] += src[i] * mul;
/* butterflies_float_c(): in-place butterfly on two non-aliasing vectors;
 * computes t = v1[i]-v2[i].  NOTE(review): the lines writing the sum to v1
 * and t to v2 are elided in this copy -- verify. */
2520 static void butterflies_float_c(float *av_restrict v1, float *av_restrict v2,
2524 for (i = 0; i < len; i++) {
2525 float t = v1[i] - v2[i];
/* butterflies_float_interleave_c(): butterfly of src0/src1 with the
 * sum/difference pair written interleaved into dst.  NOTE(review): the loads
 * of f1/f2 from src0/src1 are on elided lines -- verify. */
2531 static void butterflies_float_interleave_c(float *dst, const float *src0,
2532 const float *src1, int len)
2535 for (i = 0; i < len; i++) {
2538 dst[2*i ] = f1 + f2;
2539 dst[2*i + 1] = f1 - f2;
/* scalarproduct_float_c(): dot product of two float vectors.
 * NOTE(review): accumulator declaration, loop body and return are elided in
 * this copy. */
2543 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2548 for (i = 0; i < len; i++)
/* clipf_c_one(): clip one float, operating on its raw IEEE-754 bit pattern
 * (uint32_t).  Used only from vector_clipf_c_opposite_sign(), i.e. when
 * min < 0 < max, where unsigned comparison of the bit patterns orders
 * positive floats directly and negative ones after sign-bit flip.
 * NOTE(review): the final "return a" (value already in range) is on an
 * elided line. */
2554 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2555 uint32_t maxi, uint32_t maxisign)
2558 if(a > mini) return mini; /* positive overflow -> clamp to max pattern */
2559 else if((a^(1U<<31)) > maxisign) return maxi; /* negative overflow */
/* vector_clipf_c_opposite_sign(): fast float clipping for the min<0<max
 * case, working on raw bit patterns via clipf_c_one().  Unrolled 8x; the
 * loop structure implies len is expected to be a multiple of 8 -- verify
 * with callers.  The uint32_t* casts type-pun float storage; NOTE(review):
 * this relies on compiler-tolerated aliasing, as elsewhere in this file. */
2563 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2565 uint32_t mini = *(uint32_t*)min;
2566 uint32_t maxi = *(uint32_t*)max;
2567 uint32_t maxisign = maxi ^ (1U<<31); /* max pattern with sign flipped, for the negative test */
2568 uint32_t *dsti = (uint32_t*)dst;
2569 const uint32_t *srci = (const uint32_t*)src;
2570 for(i=0; i<len; i+=8) {
2571 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2572 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2573 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2574 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2575 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2576 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2577 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2578 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* vector_clipf_c(): clip each float in src to [min, max].  Dispatches to the
 * bit-pattern fast path when min and max straddle zero; otherwise uses plain
 * av_clipf, unrolled 8x (len expected to be a multiple of 8 -- verify with
 * callers).  NOTE(review): the else introducing the av_clipf branch is on an
 * elided line. */
2581 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2583 if(min < 0 && max > 0) {
2584 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2586 for(i=0; i < len; i+=8) {
2587 dst[i ] = av_clipf(src[i ], min, max);
2588 dst[i + 1] = av_clipf(src[i + 1], min, max);
2589 dst[i + 2] = av_clipf(src[i + 2], min, max);
2590 dst[i + 3] = av_clipf(src[i + 3], min, max);
2591 dst[i + 4] = av_clipf(src[i + 4], min, max);
2592 dst[i + 5] = av_clipf(src[i + 5], min, max);
2593 dst[i + 6] = av_clipf(src[i + 6], min, max);
2594 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* scalarproduct_int16_c(): dot product of two int16 vectors, accumulated in
 * int32.  NOTE(review): accumulator declaration, loop header and return are
 * elided in this copy. */
2599 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2604 res += *v1++ * *v2++;
/* scalarproduct_and_madd_int16_c(): combined pass -- accumulates the v1.v2
 * dot product while also updating v1 in place with v1[i] += mul*v3[i].
 * NOTE(review): the dot-product accumulation line, loop header and return
 * are elided in this copy. */
2609 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2614 *v1++ += mul * *v3++;
/* apply_window_int16_c(): applies a symmetric int16 window in Q15 to the
 * input -- only the first half of the window is stored; element i also
 * windows the mirrored element len-i-1.  Rounds with +(1<<14) before the
 * >>15 descale. */
2619 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2620 const int16_t *window, unsigned int len)
2623 int len2 = len >> 1;
2625 for (i = 0; i < len2; i++) {
2626 int16_t w = window[i];
2627 output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2628 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/* vector_clip_int32_c(): clip each int32 in src to [min, max], unrolled to
 * process 8 elements per iteration.  NOTE(review): the surrounding loop
 * construct (and how non-multiple-of-8 lengths are handled) is on elided
 * lines -- verify against the full file. */
2632 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2633 int32_t max, unsigned int len)
2636 *dst++ = av_clip(*src++, min, max);
2637 *dst++ = av_clip(*src++, min, max);
2638 *dst++ = av_clip(*src++, min, max);
2639 *dst++ = av_clip(*src++, min, max);
2640 *dst++ = av_clip(*src++, min, max);
2641 *dst++ = av_clip(*src++, min, max);
2642 *dst++ = av_clip(*src++, min, max);
2643 *dst++ = av_clip(*src++, min, max);
/* Fixed-point cosine constants for the WMV2 8-point IDCT:
 * Wk = round(2048*sqrt(2)*cos(k*pi/16)) = 11-bit scaled DCT-II basis.
 * NOTE(review): W0 is used by wmv2_idct_row/col below but its #define is not
 * visible in this copy -- presumably on an elided line; verify. */
2649 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2650 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2651 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2652 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2653 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2654 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2655 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/**
 * wmv2_idct_row(): one horizontal pass of the WMV2 8-point inverse DCT,
 * in place on b[0..7].  Odd/even butterflies with the W1..W7 constants,
 * rounded with +(1<<7) and descaled by 8 bits.
 * NOTE(review): declarations of s1/s2 and the function braces are on lines
 * elided in this copy.
 */
2657 static void wmv2_idct_row(short * b)
2660 int a0,a1,a2,a3,a4,a5,a6,a7;
2662 a1 = W1*b[1]+W7*b[7]; /* odd part */
2663 a7 = W7*b[1]-W1*b[7];
2664 a5 = W5*b[5]+W3*b[3];
2665 a3 = W3*b[5]-W5*b[3];
2666 a2 = W2*b[2]+W6*b[6]; /* even part */
2667 a6 = W6*b[2]-W2*b[6];
2668 a0 = W0*b[0]+W0*b[4];
2669 a4 = W0*b[0]-W0*b[4];
2671 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2672 s2 = (181*(a1-a5-a7+a3)+128)>>8; /* 181/256 ~= 1/sqrt(2) */
2674 b[0] = (a0+a2+a1+a5 + (1<<7))>>8; /* final butterflies, round and descale */
2675 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2676 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2677 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2678 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2679 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2680 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2681 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/**
 * wmv2_idct_col(): one vertical pass of the WMV2 IDCT, in place on a column
 * b[0], b[8], ... b[56].  Mirrors wmv2_idct_row() but keeps 3 extra bits of
 * intermediate precision (+4 >> 3) and descales by 14 bits at the end.
 * NOTE(review): declarations of s1/s2 and the function braces are on lines
 * elided in this copy.
 */
2683 static void wmv2_idct_col(short * b)
2686 int a0,a1,a2,a3,a4,a5,a6,a7;
2687 /*step 1, with extended precision*/
2688 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3; /* odd part */
2689 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2690 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2691 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2692 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3; /* even part */
2693 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2694 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2695 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
2697 s1 = (181*(a1-a5+a7-a3)+128)>>8; /* 181/256 ~= 1/sqrt(2) */
2698 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2700 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14; /* final butterflies, round and descale */
2701 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2702 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2703 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2705 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2706 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2707 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2708 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* ff_wmv2_idct_c(): full 8x8 WMV2 IDCT -- row pass over all rows, then
 * column pass.  NOTE(review): the loop headers iterating i over rows (step
 * 8) and columns (step 1) are on elided lines -- verify. */
2710 void ff_wmv2_idct_c(short * block){
2714 wmv2_idct_row(block+i);
2717 wmv2_idct_col(block+i);
2720 /* XXX: those functions should be suppressed ASAP when all IDCTs are converted to the same format. */
/* IDCT then store: run the WMV2 IDCT in place and write clamped pixels. */
2722 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2724 ff_wmv2_idct_c(block);
2725 ff_put_pixels_clamped_c(block, dest, line_size);
/* IDCT then add: run the WMV2 IDCT in place and add clamped residual. */
2727 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2729 ff_wmv2_idct_c(block);
2730 ff_add_pixels_clamped_c(block, dest, line_size);
/* jrevdct (IJG integer reference) IDCT then clamped store. */
2732 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2734 ff_j_rev_dct (block);
2735 ff_put_pixels_clamped_c(block, dest, line_size);
/* jrevdct IDCT then clamped add of the residual. */
2737 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2739 ff_j_rev_dct (block);
2740 ff_add_pixels_clamped_c(block, dest, line_size);
/* lowres=1: 4x4 jrevdct IDCT then clamped store. */
2743 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2745 ff_j_rev_dct4 (block);
2746 put_pixels_clamped4_c(block, dest, line_size);
/* lowres=1: 4x4 jrevdct IDCT then clamped add. */
2748 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2750 ff_j_rev_dct4 (block);
2751 add_pixels_clamped4_c(block, dest, line_size);
/* lowres=2: 2x2 jrevdct IDCT then clamped store. */
2754 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2756 ff_j_rev_dct2 (block);
2757 put_pixels_clamped2_c(block, dest, line_size);
/* lowres=2: 2x2 jrevdct IDCT then clamped add. */
2759 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2761 ff_j_rev_dct2 (block);
2762 add_pixels_clamped2_c(block, dest, line_size);
/* lowres=3: 1x1 "IDCT" -- the DC coefficient alone, rounded (+4>>3) and
 * clamped to 8 bits. */
2765 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2767 dest[0] = av_clip_uint8((block[0] + 4)>>3);
/* lowres=3: add the rounded DC coefficient to the single output pixel. */
2769 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2771 dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2774 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2776 /* init static data */
/**
 * ff_dsputil_static_init(): fill the process-global lookup tables --
 * ff_cropTbl (clamp-to-[0,255] with MAX_NEG_CROP guard bands on both sides),
 * ff_squareTbl ((i-256)^2 for signed differences), and the inverse zigzag
 * permutation.  NOTE(review): the function header line, the low-side
 * ff_cropTbl fill and closing braces are on lines elided in this copy.
 */
2777 av_cold void ff_dsputil_static_init(void)
2781 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i; /* identity in the valid range */
2782 for(i=0;i<MAX_NEG_CROP;i++) {
2784 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255; /* saturate the high guard band */
2787 for(i=0;i<512;i++) {
2788 ff_squareTbl[i] = (i - 256) * (i - 256);
2791 for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1; /* 1-based inverse zigzag */
/**
 * ff_check_alignment(): verify that the compiler 16-byte aligns stack
 * variables (required by SIMD code); logs a loud one-time warning on MMX/
 * AltiVec builds if not.  NOTE(review): the did_fail guard logic, return
 * statements and closing braces are on lines elided in this copy.
 */
2794 int ff_check_alignment(void){
2795 static int did_fail=0; /* warn only once per process */
2796 LOCAL_ALIGNED_16(int, aligned, [4]);
2798 if((intptr_t)aligned & 15){ /* low 4 bits set => not 16-byte aligned */
2800 #if HAVE_MMX || HAVE_ALTIVEC
2801 av_log(NULL, AV_LOG_ERROR,
2802 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2803 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2804 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2805 "Do not report crashes to FFmpeg developers.\n");
/**
 * ff_dsputil_init(): populate a DSPContext with the C reference
 * implementations of every DSP primitive, selected by codec parameters
 * (bit depth, dct_algo, idct_algo, lowres), then let architecture-specific
 * initializers override entries with SIMD versions.
 * NOTE(review): this copy of the file has many lines elided (preprocessor
 * guards, else branches, closing braces, case labels) -- the structure below
 * is indicative, not complete; verify against the full file.
 */
2814 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2818 ff_check_alignment();
/* --- forward DCT selection (encoder side), by bit depth then dct_algo --- */
2821 if (avctx->bits_per_raw_sample == 10) {
2822 c->fdct = ff_jpeg_fdct_islow_10;
2823 c->fdct248 = ff_fdct248_islow_10;
2825 if(avctx->dct_algo==FF_DCT_FASTINT) {
2826 c->fdct = ff_fdct_ifast;
2827 c->fdct248 = ff_fdct_ifast248;
2829 else if(avctx->dct_algo==FF_DCT_FAAN) {
2830 c->fdct = ff_faandct;
2831 c->fdct248 = ff_faandct248;
2834 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2835 c->fdct248 = ff_fdct248_islow_8;
2838 #endif //CONFIG_ENCODERS
/* --- IDCT selection: lowres overrides, then bit depth, then idct_algo --- */
2840 if(avctx->lowres==1){
2841 c->idct_put= ff_jref_idct4_put;
2842 c->idct_add= ff_jref_idct4_add;
2843 c->idct = ff_j_rev_dct4;
2844 c->idct_permutation_type= FF_NO_IDCT_PERM;
2845 }else if(avctx->lowres==2){
2846 c->idct_put= ff_jref_idct2_put;
2847 c->idct_add= ff_jref_idct2_add;
2848 c->idct = ff_j_rev_dct2;
2849 c->idct_permutation_type= FF_NO_IDCT_PERM;
2850 }else if(avctx->lowres==3){
2851 c->idct_put= ff_jref_idct1_put;
2852 c->idct_add= ff_jref_idct1_add;
2853 c->idct = ff_j_rev_dct1;
2854 c->idct_permutation_type= FF_NO_IDCT_PERM;
2856 if (avctx->bits_per_raw_sample == 10) {
2857 c->idct_put = ff_simple_idct_put_10;
2858 c->idct_add = ff_simple_idct_add_10;
2859 c->idct = ff_simple_idct_10;
2860 c->idct_permutation_type = FF_NO_IDCT_PERM;
2862 if(avctx->idct_algo==FF_IDCT_INT){
2863 c->idct_put= ff_jref_idct_put;
2864 c->idct_add= ff_jref_idct_add;
2865 c->idct = ff_j_rev_dct;
2866 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2867 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2868 avctx->idct_algo==FF_IDCT_VP3){
2869 c->idct_put= ff_vp3_idct_put_c;
2870 c->idct_add= ff_vp3_idct_add_c;
2871 c->idct = ff_vp3_idct_c;
2872 c->idct_permutation_type= FF_NO_IDCT_PERM;
2873 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2874 c->idct_put= ff_wmv2_idct_put_c;
2875 c->idct_add= ff_wmv2_idct_add_c;
2876 c->idct = ff_wmv2_idct_c;
2877 c->idct_permutation_type= FF_NO_IDCT_PERM;
2878 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2879 c->idct_put= ff_faanidct_put;
2880 c->idct_add= ff_faanidct_add;
2881 c->idct = ff_faanidct;
2882 c->idct_permutation_type= FF_NO_IDCT_PERM;
2883 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2884 c->idct_put= ff_ea_idct_put_c;
2885 c->idct_permutation_type= FF_NO_IDCT_PERM;
2886 }else{ //accurate/default
2887 c->idct_put = ff_simple_idct_put_8;
2888 c->idct_add = ff_simple_idct_add_8;
2889 c->idct = ff_simple_idct_8;
2890 c->idct_permutation_type= FF_NO_IDCT_PERM;
/* --- basic pixel helpers --- */
2895 c->diff_pixels = diff_pixels_c;
2896 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2897 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2898 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2899 c->sum_abs_dctelem = sum_abs_dctelem_c;
2902 c->pix_sum = pix_sum_c;
2903 c->pix_norm1 = pix_norm1_c;
2905 c->fill_block_tab[0] = fill_block16_c;
2906 c->fill_block_tab[1] = fill_block8_c;
/* --- SAD with half-pel variants: [0]=16x16, [1]=8x8; [.][1..3]=x2/y2/xy2 --- */
2908 /* TODO [0] 16 [1] 8 */
2909 c->pix_abs[0][0] = pix_abs16_c;
2910 c->pix_abs[0][1] = pix_abs16_x2_c;
2911 c->pix_abs[0][2] = pix_abs16_y2_c;
2912 c->pix_abs[0][3] = pix_abs16_xy2_c;
2913 c->pix_abs[1][0] = pix_abs8_c;
2914 c->pix_abs[1][1] = pix_abs8_x2_c;
2915 c->pix_abs[1][2] = pix_abs8_y2_c;
2916 c->pix_abs[1][3] = pix_abs8_xy2_c;
/* --- third-pel motion compensation (SVQ3); index encodes x + 4*y phase --- */
2918 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2919 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2920 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2921 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2922 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2923 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2924 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2925 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2926 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2928 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2929 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2930 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2931 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2932 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2933 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2934 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2935 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2936 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* --- quarter-pel MC: fill one 16-entry table (index = x + 4*y phase) --- */
2938 #define dspfunc(PFX, IDX, NUM) \
2939 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2940 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2941 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2942 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2943 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2944 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2945 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2946 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2947 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2948 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2949 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2950 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2951 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2952 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2953 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2954 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2956 dspfunc(put_qpel, 0, 16);
2957 dspfunc(put_no_rnd_qpel, 0, 16);
2959 dspfunc(avg_qpel, 0, 16);
2960 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2962 dspfunc(put_qpel, 1, 8);
2963 dspfunc(put_no_rnd_qpel, 1, 8);
2965 dspfunc(avg_qpel, 1, 8);
2966 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* --- codec-specific sub-initializers (compiled in conditionally) --- */
2970 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2971 ff_mlp_init(c, avctx);
2973 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2974 ff_intrax8dsp_init(c,avctx);
/* --- WMV2 mspel (half-pel with special filter) MC table --- */
2977 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2978 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2979 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2980 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2981 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2982 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2983 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2984 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* --- comparison-function tables ([0]=16x16, [1]=8x8) for ME/RD --- */
2986 #define SET_CMP_FUNC(name) \
2987 c->name[0]= name ## 16_c;\
2988 c->name[1]= name ## 8x8_c;
2990 SET_CMP_FUNC(hadamard8_diff)
2991 c->hadamard8_diff[4]= hadamard8_intra16_c;
2992 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2993 SET_CMP_FUNC(dct_sad)
2994 SET_CMP_FUNC(dct_max)
2996 SET_CMP_FUNC(dct264_sad)
2998 c->sad[0]= pix_abs16_c;
2999 c->sad[1]= pix_abs8_c;
3003 SET_CMP_FUNC(quant_psnr)
3006 c->vsad[0]= vsad16_c;
3007 c->vsad[4]= vsad_intra16_c;
3008 c->vsad[5]= vsad_intra8_c;
3009 c->vsse[0]= vsse16_c;
3010 c->vsse[4]= vsse_intra16_c;
3011 c->vsse[5]= vsse_intra8_c;
3012 c->nsse[0]= nsse16_c;
3013 c->nsse[1]= nsse8_c;
3015 ff_dsputil_init_dwt(c);
3018 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* --- lossless/huffyuv helpers and byte-swapping --- */
3020 c->add_bytes= add_bytes_c;
3021 c->diff_bytes= diff_bytes_c;
3022 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3023 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3024 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
3025 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3026 c->bswap_buf= bswap_buf;
3027 c->bswap16_buf = bswap16_buf;
/* --- loop filters --- */
3029 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3030 c->h263_h_loop_filter= h263_h_loop_filter_c;
3031 c->h263_v_loop_filter= h263_v_loop_filter_c;
3034 if (CONFIG_VP3_DECODER) {
3035 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3036 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3037 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3040 c->h261_loop_filter= h261_loop_filter_c;
3042 c->try_8x8basis= try_8x8basis_c;
3043 c->add_8x8basis= add_8x8basis_c;
3045 #if CONFIG_VORBIS_DECODER
3046 c->vorbis_inverse_coupling = ff_vorbis_inverse_coupling;
3048 #if CONFIG_AC3_DECODER
3049 c->ac3_downmix = ff_ac3_downmix_c;
/* --- float / integer vector primitives (defined above in this file) --- */
3051 c->vector_fmul_reverse = vector_fmul_reverse_c;
3052 c->vector_fmul_add = vector_fmul_add_c;
3053 c->vector_fmul_window = vector_fmul_window_c;
3054 c->vector_clipf = vector_clipf_c;
3055 c->scalarproduct_int16 = scalarproduct_int16_c;
3056 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3057 c->apply_window_int16 = apply_window_int16_c;
3058 c->vector_clip_int32 = vector_clip_int32_c;
3059 c->scalarproduct_float = scalarproduct_float_c;
3060 c->butterflies_float = butterflies_float_c;
3061 c->butterflies_float_interleave = butterflies_float_interleave_c;
3062 c->vector_fmul_scalar = vector_fmul_scalar_c;
3063 c->vector_fmac_scalar = vector_fmac_scalar_c;
/* --- plane shrinking: shrink[n] halves each dimension n times --- */
3065 c->shrink[0]= av_image_copy_plane;
3066 c->shrink[1]= ff_shrink22;
3067 c->shrink[2]= ff_shrink44;
3068 c->shrink[3]= ff_shrink88;
3070 c->prefetch= just_return; /* no-op default; arch init may override */
/* zero the 2tap tables so unfilled entries can fall back to h264 qpel below */
3072 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3073 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
/* --- bit-depth-templated function selection (from dsputil_template.c) --- */
3077 #define FUNC(f, depth) f ## _ ## depth
3078 #define FUNCC(f, depth) f ## _ ## depth ## _c
3080 #define dspfunc1(PFX, IDX, NUM, depth)\
3081 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
3082 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3083 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3084 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3086 #define dspfunc2(PFX, IDX, NUM, depth)\
3087 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3088 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3089 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3090 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3091 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3092 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3093 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3094 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3095 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3096 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3097 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3098 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3099 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3100 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3101 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3102 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3105 #define BIT_DEPTH_FUNCS(depth, dct)\
3106 c->get_pixels = FUNCC(get_pixels ## dct , depth);\
3107 c->draw_edges = FUNCC(draw_edges , depth);\
3108 c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
3109 c->clear_block = FUNCC(clear_block ## dct , depth);\
3110 c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\
3111 c->add_pixels8 = FUNCC(add_pixels8 ## dct , depth);\
3112 c->add_pixels4 = FUNCC(add_pixels4 ## dct , depth);\
3113 c->put_no_rnd_pixels_l2[0] = FUNCC(put_no_rnd_pixels16_l2, depth);\
3114 c->put_no_rnd_pixels_l2[1] = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3116 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3117 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3118 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3119 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3120 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3121 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3123 dspfunc1(put , 0, 16, depth);\
3124 dspfunc1(put , 1, 8, depth);\
3125 dspfunc1(put , 2, 4, depth);\
3126 dspfunc1(put , 3, 2, depth);\
3127 dspfunc1(put_no_rnd, 0, 16, depth);\
3128 dspfunc1(put_no_rnd, 1, 8, depth);\
3129 dspfunc1(avg , 0, 16, depth);\
3130 dspfunc1(avg , 1, 8, depth);\
3131 dspfunc1(avg , 2, 4, depth);\
3132 dspfunc1(avg , 3, 2, depth);\
3133 dspfunc1(avg_no_rnd, 0, 16, depth);\
3134 dspfunc1(avg_no_rnd, 1, 8, depth);\
3136 dspfunc2(put_h264_qpel, 0, 16, depth);\
3137 dspfunc2(put_h264_qpel, 1, 8, depth);\
3138 dspfunc2(put_h264_qpel, 2, 4, depth);\
3139 dspfunc2(put_h264_qpel, 3, 2, depth);\
3140 dspfunc2(avg_h264_qpel, 0, 16, depth);\
3141 dspfunc2(avg_h264_qpel, 1, 8, depth);\
3142 dspfunc2(avg_h264_qpel, 2, 4, depth);
/* dispatch on sample bit depth; _32/_16 selects the DCT coefficient width.
 * NOTE(review): the case labels and closing brace of this switch are on
 * elided lines. */
3144 switch (avctx->bits_per_raw_sample) {
3146 if (c->dct_bits == 32) {
3147 BIT_DEPTH_FUNCS(9, _32);
3149 BIT_DEPTH_FUNCS(9, _16);
3153 if (c->dct_bits == 32) {
3154 BIT_DEPTH_FUNCS(10, _32);
3156 BIT_DEPTH_FUNCS(10, _16);
3160 BIT_DEPTH_FUNCS(8, _16);
/* --- architecture-specific overrides (replace C versions with SIMD) --- */
3165 if (HAVE_MMX) ff_dsputil_init_mmx (c, avctx);
3166 if (ARCH_ARM) ff_dsputil_init_arm (c, avctx);
3167 if (HAVE_VIS) ff_dsputil_init_vis (c, avctx);
3168 if (ARCH_ALPHA) ff_dsputil_init_alpha (c, avctx);
3169 if (ARCH_PPC) ff_dsputil_init_ppc (c, avctx);
3170 if (HAVE_MMI) ff_dsputil_init_mmi (c, avctx);
3171 if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
3172 if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
/* fall back to h264 qpel for 2tap entries no arch init filled in */
3174 for (i = 0; i < 4; i++) {
3175 for (j = 0; j < 16; j++) {
3176 if(!c->put_2tap_qpel_pixels_tab[i][j])
3177 c->put_2tap_qpel_pixels_tab[i][j] =
3178 c->put_h264_qpel_pixels_tab[i][j];
3179 if(!c->avg_2tap_qpel_pixels_tab[i][j])
3180 c->avg_2tap_qpel_pixels_tab[i][j] =
3181 c->avg_h264_qpel_pixels_tab[i][j];
/* build the scan-order permutation matching the chosen IDCT */
3185 ff_init_scantable_permutation(c->idct_permutation,
3186 c->idct_permutation_type);
/* dsputil_init(): deprecated public alias that forwards to ff_dsputil_init().
 * NOTE(review): the function braces are on elided lines in this copy. */
3189 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3191 ff_dsputil_init(c, avctx);