3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Clipping table: ff_cropTbl + MAX_NEG_CROP maps any int in
 * [-MAX_NEG_CROP, 255 + MAX_NEG_CROP) to a uint8_t clamped to 0..255.
 * Zero-initialized here; presumably filled in at init time — TODO confirm. */
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square table: (ff_squareTbl + 256)[d] == d*d for d in -256..255,
 * used by the SSE/norm routines below. */
44 uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_template.c"
51 #include "dsputil_template.c"
55 #include "dsputil_template.c"
57 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 yields 0x0101...01 for the native word size; multiplying by the
 * byte value replicates it into every byte lane (SWAR constants). */
58 #define pb_7f (~0UL/255 * 0x7f)
59 #define pb_80 (~0UL/255 * 0x80)
/* Standard JPEG/MPEG zigzag scan order for an 8x8 coefficient block
 * (raster index of the n-th coefficient in scan order). */
61 const uint8_t ff_zigzag_direct[64] = {
62 0, 1, 8, 16, 9, 2, 3, 10,
63 17, 24, 32, 25, 18, 11, 4, 5,
64 12, 19, 26, 33, 40, 48, 41, 34,
65 27, 20, 13, 6, 7, 14, 21, 28,
66 35, 42, 49, 56, 57, 50, 43, 36,
67 29, 22, 15, 23, 30, 37, 44, 51,
68 58, 59, 52, 45, 38, 31, 39, 46,
69 53, 60, 61, 54, 47, 55, 62, 63
72 /* Specific zigzag scan for 248 idct. NOTE that unlike the
73 specification, we interleave the fields */
74 const uint8_t ff_zigzag248_direct[64] = {
75 0, 8, 1, 9, 16, 24, 2, 10,
76 17, 25, 32, 40, 48, 56, 33, 41,
77 18, 26, 3, 11, 4, 12, 19, 27,
78 34, 42, 49, 57, 50, 58, 35, 43,
79 20, 28, 5, 13, 6, 14, 21, 29,
80 36, 44, 51, 59, 52, 60, 37, 45,
81 22, 30, 7, 15, 23, 31, 38, 46,
82 53, 61, 54, 62, 39, 47, 55, 63,
85 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* 16-byte aligned so SIMD code can load it directly. */
86 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (used for interlaced content). */
88 const uint8_t ff_alternate_horizontal_scan[64] = {
89 0, 1, 2, 3, 8, 9, 16, 17,
90 10, 11, 4, 5, 6, 7, 15, 14,
91 13, 12, 19, 18, 24, 25, 32, 33,
92 26, 27, 20, 21, 22, 23, 28, 29,
93 30, 31, 34, 35, 40, 41, 48, 49,
94 42, 43, 36, 37, 38, 39, 44, 45,
95 46, 47, 50, 51, 56, 57, 58, 59,
96 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order (used for interlaced content). */
99 const uint8_t ff_alternate_vertical_scan[64] = {
100 0, 8, 16, 24, 1, 9, 2, 10,
101 17, 25, 32, 40, 48, 56, 57, 49,
102 41, 33, 26, 18, 3, 11, 4, 12,
103 19, 27, 34, 42, 50, 58, 35, 43,
104 51, 59, 20, 28, 5, 13, 6, 14,
105 21, 29, 36, 44, 52, 60, 37, 45,
106 53, 61, 22, 30, 7, 15, 23, 31,
107 38, 46, 54, 62, 39, 47, 55, 63,
110 /* Input permutation for the simple_idct_mmx */
/* Entries are packed as 0xRC: high nibble = row*8 offset, low nibble = column. */
111 static const uint8_t simple_mmx_permutation[64]={
112 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
113 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
114 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
115 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
116 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
117 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
118 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
119 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Row permutation applied by the SSE2 IDCT: even rows first, then odd. */
122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Initialize a ScanTable: store the source scan order, build the
 * permutated scan (permutation applied to each scantable entry), and
 * fill raster_end. NOTE(review): the loops around these statements are
 * elided in this chunk — exact bounds not visible here. */
124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
128     st->scantable= src_scantable;
/* permutated[i] = permutation applied to the i-th scan position */
132         j = src_scantable[i];
133         st->permutated[i] = permutation[j];
/* raster_end[i]: running maximum of permutated positions — TODO confirm against full source */
142             j = st->permutated[i];
144         st->raster_end[i]= end;
/* Sum of all pixel values in a 16x16 block (inner loop advances 8 at a
 * time; per-pixel additions elided in this chunk). */
148 static int pix_sum_c(uint8_t * pix, int line_size)
153     for (i = 0; i < 16; i++) {
154         for (j = 0; j < 16; j += 8) {
/* step to the next row: 16 bytes were consumed within the row */
165         pix += line_size - 16;
/* Sum of squared pixel values of a 16x16 block, via the square lookup
 * table. Processes 8 pixels per iteration, reading them as one 64-bit
 * word when available, otherwise two 32-bit words. */
170 static int pix_norm1_c(uint8_t * pix, int line_size)
/* center the table so negative differences would also index correctly */
173     uint32_t *sq = ff_squareTbl + 256;
176     for (i = 0; i < 16; i++) {
177         for (j = 0; j < 16; j += 8) {
/* 64-bit path: extract each byte lane and accumulate its square */
189             register uint64_t x=*(uint64_t*)pix;
191             s += sq[(x>>8)&0xff];
192             s += sq[(x>>16)&0xff];
193             s += sq[(x>>24)&0xff];
194             s += sq[(x>>32)&0xff];
195             s += sq[(x>>40)&0xff];
196             s += sq[(x>>48)&0xff];
197             s += sq[(x>>56)&0xff];
/* 32-bit path: two 4-byte loads cover the same 8 pixels */
199             register uint32_t x=*(uint32_t*)pix;
201             s += sq[(x>>8)&0xff];
202             s += sq[(x>>16)&0xff];
203             s += sq[(x>>24)&0xff];
204             x=*(uint32_t*)(pix+4);
206             s += sq[(x>>8)&0xff];
207             s += sq[(x>>16)&0xff];
208             s += sq[(x>>24)&0xff];
/* advance to the next row (16 bytes already consumed) */
213         pix += line_size - 16;
/* Byte-swap a buffer of 32-bit words, unrolled 8x; the trailing loop
 * (partially elided here) handles the remaining w%8 words. */
218 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
221     for(i=0; i+8<=w; i+=8){
222         dst[i+0]= av_bswap32(src[i+0]);
223         dst[i+1]= av_bswap32(src[i+1]);
224         dst[i+2]= av_bswap32(src[i+2]);
225         dst[i+3]= av_bswap32(src[i+3]);
226         dst[i+4]= av_bswap32(src[i+4]);
227         dst[i+5]= av_bswap32(src[i+5]);
228         dst[i+6]= av_bswap32(src[i+6]);
229         dst[i+7]= av_bswap32(src[i+7]);
/* tail: one word at a time */
232         dst[i+0]= av_bswap32(src[i+0]);
/* Byte-swap a buffer of 16-bit words (loop header elided in this chunk). */
236 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
239         *dst++ = av_bswap16(*src++);
/* Sum of squared errors over a 4-pixel-wide block, h rows.
 * First arg is an unused context pointer (common SAD/SSE cmp signature). */
242 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
/* +256 so negative differences index the square table correctly */
245     uint32_t *sq = ff_squareTbl + 256;
248     for (i = 0; i < h; i++) {
249         s += sq[pix1[0] - pix2[0]];
250         s += sq[pix1[1] - pix2[1]];
251         s += sq[pix1[2] - pix2[2]];
252         s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors over an 8-pixel-wide block, h rows. */
259 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
262     uint32_t *sq = ff_squareTbl + 256;
265     for (i = 0; i < h; i++) {
266         s += sq[pix1[0] - pix2[0]];
267         s += sq[pix1[1] - pix2[1]];
268         s += sq[pix1[2] - pix2[2]];
269         s += sq[pix1[3] - pix2[3]];
270         s += sq[pix1[4] - pix2[4]];
271         s += sq[pix1[5] - pix2[5]];
272         s += sq[pix1[6] - pix2[6]];
273         s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors over a 16-pixel-wide block, h rows. */
280 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
283     uint32_t *sq = ff_squareTbl + 256;
286     for (i = 0; i < h; i++) {
287         s += sq[pix1[ 0] - pix2[ 0]];
288         s += sq[pix1[ 1] - pix2[ 1]];
289         s += sq[pix1[ 2] - pix2[ 2]];
290         s += sq[pix1[ 3] - pix2[ 3]];
291         s += sq[pix1[ 4] - pix2[ 4]];
292         s += sq[pix1[ 5] - pix2[ 5]];
293         s += sq[pix1[ 6] - pix2[ 6]];
294         s += sq[pix1[ 7] - pix2[ 7]];
295         s += sq[pix1[ 8] - pix2[ 8]];
296         s += sq[pix1[ 9] - pix2[ 9]];
297         s += sq[pix1[10] - pix2[10]];
298         s += sq[pix1[11] - pix2[11]];
299         s += sq[pix1[12] - pix2[12]];
300         s += sq[pix1[13] - pix2[13]];
301         s += sq[pix1[14] - pix2[14]];
302         s += sq[pix1[15] - pix2[15]];
/* Per-pixel difference of two 8-wide pixel rows into a DCT coefficient
 * block (outer row loop and pointer advances elided in this chunk). */
310 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
311                           const uint8_t *s2, int stride){
314     /* read the pixels */
316         block[0] = s1[0] - s2[0];
317         block[1] = s1[1] - s2[1];
318         block[2] = s1[2] - s2[2];
319         block[3] = s1[3] - s2[3];
320         block[4] = s1[4] - s2[4];
321         block[5] = s1[5] - s2[5];
322         block[6] = s1[6] - s2[6];
323         block[7] = s1[7] - s2[7];
/* Store an 8-wide row of DCT coefficients as pixels, clamped to 0..255
 * via the crop table (row loop elided in this chunk). */
331 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
/* cm[x] clamps x to the 0..255 range */
335     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
337     /* read the pixels */
339         pixels[0] = cm[block[0]];
340         pixels[1] = cm[block[1]];
341         pixels[2] = cm[block[2]];
342         pixels[3] = cm[block[3]];
343         pixels[4] = cm[block[4]];
344         pixels[5] = cm[block[5]];
345         pixels[6] = cm[block[6]];
346         pixels[7] = cm[block[7]];
/* 4-wide variant of ff_put_pixels_clamped_c. */
353 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
357     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
359     /* read the pixels */
361         pixels[0] = cm[block[0]];
362         pixels[1] = cm[block[1]];
363         pixels[2] = cm[block[2]];
364         pixels[3] = cm[block[3]];
/* 2-wide variant of ff_put_pixels_clamped_c. */
371 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
375     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
377     /* read the pixels */
379         pixels[0] = cm[block[0]];
380         pixels[1] = cm[block[1]];
/* Store signed DCT output as unsigned pixels: bias by +128 and clamp to
 * 0..255 (clamp-low branch elided between the visible lines). */
387 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
388                                     uint8_t *restrict pixels,
393     for (i = 0; i < 8; i++) {
394         for (j = 0; j < 8; j++) {
397             else if (*block > 127)
/* in-range value: shift from signed [-128,127] to unsigned [0,255] */
400                 *pixels = (uint8_t)(*block + 128);
404         pixels += (line_size - 8);
/* Add an 8-wide row of DCT coefficients to existing pixels, clamping the
 * result to 0..255 via the crop table (row loop elided in this chunk). */
408 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
412     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
414     /* read the pixels */
416         pixels[0] = cm[pixels[0] + block[0]];
417         pixels[1] = cm[pixels[1] + block[1]];
418         pixels[2] = cm[pixels[2] + block[2]];
419         pixels[3] = cm[pixels[3] + block[3]];
420         pixels[4] = cm[pixels[4] + block[4]];
421         pixels[5] = cm[pixels[5] + block[5]];
422         pixels[6] = cm[pixels[6] + block[6]];
423         pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of ff_add_pixels_clamped_c. */
429 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
433     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
435     /* read the pixels */
437         pixels[0] = cm[pixels[0] + block[0]];
438         pixels[1] = cm[pixels[1] + block[1]];
439         pixels[2] = cm[pixels[2] + block[2]];
440         pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of ff_add_pixels_clamped_c. */
446 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
450     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
452     /* read the pixels */
454         pixels[0] = cm[pixels[0] + block[0]];
455         pixels[1] = cm[pixels[1] + block[1]];
/* Sum of absolute values of DCT coefficients (loop header and return
 * elided in this chunk). */
461 static int sum_abs_dctelem_c(DCTELEM *block)
465         sum+= FFABS(block[i]);
/* Fill a 16-wide block of h rows with a constant byte value. */
469 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
473     for (i = 0; i < h; i++) {
474         memset(block, value, 16);
/* Fill an 8-wide block of h rows with a constant byte value. */
479 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
483     for (i = 0; i < h; i++) {
484         memset(block, value, 8);
/* 2x upscale of an 8x8 block: each source pixel is duplicated
 * horizontally (x * 0x0101 writes the byte twice via a uint16_t store)
 * and vertically (same row written to dst1 and dst2). */
489 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
492     uint16_t *dst1 = (uint16_t *) dst;
493     uint16_t *dst2 = (uint16_t *)(dst + linesize);
495     for (j = 0; j < 8; j++) {
496         for (i = 0; i < 8; i++) {
497             dst1[i] = dst2[i] = src[i] * 0x0101;
/* Rounded averages of 2 and 4 values (round-half-up). */
505 #define avg2(a,b) ((a+b+1)>>1)
506 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* One-point GMC: bilinear interpolation of an 8-wide row with 1/16-pel
 * fractional offsets x16, y16. A..D are the four bilinear weights
 * (they sum to 256, hence the >>8 after adding the rounder). Row loop
 * and pointer advances are elided in this chunk. */
508 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
510     const int A=(16-x16)*(16-y16);
511     const int B=(    x16)*(16-y16);
512     const int C=(16-x16)*(    y16);
513     const int D=(    x16)*(    y16);
518         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
519         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
520         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
521         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
522         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
523         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
524         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
525         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* Global motion compensation with an affine motion model (dxx/dxy/dyx/dyy)
 * and bilinear interpolation at 1<<shift sub-pel precision. The four
 * branches handle x/y in-bounds vs. clipped-to-edge combinations; the
 * loop setup and several statements are elided in this chunk. */
531 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
532               int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
535     const int s= 1<<shift;
545         for(x=0; x<8; x++){ //XXX FIXME optimize
546             int src_x, src_y, frac_x, frac_y, index;
/* unsigned compare doubles as a >=0 && <limit bounds check */
555             if((unsigned)src_x < width){
556                 if((unsigned)src_y < height){
/* fully in-bounds: standard 2D bilinear interpolation */
557                     index= src_x + src_y*stride;
558                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
559                                            + src[index       +1]*   frac_x )*(s-frac_y)
560                                        + (  src[index+stride  ]*(s-frac_x)
561                                            + src[index+stride+1]*   frac_x )*   frac_y
/* y out of range: clamp the row, interpolate horizontally only */
564                     index= src_x + av_clip(src_y, 0, height)*stride;
565                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
566                                            + src[index       +1]*   frac_x )*s
/* x out of range: clamp the column, interpolate vertically only */
570                 if((unsigned)src_y < height){
571                     index= av_clip(src_x, 0, width) + src_y*stride;
572                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
573                                            + src[index+stride  ]*   frac_y )*s
/* both out of range: nearest clamped pixel, no interpolation */
576                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
577                     dst[y*stride + x]=    src[index         ];
/* Third-pel MC, no fractional offset: plain block copy, dispatched on width. */
589 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
591     case 2: put_pixels2_8_c (dst, src, stride, height); break;
592     case 4: put_pixels4_8_c (dst, src, stride, height); break;
593     case 8: put_pixels8_8_c (dst, src, stride, height); break;
594     case 16:put_pixels16_8_c(dst, src, stride, height); break;
/* Third-pel MC, dx=1/3: (2a+b+1)*683 >> 11 approximates division by 3 with rounding. */
598 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
600     for (i=0; i < height; i++) {
601         for (j=0; j < width; j++) {
602             dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Third-pel MC, dx=2/3: weights swapped relative to mc10. */
609 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
611     for (i=0; i < height; i++) {
612         for (j=0; j < width; j++) {
613             dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Third-pel MC, dy=1/3: vertical counterpart of mc10. */
620 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
622     for (i=0; i < height; i++) {
623         for (j=0; j < width; j++) {
624             dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Third-pel MC, dx=dy=1/3: 2D weights 4/3/3/2, 2731/32768 approximates 1/12. */
631 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
633     for (i=0; i < height; i++) {
634         for (j=0; j < width; j++) {
635             dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, dx=1/3 dy=2/3: 2D weights 3/2/4/3. */
642 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
644     for (i=0; i < height; i++) {
645         for (j=0; j < width; j++) {
646             dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, dy=2/3: vertical counterpart of mc20. */
653 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
655     for (i=0; i < height; i++) {
656         for (j=0; j < width; j++) {
657             dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Third-pel MC, dx=2/3 dy=1/3: 2D weights 3/4/2/3. */
664 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
666     for (i=0; i < height; i++) {
667         for (j=0; j < width; j++) {
668             dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, dx=dy=2/3: 2D weights 2/3/3/4. */
675 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
677     for (i=0; i < height; i++) {
678         for (j=0; j < width; j++) {
679             dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Averaging variant of mc00: average the copied block into dst, dispatched on width. */
686 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
688     case 2: avg_pixels2_8_c (dst, src, stride, height); break;
689     case 4: avg_pixels4_8_c (dst, src, stride, height); break;
690     case 8: avg_pixels8_8_c (dst, src, stride, height); break;
691     case 16:avg_pixels16_8_c(dst, src, stride, height); break;
/* Averaging variant of mc10: rounded average of dst with the interpolated value. */
695 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
697     for (i=0; i < height; i++) {
698         for (j=0; j < width; j++) {
699             dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc20. */
706 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
708     for (i=0; i < height; i++) {
709         for (j=0; j < width; j++) {
710             dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc01. */
717 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
719     for (i=0; i < height; i++) {
720         for (j=0; j < width; j++) {
721             dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc11. */
728 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
730     for (i=0; i < height; i++) {
731         for (j=0; j < width; j++) {
732             dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc12. */
739 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
741     for (i=0; i < height; i++) {
742         for (j=0; j < width; j++) {
743             dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc02. */
750 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
752     for (i=0; i < height; i++) {
753         for (j=0; j < width; j++) {
754             dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc21. */
761 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
763     for (i=0; i < height; i++) {
764         for (j=0; j < width; j++) {
765             dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc22. */
772 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
774     for (i=0; i < height; i++) {
775         for (j=0; j < width; j++) {
776             dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
783 #define QPEL_MC(r, OPNAME, RND, OP) \
784 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
785 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
789 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
790 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
791 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
792 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
793 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
794 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
795 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
796 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
802 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
804 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
808 const int src0= src[0*srcStride];\
809 const int src1= src[1*srcStride];\
810 const int src2= src[2*srcStride];\
811 const int src3= src[3*srcStride];\
812 const int src4= src[4*srcStride];\
813 const int src5= src[5*srcStride];\
814 const int src6= src[6*srcStride];\
815 const int src7= src[7*srcStride];\
816 const int src8= src[8*srcStride];\
817 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
818 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
819 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
820 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
821 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
822 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
823 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
824 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
830 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
831 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
836 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
837 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
838 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
839 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
840 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
841 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
842 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
843 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
844 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
845 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
846 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
847 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
848 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
849 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
850 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
851 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
857 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
858 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
863 const int src0= src[0*srcStride];\
864 const int src1= src[1*srcStride];\
865 const int src2= src[2*srcStride];\
866 const int src3= src[3*srcStride];\
867 const int src4= src[4*srcStride];\
868 const int src5= src[5*srcStride];\
869 const int src6= src[6*srcStride];\
870 const int src7= src[7*srcStride];\
871 const int src8= src[8*srcStride];\
872 const int src9= src[9*srcStride];\
873 const int src10= src[10*srcStride];\
874 const int src11= src[11*srcStride];\
875 const int src12= src[12*srcStride];\
876 const int src13= src[13*srcStride];\
877 const int src14= src[14*srcStride];\
878 const int src15= src[15*srcStride];\
879 const int src16= src[16*srcStride];\
880 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
881 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
882 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
883 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
884 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
885 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
886 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
887 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
888 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
889 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
890 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
891 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
892 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
893 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
894 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
895 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
901 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
903 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
904 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
907 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
908 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
911 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
913 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
914 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
917 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
920 copy_block9(full, src, 16, stride, 9);\
921 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
922 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
925 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
927 copy_block9(full, src, 16, stride, 9);\
928 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
931 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
934 copy_block9(full, src, 16, stride, 9);\
935 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
936 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
938 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
943 copy_block9(full, src, 16, stride, 9);\
944 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
945 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
946 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
947 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
949 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
953 copy_block9(full, src, 16, stride, 9);\
954 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
955 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
956 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
957 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
959 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
964 copy_block9(full, src, 16, stride, 9);\
965 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
966 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
967 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
968 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
970 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
974 copy_block9(full, src, 16, stride, 9);\
975 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
976 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
977 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
978 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
980 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
985 copy_block9(full, src, 16, stride, 9);\
986 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
987 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
988 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
989 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
991 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
995 copy_block9(full, src, 16, stride, 9);\
996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
997 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
998 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
999 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1001 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1002 uint8_t full[16*9];\
1005 uint8_t halfHV[64];\
1006 copy_block9(full, src, 16, stride, 9);\
1007 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1008 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1009 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1010 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1012 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1013 uint8_t full[16*9];\
1015 uint8_t halfHV[64];\
1016 copy_block9(full, src, 16, stride, 9);\
1017 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1018 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1019 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1020 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1022 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1024 uint8_t halfHV[64];\
1025 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1026 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1027 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1029 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1031 uint8_t halfHV[64];\
1032 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1033 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1034 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1036 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1037 uint8_t full[16*9];\
1040 uint8_t halfHV[64];\
1041 copy_block9(full, src, 16, stride, 9);\
1042 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1043 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1044 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1045 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1047 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1048 uint8_t full[16*9];\
1050 copy_block9(full, src, 16, stride, 9);\
1051 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1052 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1053 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1055 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1056 uint8_t full[16*9];\
1059 uint8_t halfHV[64];\
1060 copy_block9(full, src, 16, stride, 9);\
1061 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1062 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1063 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1064 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1066 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1067 uint8_t full[16*9];\
1069 copy_block9(full, src, 16, stride, 9);\
1070 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1071 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1072 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1074 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1076 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1077 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1080 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1082 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1083 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1086 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1087 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1090 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1092 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1093 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1096 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1097 uint8_t full[24*17];\
1099 copy_block17(full, src, 24, stride, 17);\
1100 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1101 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1104 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1105 uint8_t full[24*17];\
1106 copy_block17(full, src, 24, stride, 17);\
1107 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1110 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1111 uint8_t full[24*17];\
1113 copy_block17(full, src, 24, stride, 17);\
1114 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1115 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1117 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1118 uint8_t full[24*17];\
1119 uint8_t halfH[272];\
1120 uint8_t halfV[256];\
1121 uint8_t halfHV[256];\
1122 copy_block17(full, src, 24, stride, 17);\
1123 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1124 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1125 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1126 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1128 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1129 uint8_t full[24*17];\
1130 uint8_t halfH[272];\
1131 uint8_t halfHV[256];\
1132 copy_block17(full, src, 24, stride, 17);\
1133 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1134 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1135 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1136 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1138 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1139 uint8_t full[24*17];\
1140 uint8_t halfH[272];\
1141 uint8_t halfV[256];\
1142 uint8_t halfHV[256];\
1143 copy_block17(full, src, 24, stride, 17);\
1144 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1145 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1146 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1147 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1149 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1150 uint8_t full[24*17];\
1151 uint8_t halfH[272];\
1152 uint8_t halfHV[256];\
1153 copy_block17(full, src, 24, stride, 17);\
1154 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1155 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1156 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1157 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1159 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1160 uint8_t full[24*17];\
1161 uint8_t halfH[272];\
1162 uint8_t halfV[256];\
1163 uint8_t halfHV[256];\
1164 copy_block17(full, src, 24, stride, 17);\
1165 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1166 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1167 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1168 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1170 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1171 uint8_t full[24*17];\
1172 uint8_t halfH[272];\
1173 uint8_t halfHV[256];\
1174 copy_block17(full, src, 24, stride, 17);\
1175 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1176 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1177 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1178 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1180 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1181 uint8_t full[24*17];\
1182 uint8_t halfH[272];\
1183 uint8_t halfV[256];\
1184 uint8_t halfHV[256];\
1185 copy_block17(full, src, 24, stride, 17);\
1186 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1187 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1188 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1189 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1191 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1192 uint8_t full[24*17];\
1193 uint8_t halfH[272];\
1194 uint8_t halfHV[256];\
1195 copy_block17(full, src, 24, stride, 17);\
1196 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1197 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1198 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1199 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1201 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1202 uint8_t halfH[272];\
1203 uint8_t halfHV[256];\
1204 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1205 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1206 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1208 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1209 uint8_t halfH[272];\
1210 uint8_t halfHV[256];\
1211 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1212 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1213 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1215 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1216 uint8_t full[24*17];\
1217 uint8_t halfH[272];\
1218 uint8_t halfV[256];\
1219 uint8_t halfHV[256];\
1220 copy_block17(full, src, 24, stride, 17);\
1221 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1222 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1223 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1224 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1226 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1227 uint8_t full[24*17];\
1228 uint8_t halfH[272];\
1229 copy_block17(full, src, 24, stride, 17);\
1230 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1231 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1232 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1234 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1235 uint8_t full[24*17];\
1236 uint8_t halfH[272];\
1237 uint8_t halfV[256];\
1238 uint8_t halfHV[256];\
1239 copy_block17(full, src, 24, stride, 17);\
1240 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1241 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1242 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1243 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1245 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1246 uint8_t full[24*17];\
1247 uint8_t halfH[272];\
1248 copy_block17(full, src, 24, stride, 17);\
1249 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1250 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1251 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1253 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1254 uint8_t halfH[272];\
1255 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1256 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel-store ops plugged into the QPEL_MC macro: rounding (+16) and
 * non-rounding (+15) variants of put/avg, each clamping through cm[].
 * NOTE(review): this chunk is a lossy extract -- the plain #undef op_avg /
 * #undef op_put lines appear to be elided between the visible #undefs. */
1259 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1260 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1261 #define op_put(a, b) a = cm[((b) + 16)>>5]
1262 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the quarter-pel MC function families. */
1264 QPEL_MC(0, put_ , _ , op_put)
1265 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1266 QPEL_MC(0, avg_ , _ , op_avg)
1267 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1269 #undef op_avg_no_rnd
1271 #undef op_put_no_rnd
/* mc00 (no sub-pel offset) is a plain pixel copy/average.
 * NOTE(review): put_no_rnd_qpel16_mc00_c maps to ff_put_pixels16x16_8_c while
 * put_qpel16_mc00_c maps to ff_put_pixels16x16_c -- confirm this asymmetry
 * against the function declarations; it may be intentional or a typo. */
1273 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1274 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1275 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1276 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1277 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1278 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
/* WMV2 half-pel horizontal lowpass: out = clip((9*(a+b) - (c+d) + 8) >> 4)
 * over an 8-pixel row, reading src[-1]..src[9]. cm[] is the clip-to-0..255
 * table (ff_cropTbl offset by MAX_NEG_CROP).
 * NOTE(review): the loop over h rows and the dst/src stride advance are
 * elided from this extract. */
1280 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1281 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1285 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1286 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1287 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1288 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1289 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1290 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1291 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1292 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1298 #if CONFIG_RV40_DECODER
/* RV40 (3,3) quarter-pel position is defined as the plain xy2 (center
 * half-pel) average, so these just forward to the pixels*_xy2 helpers.
 * NOTE(review): closing braces of each wrapper are elided from this extract. */
1299 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1300 put_pixels16_xy2_8_c(dst, src, stride, 16);
1302 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1303 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1305 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1306 put_pixels8_xy2_8_c(dst, src, stride, 8);
1308 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1309 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1311 #endif /* CONFIG_RV40_DECODER */
/* WMV2 half-pel vertical lowpass: same 4-tap (9,9,-1,-1) kernel as the
 * horizontal variant, applied down a column; reads rows -1..9, writes rows
 * 0..7. NOTE(review): the loop over w columns and the src/dst advance are
 * elided from this extract. */
1313 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1314 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1318 const int src_1= src[ -srcStride];
1319 const int src0 = src[0 ];
1320 const int src1 = src[ srcStride];
1321 const int src2 = src[2*srcStride];
1322 const int src3 = src[3*srcStride];
1323 const int src4 = src[4*srcStride];
1324 const int src5 = src[5*srcStride];
1325 const int src6 = src[6*srcStride];
1326 const int src7 = src[7*srcStride];
1327 const int src8 = src[8*srcStride];
1328 const int src9 = src[9*srcStride];
1329 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1330 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1331 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1332 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1333 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1334 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1335 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1336 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel sub-pel MC positions, built from the h/v lowpass filters above:
 *   mc10/mc30: horizontal half-pel averaged with the (shifted) source,
 *   mc20:      pure horizontal half-pel,
 *   mc02:      pure vertical half-pel,
 *   mc12/mc32: H then V lowpass, averaged with the pure V result,
 *   mc22:      H then V lowpass (center).
 * NOTE(review): local buffer declarations (half/halfH/halfV/halfHV) and the
 * closing braces are elided from this extract. */
1342 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1344 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1345 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1348 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1349 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1352 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1354 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1355 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1358 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1359 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1362 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1366 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1367 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1368 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1369 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1371 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1375 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1376 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1377 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1378 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1380 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1382 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1383 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 in-loop deblocking filter across a horizontal block edge: for each
 * column x it reads the two pixels above (p0, p1) and below (p2, p3) the
 * edge, computes the edge gradient d and applies a strength-dependent
 * correction d1 to p1/p2 and a smaller correction d2 to p0/p3.
 * The whole body is compiled out unless an H.263 codec is enabled.
 * NOTE(review): the per-column loop header, the d1->p1/p2 application lines
 * and the ad1 computation are elided from this extract. */
1386 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1387 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1389 const int strength= ff_h263_loop_filter_strength[qscale];
1393 int p0= src[x-2*stride];
1394 int p1= src[x-1*stride];
1395 int p2= src[x+0*stride];
1396 int p3= src[x+1*stride];
1397 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* Piecewise-linear response: |d| beyond 2*strength is left untouched. */
1399 if (d<-2*strength) d1= 0;
1400 else if(d<- strength) d1=-2*strength - d;
1401 else if(d< strength) d1= d;
1402 else if(d< 2*strength) d1= 2*strength - d;
/* Branchless clamp of p1/p2 to 0..255 after adding/subtracting d1:
 * if the value left [0,255], bit 8 is set and ~(p>>31) yields 0 or 255. */
1407 if(p1&256) p1= ~(p1>>31);
1408 if(p2&256) p2= ~(p2>>31);
1410 src[x-1*stride] = p1;
1411 src[x+0*stride] = p2;
/* Secondary, weaker smoothing of the outer pixels, limited by ad1 (=|d1|). */
1415 d2= av_clip((p0-p3)/4, -ad1, ad1);
1417 src[x-2*stride] = p0 - d2;
1418 src[x+ stride] = p3 + d2;
/* H.263 in-loop deblocking filter across a vertical block edge; identical
 * math to h263_v_loop_filter_c but indexing along rows (y*stride +/- offset)
 * instead of columns. NOTE(review): per-row loop header, d1 application and
 * ad1 computation are elided from this extract. */
1423 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1424 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1426 const int strength= ff_h263_loop_filter_strength[qscale];
1430 int p0= src[y*stride-2];
1431 int p1= src[y*stride-1];
1432 int p2= src[y*stride+0];
1433 int p3= src[y*stride+1];
1434 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1436 if (d<-2*strength) d1= 0;
1437 else if(d<- strength) d1=-2*strength - d;
1438 else if(d< strength) d1= d;
1439 else if(d< 2*strength) d1= 2*strength - d;
/* Branchless clamp to 0..255, same trick as the vertical filter. */
1444 if(p1&256) p1= ~(p1>>31);
1445 if(p2&256) p2= ~(p2>>31);
1447 src[y*stride-1] = p1;
1448 src[y*stride+0] = p2;
1452 d2= av_clip((p0-p3)/4, -ad1, ad1);
1454 src[y*stride-2] = p0 - d2;
1455 src[y*stride+1] = p3 + d2;
/* H.261 loop filter over an 8x8 block: separable [1 2 1]/4 smoothing.
 * A temp[] plane holds 4x the border rows and the vertically filtered
 * interior; a second pass filters horizontally with +8 rounding and >>4.
 * NOTE(review): the x/y loop headers, temp[] declaration and the yz index
 * computation are elided from this extract -- semantics inferred from the
 * visible arithmetic only. */
1460 static void h261_loop_filter_c(uint8_t *src, int stride){
1465 temp[x ] = 4*src[x ];
1466 temp[x + 7*8] = 4*src[x + 7*stride];
1470 xy = y * stride + x;
1472 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1477 src[ y*stride] = (temp[ y*8] + 2)>>2;
1478 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1480 xy = y * stride + x;
1482 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* 16-wide SAD motion-estimation metrics. Each accumulates |pix1 - pred| over
 * an unrolled 16-pixel row for h rows; the _x2/_y2/_xy2 variants compare
 * against half-pel predictions built with avg2/avg4 from pix2 (and the next
 * row pix3). NOTE(review): accumulator declarations, row-loop headers,
 * pointer advances and return statements are elided from this extract. */
1487 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1493 s += abs(pix1[0] - pix2[0]);
1494 s += abs(pix1[1] - pix2[1]);
1495 s += abs(pix1[2] - pix2[2]);
1496 s += abs(pix1[3] - pix2[3]);
1497 s += abs(pix1[4] - pix2[4]);
1498 s += abs(pix1[5] - pix2[5]);
1499 s += abs(pix1[6] - pix2[6]);
1500 s += abs(pix1[7] - pix2[7]);
1501 s += abs(pix1[8] - pix2[8]);
1502 s += abs(pix1[9] - pix2[9]);
1503 s += abs(pix1[10] - pix2[10]);
1504 s += abs(pix1[11] - pix2[11]);
1505 s += abs(pix1[12] - pix2[12]);
1506 s += abs(pix1[13] - pix2[13]);
1507 s += abs(pix1[14] - pix2[14]);
1508 s += abs(pix1[15] - pix2[15]);
/* Horizontal half-pel: reference is avg2 of two horizontally adjacent
 * pixels (reads pix2[16]). */
1515 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1521 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1522 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1523 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1524 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1525 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1526 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1527 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1528 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1529 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1530 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1531 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1532 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1533 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1534 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1535 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1536 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* Vertical half-pel: reference is avg2 of the same column in two rows. */
1543 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1546 uint8_t *pix3 = pix2 + line_size;
1550 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1551 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1552 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1553 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1554 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1555 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1556 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1557 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1558 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1559 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1560 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1561 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1562 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1563 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1564 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1565 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* Center half-pel: reference is avg4 of the 2x2 neighborhood. */
1573 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1576 uint8_t *pix3 = pix2 + line_size;
1580 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1581 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1582 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1583 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1584 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1585 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1586 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1587 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1588 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1589 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1590 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1591 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1592 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1593 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1594 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1595 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD metrics: same structure as the pix_abs16_* family above but
 * over 8 pixels per row. NOTE(review): accumulator declarations, row loops,
 * pointer advances and return statements are elided from this extract. */
1603 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1609 s += abs(pix1[0] - pix2[0]);
1610 s += abs(pix1[1] - pix2[1]);
1611 s += abs(pix1[2] - pix2[2]);
1612 s += abs(pix1[3] - pix2[3]);
1613 s += abs(pix1[4] - pix2[4]);
1614 s += abs(pix1[5] - pix2[5]);
1615 s += abs(pix1[6] - pix2[6]);
1616 s += abs(pix1[7] - pix2[7]);
/* Horizontal half-pel variant. */
1623 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1629 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1630 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1631 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1632 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1633 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1634 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1635 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1636 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* Vertical half-pel variant. */
1643 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1646 uint8_t *pix3 = pix2 + line_size;
1650 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1651 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1652 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1653 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1654 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1655 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1656 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1657 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* Center half-pel variant. */
1665 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1668 uint8_t *pix3 = pix2 + line_size;
1672 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1673 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1674 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1675 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1676 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1677 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1678 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1679 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE metrics (16- and 8-wide): score1 is plain SSE;
 * score2 is the difference of the two blocks' 2x2 gradient magnitudes, so
 * matching "texture" is penalized less. The context's nsse_weight scales
 * the gradient term (default 8 when called with a NULL context).
 * NOTE(review): loop headers and pointer advances are elided from this
 * extract. */
1687 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1688 MpegEncContext *c = v;
1694 for(x=0; x<16; x++){
1695 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1698 for(x=0; x<15; x++){
1699 score2+= FFABS( s1[x ] - s1[x +stride]
1700 - s1[x+1] + s1[x+1+stride])
1701 -FFABS( s2[x ] - s2[x +stride]
1702 - s2[x+1] + s2[x+1+stride]);
1709 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1710 else return score1 + FFABS(score2)*8;
/* 8-wide version of the above. */
1713 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1714 MpegEncContext *c = v;
1721 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1725 score2+= FFABS( s1[x ] - s1[x +stride]
1726 - s1[x+1] + s1[x+1+stride])
1727 -FFABS( s2[x ] - s2[x +stride]
1728 - s2[x+1] + s2[x+1+stride]);
1735 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1736 else return score1 + FFABS(score2)*8;
/* try_8x8basis_c: evaluates the weighted squared error that would result
 * from adding `scale` times a basis function to the residual (used by the
 * trellis/basis search in the encoder). add_8x8basis_c actually applies it.
 * The (1<<(BASIS_SHIFT-RECON_SHIFT-1)) term is the rounding offset for the
 * fixed-point shift. NOTE(review): sum declaration and returns elided. */
1739 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1743 for(i=0; i<8*8; i++){
1744 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1747 assert(-512<b && b<512);
1749 sum += (w*b)*(w*b)>>4;
/* In-place: rem += round(basis*scale >> (BASIS_SHIFT-RECON_SHIFT)). */
1754 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1757 for(i=0; i<8*8; i++){
1758 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1763 * permutes an 8x8 block.
1764 * @param block the block which will be permuted according to the given permutation vector
1765 * @param permutation the permutation vector
1766 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1767 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1768 * (inverse) permutated to scantable order!
/* Two passes: copy the nonzero coefficients (in scan order) into temp[],
 * then write each back at its permuted position. Only positions up to
 * `last` in scan order are touched. NOTE(review): the temp[] declaration,
 * the temp copy / block clear in the first loop, and the closing brace are
 * elided from this extract. */
1770 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1776 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1778 for(i=0; i<=last; i++){
1779 const int j= scantable[i];
1784 for(i=0; i<=last; i++){
1785 const int j= scantable[i];
1786 const int perm_j= permutation[j];
1787 block[perm_j]= temp[j];
/* zero_cmp: dummy comparison function that always reports a zero score
 * (body elided in this extract). */
1791 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* ff_set_cmp: fills cmp[0..5] with the comparison functions of the
 * requested type from the DSPContext. NOTE(review): the switch statement,
 * the SAD/SSE/satd and other cases, and the loop over block sizes are
 * elided from this extract -- only a few case bodies are visible. */
1795 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1798 memset(cmp, 0, sizeof(void*)*6);
1806 cmp[i]= c->hadamard8_diff[i];
1812 cmp[i]= c->dct_sad[i];
1815 cmp[i]= c->dct264_sad[i];
1818 cmp[i]= c->dct_max[i];
1821 cmp[i]= c->quant_psnr[i];
1850 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* SWAR byte-wise addition: one machine word at a time, using the pb_7f/pb_80
 * masks (defined near the top of the file) to add the low 7 bits of each
 * byte and patch the top bit with XOR so carries never cross byte lanes.
 * The trailing scalar loop handles the last w % sizeof(long) bytes.
 * NOTE(review): the `int i;` declarations and trailing-loop headers are
 * elided from this extract. */
1855 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1857 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1858 long a = *(long*)(src+i);
1859 long b = *(long*)(dst+i);
1860 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1863 dst[i+0] += src[i+0];
/* Same trick, three-operand form: dst = src1 + src2 (byte-wise). */
1866 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1868 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1869 long a = *(long*)(src1+i);
1870 long b = *(long*)(src2+i);
1871 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1874 dst[i] = src1[i]+src2[i];
/* SWAR byte-wise subtraction dst = src1 - src2 (mod 256 per byte). On
 * targets without fast unaligned loads, a scalar 8-at-a-time path is used
 * when src2 is misaligned. NOTE(review): `int i;`, the #else/#endif of the
 * alignment branch and the trailing scalar loop header are elided from
 * this extract. */
1877 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1879 #if !HAVE_FAST_UNALIGNED
1880 if((long)src2 & (sizeof(long)-1)){
1881 for(i=0; i+7<w; i+=8){
1882 dst[i+0] = src1[i+0]-src2[i+0];
1883 dst[i+1] = src1[i+1]-src2[i+1];
1884 dst[i+2] = src1[i+2]-src2[i+2];
1885 dst[i+3] = src1[i+3]-src2[i+3];
1886 dst[i+4] = src1[i+4]-src2[i+4];
1887 dst[i+5] = src1[i+5]-src2[i+5];
1888 dst[i+6] = src1[i+6]-src2[i+6];
1889 dst[i+7] = src1[i+7]-src2[i+7];
/* Word-at-a-time path: borrow is confined to each byte lane via pb_7f/pb_80. */
1893 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1894 long a = *(long*)(src1+i);
1895 long b = *(long*)(src2+i);
1896 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1899 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV prediction helpers. The median predictor takes the median of
 * left, top, and left+top-lefttop (mid_pred) and adds/subtracts the coded
 * difference; *left / *left_top carry state across calls.
 * NOTE(review): the bodies are almost entirely elided in this extract --
 * only one representative line per function is visible; left/l/lt setup,
 * loops and state write-back are missing. */
1902 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1910 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* Inverse of the above: emits dst = src2 - predicted. */
1919 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1927 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Left prediction: running sum of bytes starting from acc; returns the
 * final accumulator (body elided). */
1937 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1940 for(i=0; i<w-1; i++){
/* BGR32 variant tracking per-channel accumulators (body elided). */
1967 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers used by the hadamard8_* metrics below.
 * NOTE(review): the expansion bodies of BUTTERFLY2 and BUTTERFLY1 are
 * elided from this extract (only the continuation-line headers remain);
 * BUTTERFLYA is the |x+y| + |x-y| terminal step. */
1997 #define BUTTERFLY2(o1,o2,i1,i2) \
2001 #define BUTTERFLY1(x,y) \
2010 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the difference src - dst, then sum of
 * absolute transform coefficients. Rows are transformed in the first loop
 * (horizontal butterflies), columns in the second, with BUTTERFLYA folding
 * the last stage into the absolute sum. NOTE(review): temp[] declaration,
 * loop headers, `sum +=` prefix of the final expression and the return are
 * elided from this extract. */
2012 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2020 //FIXME try pointer walks
2021 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2022 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2023 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2024 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2026 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2027 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2028 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2029 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2031 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2032 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2033 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2034 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2038 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2039 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2040 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2041 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2043 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2044 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2045 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2046 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2049 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2050 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2051 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2052 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: identical Hadamard structure to hadamard8_diff8x8_c but fed
 * with the source pixels directly (dummy is unused), and the DC-dominated
 * term is subtracted at the end so the score reflects AC energy only.
 * NOTE(review): temp[] declaration, loop headers, `sum +=` prefix and the
 * return are elided from this extract. */
2057 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2065 //FIXME try pointer walks
2066 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2067 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2068 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2069 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2071 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2072 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2073 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2074 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2076 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2077 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2078 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2079 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2083 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2084 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2085 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2086 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2088 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2089 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2090 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2091 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2094 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2095 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2096 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2097 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2100 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-DCT the pixel difference and sum the absolute
 * coefficients via the context's sum_abs_dctelem.
 * NOTE(review): the s->dsp.fdct(temp) call between diff and sum is elided
 * from this extract, as is the closing brace. */
2105 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2106 MpegEncContext * const s= (MpegEncContext *)c;
2107 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2111 s->dsp.diff_pixels(temp, src1, src2, stride);
2113 return s->dsp.sum_abs_dctelem(temp);
/* Interior of an 8-point 1-D integer transform macro in the H.264 style
 * (butterflies on sums s* and differences d*, with >>1 / >>2 scaled odd
 * part). NOTE(review): the macro's #define line and the DST(0,...)/DST(4,...)
 * even outputs are elided from this extract; every line below is a macro
 * continuation. */ \
2118 const int s07 = SRC(0) + SRC(7);\
2119 const int s16 = SRC(1) + SRC(6);\
2120 const int s25 = SRC(2) + SRC(5);\
2121 const int s34 = SRC(3) + SRC(4);\
2122 const int a0 = s07 + s34;\
2123 const int a1 = s16 + s25;\
2124 const int a2 = s07 - s34;\
2125 const int a3 = s16 - s25;\
2126 const int d07 = SRC(0) - SRC(7);\
2127 const int d16 = SRC(1) - SRC(6);\
2128 const int d25 = SRC(2) - SRC(5);\
2129 const int d34 = SRC(3) - SRC(4);\
2130 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2131 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2132 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2133 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2135 DST(1, a4 + (a7>>2)) ;\
2136 DST(2, a2 + (a3>>1)) ;\
2137 DST(3, a5 + (a6>>2)) ;\
2139 DST(5, a6 - (a5>>2)) ;\
2140 DST(6, (a2>>1) - a3 ) ;\
2141 DST(7, (a4>>2) - a7 ) ;\
/* dct264_sad8x8_c: SATD using the H.264-style integer transform -- first
 * pass transforms rows in place, second pass transforms columns while
 * accumulating |coeff| through the redefined DST macro.
 * NOTE(review): the dct[][] declaration, sum initialization, #undef lines
 * and the return/scaling are elided from this extract. */
2144 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2145 MpegEncContext * const s= (MpegEncContext *)c;
2150 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2152 #define SRC(x) dct[i][x]
2153 #define DST(x,v) dct[i][x]= v
2154 for( i = 0; i < 8; i++ )
2159 #define SRC(x) dct[x][i]
2160 #define DST(x,v) sum += FFABS(v)
2161 for( i = 0; i < 8; i++ )
/* dct_max8x8_c: maximum absolute DCT coefficient of the difference block.
 * NOTE(review): the fdct call, loop header and return are elided. */
2169 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2170 MpegEncContext * const s= (MpegEncContext *)c;
2171 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2176 s->dsp.diff_pixels(temp, src1, src2, stride);
2180 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-noise metric: DCT the difference, keep a copy in bak[],
 * quantize + dequantize + IDCT, then return the squared error between the
 * reconstructed and original coefficients (i.e. the distortion introduced
 * by quantization at s->qscale). NOTE(review): the fdct call, sum
 * initialization, loop header and return are elided from this extract. */
2185 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2186 MpegEncContext * const s= (MpegEncContext *)c;
2187 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2188 DCTELEM * const bak = temp+64;
2194 s->dsp.diff_pixels(temp, src1, src2, stride);
2196 memcpy(bak, temp, 64*sizeof(DCTELEM));
2198 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2199 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2200 ff_simple_idct_8(temp); //FIXME
2203 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for one 8x8 block: DCT+quantize the difference,
 * count the VLC bits of the resulting run/level pairs (escape cost for
 * out-of-range levels), dequantize + IDCT back onto a copy of src2, and
 * return SSE distortion plus a lambda-scaled bit cost
 * (qscale^2 * 109/128 per bit). NOTE(review): fdct call, start_i/bits/run
 * initialization, level extraction, intra/inter branch headers and escape
 * handling lines are elided from this extract. */
2208 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2209 MpegEncContext * const s= (MpegEncContext *)c;
2210 const uint8_t *scantable= s->intra_scantable.permutated;
2211 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2212 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2213 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2214 int i, last, run, bits, level, distortion, start_i;
2215 const int esc_length= s->ac_esc_length;
2217 uint8_t * last_length;
2221 copy_block8(lsrc1, src1, 8, stride, 8);
2222 copy_block8(lsrc2, src2, 8, stride, 8);
2224 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2226 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* Intra blocks use the intra AC tables and add the luma DC cost. */
2232 length = s->intra_ac_vlc_length;
2233 last_length= s->intra_ac_vlc_last_length;
2234 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2237 length = s->inter_ac_vlc_length;
2238 last_length= s->inter_ac_vlc_last_length;
2243 for(i=start_i; i<last; i++){
2244 int j= scantable[i];
/* Level fits the unified VLC table iff it is within +/-127 of center. */
2249 if((level&(~127)) == 0){
2250 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2259 level= temp[i] + 64;
2263 if((level&(~127)) == 0){
2264 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2272 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2274 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2277 s->dsp.idct_add(lsrc2, 8, temp);
2279 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2281 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-count metric: same quantize + VLC bit counting as rd8x8_c but
 * without the reconstruction/distortion part -- returns only the coding
 * cost. NOTE(review): fdct call, bits/run/start_i initialization, level
 * extraction, branch headers, escape handling and the return are elided
 * from this extract. */
2284 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2285 MpegEncContext * const s= (MpegEncContext *)c;
2286 const uint8_t *scantable= s->intra_scantable.permutated;
2287 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2288 int i, last, run, bits, level, start_i;
2289 const int esc_length= s->ac_esc_length;
2291 uint8_t * last_length;
2295 s->dsp.diff_pixels(temp, src1, src2, stride);
2297 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2303 length = s->intra_ac_vlc_length;
2304 last_length= s->intra_ac_vlc_last_length;
2305 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2308 length = s->inter_ac_vlc_length;
2309 last_length= s->inter_ac_vlc_last_length;
2314 for(i=start_i; i<last; i++){
2315 int j= scantable[i];
2320 if((level&(~127)) == 0){
2321 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2330 level= temp[i] + 64;
2334 if((level&(~127)) == 0){
2335 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-activity metrics: VSAD_INTRA/VSSE_INTRA sum |or squared| row
 * differences within one block (generated for each `size`); vsad16_c /
 * vsse16_c do the same on the difference of two blocks. Used as noise/
 * interlace measures by the encoder. NOTE(review): the macro closing lines,
 * score declarations, row loops and returns are elided from this extract. */
2343 #define VSAD_INTRA(size) \
2344 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2348 for(y=1; y<h; y++){ \
2349 for(x=0; x<size; x+=4){ \
2350 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2351 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
2361 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2366 for(x=0; x<16; x++){
2367 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2376 #define SQ(a) ((a)*(a))
2377 #define VSSE_INTRA(size) \
2378 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2382 for(y=1; y<h; y++){ \
2383 for(x=0; x<size; x+=4){ \
2384 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2385 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
2395 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2400 for(x=0; x<16; x++){
2401 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/**
 * Sum of squared differences between an int8 vector and an int16 vector,
 * e.g. for comparing quantized basis coefficients against a reference.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int i;
    int score = 0;

    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];
        score += d * d;
    }
    return score;
}
/* Instantiate 16x16 comparison functions from the 8x8 kernels above.
 * WRAPPER8_16_SQ is defined earlier in this file (not visible here);
 * presumably it evaluates the 8x8 function on the four 8x8 quadrants of a
 * 16x16 block and sums the results -- confirm against the macro definition. */
2419 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2420 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2421 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2423 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2425 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2426 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2427 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2428 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/** Element-wise product of two float vectors: dst[k] = src0[k] * src1[k]. */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[k];
}
/**
 * Element-wise product with src1 read backwards:
 * dst[k] = src0[k] * src1[len - 1 - k].
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[len - 1 - k];
}
/** Fused multiply-add over float vectors: dst[k] = src0[k]*src1[k] + src2[k]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[k] + src2[k];
}
/**
 * Overlap-add windowing (MDCT-style): combines the last len samples of src0
 * (read in reverse) with the first len samples of src1 under the 2*len-sample
 * window win, producing 2*len output samples in dst.
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i, j;

    /* re-base pointers on the centre of the 2*len-sample region */
    dst  += len;
    win  += len;
    src0 += len;

    for (i = -len, j = len - 1; i < 0; i++, j--) {
        const float x    = src0[i];
        const float y    = src1[j];
        const float w_lo = win[i];
        const float w_hi = win[j];
        dst[i] = x * w_hi - y * w_lo;
        dst[j] = x * w_lo + y * w_hi;
    }
}
/** Scale a float vector by a scalar: dst[k] = src[k] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src[k] * mul;
}
/**
 * In-place butterfly over two float vectors:
 * (v1[i], v2[i]) <- (v1[i] + v2[i], v1[i] - v2[i]).
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;

    for (i = 0; i < len; i++) {
        const float diff = v1[i] - v2[i];
        v1[i] = v1[i] + v2[i];
        v2[i] = diff;
    }
}
/** Dot product of two float vectors. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    int i;
    float acc = 0.0;

    for (i = 0; i < len; i++)
        acc += v1[i] * v2[i];

    return acc;
}
/**
 * Clip one float, operating on its raw IEEE-754 bit pattern.
 * mini/maxi are the bit patterns of a negative lower bound and positive upper
 * bound; maxisign is maxi with the sign bit flipped. Because a negative
 * float's bits compare greater (as unsigned) than any positive float's, a
 * single unsigned compare handles the "below min" case, and XOR-ing the sign
 * bit makes the remaining values compare in natural float order.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if (a > mini)
        return mini;
    if ((a ^ (1U << 31)) > maxisign)
        return maxi;
    return a;
}
/**
 * Clip a float vector to [*min, *max] using integer bit-pattern compares;
 * only valid when *min < 0 < *max (see clipf_c_one). min/max are passed by
 * pointer so their bit patterns can be reinterpreted as uint32_t.
 * len is assumed to be a multiple of 8.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U << 31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;

    for (i = 0; i < len; i += 8) {
        int k;
        /* 8 elements per outer step, matching the original unrolling */
        for (k = 0; k < 8; k++)
            dsti[i + k] = clipf_c_one(srci[i + k], mini, maxi, maxisign);
    }
}
/**
 * Clip each element of src to [min, max] and store in dst.
 * len is assumed to be a multiple of 8. When the range straddles zero, the
 * branch-reduced integer path in vector_clipf_c_opposite_sign is used.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for (i = 0; i < len; i += 8) {
            int k;
            for (k = 0; k < 8; k++)
                dst[i + k] = av_clipf(src[i + k], min, max);
        }
    }
}
/**
 * Dot product of two int16 vectors; each partial product is shifted right by
 * `shift` before being accumulated.
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int i;
    int res = 0;

    for (i = 0; i < order; i++)
        res += (v1[i] * v2[i]) >> shift;

    return res;
}
/**
 * Combined dot product and multiply-add: returns sum(v1[i]*v2[i]) computed on
 * the ORIGINAL v1 values, while updating v1[i] += mul * v3[i] in place.
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int i;
    int res = 0;

    for (i = 0; i < order; i++) {
        /* accumulate before v1[i] is modified */
        res   += v1[i] * v2[i];
        v1[i] += mul * v3[i];
    }
    return res;
}
/**
 * Apply a symmetric Q15 window to an int16 signal with round-to-nearest:
 * window[i] weights both input[i] and its mirror input[len-1-i].
 * len is assumed to be even.
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    unsigned int i;
    const unsigned int half = len >> 1;

    for (i = 0; i < half; i++) {
        const int w = window[i];
        /* (x*w + 2^14) >> 15 : Q15 multiply with rounding */
        output[i]           = (input[i]           * w + (1 << 14)) >> 15;
        output[len - i - 1] = (input[len - i - 1] * w + (1 << 14)) >> 15;
    }
}
/**
 * Clip each int32 in src to [min, max] and store in dst.
 * len is assumed to be a nonzero multiple of 8 (processed 8 at a time).
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        unsigned int k;
        for (k = 0; k < 8; k++) {
            const int32_t v = *src++;
            /* equivalent to av_clip(v, min, max) */
            *dst++ = v < min ? min : (v > max ? max : v);
        }
        len -= 8;
    } while (len > 0);
}
/* Fixed-point coefficients for the 8-point WMV2 IDCT below:
 * Wn ~= round(2048 * sqrt(2) * cos(n*pi/16)), i.e. an 11-bit scale factor.
 * NOTE(review): the IDCT code below also uses W0, whose #define is not in
 * this excerpt -- confirm it is defined nearby (presumably 2048). */
2591 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2592 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2593 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2594 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2595 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2596 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2597 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
2599 static void wmv2_idct_row(short * b)
2602 int a0,a1,a2,a3,a4,a5,a6,a7;
2604 a1 = W1*b[1]+W7*b[7];
2605 a7 = W7*b[1]-W1*b[7];
2606 a5 = W5*b[5]+W3*b[3];
2607 a3 = W3*b[5]-W5*b[3];
2608 a2 = W2*b[2]+W6*b[6];
2609 a6 = W6*b[2]-W2*b[6];
2610 a0 = W0*b[0]+W0*b[4];
2611 a4 = W0*b[0]-W0*b[4];
2613 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2614 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2616 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2617 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2618 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2619 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2620 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2621 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2622 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2623 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2625 static void wmv2_idct_col(short * b)
2628 int a0,a1,a2,a3,a4,a5,a6,a7;
2629 /*step 1, with extended precision*/
2630 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2631 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2632 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2633 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2634 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2635 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2636 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2637 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
2639 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2640 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2642 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2643 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2644 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2645 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2647 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2648 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2649 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2650 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/** Full in-place 8x8 WMV2 inverse DCT: all rows first, then all columns. */
void ff_wmv2_idct_c(short *block)
{
    int i;

    for (i = 0; i < 64; i += 8)
        wmv2_idct_row(block + i);
    for (i = 0; i < 8; i++)
        wmv2_idct_col(block + i);
}
2662 /* XXX: those functions should be suppressed ASAP when all IDCTs are
       converted to the new (put/add callback) API */
2664 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2666 ff_wmv2_idct_c(block);
2667 ff_put_pixels_clamped_c(block, dest, line_size);
2669 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2671 ff_wmv2_idct_c(block);
2672 ff_add_pixels_clamped_c(block, dest, line_size);
2674 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2677 ff_put_pixels_clamped_c(block, dest, line_size);
2679 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2682 ff_add_pixels_clamped_c(block, dest, line_size);
2685 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2688 put_pixels_clamped4_c(block, dest, line_size);
2690 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2693 add_pixels_clamped4_c(block, dest, line_size);
2696 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2699 put_pixels_clamped2_c(block, dest, line_size);
2701 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2704 add_pixels_clamped2_c(block, dest, line_size);
2707 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2709 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2711 dest[0] = cm[(block[0] + 4)>>3];
2713 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2715 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2717 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2720 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2722 /* init static data */
2723 av_cold void dsputil_static_init(void)
2727 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2728 for(i=0;i<MAX_NEG_CROP;i++) {
2730 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2733 for(i=0;i<512;i++) {
2734 ff_squareTbl[i] = (i - 256) * (i - 256);
2737 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2740 int ff_check_alignment(void){
2741 static int did_fail=0;
2742 LOCAL_ALIGNED_16(int, aligned, [4]);
2744 if((intptr_t)aligned & 15){
2746 #if HAVE_MMX || HAVE_ALTIVEC
2747 av_log(NULL, AV_LOG_ERROR,
2748 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2749 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2750 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2751 "Do not report crashes to Libav developers.\n");
2760 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2764 ff_check_alignment();
2767 if (avctx->bits_per_raw_sample == 10) {
2768 c->fdct = ff_jpeg_fdct_islow_10;
2769 c->fdct248 = ff_fdct248_islow_10;
2771 if(avctx->dct_algo==FF_DCT_FASTINT) {
2772 c->fdct = fdct_ifast;
2773 c->fdct248 = fdct_ifast248;
2775 else if(avctx->dct_algo==FF_DCT_FAAN) {
2776 c->fdct = ff_faandct;
2777 c->fdct248 = ff_faandct248;
2780 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2781 c->fdct248 = ff_fdct248_islow_8;
2784 #endif //CONFIG_ENCODERS
2786 if(avctx->lowres==1){
2787 c->idct_put= ff_jref_idct4_put;
2788 c->idct_add= ff_jref_idct4_add;
2789 c->idct = j_rev_dct4;
2790 c->idct_permutation_type= FF_NO_IDCT_PERM;
2791 }else if(avctx->lowres==2){
2792 c->idct_put= ff_jref_idct2_put;
2793 c->idct_add= ff_jref_idct2_add;
2794 c->idct = j_rev_dct2;
2795 c->idct_permutation_type= FF_NO_IDCT_PERM;
2796 }else if(avctx->lowres==3){
2797 c->idct_put= ff_jref_idct1_put;
2798 c->idct_add= ff_jref_idct1_add;
2799 c->idct = j_rev_dct1;
2800 c->idct_permutation_type= FF_NO_IDCT_PERM;
2802 if (avctx->bits_per_raw_sample == 10) {
2803 c->idct_put = ff_simple_idct_put_10;
2804 c->idct_add = ff_simple_idct_add_10;
2805 c->idct = ff_simple_idct_10;
2806 c->idct_permutation_type = FF_NO_IDCT_PERM;
2808 if(avctx->idct_algo==FF_IDCT_INT){
2809 c->idct_put= ff_jref_idct_put;
2810 c->idct_add= ff_jref_idct_add;
2811 c->idct = j_rev_dct;
2812 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2813 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2814 avctx->idct_algo==FF_IDCT_VP3){
2815 c->idct_put= ff_vp3_idct_put_c;
2816 c->idct_add= ff_vp3_idct_add_c;
2817 c->idct = ff_vp3_idct_c;
2818 c->idct_permutation_type= FF_NO_IDCT_PERM;
2819 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2820 c->idct_put= ff_wmv2_idct_put_c;
2821 c->idct_add= ff_wmv2_idct_add_c;
2822 c->idct = ff_wmv2_idct_c;
2823 c->idct_permutation_type= FF_NO_IDCT_PERM;
2824 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2825 c->idct_put= ff_faanidct_put;
2826 c->idct_add= ff_faanidct_add;
2827 c->idct = ff_faanidct;
2828 c->idct_permutation_type= FF_NO_IDCT_PERM;
2829 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2830 c->idct_put= ff_ea_idct_put_c;
2831 c->idct_permutation_type= FF_NO_IDCT_PERM;
2832 }else{ //accurate/default
2833 c->idct_put = ff_simple_idct_put_8;
2834 c->idct_add = ff_simple_idct_add_8;
2835 c->idct = ff_simple_idct_8;
2836 c->idct_permutation_type= FF_NO_IDCT_PERM;
2841 c->diff_pixels = diff_pixels_c;
2842 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2843 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2844 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2845 c->sum_abs_dctelem = sum_abs_dctelem_c;
2848 c->pix_sum = pix_sum_c;
2849 c->pix_norm1 = pix_norm1_c;
2851 c->fill_block_tab[0] = fill_block16_c;
2852 c->fill_block_tab[1] = fill_block8_c;
2853 c->scale_block = scale_block_c;
2855 /* TODO [0] 16 [1] 8 */
2856 c->pix_abs[0][0] = pix_abs16_c;
2857 c->pix_abs[0][1] = pix_abs16_x2_c;
2858 c->pix_abs[0][2] = pix_abs16_y2_c;
2859 c->pix_abs[0][3] = pix_abs16_xy2_c;
2860 c->pix_abs[1][0] = pix_abs8_c;
2861 c->pix_abs[1][1] = pix_abs8_x2_c;
2862 c->pix_abs[1][2] = pix_abs8_y2_c;
2863 c->pix_abs[1][3] = pix_abs8_xy2_c;
2865 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2866 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2867 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2868 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2869 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2870 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2871 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2872 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2873 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2875 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2876 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2877 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2878 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2879 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2880 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2881 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2882 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2883 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2885 #define dspfunc(PFX, IDX, NUM) \
2886 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2887 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2888 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2889 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2890 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2891 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2892 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2893 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2894 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2895 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2896 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2897 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2898 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2899 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2900 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2901 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2903 dspfunc(put_qpel, 0, 16);
2904 dspfunc(put_no_rnd_qpel, 0, 16);
2906 dspfunc(avg_qpel, 0, 16);
2907 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2909 dspfunc(put_qpel, 1, 8);
2910 dspfunc(put_no_rnd_qpel, 1, 8);
2912 dspfunc(avg_qpel, 1, 8);
2913 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2917 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2918 ff_mlp_init(c, avctx);
2920 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2921 ff_intrax8dsp_init(c,avctx);
2923 #if CONFIG_RV30_DECODER
2924 ff_rv30dsp_init(c,avctx);
2926 #if CONFIG_RV40_DECODER
2927 ff_rv40dsp_init(c,avctx);
2928 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
2929 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
2930 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
2931 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
2934 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2935 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2936 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2937 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2938 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2939 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2940 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2941 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2943 #define SET_CMP_FUNC(name) \
2944 c->name[0]= name ## 16_c;\
2945 c->name[1]= name ## 8x8_c;
2947 SET_CMP_FUNC(hadamard8_diff)
2948 c->hadamard8_diff[4]= hadamard8_intra16_c;
2949 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2950 SET_CMP_FUNC(dct_sad)
2951 SET_CMP_FUNC(dct_max)
2953 SET_CMP_FUNC(dct264_sad)
2955 c->sad[0]= pix_abs16_c;
2956 c->sad[1]= pix_abs8_c;
2960 SET_CMP_FUNC(quant_psnr)
2963 c->vsad[0]= vsad16_c;
2964 c->vsad[4]= vsad_intra16_c;
2965 c->vsad[5]= vsad_intra8_c;
2966 c->vsse[0]= vsse16_c;
2967 c->vsse[4]= vsse_intra16_c;
2968 c->vsse[5]= vsse_intra8_c;
2969 c->nsse[0]= nsse16_c;
2970 c->nsse[1]= nsse8_c;
2972 ff_dsputil_init_dwt(c);
2975 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2977 c->add_bytes= add_bytes_c;
2978 c->add_bytes_l2= add_bytes_l2_c;
2979 c->diff_bytes= diff_bytes_c;
2980 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2981 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2982 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2983 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2984 c->bswap_buf= bswap_buf;
2985 c->bswap16_buf = bswap16_buf;
2986 #if CONFIG_PNG_DECODER
2987 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
2990 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2991 c->h263_h_loop_filter= h263_h_loop_filter_c;
2992 c->h263_v_loop_filter= h263_v_loop_filter_c;
2995 if (CONFIG_VP3_DECODER) {
2996 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
2997 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
2998 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3001 c->h261_loop_filter= h261_loop_filter_c;
3003 c->try_8x8basis= try_8x8basis_c;
3004 c->add_8x8basis= add_8x8basis_c;
3006 #if CONFIG_VORBIS_DECODER
3007 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3009 #if CONFIG_AC3_DECODER
3010 c->ac3_downmix = ff_ac3_downmix_c;
3012 c->vector_fmul = vector_fmul_c;
3013 c->vector_fmul_reverse = vector_fmul_reverse_c;
3014 c->vector_fmul_add = vector_fmul_add_c;
3015 c->vector_fmul_window = vector_fmul_window_c;
3016 c->vector_clipf = vector_clipf_c;
3017 c->scalarproduct_int16 = scalarproduct_int16_c;
3018 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3019 c->apply_window_int16 = apply_window_int16_c;
3020 c->vector_clip_int32 = vector_clip_int32_c;
3021 c->scalarproduct_float = scalarproduct_float_c;
3022 c->butterflies_float = butterflies_float_c;
3023 c->vector_fmul_scalar = vector_fmul_scalar_c;
3025 c->shrink[0]= av_image_copy_plane;
3026 c->shrink[1]= ff_shrink22;
3027 c->shrink[2]= ff_shrink44;
3028 c->shrink[3]= ff_shrink88;
3030 c->prefetch= just_return;
3032 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3033 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3037 #define FUNC(f, depth) f ## _ ## depth
3038 #define FUNCC(f, depth) f ## _ ## depth ## _c
3040 #define dspfunc1(PFX, IDX, NUM, depth)\
3041 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
3042 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3043 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3044 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3046 #define dspfunc2(PFX, IDX, NUM, depth)\
3047 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3048 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3049 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3050 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3051 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3052 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3053 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3054 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3055 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3056 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3057 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3058 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3059 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3060 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3061 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3062 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3065 #define BIT_DEPTH_FUNCS(depth, dct)\
3066 c->get_pixels = FUNCC(get_pixels ## dct , depth);\
3067 c->draw_edges = FUNCC(draw_edges , depth);\
3068 c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
3069 c->clear_block = FUNCC(clear_block ## dct , depth);\
3070 c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\
3071 c->add_pixels8 = FUNCC(add_pixels8 ## dct , depth);\
3072 c->add_pixels4 = FUNCC(add_pixels4 ## dct , depth);\
3073 c->put_no_rnd_pixels_l2[0] = FUNCC(put_no_rnd_pixels16_l2, depth);\
3074 c->put_no_rnd_pixels_l2[1] = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3076 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3077 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3078 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3079 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3080 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3081 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3083 dspfunc1(put , 0, 16, depth);\
3084 dspfunc1(put , 1, 8, depth);\
3085 dspfunc1(put , 2, 4, depth);\
3086 dspfunc1(put , 3, 2, depth);\
3087 dspfunc1(put_no_rnd, 0, 16, depth);\
3088 dspfunc1(put_no_rnd, 1, 8, depth);\
3089 dspfunc1(avg , 0, 16, depth);\
3090 dspfunc1(avg , 1, 8, depth);\
3091 dspfunc1(avg , 2, 4, depth);\
3092 dspfunc1(avg , 3, 2, depth);\
3093 dspfunc1(avg_no_rnd, 0, 16, depth);\
3094 dspfunc1(avg_no_rnd, 1, 8, depth);\
3096 dspfunc2(put_h264_qpel, 0, 16, depth);\
3097 dspfunc2(put_h264_qpel, 1, 8, depth);\
3098 dspfunc2(put_h264_qpel, 2, 4, depth);\
3099 dspfunc2(put_h264_qpel, 3, 2, depth);\
3100 dspfunc2(avg_h264_qpel, 0, 16, depth);\
3101 dspfunc2(avg_h264_qpel, 1, 8, depth);\
3102 dspfunc2(avg_h264_qpel, 2, 4, depth);
3104 switch (avctx->bits_per_raw_sample) {
3106 if (c->dct_bits == 32) {
3107 BIT_DEPTH_FUNCS(9, _32);
3109 BIT_DEPTH_FUNCS(9, _16);
3113 if (c->dct_bits == 32) {
3114 BIT_DEPTH_FUNCS(10, _32);
3116 BIT_DEPTH_FUNCS(10, _16);
3120 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3122 BIT_DEPTH_FUNCS(8, _16);
3127 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
3128 if (ARCH_ARM) dsputil_init_arm (c, avctx);
3129 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
3130 if (HAVE_VIS) dsputil_init_vis (c, avctx);
3131 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
3132 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
3133 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
3134 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
3135 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
3137 for(i=0; i<64; i++){
3138 if(!c->put_2tap_qpel_pixels_tab[0][i])
3139 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3140 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3141 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3144 c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3145 c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3146 c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3147 c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3149 c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3150 c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3151 c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3152 c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3154 switch(c->idct_permutation_type){
3155 case FF_NO_IDCT_PERM:
3157 c->idct_permutation[i]= i;
3159 case FF_LIBMPEG2_IDCT_PERM:
3161 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3163 case FF_SIMPLE_IDCT_PERM:
3165 c->idct_permutation[i]= simple_mmx_permutation[i];
3167 case FF_TRANSPOSE_IDCT_PERM:
3169 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3171 case FF_PARTTRANS_IDCT_PERM:
3173 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3175 case FF_SSE2_IDCT_PERM:
3177 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3180 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");