3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Clipping table, used through a cm = ff_cropTbl + MAX_NEG_CROP pointer so
 * slightly negative indices clamp to 0 and indices > 255 clamp to 255.
 * NOTE(review): zero-initialized here; presumably filled by the DSP init
 * code elsewhere — confirm against the init path. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square table, used as sq = ff_squareTbl + 256 so that sq[x] == x*x for
 * x in [-256, 255]. NOTE(review): also appears to be filled at init time. */
uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_template.c"
51 #include "dsputil_template.c"
55 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 yields 0x0101...01, so multiplying by a byte value broadcasts that
// byte into every byte of an unsigned long — classic SWAR constant trick.)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
/* Standard (MPEG) zigzag scan order: maps scan position -> raster index.
 * Fix: the array initializer was left unterminated; added the closing brace. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
72 /* Specific zigzag scan for 248 idct. NOTE that unlike the
73 specification, we interleave the fields */
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields.
   Fix: the array initializer was left unterminated; added the closing brace. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): storage only — presumably filled by the x86 init code; confirm. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (used e.g. for interlaced material).
 * Fix: the array initializer was left unterminated; added the closing brace. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (used e.g. for interlaced material).
 * Fix: the array initializer was left unterminated; added the closing brace. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
110 /* Input permutation for the simple_idct_mmx */
/* Input permutation for the simple_idct_mmx.
 * Fix: the array initializer was left unterminated; added the closing brace. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
128 st->scantable= src_scantable;
132 j = src_scantable[i];
133 st->permutated[i] = permutation[j];
142 j = st->permutated[i];
144 st->raster_end[i]= end;
/**
 * Return the sum of all 256 pixels of the 16x16 block at pix (row stride
 * line_size). Fix: body was truncated (accumulation and return missing);
 * reconstructed the complete implementation.
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
170 static int pix_norm1_c(uint8_t * pix, int line_size)
173 uint32_t *sq = ff_squareTbl + 256;
176 for (i = 0; i < 16; i++) {
177 for (j = 0; j < 16; j += 8) {
188 #if LONG_MAX > 2147483647
189 register uint64_t x=*(uint64_t*)pix;
191 s += sq[(x>>8)&0xff];
192 s += sq[(x>>16)&0xff];
193 s += sq[(x>>24)&0xff];
194 s += sq[(x>>32)&0xff];
195 s += sq[(x>>40)&0xff];
196 s += sq[(x>>48)&0xff];
197 s += sq[(x>>56)&0xff];
199 register uint32_t x=*(uint32_t*)pix;
201 s += sq[(x>>8)&0xff];
202 s += sq[(x>>16)&0xff];
203 s += sq[(x>>24)&0xff];
204 x=*(uint32_t*)(pix+4);
206 s += sq[(x>>8)&0xff];
207 s += sq[(x>>16)&0xff];
208 s += sq[(x>>24)&0xff];
213 pix += line_size - 16;
/**
 * Byte-swap w 32-bit words from src into dst (may be in place), 8 at a time
 * with a scalar tail loop. Fix: body was truncated (tail loop header and
 * closing braces missing); reconstructed the complete implementation.
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= av_bswap32(src[i+0]);
    }
}
/**
 * Byte-swap len 16-bit words from src into dst (may be in place).
 * Fix: the loop around the swap statement was missing; reconstructed.
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
242 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
245 uint32_t *sq = ff_squareTbl + 256;
248 for (i = 0; i < h; i++) {
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
259 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
262 uint32_t *sq = ff_squareTbl + 256;
265 for (i = 0; i < h; i++) {
266 s += sq[pix1[0] - pix2[0]];
267 s += sq[pix1[1] - pix2[1]];
268 s += sq[pix1[2] - pix2[2]];
269 s += sq[pix1[3] - pix2[3]];
270 s += sq[pix1[4] - pix2[4]];
271 s += sq[pix1[5] - pix2[5]];
272 s += sq[pix1[6] - pix2[6]];
273 s += sq[pix1[7] - pix2[7]];
280 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
283 uint32_t *sq = ff_squareTbl + 256;
286 for (i = 0; i < h; i++) {
287 s += sq[pix1[ 0] - pix2[ 0]];
288 s += sq[pix1[ 1] - pix2[ 1]];
289 s += sq[pix1[ 2] - pix2[ 2]];
290 s += sq[pix1[ 3] - pix2[ 3]];
291 s += sq[pix1[ 4] - pix2[ 4]];
292 s += sq[pix1[ 5] - pix2[ 5]];
293 s += sq[pix1[ 6] - pix2[ 6]];
294 s += sq[pix1[ 7] - pix2[ 7]];
295 s += sq[pix1[ 8] - pix2[ 8]];
296 s += sq[pix1[ 9] - pix2[ 9]];
297 s += sq[pix1[10] - pix2[10]];
298 s += sq[pix1[11] - pix2[11]];
299 s += sq[pix1[12] - pix2[12]];
300 s += sq[pix1[13] - pix2[13]];
301 s += sq[pix1[14] - pix2[14]];
302 s += sq[pix1[15] - pix2[15]];
310 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
314 /* read the pixels */
316 block[0] = pixels[0];
317 block[1] = pixels[1];
318 block[2] = pixels[2];
319 block[3] = pixels[3];
320 block[4] = pixels[4];
321 block[5] = pixels[5];
322 block[6] = pixels[6];
323 block[7] = pixels[7];
329 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
330 const uint8_t *s2, int stride){
333 /* read the pixels */
335 block[0] = s1[0] - s2[0];
336 block[1] = s1[1] - s2[1];
337 block[2] = s1[2] - s2[2];
338 block[3] = s1[3] - s2[3];
339 block[4] = s1[4] - s2[4];
340 block[5] = s1[5] - s2[5];
341 block[6] = s1[6] - s2[6];
342 block[7] = s1[7] - s2[7];
350 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
354 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
356 /* read the pixels */
358 pixels[0] = cm[block[0]];
359 pixels[1] = cm[block[1]];
360 pixels[2] = cm[block[2]];
361 pixels[3] = cm[block[3]];
362 pixels[4] = cm[block[4]];
363 pixels[5] = cm[block[5]];
364 pixels[6] = cm[block[6]];
365 pixels[7] = cm[block[7]];
372 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
376 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
378 /* read the pixels */
380 pixels[0] = cm[block[0]];
381 pixels[1] = cm[block[1]];
382 pixels[2] = cm[block[2]];
383 pixels[3] = cm[block[3]];
390 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
394 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
396 /* read the pixels */
398 pixels[0] = cm[block[0]];
399 pixels[1] = cm[block[1]];
406 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
407 uint8_t *restrict pixels,
412 for (i = 0; i < 8; i++) {
413 for (j = 0; j < 8; j++) {
416 else if (*block > 127)
419 *pixels = (uint8_t)(*block + 128);
423 pixels += (line_size - 8);
427 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
432 /* read the pixels */
434 pixels[0] = block[0];
435 pixels[1] = block[1];
436 pixels[2] = block[2];
437 pixels[3] = block[3];
438 pixels[4] = block[4];
439 pixels[5] = block[5];
440 pixels[6] = block[6];
441 pixels[7] = block[7];
448 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
452 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
454 /* read the pixels */
456 pixels[0] = cm[pixels[0] + block[0]];
457 pixels[1] = cm[pixels[1] + block[1]];
458 pixels[2] = cm[pixels[2] + block[2]];
459 pixels[3] = cm[pixels[3] + block[3]];
460 pixels[4] = cm[pixels[4] + block[4]];
461 pixels[5] = cm[pixels[5] + block[5]];
462 pixels[6] = cm[pixels[6] + block[6]];
463 pixels[7] = cm[pixels[7] + block[7]];
469 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
473 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
475 /* read the pixels */
477 pixels[0] = cm[pixels[0] + block[0]];
478 pixels[1] = cm[pixels[1] + block[1]];
479 pixels[2] = cm[pixels[2] + block[2]];
480 pixels[3] = cm[pixels[3] + block[3]];
486 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
490 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
492 /* read the pixels */
494 pixels[0] = cm[pixels[0] + block[0]];
495 pixels[1] = cm[pixels[1] + block[1]];
501 static int sum_abs_dctelem_c(DCTELEM *block)
505 sum+= FFABS(block[i]);
/**
 * Fill a 16-wide block of height h (row stride line_size) with value.
 * Fix: the stride advance and closing braces were truncated; reconstructed.
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/**
 * Fill an 8-wide block of height h (row stride line_size) with value.
 * Fix: the stride advance and closing braces were truncated; reconstructed.
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/**
 * Upscale an 8x8 block to 16x16 by pixel doubling: each source pixel is
 * replicated 2x2 (0x0101 broadcasts the byte into a 16-bit pair; dst1/dst2
 * are two consecutive output rows, advanced by `linesize` uint16s = 2 rows).
 * Fix: loop increments and closing braces were truncated; reconstructed.
 * NOTE(review): assumes dst is suitably aligned for uint16_t stores.
 */
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);
    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}
/* Rounding 2- and 4-way averages used by the pel interpolation code below. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/**
 * One-warp-point GMC: bilinear interpolation of an 8-wide block of height h
 * with 1/16-pel fractional offsets (x16, y16); rounder is the >>8 rounding
 * bias. Fix: loop header, pointer advances and closing braces were
 * truncated; reconstructed the complete implementation.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/**
 * General global motion compensation: for each of the 8 columns of h rows,
 * walk an affine source position (ox,oy advanced by dxx/dxy/dyx/dyy in
 * 1/(1<<(16)) units), bilinearly interpolate with 1/(1<<shift) fractions and
 * clip coordinates to [0,width]x[0,height] at the borders; r is the rounding
 * bias. Fix: the coordinate-stepping code (vx/vy setup, frac extraction,
 * increments) and closing braces were truncated; reconstructed the complete
 * implementation — verify against the reference decoder output.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* Full-pel copy for tpel MC: dispatch on block width to the fixed-width
 * copy routines. Fix: the switch header and closing braces were truncated;
 * reconstructed. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_8_c (dst, src, stride, height); break;
    case 4: put_pixels4_8_c (dst, src, stride, height); break;
    case 8: put_pixels8_8_c (dst, src, stride, height); break;
    case 16:put_pixels16_8_c(dst, src, stride, height); break;
    }
}
/* tpel MC, horizontal fractional position: weighted average of src[j] and
 * src[j+1] (683/2048 ~= 1/3 rounding). Fix: declarations, stride advances
 * and closing braces were truncated; reconstructed. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* tpel MC, horizontal fractional position: weighted average of src[j] and
 * src[j+1] with the heavier weight on src[j+1]. Fix: declarations, stride
 * advances and closing braces were truncated; reconstructed. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* tpel MC, vertical fractional position: weighted average of src[j] and
 * src[j+stride]. Fix: declarations, stride advances and closing braces were
 * truncated; reconstructed. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* tpel MC, diagonal fractional position: 4-tap weighted average of the
 * 2x2 neighborhood. Fix: declarations, stride advances and closing braces
 * were truncated; reconstructed. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* tpel MC, diagonal fractional position (weights biased toward the row
 * below). Fix: declarations, stride advances and closing braces were
 * truncated; reconstructed. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* tpel MC, vertical fractional position (heavier weight on the row below).
 * Fix: declarations, stride advances and closing braces were truncated;
 * reconstructed. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* tpel MC, diagonal fractional position (weights biased toward the right
 * column). Fix: declarations, stride advances and closing braces were
 * truncated; reconstructed. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* tpel MC, diagonal fractional position (heaviest weight on the bottom-right
 * neighbor). Fix: declarations, stride advances and closing braces were
 * truncated; reconstructed. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
726 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
728 case 2: avg_pixels2_8_c (dst, src, stride, height); break;
729 case 4: avg_pixels4_8_c (dst, src, stride, height); break;
730 case 8: avg_pixels8_8_c (dst, src, stride, height); break;
731 case 16:avg_pixels16_8_c(dst, src, stride, height); break;
/* Averaging variant of mc10: rounding-average the interpolated value with
 * the existing dst. Fix: declarations, stride advances and closing braces
 * were truncated; reconstructed. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc20. Fix: declarations, stride advances and closing
 * braces were truncated; reconstructed. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc01. Fix: declarations, stride advances and closing
 * braces were truncated; reconstructed. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc11. Fix: declarations, stride advances and closing
 * braces were truncated; reconstructed. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc12. Fix: declarations, stride advances and closing
 * braces were truncated; reconstructed. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc02. Fix: declarations, stride advances and closing
 * braces were truncated; reconstructed. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc21. Fix: declarations, stride advances and closing
 * braces were truncated; reconstructed. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc22. Fix: declarations, stride advances and closing
 * braces were truncated; reconstructed. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Generate thin static wrappers put_tpel_pixels<width>_mc??_c around the
 * width-parameterized tpel functions above.
 * Fix: each body read "void put_tpel_pixels_mc??_c(dst, ...);" — with the
 * leading "void" that is a local (K&R-style) function *declaration*, not a
 * call, so every wrapper compiled to a no-op. Dropped the "void" so the
 * wrappers actually forward to the implementations.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
844 #define QPEL_MC(r, OPNAME, RND, OP) \
845 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
846 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
850 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
851 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
852 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
853 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
854 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
855 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
856 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
857 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
863 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
865 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
869 const int src0= src[0*srcStride];\
870 const int src1= src[1*srcStride];\
871 const int src2= src[2*srcStride];\
872 const int src3= src[3*srcStride];\
873 const int src4= src[4*srcStride];\
874 const int src5= src[5*srcStride];\
875 const int src6= src[6*srcStride];\
876 const int src7= src[7*srcStride];\
877 const int src8= src[8*srcStride];\
878 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
879 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
880 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
881 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
882 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
883 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
884 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
885 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
891 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
892 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
897 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
898 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
899 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
900 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
901 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
902 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
903 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
904 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
905 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
906 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
907 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
908 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
909 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
910 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
911 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
912 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
918 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
919 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
924 const int src0= src[0*srcStride];\
925 const int src1= src[1*srcStride];\
926 const int src2= src[2*srcStride];\
927 const int src3= src[3*srcStride];\
928 const int src4= src[4*srcStride];\
929 const int src5= src[5*srcStride];\
930 const int src6= src[6*srcStride];\
931 const int src7= src[7*srcStride];\
932 const int src8= src[8*srcStride];\
933 const int src9= src[9*srcStride];\
934 const int src10= src[10*srcStride];\
935 const int src11= src[11*srcStride];\
936 const int src12= src[12*srcStride];\
937 const int src13= src[13*srcStride];\
938 const int src14= src[14*srcStride];\
939 const int src15= src[15*srcStride];\
940 const int src16= src[16*srcStride];\
941 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
942 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
943 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
944 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
945 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
946 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
947 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
948 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
949 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
950 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
951 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
952 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
953 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
954 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
955 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
956 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
962 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
964 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
965 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
968 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
969 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
972 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
974 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
975 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
978 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
981 copy_block9(full, src, 16, stride, 9);\
982 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
983 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
986 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
988 copy_block9(full, src, 16, stride, 9);\
989 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
992 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
995 copy_block9(full, src, 16, stride, 9);\
996 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
997 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
999 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1000 uint8_t full[16*9];\
1003 uint8_t halfHV[64];\
1004 copy_block9(full, src, 16, stride, 9);\
1005 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1006 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1007 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1008 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1010 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1011 uint8_t full[16*9];\
1013 uint8_t halfHV[64];\
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1020 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021 uint8_t full[16*9];\
1024 uint8_t halfHV[64];\
1025 copy_block9(full, src, 16, stride, 9);\
1026 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1028 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1031 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1032 uint8_t full[16*9];\
1034 uint8_t halfHV[64];\
1035 copy_block9(full, src, 16, stride, 9);\
1036 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1038 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1041 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1042 uint8_t full[16*9];\
1045 uint8_t halfHV[64];\
1046 copy_block9(full, src, 16, stride, 9);\
1047 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1048 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1049 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1050 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1052 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1053 uint8_t full[16*9];\
1055 uint8_t halfHV[64];\
1056 copy_block9(full, src, 16, stride, 9);\
1057 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1058 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1059 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1060 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1062 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1063 uint8_t full[16*9];\
1066 uint8_t halfHV[64];\
1067 copy_block9(full, src, 16, stride, 9);\
1068 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1069 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1070 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1071 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1073 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1074 uint8_t full[16*9];\
1076 uint8_t halfHV[64];\
1077 copy_block9(full, src, 16, stride, 9);\
1078 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1079 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1080 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1081 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1083 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1085 uint8_t halfHV[64];\
1086 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1087 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1088 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1090 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1092 uint8_t halfHV[64];\
1093 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1094 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1095 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1097 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1098 uint8_t full[16*9];\
1101 uint8_t halfHV[64];\
1102 copy_block9(full, src, 16, stride, 9);\
1103 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1104 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1105 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1106 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1108 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1109 uint8_t full[16*9];\
1111 copy_block9(full, src, 16, stride, 9);\
1112 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1113 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1114 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1116 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1117 uint8_t full[16*9];\
1120 uint8_t halfHV[64];\
1121 copy_block9(full, src, 16, stride, 9);\
1122 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1123 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1124 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1125 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1127 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1128 uint8_t full[16*9];\
1130 copy_block9(full, src, 16, stride, 9);\
1131 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1132 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1133 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1135 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1137 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1138 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1141 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1143 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1144 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1147 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1148 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1151 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1153 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1154 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1157 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1158 uint8_t full[24*17];\
1160 copy_block17(full, src, 24, stride, 17);\
1161 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1162 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1165 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1166 uint8_t full[24*17];\
1167 copy_block17(full, src, 24, stride, 17);\
1168 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1171 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1172 uint8_t full[24*17];\
1174 copy_block17(full, src, 24, stride, 17);\
1175 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1176 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1178 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1179 uint8_t full[24*17];\
1180 uint8_t halfH[272];\
1181 uint8_t halfV[256];\
1182 uint8_t halfHV[256];\
1183 copy_block17(full, src, 24, stride, 17);\
1184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1186 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1189 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1190 uint8_t full[24*17];\
1191 uint8_t halfH[272];\
1192 uint8_t halfHV[256];\
1193 copy_block17(full, src, 24, stride, 17);\
1194 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1196 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1199 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200 uint8_t full[24*17];\
1201 uint8_t halfH[272];\
1202 uint8_t halfV[256];\
1203 uint8_t halfHV[256];\
1204 copy_block17(full, src, 24, stride, 17);\
1205 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1207 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1210 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1211 uint8_t full[24*17];\
1212 uint8_t halfH[272];\
1213 uint8_t halfHV[256];\
1214 copy_block17(full, src, 24, stride, 17);\
1215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1217 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1220 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1221 uint8_t full[24*17];\
1222 uint8_t halfH[272];\
1223 uint8_t halfV[256];\
1224 uint8_t halfHV[256];\
1225 copy_block17(full, src, 24, stride, 17);\
1226 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1227 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1228 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1229 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1231 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1232 uint8_t full[24*17];\
1233 uint8_t halfH[272];\
1234 uint8_t halfHV[256];\
1235 copy_block17(full, src, 24, stride, 17);\
1236 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1237 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1238 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1239 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1241 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1242 uint8_t full[24*17];\
1243 uint8_t halfH[272];\
1244 uint8_t halfV[256];\
1245 uint8_t halfHV[256];\
1246 copy_block17(full, src, 24, stride, 17);\
1247 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1248 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1249 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1250 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1252 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1253 uint8_t full[24*17];\
1254 uint8_t halfH[272];\
1255 uint8_t halfHV[256];\
1256 copy_block17(full, src, 24, stride, 17);\
1257 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1258 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1259 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1260 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1262 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1263 uint8_t halfH[272];\
1264 uint8_t halfHV[256];\
1265 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1266 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1267 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1269 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1270 uint8_t halfH[272];\
1271 uint8_t halfHV[256];\
1272 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1273 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1274 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1276 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1277 uint8_t full[24*17];\
1278 uint8_t halfH[272];\
1279 uint8_t halfV[256];\
1280 uint8_t halfHV[256];\
1281 copy_block17(full, src, 24, stride, 17);\
1282 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1283 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1284 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1285 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1287 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1288 uint8_t full[24*17];\
1289 uint8_t halfH[272];\
1290 copy_block17(full, src, 24, stride, 17);\
1291 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1292 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1293 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1295 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1296 uint8_t full[24*17];\
1297 uint8_t halfH[272];\
1298 uint8_t halfV[256];\
1299 uint8_t halfHV[256];\
1300 copy_block17(full, src, 24, stride, 17);\
1301 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1302 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1303 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1304 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1306 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1307 uint8_t full[24*17];\
1308 uint8_t halfH[272];\
1309 copy_block17(full, src, 24, stride, 17);\
1310 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1311 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1312 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1314 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1315 uint8_t halfH[272];\
1316 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1317 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Rounding primitives plugged into the QPEL_MC() template above: each maps
 * the raw filter accumulator "b" through the crop table cm (round, >>5)
 * and either stores it (put) or averages it into "a" (avg). The _no_rnd
 * variants bias by 15 instead of 16 to round down. */
1320 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1321 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1322 #define op_put(a, b) a = cm[((b) + 16)>>5]
1323 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the qpel function sets: rounded put, no-rounding put, and
 * rounded avg. An avg_no_rnd set is intentionally not generated. */
1325 QPEL_MC(0, put_ , _ , op_put)
1326 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1327 QPEL_MC(0, avg_ , _ , op_avg)
1328 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1330 #undef op_avg_no_rnd
1332 #undef op_put_no_rnd
/* The (0,0) quarter-pel cases require no filtering at all, so alias them
 * straight to the plain pixel copy/average wrappers. */
#define put_qpel8_mc00_c ff_put_pixels8x8_c
#define avg_qpel8_mc00_c ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
/* Fixed: previously aliased to ff_put_pixels16x16_8_c, which does not match
 * the naming used by every sibling alias above (the public wrapper is
 * ff_put_pixels16x16_c, cf. put_qpel16_mc00_c) and would be an unresolved
 * symbol. no_rnd and rounded (0,0) cases are identical, as for the 8x8 case. */
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/* WMV2 half-pel horizontal lowpass: 4-tap (-1,9,9,-1)/16 filter with
 * rounding (+8, >>4), clamped through the crop table; 8 pixels per row.
 * NOTE(review): the per-row loop header and the dst/src stride advances
 * appear to be missing from this extraction — confirm against upstream. */
1341 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1342 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1346 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1347 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1348 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1349 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1350 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1351 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1352 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1353 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* RV40 (3,3) quarter-pel cases: RV40 defines these as the plain center
 * (xy2) half-pel average rather than its generic interpolation filter,
 * so forward directly to the 8-bit xy2 helpers. */
1359 #if CONFIG_RV40_DECODER
1360 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1361 put_pixels16_xy2_8_c(dst, src, stride, 16);
1363 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1364 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1366 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1367 put_pixels8_xy2_8_c(dst, src, stride, 8);
1369 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1370 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1372 #endif /* CONFIG_RV40_DECODER */
/* WMV2 half-pel vertical lowpass: same (-1,9,9,-1)/16 filter as the
 * horizontal variant, applied down a column of 8 output pixels; "w" columns
 * are processed. src_1 is the sample one row above the block.
 * NOTE(review): the per-column loop header and src/dst increments appear to
 * be missing from this extraction — confirm against upstream. */
1374 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1375 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1379 const int src_1= src[ -srcStride];
1380 const int src0 = src[0 ];
1381 const int src1 = src[ srcStride];
1382 const int src2 = src[2*srcStride];
1383 const int src3 = src[3*srcStride];
1384 const int src4 = src[4*srcStride];
1385 const int src5 = src[5*srcStride];
1386 const int src6 = src[6*srcStride];
1387 const int src7 = src[7*srcStride];
1388 const int src8 = src[8*srcStride];
1389 const int src9 = src[9*srcStride];
1390 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1391 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1392 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1393 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1394 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1395 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1396 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1397 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* mspel (1,0): horizontal lowpass into a temp, then average with the source
 * (quarter-pel left position). NOTE(review): the "half" buffer declaration
 * is missing from this extraction — confirm against upstream. */
1403 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1405 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1406 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
/* mspel (2,0): pure horizontal half-pel lowpass straight into dst. */
1409 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1410 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* mspel (3,0): horizontal lowpass averaged with src+1 (quarter-pel right).
 * NOTE(review): the "half" buffer declaration is missing from this
 * extraction — confirm against upstream. */
1413 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1415 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1416 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
/* mspel (0,2): pure vertical half-pel lowpass straight into dst. */
1419 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1420 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* mspel (1,2): H lowpass over 11 rows (one above, two below), V lowpass of
 * both the source and the H result, then average (quarter-pel left of the
 * vertical half-pel line). NOTE(review): halfH/halfV/halfHV buffer
 * declarations are missing from this extraction — confirm against upstream. */
1423 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1427 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1428 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1429 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1430 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* mspel (3,2): mirror of mc12 using src+1 for the vertical lowpass
 * (quarter-pel right of the vertical half-pel line). NOTE(review):
 * halfH/halfV/halfHV buffer declarations are missing from this extraction. */
1432 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1436 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1437 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1438 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1439 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* mspel (2,2): H lowpass then V lowpass of its interior rows (halfH+8 skips
 * the extra top row). NOTE(review): the halfH buffer declaration is missing
 * from this extraction — confirm against upstream. */
1441 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1443 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1444 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking across a horizontal block edge (filters the two rows
 * above and below the edge). Filter strength comes from the qscale-indexed
 * table; d1 is the nonlinear "hat" clipping of the gradient d, and d2 is a
 * smaller correction applied to the outer pixels.
 * NOTE(review): loop headers, the d1/ad1/d2 declarations and the p1/p2
 * update lines appear to be partially missing from this extraction. */
1447 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1448 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1450 const int strength= ff_h263_loop_filter_strength[qscale];
1454 int p0= src[x-2*stride];
1455 int p1= src[x-1*stride];
1456 int p2= src[x+0*stride];
1457 int p3= src[x+1*stride];
1458 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1460 if (d<-2*strength) d1= 0;
1461 else if(d<- strength) d1=-2*strength - d;
1462 else if(d< strength) d1= d;
1463 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clamp of p1/p2 to 0..255: if out of [0,255], bit 8 is set and
 * ~(p>>31) yields 0 for negative values, 255 for overflow */
1468 if(p1&256) p1= ~(p1>>31);
1469 if(p2&256) p2= ~(p2>>31);
1471 src[x-1*stride] = p1;
1472 src[x+0*stride] = p2;
1476 d2= av_clip((p0-p3)/4, -ad1, ad1);
1478 src[x-2*stride] = p0 - d2;
1479 src[x+ stride] = p3 + d2;
/* H.263 deblocking across a vertical block edge — identical math to
 * h263_v_loop_filter_c but addressing the two columns either side of the
 * edge. NOTE(review): loop headers and the d1/ad1/d2 declarations appear to
 * be partially missing from this extraction. */
1484 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1485 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1487 const int strength= ff_h263_loop_filter_strength[qscale];
1491 int p0= src[y*stride-2];
1492 int p1= src[y*stride-1];
1493 int p2= src[y*stride+0];
1494 int p3= src[y*stride+1];
1495 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1497 if (d<-2*strength) d1= 0;
1498 else if(d<- strength) d1=-2*strength - d;
1499 else if(d< strength) d1= d;
1500 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clamp to 0..255, same trick as the vertical filter */
1505 if(p1&256) p1= ~(p1>>31);
1506 if(p2&256) p2= ~(p2>>31);
1508 src[y*stride-1] = p1;
1509 src[y*stride+0] = p2;
1513 d2= av_clip((p0-p3)/4, -ad1, ad1);
1515 src[y*stride-2] = p0 - d2;
1516 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block via a
 * temp buffer; border rows/columns are passed through (scaled by 4 so the
 * final rounding shifts are uniform). NOTE(review): loop headers, the temp
 * buffer declaration and the yz index computation are missing from this
 * extraction — confirm against upstream. */
1521 static void h261_loop_filter_c(uint8_t *src, int stride){
1526 temp[x ] = 4*src[x ];
1527 temp[x + 7*8] = 4*src[x + 7*stride];
1531 xy = y * stride + x;
1533 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1538 src[ y*stride] = (temp[ y*8] + 2)>>2;
1539 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1541 xy = y * stride + x;
1543 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* SAD of a 16-wide block: sum of absolute differences between pix1 and pix2
 * over h rows. NOTE(review): the accumulator declaration, row loop and
 * pointer advances are missing from this extraction. */
1548 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1554 s += abs(pix1[0] - pix2[0]);
1555 s += abs(pix1[1] - pix2[1]);
1556 s += abs(pix1[2] - pix2[2]);
1557 s += abs(pix1[3] - pix2[3]);
1558 s += abs(pix1[4] - pix2[4]);
1559 s += abs(pix1[5] - pix2[5]);
1560 s += abs(pix1[6] - pix2[6]);
1561 s += abs(pix1[7] - pix2[7]);
1562 s += abs(pix1[8] - pix2[8]);
1563 s += abs(pix1[9] - pix2[9]);
1564 s += abs(pix1[10] - pix2[10]);
1565 s += abs(pix1[11] - pix2[11]);
1566 s += abs(pix1[12] - pix2[12]);
1567 s += abs(pix1[13] - pix2[13]);
1568 s += abs(pix1[14] - pix2[14]);
1569 s += abs(pix1[15] - pix2[15]);
/* SAD of a 16-wide block against the horizontal half-pel interpolation of
 * pix2 (avg2 of each pixel and its right neighbour). NOTE(review):
 * accumulator, row loop and pointer advances are missing from this
 * extraction. */
1576 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1582 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1583 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1584 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1585 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1586 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1587 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1588 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1589 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1590 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1591 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1592 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1593 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1594 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1595 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1596 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1597 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of a 16-wide block against the vertical half-pel interpolation of
 * pix2 (avg2 of each pixel and the one directly below, via pix3).
 * NOTE(review): accumulator, row loop and pointer advances are missing from
 * this extraction. */
1604 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1607 uint8_t *pix3 = pix2 + line_size;
1611 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1612 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1613 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1614 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1615 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1616 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1617 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1618 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1619 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1620 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1621 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1622 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1623 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1624 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1625 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1626 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of a 16-wide block against the center (xy) half-pel interpolation of
 * pix2 (avg4 of the 2x2 neighbourhood). NOTE(review): accumulator, row loop
 * and pointer advances are missing from this extraction. */
1634 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1637 uint8_t *pix3 = pix2 + line_size;
1641 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1642 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1643 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1644 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1645 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1646 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1647 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1648 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1649 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1650 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1651 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1652 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1653 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1654 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1655 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1656 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* SAD of an 8-wide block over h rows. NOTE(review): accumulator, row loop
 * and pointer advances are missing from this extraction. */
1664 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1670 s += abs(pix1[0] - pix2[0]);
1671 s += abs(pix1[1] - pix2[1]);
1672 s += abs(pix1[2] - pix2[2]);
1673 s += abs(pix1[3] - pix2[3]);
1674 s += abs(pix1[4] - pix2[4]);
1675 s += abs(pix1[5] - pix2[5]);
1676 s += abs(pix1[6] - pix2[6]);
1677 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD against horizontal half-pel interpolation of pix2.
 * NOTE(review): accumulator, row loop and pointer advances are missing from
 * this extraction. */
1684 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1690 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1691 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1692 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1693 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1694 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1695 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1696 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1697 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD against vertical half-pel interpolation of pix2 (pix3 is the
 * next row). NOTE(review): accumulator, row loop and pointer advances are
 * missing from this extraction. */
1704 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1707 uint8_t *pix3 = pix2 + line_size;
1711 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1712 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1713 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1714 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1715 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1716 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1717 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1718 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD against center (xy) half-pel interpolation of pix2.
 * NOTE(review): accumulator, row loop and pointer advances are missing from
 * this extraction. */
1726 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1729 uint8_t *pix3 = pix2 + line_size;
1733 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1734 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1735 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1736 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1737 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1738 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1739 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1740 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16-wide: score1 is plain SSE; score2 is the
 * difference between the 2x2 gradient energy of s1 and s2, weighted by
 * nsse_weight (or 8 when no context is given), so added noise that matches
 * the source texture is penalized less. NOTE(review): score1/score2
 * declarations and loop headers are partially missing from this extraction. */
1748 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1749 MpegEncContext *c = v;
1755 for(x=0; x<16; x++){
1756 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1759 for(x=0; x<15; x++){
1760 score2+= FFABS( s1[x ] - s1[x +stride]
1761 - s1[x+1] + s1[x+1+stride])
1762 -FFABS( s2[x ] - s2[x +stride]
1763 - s2[x+1] + s2[x+1+stride]);
1770 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1771 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c — same SSE + gradient-difference weighting.
 * NOTE(review): score declarations and loop headers are missing from this
 * extraction. */
1774 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1775 MpegEncContext *c = v;
1782 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1786 score2+= FFABS( s1[x ] - s1[x +stride]
1787 - s1[x+1] + s1[x+1+stride])
1788 -FFABS( s2[x ] - s2[x +stride]
1789 - s2[x+1] + s2[x+1+stride]);
1796 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1797 else return score1 + FFABS(score2)*8;
/* Trellis helper: evaluate the weighted squared error of adding
 * basis*scale (rescaled from BASIS_SHIFT to RECON_SHIFT with rounding) to
 * the residual "rem". NOTE(review): the sum accumulator declaration and the
 * weight read (w) are missing from this extraction. */
1800 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1804 for(i=0; i<8*8; i++){
1805 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1808 assert(-512<b && b<512);
1810 sum += (w*b)*(w*b)>>4;
/* Commit the scaled basis vector into the residual — same rescaling and
 * rounding as try_8x8basis_c, but applied in place. */
1815 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1818 for(i=0; i<8*8; i++){
1819 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1824 * permutes an 8x8 block.
1825 * @param block the block which will be permuted according to the given permutation vector
1826 * @param permutation the permutation vector
1827 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
1828 * @param scantable the used scantable; this is only used to speed the permutation up, the block is not
1829 * (inverse) permuted to scantable order!
/* Apply "permutation" to the coefficients of "block", visiting only the
 * first last+1 positions in scantable order. The visited coefficients are
 * staged (presumably into a temp buffer — its declaration and the
 * copy/clear step are missing from this extraction; confirm upstream) and
 * then written back at their permuted positions. */
1831 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1837 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1839 for(i=0; i<=last; i++){
1840 const int j= scantable[i];
1845 for(i=0; i<=last; i++){
1846 const int j= scantable[i];
1847 const int perm_j= permutation[j];
1848 block[perm_j]= temp[j];
1852 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fill the six-entry cmp[] function array from the DSPContext according to
 * the requested comparison type; unknown types log an error.
 * NOTE(review): the switch/case scaffolding around these assignments is
 * missing from this extraction. */
1856 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1859 memset(cmp, 0, sizeof(void*)*6);
1867 cmp[i]= c->hadamard8_diff[i];
1873 cmp[i]= c->dct_sad[i];
1876 cmp[i]= c->dct264_sad[i];
1879 cmp[i]= c->dct_max[i];
1882 cmp[i]= c->quant_psnr[i];
1911 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Byte-wise dst[i] += src[i], vectorized one machine word at a time using
 * the pb_7f/pb_80 masks to add all lanes without inter-byte carries; the
 * tail is handled byte by byte. NOTE(review): the tail-loop header is
 * missing from this extraction. */
1916 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1918 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1919 long a = *(long*)(src+i);
1920 long b = *(long*)(dst+i);
1921 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1924 dst[i+0] += src[i+0];
/* Byte-wise dst[i] = src1[i] + src2[i], word-at-a-time with the same
 * carryless-add mask trick as add_bytes_c; byte tail follows.
 * NOTE(review): the tail-loop header is missing from this extraction. */
1927 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1929 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1930 long a = *(long*)(src1+i);
1931 long b = *(long*)(src2+i);
1932 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1935 dst[i] = src1[i]+src2[i];
/* Byte-wise dst[i] = src1[i] - src2[i]. On targets without fast unaligned
 * access, a misaligned src2 forces the unrolled scalar path; otherwise the
 * bulk is done word-at-a-time using a borrowless-subtract mask trick, with
 * a byte tail. NOTE(review): the tail-loop header is missing from this
 * extraction. */
1940 #if !HAVE_FAST_UNALIGNED
1941 if((long)src2 & (sizeof(long)-1)){
1942 for(i=0; i+7<w; i+=8){
1943 dst[i+0] = src1[i+0]-src2[i+0];
1944 dst[i+1] = src1[i+1]-src2[i+1];
1945 dst[i+2] = src1[i+2]-src2[i+2];
1946 dst[i+3] = src1[i+3]-src2[i+3];
1947 dst[i+4] = src1[i+4]-src2[i+4];
1948 dst[i+5] = src1[i+5]-src2[i+5];
1949 dst[i+6] = src1[i+6]-src2[i+6];
1950 dst[i+7] = src1[i+7]-src2[i+7];
1954 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1955 long a = *(long*)(src1+i);
1956 long b = *(long*)(src2+i);
1957 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1960 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction, decode side: reconstruct each byte as
 * median(left, above, left+above-above_left) + residual. NOTE(review): the
 * loop, the lt/l initialization from *left_top/*left, the dst store and the
 * write-back of the running state are missing from this extraction. */
1963 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1971 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* HuffYUV median prediction, encode side: emit the residual against the
 * same median predictor used by add_hfyu_median_prediction_c.
 * NOTE(review): loop, state initialization and dst store are missing from
 * this extraction. */
1980 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1988 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* HuffYUV left prediction: running byte accumulator "acc" added across the
 * row; returns the final accumulator. NOTE(review): the loop body and
 * return are missing from this extraction — confirm against upstream. */
1998 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
2001 for(i=0; i<w-1; i++){
2028 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers: BUTTERFLY2 writes sum/difference of two
 * inputs to two outputs, BUTTERFLY1 does it in place, BUTTERFLYA yields
 * |x+y| + |x-y| for the final absolute-sum stage. (Continuation lines of
 * the first two macros are missing from this extraction.) */
2058 #define BUTTERFLY2(o1,o2,i1,i2) \
2062 #define BUTTERFLY1(x,y) \
2071 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of (src - dst), butterflies applied along
 * rows and then columns, summing |a+b|+|a-b| at the last column stage.
 * NOTE(review): the temp[64]/sum/i declarations, loop headers and the
 * return are missing from this extraction. */
2073 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2081 //FIXME try pointer walks
/* stage 1: horizontal butterflies over the pixel differences of each row */
2082 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2083 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2084 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2085 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2087 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2088 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2089 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2090 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2092 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2093 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2094 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2095 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* stage 2: vertical butterflies down each column, final stage accumulates
 * absolute values */
2099 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2100 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2101 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2102 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2104 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2105 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2106 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2107 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2110 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2111 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2112 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2113 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but applied to the
 * raw source pixels, with the DC term (temp[0]+temp[32] at the final stage)
 * subtracted so only AC energy is scored. NOTE(review): declarations, loop
 * headers and the return are missing from this extraction. */
2118 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2126 //FIXME try pointer walks
2127 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2128 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2129 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2130 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2132 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2133 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2134 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2135 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2137 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2138 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2139 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2140 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2144 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2145 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2146 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2147 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2149 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2150 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2151 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2152 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2155 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2156 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2157 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2158 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2161 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-DCT the pixel difference and sum the absolute
 * coefficient values via sum_abs_dctelem. NOTE(review): the fdct call
 * between diff_pixels and the summation is missing from this extraction. */
2166 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2167 MpegEncContext * const s= (MpegEncContext *)c;
2168 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2172 s->dsp.diff_pixels(temp, src1, src2, stride);
2174 return s->dsp.sum_abs_dctelem(temp);
2179 const int s07 = SRC(0) + SRC(7);\
2180 const int s16 = SRC(1) + SRC(6);\
2181 const int s25 = SRC(2) + SRC(5);\
2182 const int s34 = SRC(3) + SRC(4);\
2183 const int a0 = s07 + s34;\
2184 const int a1 = s16 + s25;\
2185 const int a2 = s07 - s34;\
2186 const int a3 = s16 - s25;\
2187 const int d07 = SRC(0) - SRC(7);\
2188 const int d16 = SRC(1) - SRC(6);\
2189 const int d25 = SRC(2) - SRC(5);\
2190 const int d34 = SRC(3) - SRC(4);\
2191 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2192 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2193 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2194 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2196 DST(1, a4 + (a7>>2)) ;\
2197 DST(2, a2 + (a3>>1)) ;\
2198 DST(3, a5 + (a6>>2)) ;\
2200 DST(5, a6 - (a5>>2)) ;\
2201 DST(6, (a2>>1) - a3 ) ;\
2202 DST(7, (a4>>2) - a7 ) ;\
/* H.264-style DCT SAD: apply the integer DCT8_1D first along rows (SRC/DST
 * index dct[i][x]) and then along columns, where the column-pass DST macro
 * accumulates absolute values into sum instead of storing. NOTE(review):
 * the dct buffer declaration, DCT8_1D invocations, #undef lines and return
 * are missing from this extraction. */
2205 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2206 MpegEncContext * const s= (MpegEncContext *)c;
2211 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2213 #define SRC(x) dct[i][x]
2214 #define DST(x,v) dct[i][x]= v
2215 for( i = 0; i < 8; i++ )
2220 #define SRC(x) dct[x][i]
2221 #define DST(x,v) sum += FFABS(v)
2222 for( i = 0; i < 8; i++ )
/* DCT "max" metric: forward-DCT the difference and return the largest
 * absolute coefficient. NOTE(review): the fdct call, loop header and return
 * are missing from this extraction. */
2230 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2231 MpegEncContext * const s= (MpegEncContext *)c;
2232 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2237 s->dsp.diff_pixels(temp, src1, src2, stride);
2241 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-PSNR metric: DCT the difference, keep an unquantized copy in
 * bak, quantize + dequantize + IDCT the other copy, and return the squared
 * error between the round-tripped and original coefficients.
 * NOTE(review): the fdct call, loop header and return are missing from this
 * extraction. */
2246 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2247 MpegEncContext * const s= (MpegEncContext *)c;
2248 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2249 DCTELEM * const bak = temp+64;
2255 s->dsp.diff_pixels(temp, src1, src2, stride);
2257 memcpy(bak, temp, 64*sizeof(DCTELEM));
2259 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2260 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2261 ff_simple_idct(temp); //FIXME
2264 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric: quantize the DCT of the difference, estimate the
 * bit cost of the resulting coefficients via the AC VLC length tables
 * (escape cost for out-of-range levels), then dequantize + IDCT back onto a
 * local copy and measure SSE distortion. Returns distortion plus a
 * lambda-weighted rate term. NOTE(review): the fdct call, run/level
 * extraction, intra/inter branch scaffolding and several braces are missing
 * from this extraction. */
2269 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2270 MpegEncContext * const s= (MpegEncContext *)c;
2271 const uint8_t *scantable= s->intra_scantable.permutated;
2272 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2273 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2274 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2275 int i, last, run, bits, level, distortion, start_i;
2276 const int esc_length= s->ac_esc_length;
2278 uint8_t * last_length;
2282 copy_block8(lsrc1, src1, 8, stride, 8);
2283 copy_block8(lsrc2, src2, 8, stride, 8);
2285 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2287 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra: DC coded separately with the luma DC VLC table */
2293 length = s->intra_ac_vlc_length;
2294 last_length= s->intra_ac_vlc_last_length;
2295 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2298 length = s->inter_ac_vlc_length;
2299 last_length= s->inter_ac_vlc_last_length;
/* walk coefficients in scan order, accumulating VLC bit lengths */
2304 for(i=start_i; i<last; i++){
2305 int j= scantable[i];
2310 if((level&(~127)) == 0){
2311 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* last coefficient uses the "last" VLC table */
2320 level= temp[i] + 64;
2324 if((level&(~127)) == 0){
2325 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2333 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2335 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2338 s->dsp.idct_add(lsrc2, 8, temp);
2340 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2342 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-cost compare metric: estimated number of bits needed to code the
 * quantized 8x8 difference block, using the encoder's VLC length tables.
 * Same scan / run-level walk as rd8x8_c but without the distortion term.
 * NOTE(review): start_i selection, run/level bookkeeping, escape
 * handling and the return are elided from this view. */
2345 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2346 MpegEncContext * const s= (MpegEncContext *)c;
2347 const uint8_t *scantable= s->intra_scantable.permutated;
2348 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2349 int i, last, run, bits, level, start_i;
2350 const int esc_length= s->ac_esc_length;
2352 uint8_t * last_length;
2356 s->dsp.diff_pixels(temp, src1, src2, stride);
2358 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    /* intra path: DC costed separately with the luma DC table */
2364 length = s->intra_ac_vlc_length;
2365 last_length= s->intra_ac_vlc_last_length;
2366 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    /* inter path */
2369 length = s->inter_ac_vlc_length;
2370 last_length= s->inter_ac_vlc_last_length;
    /* sum VLC lengths along the scan order */
2375 for(i=start_i; i<last; i++){
2376 int j= scantable[i];
2381 if((level&(~127)) == 0){
2382 bits+= length[UNI_AC_ENC_INDEX(run, level)];
    /* last nonzero coefficient uses the "last" tables */
2391 level= temp[i] + 64;
2395 if((level&(~127)) == 0){
2396 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical SAD of a block against itself shifted one row down:
 * sum of |s[x] - s[x+stride]|, a measure of vertical activity used for
 * interlace decisions.  Unrolled four columns per iteration.
 * NOTE(review): score initialization, loop tail and return are elided
 * from this view. */
2404 #define VSAD_INTRA(size) \
2405 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2409     for(y=1; y<h; y++){ \
2410         for(x=0; x<size; x+=4){ \
2411             score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2412                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* Vertical SAD between two blocks: sums the absolute vertical gradient
 * of the difference signal s1-s2.
 * NOTE(review): score initialization, outer loop, pointer advance and
 * return are elided from this view. */
2422 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2427 for(x=0; x<16; x++){
2428 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2437 #define SQ(a) ((a)*(a))
/* Squared-difference variants of the vsad metrics above: identical
 * traversal but accumulating squared vertical gradients.
 * NOTE(review): loop tails and returns are elided from this view. */
2438 #define VSSE_INTRA(size) \
2439 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2443     for(y=1; y<h; y++){ \
2444         for(x=0; x<size; x+=4){ \
2445             score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2446                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* squared vertical gradient of the difference between two blocks */
2456 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2461 for(x=0; x<16; x++){
2462 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 vector and an int16 vector.
 * NOTE(review): the accumulator declaration and return are elided from
 * this view. */
2471 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2475 for(i=0; i<size; i++)
2476 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Instantiate 16x16 wrappers that apply each 8x8 metric to the four
 * quadrants of a 16x16 block and sum the results. */
2480 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2481 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2482 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2484 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2486 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2487 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2488 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2489 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/**
 * Element-wise product of two float vectors: dst[i] = src0[i] * src1[i].
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    const float *end = src0 + len;
    while (src0 < end)
        *dst++ = *src0++ * *src1++;
}
/**
 * Multiply src0 element-wise by src1 read back-to-front:
 * dst[i] = src0[i] * src1[len-1-i].
 * (Equivalent to the original, which advanced src1 to its last element
 * and indexed it with negative offsets.)
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - 1 - i];
}
/**
 * Fused multiply-add over float vectors: dst[i] = src0[i]*src1[i] + src2[i].
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k;
    for (k = 0; k < len; ++k) {
        float prod = src0[k] * src1[k];
        dst[k] = prod + src2[k];
    }
}
/* Overlap-add windowing as used by MDCT-based audio codecs: multiplies
 * src0/src1 by the window and writes a mirrored sum/difference pair to
 * dst[i] (i < 0) and dst[j] (j >= 0).
 * NOTE(review): the pointer re-basing (dst/win/src0 += len) and the
 * loads of s0/s1/wi/wj are elided from this view; the negative indices
 * below rely on that re-basing — confirm against the full source. */
2510 static void vector_fmul_window_c(float *dst, const float *src0,
2511 const float *src1, const float *win, int len)
2517 for(i=-len, j=len-1; i<0; i++, j--) {
2522 dst[i] = s0*wj - s1*wi;
2523 dst[j] = s0*wi + s1*wj;
/**
 * Scale a float vector by a constant: dst[i] = src[i] * mul.
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    const float *end = src + len;
    while (src < end)
        *dst++ = *src++ * mul;
}
/**
 * Multiply src by a sequence of 2-element vectors (one per pair of
 * outputs) and a common scalar: dst[i+k] = src[i+k] * sv[i/2][k] * mul.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i, k;
    for (i = 0; i < len; i += 2) {
        const float *pair = *sv++;
        for (k = 0; k < 2; k++)
            dst[i + k] = src[i + k] * pair[k] * mul;
    }
}
/**
 * Multiply src by a sequence of 4-element vectors (one per group of four
 * outputs) and a common scalar: dst[i+k] = src[i+k] * sv[i/4][k] * mul.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i, k;
    for (i = 0; i < len; i += 4) {
        const float *quad = *sv++;
        for (k = 0; k < 4; k++)
            dst[i + k] = src[i + k] * quad[k] * mul;
    }
}
/**
 * Expand a sequence of 2-element vectors into dst, scaled by mul:
 * dst[i+k] = sv[i/2][k] * mul.
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2) {
        const float *v = *sv++;
        dst[i    ] = v[0] * mul;
        dst[i + 1] = v[1] * mul;
    }
}
/**
 * Expand a sequence of 4-element vectors into dst, scaled by mul:
 * dst[i+k] = sv[i/4][k] * mul.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i, k;
    for (i = 0; i < len; i += 4) {
        const float *v = *sv++;
        for (k = 0; k < 4; k++)
            dst[i + k] = v[k] * mul;
    }
}
/* In-place butterfly over two float vectors; computes the per-element
 * difference t = v1[i] - v2[i].
 * NOTE(review): the lines that store back into v1[i] and v2[i] after
 * computing t are elided from this view — confirm the exact outputs
 * against the full source. */
2579 static void butterflies_float_c(float *restrict v1, float *restrict v2,
2583 for (i = 0; i < len; i++) {
2584 float t = v1[i] - v2[i];
/* Dot product of two float vectors.
 * NOTE(review): the accumulator declaration, the per-element
 * accumulation and the return are elided from this view. */
2590 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2595 for (i = 0; i < len; i++)
/* Clip one float, handled as raw IEEE-754 bits, for the mixed-sign
 * (min < 0 < max) case: an unsigned compare against mini catches values
 * below min (negative floats sort above mini as unsigned ints), and the
 * sign-flipped compare against maxisign catches values above max.
 * NOTE(review): the fall-through "return a" path is elided from this
 * view. */
2601 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2602 uint32_t maxi, uint32_t maxisign)
2605 if(a > mini) return mini;
2606 else if((a^(1U<<31)) > maxisign) return maxi;
/**
 * Clip len floats to [*min, *max] via integer bit-pattern comparisons;
 * only valid when the range straddles zero (see clipf_c_one).
 * len is processed eight elements per outer iteration.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, j;
    uint32_t mini     = *(uint32_t*)min;
    uint32_t maxi     = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U << 31);
    uint32_t *dsti       = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;

    for (i = 0; i < len; i += 8)
        for (j = 0; j < 8; j++)
            dsti[i + j] = clipf_c_one(srci[i + j], mini, maxi, maxisign);
}
/**
 * Clip each float in src to [min, max], writing to dst; len is handled
 * eight elements at a time.  A mixed-sign range (min < 0 < max) is
 * dispatched to the integer-compare fast path.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i, j;
    if (min < 0 && max > 0) {
        /* range straddles zero: use the bit-pattern comparison path */
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }
    for (i = 0; i < len; i += 8)
        for (j = 0; j < 8; j++)
            dst[i + j] = av_clipf(src[i + j], min, max);
}
/* Dot product of two int16 vectors, each product arithmetic-shifted
 * right by `shift` before accumulation.
 * NOTE(review): the accumulator declaration, loop header and return are
 * elided from this view. */
2646 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
2651 res += (*v1++ * *v2++) >> shift;
/* Combined dot product and multiply-accumulate: updates v1 in place
 * with v1[i] += mul * v3[i] while (per the name and siblings) also
 * accumulating the v1.v2 dot product.
 * NOTE(review): the accumulator, loop header, the dot-product
 * accumulation line and the return are elided from this view. */
2656 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2661 *v1++ += mul * *v3++;
/**
 * Apply a symmetric int16 window with rounded Q15 scaling: window holds
 * the first half of the window; window[i] is applied to both input[i]
 * and its mirror input[len-1-i].
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    const int half = len >> 1;

    for (i = 0; i < half; i++) {
        int16_t w = window[i];
        /* rounded Q15 multiply, applied symmetrically from both ends */
        output[i]         = (MUL16(input[i],         w) + (1 << 14)) >> 15;
        output[len - i - 1] = (MUL16(input[len - i - 1], w) + (1 << 14)) >> 15;
    }
}
/* Clip len int32 values to [min, max], unrolled eight per iteration.
 * NOTE(review): the loop construct wrapping the unrolled body (and the
 * len bookkeeping) is elided from this view; len is presumably a
 * multiple of 8 — confirm against the callers. */
2679 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2680 int32_t max, unsigned int len)
2683 *dst++ = av_clip(*src++, min, max);
2684 *dst++ = av_clip(*src++, min, max);
2685 *dst++ = av_clip(*src++, min, max);
2686 *dst++ = av_clip(*src++, min, max);
2687 *dst++ = av_clip(*src++, min, max);
2688 *dst++ = av_clip(*src++, min, max);
2689 *dst++ = av_clip(*src++, min, max);
2690 *dst++ = av_clip(*src++, min, max);
/* Fixed-point IDCT weights: round(2048*sqrt(2)*cos(k*pi/16)) for
 * k = 1..7, used by the WMV2 IDCT below. */
2696 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2697 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2698 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2699 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2700 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2701 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2702 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/* One row of the WMV2 8x8 fixed-point IDCT; results are rounded and
 * descaled by 8 bits.
 * NOTE(review): the s1/s2 declarations and surrounding braces are
 * elided from this view, and W0 is defined outside it — confirm
 * against the full source. */
2704 static void wmv2_idct_row(short * b)
2707 int a0,a1,a2,a3,a4,a5,a6,a7;
    /* butterflies over the odd (a1,a3,a5,a7) and even (a0,a2,a4,a6)
     * input coefficients */
2709 a1 = W1*b[1]+W7*b[7];
2710 a7 = W7*b[1]-W1*b[7];
2711 a5 = W5*b[5]+W3*b[3];
2712 a3 = W3*b[5]-W5*b[3];
2713 a2 = W2*b[2]+W6*b[6];
2714 a6 = W6*b[2]-W2*b[6];
2715 a0 = W0*b[0]+W0*b[4];
2716 a4 = W0*b[0]-W0*b[4];
    /* 181/256 ~= sqrt(1/2): rotation shared by outputs 1/2 and 5/6 */
2718 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2719 s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /* final butterfly with rounding (+1<<7) and 8-bit descale */
2721 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2722 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2723 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2724 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2725 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2726 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2727 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2728 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One column (stride 8) of the WMV2 8x8 fixed-point IDCT; the first
 * stage keeps 3 extra bits of precision and the final stage descales by
 * 14 bits with rounding.
 * NOTE(review): the s1/s2 declarations and surrounding braces are
 * elided from this view, and W0 is defined outside it. */
2730 static void wmv2_idct_col(short * b)
2733 int a0,a1,a2,a3,a4,a5,a6,a7;
2734 /*step 1, with extended precision*/
2735 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2736 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2737 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2738 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2739 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2740 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2741 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2742 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
    /* 181/256 ~= sqrt(1/2) rotation, as in the row pass */
2744 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2745 s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /* final butterfly with rounding (+1<<13) and 14-bit descale */
2747 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2748 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2749 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2750 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2752 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2753 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2754 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2755 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 2-D WMV2 IDCT: a row pass followed by a column pass over the
 * 8x8 coefficient block.
 * NOTE(review): the loop headers driving `i` are elided from this
 * view. */
2757 void ff_wmv2_idct_c(short * block){
2761 wmv2_idct_row(block+i);
2764 wmv2_idct_col(block+i);
2767 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* WMV2 IDCT followed by a clamped store of the result into dest. */
2769 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2771 ff_wmv2_idct_c(block);
2772 ff_put_pixels_clamped_c(block, dest, line_size);
/* WMV2 IDCT followed by a clamped add of the result onto dest. */
2774 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2776 ff_wmv2_idct_c(block);
2777 ff_add_pixels_clamped_c(block, dest, line_size);
/* Wrappers pairing the reference JPEG IDCT family with clamped put/add
 * stores; the 4x4 and 2x2 variants serve the lowres decode paths
 * selected in dsputil_init().
 * NOTE(review): the j_rev_dct*(block) calls and braces are elided from
 * this view — confirm against the full source. */
2779 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2782 ff_put_pixels_clamped_c(block, dest, line_size);
2784 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2787 ff_add_pixels_clamped_c(block, dest, line_size);
/* 4x4 variants (lowres == 1) */
2790 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2793 put_pixels_clamped4_c(block, dest, line_size);
2795 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2798 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 variants (lowres == 2) */
2801 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2804 put_pixels_clamped2_c(block, dest, line_size);
2806 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2809 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 "IDCT" for lowres == 3: only the DC coefficient survives; it is
 * rounded, descaled by 3 bits and clamped through the crop table. */
2812 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2814 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2816 dest[0] = cm[(block[0] + 4)>>3];
2818 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2820 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* add the descaled DC on top of the existing pixel, clamped */
2822 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2825 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2827 /* init static data */
/* Fill the shared lookup tables: ff_cropTbl (identity 0..255 with
 * MAX_NEG_CROP saturation margins), ff_squareTbl ((i-256)^2 for SSE
 * metrics) and the inverse zigzag permutation (stored 1-based).
 * NOTE(review): parts of the loop bodies — notably the low-side crop
 * fill — and braces are elided from this view. */
2828 av_cold void dsputil_static_init(void)
2832 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2833 for(i=0;i<MAX_NEG_CROP;i++) {
    /* values above 255 saturate to 255 */
2835 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2838 for(i=0;i<512;i++) {
2839 ff_squareTbl[i] = (i - 256) * (i - 256);
2842 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Verify that the compiler honours 16-byte stack alignment, which the
 * SIMD code depends on; logs a prominent error on MMX/AltiVec builds
 * when misaligned.
 * NOTE(review): the did_fail latch usage, the return value and the
 * closing of the function are elided from this view. */
2845 int ff_check_alignment(void){
2846 static int did_fail=0;
2847 DECLARE_ALIGNED(16, int, aligned);
    /* low four address bits set => the 16-byte alignment request failed */
2849 if((intptr_t)&aligned & 15){
2851 #if HAVE_MMX || HAVE_ALTIVEC
2852 av_log(NULL, AV_LOG_ERROR,
2853 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2854 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2855 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2856 "Do not report crashes to Libav developers.\n");
2865 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2869 ff_check_alignment();
2872 if(avctx->dct_algo==FF_DCT_FASTINT) {
2873 c->fdct = fdct_ifast;
2874 c->fdct248 = fdct_ifast248;
2876 else if(avctx->dct_algo==FF_DCT_FAAN) {
2877 c->fdct = ff_faandct;
2878 c->fdct248 = ff_faandct248;
2881 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2882 c->fdct248 = ff_fdct248_islow;
2884 #endif //CONFIG_ENCODERS
2886 if(avctx->lowres==1){
2887 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
2888 c->idct_put= ff_jref_idct4_put;
2889 c->idct_add= ff_jref_idct4_add;
2891 if (avctx->codec_id != CODEC_ID_H264) {
2892 c->idct_put= ff_h264_lowres_idct_put_8_c;
2893 c->idct_add= ff_h264_lowres_idct_add_8_c;
2895 switch (avctx->bits_per_raw_sample) {
2897 c->idct_put= ff_h264_lowres_idct_put_9_c;
2898 c->idct_add= ff_h264_lowres_idct_add_9_c;
2901 c->idct_put= ff_h264_lowres_idct_put_10_c;
2902 c->idct_add= ff_h264_lowres_idct_add_10_c;
2905 c->idct_put= ff_h264_lowres_idct_put_8_c;
2906 c->idct_add= ff_h264_lowres_idct_add_8_c;
2910 c->idct = j_rev_dct4;
2911 c->idct_permutation_type= FF_NO_IDCT_PERM;
2912 }else if(avctx->lowres==2){
2913 c->idct_put= ff_jref_idct2_put;
2914 c->idct_add= ff_jref_idct2_add;
2915 c->idct = j_rev_dct2;
2916 c->idct_permutation_type= FF_NO_IDCT_PERM;
2917 }else if(avctx->lowres==3){
2918 c->idct_put= ff_jref_idct1_put;
2919 c->idct_add= ff_jref_idct1_add;
2920 c->idct = j_rev_dct1;
2921 c->idct_permutation_type= FF_NO_IDCT_PERM;
2923 if(avctx->idct_algo==FF_IDCT_INT){
2924 c->idct_put= ff_jref_idct_put;
2925 c->idct_add= ff_jref_idct_add;
2926 c->idct = j_rev_dct;
2927 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2928 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2929 avctx->idct_algo==FF_IDCT_VP3){
2930 c->idct_put= ff_vp3_idct_put_c;
2931 c->idct_add= ff_vp3_idct_add_c;
2932 c->idct = ff_vp3_idct_c;
2933 c->idct_permutation_type= FF_NO_IDCT_PERM;
2934 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2935 c->idct_put= ff_wmv2_idct_put_c;
2936 c->idct_add= ff_wmv2_idct_add_c;
2937 c->idct = ff_wmv2_idct_c;
2938 c->idct_permutation_type= FF_NO_IDCT_PERM;
2939 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2940 c->idct_put= ff_faanidct_put;
2941 c->idct_add= ff_faanidct_add;
2942 c->idct = ff_faanidct;
2943 c->idct_permutation_type= FF_NO_IDCT_PERM;
2944 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2945 c->idct_put= ff_ea_idct_put_c;
2946 c->idct_permutation_type= FF_NO_IDCT_PERM;
2947 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
2948 c->idct = ff_bink_idct_c;
2949 c->idct_add = ff_bink_idct_add_c;
2950 c->idct_put = ff_bink_idct_put_c;
2951 c->idct_permutation_type = FF_NO_IDCT_PERM;
2952 }else{ //accurate/default
2953 c->idct_put= ff_simple_idct_put;
2954 c->idct_add= ff_simple_idct_add;
2955 c->idct = ff_simple_idct;
2956 c->idct_permutation_type= FF_NO_IDCT_PERM;
2960 c->get_pixels = get_pixels_c;
2961 c->diff_pixels = diff_pixels_c;
2962 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2963 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2964 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
2965 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2966 c->sum_abs_dctelem = sum_abs_dctelem_c;
2969 c->pix_sum = pix_sum_c;
2970 c->pix_norm1 = pix_norm1_c;
2972 c->fill_block_tab[0] = fill_block16_c;
2973 c->fill_block_tab[1] = fill_block8_c;
2974 c->scale_block = scale_block_c;
2976 /* TODO [0] 16 [1] 8 */
2977 c->pix_abs[0][0] = pix_abs16_c;
2978 c->pix_abs[0][1] = pix_abs16_x2_c;
2979 c->pix_abs[0][2] = pix_abs16_y2_c;
2980 c->pix_abs[0][3] = pix_abs16_xy2_c;
2981 c->pix_abs[1][0] = pix_abs8_c;
2982 c->pix_abs[1][1] = pix_abs8_x2_c;
2983 c->pix_abs[1][2] = pix_abs8_y2_c;
2984 c->pix_abs[1][3] = pix_abs8_xy2_c;
2986 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2987 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2988 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2989 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2990 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2991 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2992 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2993 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2994 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2996 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2997 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2998 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2999 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3000 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3001 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3002 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3003 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3004 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3006 #define dspfunc(PFX, IDX, NUM) \
3007 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3008 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3009 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3010 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3011 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3012 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3013 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3014 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3015 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3016 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3017 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3018 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3019 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3020 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3021 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3022 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3024 dspfunc(put_qpel, 0, 16);
3025 dspfunc(put_no_rnd_qpel, 0, 16);
3027 dspfunc(avg_qpel, 0, 16);
3028 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3030 dspfunc(put_qpel, 1, 8);
3031 dspfunc(put_no_rnd_qpel, 1, 8);
3033 dspfunc(avg_qpel, 1, 8);
3034 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3038 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
3039 ff_mlp_init(c, avctx);
3041 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
3042 ff_intrax8dsp_init(c,avctx);
3044 #if CONFIG_RV30_DECODER
3045 ff_rv30dsp_init(c,avctx);
3047 #if CONFIG_RV40_DECODER
3048 ff_rv40dsp_init(c,avctx);
3049 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
3050 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
3051 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
3052 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
3055 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
3056 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3057 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3058 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3059 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3060 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3061 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3062 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3064 #define SET_CMP_FUNC(name) \
3065 c->name[0]= name ## 16_c;\
3066 c->name[1]= name ## 8x8_c;
3068 SET_CMP_FUNC(hadamard8_diff)
3069 c->hadamard8_diff[4]= hadamard8_intra16_c;
3070 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3071 SET_CMP_FUNC(dct_sad)
3072 SET_CMP_FUNC(dct_max)
3074 SET_CMP_FUNC(dct264_sad)
3076 c->sad[0]= pix_abs16_c;
3077 c->sad[1]= pix_abs8_c;
3081 SET_CMP_FUNC(quant_psnr)
3084 c->vsad[0]= vsad16_c;
3085 c->vsad[4]= vsad_intra16_c;
3086 c->vsad[5]= vsad_intra8_c;
3087 c->vsse[0]= vsse16_c;
3088 c->vsse[4]= vsse_intra16_c;
3089 c->vsse[5]= vsse_intra8_c;
3090 c->nsse[0]= nsse16_c;
3091 c->nsse[1]= nsse8_c;
3093 ff_dsputil_init_dwt(c);
3096 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3098 c->add_bytes= add_bytes_c;
3099 c->add_bytes_l2= add_bytes_l2_c;
3100 c->diff_bytes= diff_bytes_c;
3101 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3102 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3103 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
3104 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3105 c->bswap_buf= bswap_buf;
3106 c->bswap16_buf = bswap16_buf;
3107 #if CONFIG_PNG_DECODER
3108 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
3111 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3112 c->h263_h_loop_filter= h263_h_loop_filter_c;
3113 c->h263_v_loop_filter= h263_v_loop_filter_c;
3116 if (CONFIG_VP3_DECODER) {
3117 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3118 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3119 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3122 c->h261_loop_filter= h261_loop_filter_c;
3124 c->try_8x8basis= try_8x8basis_c;
3125 c->add_8x8basis= add_8x8basis_c;
3127 #if CONFIG_VORBIS_DECODER
3128 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3130 #if CONFIG_AC3_DECODER
3131 c->ac3_downmix = ff_ac3_downmix_c;
3133 c->vector_fmul = vector_fmul_c;
3134 c->vector_fmul_reverse = vector_fmul_reverse_c;
3135 c->vector_fmul_add = vector_fmul_add_c;
3136 c->vector_fmul_window = vector_fmul_window_c;
3137 c->vector_clipf = vector_clipf_c;
3138 c->scalarproduct_int16 = scalarproduct_int16_c;
3139 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3140 c->apply_window_int16 = apply_window_int16_c;
3141 c->vector_clip_int32 = vector_clip_int32_c;
3142 c->scalarproduct_float = scalarproduct_float_c;
3143 c->butterflies_float = butterflies_float_c;
3144 c->vector_fmul_scalar = vector_fmul_scalar_c;
3146 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
3147 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
3149 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
3150 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
3152 c->shrink[0]= av_image_copy_plane;
3153 c->shrink[1]= ff_shrink22;
3154 c->shrink[2]= ff_shrink44;
3155 c->shrink[3]= ff_shrink88;
3157 c->prefetch= just_return;
3159 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3160 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3164 #define FUNC(f, depth) f ## _ ## depth
3165 #define FUNCC(f, depth) f ## _ ## depth ## _c
3167 #define dspfunc1(PFX, IDX, NUM, depth)\
3168 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
3169 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3170 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3171 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3173 #define dspfunc2(PFX, IDX, NUM, depth)\
3174 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3175 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3176 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3177 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3178 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3179 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3180 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3181 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3182 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3183 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3184 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3185 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3186 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3187 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3188 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3189 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3192 #define BIT_DEPTH_FUNCS(depth)\
3193 c->draw_edges = FUNCC(draw_edges , depth);\
3194 c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
3195 c->clear_block = FUNCC(clear_block , depth);\
3196 c->clear_blocks = FUNCC(clear_blocks , depth);\
3197 c->add_pixels8 = FUNCC(add_pixels8 , depth);\
3198 c->add_pixels4 = FUNCC(add_pixels4 , depth);\
3199 c->put_no_rnd_pixels_l2[0] = FUNCC(put_no_rnd_pixels16_l2, depth);\
3200 c->put_no_rnd_pixels_l2[1] = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3202 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3203 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3204 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3205 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3206 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3207 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3209 dspfunc1(put , 0, 16, depth);\
3210 dspfunc1(put , 1, 8, depth);\
3211 dspfunc1(put , 2, 4, depth);\
3212 dspfunc1(put , 3, 2, depth);\
3213 dspfunc1(put_no_rnd, 0, 16, depth);\
3214 dspfunc1(put_no_rnd, 1, 8, depth);\
3215 dspfunc1(avg , 0, 16, depth);\
3216 dspfunc1(avg , 1, 8, depth);\
3217 dspfunc1(avg , 2, 4, depth);\
3218 dspfunc1(avg , 3, 2, depth);\
3219 dspfunc1(avg_no_rnd, 0, 16, depth);\
3220 dspfunc1(avg_no_rnd, 1, 8, depth);\
3222 dspfunc2(put_h264_qpel, 0, 16, depth);\
3223 dspfunc2(put_h264_qpel, 1, 8, depth);\
3224 dspfunc2(put_h264_qpel, 2, 4, depth);\
3225 dspfunc2(put_h264_qpel, 3, 2, depth);\
3226 dspfunc2(avg_h264_qpel, 0, 16, depth);\
3227 dspfunc2(avg_h264_qpel, 1, 8, depth);\
3228 dspfunc2(avg_h264_qpel, 2, 4, depth);
3230 if (avctx->codec_id != CODEC_ID_H264 || avctx->bits_per_raw_sample == 8) {
3233 switch (avctx->bits_per_raw_sample) {
3241 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3248 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
3249 if (ARCH_ARM) dsputil_init_arm (c, avctx);
3250 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
3251 if (HAVE_VIS) dsputil_init_vis (c, avctx);
3252 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
3253 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
3254 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
3255 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
3256 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
3258 for(i=0; i<64; i++){
3259 if(!c->put_2tap_qpel_pixels_tab[0][i])
3260 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3261 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3262 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3265 c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3266 c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3267 c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3268 c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3270 c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3271 c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3272 c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3273 c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3275 switch(c->idct_permutation_type){
3276 case FF_NO_IDCT_PERM:
3278 c->idct_permutation[i]= i;
3280 case FF_LIBMPEG2_IDCT_PERM:
3282 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3284 case FF_SIMPLE_IDCT_PERM:
3286 c->idct_permutation[i]= simple_mmx_permutation[i];
3288 case FF_TRANSPOSE_IDCT_PERM:
3290 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3292 case FF_PARTTRANS_IDCT_PERM:
3294 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3296 case FF_SSE2_IDCT_PERM:
3298 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3301 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");