3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Pixel clipping LUT (used below as cm = ff_cropTbl + MAX_NEG_CROP, so it
 * can be indexed with out-of-range signed values) and a squares LUT (used
 * as sq = ff_squareTbl + 256, centered so signed differences index it).
 * Presumably both are populated by an init routine elsewhere -- confirm. */
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44 uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_internal.h"
51 #include "dsputil_internal.h"
55 #include "dsputil_internal.h"
57 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
58 #define pb_7f (~0UL/255 * 0x7f)
/* same per-byte trick for the high bit: 0x80 replicated into every byte */
59 #define pb_80 (~0UL/255 * 0x80)
/* Classic 8x8 zig-zag scan: entry i is the raster-order position of the
 * i-th coefficient in scan order. */
61 const uint8_t ff_zigzag_direct[64] = {
62 0, 1, 8, 16, 9, 2, 3, 10,
63 17, 24, 32, 25, 18, 11, 4, 5,
64 12, 19, 26, 33, 40, 48, 41, 34,
65 27, 20, 13, 6, 7, 14, 21, 28,
66 35, 42, 49, 56, 57, 50, 43, 36,
67 29, 22, 15, 23, 30, 37, 44, 51,
68 58, 59, 52, 45, 38, 31, 39, 46,
69 53, 60, 61, 54, 47, 55, 62, 63
72 /* Specific zigzag scan for 248 idct. NOTE that unlike the
73 specification, we interleave the fields */
/* (scan order matched to the 2x(4x8) field-split "248" IDCT) */
74 const uint8_t ff_zigzag248_direct[64] = {
75 0, 8, 1, 9, 16, 24, 2, 10,
76 17, 25, 32, 40, 48, 56, 33, 41,
77 18, 26, 3, 11, 4, 12, 19, 27,
78 34, 42, 49, 57, 50, 58, 35, 43,
79 20, 28, 5, 13, 6, 14, 21, 29,
80 36, 44, 51, 59, 52, 60, 37, 45,
81 22, 30, 7, 15, 23, 31, 38, 46,
82 53, 61, 54, 62, 39, 47, 55, 63,
85 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* 16-byte aligned; presumably filled at init time -- confirm against the
 * dsputil/MMX init code. */
86 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (raster index per scan position). */
88 const uint8_t ff_alternate_horizontal_scan[64] = {
89 0, 1, 2, 3, 8, 9, 16, 17,
90 10, 11, 4, 5, 6, 7, 15, 14,
91 13, 12, 19, 18, 24, 25, 32, 33,
92 26, 27, 20, 21, 22, 23, 28, 29,
93 30, 31, 34, 35, 40, 41, 48, 49,
94 42, 43, 36, 37, 38, 39, 44, 45,
95 46, 47, 50, 51, 56, 57, 58, 59,
96 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order (raster index per scan position). */
99 const uint8_t ff_alternate_vertical_scan[64] = {
100 0, 8, 16, 24, 1, 9, 2, 10,
101 17, 25, 32, 40, 48, 56, 57, 49,
102 41, 33, 26, 18, 3, 11, 4, 12,
103 19, 27, 34, 42, 50, 58, 35, 43,
104 51, 59, 20, 28, 5, 13, 6, 14,
105 21, 29, 36, 44, 52, 60, 37, 45,
106 53, 61, 22, 30, 7, 15, 23, 31,
107 38, 46, 54, 62, 39, 47, 55, 63,
110 /* Input permutation for the simple_idct_mmx */
/* 64 destination indices (0..0x3F) describing where each raster-order
 * coefficient lands after the reordering. */
111 static const uint8_t simple_mmx_permutation[64]={
112 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
113 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
114 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
115 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
116 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
117 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
118 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
119 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Row permutation for the SSE2 IDCT: interleaves rows 0-3 with rows 4-7. */
122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Initialize a ScanTable from src_scantable: permutated[] holds the scan
 * order with the IDCT input permutation[] applied; raster_end[] is then
 * derived from the permuted entries. NOTE(review): several lines of this
 * function are not visible in this fragment. */
124 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
128 st->scantable= src_scantable;
/* apply the idct input permutation to each scan position */
132 j = src_scantable[i];
133 st->permutated[i] = permutation[j];
/* build raster_end[] from the permuted indices */
142 j = st->permutated[i];
144 st->raster_end[i]= end;
/* Sum of all pixels of a 16x16 block; line_size is the stride in bytes.
 * NOTE(review): the unrolled accumulation of the 8-pixel groups is not
 * visible in this fragment. */
148 static int pix_sum_c(uint8_t * pix, int line_size)
153 for (i = 0; i < 16; i++) {
/* two 8-pixel groups per 16-pixel row */
154 for (j = 0; j < 16; j += 8) {
/* advance to the next row (16 pixels already consumed) */
165 pix += line_size - 16;
/* Sum of squared pixel values of a 16x16 block, via the squares LUT
 * (sq is centered in ff_squareTbl). */
170 static int pix_norm1_c(uint8_t * pix, int line_size)
173 uint32_t *sq = ff_squareTbl + 256;
176 for (i = 0; i < 16; i++) {
177 for (j = 0; j < 16; j += 8) {
/* targets whose long is wider than 32 bits: 8 pixels per 64-bit load */
188 #if LONG_MAX > 2147483647
/* NOTE(review): type-punned load; assumes pix is suitably aligned and
 * relies on lax aliasing -- confirm against build flags. */
189 register uint64_t x=*(uint64_t*)pix;
191 s += sq[(x>>8)&0xff];
192 s += sq[(x>>16)&0xff];
193 s += sq[(x>>24)&0xff];
194 s += sq[(x>>32)&0xff];
195 s += sq[(x>>40)&0xff];
196 s += sq[(x>>48)&0xff];
197 s += sq[(x>>56)&0xff];
/* 32-bit path: two 4-byte loads cover the 8 pixels */
199 register uint32_t x=*(uint32_t*)pix;
201 s += sq[(x>>8)&0xff];
202 s += sq[(x>>16)&0xff];
203 s += sq[(x>>24)&0xff];
204 x=*(uint32_t*)(pix+4);
206 s += sq[(x>>8)&0xff];
207 s += sq[(x>>16)&0xff];
208 s += sq[(x>>24)&0xff];
/* advance to the next row */
213 pix += line_size - 16;
/* Byte-swap w 32-bit words from src to dst; main loop unrolled by 8. */
218 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
221 for(i=0; i+8<=w; i+=8){
222 dst[i+0]= av_bswap32(src[i+0]);
223 dst[i+1]= av_bswap32(src[i+1]);
224 dst[i+2]= av_bswap32(src[i+2]);
225 dst[i+3]= av_bswap32(src[i+3]);
226 dst[i+4]= av_bswap32(src[i+4]);
227 dst[i+5]= av_bswap32(src[i+5]);
228 dst[i+6]= av_bswap32(src[i+6]);
229 dst[i+7]= av_bswap32(src[i+7]);
/* tail: remaining (w & 7) words, one at a time (loop header not visible
 * in this fragment) */
232 dst[i+0]= av_bswap32(src[i+0]);
/* Byte-swap len 16-bit values from src to dst (the loop header is not
 * visible in this fragment). */
236 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
239 *dst++ = av_bswap16(*src++);
/* Sum of squared differences of a 4-pixel-wide block over h rows.
 * sq points at the center of ff_squareTbl so negative differences index
 * correctly; the void *v context argument is unused here. */
242 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
245 uint32_t *sq = ff_squareTbl + 256;
248 for (i = 0; i < h; i++) {
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
/* Sum of squared differences of an 8-pixel-wide block over h rows
 * (LUT-centered as in sse4_c; v unused). */
259 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
262 uint32_t *sq = ff_squareTbl + 256;
265 for (i = 0; i < h; i++) {
266 s += sq[pix1[0] - pix2[0]];
267 s += sq[pix1[1] - pix2[1]];
268 s += sq[pix1[2] - pix2[2]];
269 s += sq[pix1[3] - pix2[3]];
270 s += sq[pix1[4] - pix2[4]];
271 s += sq[pix1[5] - pix2[5]];
272 s += sq[pix1[6] - pix2[6]];
273 s += sq[pix1[7] - pix2[7]];
/* Sum of squared differences of a 16-pixel-wide block over h rows
 * (LUT-centered as in sse4_c; v unused). */
280 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
283 uint32_t *sq = ff_squareTbl + 256;
286 for (i = 0; i < h; i++) {
287 s += sq[pix1[ 0] - pix2[ 0]];
288 s += sq[pix1[ 1] - pix2[ 1]];
289 s += sq[pix1[ 2] - pix2[ 2]];
290 s += sq[pix1[ 3] - pix2[ 3]];
291 s += sq[pix1[ 4] - pix2[ 4]];
292 s += sq[pix1[ 5] - pix2[ 5]];
293 s += sq[pix1[ 6] - pix2[ 6]];
294 s += sq[pix1[ 7] - pix2[ 7]];
295 s += sq[pix1[ 8] - pix2[ 8]];
296 s += sq[pix1[ 9] - pix2[ 9]];
297 s += sq[pix1[10] - pix2[10]];
298 s += sq[pix1[11] - pix2[11]];
299 s += sq[pix1[12] - pix2[12]];
300 s += sq[pix1[13] - pix2[13]];
301 s += sq[pix1[14] - pix2[14]];
302 s += sq[pix1[15] - pix2[15]];
/* Widen an 8x8 block of 8-bit pixels into DCTELEMs (one row shown; the
 * row loop and stride advance are not visible in this fragment). */
310 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
314 /* read the pixels */
316 block[0] = pixels[0];
317 block[1] = pixels[1];
318 block[2] = pixels[2];
319 block[3] = pixels[3];
320 block[4] = pixels[4];
321 block[5] = pixels[5];
322 block[6] = pixels[6];
323 block[7] = pixels[7];
/* Per-pixel difference (s1 - s2) of two 8x8 blocks into DCTELEMs (row
 * loop/stride advance not visible in this fragment). */
329 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
330 const uint8_t *s2, int stride){
333 /* read the pixels */
335 block[0] = s1[0] - s2[0];
336 block[1] = s1[1] - s2[1];
337 block[2] = s1[2] - s2[2];
338 block[3] = s1[3] - s2[3];
339 block[4] = s1[4] - s2[4];
340 block[5] = s1[5] - s2[5];
341 block[6] = s1[6] - s2[6];
342 block[7] = s1[7] - s2[7];
/* Store an 8x8 DCTELEM block as pixels, clamped to 0..255 via the crop
 * LUT (cm is centered so negative coefficients index correctly). */
350 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
354 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
356 /* read the pixels */
358 pixels[0] = cm[block[0]];
359 pixels[1] = cm[block[1]];
360 pixels[2] = cm[block[2]];
361 pixels[3] = cm[block[3]];
362 pixels[4] = cm[block[4]];
363 pixels[5] = cm[block[5]];
364 pixels[6] = cm[block[6]];
365 pixels[7] = cm[block[7]];
/* 4-pixel-wide variant of ff_put_pixels_clamped_c. */
372 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
376 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
378 /* read the pixels */
380 pixels[0] = cm[block[0]];
381 pixels[1] = cm[block[1]];
382 pixels[2] = cm[block[2]];
383 pixels[3] = cm[block[3]];
/* 2-pixel-wide variant of ff_put_pixels_clamped_c. */
390 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
394 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
396 /* read the pixels */
398 pixels[0] = cm[block[0]];
399 pixels[1] = cm[block[1]];
/* Store signed 8x8 coefficients as pixels with a +128 bias; values are
 * clamped (the low/high clamp branches are only partially visible in
 * this fragment). */
406 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
407 uint8_t *restrict pixels,
412 for (i = 0; i < 8; i++) {
413 for (j = 0; j < 8; j++) {
416 else if (*block > 127)
/* in-range: shift from signed to unsigned pixel range */
419 *pixels = (uint8_t)(*block + 128);
/* advance to the next output row */
423 pixels += (line_size - 8);
/* Store an 8x8 DCTELEM block as pixels WITHOUT clamping (plain narrowing
 * assignment; caller guarantees the range). */
427 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
432 /* read the pixels */
434 pixels[0] = block[0];
435 pixels[1] = block[1];
436 pixels[2] = block[2];
437 pixels[3] = block[3];
438 pixels[4] = block[4];
439 pixels[5] = block[5];
440 pixels[6] = block[6];
441 pixels[7] = block[7];
/* Add an 8x8 DCTELEM block to the existing pixels, clamping each result
 * to 0..255 via the crop LUT. */
448 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
452 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
454 /* read the pixels */
456 pixels[0] = cm[pixels[0] + block[0]];
457 pixels[1] = cm[pixels[1] + block[1]];
458 pixels[2] = cm[pixels[2] + block[2]];
459 pixels[3] = cm[pixels[3] + block[3]];
460 pixels[4] = cm[pixels[4] + block[4]];
461 pixels[5] = cm[pixels[5] + block[5]];
462 pixels[6] = cm[pixels[6] + block[6]];
463 pixels[7] = cm[pixels[7] + block[7]];
/* 4-pixel-wide variant of ff_add_pixels_clamped_c. */
469 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
473 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
475 /* read the pixels */
477 pixels[0] = cm[pixels[0] + block[0]];
478 pixels[1] = cm[pixels[1] + block[1]];
479 pixels[2] = cm[pixels[2] + block[2]];
480 pixels[3] = cm[pixels[3] + block[3]];
/* 2-pixel-wide variant of ff_add_pixels_clamped_c. */
486 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
490 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
492 /* read the pixels */
494 pixels[0] = cm[pixels[0] + block[0]];
495 pixels[1] = cm[pixels[1] + block[1]];
/* Sum of absolute values of the block's coefficients (loop header and
 * return are not visible in this fragment). */
501 static int sum_abs_dctelem_c(DCTELEM *block)
505 sum+= FFABS(block[i]);
/* Fill a 16-byte-wide block of h rows with a constant value (per-row
 * stride advance not visible in this fragment). */
509 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
513 for (i = 0; i < h; i++) {
514 memset(block, value, 16);
/* 8-byte-wide variant of fill_block16_c. */
519 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
523 for (i = 0; i < h; i++) {
524 memset(block, value, 8);
/* 2x2 upscale of an 8x8 block: each source byte is replicated into a 2x2
 * square by writing src[i]*0x0101 as one 16-bit word on two output rows.
 * NOTE(review): dst is written through uint16_t* -- assumes 2-byte
 * alignment of dst/linesize; confirm at call sites. */
529 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
532 uint16_t *dst1 = (uint16_t *) dst;
533 uint16_t *dst2 = (uint16_t *)(dst + linesize);
535 for (j = 0; j < 8; j++) {
536 for (i = 0; i < 8; i++) {
537 dst1[i] = dst2[i] = src[i] * 0x0101;
/* Rounded averages of 2 and 4 unsigned pixel values.
 * Fix: arguments are now fully parenthesized so expansions with
 * lower-precedence operators (e.g. avg2(x | 1, y) or a conditional
 * expression as an argument) parse correctly.  Each argument is still
 * evaluated exactly once, so existing call sites are unaffected. */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/* GMC with a single motion vector: per-pixel bilinear blend of the four
 * neighbours with /256 weights A..D built from the 1/16-pel fractional
 * offsets (x16, y16); rounder supplies the rounding bias before the >>8.
 * NOTE(review): the row loop and pointer advances are not visible in
 * this fragment. */
548 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
550 const int A=(16-x16)*(16-y16);
551 const int B=( x16)*(16-y16);
552 const int C=(16-x16)*( y16);
553 const int D=( x16)*( y16);
558 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
559 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
560 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
561 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
562 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
563 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
564 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
565 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* Global motion compensation over an affine vector field (dxx,dxy,dyx,dyy
 * in 1/(1<<shift) units, offset ox/oy, rounding r): for each destination
 * pixel the source coordinate and fractional parts are computed, then the
 * sample is bilinearly interpolated; coordinates outside the picture fall
 * back to edge-clamped taps via av_clip. NOTE(review): the y loop and the
 * coordinate-update lines are not visible in this fragment. */
571 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
572 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
575 const int s= 1<<shift;
585 for(x=0; x<8; x++){ //XXX FIXME optimize
586 int src_x, src_y, frac_x, frac_y, index;
/* fully inside the picture: plain bilinear interpolation */
595 if((unsigned)src_x < width){
596 if((unsigned)src_y < height){
597 index= src_x + src_y*stride;
598 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
599 + src[index +1]* frac_x )*(s-frac_y)
600 + ( src[index+stride ]*(s-frac_x)
601 + src[index+stride+1]* frac_x )* frac_y
/* vertically outside: clamp y, interpolate horizontally only */
604 index= src_x + av_clip(src_y, 0, height)*stride;
605 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
606 + src[index +1]* frac_x )*s
/* horizontally outside: clamp x, interpolate vertically only */
610 if((unsigned)src_y < height){
611 index= av_clip(src_x, 0, width) + src_y*stride;
612 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
613 + src[index+stride ]* frac_y )*s
/* outside in both directions: nearest edge pixel */
616 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
617 dst[y*stride + x]= src[index ];
/* Third-pel MC, no sub-pel offset: plain block copy, dispatched on the
 * block width (switch header/closing not visible in this fragment). */
629 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
631 case 2: put_pixels2_8_c (dst, src, stride, height); break;
632 case 4: put_pixels4_8_c (dst, src, stride, height); break;
633 case 8: put_pixels8_8_c (dst, src, stride, height); break;
634 case 16:put_pixels16_8_c(dst, src, stride, height); break;
/* Third-pel interpolators: mcXY interpolates at horizontal offset X/3 and
 * vertical offset Y/3 of a pel. The constants 683 (~2^11/3) and 2731
 * (~2^15/12) implement the divisions by 3 and 12 via multiply+shift with
 * rounding. NOTE(review): per-row pointer advances and loop closings are
 * not visible in this fragment. */
/* 1/3 pel right */
638 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
640 for (i=0; i < height; i++) {
641 for (j=0; j < width; j++) {
642 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* 2/3 pel right */
649 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
651 for (i=0; i < height; i++) {
652 for (j=0; j < width; j++) {
653 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* 1/3 pel down */
660 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
662 for (i=0; i < height; i++) {
663 for (j=0; j < width; j++) {
664 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* 1/3 right, 1/3 down */
671 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
673 for (i=0; i < height; i++) {
674 for (j=0; j < width; j++) {
675 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* 1/3 right, 2/3 down */
682 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
684 for (i=0; i < height; i++) {
685 for (j=0; j < width; j++) {
686 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* 2/3 pel down */
693 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
695 for (i=0; i < height; i++) {
696 for (j=0; j < width; j++) {
697 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* 2/3 right, 1/3 down */
704 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
706 for (i=0; i < height; i++) {
707 for (j=0; j < width; j++) {
708 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* 2/3 right, 2/3 down */
715 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
717 for (i=0; i < height; i++) {
718 for (j=0; j < width; j++) {
719 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Averaging third-pel MC, no sub-pel offset: average src into dst,
 * dispatched on block width (switch header/closing not visible). */
726 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
728 case 2: avg_pixels2_8_c (dst, src, stride, height); break;
729 case 4: avg_pixels4_8_c (dst, src, stride, height); break;
730 case 8: avg_pixels8_8_c (dst, src, stride, height); break;
731 case 16:avg_pixels16_8_c(dst, src, stride, height); break;
/* Averaging third-pel interpolators: same filters as the put_ variants
 * above, but the interpolated value is averaged with the existing dst
 * pixel (rounding up). NOTE(review): per-row pointer advances and loop
 * closings are not visible in this fragment. */
/* 1/3 pel right, averaged */
735 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
737 for (i=0; i < height; i++) {
738 for (j=0; j < width; j++) {
739 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* 2/3 pel right, averaged */
746 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
748 for (i=0; i < height; i++) {
749 for (j=0; j < width; j++) {
750 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* 1/3 pel down, averaged */
757 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
759 for (i=0; i < height; i++) {
760 for (j=0; j < width; j++) {
761 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* 1/3 right, 1/3 down, averaged */
768 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
770 for (i=0; i < height; i++) {
771 for (j=0; j < width; j++) {
772 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* 1/3 right, 2/3 down, averaged */
779 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
781 for (i=0; i < height; i++) {
782 for (j=0; j < width; j++) {
783 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* 2/3 pel down, averaged */
790 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
792 for (i=0; i < height; i++) {
793 for (j=0; j < width; j++) {
794 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* 2/3 right, 1/3 down, averaged */
801 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
803 for (i=0; i < height; i++) {
804 for (j=0; j < width; j++) {
805 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* 2/3 right, 2/3 down, averaged */
812 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
814 for (i=0; i < height; i++) {
815 for (j=0; j < width; j++) {
816 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* TPEL_WIDTH(width) instantiates fixed-width third-pel wrappers that bind
 * the width argument of the generic put_tpel_pixels_mcXX_c() helpers above.
 * Bug fix: the previous expansion read "void put_tpel_pixels_mcXX_c(...);"
 * -- with the leading "void", that line is an old-style function
 * *declaration* (identifier-list declarator), not a call, so every
 * generated wrapper silently did nothing.  The stray "void" is removed so
 * the wrappers actually forward to the generic helpers. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
844 #define QPEL_MC(r, OPNAME, RND, OP) \
845 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
846 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
850 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
851 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
852 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
853 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
854 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
855 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
856 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
857 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
863 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
865 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
869 const int src0= src[0*srcStride];\
870 const int src1= src[1*srcStride];\
871 const int src2= src[2*srcStride];\
872 const int src3= src[3*srcStride];\
873 const int src4= src[4*srcStride];\
874 const int src5= src[5*srcStride];\
875 const int src6= src[6*srcStride];\
876 const int src7= src[7*srcStride];\
877 const int src8= src[8*srcStride];\
878 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
879 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
880 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
881 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
882 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
883 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
884 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
885 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
891 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
892 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
897 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
898 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
899 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
900 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
901 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
902 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
903 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
904 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
905 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
906 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
907 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
908 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
909 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
910 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
911 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
912 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
918 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
919 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
924 const int src0= src[0*srcStride];\
925 const int src1= src[1*srcStride];\
926 const int src2= src[2*srcStride];\
927 const int src3= src[3*srcStride];\
928 const int src4= src[4*srcStride];\
929 const int src5= src[5*srcStride];\
930 const int src6= src[6*srcStride];\
931 const int src7= src[7*srcStride];\
932 const int src8= src[8*srcStride];\
933 const int src9= src[9*srcStride];\
934 const int src10= src[10*srcStride];\
935 const int src11= src[11*srcStride];\
936 const int src12= src[12*srcStride];\
937 const int src13= src[13*srcStride];\
938 const int src14= src[14*srcStride];\
939 const int src15= src[15*srcStride];\
940 const int src16= src[16*srcStride];\
941 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
942 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
943 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
944 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
945 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
946 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
947 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
948 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
949 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
950 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
951 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
952 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
953 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
954 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
955 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
956 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
962 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
964 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
965 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
968 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
969 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
972 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
974 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
975 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
978 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
981 copy_block9(full, src, 16, stride, 9);\
982 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
983 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
986 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
988 copy_block9(full, src, 16, stride, 9);\
989 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
992 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
995 copy_block9(full, src, 16, stride, 9);\
996 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
997 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
999 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1000 uint8_t full[16*9];\
1003 uint8_t halfHV[64];\
1004 copy_block9(full, src, 16, stride, 9);\
1005 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1006 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1007 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1008 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1010 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1011 uint8_t full[16*9];\
1013 uint8_t halfHV[64];\
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1020 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021 uint8_t full[16*9];\
1024 uint8_t halfHV[64];\
1025 copy_block9(full, src, 16, stride, 9);\
1026 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1028 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1031 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1032 uint8_t full[16*9];\
1034 uint8_t halfHV[64];\
1035 copy_block9(full, src, 16, stride, 9);\
1036 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1037 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1038 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1039 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1041 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1042 uint8_t full[16*9];\
1045 uint8_t halfHV[64];\
1046 copy_block9(full, src, 16, stride, 9);\
1047 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1048 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1049 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1050 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1052 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1053 uint8_t full[16*9];\
1055 uint8_t halfHV[64];\
1056 copy_block9(full, src, 16, stride, 9);\
1057 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1058 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1059 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1060 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1062 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1063 uint8_t full[16*9];\
1066 uint8_t halfHV[64];\
1067 copy_block9(full, src, 16, stride, 9);\
1068 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1069 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1070 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1071 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1073 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1074 uint8_t full[16*9];\
1076 uint8_t halfHV[64];\
1077 copy_block9(full, src, 16, stride, 9);\
1078 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1079 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1080 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1081 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1083 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1085 uint8_t halfHV[64];\
1086 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1087 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1088 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1090 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1092 uint8_t halfHV[64];\
1093 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1094 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1095 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1097 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1098 uint8_t full[16*9];\
1101 uint8_t halfHV[64];\
1102 copy_block9(full, src, 16, stride, 9);\
1103 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1104 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1105 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1106 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1108 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1109 uint8_t full[16*9];\
1111 copy_block9(full, src, 16, stride, 9);\
1112 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1113 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1114 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1116 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1117 uint8_t full[16*9];\
1120 uint8_t halfHV[64];\
1121 copy_block9(full, src, 16, stride, 9);\
1122 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1123 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1124 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1125 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1127 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1128 uint8_t full[16*9];\
1130 copy_block9(full, src, 16, stride, 9);\
1131 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1132 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1133 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1135 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1137 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1138 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1141 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1143 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1144 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1147 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1148 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1151 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1153 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1154 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1157 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1158 uint8_t full[24*17];\
1160 copy_block17(full, src, 24, stride, 17);\
1161 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1162 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1165 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1166 uint8_t full[24*17];\
1167 copy_block17(full, src, 24, stride, 17);\
1168 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1171 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1172 uint8_t full[24*17];\
1174 copy_block17(full, src, 24, stride, 17);\
1175 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1176 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1178 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1179 uint8_t full[24*17];\
1180 uint8_t halfH[272];\
1181 uint8_t halfV[256];\
1182 uint8_t halfHV[256];\
1183 copy_block17(full, src, 24, stride, 17);\
1184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1186 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1189 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1190 uint8_t full[24*17];\
1191 uint8_t halfH[272];\
1192 uint8_t halfHV[256];\
1193 copy_block17(full, src, 24, stride, 17);\
1194 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1196 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1199 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200 uint8_t full[24*17];\
1201 uint8_t halfH[272];\
1202 uint8_t halfV[256];\
1203 uint8_t halfHV[256];\
1204 copy_block17(full, src, 24, stride, 17);\
1205 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1207 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1210 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1211 uint8_t full[24*17];\
1212 uint8_t halfH[272];\
1213 uint8_t halfHV[256];\
1214 copy_block17(full, src, 24, stride, 17);\
1215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1217 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1220 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1221 uint8_t full[24*17];\
1222 uint8_t halfH[272];\
1223 uint8_t halfV[256];\
1224 uint8_t halfHV[256];\
1225 copy_block17(full, src, 24, stride, 17);\
1226 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1227 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1228 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1229 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1231 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1232 uint8_t full[24*17];\
1233 uint8_t halfH[272];\
1234 uint8_t halfHV[256];\
1235 copy_block17(full, src, 24, stride, 17);\
1236 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1237 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1238 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1239 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1241 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1242 uint8_t full[24*17];\
1243 uint8_t halfH[272];\
1244 uint8_t halfV[256];\
1245 uint8_t halfHV[256];\
1246 copy_block17(full, src, 24, stride, 17);\
1247 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1248 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1249 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1250 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1252 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1253 uint8_t full[24*17];\
1254 uint8_t halfH[272];\
1255 uint8_t halfHV[256];\
1256 copy_block17(full, src, 24, stride, 17);\
1257 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1258 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1259 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1260 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1262 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1263 uint8_t halfH[272];\
1264 uint8_t halfHV[256];\
1265 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1266 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1267 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1269 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1270 uint8_t halfH[272];\
1271 uint8_t halfHV[256];\
1272 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1273 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1274 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1276 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1277 uint8_t full[24*17];\
1278 uint8_t halfH[272];\
1279 uint8_t halfV[256];\
1280 uint8_t halfHV[256];\
1281 copy_block17(full, src, 24, stride, 17);\
1282 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1283 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1284 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1285 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1287 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1288 uint8_t full[24*17];\
1289 uint8_t halfH[272];\
1290 copy_block17(full, src, 24, stride, 17);\
1291 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1292 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1293 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1295 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1296 uint8_t full[24*17];\
1297 uint8_t halfH[272];\
1298 uint8_t halfV[256];\
1299 uint8_t halfHV[256];\
1300 copy_block17(full, src, 24, stride, 17);\
1301 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1302 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1303 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1304 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1306 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1307 uint8_t full[24*17];\
1308 uint8_t halfH[272];\
1309 copy_block17(full, src, 24, stride, 17);\
1310 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1311 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1312 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1314 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1315 uint8_t halfH[272];\
1316 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1317 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store/accumulate primitives used to instantiate the QPEL_MC template:
 * 'b' is the raw lowpass accumulator; it is rounded (>>5) and clipped
 * through the crop table cm, then either stored (put) or averaged into
 * the destination (avg).  The _no_rnd variants bias by 15 instead of 16,
 * i.e. they round down. */
1320 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1321 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1322 #define op_put(a, b) a = cm[((b) + 16)>>5]
1323 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the put/avg (rounding and no-rounding) quarter-pel MC
 * function families from the QPEL_MC template defined above. */
1325 QPEL_MC(0, put_ , _ , op_put)
1326 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1327 QPEL_MC(0, avg_ , _ , op_avg)
1328 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1330 #undef op_avg_no_rnd
1332 #undef op_put_no_rnd
/* The mc00 (integer-pel, no filtering) cases are plain pixel copies or
 * averages, so alias them to the generic pixel helpers. */
1334 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1335 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1336 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1337 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1338 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1339 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
/* WMV2 mspel horizontal lowpass: each of the 8 output pixels is the
 * 4-tap (-1, 9, 9, -1) half-pel filter of src, rounded (+8, >>4) and
 * clipped through the crop table cm.
 * NOTE(review): the per-row loop over h and the dst/src stride advances
 * are not visible in this excerpt -- confirm against the full file. */
1341 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1342 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1346 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1347 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1348 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1349 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1350 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1351 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1352 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1353 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1359 #if CONFIG_RV40_DECODER
/* RV40 (3/4, 3/4) quarter-pel positions reduce to the plain xy2
 * half-pel average, so forward to the generic 8-bit pixel helpers. */
1360 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1361 put_pixels16_xy2_8_c(dst, src, stride, 16);
1363 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1364 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1366 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1367 put_pixels8_xy2_8_c(dst, src, stride, 8);
1369 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1370 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1372 #endif /* CONFIG_RV40_DECODER */
/* WMV2 mspel vertical lowpass: same 4-tap (-1, 9, 9, -1)/16 half-pel
 * filter as the horizontal version, applied down a column of 8 output
 * pixels.  Reads one row above (src_1) and up to row 9 below.
 * NOTE(review): the loop over the w columns and the src/dst advances
 * are not visible in this excerpt. */
1374 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1375 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1379 const int src_1= src[ -srcStride];
1380 const int src0 = src[0 ];
1381 const int src1 = src[ srcStride];
1382 const int src2 = src[2*srcStride];
1383 const int src3 = src[3*srcStride];
1384 const int src4 = src[4*srcStride];
1385 const int src5 = src[5*srcStride];
1386 const int src6 = src[6*srcStride];
1387 const int src7 = src[7*srcStride];
1388 const int src8 = src[8*srcStride];
1389 const int src9 = src[9*srcStride];
1390 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1391 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1392 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1393 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1394 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1395 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1396 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1397 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel 8x8 motion-compensation wrappers.  The mcXY suffix encodes
 * the sub-pel position: X = horizontal, Y = vertical half/quarter step.
 * Combined positions filter into temporary buffers and then average
 * with put_pixels8_l2_8. */
/* (1/2-ish, 0): horizontal lowpass averaged with the source. */
1403 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1405 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1406 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
/* (1/2, 0): pure horizontal lowpass straight into dst. */
1409 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1410 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* (3/4-ish, 0): horizontal lowpass averaged with src shifted right by 1. */
1413 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1415 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1416 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
/* (0, 1/2): pure vertical lowpass. */
1419 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1420 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* (1, 2): H-filter 11 rows starting one row above, then V-filter both
 * the source column and the H result, and average the two. */
1423 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1427 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1428 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1429 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1430 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* (3, 2): same as mc12 but the vertical source column is src+1. */
1432 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1436 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1437 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1438 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1439 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* (2, 2): horizontal then vertical lowpass (separable HV half-pel). */
1441 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1443 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1444 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking filter (Annex J style), vertical edge variant:
 * filters across a horizontal block boundary.  p0/p1 are above the
 * edge, p2/p3 below; d is the edge gradient and d1 a ramped version
 * of it limited by the qscale-dependent strength table. */
1447 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1448 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1450 const int strength= ff_h263_loop_filter_strength[qscale];
1454 int p0= src[x-2*stride];
1455 int p1= src[x-1*stride];
1456 int p2= src[x+0*stride];
1457 int p3= src[x+1*stride];
1458 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* Piecewise-linear ramp: d1 rises with d up to 'strength', then falls
 * back to 0 at 2*strength (dead zone for strong edges). */
1460 if (d<-2*strength) d1= 0;
1461 else if(d<- strength) d1=-2*strength - d;
1462 else if(d< strength) d1= d;
1463 else if(d< 2*strength) d1= 2*strength - d;
/* Saturate p1/p2 to [0,255] if the +/-d1 correction over/underflowed
 * 8 bits: bit 8 set means out of range; ~(p>>31) yields 0 or 255. */
1468 if(p1&256) p1= ~(p1>>31);
1469 if(p2&256) p2= ~(p2>>31);
1471 src[x-1*stride] = p1;
1472 src[x+0*stride] = p2;
/* Secondary, weaker correction on the outer pixels, clipped to +/-ad1
 * (presumably ad1 = |d1|/2 as elsewhere in FFmpeg -- line elided). */
1476 d2= av_clip((p0-p3)/4, -ad1, ad1);
1478 src[x-2*stride] = p0 - d2;
1479 src[x+ stride] = p3 + d2;
/* H.263 deblocking filter, horizontal edge variant: identical math to
 * h263_v_loop_filter_c but filtering across a vertical block boundary
 * (pixels addressed along a row instead of down a column). */
1484 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1485 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1487 const int strength= ff_h263_loop_filter_strength[qscale];
1491 int p0= src[y*stride-2];
1492 int p1= src[y*stride-1];
1493 int p2= src[y*stride+0];
1494 int p3= src[y*stride+1];
1495 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* Same piecewise-linear ramp of the edge gradient as the vertical
 * filter: active band |d| < 2*strength. */
1497 if (d<-2*strength) d1= 0;
1498 else if(d<- strength) d1=-2*strength - d;
1499 else if(d< strength) d1= d;
1500 else if(d< 2*strength) d1= 2*strength - d;
/* Saturate p1/p2 to [0,255] after the correction (bit 8 = overflow). */
1505 if(p1&256) p1= ~(p1>>31);
1506 if(p2&256) p2= ~(p2>>31);
1508 src[y*stride-1] = p1;
1509 src[y*stride+0] = p2;
/* Weaker clipped correction applied to the outer pixel pair. */
1513 d2= av_clip((p0-p3)/4, -ad1, ad1);
1515 src[y*stride-2] = p0 - d2;
1516 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block,
 * computed in a 4x-scaled temp[] buffer.  Border rows/columns are
 * copied through unfiltered (the 4*src entries below), interior pixels
 * get the full 2-D (1,2,1)x(1,2,1)/16 kernel.
 * NOTE(review): the surrounding loops over x/y and the temp[]/yz
 * bookkeeping are partially elided in this excerpt. */
1521 static void h261_loop_filter_c(uint8_t *src, int stride){
1526 temp[x ] = 4*src[x ];
1527 temp[x + 7*8] = 4*src[x + 7*stride];
1531 xy = y * stride + x;
/* Vertical (1,2,1) pass into the scaled temp buffer. */
1533 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
/* First/last column written back with rounding only (no H filtering). */
1538 src[ y*stride] = (temp[ y*8] + 2)>>2;
1539 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1541 xy = y * stride + x;
/* Horizontal (1,2,1) pass + final rounding (total scale 16). */
1543 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* SAD of a 16-pixel-wide block: sum of absolute differences between
 * pix1 and pix2, accumulated row by row for h rows.
 * NOTE(review): the row loop, pointer advances and final return are
 * not visible in this excerpt. */
1548 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1554 s += abs(pix1[0] - pix2[0]);
1555 s += abs(pix1[1] - pix2[1]);
1556 s += abs(pix1[2] - pix2[2]);
1557 s += abs(pix1[3] - pix2[3]);
1558 s += abs(pix1[4] - pix2[4]);
1559 s += abs(pix1[5] - pix2[5]);
1560 s += abs(pix1[6] - pix2[6]);
1561 s += abs(pix1[7] - pix2[7]);
1562 s += abs(pix1[8] - pix2[8]);
1563 s += abs(pix1[9] - pix2[9]);
1564 s += abs(pix1[10] - pix2[10]);
1565 s += abs(pix1[11] - pix2[11]);
1566 s += abs(pix1[12] - pix2[12]);
1567 s += abs(pix1[13] - pix2[13]);
1568 s += abs(pix1[14] - pix2[14]);
1569 s += abs(pix1[15] - pix2[15]);
/* SAD of a 16-wide block against the horizontally half-pel
 * interpolated reference: pix2 is averaged with its right neighbour
 * (avg2) before the absolute difference. */
1576 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1582 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1583 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1584 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1585 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1586 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1587 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1588 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1589 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1590 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1591 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1592 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1593 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1594 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1595 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1596 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1597 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of a 16-wide block against the vertically half-pel interpolated
 * reference: each pix2 sample is averaged with the sample one row
 * below (pix3 = pix2 + line_size). */
1604 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1607 uint8_t *pix3 = pix2 + line_size;
1611 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1612 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1613 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1614 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1615 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1616 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1617 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1618 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1619 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1620 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1621 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1622 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1623 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1624 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1625 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1626 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of a 16-wide block against the diagonally (x+y) half-pel
 * interpolated reference: avg4 of the 2x2 neighbourhood spanning the
 * current row (pix2) and the next (pix3). */
1634 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1637 uint8_t *pix3 = pix2 + line_size;
1641 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1642 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1643 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1644 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1645 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1646 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1647 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1648 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1649 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1650 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1651 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1652 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1653 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1654 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1655 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1656 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD: same as pix_abs16_c but over 8 columns per row. */
1664 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1670 s += abs(pix1[0] - pix2[0]);
1671 s += abs(pix1[1] - pix2[1]);
1672 s += abs(pix1[2] - pix2[2]);
1673 s += abs(pix1[3] - pix2[3]);
1674 s += abs(pix1[4] - pix2[4]);
1675 s += abs(pix1[5] - pix2[5]);
1676 s += abs(pix1[6] - pix2[6]);
1677 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD against the horizontally half-pel averaged reference. */
1684 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1690 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1691 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1692 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1693 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1694 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1695 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1696 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1697 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD against the vertically half-pel averaged reference
 * (pix3 = row below pix2). */
1704 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1707 uint8_t *pix3 = pix2 + line_size;
1711 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1712 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1713 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1714 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1715 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1716 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1717 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1718 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD against the diagonal half-pel reference (2x2 avg4). */
1726 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1729 uint8_t *pix3 = pix2 + line_size;
1733 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1734 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1735 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1736 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1737 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1738 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1739 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1740 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16-wide: score1 is plain SSE; score2 compares
 * the local 2x2 gradient structure of the two blocks, so differences
 * that merely add/remove noise-like texture are penalised separately,
 * weighted by avctx->nsse_weight (default weight 8 when no context). */
1748 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1749 MpegEncContext *c = v;
1755 for(x=0; x<16; x++){
1756 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
/* 2x2 "gradient" term: difference of cross-differences between the
 * two images (15 columns since it looks one pixel right and down). */
1759 for(x=0; x<15; x++){
1760 score2+= FFABS( s1[x ] - s1[x +stride]
1761 - s1[x+1] + s1[x+1+stride])
1762 -FFABS( s2[x ] - s2[x +stride]
1763 - s2[x+1] + s2[x+1+stride]);
1770 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1771 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c: SSE plus weighted gradient-structure
 * difference.  Column loop headers are elided in this excerpt. */
1774 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1775 MpegEncContext *c = v;
1782 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1786 score2+= FFABS( s1[x ] - s1[x +stride]
1787 - s1[x+1] + s1[x+1+stride])
1788 -FFABS( s2[x ] - s2[x +stride]
1789 - s2[x+1] + s2[x+1+stride]);
1796 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1797 else return score1 + FFABS(score2)*8;
/* Trellis helper: evaluate the weighted squared error that would
 * result from adding 'basis' scaled by 'scale' to the residual 'rem'
 * (rounded down from BASIS_SHIFT to RECON_SHIFT precision). */
1800 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1804 for(i=0; i<8*8; i++){
1805 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1808 assert(-512<b && b<512);
/* w*b fits comfortably; >>4 rescales the squared weighted error. */
1810 sum += (w*b)*(w*b)>>4;
/* Actually apply the same scaled basis function to the residual. */
1815 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1818 for(i=0; i<8*8; i++){
1819 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1824  * permutes an 8x8 block.
1825  * @param block the block which will be permuted according to the given permutation vector
1826  * @param permutation the permutation vector
1827  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1828  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1829  * (inverse) permuted to scantable order!
1831 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1837 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* First pass: stash the coefficients up to 'last' (in scan order) in a
 * temp copy -- the gather/clear statements are elided in this excerpt. */
1839 for(i=0; i<=last; i++){
1840 const int j= scantable[i];
/* Second pass: scatter each saved coefficient to its permuted slot. */
1845 for(i=0; i<=last; i++){
1846 const int j= scantable[i];
1847 const int perm_j= permutation[j];
1848 block[perm_j]= temp[j];
/* Dummy compare function -- presumably returns a constant 0 so a cmp
 * slot can be disabled (body elided in this excerpt). */
1852 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Select the 6 compare functions (full/half pel, 16/8 sizes) for the
 * given FF_CMP_* 'type'; unknown types fall through to the error log.
 * Only a subset of the switch cases is visible in this excerpt. */
1856 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1859 memset(cmp, 0, sizeof(void*)*6);
1867 cmp[i]= c->hadamard8_diff[i];
1873 cmp[i]= c->dct_sad[i];
1876 cmp[i]= c->dct264_sad[i];
1879 cmp[i]= c->dct_max[i];
1882 cmp[i]= c->quant_psnr[i];
1911 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* SWAR byte-wise add, one machine word at a time: the low 7 bits of
 * each byte are added without cross-byte carry, and the MSBs are
 * recombined with XOR.  Scalar tail handles the remaining bytes. */
1916 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1918 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1919 long a = *(long*)(src+i);
1920 long b = *(long*)(dst+i);
1921 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1924 dst[i+0] += src[i+0];
/* Same SWAR add but dst = src1 + src2 (three-operand form). */
1927 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1929 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1930 long a = *(long*)(src1+i);
1931 long b = *(long*)(src2+i);
1932 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1935 dst[i] = src1[i]+src2[i];
/* Byte-wise subtraction dst = src1 - src2. */
1938 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1940 #if !HAVE_FAST_UNALIGNED
/* Unaligned src2: fall back to an 8x-unrolled scalar loop rather than
 * doing misaligned word loads. */
1941 if((long)src2 & (sizeof(long)-1)){
1942 for(i=0; i+7<w; i+=8){
1943 dst[i+0] = src1[i+0]-src2[i+0];
1944 dst[i+1] = src1[i+1]-src2[i+1];
1945 dst[i+2] = src1[i+2]-src2[i+2];
1946 dst[i+3] = src1[i+3]-src2[i+3];
1947 dst[i+4] = src1[i+4]-src2[i+4];
1948 dst[i+5] = src1[i+5]-src2[i+5];
1949 dst[i+6] = src1[i+6]-src2[i+6];
1950 dst[i+7] = src1[i+7]-src2[i+7];
/* SWAR subtract: borrow is isolated per byte via the 0x80 masks. */
1954 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1955 long a = *(long*)(src1+i);
1956 long b = *(long*)(src2+i);
1957 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1960 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median/left predictors.  'left' and 'left_top' carry the
 * running predictor state in and out across calls; mid_pred picks the
 * median of (left, top, left+top-topleft).  Loop/return lines are
 * elided in this excerpt. */
1963 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1971 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* Encoder-side inverse: emit the residual against the same median. */
1980 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1988 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Left-prediction accumulator for planar data; returns the new 'acc'. */
1998 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
2001 for(i=0; i<w-1; i++){
/* BGR32 variant tracking per-channel running values. */
2028 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Butterfly helpers for the 8x8 Hadamard transform below (BUTTERFLY2
 * and BUTTERFLY1 bodies are backslash-continued; no comments may be
 * inserted between their lines). */
2058 #define BUTTERFLY2(o1,o2,i1,i2) \
2062 #define BUTTERFLY1(x,y) \
2071 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the src-dst difference, then sum of
 * absolute transform coefficients.  Rows are transformed in the first
 * loop, columns plus the final abs-sum in the second. */
2073 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2081 //FIXME try pointer walks
2082 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2083 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2084 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2085 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2087 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2088 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2089 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2090 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2092 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2093 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2094 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2095 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column pass over the row-transformed coefficients. */
2099 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2100 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2101 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2102 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2104 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2105 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2106 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2107 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* Last butterfly stage folded into the abs-sum via BUTTERFLYA. */
2110 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2111 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2112 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2113 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2119 printf("MAX:%d\n", maxi);
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but applied to
 * src directly (no reference), with the DC-related term subtracted at
 * the end so the score ignores the block mean. */
2125 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2133 //FIXME try pointer walks
2134 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2135 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2136 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2137 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2139 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2140 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2141 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2142 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2144 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2145 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2146 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2147 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column pass. */
2151 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2152 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2153 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2154 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2156 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2157 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2158 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2159 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2162 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2163 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2164 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2165 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2168 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-transform the src1-src2 difference block and
 * return the sum of absolute DCT coefficients (the fdct call between
 * diff_pixels and sum_abs_dctelem is elided in this excerpt). */
2173 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2174 MpegEncContext * const s= (MpegEncContext *)c;
2175 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2179 s->dsp.diff_pixels(temp, src1, src2, stride);
2181 return s->dsp.sum_abs_dctelem(temp);
2186 const int s07 = SRC(0) + SRC(7);\
2187 const int s16 = SRC(1) + SRC(6);\
2188 const int s25 = SRC(2) + SRC(5);\
2189 const int s34 = SRC(3) + SRC(4);\
2190 const int a0 = s07 + s34;\
2191 const int a1 = s16 + s25;\
2192 const int a2 = s07 - s34;\
2193 const int a3 = s16 - s25;\
2194 const int d07 = SRC(0) - SRC(7);\
2195 const int d16 = SRC(1) - SRC(6);\
2196 const int d25 = SRC(2) - SRC(5);\
2197 const int d34 = SRC(3) - SRC(4);\
2198 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2199 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2200 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2201 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2203 DST(1, a4 + (a7>>2)) ;\
2204 DST(2, a2 + (a3>>1)) ;\
2205 DST(3, a5 + (a6>>2)) ;\
2207 DST(5, a6 - (a5>>2)) ;\
2208 DST(6, (a2>>1) - a3 ) ;\
2209 DST(7, (a4>>2) - a7 ) ;\
/* H.264-style 8x8 integer-DCT SAD: run DCT8_1D over rows (SRC/DST map
 * into dct[][]), then over columns with DST redefined to accumulate
 * absolute values directly into 'sum'. */
2212 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2213 MpegEncContext * const s= (MpegEncContext *)c;
2218 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2220 #define SRC(x) dct[i][x]
2221 #define DST(x,v) dct[i][x]= v
2222 for( i = 0; i < 8; i++ )
2227 #define SRC(x) dct[x][i]
2228 #define DST(x,v) sum += FFABS(v)
2229 for( i = 0; i < 8; i++ )
/* Max-coefficient metric: forward DCT of the difference, then return
 * the largest absolute coefficient. */
2237 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2238 MpegEncContext * const s= (MpegEncContext *)c;
2239 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2244 s->dsp.diff_pixels(temp, src1, src2, stride);
2248 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantisation-noise metric: DCT the difference, keep a copy (bak),
 * quantise + dequantise + IDCT, and return the squared error between
 * the round-tripped and original coefficients. */
2253 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2254 MpegEncContext * const s= (MpegEncContext *)c;
2255 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2256 DCTELEM * const bak = temp+64;
2262 s->dsp.diff_pixels(temp, src1, src2, stride);
2264 memcpy(bak, temp, 64*sizeof(DCTELEM));
2266 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2267 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2268 ff_simple_idct(temp); //FIXME
2271 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for an 8x8 block: quantise the DCT of the
 * difference, count the VLC bits needed to code the coefficients
 * (rate), reconstruct and measure SSE against the original
 * (distortion), and combine them with a qscale-dependent lambda. */
2276 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2277 MpegEncContext * const s= (MpegEncContext *)c;
2278 const uint8_t *scantable= s->intra_scantable.permutated;
2279 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2280 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2281 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2282 int i, last, run, bits, level, distortion, start_i;
2283 const int esc_length= s->ac_esc_length;
2285 uint8_t * last_length;
2289 copy_block8(lsrc1, src1, 8, stride, 8);
2290 copy_block8(lsrc2, src2, 8, stride, 8);
2292 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2294 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* Intra blocks use the intra AC tables plus a luma-DC length term. */
2300 length = s->intra_ac_vlc_length;
2301 last_length= s->intra_ac_vlc_last_length;
2302 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2305 length = s->inter_ac_vlc_length;
2306 last_length= s->inter_ac_vlc_last_length;
/* Count bits for all AC coefficients before the last one; levels that
 * fit in the unified table use it, others cost esc_length. */
2311 for(i=start_i; i<last; i++){
2312 int j= scantable[i];
2317 if((level&(~127)) == 0){
2318 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* The final coefficient uses the "last" VLC table. */
2327 level= temp[i] + 64;
2331 if((level&(~127)) == 0){
2332 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2340 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2342 s->dct_unquantize_inter(s, temp, 0, s->qscale);
/* Reconstruct into lsrc2 and measure 8x8 SSE against the original. */
2345 s->dsp.idct_add(lsrc2, 8, temp);
2347 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2349 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/*
 * FF_CMP_BIT metric: number of VLC bits needed to code the quantized DCT
 * of the 8x8 difference block (no distortion term, unlike rd8x8_c).
 * NOTE(review): this excerpt is missing interior lines — the
 * "uint8_t * length" declaration, bits/run initialization, branch heads,
 * escape handling, the return statement and closing braces. Restore from
 * upstream dsputil.c before compiling.
 */
2352 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2353 MpegEncContext * const s= (MpegEncContext *)c;
2354 const uint8_t *scantable= s->intra_scantable.permutated;
2355 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2356 int i, last, run, bits, level, start_i;
2357 const int esc_length= s->ac_esc_length;
2359 uint8_t * last_length;
2363 s->dsp.diff_pixels(temp, src1, src2, stride);
2365 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: AC tables plus DC bits (biased by 256 to index the table) */
2371 length = s->intra_ac_vlc_length;
2372 last_length= s->intra_ac_vlc_last_length;
2373 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter path: AC-only tables */
2376 length = s->inter_ac_vlc_length;
2377 last_length= s->inter_ac_vlc_last_length;
/* count VLC bits for the nonzero coefficients in scan order */
2382 for(i=start_i; i<last; i++){
2383 int j= scantable[i];
/* level fits in the combined run/level table iff bits 7+ are clear */
2388 if((level&(~127)) == 0){
2389 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* final coefficient uses the "last" table (presumably level biased by +64) */
2398 level= temp[i] + 64;
2402 if((level&(~127)) == 0){
2403 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical SAD inside one block (intra): sum of |p(x,y) - p(x,y+1)|,
 * processed four columns at a time. Instantiated for widths 8 and 16.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
/* Vertical SAD between two blocks: absolute difference of their
 * row-to-row gradients over a 16-wide area.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x  ] - s1[x  +stride] + s2[x  +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/* squared-difference helper for the vsse metrics below */
#define SQ(a) ((a)*(a))

/* Vertical SSE inside one block (intra): squared difference between
 * vertically adjacent pixels, four columns at a time. Instantiated for
 * widths 8 and 16.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
/* Vertical SSE between two blocks: squared error of the vertical-gradient
 * difference over a 16-wide area.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/**
 * Sum of squared differences between an int8 and an int16 vector.
 * @param size number of elements to compare
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;

    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);

    return score;
}
/* Build the 16x16 comparison functions from their 8x8 kernels: each
 * wrapper sums the 8x8 metric over the four quadrants of a 16x16 area
 * (WRAPPER8_16_SQ is defined earlier in this file). */
2487 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2488 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2489 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2491 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2493 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2494 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2495 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2496 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Element-wise product: dst[i] = src0[i] * src1[i].
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i];
}
/* Multiply src0 by src1 read backwards: dst[i] = src0[i] * src1[len-1-i].
 * NOTE(review): excerpt was truncated (the "src1 += len-1;" setup line was
 * missing); restored from upstream dsputil.c. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}
/* Fused multiply-add over vectors: dst[i] = src0[i] * src1[i] + src2[i].
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}
/**
 * Overlap-add windowing used by MDCT-based audio codecs.
 * Writes 2*len outputs: pointers are advanced by len so the loop indexes
 * symmetrically with i in [-len,0) and j in (0,len-1] around the center.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c.
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
    }
}
/* Scale a vector by a scalar: dst[i] = src[i] * mul.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}
/* Multiply src by a sequence of 2-element vectors (one per output pair)
 * and a scalar: dst[i+k] = src[i+k] * sv[i/2][k] * mul, k in {0,1}.
 * len is assumed to be a multiple of 2.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}
/* Same as vector_fmul_sv_scalar_2_c but with 4-element sub-vectors;
 * len is assumed to be a multiple of 4.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}
/* Expand a sequence of 2-element vectors scaled by mul into dst;
 * len is assumed to be a multiple of 2.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}
/* Expand a sequence of 4-element vectors scaled by mul into dst;
 * len is assumed to be a multiple of 4.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}
/* In-place butterfly: v1[i] becomes the sum, v2[i] the difference.
 * The restrict qualifiers promise the two vectors do not overlap.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}
/* Dot product of two float vectors.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}
/* Clip one float (as raw IEEE-754 bits) against precomputed limits;
 * valid only when min < 0 < max (see vector_clipf_c_opposite_sign).
 * "a > mini" catches values more negative than min (sign bit makes them
 * large unsigned); xoring the sign bit makes positives comparable to
 * maxisign for the upper clip.
 * NOTE(review): excerpt was truncated (missing final "return a" branch);
 * restored from upstream dsputil.c. */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
/* Bit-level float clipping for the min<0<max case, unrolled by 8
 * (len is assumed to be a multiple of 8). Operates on the raw IEEE-754
 * bit patterns via clipf_c_one.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c.
 * The pointer casts rely on float/uint32_t punning, as the original did. */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/* Clip a float vector to [min,max], unrolled by 8 (len is assumed to be
 * a multiple of 8). Dispatches to the bit-twiddling variant when the
 * limits straddle zero.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
/* Dot product of two int16 vectors, with each partial product shifted
 * right by 'shift' before accumulation.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}
/* Returns the dot product of v1 and v2 while simultaneously updating
 * v1 in place: v1[i] += mul * v3[i] (the product uses the OLD v1 value).
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;

    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}
/**
 * Apply a symmetric Q15 fixed-point window to a buffer.
 * window[] holds the first half of the window; its mirror image windows
 * the second half, so coefficient i is applied to both input[i] and
 * input[len-i-1]. Results are rounded (+1<<14) before the >>15.
 * NOTE(review): excerpt was truncated; restored from upstream dsputil.c.
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w = window[i];
        output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
    }
}
/* Fixed-point coefficients for the WMV2 IDCT below:
   Wi = 2048*sqrt(2)*cos(i*pi/16) for i = 1..7.
   W0 is plain 2048 (not the formula value) — it scales the DC/b[4] pair
   in wmv2_idct_{row,col}; restored here since those functions use it. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
2695 static void wmv2_idct_row(short * b)
2698 int a0,a1,a2,a3,a4,a5,a6,a7;
2700 a1 = W1*b[1]+W7*b[7];
2701 a7 = W7*b[1]-W1*b[7];
2702 a5 = W5*b[5]+W3*b[3];
2703 a3 = W3*b[5]-W5*b[3];
2704 a2 = W2*b[2]+W6*b[6];
2705 a6 = W6*b[2]-W2*b[6];
2706 a0 = W0*b[0]+W0*b[4];
2707 a4 = W0*b[0]-W0*b[4];
2709 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2710 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2712 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2713 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2714 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2715 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2716 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2717 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2718 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2719 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2721 static void wmv2_idct_col(short * b)
2724 int a0,a1,a2,a3,a4,a5,a6,a7;
2725 /*step 1, with extended precision*/
2726 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2727 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2728 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2729 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2730 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2731 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2732 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2733 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
2735 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2736 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2738 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2739 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2740 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2741 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2743 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2744 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2745 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2746 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/** Full WMV2 inverse DCT on a 64-coefficient block: 8 row passes
 *  followed by 8 column passes.
 *  NOTE(review): excerpt was truncated; restored from upstream dsputil.c. */
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
2758 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2760 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2762 ff_wmv2_idct_c(block);
2763 ff_put_pixels_clamped_c(block, dest, line_size);
2765 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2767 ff_wmv2_idct_c(block);
2768 ff_add_pixels_clamped_c(block, dest, line_size);
2770 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2773 ff_put_pixels_clamped_c(block, dest, line_size);
2775 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2778 ff_add_pixels_clamped_c(block, dest, line_size);
2781 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2784 put_pixels_clamped4_c(block, dest, line_size);
2786 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2789 add_pixels_clamped4_c(block, dest, line_size);
2792 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2795 put_pixels_clamped2_c(block, dest, line_size);
2797 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2800 add_pixels_clamped2_c(block, dest, line_size);
2803 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2805 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2807 dest[0] = cm[(block[0] + 4)>>3];
2809 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2811 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2813 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2816 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2818 /* init static data */
2819 av_cold void dsputil_static_init(void)
2823 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2824 for(i=0;i<MAX_NEG_CROP;i++) {
2826 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2829 for(i=0;i<512;i++) {
2830 ff_squareTbl[i] = (i - 256) * (i - 256);
2833 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2836 int ff_check_alignment(void){
2837 static int did_fail=0;
2838 DECLARE_ALIGNED(16, int, aligned);
2840 if((intptr_t)&aligned & 15){
2842 #if HAVE_MMX || HAVE_ALTIVEC
2843 av_log(NULL, AV_LOG_ERROR,
2844 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2845 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2846 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2847 "Do not report crashes to FFmpeg developers.\n");
2856 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2860 ff_check_alignment();
2863 if(avctx->dct_algo==FF_DCT_FASTINT) {
2864 c->fdct = fdct_ifast;
2865 c->fdct248 = fdct_ifast248;
2867 else if(avctx->dct_algo==FF_DCT_FAAN) {
2868 c->fdct = ff_faandct;
2869 c->fdct248 = ff_faandct248;
2872 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2873 c->fdct248 = ff_fdct248_islow;
2875 #endif //CONFIG_ENCODERS
2877 if(avctx->lowres==1){
2878 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
2879 c->idct_put= ff_jref_idct4_put;
2880 c->idct_add= ff_jref_idct4_add;
2882 if (avctx->codec_id != CODEC_ID_H264) {
2883 c->idct_put= ff_h264_lowres_idct_put_8_c;
2884 c->idct_add= ff_h264_lowres_idct_add_8_c;
2886 switch (avctx->bits_per_raw_sample) {
2888 c->idct_put= ff_h264_lowres_idct_put_9_c;
2889 c->idct_add= ff_h264_lowres_idct_add_9_c;
2892 c->idct_put= ff_h264_lowres_idct_put_10_c;
2893 c->idct_add= ff_h264_lowres_idct_add_10_c;
2896 c->idct_put= ff_h264_lowres_idct_put_8_c;
2897 c->idct_add= ff_h264_lowres_idct_add_8_c;
2901 c->idct = j_rev_dct4;
2902 c->idct_permutation_type= FF_NO_IDCT_PERM;
2903 }else if(avctx->lowres==2){
2904 c->idct_put= ff_jref_idct2_put;
2905 c->idct_add= ff_jref_idct2_add;
2906 c->idct = j_rev_dct2;
2907 c->idct_permutation_type= FF_NO_IDCT_PERM;
2908 }else if(avctx->lowres==3){
2909 c->idct_put= ff_jref_idct1_put;
2910 c->idct_add= ff_jref_idct1_add;
2911 c->idct = j_rev_dct1;
2912 c->idct_permutation_type= FF_NO_IDCT_PERM;
2914 if(avctx->idct_algo==FF_IDCT_INT){
2915 c->idct_put= ff_jref_idct_put;
2916 c->idct_add= ff_jref_idct_add;
2917 c->idct = j_rev_dct;
2918 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2919 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2920 avctx->idct_algo==FF_IDCT_VP3){
2921 c->idct_put= ff_vp3_idct_put_c;
2922 c->idct_add= ff_vp3_idct_add_c;
2923 c->idct = ff_vp3_idct_c;
2924 c->idct_permutation_type= FF_NO_IDCT_PERM;
2925 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2926 c->idct_put= ff_wmv2_idct_put_c;
2927 c->idct_add= ff_wmv2_idct_add_c;
2928 c->idct = ff_wmv2_idct_c;
2929 c->idct_permutation_type= FF_NO_IDCT_PERM;
2930 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2931 c->idct_put= ff_faanidct_put;
2932 c->idct_add= ff_faanidct_add;
2933 c->idct = ff_faanidct;
2934 c->idct_permutation_type= FF_NO_IDCT_PERM;
2935 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2936 c->idct_put= ff_ea_idct_put_c;
2937 c->idct_permutation_type= FF_NO_IDCT_PERM;
2938 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
2939 c->idct = ff_bink_idct_c;
2940 c->idct_add = ff_bink_idct_add_c;
2941 c->idct_put = ff_bink_idct_put_c;
2942 c->idct_permutation_type = FF_NO_IDCT_PERM;
2943 }else{ //accurate/default
2944 c->idct_put= ff_simple_idct_put;
2945 c->idct_add= ff_simple_idct_add;
2946 c->idct = ff_simple_idct;
2947 c->idct_permutation_type= FF_NO_IDCT_PERM;
2951 c->get_pixels = get_pixels_c;
2952 c->diff_pixels = diff_pixels_c;
2953 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2954 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2955 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
2956 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2957 c->sum_abs_dctelem = sum_abs_dctelem_c;
2960 c->pix_sum = pix_sum_c;
2961 c->pix_norm1 = pix_norm1_c;
2963 c->fill_block_tab[0] = fill_block16_c;
2964 c->fill_block_tab[1] = fill_block8_c;
2965 c->scale_block = scale_block_c;
2967 /* TODO [0] 16 [1] 8 */
2968 c->pix_abs[0][0] = pix_abs16_c;
2969 c->pix_abs[0][1] = pix_abs16_x2_c;
2970 c->pix_abs[0][2] = pix_abs16_y2_c;
2971 c->pix_abs[0][3] = pix_abs16_xy2_c;
2972 c->pix_abs[1][0] = pix_abs8_c;
2973 c->pix_abs[1][1] = pix_abs8_x2_c;
2974 c->pix_abs[1][2] = pix_abs8_y2_c;
2975 c->pix_abs[1][3] = pix_abs8_xy2_c;
2977 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2978 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2979 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2980 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2981 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2982 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2983 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2984 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2985 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2987 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2988 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2989 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2990 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2991 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2992 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2993 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2994 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2995 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2997 #define dspfunc(PFX, IDX, NUM) \
2998 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2999 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3000 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3001 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3002 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3003 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3004 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3005 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3006 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3007 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3008 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3009 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3010 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3011 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3012 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3013 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3015 dspfunc(put_qpel, 0, 16);
3016 dspfunc(put_no_rnd_qpel, 0, 16);
3018 dspfunc(avg_qpel, 0, 16);
3019 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3021 dspfunc(put_qpel, 1, 8);
3022 dspfunc(put_no_rnd_qpel, 1, 8);
3024 dspfunc(avg_qpel, 1, 8);
3025 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3029 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
3030 ff_mlp_init(c, avctx);
3032 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
3033 ff_intrax8dsp_init(c,avctx);
3035 #if CONFIG_RV30_DECODER
3036 ff_rv30dsp_init(c,avctx);
3038 #if CONFIG_RV40_DECODER
3039 ff_rv40dsp_init(c,avctx);
3040 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
3041 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
3042 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
3043 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
3046 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
3047 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3048 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3049 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3050 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3051 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3052 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3053 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3055 #define SET_CMP_FUNC(name) \
3056 c->name[0]= name ## 16_c;\
3057 c->name[1]= name ## 8x8_c;
3059 SET_CMP_FUNC(hadamard8_diff)
3060 c->hadamard8_diff[4]= hadamard8_intra16_c;
3061 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3062 SET_CMP_FUNC(dct_sad)
3063 SET_CMP_FUNC(dct_max)
3065 SET_CMP_FUNC(dct264_sad)
3067 c->sad[0]= pix_abs16_c;
3068 c->sad[1]= pix_abs8_c;
3072 SET_CMP_FUNC(quant_psnr)
3075 c->vsad[0]= vsad16_c;
3076 c->vsad[4]= vsad_intra16_c;
3077 c->vsad[5]= vsad_intra8_c;
3078 c->vsse[0]= vsse16_c;
3079 c->vsse[4]= vsse_intra16_c;
3080 c->vsse[5]= vsse_intra8_c;
3081 c->nsse[0]= nsse16_c;
3082 c->nsse[1]= nsse8_c;
3084 ff_dsputil_init_dwt(c);
3087 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3089 c->add_bytes= add_bytes_c;
3090 c->add_bytes_l2= add_bytes_l2_c;
3091 c->diff_bytes= diff_bytes_c;
3092 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3093 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3094 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
3095 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3096 c->bswap_buf= bswap_buf;
3097 c->bswap16_buf = bswap16_buf;
3098 #if CONFIG_PNG_DECODER
3099 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
3102 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3103 c->h263_h_loop_filter= h263_h_loop_filter_c;
3104 c->h263_v_loop_filter= h263_v_loop_filter_c;
3107 if (CONFIG_VP3_DECODER) {
3108 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3109 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3110 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3113 c->h261_loop_filter= h261_loop_filter_c;
3115 c->try_8x8basis= try_8x8basis_c;
3116 c->add_8x8basis= add_8x8basis_c;
3118 #if CONFIG_VORBIS_DECODER
3119 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3121 #if CONFIG_AC3_DECODER
3122 c->ac3_downmix = ff_ac3_downmix_c;
3124 c->vector_fmul = vector_fmul_c;
3125 c->vector_fmul_reverse = vector_fmul_reverse_c;
3126 c->vector_fmul_add = vector_fmul_add_c;
3127 c->vector_fmul_window = vector_fmul_window_c;
3128 c->vector_clipf = vector_clipf_c;
3129 c->scalarproduct_int16 = scalarproduct_int16_c;
3130 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3131 c->apply_window_int16 = apply_window_int16_c;
3132 c->scalarproduct_float = scalarproduct_float_c;
3133 c->butterflies_float = butterflies_float_c;
3134 c->vector_fmul_scalar = vector_fmul_scalar_c;
3136 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
3137 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
3139 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
3140 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
3142 c->shrink[0]= av_image_copy_plane;
3143 c->shrink[1]= ff_shrink22;
3144 c->shrink[2]= ff_shrink44;
3145 c->shrink[3]= ff_shrink88;
3147 c->prefetch= just_return;
3149 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3150 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3154 #define FUNC(f, depth) f ## _ ## depth
3155 #define FUNCC(f, depth) f ## _ ## depth ## _c
3157 #define dspfunc1(PFX, IDX, NUM, depth)\
3158 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
3159 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3160 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3161 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3163 #define dspfunc2(PFX, IDX, NUM, depth)\
3164 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3165 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3166 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3167 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3168 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3169 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3170 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3171 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3172 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3173 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3174 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3175 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3176 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3177 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3178 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3179 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3182 #define BIT_DEPTH_FUNCS(depth)\
3183 c->draw_edges = FUNCC(draw_edges , depth);\
3184 c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
3185 c->clear_block = FUNCC(clear_block , depth);\
3186 c->clear_blocks = FUNCC(clear_blocks , depth);\
3187 c->add_pixels8 = FUNCC(add_pixels8 , depth);\
3188 c->add_pixels4 = FUNCC(add_pixels4 , depth);\
3189 c->put_no_rnd_pixels_l2[0] = FUNCC(put_no_rnd_pixels16_l2, depth);\
3190 c->put_no_rnd_pixels_l2[1] = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3192 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3193 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3194 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3195 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3196 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3197 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3199 dspfunc1(put , 0, 16, depth);\
3200 dspfunc1(put , 1, 8, depth);\
3201 dspfunc1(put , 2, 4, depth);\
3202 dspfunc1(put , 3, 2, depth);\
3203 dspfunc1(put_no_rnd, 0, 16, depth);\
3204 dspfunc1(put_no_rnd, 1, 8, depth);\
3205 dspfunc1(avg , 0, 16, depth);\
3206 dspfunc1(avg , 1, 8, depth);\
3207 dspfunc1(avg , 2, 4, depth);\
3208 dspfunc1(avg , 3, 2, depth);\
3209 dspfunc1(avg_no_rnd, 0, 16, depth);\
3210 dspfunc1(avg_no_rnd, 1, 8, depth);\
3212 dspfunc2(put_h264_qpel, 0, 16, depth);\
3213 dspfunc2(put_h264_qpel, 1, 8, depth);\
3214 dspfunc2(put_h264_qpel, 2, 4, depth);\
3215 dspfunc2(put_h264_qpel, 3, 2, depth);\
3216 dspfunc2(avg_h264_qpel, 0, 16, depth);\
3217 dspfunc2(avg_h264_qpel, 1, 8, depth);\
3218 dspfunc2(avg_h264_qpel, 2, 4, depth);
3220 if (avctx->codec_id != CODEC_ID_H264 || avctx->bits_per_raw_sample == 8) {
3223 switch (avctx->bits_per_raw_sample) {
3231 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3238 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
3239 if (ARCH_ARM) dsputil_init_arm (c, avctx);
3240 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
3241 if (HAVE_VIS) dsputil_init_vis (c, avctx);
3242 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
3243 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
3244 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
3245 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
3246 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
3248 for(i=0; i<64; i++){
3249 if(!c->put_2tap_qpel_pixels_tab[0][i])
3250 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3251 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3252 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3255 c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3256 c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3257 c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3258 c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3260 c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3261 c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3262 c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3263 c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3265 switch(c->idct_permutation_type){
3266 case FF_NO_IDCT_PERM:
3268 c->idct_permutation[i]= i;
3270 case FF_LIBMPEG2_IDCT_PERM:
3272 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3274 case FF_SIMPLE_IDCT_PERM:
3276 c->idct_permutation[i]= simple_mmx_permutation[i];
3278 case FF_TRANSPOSE_IDCT_PERM:
3280 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3282 case FF_PARTTRANS_IDCT_PERM:
3284 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3286 case FF_SSE2_IDCT_PERM:
3288 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3291 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");