3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Clipping table with MAX_NEG_CROP headroom on both sides; zero-filled here
 * and populated at runtime by the dsputil init code — NOTE(review): the
 * initializer is not in this chunk, confirm it runs before first use. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Table of squares, also filled at init time; typically indexed with a +256
 * bias (see sse*_c below) so signed byte differences can be squared by lookup. */
uint32_t ff_squareTbl[512] = {0, };
44 #include "dsputil_template.c"
48 #include "dsputil_template.c"
52 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 yields 0x0101...01 (one per byte of unsigned long); multiplying by
 * the byte value replicates it into every byte lane. */
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
/* Standard JPEG/MPEG zigzag scan order: maps scan position -> raster index.
 * (Reconstructed missing `};` terminator lost to truncation.) */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
/* (Reconstructed missing `};` terminator lost to truncation.) */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* 16-byte aligned for SIMD access; NOTE(review): the code that fills this
 * table is not visible in this chunk — confirm it is initialized before use. */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (MPEG-2 style).
 * (Reconstructed missing `};` terminator lost to truncation.) */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (MPEG-2 style).
 * (Reconstructed missing `};` terminator lost to truncation.) */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
/* (Reconstructed missing `};` terminator lost to truncation.) */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
119 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
121 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
125 st->scantable= src_scantable;
129 j = src_scantable[i];
130 st->permutated[i] = permutation[j];
136 j = st->permutated[i];
138 st->raster_end[i]= end;
142 void ff_init_scantable_permutation(uint8_t *idct_permutation,
143 int idct_permutation_type)
147 switch(idct_permutation_type){
148 case FF_NO_IDCT_PERM:
150 idct_permutation[i]= i;
152 case FF_LIBMPEG2_IDCT_PERM:
154 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
156 case FF_SIMPLE_IDCT_PERM:
158 idct_permutation[i]= simple_mmx_permutation[i];
160 case FF_TRANSPOSE_IDCT_PERM:
162 idct_permutation[i]= ((i&7)<<3) | (i>>3);
164 case FF_PARTTRANS_IDCT_PERM:
166 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
168 case FF_SSE2_IDCT_PERM:
170 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
173 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/**
 * Sum of all 256 pixels of a 16x16 block.
 * (Reconstructed: the unrolled adds and accumulator scaffolding were lost
 * to truncation; inner loop processes 8 pixels per iteration as the visible
 * `j += 8` header shows.)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16; /* step to the next row */
    }
    return s;
}
199 static int pix_norm1_c(uint8_t * pix, int line_size)
202 uint32_t *sq = ff_squareTbl + 256;
205 for (i = 0; i < 16; i++) {
206 for (j = 0; j < 16; j += 8) {
218 register uint64_t x=*(uint64_t*)pix;
220 s += sq[(x>>8)&0xff];
221 s += sq[(x>>16)&0xff];
222 s += sq[(x>>24)&0xff];
223 s += sq[(x>>32)&0xff];
224 s += sq[(x>>40)&0xff];
225 s += sq[(x>>48)&0xff];
226 s += sq[(x>>56)&0xff];
228 register uint32_t x=*(uint32_t*)pix;
230 s += sq[(x>>8)&0xff];
231 s += sq[(x>>16)&0xff];
232 s += sq[(x>>24)&0xff];
233 x=*(uint32_t*)(pix+4);
235 s += sq[(x>>8)&0xff];
236 s += sq[(x>>16)&0xff];
237 s += sq[(x>>24)&0xff];
242 pix += line_size - 16;
/**
 * Byte-swap w 32-bit words from src into dst (may alias).
 * Main loop is unrolled by 8; the tail loop handles the remainder.
 * (Reconstructed: tail loop header and closing braces lost to truncation.)
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
    }
    for(;i<w;i++){
        dst[i+0]= av_bswap32(src[i+0]);
    }
}
/**
 * Byte-swap len 16-bit words from src into dst.
 * (Reconstructed: while-loop wrapper lost to truncation.)
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
271 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
274 uint32_t *sq = ff_squareTbl + 256;
277 for (i = 0; i < h; i++) {
278 s += sq[pix1[0] - pix2[0]];
279 s += sq[pix1[1] - pix2[1]];
280 s += sq[pix1[2] - pix2[2]];
281 s += sq[pix1[3] - pix2[3]];
288 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
291 uint32_t *sq = ff_squareTbl + 256;
294 for (i = 0; i < h; i++) {
295 s += sq[pix1[0] - pix2[0]];
296 s += sq[pix1[1] - pix2[1]];
297 s += sq[pix1[2] - pix2[2]];
298 s += sq[pix1[3] - pix2[3]];
299 s += sq[pix1[4] - pix2[4]];
300 s += sq[pix1[5] - pix2[5]];
301 s += sq[pix1[6] - pix2[6]];
302 s += sq[pix1[7] - pix2[7]];
309 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
312 uint32_t *sq = ff_squareTbl + 256;
315 for (i = 0; i < h; i++) {
316 s += sq[pix1[ 0] - pix2[ 0]];
317 s += sq[pix1[ 1] - pix2[ 1]];
318 s += sq[pix1[ 2] - pix2[ 2]];
319 s += sq[pix1[ 3] - pix2[ 3]];
320 s += sq[pix1[ 4] - pix2[ 4]];
321 s += sq[pix1[ 5] - pix2[ 5]];
322 s += sq[pix1[ 6] - pix2[ 6]];
323 s += sq[pix1[ 7] - pix2[ 7]];
324 s += sq[pix1[ 8] - pix2[ 8]];
325 s += sq[pix1[ 9] - pix2[ 9]];
326 s += sq[pix1[10] - pix2[10]];
327 s += sq[pix1[11] - pix2[11]];
328 s += sq[pix1[12] - pix2[12]];
329 s += sq[pix1[13] - pix2[13]];
330 s += sq[pix1[14] - pix2[14]];
331 s += sq[pix1[15] - pix2[15]];
/**
 * Compute an 8x8 block of pixel differences: block[i] = s1[i] - s2[i],
 * reading 8 rows of 8 pixels with stride `stride` and writing 64 int16 values.
 * (Reconstructed: loop scaffolding and pointer stepping lost to truncation.)
 */
static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++){
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
/**
 * Store an 8x8 block of int16 coefficients as pixels, clamping to [0,255].
 * (Reconstructed: the `int line_size` parameter line, loop scaffolding and
 * pointer stepping were lost to truncation; signature matches upstream.)
 */
static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++){
        pixels[0] = av_clip_uint8(block[0]);
        pixels[1] = av_clip_uint8(block[1]);
        pixels[2] = av_clip_uint8(block[2]);
        pixels[3] = av_clip_uint8(block[3]);
        pixels[4] = av_clip_uint8(block[4]);
        pixels[5] = av_clip_uint8(block[5]);
        pixels[6] = av_clip_uint8(block[6]);
        pixels[7] = av_clip_uint8(block[7]);
        pixels += line_size;
        block += 8;
    }
}
/**
 * Store an 8x8 block of signed int16 coefficients as pixels with a +128 bias:
 * values below -128 clamp to 0, above 127 clamp to 255.
 * (Reconstructed: branch bodies and pointer stepping lost to truncation.)
 */
static void put_signed_pixels_clamped_c(const int16_t *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8); /* advance to the next output row */
    }
}
/**
 * Add an 8x8 block of int16 coefficients onto pixels, clamping to [0,255].
 * (Reconstructed: the `int line_size` parameter line, loop scaffolding and
 * pointer stepping were lost to truncation; signature matches upstream.)
 */
static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++){
        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
        pixels[4] = av_clip_uint8(pixels[4] + block[4]);
        pixels[5] = av_clip_uint8(pixels[5] + block[5]);
        pixels[6] = av_clip_uint8(pixels[6] + block[6]);
        pixels[7] = av_clip_uint8(pixels[7] + block[7]);
        pixels += line_size;
        block += 8;
    }
}
/**
 * Sum of absolute values of all 64 coefficients of a DCT block.
 * (Reconstructed: accumulator, loop header and return lost to truncation.)
 */
static int sum_abs_dctelem_c(int16_t *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}
/**
 * Fill h rows of 16 bytes with `value`, stepping by line_size per row.
 * (Reconstructed: loop counter, row stepping and braces lost to truncation.)
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/**
 * Fill h rows of 8 bytes with `value`, stepping by line_size per row.
 * (Reconstructed: loop counter, row stepping and braces lost to truncation.)
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* Rounded averages of 2 and 4 values. Arguments are parenthesized so the
 * macros stay correct when called with compound expressions (the originals
 * expanded `a+b` unparenthesized, a precedence hazard). */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/**
 * 1/16-pel bilinear interpolation of an 8-wide block over h rows
 * (MPEG-4 GMC, one motion point). x16/y16 are the fractional positions in
 * 1/16ths; the four taps A..D sum to 256, so >>8 normalizes after `rounder`.
 * (Reconstructed: loop scaffolding and pointer stepping lost to truncation.)
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/**
 * General global-motion compensation: for each destination pixel, step an
 * affine motion field (dxx/dxy/dyx/dyy in 1/(1<<shift) units from ox/oy) and
 * bilinearly interpolate from src, clamping coordinates at the picture edges.
 * NOTE(review): this copy is truncated — the outer y loop, the per-pixel
 * vx/vy stepping and the src_x/src_y/frac derivation are missing below.
 * Do not rewrite from this fragment; restore from upstream dsputil.c.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s= 1<<shift; /* one full pel in fixed-point units */
    for(x=0; x<8; x++){ //XXX FIXME optimize
        int src_x, src_y, frac_x, frac_y, index;
        /* fully inside the picture: plain 2x2 bilinear */
        if((unsigned)src_x < width){
            if((unsigned)src_y < height){
                index= src_x + src_y*stride;
                dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                      + src[index       +1]*   frac_x )*(s-frac_y)
                                   + (  src[index+stride  ]*(s-frac_x)
                                      + src[index+stride+1]*   frac_x )*   frac_y
                /* x inside, y outside: clamp y, interpolate in x only */
                index= src_x + av_clip(src_y, 0, height)*stride;
                dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                      + src[index       +1]*   frac_x )*s
            /* x outside, y inside: clamp x, interpolate in y only */
            if((unsigned)src_y < height){
                index= av_clip(src_x, 0, width) + src_y*stride;
                dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                      + src[index+stride  ]*   frac_y )*s
                /* both outside: clamp both, plain copy */
                index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                dst[y*stride + x]=    src[index         ];
/**
 * Third-pel MC, integer position: plain copy dispatched on block width.
 * (Reconstructed: switch header and closing braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_8_c (dst, src, stride, height); break;
    case 4: put_pixels4_8_c (dst, src, stride, height); break;
    case 8: put_pixels8_8_c (dst, src, stride, height); break;
    case 16:put_pixels16_8_c(dst, src, stride, height); break;
    }
}
/**
 * Third-pel MC, 1/3 horizontal: 683/2048 approximates 1/3 with rounding.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, 2/3 horizontal.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, 1/3 vertical.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (1/3,1/3): bilinear with weights 4/3/3/2 (sum 12),
 * 2731/32768 approximating 1/12.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (1/3,2/3): bilinear with weights 3/2/4/3 (sum 12).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, 2/3 vertical.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (2/3,1/3): bilinear with weights 3/4/2/3 (sum 12).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (2/3,2/3): bilinear with weights 2/3/3/4 (sum 12).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC averaging, integer position: dispatched on block width.
 * (Reconstructed: switch header and closing braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_8_c (dst, src, stride, height); break;
    case 4: avg_pixels4_8_c (dst, src, stride, height); break;
    case 8: avg_pixels8_8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_8_c(dst, src, stride, height); break;
    }
}
/**
 * Third-pel MC, 1/3 horizontal, averaged with the existing dst (rounded).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, 2/3 horizontal, averaged with the existing dst (rounded).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, 1/3 vertical, averaged with the existing dst (rounded).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (1/3,1/3) bilinear (weights 4/3/3/2), averaged with dst.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (1/3,2/3) bilinear (weights 3/2/4/3), averaged with dst.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, 2/3 vertical, averaged with the existing dst (rounded).
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (2/3,1/3) bilinear (weights 3/4/2/3), averaged with dst.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Third-pel MC, (2/3,2/3) bilinear (weights 2/3/3/4), averaged with dst.
 * (Reconstructed: loop counters, row stepping and braces lost to truncation.)
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
728 #define QPEL_MC(r, OPNAME, RND, OP) \
729 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
730 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
734 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
735 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
736 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
737 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
738 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
739 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
740 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
741 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
747 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
749 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
753 const int src0= src[0*srcStride];\
754 const int src1= src[1*srcStride];\
755 const int src2= src[2*srcStride];\
756 const int src3= src[3*srcStride];\
757 const int src4= src[4*srcStride];\
758 const int src5= src[5*srcStride];\
759 const int src6= src[6*srcStride];\
760 const int src7= src[7*srcStride];\
761 const int src8= src[8*srcStride];\
762 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
763 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
764 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
765 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
766 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
767 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
768 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
769 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
775 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
776 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
781 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
782 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
783 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
784 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
785 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
786 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
787 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
788 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
789 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
790 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
791 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
792 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
793 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
794 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
795 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
796 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
802 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
803 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
808 const int src0= src[0*srcStride];\
809 const int src1= src[1*srcStride];\
810 const int src2= src[2*srcStride];\
811 const int src3= src[3*srcStride];\
812 const int src4= src[4*srcStride];\
813 const int src5= src[5*srcStride];\
814 const int src6= src[6*srcStride];\
815 const int src7= src[7*srcStride];\
816 const int src8= src[8*srcStride];\
817 const int src9= src[9*srcStride];\
818 const int src10= src[10*srcStride];\
819 const int src11= src[11*srcStride];\
820 const int src12= src[12*srcStride];\
821 const int src13= src[13*srcStride];\
822 const int src14= src[14*srcStride];\
823 const int src15= src[15*srcStride];\
824 const int src16= src[16*srcStride];\
825 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
826 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
827 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
828 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
829 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
830 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
831 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
832 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
833 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
834 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
835 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
836 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
837 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
838 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
839 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
840 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
846 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
848 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
849 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
852 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
853 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
856 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
858 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
859 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
862 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
865 copy_block9(full, src, 16, stride, 9);\
866 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
867 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
870 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
872 copy_block9(full, src, 16, stride, 9);\
873 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
876 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
879 copy_block9(full, src, 16, stride, 9);\
880 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
881 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
883 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
888 copy_block9(full, src, 16, stride, 9);\
889 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
890 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
891 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
892 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
894 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
898 copy_block9(full, src, 16, stride, 9);\
899 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
900 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
901 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
902 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
904 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
909 copy_block9(full, src, 16, stride, 9);\
910 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
911 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
912 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
913 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
915 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
919 copy_block9(full, src, 16, stride, 9);\
920 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
921 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
922 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
923 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
925 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
930 copy_block9(full, src, 16, stride, 9);\
931 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
932 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
933 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
934 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
936 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
940 copy_block9(full, src, 16, stride, 9);\
941 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
942 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
943 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
944 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
946 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
951 copy_block9(full, src, 16, stride, 9);\
952 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
953 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
954 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
955 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
957 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
961 copy_block9(full, src, 16, stride, 9);\
962 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
963 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
964 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
965 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
967 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
970 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
971 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
972 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
974 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
977 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
978 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
979 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
981 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
986 copy_block9(full, src, 16, stride, 9);\
987 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
988 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
989 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
990 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
992 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
995 copy_block9(full, src, 16, stride, 9);\
996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
997 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
998 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1000 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1001 uint8_t full[16*9];\
1004 uint8_t halfHV[64];\
1005 copy_block9(full, src, 16, stride, 9);\
1006 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1007 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1008 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1009 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1011 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1012 uint8_t full[16*9];\
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1017 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1019 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1021 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1022 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1025 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1027 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1028 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1031 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1032 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1035 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1037 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1038 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1041 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1042 uint8_t full[24*17];\
1044 copy_block17(full, src, 24, stride, 17);\
1045 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1046 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1049 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1050 uint8_t full[24*17];\
1051 copy_block17(full, src, 24, stride, 17);\
1052 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1055 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1056 uint8_t full[24*17];\
1058 copy_block17(full, src, 24, stride, 17);\
1059 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1060 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1062 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1063 uint8_t full[24*17];\
1064 uint8_t halfH[272];\
1065 uint8_t halfV[256];\
1066 uint8_t halfHV[256];\
1067 copy_block17(full, src, 24, stride, 17);\
1068 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1069 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1070 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1071 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1073 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1074 uint8_t full[24*17];\
1075 uint8_t halfH[272];\
1076 uint8_t halfHV[256];\
1077 copy_block17(full, src, 24, stride, 17);\
1078 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1079 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1080 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1081 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1083 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1084 uint8_t full[24*17];\
1085 uint8_t halfH[272];\
1086 uint8_t halfV[256];\
1087 uint8_t halfHV[256];\
1088 copy_block17(full, src, 24, stride, 17);\
1089 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1090 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1091 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1092 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1094 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1095 uint8_t full[24*17];\
1096 uint8_t halfH[272];\
1097 uint8_t halfHV[256];\
1098 copy_block17(full, src, 24, stride, 17);\
1099 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1100 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1101 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1102 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1104 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1105 uint8_t full[24*17];\
1106 uint8_t halfH[272];\
1107 uint8_t halfV[256];\
1108 uint8_t halfHV[256];\
1109 copy_block17(full, src, 24, stride, 17);\
1110 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1111 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1112 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1113 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1115 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1116 uint8_t full[24*17];\
1117 uint8_t halfH[272];\
1118 uint8_t halfHV[256];\
1119 copy_block17(full, src, 24, stride, 17);\
1120 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1121 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1122 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1123 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1125 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1126 uint8_t full[24*17];\
1127 uint8_t halfH[272];\
1128 uint8_t halfV[256];\
1129 uint8_t halfHV[256];\
1130 copy_block17(full, src, 24, stride, 17);\
1131 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1132 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1133 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1134 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1136 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1137 uint8_t full[24*17];\
1138 uint8_t halfH[272];\
1139 uint8_t halfHV[256];\
1140 copy_block17(full, src, 24, stride, 17);\
1141 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1142 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1143 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1144 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1146 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1147 uint8_t halfH[272];\
1148 uint8_t halfHV[256];\
1149 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1150 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1151 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1153 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1154 uint8_t halfH[272];\
1155 uint8_t halfHV[256];\
1156 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1157 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1158 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1160 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1161 uint8_t full[24*17];\
1162 uint8_t halfH[272];\
1163 uint8_t halfV[256];\
1164 uint8_t halfHV[256];\
1165 copy_block17(full, src, 24, stride, 17);\
1166 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1167 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1168 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1169 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1171 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1172 uint8_t full[24*17];\
1173 uint8_t halfH[272];\
1174 copy_block17(full, src, 24, stride, 17);\
1175 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1176 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1177 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1179 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1180 uint8_t full[24*17];\
1181 uint8_t halfH[272];\
1182 uint8_t halfV[256];\
1183 uint8_t halfHV[256];\
1184 copy_block17(full, src, 24, stride, 17);\
1185 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1186 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1187 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1188 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1190 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1191 uint8_t full[24*17];\
1192 uint8_t halfH[272];\
1193 copy_block17(full, src, 24, stride, 17);\
1194 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1195 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1196 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1198 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1199 uint8_t halfH[272];\
1200 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1201 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel store operators used by QPEL_MC above: 'b' is the raw filter sum,
 * cm[] (ff_cropTbl) clamps the rounded, downshifted value to 0..255.
 * The "+16>>5" form rounds to nearest; the no_rnd variants use +15. */
1204 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1205 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1206 #define op_put(a, b) a = cm[((b) + 16)>>5]
1207 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the qpel MC function families: put, put_no_rnd and avg. */
1209 QPEL_MC(0, put_ , _ , op_put)
1210 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1211 QPEL_MC(0, avg_ , _ , op_avg)
1212 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1214 #undef op_avg_no_rnd
1216 #undef op_put_no_rnd
/* The mc00 (integer-pel) cases need no filtering — alias them to the
 * plain block copy/average helpers. */
1218 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1219 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1220 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1221 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1222 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1223 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
/* WMV2 horizontal half-pel lowpass: 4-tap filter (-1, 9, 9, -1)/16 with
 * rounding (+8 >> 4), clamped via the crop table.
 * NOTE(review): the per-row loop header/footer lines appear elided in this
 * excerpt; the visible statements compute one 8-pixel row. */
1225 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1226 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1230 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1231 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1232 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1233 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1234 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1235 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1236 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1237 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1243 #if CONFIG_RV40_DECODER
/* RV40 (3,3) quarter-pel position: the spec maps it to the plain
 * half-pel xy2 average, so these simply forward to pixels*_xy2 helpers. */
1244 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1245 put_pixels16_xy2_8_c(dst, src, stride, 16);
1247 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1248 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1250 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1251 put_pixels8_xy2_8_c(dst, src, stride, 8);
1253 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1254 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1256 #endif /* CONFIG_RV40_DECODER */
/* WMV2 vertical half-pel lowpass: same (-1, 9, 9, -1)/16 filter as the
 * horizontal variant, applied down a column. One source column is loaded
 * into locals (src_1 .. src9) and eight filtered outputs are written.
 * NOTE(review): the per-column loop lines appear elided in this excerpt. */
1258 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1259 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1263 const int src_1= src[ -srcStride];
1264 const int src0 = src[0 ];
1265 const int src1 = src[ srcStride];
1266 const int src2 = src[2*srcStride];
1267 const int src3 = src[3*srcStride];
1268 const int src4 = src[4*srcStride];
1269 const int src5 = src[5*srcStride];
1270 const int src6 = src[6*srcStride];
1271 const int src7 = src[7*srcStride];
1272 const int src8 = src[8*srcStride];
1273 const int src9 = src[9*srcStride];
1274 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1275 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1276 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1277 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1278 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1279 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1280 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1281 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation positions. Naming: mcXY means X = horizontal
 * and Y = vertical sub-pel phase (0 = full pel, 2 = half pel; 1/3 combine a
 * half-pel filter result with the nearest full-pel via pixels8_l2 averaging).
 * NOTE(review): local buffer declarations (half/halfH/halfV/halfHV) and
 * closing braces appear elided in this excerpt. */
1287 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1289 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1290 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1293 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1294 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1297 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1299 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1300 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1303 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1304 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1307 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
/* h filter over 11 rows (one above, two below) so the v filter has context */
1311 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1312 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1313 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1314 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1316 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1320 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1321 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1322 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1323 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1325 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1327 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1328 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 in-loop deblocking across a horizontal block edge (vertical filter).
 * For each column x: p0,p1 | p2,p3 straddle the edge; d is the gradient
 * estimate, d1 is the strength-clipped correction applied to p1/p2, and d2
 * is a smaller correction (clipped to |d1|) applied to the outer p0/p3.
 * Compiled out unless an H.263 codec is enabled.
 * NOTE(review): the column loop, d1->p1/p2 application and several
 * declarations appear elided in this excerpt. */
1331 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1332 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1334 const int strength= ff_h263_loop_filter_strength[qscale];
1338 int p0= src[x-2*stride];
1339 int p1= src[x-1*stride];
1340 int p2= src[x+0*stride];
1341 int p3= src[x+1*stride];
1342 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* piecewise-linear clipping of d against the qscale-derived strength */
1344 if (d<-2*strength) d1= 0;
1345 else if(d<- strength) d1=-2*strength - d;
1346 else if(d< strength) d1= d;
1347 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clamp of p1/p2 to 0..255 after applying the correction */
1352 if(p1&256) p1= ~(p1>>31);
1353 if(p2&256) p2= ~(p2>>31);
1355 src[x-1*stride] = p1;
1356 src[x+0*stride] = p2;
1360 d2= av_clip((p0-p3)/4, -ad1, ad1);
1362 src[x-2*stride] = p0 - d2;
1363 src[x+ stride] = p3 + d2;
/* H.263 in-loop deblocking across a vertical block edge (horizontal filter).
 * Mirror of h263_v_loop_filter_c with the pixel fetches transposed
 * (row y, offsets -2..+1 along the row instead of along the column).
 * NOTE(review): the row loop and several intermediate lines appear elided
 * in this excerpt. */
1368 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1369 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1371 const int strength= ff_h263_loop_filter_strength[qscale];
1375 int p0= src[y*stride-2];
1376 int p1= src[y*stride-1];
1377 int p2= src[y*stride+0];
1378 int p3= src[y*stride+1];
1379 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1381 if (d<-2*strength) d1= 0;
1382 else if(d<- strength) d1=-2*strength - d;
1383 else if(d< strength) d1= d;
1384 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clamp of p1/p2 to 0..255 after applying the correction */
1389 if(p1&256) p1= ~(p1>>31);
1390 if(p2&256) p2= ~(p2>>31);
1392 src[y*stride-1] = p1;
1393 src[y*stride+0] = p2;
1397 d2= av_clip((p0-p3)/4, -ad1, ad1);
1399 src[y*stride-2] = p0 - d2;
1400 src[y*stride+1] = p3 + d2;
/* H.261 loop filter: separable (1,2,1)/4 smoothing of an 8x8 block via a
 * temp[] working array scaled by 4; border rows/columns are copied with
 * rounding only ((x+2)>>2) while interior pixels get the full 2-D filter
 * ((...+8)>>4).
 * NOTE(review): the loop headers and temp[] declaration appear elided in
 * this excerpt — confirm against the full file before relying on bounds. */
1405 static void h261_loop_filter_c(uint8_t *src, int stride){
1410 temp[x ] = 4*src[x ];
1411 temp[x + 7*8] = 4*src[x + 7*stride];
1415 xy = y * stride + x;
1417 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1422 src[ y*stride] = (temp[ y*8] + 2)>>2;
1423 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1425 xy = y * stride + x;
1427 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* SAD of a 16-wide block: sum of absolute differences between pix1 and pix2,
 * one unrolled row of 16 per iteration, h rows total.
 * NOTE(review): accumulator init, the row loop and pointer advances appear
 * elided in this excerpt. */
1432 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1438 s += abs(pix1[0] - pix2[0]);
1439 s += abs(pix1[1] - pix2[1]);
1440 s += abs(pix1[2] - pix2[2]);
1441 s += abs(pix1[3] - pix2[3]);
1442 s += abs(pix1[4] - pix2[4]);
1443 s += abs(pix1[5] - pix2[5]);
1444 s += abs(pix1[6] - pix2[6]);
1445 s += abs(pix1[7] - pix2[7]);
1446 s += abs(pix1[8] - pix2[8]);
1447 s += abs(pix1[9] - pix2[9]);
1448 s += abs(pix1[10] - pix2[10]);
1449 s += abs(pix1[11] - pix2[11]);
1450 s += abs(pix1[12] - pix2[12]);
1451 s += abs(pix1[13] - pix2[13]);
1452 s += abs(pix1[14] - pix2[14]);
1453 s += abs(pix1[15] - pix2[15]);
/* SAD vs. the horizontal half-pel interpolation of pix2: each reference
 * sample is avg2() of two horizontally adjacent pixels (reads pix2[16]). */
1460 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1466 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1467 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1468 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1469 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1470 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1471 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1472 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1473 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1474 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1475 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1476 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1477 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1478 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1479 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1480 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1481 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD vs. the vertical half-pel interpolation of pix2: each reference
 * sample is avg2() of vertically adjacent pixels (pix3 = next row). */
1488 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1491 uint8_t *pix3 = pix2 + line_size;
1495 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1496 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1497 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1498 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1499 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1500 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1501 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1502 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1503 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1504 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1505 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1506 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1507 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1508 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1509 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1510 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD vs. the diagonal half-pel interpolation of pix2: each reference
 * sample is avg4() of a 2x2 neighbourhood (reads one extra column/row). */
1518 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1521 uint8_t *pix3 = pix2 + line_size;
1525 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1526 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1527 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1528 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1529 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1530 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1531 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1532 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1533 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1534 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1535 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1536 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1537 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1538 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1539 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1540 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD — same pattern as pix_abs16_c with an 8-pixel row. */
1548 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1554 s += abs(pix1[0] - pix2[0]);
1555 s += abs(pix1[1] - pix2[1]);
1556 s += abs(pix1[2] - pix2[2]);
1557 s += abs(pix1[3] - pix2[3]);
1558 s += abs(pix1[4] - pix2[4]);
1559 s += abs(pix1[5] - pix2[5]);
1560 s += abs(pix1[6] - pix2[6]);
1561 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD vs. horizontal half-pel reference (avg2 of adjacent pixels). */
1568 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1574 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1575 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1576 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1577 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1578 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1579 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1580 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1581 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD vs. vertical half-pel reference (avg2 with the next row). */
1588 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1591 uint8_t *pix3 = pix2 + line_size;
1595 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1596 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1597 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1598 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1599 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1600 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1601 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1602 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD vs. diagonal half-pel reference (avg4 of a 2x2 window). */
1610 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1613 uint8_t *pix3 = pix2 + line_size;
1617 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1618 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1619 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1620 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1621 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1622 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1623 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1624 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16 wide: score1 is plain SSE; score2 is the
 * difference in local 2x2 gradient energy between the two blocks, weighted
 * by avctx->nsse_weight (default 8 when no context is supplied). */
1632 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1633 MpegEncContext *c = v;
1639 for(x=0; x<16; x++){
1640 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1643 for(x=0; x<15; x++){
1644 score2+= FFABS( s1[x ] - s1[x +stride]
1645 - s1[x+1] + s1[x+1+stride])
1646 -FFABS( s2[x ] - s2[x +stride]
1647 - s2[x+1] + s2[x+1+stride]);
1654 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1655 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c; identical scoring with narrower rows.
 * NOTE(review): the x-loop headers appear elided in this excerpt. */
1658 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1659 MpegEncContext *c = v;
1666 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1670 score2+= FFABS( s1[x ] - s1[x +stride]
1671 - s1[x+1] + s1[x+1+stride])
1672 -FFABS( s2[x ] - s2[x +stride]
1673 - s2[x+1] + s2[x+1+stride]);
1680 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1681 else return score1 + FFABS(score2)*8;
/* Trellis helpers: try_8x8basis_c evaluates the weighted squared error of
 * adding 'basis' scaled by 'scale' to the residual 'rem' (fixed point,
 * BASIS_SHIFT/RECON_SHIFT); add_8x8basis_c applies that same scaled basis
 * in place. The rounding term (1<<(BASIS_SHIFT-RECON_SHIFT-1)) is identical
 * in both so try/add stay consistent. */
1684 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1688 for(i=0; i<8*8; i++){
1689 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1692 assert(-512<b && b<512);
1694 sum += (w*b)*(w*b)>>4;
1699 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1702 for(i=0; i<8*8; i++){
1703 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1708 * Permute an 8x8 block.
1709 * @param block the block which will be permuted according to the given permutation vector
1710 * @param permutation the permutation vector
1711 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1712 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1713 * (inverse) permutated to scantable order!
1715 void ff_block_permute(int16_t *block, uint8_t *permutation, const uint8_t *scantable, int last)
1721 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* first pass: stash the coefficients (in scantable order) into temp[] */
1723 for(i=0; i<=last; i++){
1724 const int j= scantable[i];
/* second pass: write them back through the permutation vector */
1729 for(i=0; i<=last; i++){
1730 const int j= scantable[i];
1731 const int perm_j= permutation[j];
1732 block[perm_j]= temp[j];
/* zero_cmp: trivial comparator that ignores its inputs (body elided here). */
1736 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* ff_set_cmp: fill the cmp[0..5] function-pointer array from the DSPContext
 * according to the requested FF_CMP_* 'type'. Only some switch arms are
 * visible in this excerpt; unknown types fall through to the error log. */
1740 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1743 memset(cmp, 0, sizeof(void*)*6);
1751 cmp[i]= c->hadamard8_diff[i];
1757 cmp[i]= c->dct_sad[i];
1760 cmp[i]= c->dct264_sad[i];
1763 cmp[i]= c->dct_max[i];
1766 cmp[i]= c->quant_psnr[i];
1787 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Byte-wise dst[i] += src[i] over w bytes. The main loop adds sizeof(long)
 * bytes at a time using the pb_7f/pb_80 masks (SWAR: low 7 bits added
 * directly, carry into bit 7 reconstructed via XOR); the tail loop handles
 * the remaining bytes one at a time. */
1792 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1794 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1795 long a = *(long*)(src+i);
1796 long b = *(long*)(dst+i);
1797 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1800 dst[i+0] += src[i+0];
/* Byte-wise dst[i] = src1[i] - src2[i] over w bytes. On targets without
 * fast unaligned loads, a plain unrolled byte loop is used when src2 is
 * misaligned; otherwise the SWAR word loop subtracts sizeof(long) bytes at
 * a time (borrow handled through the pb_7f/pb_80 masks), with a scalar
 * tail for the leftover bytes. */
1803 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1805 #if !HAVE_FAST_UNALIGNED
1806 if((long)src2 & (sizeof(long)-1)){
1807 for(i=0; i+7<w; i+=8){
1808 dst[i+0] = src1[i+0]-src2[i+0];
1809 dst[i+1] = src1[i+1]-src2[i+1];
1810 dst[i+2] = src1[i+2]-src2[i+2];
1811 dst[i+3] = src1[i+3]-src2[i+3];
1812 dst[i+4] = src1[i+4]-src2[i+4];
1813 dst[i+5] = src1[i+5]-src2[i+5];
1814 dst[i+6] = src1[i+6]-src2[i+6];
1815 dst[i+7] = src1[i+7]-src2[i+7];
1819 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1820 long a = *(long*)(src1+i);
1821 long b = *(long*)(src2+i);
1822 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1825 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction (add/sub pair): the predictor is
 * mid_pred(left, top, left + top - topleft) masked to a byte; 'left' and
 * 'left_top' carry state across calls via pointers.
 * NOTE(review): most of these function bodies are elided in this excerpt;
 * only the core prediction lines are visible. */
1828 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1836 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1845 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1853 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
1863 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1866 for(i=0; i<w-1; i++){
1893 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard building blocks used by the hadamard8_* metrics below. */
1923 #define BUTTERFLY2(o1,o2,i1,i2) \
1927 #define BUTTERFLY1(x,y) \
1936 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of the 8x8 difference (src - dst): an 8-point Hadamard transform is
 * applied first along each row (BUTTERFLY2/BUTTERFLY1 stages), then along
 * each column, and the sum of absolute transform coefficients is returned
 * via BUTTERFLYA in the final column stage. */
1938 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1946 //FIXME try pointer walks
/* row transform: stage 1 on pixel differences */
1947 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
1948 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
1949 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
1950 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
1952 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
1953 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
1954 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
1955 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
1957 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
1958 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
1959 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
1960 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* column transform, accumulating |coeff| in the last stage */
1964 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
1965 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
1966 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
1967 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
1969 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
1970 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
1971 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
1972 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
1975 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
1976 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
1977 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
1978 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: same Hadamard pipeline as hadamard8_diff8x8_c but applied to
 * the source pixels directly ('dummy' is unused); the DC term (mean) is
 * subtracted from the total at the end. */
1983 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
1991 //FIXME try pointer walks
1992 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
1993 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
1994 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
1995 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
1997 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
1998 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
1999 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2000 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2002 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2003 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2004 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2005 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2009 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2010 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2011 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2012 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2014 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2015 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2016 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2017 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2020 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2021 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2022 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2023 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2026 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forms the pixel difference, transforms it (the fdct call
 * between diff_pixels and sum_abs_dctelem is elided in this excerpt) and
 * returns the sum of absolute DCT coefficients. */
2031 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2032 MpegEncContext * const s= (MpegEncContext *)c;
2033 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2037 s->dsp.diff_pixels(temp, src1, src2, stride);
2039 return s->dsp.sum_abs_dctelem(temp);
/* Body of the DCT8_1D macro (its #define line is elided in this excerpt):
 * the H.264-style integer 8-point DCT. s/d pairs are the symmetric and
 * antisymmetric sums of mirrored inputs; a0..a7 are the butterfly
 * intermediates; the shifts (>>1, >>2) implement the fixed-point
 * coefficient scaling. Reads via SRC(x), writes via DST(x, v). */
2044 const int s07 = SRC(0) + SRC(7);\
2045 const int s16 = SRC(1) + SRC(6);\
2046 const int s25 = SRC(2) + SRC(5);\
2047 const int s34 = SRC(3) + SRC(4);\
2048 const int a0 = s07 + s34;\
2049 const int a1 = s16 + s25;\
2050 const int a2 = s07 - s34;\
2051 const int a3 = s16 - s25;\
2052 const int d07 = SRC(0) - SRC(7);\
2053 const int d16 = SRC(1) - SRC(6);\
2054 const int d25 = SRC(2) - SRC(5);\
2055 const int d34 = SRC(3) - SRC(4);\
2056 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2057 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2058 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2059 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2061 DST(1, a4 + (a7>>2)) ;\
2062 DST(2, a2 + (a3>>1)) ;\
2063 DST(3, a5 + (a6>>2)) ;\
2065 DST(5, a6 - (a5>>2)) ;\
2066 DST(6, (a2>>1) - a3 ) ;\
2067 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: difference block is run through DCT8_1D over rows
 * (SRC/DST redefined as row access) and then over columns, where DST is
 * redefined to accumulate |coefficient| into sum instead of storing. */
2070 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2071 MpegEncContext * const s= (MpegEncContext *)c;
2076 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2078 #define SRC(x) dct[i][x]
2079 #define DST(x,v) dct[i][x]= v
2080 for( i = 0; i < 8; i++ )
2085 #define SRC(x) dct[x][i]
2086 #define DST(x,v) sum += FFABS(v)
2087 for( i = 0; i < 8; i++ )
/* DCT-max metric: like dct_sad8x8_c but returns the largest absolute DCT
 * coefficient of the difference block (fdct call elided in this excerpt). */
2095 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2096 MpegEncContext * const s= (MpegEncContext *)c;
2097 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2102 s->dsp.diff_pixels(temp, src1, src2, stride);
2106 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-PSNR metric: DCT the difference, keep a copy in bak[],
 * quantize + dequantize + IDCT (round trip through the codec's own
 * quantizer), then return the squared error between the round-tripped and
 * original coefficients — i.e. the distortion the quantizer itself adds. */
2111 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2112 MpegEncContext * const s= (MpegEncContext *)c;
2113 LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2114 int16_t * const bak = temp+64;
2120 s->dsp.diff_pixels(temp, src1, src2, stride);
2122 memcpy(bak, temp, 64*sizeof(int16_t));
2124 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2125 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2126 ff_simple_idct_8(temp); //FIXME
2129 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for one 8x8 block: quantizes the DCT of the
 * difference, counts the VLC bits needed to code the coefficients
 * (run/level pairs in scantable order, escape cost for out-of-range
 * levels), reconstructs the block (unquantize + idct_add) and returns
 * distortion + lambda-scaled rate (the 109/128 * qscale^2 factor). */
2134 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2135 MpegEncContext * const s= (MpegEncContext *)c;
2136 const uint8_t *scantable= s->intra_scantable.permutated;
2137 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2138 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2139 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2140 int i, last, run, bits, level, distortion, start_i;
2141 const int esc_length= s->ac_esc_length;
2143 uint8_t * last_length;
/* work on local 8x8 copies so idct_add below can reconstruct in place */
2147 copy_block8(lsrc1, src1, 8, stride, 8);
2148 copy_block8(lsrc2, src2, 8, stride, 8);
2150 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2152 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* pick intra/inter VLC tables; intra additionally codes the DC term */
2158 length = s->intra_ac_vlc_length;
2159 last_length= s->intra_ac_vlc_last_length;
2160 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2163 length = s->inter_ac_vlc_length;
2164 last_length= s->inter_ac_vlc_last_length;
2169 for(i=start_i; i<last; i++){
2170 int j= scantable[i];
/* level in -64..63 fits the combined run/level table; else escape cost */
2175 if((level&(~127)) == 0){
2176 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2185 level= temp[i] + 64;
2189 if((level&(~127)) == 0){
2190 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2198 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2200 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2203 s->dsp.idct_add(lsrc2, 8, temp);
2205 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2207 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Rate-only metric: same bit-counting pass as rd8x8_c (quantize, walk the
 * scantable, sum VLC/escape lengths) but without reconstruction or a
 * distortion term — returns just the estimated bit cost. */
2210 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2211 MpegEncContext * const s= (MpegEncContext *)c;
2212 const uint8_t *scantable= s->intra_scantable.permutated;
2213 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2214 int i, last, run, bits, level, start_i;
2215 const int esc_length= s->ac_esc_length;
2217 uint8_t * last_length;
2221 s->dsp.diff_pixels(temp, src1, src2, stride);
2223 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2229 length = s->intra_ac_vlc_length;
2230 last_length= s->intra_ac_vlc_last_length;
2231 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2234 length = s->inter_ac_vlc_length;
2235 last_length= s->inter_ac_vlc_last_length;
2240 for(i=start_i; i<last; i++){
2241 int j= scantable[i];
2246 if((level&(~127)) == 0){
2247 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2256 level= temp[i] + 64;
2260 if((level&(~127)) == 0){
2261 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-SAD metrics: sum of absolute differences between each row and
 * the row above. The _intra variants measure that within one block (s),
 * vsad16_c measures it on the residual s1 - s2. */
2269 #define VSAD_INTRA(size) \
2270 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2274 for(y=1; y<h; y++){ \
2275 for(x=0; x<size; x+=4){ \
2276 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2277 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
2287 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2292 for(x=0; x<16; x++){
2293 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Vertical-SSE metrics: like VSAD but squaring the row-to-row differences
 * (SQ) instead of taking absolute values. */
2302 #define SQ(a) ((a)*(a))
2303 #define VSSE_INTRA(size) \
2304 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2308 for(y=1; y<h; y++){ \
2309 for(x=0; x<size; x+=4){ \
2310 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2311 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
2321 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2326 for(x=0; x<16; x++){
2327 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 and an int16 vector of
 * 'size' elements. */
2336 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2340 for(i=0; i<size; i++)
2341 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Build 16x16 versions of the 8x8 metrics by summing the four 8x8
 * quadrants (WRAPPER8_16_SQ is defined elsewhere in this file). */
2345 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2346 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2347 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2349 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2351 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2352 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2353 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2354 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Float clipping done on the raw IEEE-754 bit patterns, valid only when
 * min < 0 < max (opposite signs): a float compared as its uint32 pattern
 * orders correctly within one sign, and maxisign (max with the sign bit
 * flipped) lets one unsigned compare handle the negative side. */
2356 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2357 uint32_t maxi, uint32_t maxisign)
2360 if(a > mini) return mini;
2361 else if((a^(1U<<31)) > maxisign) return maxi;
/* Bulk driver: processes len floats, 8 per iteration (len is assumed to be
 * a multiple of 8 — TODO confirm against callers). */
2365 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2367 uint32_t mini = *(uint32_t*)min;
2368 uint32_t maxi = *(uint32_t*)max;
2369 uint32_t maxisign = maxi ^ (1U<<31);
2370 uint32_t *dsti = (uint32_t*)dst;
2371 const uint32_t *srci = (const uint32_t*)src;
2372 for(i=0; i<len; i+=8) {
2373 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2374 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2375 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2376 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2377 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2378 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2379 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2380 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip each element of src into [min, max] and store to dst.
 * Dispatches to the bit-pattern fast path when the range straddles zero;
 * otherwise clips with av_clipf, manually unrolled 8x (len is presumably a
 * multiple of 8 — verify against DSPContext callers).
 * NOTE(review): truncated — the `int i;` declaration, the else branch
 * bracketing and closing braces are not visible in this chunk. */
2383 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2385     if(min < 0 && max > 0) {
2386         vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2388         for(i=0; i < len; i+=8) {
2389             dst[i    ] = av_clipf(src[i    ], min, max);
2390             dst[i + 1] = av_clipf(src[i + 1], min, max);
2391             dst[i + 2] = av_clipf(src[i + 2], min, max);
2392             dst[i + 3] = av_clipf(src[i + 3], min, max);
2393             dst[i + 4] = av_clipf(src[i + 4], min, max);
2394             dst[i + 5] = av_clipf(src[i + 5], min, max);
2395             dst[i + 6] = av_clipf(src[i + 6], min, max);
2396             dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Dot product of two int16 vectors of length `order`, accumulated in int32.
 * NOTE(review): truncated — loop header, accumulator declaration and return
 * are not visible in this chunk. */
2401 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2406         res += *v1++ * *v2++;
/* Fused op: returns the dot product of v1 and v2 while also doing
 * v1 += mul * v3 element-wise (both over `order` elements).
 * NOTE(review): truncated — only the madd half of the loop body is visible;
 * the dot-product accumulation and return are elided from this chunk. */
2411 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2416         *v1++ += mul * *v3++;
/* Apply a symmetric int16 window to an int16 signal: the first half of the
 * window is mirrored onto the second half of the input, each product scaled
 * by Q15 rounding ((x*w + 2^14) >> 15).
 * NOTE(review): truncated — the `int i;` declaration and the closing braces
 * are not visible in this chunk. */
2421 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2422                                  const int16_t *window, unsigned int len)
2425     int len2 = len >> 1;
2427     for (i = 0; i < len2; i++) {
2428         int16_t w = window[i];
2429         output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2430         output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/* Clip each int32 of src into [min, max], storing to dst; the body is
 * manually unrolled 8x (the surrounding `do { } while` loop is elided from
 * this chunk), so len is presumably a multiple of 8 — verify against the
 * DSPContext contract. */
2434 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2435                                 int32_t max, unsigned int len)
2438         *dst++ = av_clip(*src++, min, max);
2439         *dst++ = av_clip(*src++, min, max);
2440         *dst++ = av_clip(*src++, min, max);
2441         *dst++ = av_clip(*src++, min, max);
2442         *dst++ = av_clip(*src++, min, max);
2443         *dst++ = av_clip(*src++, min, max);
2444         *dst++ = av_clip(*src++, min, max);
2445         *dst++ = av_clip(*src++, min, max);
/* JPEG-reference IDCT + put: inverse-transform the block in place, then write
 * the clamped result to dest. */
2450 static void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2452     ff_j_rev_dct (block);
2453     put_pixels_clamped_c(block, dest, line_size);
/* JPEG-reference IDCT + add: inverse-transform the block in place, then add
 * the result onto dest with clamping. */
2455 static void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2457     ff_j_rev_dct (block);
2458     add_pixels_clamped_c(block, dest, line_size);
2461 /* init static data */
/* One-time initialization of the global lookup tables:
 *  - ff_cropTbl: clamps an int in [-MAX_NEG_CROP, 255+MAX_NEG_CROP] to [0,255]
 *    (NOTE(review): the line zeroing the low MAX_NEG_CROP entries is elided
 *    from this chunk);
 *  - ff_squareTbl: (i-256)^2 for fast squared-difference lookups;
 *  - ff_inv_zigzag_direct16: inverse zigzag permutation, stored 1-based. */
2462 av_cold void ff_dsputil_static_init(void)
2466     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2467     for(i=0;i<MAX_NEG_CROP;i++) {
2469         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2472     for(i=0;i<512;i++) {
2473         ff_squareTbl[i] = (i - 256) * (i - 256);
2476     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Sanity check that the compiler honors 16-byte stack alignment (required by
 * the MMX/AltiVec code paths). Logs a one-time warning if a LOCAL_ALIGNED_16
 * variable lands misaligned; `did_fail` presumably suppresses repeat logging
 * (the branch using it is elided from this chunk).
 * NOTE(review): truncated — return statements and closing braces missing. */
2479 int ff_check_alignment(void){
2480     static int did_fail=0;
2481     LOCAL_ALIGNED_16(int, aligned, [4]);
2483     if((intptr_t)aligned & 15){
2485 #if HAVE_MMX || HAVE_ALTIVEC
2486             av_log(NULL, AV_LOG_ERROR,
2487                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2488                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2489                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2490                 "Do not report crashes to Libav developers.\n");
/* Populate a DSPContext with the C reference implementations, selected by the
 * codec's bit depth and the DCT/IDCT algorithm requested in avctx, then let
 * the per-architecture init functions override entries with optimized
 * versions.
 * NOTE(review): this chunk is heavily elided — many branches, braces and the
 * function's closing lines are not visible here; comments below describe only
 * what the visible lines establish. */
2499 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2501     ff_check_alignment();
/* --- Encoder forward-DCT selection (inside a CONFIG_ENCODERS region whose
 *     opening #if is elided): 10-bit forces islow_10; otherwise the requested
 *     dct_algo picks ifast / FAAN / islow_8 (accurate default). */
2504     if (avctx->bits_per_raw_sample == 10) {
2505         c->fdct    = ff_jpeg_fdct_islow_10;
2506         c->fdct248 = ff_fdct248_islow_10;
2508         if(avctx->dct_algo==FF_DCT_FASTINT) {
2509             c->fdct    = ff_fdct_ifast;
2510             c->fdct248 = ff_fdct_ifast248;
2512         else if(avctx->dct_algo==FF_DCT_FAAN) {
2513             c->fdct    = ff_faandct;
2514             c->fdct248 = ff_faandct248;
2517             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2518             c->fdct248 = ff_fdct248_islow_8;
2521 #endif //CONFIG_ENCODERS
/* --- IDCT selection: 10-bit uses simple_idct_10; otherwise idct_algo picks
 *     the JPEG reference (libmpeg2 permutation), FAAN, or simple_idct_8. */
2523     if (avctx->bits_per_raw_sample == 10) {
2524         c->idct_put              = ff_simple_idct_put_10;
2525         c->idct_add              = ff_simple_idct_add_10;
2526         c->idct                  = ff_simple_idct_10;
2527         c->idct_permutation_type = FF_NO_IDCT_PERM;
2529         if(avctx->idct_algo==FF_IDCT_INT){
2530             c->idct_put= ff_jref_idct_put;
2531             c->idct_add= ff_jref_idct_add;
2532             c->idct    = ff_j_rev_dct;
2533             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2534         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2535             c->idct_put= ff_faanidct_put;
2536             c->idct_add= ff_faanidct_add;
2537             c->idct    = ff_faanidct;
2538             c->idct_permutation_type= FF_NO_IDCT_PERM;
2539         }else{ //accurate/default
2540             c->idct_put = ff_simple_idct_put_8;
2541             c->idct_add = ff_simple_idct_add_8;
2542             c->idct     = ff_simple_idct_8;
2543             c->idct_permutation_type= FF_NO_IDCT_PERM;
/* --- Pixel/block helpers. */
2547     c->diff_pixels = diff_pixels_c;
2548     c->put_pixels_clamped = put_pixels_clamped_c;
2549     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2550     c->add_pixels_clamped = add_pixels_clamped_c;
2551     c->sum_abs_dctelem = sum_abs_dctelem_c;
2554     c->pix_sum = pix_sum_c;
2555     c->pix_norm1 = pix_norm1_c;
2557     c->fill_block_tab[0] = fill_block16_c;
2558     c->fill_block_tab[1] = fill_block8_c;
/* --- SAD tables: index [0] = 16-wide, [1] = 8-wide; sub-index selects the
 *     half-pel interpolation (none / x2 / y2 / xy2). */
2560     /* TODO [0] 16  [1] 8 */
2561     c->pix_abs[0][0] = pix_abs16_c;
2562     c->pix_abs[0][1] = pix_abs16_x2_c;
2563     c->pix_abs[0][2] = pix_abs16_y2_c;
2564     c->pix_abs[0][3] = pix_abs16_xy2_c;
2565     c->pix_abs[1][0] = pix_abs8_c;
2566     c->pix_abs[1][1] = pix_abs8_x2_c;
2567     c->pix_abs[1][2] = pix_abs8_y2_c;
2568     c->pix_abs[1][3] = pix_abs8_xy2_c;
/* --- Third-pel motion compensation tables (SVQ3-style mcXY naming). */
2570     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2571     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2572     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2573     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2574     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2575     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2576     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2577     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2578     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2580     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2581     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2582     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2583     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2584     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2585     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2586     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2587     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2588     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* Fill all 16 quarter-pel MC slots (mc00..mc33) of a pixels_tab row. */
2590 #define dspfunc(PFX, IDX, NUM) \
2591     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2592     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2593     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2594     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2595     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2596     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2597     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2598     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2599     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2600     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2601     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2602     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2603     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2604     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2605     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2606     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2608     dspfunc(put_qpel, 0, 16);
2609     dspfunc(put_no_rnd_qpel, 0, 16);
2611     dspfunc(avg_qpel, 0, 16);
2612     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2614     dspfunc(put_qpel, 1, 8);
2615     dspfunc(put_no_rnd_qpel, 1, 8);
2617     dspfunc(avg_qpel, 1, 8);
2618     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* --- WMV2 mspel (mixed-pel) table; slot 0 reuses the plain 8x8 copy. */
2622     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2623     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2624     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2625     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2626     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2627     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2628     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2629     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* Install the 16-wide and 8x8 variants of a comparison function pair. */
2631 #define SET_CMP_FUNC(name) \
2632     c->name[0]= name ## 16_c;\
2633     c->name[1]= name ## 8x8_c;
2635     SET_CMP_FUNC(hadamard8_diff)
2636     c->hadamard8_diff[4]= hadamard8_intra16_c;
2637     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2638     SET_CMP_FUNC(dct_sad)
2639     SET_CMP_FUNC(dct_max)
2641     SET_CMP_FUNC(dct264_sad)
2643     c->sad[0]= pix_abs16_c;
2644     c->sad[1]= pix_abs8_c;
2648     SET_CMP_FUNC(quant_psnr)
2651     c->vsad[0]= vsad16_c;
2652     c->vsad[4]= vsad_intra16_c;
2653     c->vsad[5]= vsad_intra8_c;
2654     c->vsse[0]= vsse16_c;
2655     c->vsse[4]= vsse_intra16_c;
2656     c->vsse[5]= vsse_intra8_c;
2657     c->nsse[0]= nsse16_c;
2658     c->nsse[1]= nsse8_c;
2660     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* --- HuffYUV / lossless prediction helpers and byte-swapping. */
2662     c->add_bytes= add_bytes_c;
2663     c->diff_bytes= diff_bytes_c;
2664     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2665     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2666     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2667     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2668     c->bswap_buf= bswap_buf;
2669     c->bswap16_buf = bswap16_buf;
2671     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2672         c->h263_h_loop_filter= h263_h_loop_filter_c;
2673         c->h263_v_loop_filter= h263_v_loop_filter_c;
2676     c->h261_loop_filter= h261_loop_filter_c;
2678     c->try_8x8basis= try_8x8basis_c;
2679     c->add_8x8basis= add_8x8basis_c;
/* --- Generic vector ops (also used by audio codecs). */
2681     c->vector_clipf = vector_clipf_c;
2682     c->scalarproduct_int16 = scalarproduct_int16_c;
2683     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2684     c->apply_window_int16 = apply_window_int16_c;
2685     c->vector_clip_int32 = vector_clip_int32_c;
/* --- Plane shrinkers: index n halves each dimension n times (0 = copy). */
2687     c->shrink[0]= av_image_copy_plane;
2688     c->shrink[1]= ff_shrink22;
2689     c->shrink[2]= ff_shrink44;
2690     c->shrink[3]= ff_shrink88;
/* Install the four half-pel variants (copy / x2 / y2 / xy2) for one
 * block-size row of an 8-bit pixels_tab. */
2692 #define hpel_funcs(prefix, idx, num) \
2693     c->prefix ## _pixels_tab idx [0] = prefix ## _pixels ## num ## _8_c; \
2694     c->prefix ## _pixels_tab idx [1] = prefix ## _pixels ## num ## _x2_8_c; \
2695     c->prefix ## _pixels_tab idx [2] = prefix ## _pixels ## num ## _y2_8_c; \
2696     c->prefix ## _pixels_tab idx [3] = prefix ## _pixels ## num ## _xy2_8_c
2698     hpel_funcs(put, [0],  16);
2699     hpel_funcs(put, [1],  8);
2700     hpel_funcs(put, [2],  4);
2701     hpel_funcs(put, [3],  2);
2702     hpel_funcs(put_no_rnd, [0], 16);
2703     hpel_funcs(put_no_rnd, [1],  8);
2704     hpel_funcs(avg, [0], 16);
2705     hpel_funcs(avg, [1],  8);
2706     hpel_funcs(avg, [2],  4);
2707     hpel_funcs(avg, [3],  2);
2708     hpel_funcs(avg_no_rnd,[0], 16);
/* --- Bit-depth-templated functions: FUNC/FUNCC paste the depth suffix onto
 *     names generated by the dsputil_template.c inclusions at the top of the
 *     file. */
2712 #define FUNC(f, depth) f ## _ ## depth
2713 #define FUNCC(f, depth) f ## _ ## depth ## _c
2715 #define dspfunc2(PFX, IDX, NUM, depth)\
2716     c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
2717     c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
2718     c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
2719     c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
2720     c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
2721     c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
2722     c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
2723     c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
2724     c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
2725     c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
2726     c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
2727     c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
2728     c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
2729     c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
2730     c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
2731     c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
/* Install every depth-dependent entry point (pixel access, H.264 chroma MC
 * and qpel MC) for the given bit depth and DCT coefficient width. */
2733 #define BIT_DEPTH_FUNCS(depth, dct)\
2734     c->get_pixels                    = FUNCC(get_pixels ## dct   , depth);\
2735     c->draw_edges                    = FUNCC(draw_edges          , depth);\
2736     c->clear_block                   = FUNCC(clear_block ## dct  , depth);\
2737     c->clear_blocks                  = FUNCC(clear_blocks ## dct , depth);\
2738     c->add_pixels8                   = FUNCC(add_pixels8 ## dct  , depth);\
2739     c->add_pixels4                   = FUNCC(add_pixels4 ## dct  , depth);\
2741     c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
2742     c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
2743     c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
2744     c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
2745     c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
2746     c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
2748     dspfunc2(put_h264_qpel, 0, 16, depth);\
2749     dspfunc2(put_h264_qpel, 1,  8, depth);\
2750     dspfunc2(put_h264_qpel, 2,  4, depth);\
2751     dspfunc2(put_h264_qpel, 3,  2, depth);\
2752     dspfunc2(avg_h264_qpel, 0, 16, depth);\
2753     dspfunc2(avg_h264_qpel, 1,  8, depth);\
2754     dspfunc2(avg_h264_qpel, 2,  4, depth);
/* --- Select the bit-depth variants (case labels are elided in this chunk;
 *     visible arms correspond to 9-, 10- and 8-bit, with dct_bits choosing
 *     16- vs 32-bit DCT coefficients). */
2756     switch (avctx->bits_per_raw_sample) {
2758         if (c->dct_bits == 32) {
2759             BIT_DEPTH_FUNCS(9, _32);
2761             BIT_DEPTH_FUNCS(9, _16);
2765         if (c->dct_bits == 32) {
2766             BIT_DEPTH_FUNCS(10, _32);
2768             BIT_DEPTH_FUNCS(10, _16);
2772         BIT_DEPTH_FUNCS(8, _16);
/* --- Let each architecture override entries with optimized versions, then
 *     build the IDCT coefficient permutation table. */
2777     if (HAVE_MMX)        ff_dsputil_init_mmx   (c, avctx);
2778     if (ARCH_ARM)        ff_dsputil_init_arm   (c, avctx);
2779     if (HAVE_VIS)        ff_dsputil_init_vis   (c, avctx);
2780     if (ARCH_ALPHA)      ff_dsputil_init_alpha (c, avctx);
2781     if (ARCH_PPC)        ff_dsputil_init_ppc   (c, avctx);
2782     if (ARCH_SH4)        ff_dsputil_init_sh4   (c, avctx);
2783     if (ARCH_BFIN)       ff_dsputil_init_bfin  (c, avctx);
2785     ff_init_scantable_permutation(c->idct_permutation,
2786                                   c->idct_permutation_type);