3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
31 #include "libavutil/internal.h"
33 #include "copy_block.h"
36 #include "simple_idct.h"
39 #include "imgconvert.h"
41 #include "mpegvideo.h"
/* Clipping lookup table padded with MAX_NEG_CROP entries on each side so
 * indices in [-MAX_NEG_CROP, 255 + MAX_NEG_CROP] are valid.
 * NOTE(review): zero here; presumably filled during DSP init — confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Squares table, normally indexed as (ff_squareTbl + 256)[v] for v in
 * [-256, 255]. NOTE(review): zero here; presumably filled at init — confirm. */
uint32_t ff_squareTbl[512] = {0, };
48 #include "dsputil_template.c"
52 #include "dsputil_template.c"
56 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 is 0x0101...01 for the native word width, so multiplying by a
// byte value replicates that byte into every byte lane of the word)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
62 const uint8_t ff_zigzag_direct[64] = {
63 0, 1, 8, 16, 9, 2, 3, 10,
64 17, 24, 32, 25, 18, 11, 4, 5,
65 12, 19, 26, 33, 40, 48, 41, 34,
66 27, 20, 13, 6, 7, 14, 21, 28,
67 35, 42, 49, 56, 57, 50, 43, 36,
68 29, 22, 15, 23, 30, 37, 44, 51,
69 58, 59, 52, 45, 38, 31, 39, 46,
70 53, 60, 61, 54, 47, 55, 62, 63
73 /* Specific zigzag scan for 248 idct. NOTE that unlike the
74 specification, we interleave the fields */
75 const uint8_t ff_zigzag248_direct[64] = {
76 0, 8, 1, 9, 16, 24, 2, 10,
77 17, 25, 32, 40, 48, 56, 33, 41,
78 18, 26, 3, 11, 4, 12, 19, 27,
79 34, 42, 49, 57, 50, 58, 35, 43,
80 20, 28, 5, 13, 6, 14, 21, 29,
81 36, 44, 51, 59, 52, 60, 37, 45,
82 22, 30, 7, 15, 23, 31, 38, 46,
83 53, 61, 54, 62, 39, 47, 55, 63,
/* not permuted inverse zigzag_direct + 1 for MMX quantizer
 * NOTE(review): declared here, presumably filled elsewhere at init — confirm. */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
89 const uint8_t ff_alternate_horizontal_scan[64] = {
90 0, 1, 2, 3, 8, 9, 16, 17,
91 10, 11, 4, 5, 6, 7, 15, 14,
92 13, 12, 19, 18, 24, 25, 32, 33,
93 26, 27, 20, 21, 22, 23, 28, 29,
94 30, 31, 34, 35, 40, 41, 48, 49,
95 42, 43, 36, 37, 38, 39, 44, 45,
96 46, 47, 50, 51, 56, 57, 58, 59,
97 52, 53, 54, 55, 60, 61, 62, 63,
100 const uint8_t ff_alternate_vertical_scan[64] = {
101 0, 8, 16, 24, 1, 9, 2, 10,
102 17, 25, 32, 40, 48, 56, 57, 49,
103 41, 33, 26, 18, 3, 11, 4, 12,
104 19, 27, 34, 42, 50, 58, 35, 43,
105 51, 59, 20, 28, 5, 13, 6, 14,
106 21, 29, 36, 44, 52, 60, 37, 45,
107 53, 61, 22, 30, 7, 15, 23, 31,
108 38, 46, 54, 62, 39, 47, 55, 63,
111 /* Input permutation for the simple_idct_mmx */
112 static const uint8_t simple_mmx_permutation[64]={
113 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
114 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
115 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
116 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
117 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
118 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
119 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
120 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
123 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
125 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
129 st->scantable= src_scantable;
133 j = src_scantable[i];
134 st->permutated[i] = permutation[j];
140 j = st->permutated[i];
142 st->raster_end[i]= end;
146 void ff_init_scantable_permutation(uint8_t *idct_permutation,
147 int idct_permutation_type)
151 switch(idct_permutation_type){
152 case FF_NO_IDCT_PERM:
154 idct_permutation[i]= i;
156 case FF_LIBMPEG2_IDCT_PERM:
158 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
160 case FF_SIMPLE_IDCT_PERM:
162 idct_permutation[i]= simple_mmx_permutation[i];
164 case FF_TRANSPOSE_IDCT_PERM:
166 idct_permutation[i]= ((i&7)<<3) | (i>>3);
168 case FF_PARTTRANS_IDCT_PERM:
170 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
172 case FF_SSE2_IDCT_PERM:
174 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
177 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
181 static int pix_sum_c(uint8_t * pix, int line_size)
186 for (i = 0; i < 16; i++) {
187 for (j = 0; j < 16; j += 8) {
198 pix += line_size - 16;
203 static int pix_norm1_c(uint8_t * pix, int line_size)
206 uint32_t *sq = ff_squareTbl + 256;
209 for (i = 0; i < 16; i++) {
210 for (j = 0; j < 16; j += 8) {
222 register uint64_t x=*(uint64_t*)pix;
224 s += sq[(x>>8)&0xff];
225 s += sq[(x>>16)&0xff];
226 s += sq[(x>>24)&0xff];
227 s += sq[(x>>32)&0xff];
228 s += sq[(x>>40)&0xff];
229 s += sq[(x>>48)&0xff];
230 s += sq[(x>>56)&0xff];
232 register uint32_t x=*(uint32_t*)pix;
234 s += sq[(x>>8)&0xff];
235 s += sq[(x>>16)&0xff];
236 s += sq[(x>>24)&0xff];
237 x=*(uint32_t*)(pix+4);
239 s += sq[(x>>8)&0xff];
240 s += sq[(x>>16)&0xff];
241 s += sq[(x>>24)&0xff];
246 pix += line_size - 16;
251 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
254 for(i=0; i+8<=w; i+=8){
255 dst[i+0]= av_bswap32(src[i+0]);
256 dst[i+1]= av_bswap32(src[i+1]);
257 dst[i+2]= av_bswap32(src[i+2]);
258 dst[i+3]= av_bswap32(src[i+3]);
259 dst[i+4]= av_bswap32(src[i+4]);
260 dst[i+5]= av_bswap32(src[i+5]);
261 dst[i+6]= av_bswap32(src[i+6]);
262 dst[i+7]= av_bswap32(src[i+7]);
265 dst[i+0]= av_bswap32(src[i+0]);
269 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
272 *dst++ = av_bswap16(*src++);
275 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
278 uint32_t *sq = ff_squareTbl + 256;
281 for (i = 0; i < h; i++) {
282 s += sq[pix1[0] - pix2[0]];
283 s += sq[pix1[1] - pix2[1]];
284 s += sq[pix1[2] - pix2[2]];
285 s += sq[pix1[3] - pix2[3]];
292 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
295 uint32_t *sq = ff_squareTbl + 256;
298 for (i = 0; i < h; i++) {
299 s += sq[pix1[0] - pix2[0]];
300 s += sq[pix1[1] - pix2[1]];
301 s += sq[pix1[2] - pix2[2]];
302 s += sq[pix1[3] - pix2[3]];
303 s += sq[pix1[4] - pix2[4]];
304 s += sq[pix1[5] - pix2[5]];
305 s += sq[pix1[6] - pix2[6]];
306 s += sq[pix1[7] - pix2[7]];
313 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
316 uint32_t *sq = ff_squareTbl + 256;
319 for (i = 0; i < h; i++) {
320 s += sq[pix1[ 0] - pix2[ 0]];
321 s += sq[pix1[ 1] - pix2[ 1]];
322 s += sq[pix1[ 2] - pix2[ 2]];
323 s += sq[pix1[ 3] - pix2[ 3]];
324 s += sq[pix1[ 4] - pix2[ 4]];
325 s += sq[pix1[ 5] - pix2[ 5]];
326 s += sq[pix1[ 6] - pix2[ 6]];
327 s += sq[pix1[ 7] - pix2[ 7]];
328 s += sq[pix1[ 8] - pix2[ 8]];
329 s += sq[pix1[ 9] - pix2[ 9]];
330 s += sq[pix1[10] - pix2[10]];
331 s += sq[pix1[11] - pix2[11]];
332 s += sq[pix1[12] - pix2[12]];
333 s += sq[pix1[13] - pix2[13]];
334 s += sq[pix1[14] - pix2[14]];
335 s += sq[pix1[15] - pix2[15]];
343 static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
344 const uint8_t *s2, int stride){
347 /* read the pixels */
349 block[0] = s1[0] - s2[0];
350 block[1] = s1[1] - s2[1];
351 block[2] = s1[2] - s2[2];
352 block[3] = s1[3] - s2[3];
353 block[4] = s1[4] - s2[4];
354 block[5] = s1[5] - s2[5];
355 block[6] = s1[6] - s2[6];
356 block[7] = s1[7] - s2[7];
364 static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
369 /* read the pixels */
371 pixels[0] = av_clip_uint8(block[0]);
372 pixels[1] = av_clip_uint8(block[1]);
373 pixels[2] = av_clip_uint8(block[2]);
374 pixels[3] = av_clip_uint8(block[3]);
375 pixels[4] = av_clip_uint8(block[4]);
376 pixels[5] = av_clip_uint8(block[5]);
377 pixels[6] = av_clip_uint8(block[6]);
378 pixels[7] = av_clip_uint8(block[7]);
385 static void put_signed_pixels_clamped_c(const int16_t *block,
386 uint8_t *restrict pixels,
391 for (i = 0; i < 8; i++) {
392 for (j = 0; j < 8; j++) {
395 else if (*block > 127)
398 *pixels = (uint8_t)(*block + 128);
402 pixels += (line_size - 8);
406 static void add_pixels8_c(uint8_t *restrict pixels,
413 pixels[0] += block[0];
414 pixels[1] += block[1];
415 pixels[2] += block[2];
416 pixels[3] += block[3];
417 pixels[4] += block[4];
418 pixels[5] += block[5];
419 pixels[6] += block[6];
420 pixels[7] += block[7];
426 static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
431 /* read the pixels */
433 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
434 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
435 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
436 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
437 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
438 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
439 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
440 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
446 static int sum_abs_dctelem_c(int16_t *block)
450 sum+= FFABS(block[i]);
454 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
458 for (i = 0; i < h; i++) {
459 memset(block, value, 16);
464 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
468 for (i = 0; i < h; i++) {
469 memset(block, value, 8);
/* Rounding byte averages. Every argument is parenthesized so that
 * expression arguments (e.g. avg2(x & 1, y)) expand with the intended
 * precedence — the original left them bare. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
477 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
479 const int A=(16-x16)*(16-y16);
480 const int B=( x16)*(16-y16);
481 const int C=(16-x16)*( y16);
482 const int D=( x16)*( y16);
487 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
488 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
489 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
490 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
491 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
492 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
493 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
494 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
500 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
501 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
504 const int s= 1<<shift;
514 for(x=0; x<8; x++){ //XXX FIXME optimize
515 int src_x, src_y, frac_x, frac_y, index;
524 if((unsigned)src_x < width){
525 if((unsigned)src_y < height){
526 index= src_x + src_y*stride;
527 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
528 + src[index +1]* frac_x )*(s-frac_y)
529 + ( src[index+stride ]*(s-frac_x)
530 + src[index+stride+1]* frac_x )* frac_y
533 index= src_x + av_clip(src_y, 0, height)*stride;
534 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
535 + src[index +1]* frac_x )*s
539 if((unsigned)src_y < height){
540 index= av_clip(src_x, 0, width) + src_y*stride;
541 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
542 + src[index+stride ]* frac_y )*s
545 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
546 dst[y*stride + x]= src[index ];
558 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
560 case 2: put_pixels2_8_c (dst, src, stride, height); break;
561 case 4: put_pixels4_8_c (dst, src, stride, height); break;
562 case 8: put_pixels8_8_c (dst, src, stride, height); break;
563 case 16:put_pixels16_8_c(dst, src, stride, height); break;
567 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
569 for (i=0; i < height; i++) {
570 for (j=0; j < width; j++) {
571 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
578 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
580 for (i=0; i < height; i++) {
581 for (j=0; j < width; j++) {
582 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
589 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
591 for (i=0; i < height; i++) {
592 for (j=0; j < width; j++) {
593 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
600 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
602 for (i=0; i < height; i++) {
603 for (j=0; j < width; j++) {
604 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
611 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
613 for (i=0; i < height; i++) {
614 for (j=0; j < width; j++) {
615 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
622 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
624 for (i=0; i < height; i++) {
625 for (j=0; j < width; j++) {
626 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
633 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
635 for (i=0; i < height; i++) {
636 for (j=0; j < width; j++) {
637 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
644 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
646 for (i=0; i < height; i++) {
647 for (j=0; j < width; j++) {
648 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
655 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
657 case 2: avg_pixels2_8_c (dst, src, stride, height); break;
658 case 4: avg_pixels4_8_c (dst, src, stride, height); break;
659 case 8: avg_pixels8_8_c (dst, src, stride, height); break;
660 case 16:avg_pixels16_8_c(dst, src, stride, height); break;
664 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
666 for (i=0; i < height; i++) {
667 for (j=0; j < width; j++) {
668 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
675 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
677 for (i=0; i < height; i++) {
678 for (j=0; j < width; j++) {
679 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
686 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
688 for (i=0; i < height; i++) {
689 for (j=0; j < width; j++) {
690 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
697 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
699 for (i=0; i < height; i++) {
700 for (j=0; j < width; j++) {
701 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
708 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
710 for (i=0; i < height; i++) {
711 for (j=0; j < width; j++) {
712 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
719 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
721 for (i=0; i < height; i++) {
722 for (j=0; j < width; j++) {
723 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
730 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
732 for (i=0; i < height; i++) {
733 for (j=0; j < width; j++) {
734 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
741 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
743 for (i=0; i < height; i++) {
744 for (j=0; j < width; j++) {
745 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
752 #define QPEL_MC(r, OPNAME, RND, OP) \
753 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
754 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
758 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
759 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
760 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
761 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
762 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
763 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
764 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
765 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
771 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
773 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
777 const int src0= src[0*srcStride];\
778 const int src1= src[1*srcStride];\
779 const int src2= src[2*srcStride];\
780 const int src3= src[3*srcStride];\
781 const int src4= src[4*srcStride];\
782 const int src5= src[5*srcStride];\
783 const int src6= src[6*srcStride];\
784 const int src7= src[7*srcStride];\
785 const int src8= src[8*srcStride];\
786 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
787 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
788 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
789 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
790 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
791 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
792 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
793 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
799 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
800 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
805 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
806 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
807 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
808 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
809 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
810 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
811 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
812 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
813 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
814 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
815 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
816 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
817 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
818 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
819 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
820 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
826 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
827 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
832 const int src0= src[0*srcStride];\
833 const int src1= src[1*srcStride];\
834 const int src2= src[2*srcStride];\
835 const int src3= src[3*srcStride];\
836 const int src4= src[4*srcStride];\
837 const int src5= src[5*srcStride];\
838 const int src6= src[6*srcStride];\
839 const int src7= src[7*srcStride];\
840 const int src8= src[8*srcStride];\
841 const int src9= src[9*srcStride];\
842 const int src10= src[10*srcStride];\
843 const int src11= src[11*srcStride];\
844 const int src12= src[12*srcStride];\
845 const int src13= src[13*srcStride];\
846 const int src14= src[14*srcStride];\
847 const int src15= src[15*srcStride];\
848 const int src16= src[16*srcStride];\
849 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
850 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
851 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
852 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
853 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
854 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
855 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
856 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
857 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
858 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
859 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
860 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
861 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
862 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
863 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
864 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
870 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
873 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
874 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
877 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
879 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
882 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
885 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
886 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
889 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
893 copy_block9(full, src, 16, stride, 9);\
894 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
895 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
898 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
901 copy_block9(full, src, 16, stride, 9);\
902 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
905 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
909 copy_block9(full, src, 16, stride, 9);\
910 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
911 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
913 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
919 copy_block9(full, src, 16, stride, 9);\
920 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
921 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
922 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
923 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
925 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
930 copy_block9(full, src, 16, stride, 9);\
931 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
932 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
933 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
934 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
936 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
942 copy_block9(full, src, 16, stride, 9);\
943 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
944 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
945 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
946 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
948 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
953 copy_block9(full, src, 16, stride, 9);\
954 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
955 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
956 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
957 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
959 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
965 copy_block9(full, src, 16, stride, 9);\
966 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
967 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
968 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
969 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
971 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
976 copy_block9(full, src, 16, stride, 9);\
977 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
978 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
979 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
980 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
982 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
988 copy_block9(full, src, 16, stride, 9);\
989 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
990 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
991 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
992 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
994 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
999 copy_block9(full, src, 16, stride, 9);\
1000 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1001 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1002 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1003 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1005 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1008 uint8_t halfHV[64];\
1009 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1010 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1011 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1013 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1016 uint8_t halfHV[64];\
1017 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1018 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1019 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1021 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1023 uint8_t full[16*9];\
1026 uint8_t halfHV[64];\
1027 copy_block9(full, src, 16, stride, 9);\
1028 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1029 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1030 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1031 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1033 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1035 uint8_t full[16*9];\
1037 copy_block9(full, src, 16, stride, 9);\
1038 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1039 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1040 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1042 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1044 uint8_t full[16*9];\
1047 uint8_t halfHV[64];\
1048 copy_block9(full, src, 16, stride, 9);\
1049 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1050 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1051 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1052 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1054 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1056 uint8_t full[16*9];\
1058 copy_block9(full, src, 16, stride, 9);\
1059 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1060 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1061 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1063 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1066 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1067 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1070 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1073 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1074 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1077 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1079 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1082 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1085 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1086 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1089 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1091 uint8_t full[24*17];\
1093 copy_block17(full, src, 24, stride, 17);\
1094 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1095 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1098 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1100 uint8_t full[24*17];\
1101 copy_block17(full, src, 24, stride, 17);\
1102 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1105 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1107 uint8_t full[24*17];\
1109 copy_block17(full, src, 24, stride, 17);\
1110 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1111 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1113 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1115 uint8_t full[24*17];\
1116 uint8_t halfH[272];\
1117 uint8_t halfV[256];\
1118 uint8_t halfHV[256];\
1119 copy_block17(full, src, 24, stride, 17);\
1120 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1121 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1122 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1123 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1125 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1127 uint8_t full[24*17];\
1128 uint8_t halfH[272];\
1129 uint8_t halfHV[256];\
1130 copy_block17(full, src, 24, stride, 17);\
1131 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1132 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1133 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1134 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1136 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1138 uint8_t full[24*17];\
1139 uint8_t halfH[272];\
1140 uint8_t halfV[256];\
1141 uint8_t halfHV[256];\
1142 copy_block17(full, src, 24, stride, 17);\
1143 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1144 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1145 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1146 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1148 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1150 uint8_t full[24*17];\
1151 uint8_t halfH[272];\
1152 uint8_t halfHV[256];\
1153 copy_block17(full, src, 24, stride, 17);\
1154 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1155 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1156 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1157 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1159 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1161 uint8_t full[24*17];\
1162 uint8_t halfH[272];\
1163 uint8_t halfV[256];\
1164 uint8_t halfHV[256];\
1165 copy_block17(full, src, 24, stride, 17);\
1166 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1167 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1168 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1169 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1171 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1173 uint8_t full[24*17];\
1174 uint8_t halfH[272];\
1175 uint8_t halfHV[256];\
1176 copy_block17(full, src, 24, stride, 17);\
1177 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1178 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1179 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1180 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1182 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1184 uint8_t full[24*17];\
1185 uint8_t halfH[272];\
1186 uint8_t halfV[256];\
1187 uint8_t halfHV[256];\
1188 copy_block17(full, src, 24, stride, 17);\
1189 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1190 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1191 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1192 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1194 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1196 uint8_t full[24*17];\
1197 uint8_t halfH[272];\
1198 uint8_t halfHV[256];\
1199 copy_block17(full, src, 24, stride, 17);\
1200 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1201 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1202 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1203 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1205 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1207 uint8_t halfH[272];\
1208 uint8_t halfHV[256];\
1209 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1210 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1211 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1213 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1215 uint8_t halfH[272];\
1216 uint8_t halfHV[256];\
1217 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1218 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1219 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1221 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1223 uint8_t full[24*17];\
1224 uint8_t halfH[272];\
1225 uint8_t halfV[256];\
1226 uint8_t halfHV[256];\
1227 copy_block17(full, src, 24, stride, 17);\
1228 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1229 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1230 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1231 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1233 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1235 uint8_t full[24*17];\
1236 uint8_t halfH[272];\
1237 copy_block17(full, src, 24, stride, 17);\
1238 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1239 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1240 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1242 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1244 uint8_t full[24*17];\
1245 uint8_t halfH[272];\
1246 uint8_t halfV[256];\
1247 uint8_t halfHV[256];\
1248 copy_block17(full, src, 24, stride, 17);\
1249 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1250 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1251 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1252 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1254 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1256 uint8_t full[24*17];\
1257 uint8_t halfH[272];\
1258 copy_block17(full, src, 24, stride, 17);\
1259 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1260 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1261 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1263 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1265 uint8_t halfH[272];\
1266 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1267 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store macros used by QPEL_MC: the raw 20-bit lowpass accumulator `b` is
 * rounded (+16) or truncated (+15, the "no_rnd" variants), shifted down by 5
 * and clipped through the cm (ff_cropTbl) lookup; avg variants additionally
 * average with the existing destination pixel. */
1270 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1271 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1272 #define op_put(a, b) a = cm[((b) + 16)>>5]
1273 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the three quarter-pel MC function families (put, put_no_rnd,
 * avg); the rounding-free avg family is intentionally disabled. */
1275 QPEL_MC(0, put_ , _ , op_put)
1276 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1277 QPEL_MC(0, avg_ , _ , op_avg)
1278 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1280 #undef op_avg_no_rnd
1282 #undef op_put_no_rnd
/* The integer-pel (mc00) quarter-pel positions need no filtering, so they
 * alias the plain 8x8/16x16 copy/average primitives directly.
 * FIX: the no_rnd 16x16 alias pointed at a nonexistent
 * ff_put_pixels16x16_8_c; for a full-pel copy "put" and "put_no_rnd" are
 * identical, so it must map to ff_put_pixels16x16_c like the rounded
 * variant above (and like the 8x8 no_rnd alias maps to ff_put_pixels8x8_c). */
1284 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1285 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1286 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1287 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1288 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1289 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/* WMV2 horizontal half-pel filter: 4-tap (-1, 9, 9, -1)/16 lowpass with
 * +8 rounding, output clipped through the cm table.
 * NOTE(review): the per-row loop over h and the dst/src stride advances are
 * not visible in this extract — presumably they wrap the eight stores below. */
1291 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1292 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1296 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1297 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1298 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1299 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1300 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1301 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1302 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1303 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* RV40 (3,3) quarter-pel positions degrade to plain xy2 half-pel
 * interpolation, so these exported entry points are thin wrappers around
 * the 8-bit put/avg pixels*_xy2 primitives from dsputil_template.c. */
1309 #if CONFIG_RV40_DECODER
1310 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1312 put_pixels16_xy2_8_c(dst, src, stride, 16);
1314 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1316 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1318 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1320 put_pixels8_xy2_8_c(dst, src, stride, 8);
1322 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1324 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1326 #endif /* CONFIG_RV40_DECODER */
/* WMV2 vertical half-pel filter: same 4-tap (-1, 9, 9, -1)/16 kernel as the
 * horizontal variant, applied down a column of 8 output pixels; `w` columns.
 * NOTE(review): the per-column loop over w and the src/dst advances are not
 * visible in this extract. */
1328 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1329 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1333 const int src_1= src[ -srcStride];
1334 const int src0 = src[0 ];
1335 const int src1 = src[ srcStride];
1336 const int src2 = src[2*srcStride];
1337 const int src3 = src[3*srcStride];
1338 const int src4 = src[4*srcStride];
1339 const int src5 = src[5*srcStride];
1340 const int src6 = src[6*srcStride];
1341 const int src7 = src[7*srcStride];
1342 const int src8 = src[8*srcStride];
1343 const int src9 = src[9*srcStride];
1344 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1345 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1346 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1347 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1348 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1349 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1350 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1351 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation positions (mcXY = X horizontal, Y vertical
 * sub-pel offset). The pure-horizontal/vertical cases filter directly or
 * blend the filtered half with the source; the mixed cases (mc12/mc32/mc22)
 * first filter horizontally into halfH (11 rows: one above, eight in the
 * block, two below) and then filter that vertically, skipping the extra top
 * row via halfH+8. */
1357 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1360 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1361 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1364 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1366 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1369 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1372 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1373 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1376 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1378 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1381 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1386 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1387 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1388 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1389 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1391 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1396 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1397 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1398 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1399 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1401 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1404 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1405 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 Annex-J-style vertical (horizontal edge) deblocking filter.
 * p0..p3 are the four pixels straddling the edge; d measures the step and
 * d1 is the piecewise-linear correction that ramps up for |d| < strength and
 * back down to zero for |d| > 2*strength. The `& 256` tests clip p1/p2 to
 * 0..255 without a table (~(x>>31) is 0 for negative x, 255 for overflow).
 * NOTE(review): the column loop over x and the p1 += d1 / p2 -= d1 updates
 * are not visible in this extract. */
1408 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1409 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1411 const int strength= ff_h263_loop_filter_strength[qscale];
1415 int p0= src[x-2*stride];
1416 int p1= src[x-1*stride];
1417 int p2= src[x+0*stride];
1418 int p3= src[x+1*stride];
1419 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1421 if (d<-2*strength) d1= 0;
1422 else if(d<- strength) d1=-2*strength - d;
1423 else if(d< strength) d1= d;
1424 else if(d< 2*strength) d1= 2*strength - d;
1429 if(p1&256) p1= ~(p1>>31);
1430 if(p2&256) p2= ~(p2>>31);
1432 src[x-1*stride] = p1;
1433 src[x+0*stride] = p2;
1437 d2= av_clip((p0-p3)/4, -ad1, ad1);
1439 src[x-2*stride] = p0 - d2;
1440 src[x+ stride] = p3 + d2;
/* H.263 horizontal (vertical edge) deblocking filter — the transpose of
 * h263_v_loop_filter_c: identical strength curve and clipping, applied to
 * the four pixels left/right of the block edge on each row y.
 * NOTE(review): the row loop over y and the p1/p2 += d1 updates are not
 * visible in this extract. */
1445 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1446 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1448 const int strength= ff_h263_loop_filter_strength[qscale];
1452 int p0= src[y*stride-2];
1453 int p1= src[y*stride-1];
1454 int p2= src[y*stride+0];
1455 int p3= src[y*stride+1];
1456 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1458 if (d<-2*strength) d1= 0;
1459 else if(d<- strength) d1=-2*strength - d;
1460 else if(d< strength) d1= d;
1461 else if(d< 2*strength) d1= 2*strength - d;
1466 if(p1&256) p1= ~(p1>>31);
1467 if(p2&256) p2= ~(p2>>31);
1469 src[y*stride-1] = p1;
1470 src[y*stride+0] = p2;
1474 d2= av_clip((p0-p3)/4, -ad1, ad1);
1476 src[y*stride-2] = p0 - d2;
1477 src[y*stride+1] = p3 + d2;
/* 16-wide sum-of-absolute-differences (SAD) motion-estimation comparators.
 * plain: pix1 vs pix2; _x2/_y2: pix2 half-pel interpolated horizontally /
 * vertically via avg2; _xy2: diagonal half-pel via avg4.
 * NOTE(review): the accumulator init, the row loop over h (advancing
 * pix1/pix2/pix3 by line_size) and the return of s are not visible in this
 * extract. */
1482 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1488 s += abs(pix1[0] - pix2[0]);
1489 s += abs(pix1[1] - pix2[1]);
1490 s += abs(pix1[2] - pix2[2]);
1491 s += abs(pix1[3] - pix2[3]);
1492 s += abs(pix1[4] - pix2[4]);
1493 s += abs(pix1[5] - pix2[5]);
1494 s += abs(pix1[6] - pix2[6]);
1495 s += abs(pix1[7] - pix2[7]);
1496 s += abs(pix1[8] - pix2[8]);
1497 s += abs(pix1[9] - pix2[9]);
1498 s += abs(pix1[10] - pix2[10]);
1499 s += abs(pix1[11] - pix2[11]);
1500 s += abs(pix1[12] - pix2[12]);
1501 s += abs(pix1[13] - pix2[13]);
1502 s += abs(pix1[14] - pix2[14]);
1503 s += abs(pix1[15] - pix2[15]);
/* SAD against horizontal half-pel reference (avg of pix2[x], pix2[x+1]). */
1510 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1516 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1517 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1518 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1519 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1520 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1521 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1522 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1523 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1524 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1525 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1526 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1527 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1528 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1529 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1530 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1531 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD against vertical half-pel reference (avg of current and next row). */
1538 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1541 uint8_t *pix3 = pix2 + line_size;
1545 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1546 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1547 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1548 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1549 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1550 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1551 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1552 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1553 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1554 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1555 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1556 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1557 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1558 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1559 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1560 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD against diagonal half-pel reference (4-pixel average). */
1568 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1571 uint8_t *pix3 = pix2 + line_size;
1575 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1576 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1577 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1578 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1579 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1580 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1581 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1582 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1583 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1584 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1585 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1586 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1587 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1588 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1589 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1590 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD comparators, mirroring the pix_abs16_* family above (plain,
 * horizontal/vertical/diagonal half-pel via avg2/avg4).
 * NOTE(review): accumulator init, row loop over h and return are not
 * visible in this extract. */
1598 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1604 s += abs(pix1[0] - pix2[0]);
1605 s += abs(pix1[1] - pix2[1]);
1606 s += abs(pix1[2] - pix2[2]);
1607 s += abs(pix1[3] - pix2[3]);
1608 s += abs(pix1[4] - pix2[4]);
1609 s += abs(pix1[5] - pix2[5]);
1610 s += abs(pix1[6] - pix2[6]);
1611 s += abs(pix1[7] - pix2[7]);
1618 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1624 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1625 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1626 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1627 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1628 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1629 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1630 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1631 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1638 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1641 uint8_t *pix3 = pix2 + line_size;
1645 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1646 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1647 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1648 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1649 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1650 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1651 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1652 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1660 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1663 uint8_t *pix3 = pix2 + line_size;
1667 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1668 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1669 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1670 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1671 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1672 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1673 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1674 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE comparators (16- and 8-wide): score1 is plain SSE,
 * score2 is the difference in local 2x2 gradient energy between the two
 * blocks; the final cost weights |score2| by avctx->nsse_weight (8 when no
 * context is provided). Favors matches that keep the same noise texture.
 * NOTE(review): the outer row loops over h and the s1/s2 stride advances are
 * not visible in this extract. */
1682 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1683 MpegEncContext *c = v;
1689 for(x=0; x<16; x++){
1690 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1693 for(x=0; x<15; x++){
1694 score2+= FFABS( s1[x ] - s1[x +stride]
1695 - s1[x+1] + s1[x+1+stride])
1696 -FFABS( s2[x ] - s2[x +stride]
1697 - s2[x+1] + s2[x+1+stride]);
1704 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1705 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c. */
1708 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1709 MpegEncContext *c = v;
1716 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1720 score2+= FFABS( s1[x ] - s1[x +stride]
1721 - s1[x+1] + s1[x+1+stride])
1722 -FFABS( s2[x ] - s2[x +stride]
1723 - s2[x+1] + s2[x+1+stride]);
1730 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1731 else return score1 + FFABS(score2)*8;
/* try_8x8basis: weighted squared error of the residual after adding a scaled
 * basis function — used by the trellis/rate-distortion quantizer to evaluate
 * a candidate coefficient change without committing it. */
1734 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1738 for(i=0; i<8*8; i++){
1739 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1742 assert(-512<b && b<512);
1744 sum += (w*b)*(w*b)>>4;
/* add_8x8basis: commit the scaled basis function into the residual. */
1749 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1752 for(i=0; i<8*8; i++){
1753 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
/* Dummy comparator returning a constant — disables a cmp slot. */
1757 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* ff_set_cmp: fill the 6-entry cmp[] function array from the DSPContext
 * according to the FF_CMP_* type selector.
 * NOTE(review): most of the switch arms are not visible in this extract. */
1761 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1764 memset(cmp, 0, sizeof(void*)*6);
1772 cmp[i]= c->hadamard8_diff[i];
1778 cmp[i]= c->dct_sad[i];
1781 cmp[i]= c->dct264_sad[i];
1784 cmp[i]= c->dct_max[i];
1787 cmp[i]= c->quant_psnr[i];
1808 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* add_bytes: dst[i] += src[i] for w bytes, vectorized one machine word at a
 * time. The pb_7f/pb_80 masks implement SWAR byte-wise addition without
 * carry propagation across byte lanes; a scalar tail handles the remainder. */
1813 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1815 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1816 long a = *(long*)(src+i);
1817 long b = *(long*)(dst+i);
1818 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1821 dst[i+0] += src[i+0];
/* diff_bytes: dst[i] = src1[i] - src2[i], same SWAR trick for subtraction;
 * the unaligned-src2 fallback does 8 scalar bytes per iteration. */
1824 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1826 #if !HAVE_FAST_UNALIGNED
1827 if((long)src2 & (sizeof(long)-1)){
1828 for(i=0; i+7<w; i+=8){
1829 dst[i+0] = src1[i+0]-src2[i+0];
1830 dst[i+1] = src1[i+1]-src2[i+1];
1831 dst[i+2] = src1[i+2]-src2[i+2];
1832 dst[i+3] = src1[i+3]-src2[i+3];
1833 dst[i+4] = src1[i+4]-src2[i+4];
1834 dst[i+5] = src1[i+5]-src2[i+5];
1835 dst[i+6] = src1[i+6]-src2[i+6];
1836 dst[i+7] = src1[i+7]-src2[i+7];
1840 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1841 long a = *(long*)(src1+i);
1842 long b = *(long*)(src2+i);
1843 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1846 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV spatial predictors: median prediction adds/subtracts the median of
 * (left, top, left+top-topleft) per pixel; left prediction accumulates from
 * the previous pixel; the bgr32 variant tracks the four channels separately.
 * NOTE(review): loop bodies and state updates are largely not visible in
 * this extract — presumably l/lt carry left and top-left between pixels. */
1849 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1857 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1866 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1874 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
1884 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1887 for(i=0; i<w-1; i++){
1914 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Butterfly primitives for the 8x8 Hadamard transform; BUTTERFLYA folds the
 * final stage directly into a sum of absolute values.
 * NOTE(review): the BUTTERFLY2/BUTTERFLY1 macro bodies are continuation
 * lines not visible in this extract. */
1944 #define BUTTERFLY2(o1,o2,i1,i2) \
1948 #define BUTTERFLY1(x,y) \
1957 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD comparator: 8x8 Hadamard transform of the src-dst difference, summed
 * as absolute transform coefficients. First loop: horizontal butterflies per
 * row; second loop: vertical butterflies per column with the last stage in
 * BUTTERFLYA. */
1959 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
1967 //FIXME try pointer walks
1968 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
1969 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
1970 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
1971 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
1973 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
1974 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
1975 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
1976 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
1978 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
1979 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
1980 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
1981 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
1985 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
1986 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
1987 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
1988 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
1990 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
1991 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
1992 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
1993 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
1996 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
1997 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
1998 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
1999 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but transforming the
 * source block itself (dummy reference unused); the DC term |temp[0]+temp[4]|
 * is subtracted at the end so the score ignores the block mean. */
2004 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2012 //FIXME try pointer walks
2013 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2014 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2015 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2016 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2018 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2019 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2020 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2021 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2023 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2024 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2025 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2026 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2030 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2031 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2032 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2033 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2035 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2036 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2037 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2038 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2041 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2042 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2043 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2044 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2047 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-SAD comparator: forward-DCT the pixel difference and sum the absolute
 * coefficients via the DSP sum_abs_dctelem hook.
 * NOTE(review): the fdct call between diff_pixels and sum_abs_dctelem is not
 * visible in this extract. */
2052 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2053 MpegEncContext * const s= (MpegEncContext *)c;
2054 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2058 s->dsp.diff_pixels(temp, src1, src2, stride);
2060 return s->dsp.sum_abs_dctelem(temp);
/* One 8-point H.264-style integer DCT pass (butterflies a0..a7) used by
 * dct264_sad; reads via SRC() and writes via DST().
 * NOTE(review): the #define DCT8_1D header line and the DST(0,...)/DST(4,...)
 * stores are not visible in this extract. */
2065 const int s07 = SRC(0) + SRC(7);\
2066 const int s16 = SRC(1) + SRC(6);\
2067 const int s25 = SRC(2) + SRC(5);\
2068 const int s34 = SRC(3) + SRC(4);\
2069 const int a0 = s07 + s34;\
2070 const int a1 = s16 + s25;\
2071 const int a2 = s07 - s34;\
2072 const int a3 = s16 - s25;\
2073 const int d07 = SRC(0) - SRC(7);\
2074 const int d16 = SRC(1) - SRC(6);\
2075 const int d25 = SRC(2) - SRC(5);\
2076 const int d34 = SRC(3) - SRC(4);\
2077 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2078 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2079 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2080 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2082 DST(1, a4 + (a7>>2)) ;\
2083 DST(2, a2 + (a3>>1)) ;\
2084 DST(3, a5 + (a6>>2)) ;\
2086 DST(5, a6 - (a5>>2)) ;\
2087 DST(6, (a2>>1) - a3 ) ;\
2088 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: row pass writes back into dct[][], column pass sums
 * |coeff| directly through the redefined DST macro. */
2091 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2092 MpegEncContext * const s= (MpegEncContext *)c;
2097 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2099 #define SRC(x) dct[i][x]
2100 #define DST(x,v) dct[i][x]= v
2101 for( i = 0; i < 8; i++ )
2106 #define SRC(x) dct[x][i]
2107 #define DST(x,v) sum += FFABS(v)
2108 for( i = 0; i < 8; i++ )
/* DCT-max comparator: largest absolute DCT coefficient of the difference. */
2116 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2117 MpegEncContext * const s= (MpegEncContext *)c;
2118 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2123 s->dsp.diff_pixels(temp, src1, src2, stride);
2127 sum= FFMAX(sum, FFABS(temp[i]));
/* Quant-PSNR comparator: squared coefficient error introduced by a
 * quantize/dequantize round-trip of the difference block (bak holds the
 * pre-quantization coefficients). */
2132 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2133 MpegEncContext * const s= (MpegEncContext *)c;
2134 LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2135 int16_t * const bak = temp+64;
2141 s->dsp.diff_pixels(temp, src1, src2, stride);
2143 memcpy(bak, temp, 64*sizeof(int16_t));
2145 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2146 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2147 ff_simple_idct_8(temp); //FIXME
2150 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion comparator: quantizes the difference block, estimates the
 * bit cost from the AC VLC length tables (esc_length for out-of-range
 * levels), reconstructs via dequant+IDCT, and returns
 * distortion + lambda-weighted bits. Intra/inter table selection depends on
 * s->mb_intra (branches not fully visible in this extract). */
2155 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2156 MpegEncContext * const s= (MpegEncContext *)c;
2157 const uint8_t *scantable= s->intra_scantable.permutated;
2158 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2159 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2160 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2161 int i, last, run, bits, level, distortion, start_i;
2162 const int esc_length= s->ac_esc_length;
2164 uint8_t * last_length;
2168 copy_block8(lsrc1, src1, 8, stride, 8);
2169 copy_block8(lsrc2, src2, 8, stride, 8);
2171 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2173 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2179 length = s->intra_ac_vlc_length;
2180 last_length= s->intra_ac_vlc_last_length;
2181 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2184 length = s->inter_ac_vlc_length;
2185 last_length= s->inter_ac_vlc_last_length;
/* level+64 biases into the 0..127 VLC index range; (level&~127)==0 tests
 * whether the biased level fits the table, otherwise escape coding is used. */
2190 for(i=start_i; i<last; i++){
2191 int j= scantable[i];
2196 if((level&(~127)) == 0){
2197 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2206 level= temp[i] + 64;
2210 if((level&(~127)) == 0){
2211 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2219 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2221 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2224 s->dsp.idct_add(lsrc2, 8, temp);
2226 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2228 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-cost comparator: like rd8x8_c but returns only the estimated VLC bit
 * count of the quantized difference block (no reconstruction/distortion). */
2231 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2232 MpegEncContext * const s= (MpegEncContext *)c;
2233 const uint8_t *scantable= s->intra_scantable.permutated;
2234 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2235 int i, last, run, bits, level, start_i;
2236 const int esc_length= s->ac_esc_length;
2238 uint8_t * last_length;
2242 s->dsp.diff_pixels(temp, src1, src2, stride);
2244 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2250 length = s->intra_ac_vlc_length;
2251 last_length= s->intra_ac_vlc_last_length;
2252 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2255 length = s->inter_ac_vlc_length;
2256 last_length= s->inter_ac_vlc_last_length;
2261 for(i=start_i; i<last; i++){
2262 int j= scantable[i];
2267 if((level&(~127)) == 0){
2268 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2277 level= temp[i] + 64;
2281 if((level&(~127)) == 0){
2282 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-activity comparators. VSAD_INTRA/VSSE_INTRA generate intra
 * variants that measure the abs/squared difference between vertically
 * adjacent pixels of one block; vsad16/vsse16 measure the same quantity on
 * the residual s1-s2. Used to pick interlaced vs progressive DCT. */
2290 #define VSAD_INTRA(size) \
2291 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2295 for(y=1; y<h; y++){ \
2296 for(x=0; x<size; x+=4){ \
2297 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2298 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
2308 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2313 for(x=0; x<16; x++){
2314 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2323 #define SQ(a) ((a)*(a))
2324 #define VSSE_INTRA(size) \
2325 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2329 for(y=1; y<h; y++){ \
2330 for(x=0; x<size; x+=4){ \
2331 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2332 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
2342 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2347 for(x=0; x<16; x++){
2348 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 and an int16 vector. */
2357 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2361 for(i=0; i<size; i++)
2362 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* WRAPPER8_16_SQ: builds a 16x16 comparator from an 8x8 one by summing the
 * four quadrants (the bottom-half calls after the dst/src advance are in
 * lines not visible in this extract). */
2366 #define WRAPPER8_16_SQ(name8, name16)\
2367 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
2369 score +=name8(s, dst , src , stride, 8);\
2370 score +=name8(s, dst+8 , src+8 , stride, 8);\
2374 score +=name8(s, dst , src , stride, 8);\
2375 score +=name8(s, dst+8 , src+8 , stride, 8);\
2380 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2381 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2382 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2384 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2386 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2387 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2388 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2389 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
2391 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2392 uint32_t maxi, uint32_t maxisign)
2395 if(a > mini) return mini;
2396 else if((a^(1U<<31)) > maxisign) return maxi;
/* Clip an array of floats to [*min, *max] for the case *min < 0 < *max,
 * working entirely on the raw IEEE-754 bit patterns via clipf_c_one().
 * The loop is manually unrolled by 8, so len is presumably a multiple of 8
 * -- TODO confirm with callers.
 * NOTE(review): the float* -> uint32_t* casts violate the C strict-aliasing
 * rule; kept byte-identical since this matches the file's existing idiom and
 * the surrounding code relies on it. */
2400 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2402     uint32_t mini = *(uint32_t*)min;
2403     uint32_t maxi = *(uint32_t*)max;
/* Pre-compute max with its sign bit flipped, so the per-element compare in
 * clipf_c_one() needs only one XOR. */
2404     uint32_t maxisign = maxi ^ (1U<<31);
2405     uint32_t *dsti = (uint32_t*)dst;
2406     const uint32_t *srci = (const uint32_t*)src;
2407     for(i=0; i<len; i+=8) {
2408         dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2409         dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2410         dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2411         dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2412         dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2413         dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2414         dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2415         dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip an array of floats to [min, max].
 * When min and max straddle zero, dispatch to the bit-pattern variant
 * (vector_clipf_c_opposite_sign); otherwise clip with av_clipf(), unrolled
 * by 8 -- so len is presumably a multiple of 8 (TODO confirm with callers).
 * NOTE(review): the 'else' branch structure and closing braces are elided
 * in this excerpt. */
2418 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2420     if(min < 0 && max > 0) {
2421         vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2423         for(i=0; i < len; i+=8) {
2424             dst[i    ] = av_clipf(src[i    ], min, max);
2425             dst[i + 1] = av_clipf(src[i + 1], min, max);
2426             dst[i + 2] = av_clipf(src[i + 2], min, max);
2427             dst[i + 3] = av_clipf(src[i + 3], min, max);
2428             dst[i + 4] = av_clipf(src[i + 4], min, max);
2429             dst[i + 5] = av_clipf(src[i + 5], min, max);
2430             dst[i + 6] = av_clipf(src[i + 6], min, max);
2431             dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Dot product of two int16_t vectors of length 'order', accumulated in a
 * 32-bit result.
 * NOTE(review): locals, loop header and return are elided in this excerpt. */
2436 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2441         res += *v1++ * *v2++;
/* In-place multiply-add: v1[i] += mul * v3[i]. Given the function name and
 * return type, it presumably also accumulates and returns the dot product of
 * v1 and v2 -- TODO confirm against the full source; those lines are elided
 * in this excerpt. */
2446 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2451         *v1++ += mul * *v3++;
/* Apply a symmetric window to an int16_t signal: window[i] multiplies both
 * input[i] and its mirror input[len-1-i], so only len/2 coefficients are
 * stored. The product is Q15 fixed point: add 1<<14 for round-to-nearest,
 * then shift right by 15. */
2456 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2457                                  const int16_t *window, unsigned int len)
2460     int len2 = len >> 1;
2462     for (i = 0; i < len2; i++) {
2463         int16_t w = window[i];
2464         output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2465         output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/* Clamp each element of an int32_t array to [min, max].
 * Eight explicit av_clip() statements form the unrolled loop body; the outer
 * loop header is elided in this excerpt, and len is presumably a multiple of
 * 8 -- TODO confirm with callers. */
2469 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2470                                 int32_t max, unsigned int len)
2473         *dst++ = av_clip(*src++, min, max);
2474         *dst++ = av_clip(*src++, min, max);
2475         *dst++ = av_clip(*src++, min, max);
2476         *dst++ = av_clip(*src++, min, max);
2477         *dst++ = av_clip(*src++, min, max);
2478         *dst++ = av_clip(*src++, min, max);
2479         *dst++ = av_clip(*src++, min, max);
2480         *dst++ = av_clip(*src++, min, max);
/* IDCT + store: run the IJG-style reverse DCT on 'block' in place, then
 * write the clamped (0..255) result into 'dest' with the given line stride. */
2485 static void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2487     ff_j_rev_dct (block);
2488     put_pixels_clamped_c(block, dest, line_size);
/* IDCT + add: run the IJG-style reverse DCT on 'block' in place, then add
 * the result to the existing pixels in 'dest' with clamping (used for
 * inter/residual blocks, unlike the _put variant above). */
2490 static void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2492     ff_j_rev_dct (block);
2493     add_pixels_clamped_c(block, dest, line_size);
/* init static data */
/* One-time initialization of the global lookup tables:
 * - ff_cropTbl: clamping LUT -- identity over [0,255] centred at offset
 *   MAX_NEG_CROP, saturating to 255 above (the saturate-to-0 line for the
 *   low side is elided in this excerpt).
 * - ff_squareTbl: squares of (i - 256), i.e. x^2 for x in [-256, 255].
 * - ff_inv_zigzag_direct16: inverse of ff_zigzag_direct, stored 1-based. */
2497 av_cold void ff_dsputil_static_init(void)
2501     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2502     for(i=0;i<MAX_NEG_CROP;i++) {
2504         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2507     for(i=0;i<512;i++) {
2508         ff_squareTbl[i] = (i - 256) * (i - 256);
2511     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Sanity check that the compiler honours 16-byte stack alignment: declare a
 * 16-byte-aligned local and verify its address. On failure, warn once (the
 * 'did_fail' latch) that SIMD code (MMX/AltiVec) may crash or be slow due to
 * a miscompilation.
 * NOTE(review): the failure bookkeeping and return statement are elided in
 * this excerpt. */
2514 int ff_check_alignment(void){
2515     static int did_fail=0;
2516     LOCAL_ALIGNED_16(int, aligned, [4]);
2518     if((intptr_t)aligned & 15){
2520 #if HAVE_MMX || HAVE_ALTIVEC
2521             av_log(NULL, AV_LOG_ERROR,
2522                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2523                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2524                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2525                 "Do not report crashes to Libav developers.\n");
2534 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2536 ff_check_alignment();
2539 if (avctx->bits_per_raw_sample == 10) {
2540 c->fdct = ff_jpeg_fdct_islow_10;
2541 c->fdct248 = ff_fdct248_islow_10;
2543 if(avctx->dct_algo==FF_DCT_FASTINT) {
2544 c->fdct = ff_fdct_ifast;
2545 c->fdct248 = ff_fdct_ifast248;
2547 else if(avctx->dct_algo==FF_DCT_FAAN) {
2548 c->fdct = ff_faandct;
2549 c->fdct248 = ff_faandct248;
2552 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2553 c->fdct248 = ff_fdct248_islow_8;
2556 #endif //CONFIG_ENCODERS
2558 if (avctx->bits_per_raw_sample == 10) {
2559 c->idct_put = ff_simple_idct_put_10;
2560 c->idct_add = ff_simple_idct_add_10;
2561 c->idct = ff_simple_idct_10;
2562 c->idct_permutation_type = FF_NO_IDCT_PERM;
2564 if(avctx->idct_algo==FF_IDCT_INT){
2565 c->idct_put= ff_jref_idct_put;
2566 c->idct_add= ff_jref_idct_add;
2567 c->idct = ff_j_rev_dct;
2568 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2569 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2570 c->idct_put= ff_faanidct_put;
2571 c->idct_add= ff_faanidct_add;
2572 c->idct = ff_faanidct;
2573 c->idct_permutation_type= FF_NO_IDCT_PERM;
2574 }else{ //accurate/default
2575 c->idct_put = ff_simple_idct_put_8;
2576 c->idct_add = ff_simple_idct_add_8;
2577 c->idct = ff_simple_idct_8;
2578 c->idct_permutation_type= FF_NO_IDCT_PERM;
2582 c->diff_pixels = diff_pixels_c;
2583 c->put_pixels_clamped = put_pixels_clamped_c;
2584 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2585 c->add_pixels_clamped = add_pixels_clamped_c;
2586 c->sum_abs_dctelem = sum_abs_dctelem_c;
2589 c->pix_sum = pix_sum_c;
2590 c->pix_norm1 = pix_norm1_c;
2592 c->fill_block_tab[0] = fill_block16_c;
2593 c->fill_block_tab[1] = fill_block8_c;
2595 /* TODO [0] 16 [1] 8 */
2596 c->pix_abs[0][0] = pix_abs16_c;
2597 c->pix_abs[0][1] = pix_abs16_x2_c;
2598 c->pix_abs[0][2] = pix_abs16_y2_c;
2599 c->pix_abs[0][3] = pix_abs16_xy2_c;
2600 c->pix_abs[1][0] = pix_abs8_c;
2601 c->pix_abs[1][1] = pix_abs8_x2_c;
2602 c->pix_abs[1][2] = pix_abs8_y2_c;
2603 c->pix_abs[1][3] = pix_abs8_xy2_c;
2605 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2606 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2607 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2608 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2609 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2610 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2611 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2612 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2613 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2615 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2616 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2617 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2618 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2619 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2620 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2621 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2622 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2623 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2625 #define dspfunc(PFX, IDX, NUM) \
2626 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2627 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2628 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2629 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2630 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2631 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2632 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2633 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2634 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2635 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2636 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2637 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2638 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2639 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2640 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2641 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2643 dspfunc(put_qpel, 0, 16);
2644 dspfunc(put_no_rnd_qpel, 0, 16);
2646 dspfunc(avg_qpel, 0, 16);
2647 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2649 dspfunc(put_qpel, 1, 8);
2650 dspfunc(put_no_rnd_qpel, 1, 8);
2652 dspfunc(avg_qpel, 1, 8);
2653 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2657 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2658 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2659 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2660 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2661 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2662 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2663 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2664 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2666 #define SET_CMP_FUNC(name) \
2667 c->name[0]= name ## 16_c;\
2668 c->name[1]= name ## 8x8_c;
2670 SET_CMP_FUNC(hadamard8_diff)
2671 c->hadamard8_diff[4]= hadamard8_intra16_c;
2672 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2673 SET_CMP_FUNC(dct_sad)
2674 SET_CMP_FUNC(dct_max)
2676 SET_CMP_FUNC(dct264_sad)
2678 c->sad[0]= pix_abs16_c;
2679 c->sad[1]= pix_abs8_c;
2683 SET_CMP_FUNC(quant_psnr)
2686 c->vsad[0]= vsad16_c;
2687 c->vsad[4]= vsad_intra16_c;
2688 c->vsad[5]= vsad_intra8_c;
2689 c->vsse[0]= vsse16_c;
2690 c->vsse[4]= vsse_intra16_c;
2691 c->vsse[5]= vsse_intra8_c;
2692 c->nsse[0]= nsse16_c;
2693 c->nsse[1]= nsse8_c;
2695 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2697 c->add_bytes= add_bytes_c;
2698 c->diff_bytes= diff_bytes_c;
2699 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2700 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2701 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2702 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2703 c->bswap_buf= bswap_buf;
2704 c->bswap16_buf = bswap16_buf;
2706 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2707 c->h263_h_loop_filter= h263_h_loop_filter_c;
2708 c->h263_v_loop_filter= h263_v_loop_filter_c;
2711 c->try_8x8basis= try_8x8basis_c;
2712 c->add_8x8basis= add_8x8basis_c;
2714 c->vector_clipf = vector_clipf_c;
2715 c->scalarproduct_int16 = scalarproduct_int16_c;
2716 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2717 c->apply_window_int16 = apply_window_int16_c;
2718 c->vector_clip_int32 = vector_clip_int32_c;
2720 c->shrink[0]= av_image_copy_plane;
2721 c->shrink[1]= ff_shrink22;
2722 c->shrink[2]= ff_shrink44;
2723 c->shrink[3]= ff_shrink88;
2725 c->add_pixels8 = add_pixels8_c;
2727 #define hpel_funcs(prefix, idx, num) \
2728 c->prefix ## _pixels_tab idx [0] = prefix ## _pixels ## num ## _8_c; \
2729 c->prefix ## _pixels_tab idx [1] = prefix ## _pixels ## num ## _x2_8_c; \
2730 c->prefix ## _pixels_tab idx [2] = prefix ## _pixels ## num ## _y2_8_c; \
2731 c->prefix ## _pixels_tab idx [3] = prefix ## _pixels ## num ## _xy2_8_c
2733 hpel_funcs(put, [0], 16);
2734 hpel_funcs(put, [1], 8);
2735 hpel_funcs(put, [2], 4);
2736 hpel_funcs(put, [3], 2);
2737 hpel_funcs(put_no_rnd, [0], 16);
2738 hpel_funcs(put_no_rnd, [1], 8);
2739 hpel_funcs(avg, [0], 16);
2740 hpel_funcs(avg, [1], 8);
2741 hpel_funcs(avg, [2], 4);
2742 hpel_funcs(avg, [3], 2);
2743 hpel_funcs(avg_no_rnd,, 16);
2747 #define FUNC(f, depth) f ## _ ## depth
2748 #define FUNCC(f, depth) f ## _ ## depth ## _c
2750 #define BIT_DEPTH_FUNCS(depth, dct)\
2751 c->get_pixels = FUNCC(get_pixels ## dct , depth);\
2752 c->draw_edges = FUNCC(draw_edges , depth);\
2753 c->clear_block = FUNCC(clear_block ## dct , depth);\
2754 c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\
2756 switch (avctx->bits_per_raw_sample) {
2758 if (c->dct_bits == 32) {
2759 BIT_DEPTH_FUNCS(9, _32);
2761 BIT_DEPTH_FUNCS(9, _16);
2765 if (c->dct_bits == 32) {
2766 BIT_DEPTH_FUNCS(10, _32);
2768 BIT_DEPTH_FUNCS(10, _16);
2772 BIT_DEPTH_FUNCS(8, _16);
2777 if (HAVE_MMX) ff_dsputil_init_mmx (c, avctx);
2778 if (ARCH_ARM) ff_dsputil_init_arm (c, avctx);
2779 if (HAVE_VIS) ff_dsputil_init_vis (c, avctx);
2780 if (ARCH_ALPHA) ff_dsputil_init_alpha (c, avctx);
2781 if (ARCH_PPC) ff_dsputil_init_ppc (c, avctx);
2782 if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
2783 if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
2785 ff_init_scantable_permutation(c->idct_permutation,
2786 c->idct_permutation_type);