/*
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include <string.h>

#include "libavutil/imgutils.h"

#include "copy_block.h"
#include "imgconvert.h"
#include "mpegvideo.h"
#include "simple_idct.h"
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44 uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_template.c"
51 #include "dsputil_template.c"
55 #include "dsputil_template.c"
59 #include "dsputil_template.c"
63 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 gives 0x0101... for the native word, so each byte lane gets the constant)
#define pb_7f (((~0UL) / 255) * 0x7f)
#define pb_80 (((~0UL) / 255) * 0x80)
69 const uint8_t ff_zigzag_direct[64] = {
70 0, 1, 8, 16, 9, 2, 3, 10,
71 17, 24, 32, 25, 18, 11, 4, 5,
72 12, 19, 26, 33, 40, 48, 41, 34,
73 27, 20, 13, 6, 7, 14, 21, 28,
74 35, 42, 49, 56, 57, 50, 43, 36,
75 29, 22, 15, 23, 30, 37, 44, 51,
76 58, 59, 52, 45, 38, 31, 39, 46,
77 53, 60, 61, 54, 47, 55, 62, 63
80 /* Specific zigzag scan for 248 idct. NOTE that unlike the
81 specification, we interleave the fields */
82 const uint8_t ff_zigzag248_direct[64] = {
83 0, 8, 1, 9, 16, 24, 2, 10,
84 17, 25, 32, 40, 48, 56, 33, 41,
85 18, 26, 3, 11, 4, 12, 19, 27,
86 34, 42, 49, 57, 50, 58, 35, 43,
87 20, 28, 5, 13, 6, 14, 21, 29,
88 36, 44, 51, 59, 52, 60, 37, 45,
89 22, 30, 7, 15, 23, 31, 38, 46,
90 53, 61, 54, 62, 39, 47, 55, 63,
93 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
94 DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
96 const uint8_t ff_alternate_horizontal_scan[64] = {
97 0, 1, 2, 3, 8, 9, 16, 17,
98 10, 11, 4, 5, 6, 7, 15, 14,
99 13, 12, 19, 18, 24, 25, 32, 33,
100 26, 27, 20, 21, 22, 23, 28, 29,
101 30, 31, 34, 35, 40, 41, 48, 49,
102 42, 43, 36, 37, 38, 39, 44, 45,
103 46, 47, 50, 51, 56, 57, 58, 59,
104 52, 53, 54, 55, 60, 61, 62, 63,
107 const uint8_t ff_alternate_vertical_scan[64] = {
108 0, 8, 16, 24, 1, 9, 2, 10,
109 17, 25, 32, 40, 48, 56, 57, 49,
110 41, 33, 26, 18, 3, 11, 4, 12,
111 19, 27, 34, 42, 50, 58, 35, 43,
112 51, 59, 20, 28, 5, 13, 6, 14,
113 21, 29, 36, 44, 52, 60, 37, 45,
114 53, 61, 22, 30, 7, 15, 23, 31,
115 38, 46, 54, 62, 39, 47, 55, 63,
118 /* Input permutation for the simple_idct_mmx */
119 static const uint8_t simple_mmx_permutation[64]={
120 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
121 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
122 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
123 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
124 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
125 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
126 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
127 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
130 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
132 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
136 st->scantable= src_scantable;
140 j = src_scantable[i];
141 st->permutated[i] = permutation[j];
147 j = st->permutated[i];
149 st->raster_end[i]= end;
153 void ff_init_scantable_permutation(uint8_t *idct_permutation,
154 int idct_permutation_type)
158 switch(idct_permutation_type){
159 case FF_NO_IDCT_PERM:
161 idct_permutation[i]= i;
163 case FF_LIBMPEG2_IDCT_PERM:
165 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
167 case FF_SIMPLE_IDCT_PERM:
169 idct_permutation[i]= simple_mmx_permutation[i];
171 case FF_TRANSPOSE_IDCT_PERM:
173 idct_permutation[i]= ((i&7)<<3) | (i>>3);
175 case FF_PARTTRANS_IDCT_PERM:
177 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
179 case FF_SSE2_IDCT_PERM:
181 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
184 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
188 static int pix_sum_c(uint8_t * pix, int line_size)
193 for (i = 0; i < 16; i++) {
194 for (j = 0; j < 16; j += 8) {
205 pix += line_size - 16;
210 static int pix_norm1_c(uint8_t * pix, int line_size)
213 uint32_t *sq = ff_squareTbl + 256;
216 for (i = 0; i < 16; i++) {
217 for (j = 0; j < 16; j += 8) {
229 register uint64_t x=*(uint64_t*)pix;
231 s += sq[(x>>8)&0xff];
232 s += sq[(x>>16)&0xff];
233 s += sq[(x>>24)&0xff];
234 s += sq[(x>>32)&0xff];
235 s += sq[(x>>40)&0xff];
236 s += sq[(x>>48)&0xff];
237 s += sq[(x>>56)&0xff];
239 register uint32_t x=*(uint32_t*)pix;
241 s += sq[(x>>8)&0xff];
242 s += sq[(x>>16)&0xff];
243 s += sq[(x>>24)&0xff];
244 x=*(uint32_t*)(pix+4);
246 s += sq[(x>>8)&0xff];
247 s += sq[(x>>16)&0xff];
248 s += sq[(x>>24)&0xff];
253 pix += line_size - 16;
258 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
261 for(i=0; i+8<=w; i+=8){
262 dst[i+0]= av_bswap32(src[i+0]);
263 dst[i+1]= av_bswap32(src[i+1]);
264 dst[i+2]= av_bswap32(src[i+2]);
265 dst[i+3]= av_bswap32(src[i+3]);
266 dst[i+4]= av_bswap32(src[i+4]);
267 dst[i+5]= av_bswap32(src[i+5]);
268 dst[i+6]= av_bswap32(src[i+6]);
269 dst[i+7]= av_bswap32(src[i+7]);
272 dst[i+0]= av_bswap32(src[i+0]);
276 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
279 *dst++ = av_bswap16(*src++);
282 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
285 uint32_t *sq = ff_squareTbl + 256;
288 for (i = 0; i < h; i++) {
289 s += sq[pix1[0] - pix2[0]];
290 s += sq[pix1[1] - pix2[1]];
291 s += sq[pix1[2] - pix2[2]];
292 s += sq[pix1[3] - pix2[3]];
299 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
302 uint32_t *sq = ff_squareTbl + 256;
305 for (i = 0; i < h; i++) {
306 s += sq[pix1[0] - pix2[0]];
307 s += sq[pix1[1] - pix2[1]];
308 s += sq[pix1[2] - pix2[2]];
309 s += sq[pix1[3] - pix2[3]];
310 s += sq[pix1[4] - pix2[4]];
311 s += sq[pix1[5] - pix2[5]];
312 s += sq[pix1[6] - pix2[6]];
313 s += sq[pix1[7] - pix2[7]];
320 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
323 uint32_t *sq = ff_squareTbl + 256;
326 for (i = 0; i < h; i++) {
327 s += sq[pix1[ 0] - pix2[ 0]];
328 s += sq[pix1[ 1] - pix2[ 1]];
329 s += sq[pix1[ 2] - pix2[ 2]];
330 s += sq[pix1[ 3] - pix2[ 3]];
331 s += sq[pix1[ 4] - pix2[ 4]];
332 s += sq[pix1[ 5] - pix2[ 5]];
333 s += sq[pix1[ 6] - pix2[ 6]];
334 s += sq[pix1[ 7] - pix2[ 7]];
335 s += sq[pix1[ 8] - pix2[ 8]];
336 s += sq[pix1[ 9] - pix2[ 9]];
337 s += sq[pix1[10] - pix2[10]];
338 s += sq[pix1[11] - pix2[11]];
339 s += sq[pix1[12] - pix2[12]];
340 s += sq[pix1[13] - pix2[13]];
341 s += sq[pix1[14] - pix2[14]];
342 s += sq[pix1[15] - pix2[15]];
350 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
351 const uint8_t *s2, int stride){
354 /* read the pixels */
356 block[0] = s1[0] - s2[0];
357 block[1] = s1[1] - s2[1];
358 block[2] = s1[2] - s2[2];
359 block[3] = s1[3] - s2[3];
360 block[4] = s1[4] - s2[4];
361 block[5] = s1[5] - s2[5];
362 block[6] = s1[6] - s2[6];
363 block[7] = s1[7] - s2[7];
370 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
375 /* read the pixels */
377 pixels[0] = av_clip_uint8(block[0]);
378 pixels[1] = av_clip_uint8(block[1]);
379 pixels[2] = av_clip_uint8(block[2]);
380 pixels[3] = av_clip_uint8(block[3]);
381 pixels[4] = av_clip_uint8(block[4]);
382 pixels[5] = av_clip_uint8(block[5]);
383 pixels[6] = av_clip_uint8(block[6]);
384 pixels[7] = av_clip_uint8(block[7]);
391 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
396 /* read the pixels */
398 pixels[0] = av_clip_uint8(block[0]);
399 pixels[1] = av_clip_uint8(block[1]);
400 pixels[2] = av_clip_uint8(block[2]);
401 pixels[3] = av_clip_uint8(block[3]);
408 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
413 /* read the pixels */
415 pixels[0] = av_clip_uint8(block[0]);
416 pixels[1] = av_clip_uint8(block[1]);
423 static void put_signed_pixels_clamped_c(const int16_t *block,
424 uint8_t *av_restrict pixels,
429 for (i = 0; i < 8; i++) {
430 for (j = 0; j < 8; j++) {
433 else if (*block > 127)
436 *pixels = (uint8_t)(*block + 128);
440 pixels += (line_size - 8);
444 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
449 /* read the pixels */
451 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
452 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
453 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
454 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
455 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
456 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
457 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
458 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
464 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
469 /* read the pixels */
471 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
472 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
473 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
474 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
480 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
485 /* read the pixels */
487 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
488 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
494 static int sum_abs_dctelem_c(int16_t *block)
498 sum+= FFABS(block[i]);
502 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
506 for (i = 0; i < h; i++) {
507 memset(block, value, 16);
512 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
516 for (i = 0; i < h; i++) {
517 memset(block, value, 8);
/* Rounded averages (round-to-nearest, ties up) used by the MC helpers.
 * Arguments parenthesized to be expansion-safe. */
#define avg2(a,b)     (((a) + (b) + 1) >> 1)
#define avg4(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
525 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
527 const int A=(16-x16)*(16-y16);
528 const int B=( x16)*(16-y16);
529 const int C=(16-x16)*( y16);
530 const int D=( x16)*( y16);
535 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
536 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
537 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
538 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
539 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
540 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
541 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
542 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
548 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
549 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
552 const int s= 1<<shift;
562 for(x=0; x<8; x++){ //XXX FIXME optimize
563 int src_x, src_y, frac_x, frac_y, index;
572 if((unsigned)src_x < width){
573 if((unsigned)src_y < height){
574 index= src_x + src_y*stride;
575 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
576 + src[index +1]* frac_x )*(s-frac_y)
577 + ( src[index+stride ]*(s-frac_x)
578 + src[index+stride+1]* frac_x )* frac_y
581 index= src_x + av_clip(src_y, 0, height)*stride;
582 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
583 + src[index +1]* frac_x )*s
587 if((unsigned)src_y < height){
588 index= av_clip(src_x, 0, width) + src_y*stride;
589 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
590 + src[index+stride ]* frac_y )*s
593 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
594 dst[y*stride + x]= src[index ];
606 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
608 case 2: put_pixels2_8_c (dst, src, stride, height); break;
609 case 4: put_pixels4_8_c (dst, src, stride, height); break;
610 case 8: put_pixels8_8_c (dst, src, stride, height); break;
611 case 16:put_pixels16_8_c(dst, src, stride, height); break;
615 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
617 for (i=0; i < height; i++) {
618 for (j=0; j < width; j++) {
619 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
626 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
628 for (i=0; i < height; i++) {
629 for (j=0; j < width; j++) {
630 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
637 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
639 for (i=0; i < height; i++) {
640 for (j=0; j < width; j++) {
641 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
648 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
650 for (i=0; i < height; i++) {
651 for (j=0; j < width; j++) {
652 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
659 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
661 for (i=0; i < height; i++) {
662 for (j=0; j < width; j++) {
663 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
670 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
672 for (i=0; i < height; i++) {
673 for (j=0; j < width; j++) {
674 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
681 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
683 for (i=0; i < height; i++) {
684 for (j=0; j < width; j++) {
685 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
692 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
694 for (i=0; i < height; i++) {
695 for (j=0; j < width; j++) {
696 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
703 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
705 case 2: avg_pixels2_8_c (dst, src, stride, height); break;
706 case 4: avg_pixels4_8_c (dst, src, stride, height); break;
707 case 8: avg_pixels8_8_c (dst, src, stride, height); break;
708 case 16:avg_pixels16_8_c(dst, src, stride, height); break;
712 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
714 for (i=0; i < height; i++) {
715 for (j=0; j < width; j++) {
716 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
723 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
725 for (i=0; i < height; i++) {
726 for (j=0; j < width; j++) {
727 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
734 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
736 for (i=0; i < height; i++) {
737 for (j=0; j < width; j++) {
738 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
745 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
747 for (i=0; i < height; i++) {
748 for (j=0; j < width; j++) {
749 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
756 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
758 for (i=0; i < height; i++) {
759 for (j=0; j < width; j++) {
760 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
767 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
769 for (i=0; i < height; i++) {
770 for (j=0; j < width; j++) {
771 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
778 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
780 for (i=0; i < height; i++) {
781 for (j=0; j < width; j++) {
782 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
789 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
791 for (i=0; i < height; i++) {
792 for (j=0; j < width; j++) {
793 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
800 #define QPEL_MC(r, OPNAME, RND, OP) \
801 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
802 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
806 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
807 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
808 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
809 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
810 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
811 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
812 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
813 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
819 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
821 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
825 const int src0= src[0*srcStride];\
826 const int src1= src[1*srcStride];\
827 const int src2= src[2*srcStride];\
828 const int src3= src[3*srcStride];\
829 const int src4= src[4*srcStride];\
830 const int src5= src[5*srcStride];\
831 const int src6= src[6*srcStride];\
832 const int src7= src[7*srcStride];\
833 const int src8= src[8*srcStride];\
834 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
835 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
836 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
837 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
838 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
839 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
840 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
841 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
847 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
848 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
853 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
854 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
855 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
856 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
857 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
858 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
859 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
860 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
861 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
862 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
863 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
864 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
865 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
866 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
867 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
868 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
874 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
875 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
880 const int src0= src[0*srcStride];\
881 const int src1= src[1*srcStride];\
882 const int src2= src[2*srcStride];\
883 const int src3= src[3*srcStride];\
884 const int src4= src[4*srcStride];\
885 const int src5= src[5*srcStride];\
886 const int src6= src[6*srcStride];\
887 const int src7= src[7*srcStride];\
888 const int src8= src[8*srcStride];\
889 const int src9= src[9*srcStride];\
890 const int src10= src[10*srcStride];\
891 const int src11= src[11*srcStride];\
892 const int src12= src[12*srcStride];\
893 const int src13= src[13*srcStride];\
894 const int src14= src[14*srcStride];\
895 const int src15= src[15*srcStride];\
896 const int src16= src[16*srcStride];\
897 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
898 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
899 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
900 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
901 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
902 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
903 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
904 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
905 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
906 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
907 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
908 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
909 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
910 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
911 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
912 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
918 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
920 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
921 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
924 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
925 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
928 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
930 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
931 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
934 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
937 copy_block9(full, src, 16, stride, 9);\
938 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
939 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
942 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
944 copy_block9(full, src, 16, stride, 9);\
945 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
948 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
951 copy_block9(full, src, 16, stride, 9);\
952 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
953 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
955 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
960 copy_block9(full, src, 16, stride, 9);\
961 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
962 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
963 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
964 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
966 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
970 copy_block9(full, src, 16, stride, 9);\
971 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
972 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
973 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
974 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
976 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
981 copy_block9(full, src, 16, stride, 9);\
982 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
983 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
984 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
985 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
987 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
991 copy_block9(full, src, 16, stride, 9);\
992 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
993 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
994 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
995 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
997 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1001 uint8_t halfHV[64];\
1002 copy_block9(full, src, 16, stride, 9);\
1003 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1004 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1005 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1006 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1008 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1009 uint8_t full[16*9];\
1011 uint8_t halfHV[64];\
1012 copy_block9(full, src, 16, stride, 9);\
1013 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1014 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1015 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1016 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1018 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1019 uint8_t full[16*9];\
1022 uint8_t halfHV[64];\
1023 copy_block9(full, src, 16, stride, 9);\
1024 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1025 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1026 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1027 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1029 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1030 uint8_t full[16*9];\
1032 uint8_t halfHV[64];\
1033 copy_block9(full, src, 16, stride, 9);\
1034 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1035 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1036 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1037 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1039 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1041 uint8_t halfHV[64];\
1042 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1043 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1044 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1046 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1048 uint8_t halfHV[64];\
1049 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1050 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1051 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1053 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1054 uint8_t full[16*9];\
1057 uint8_t halfHV[64];\
1058 copy_block9(full, src, 16, stride, 9);\
1059 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1060 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1061 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1062 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1064 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1065 uint8_t full[16*9];\
1067 copy_block9(full, src, 16, stride, 9);\
1068 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1069 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1070 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1072 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1073 uint8_t full[16*9];\
1076 uint8_t halfHV[64];\
1077 copy_block9(full, src, 16, stride, 9);\
1078 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1079 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1080 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1081 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1083 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1084 uint8_t full[16*9];\
1086 copy_block9(full, src, 16, stride, 9);\
1087 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1088 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1089 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1091 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1093 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1094 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* 16x16 quarter-pel MC, edge positions (still inside the QPEL_MC macro body;
   some declaration/brace lines are elided from this view).
   mc10/mc30: horizontal 1/4 and 3/4 — half-pel row averaged with src or src+1. */\
1097 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1099 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1100 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
/* mc20: pure horizontal half-pel. */\
1103 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1104 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1107 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1109 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1110 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
/* mc01/mc03: vertical 1/4 and 3/4 — needs a 24x17 copy because the vertical
   filter reads one extra row; half-pel column averaged with row 0 or row 1. */\
1113 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1114 uint8_t full[24*17];\
1116 copy_block17(full, src, 24, stride, 17);\
1117 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1118 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
/* mc02: pure vertical half-pel. */\
1121 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1122 uint8_t full[24*17];\
1123 copy_block17(full, src, 24, stride, 17);\
1124 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1127 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1128 uint8_t full[24*17];\
1130 copy_block17(full, src, 24, stride, 17);\
1131 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1132 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
/* Diagonal quarter-pel positions (1/4,1/4) and (3/4,1/4).
   The *_old_c variants use the exact 4-way average of the MPEG-4 spec
   (full block, H half-pel, V half-pel, HV half-pel); the current variants
   fold the H+full average into halfH first, trading exactness for speed
   (this matches FFmpeg's historical approximation). */\
1134 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1135 uint8_t full[24*17];\
1136 uint8_t halfH[272];\
1137 uint8_t halfV[256];\
1138 uint8_t halfHV[256];\
1139 copy_block17(full, src, 24, stride, 17);\
1140 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1141 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1142 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1143 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1145 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1146 uint8_t full[24*17];\
1147 uint8_t halfH[272];\
1148 uint8_t halfHV[256];\
1149 copy_block17(full, src, 24, stride, 17);\
1150 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1151 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1152 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1153 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
/* mc31: same scheme mirrored horizontally -- averages with full+1 (x=3/4). */\
1155 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1156 uint8_t full[24*17];\
1157 uint8_t halfH[272];\
1158 uint8_t halfV[256];\
1159 uint8_t halfHV[256];\
1160 copy_block17(full, src, 24, stride, 17);\
1161 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1162 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1163 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1164 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1166 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1167 uint8_t full[24*17];\
1168 uint8_t halfH[272];\
1169 uint8_t halfHV[256];\
1170 copy_block17(full, src, 24, stride, 17);\
1171 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1172 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1173 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1174 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
/* Diagonal quarter-pel positions (1/4,3/4) and (3/4,3/4): like mc11/mc31 but
   the final average takes row-shifted operands (full+24 = next row,
   halfH+16 = next filtered row) for the y=3/4 offset. */\
1176 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1177 uint8_t full[24*17];\
1178 uint8_t halfH[272];\
1179 uint8_t halfV[256];\
1180 uint8_t halfHV[256];\
1181 copy_block17(full, src, 24, stride, 17);\
1182 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1183 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1184 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1185 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1187 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1188 uint8_t full[24*17];\
1189 uint8_t halfH[272];\
1190 uint8_t halfHV[256];\
1191 copy_block17(full, src, 24, stride, 17);\
1192 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1193 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1194 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1195 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* mc33: bottom-right diagonal -- shifted by one row AND one column (full+25). */\
1197 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1198 uint8_t full[24*17];\
1199 uint8_t halfH[272];\
1200 uint8_t halfV[256];\
1201 uint8_t halfHV[256];\
1202 copy_block17(full, src, 24, stride, 17);\
1203 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1204 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1205 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1206 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1208 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1209 uint8_t full[24*17];\
1210 uint8_t halfH[272];\
1211 uint8_t halfHV[256];\
1212 copy_block17(full, src, 24, stride, 17);\
1213 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1214 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1215 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1216 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* mc21/mc23: x=1/2 with y=1/4 or y=3/4 -- average the horizontal half-pel
   block with its vertically filtered version (halfH+16 = next row for 3/4).
   No copy_block needed: the horizontal filter reads src directly. */\
1218 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1219 uint8_t halfH[272];\
1220 uint8_t halfHV[256];\
1221 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1222 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1223 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1225 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1226 uint8_t halfH[272];\
1227 uint8_t halfHV[256];\
1228 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1229 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1230 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* mc12/mc32: x=1/4 or 3/4 with y=1/2. Old variants average halfV and halfHV
   exactly; current variants pre-average halfH with the full block (or full+1)
   and then run only the vertical filter. mc22 is the plain center half-pel. */\
1232 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1233 uint8_t full[24*17];\
1234 uint8_t halfH[272];\
1235 uint8_t halfV[256];\
1236 uint8_t halfHV[256];\
1237 copy_block17(full, src, 24, stride, 17);\
1238 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1239 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1240 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1241 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1243 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1244 uint8_t full[24*17];\
1245 uint8_t halfH[272];\
1246 copy_block17(full, src, 24, stride, 17);\
1247 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1248 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1249 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1251 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1252 uint8_t full[24*17];\
1253 uint8_t halfH[272];\
1254 uint8_t halfV[256];\
1255 uint8_t halfHV[256];\
1256 copy_block17(full, src, 24, stride, 17);\
1257 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1258 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1259 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1260 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1262 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1263 uint8_t full[24*17];\
1264 uint8_t halfH[272];\
1265 copy_block17(full, src, 24, stride, 17);\
1266 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1267 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1268 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* mc22: center half-pel -- horizontal then vertical lowpass only. */\
1270 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1271 uint8_t halfH[272];\
1272 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1273 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel-store operators plugged into QPEL_MC. `b` is the raw filter sum;
 * +16>>5 is rounding division by 32, +15>>5 the "no rounding" variant
 * (MPEG-4 rounding_control). op_avg additionally averages with the existing
 * destination pixel (with +1 rounding). cm is the clipping table. */
1276 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1277 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1278 #define op_put(a, b) a = cm[((b) + 16)>>5]
1279 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the full qpel function set for put, put_no_rnd and avg. */
1281 QPEL_MC(0, put_ , _ , op_put)
1282 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1283 QPEL_MC(0, avg_ , _ , op_avg)
1284 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1286 #undef op_avg_no_rnd
1288 #undef op_put_no_rnd
/* mc00 (integer-pel position) is a plain block copy/average -- alias the
 * generic pixel functions instead of generating filter code for it. */
1290 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1291 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1292 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1293 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1294 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1295 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
/* WMV2 mspel horizontal half-pel filter: per-pixel (-1,9,9,-1)/16 tap with
 * +8 rounding, clipped through the crop table. Reads src[-1]..src[9], i.e.
 * one pixel left and two right of the 8-pixel row.
 * NOTE(review): the per-row loop and the dst/src advance lines are elided in
 * this view (original lines 1299-1301/1310-1313). */
1297 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1298 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1302 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1303 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1304 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1305 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1306 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1307 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1308 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1309 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* RV40 (3/4,3/4) position maps onto the plain xy2 half-pel averaging
 * functions -- no dedicated RV40 filter needed for this one position. */
1315 #if CONFIG_RV40_DECODER
1316 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1317 put_pixels16_xy2_8_c(dst, src, stride, 16);
1319 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1320 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1322 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1323 put_pixels8_xy2_8_c(dst, src, stride, 8);
1325 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1326 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1328 #endif /* CONFIG_RV40_DECODER */
1330 #if CONFIG_DIRAC_DECODER
/* Dirac pixel ops: thin wrappers around the generic 8-bit pixel functions.
   src[] carries up to 4 reference pointers; _l2 averages two, _l4 four.
   32-wide variants are built from two 16-wide calls (no pixels32 primitive). */\
1331 #define DIRAC_MC(OPNAME)\
1332 void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1334 OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
1336 void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1338 OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
1340 void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1342 OPNAME ## _pixels16_8_c(dst , src[0] , stride, h);\
1343 OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
1345 void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1347 OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1349 void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1351 OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1353 void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1355 OPNAME ## _pixels16_l2_8(dst , src[0] , src[1] , stride, stride, stride, h);\
1356 OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
1358 void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1360 OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1362 void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1364 OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1366 void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1368 OPNAME ## _pixels16_l4_8(dst , src[0] , src[1] , src[2] , src[3] , stride, stride, stride, stride, stride, h);\
1369 OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
/* WMV2 mspel vertical half-pel filter: same (-1,9,9,-1)/16 tap as the
 * horizontal version, applied down each column; reads rows -1..9.
 * NOTE(review): the per-column loop and pointer-advance lines are elided
 * in this view. */
1375 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1376 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1380 const int src_1= src[ -srcStride];
1381 const int src0 = src[0 ];
1382 const int src1 = src[ srcStride];
1383 const int src2 = src[2*srcStride];
1384 const int src3 = src[3*srcStride];
1385 const int src4 = src[4*srcStride];
1386 const int src5 = src[5*srcStride];
1387 const int src6 = src[6*srcStride];
1388 const int src7 = src[7*srcStride];
1389 const int src8 = src[8*srcStride];
1390 const int src9 = src[9*srcStride];
1391 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1392 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1393 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1394 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1395 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1396 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1397 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1398 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel 8x8 MC positions, built from the two lowpass filters above.
 * Naming follows qpel: mcXY = (x/4, y/4) pel offset. The vertical stages of
 * mc12/mc32/mc22 start from halfH+8 to skip the extra top row produced by
 * filtering 11 rows starting at src-stride.
 * NOTE(review): `half*` buffer declarations are elided in this view. */
1404 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1406 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1407 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1410 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1411 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1414 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1416 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1417 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1420 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1421 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1424 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1428 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1429 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1430 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1431 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1433 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1437 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1438 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1439 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1440 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1442 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1444 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1445 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking across a horizontal block edge: filters the 4 pixels
 * straddling the edge in each column (p0,p1 above / p2,p3 below).
 * d1 is the clipped correction applied to p1/p2 (piecewise-linear ramp in
 * `strength`), d2 a smaller correction for the outer pair p0/p3.
 * Wrapped in the CONFIG check so it compiles away when H.263 is disabled.
 * NOTE(review): loop header, d1 declaration and some closing lines are
 * elided in this view. */
1448 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1449 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1451 const int strength= ff_h263_loop_filter_strength[qscale];
1455 int p0= src[x-2*stride];
1456 int p1= src[x-1*stride];
1457 int p2= src[x+0*stride];
1458 int p3= src[x+1*stride];
1459 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1461 if (d<-2*strength) d1= 0;
1462 else if(d<- strength) d1=-2*strength - d;
1463 else if(d< strength) d1= d;
1464 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clip of p1/p2 to 0..255: if out of [0,256) range, saturate. */
1469 if(p1&256) p1= ~(p1>>31);
1470 if(p2&256) p2= ~(p2>>31);
1472 src[x-1*stride] = p1;
1473 src[x+0*stride] = p2;
1477 d2= av_clip((p0-p3)/4, -ad1, ad1);
1479 src[x-2*stride] = p0 - d2;
1480 src[x+ stride] = p3 + d2;
/* H.263 deblocking across a vertical block edge: identical math to the
 * vertical filter above, transposed (pixels -2..+1 around the edge in each
 * row). Kept in sync with h263_v_loop_filter_c.
 * NOTE(review): loop header, d1 declaration and some closing lines are
 * elided in this view. */
1485 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1486 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1488 const int strength= ff_h263_loop_filter_strength[qscale];
1492 int p0= src[y*stride-2];
1493 int p1= src[y*stride-1];
1494 int p2= src[y*stride+0];
1495 int p3= src[y*stride+1];
1496 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1498 if (d<-2*strength) d1= 0;
1499 else if(d<- strength) d1=-2*strength - d;
1500 else if(d< strength) d1= d;
1501 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clip of p1/p2 to 0..255 (same trick as the vertical filter). */
1506 if(p1&256) p1= ~(p1>>31);
1507 if(p2&256) p2= ~(p2>>31);
1509 src[y*stride-1] = p1;
1510 src[y*stride+0] = p2;
1514 d2= av_clip((p0-p3)/4, -ad1, ad1);
1516 src[y*stride-2] = p0 - d2;
1517 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block via a
 * temp buffer scaled by 4, with edge rows/columns passed through unfiltered.
 * NOTE(review): loop headers, temp[] declaration and yz computation are
 * elided in this view. */
1522 static void h261_loop_filter_c(uint8_t *src, int stride){
1527 temp[x ] = 4*src[x ];
1528 temp[x + 7*8] = 4*src[x + 7*stride];
1532 xy = y * stride + x;
1534 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1539 src[ y*stride] = (temp[ y*8] + 2)>>2;
1540 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1542 xy = y * stride + x;
1544 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* SAD of a 16-wide block over h rows (unrolled row body; the per-row loop
 * and pointer advances are elided in this view). First arg is an unused
 * MpegEncContext* to match the me_cmp_func signature. */
1549 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1555 s += abs(pix1[0] - pix2[0]);
1556 s += abs(pix1[1] - pix2[1]);
1557 s += abs(pix1[2] - pix2[2]);
1558 s += abs(pix1[3] - pix2[3]);
1559 s += abs(pix1[4] - pix2[4]);
1560 s += abs(pix1[5] - pix2[5]);
1561 s += abs(pix1[6] - pix2[6]);
1562 s += abs(pix1[7] - pix2[7]);
1563 s += abs(pix1[8] - pix2[8]);
1564 s += abs(pix1[9] - pix2[9]);
1565 s += abs(pix1[10] - pix2[10]);
1566 s += abs(pix1[11] - pix2[11]);
1567 s += abs(pix1[12] - pix2[12]);
1568 s += abs(pix1[13] - pix2[13]);
1569 s += abs(pix1[14] - pix2[14]);
1570 s += abs(pix1[15] - pix2[15]);
/* SAD against the horizontal half-pel reference: pix2 is averaged with its
 * right neighbor (avg2) before differencing. Reads pix2[0..16]. */
1577 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1583 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1584 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1585 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1586 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1587 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1588 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1589 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1590 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1591 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1592 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1593 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1594 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1595 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1596 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1597 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1598 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD against the vertical half-pel reference: pix2 averaged with the next
 * row (pix3 = pix2 + line_size). */
1605 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1608 uint8_t *pix3 = pix2 + line_size;
1612 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1613 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1614 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1615 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1616 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1617 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1618 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1619 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1620 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1621 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1622 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1623 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1624 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1625 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1626 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1627 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD against the diagonal half-pel reference: 4-pixel average (avg4) of
 * pix2/pix3 and their right neighbors. Reads columns 0..16 of both rows. */
1635 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1638 uint8_t *pix3 = pix2 + line_size;
1642 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1643 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1644 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1645 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1646 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1647 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1648 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1649 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1650 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1651 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1652 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1653 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1654 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1655 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1656 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1657 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD variants: same four sub-pel positions as the 16-wide versions
 * above, with the row body unrolled over 8 columns instead of 16. */
1665 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1671 s += abs(pix1[0] - pix2[0]);
1672 s += abs(pix1[1] - pix2[1]);
1673 s += abs(pix1[2] - pix2[2]);
1674 s += abs(pix1[3] - pix2[3]);
1675 s += abs(pix1[4] - pix2[4]);
1676 s += abs(pix1[5] - pix2[5]);
1677 s += abs(pix1[6] - pix2[6]);
1678 s += abs(pix1[7] - pix2[7]);
/* horizontal half-pel */
1685 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1691 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1692 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1693 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1694 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1695 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1696 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1697 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1698 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* vertical half-pel */
1705 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1708 uint8_t *pix3 = pix2 + line_size;
1712 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1713 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1714 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1715 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1716 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1717 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1718 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1719 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* diagonal half-pel */
1727 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1730 uint8_t *pix3 = pix2 + line_size;
1734 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1735 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1736 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1737 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1738 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1739 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1740 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1741 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE (NSSE): SSE (score1) plus a weighted difference of
 * local gradient energy (score2), so blocks that keep the source's texture
 * are penalized less than smoothed ones. Weight comes from
 * avctx->nsse_weight when a context is available, else 8. */
1749 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1750 MpegEncContext *c = v;
1756 for(x=0; x<16; x++){
1757 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1760 for(x=0; x<15; x++){
1761 score2+= FFABS( s1[x ] - s1[x +stride]
1762 - s1[x+1] + s1[x+1+stride])
1763 -FFABS( s2[x ] - s2[x +stride]
1764 - s2[x+1] + s2[x+1+stride]);
1771 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1772 else return score1 + FFABS(score2)*8;
/* 8-wide variant of the above (loop bounds are elided in this view). */
1775 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1776 MpegEncContext *c = v;
1783 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1787 score2+= FFABS( s1[x ] - s1[x +stride]
1788 - s1[x+1] + s1[x+1+stride])
1789 -FFABS( s2[x ] - s2[x +stride]
1790 - s2[x+1] + s2[x+1+stride]);
1797 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1798 else return score1 + FFABS(score2)*8;
/* Trellis-quantization helpers: evaluate / apply adding `scale` times a DCT
 * basis vector to the 8x8 residual `rem`, rescaling from BASIS_SHIFT to
 * RECON_SHIFT precision with rounding. try_* returns the weighted squared
 * error without modifying rem; add_* commits the change. */
1801 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1805 for(i=0; i<8*8; i++){
1806 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1809 av_assert2(-512<b && b<512);
1811 sum += (w*b)*(w*b)>>4;
1816 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1819 for(i=0; i<8*8; i++){
1820 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
/* me_cmp_func stub that always scores 0 (used to disable a comparison). */
1824 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fill cmp[0..5] with the comparison functions selected by `type`
 * (FF_CMP_*): SAD/SSE/Hadamard/DCT/PSNR/etc., taken from the DSPContext.
 * The switch body is partially elided in this view. */
1828 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1831 memset(cmp, 0, sizeof(void*)*6);
1839 cmp[i]= c->hadamard8_diff[i];
1845 cmp[i]= c->dct_sad[i];
1848 cmp[i]= c->dct264_sad[i];
1851 cmp[i]= c->dct_max[i];
1854 cmp[i]= c->quant_psnr[i];
1883 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* SWAR byte-wise add: processes sizeof(long) bytes per iteration using the
 * pb_7f/pb_80 masks to stop carries from crossing byte lanes; a scalar tail
 * loop (header elided in this view) handles the remainder. */
1888 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1890 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1891 long a = *(long*)(src+i);
1892 long b = *(long*)(dst+i);
1893 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1896 dst[i+0] += src[i+0];
/* Byte-wise subtract with the same SWAR trick. On targets without fast
 * unaligned loads a plain unrolled loop is used when src2 is misaligned. */
1899 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
1901 #if !HAVE_FAST_UNALIGNED
1902 if((long)src2 & (sizeof(long)-1)){
1903 for(i=0; i+7<w; i+=8){
1904 dst[i+0] = src1[i+0]-src2[i+0];
1905 dst[i+1] = src1[i+1]-src2[i+1];
1906 dst[i+2] = src1[i+2]-src2[i+2];
1907 dst[i+3] = src1[i+3]-src2[i+3];
1908 dst[i+4] = src1[i+4]-src2[i+4];
1909 dst[i+5] = src1[i+5]-src2[i+5];
1910 dst[i+6] = src1[i+6]-src2[i+6];
1911 dst[i+7] = src1[i+7]-src2[i+7];
1915 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1916 long a = *(long*)(src1+i);
1917 long b = *(long*)(src2+i);
1918 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1921 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction: each output byte is the median of left (l),
 * top (src1[i]) and left+top-topleft (lt), plus the coded residual; the
 * sub_* variant produces the residual for encoding. Loop headers and
 * left/left_top bookkeeping are elided in this view. */
1924 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1932 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1941 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1949 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Left prediction (add residuals to a running accumulator); body largely
 * elided in this view. */
1959 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1962 for(i=0; i<w-1; i++){
1989 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha)
/* Hadamard butterflies: BUTTERFLY2 writes sum/difference of two inputs to
 * two outputs, BUTTERFLY1 does it in place (bodies elided in this view),
 * BUTTERFLYA returns |x+y| + |x-y| for the final accumulation. */
2019 #define BUTTERFLY2(o1,o2,i1,i2) \
2023 #define BUTTERFLY1(x,y) \
2032 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* 8x8 Hadamard transform of the src-dst difference, summing absolute
 * transformed coefficients (SATD). First pass: horizontal butterflies per
 * row; second pass: vertical butterflies per column with BUTTERFLYA folding
 * the last stage into the sum. Loop headers/temp declaration elided in this
 * view. */
2034 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2042 //FIXME try pointer walks
2043 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2044 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2045 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2046 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2048 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2049 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2050 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2051 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2053 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2054 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2055 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2056 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2060 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2061 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2062 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2063 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2065 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2066 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2067 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2068 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2071 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2072 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2073 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2074 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: same Hadamard as hadamard8_diff8x8_c but on the source block
 * alone (dummy reference unused); the DC term |temp[0]+temp[32]| is
 * subtracted at the end so the score ignores the block mean. */
2079 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2087 //FIXME try pointer walks
2088 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2089 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2090 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2091 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2093 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2094 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2095 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2096 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2098 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2099 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2100 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2101 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2105 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2106 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2107 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2108 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2110 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2111 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2112 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2113 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2116 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2117 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2118 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2119 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2122 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-transform the 8x8 difference (fdct call elided in
 * this view) and sum the absolute coefficients via sum_abs_dctelem. */
2127 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2128 MpegEncContext * const s= (MpegEncContext *)c;
2129 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2133 s->dsp.diff_pixels(temp, src1, src2, stride);
2135 return s->dsp.sum_abs_dctelem(temp);
/* Body of the 1-D 8-point integer DCT macro used by dct264_sad8x8_c
   (the #define line is elided in this view). Even half (a0..a3) from
   symmetric sums, odd half (a4..a7) from antisymmetric differences with
   the H.264-style >>1 scaling; DST() emits the 8 outputs. */\
2140 const int s07 = SRC(0) + SRC(7);\
2141 const int s16 = SRC(1) + SRC(6);\
2142 const int s25 = SRC(2) + SRC(5);\
2143 const int s34 = SRC(3) + SRC(4);\
2144 const int a0 = s07 + s34;\
2145 const int a1 = s16 + s25;\
2146 const int a2 = s07 - s34;\
2147 const int a3 = s16 - s25;\
2148 const int d07 = SRC(0) - SRC(7);\
2149 const int d16 = SRC(1) - SRC(6);\
2150 const int d25 = SRC(2) - SRC(5);\
2151 const int d34 = SRC(3) - SRC(4);\
2152 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2153 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2154 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2155 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2157 DST(1, a4 + (a7>>2)) ;\
2158 DST(2, a2 + (a3>>1)) ;\
2159 DST(3, a5 + (a6>>2)) ;\
2161 DST(5, a6 - (a5>>2)) ;\
2162 DST(6, (a2>>1) - a3 ) ;\
2163 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: apply DCT8_1D to rows (writing back into dct), then
 * to columns with DST redefined to accumulate absolute values -- the column
 * pass never materializes its outputs. */
2166 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2167 MpegEncContext * const s= (MpegEncContext *)c;
2172 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2174 #define SRC(x) dct[i][x]
2175 #define DST(x,v) dct[i][x]= v
2176 for( i = 0; i < 8; i++ )
2181 #define SRC(x) dct[x][i]
2182 #define DST(x,v) sum += FFABS(v)
2183 for( i = 0; i < 8; i++ )
/* DCT max metric: maximum absolute DCT coefficient of the difference block
 * (fdct call and loop header elided in this view). */
2191 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2192 MpegEncContext * const s= (MpegEncContext *)c;
2193 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2198 s->dsp.diff_pixels(temp, src1, src2, stride);
2202 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-noise metric: quantize + dequantize + IDCT the difference
 * block and return the squared error against the saved pre-quantization
 * coefficients (bak). */
2207 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2208 MpegEncContext * const s= (MpegEncContext *)c;
2209 LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2210 int16_t * const bak = temp+64;
2216 s->dsp.diff_pixels(temp, src1, src2, stride);
2218 memcpy(bak, temp, 64*sizeof(int16_t));
2220 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2221 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2222 ff_simple_idct_8(temp); //FIXME
2225 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric: quantize the 8x8 difference, estimate the VLC bit
 * cost of the coefficients (run/level walk over the scan order; escape cost
 * for out-of-range levels), then dequantize + IDCT back and measure SSE
 * distortion. Returns distortion + lambda-scaled bits
 * (qscale^2*109/128 approximates the RD lambda). Some loop/branch lines
 * are elided in this view. */
2230 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2231 MpegEncContext * const s= (MpegEncContext *)c;
2232 const uint8_t *scantable= s->intra_scantable.permutated;
2233 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2234 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2235 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2236 int i, last, run, bits, level, distortion, start_i;
2237 const int esc_length= s->ac_esc_length;
2239 uint8_t * last_length;
2243 copy_block8(lsrc1, src1, 8, stride, 8);
2244 copy_block8(lsrc2, src2, 8, stride, 8);
2246 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2248 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra blocks: DC coded separately via the luma DC VLC table */
2254 length = s->intra_ac_vlc_length;
2255 last_length= s->intra_ac_vlc_last_length;
2256 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2259 length = s->inter_ac_vlc_length;
2260 last_length= s->inter_ac_vlc_last_length;
/* run/level bit-cost walk; levels outside [-64,63] use the escape length */
2265 for(i=start_i; i<last; i++){
2266 int j= scantable[i];
2271 if((level&(~127)) == 0){
2272 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2281 level= temp[i] + 64;
2283 av_assert2(level - 64);
2285 if((level&(~127)) == 0){
2286 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2294 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2296 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2299 s->dsp.idct_add(lsrc2, 8, temp);
2301 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2303 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-count metric: same quantize + VLC cost walk as rd8x8_c but returns
 * only the estimated bit count, skipping the reconstruction/distortion
 * stage. Kept structurally parallel to rd8x8_c. */
2306 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2307 MpegEncContext * const s= (MpegEncContext *)c;
2308 const uint8_t *scantable= s->intra_scantable.permutated;
2309 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2310 int i, last, run, bits, level, start_i;
2311 const int esc_length= s->ac_esc_length;
2313 uint8_t * last_length;
2317 s->dsp.diff_pixels(temp, src1, src2, stride);
2319 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2325 length = s->intra_ac_vlc_length;
2326 last_length= s->intra_ac_vlc_last_length;
2327 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2330 length = s->inter_ac_vlc_length;
2331 last_length= s->inter_ac_vlc_last_length;
2336 for(i=start_i; i<last; i++){
2337 int j= scantable[i];
2342 if((level&(~127)) == 0){
2343 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2352 level= temp[i] + 64;
2354 av_assert2(level - 64);
2356 if((level&(~127)) == 0){
2357 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/*
 * VSAD_INTRA(size): generates vsad_intra<size>_c(), the vertical SAD
 * of a single block — sum of |s[x] - s[x+stride]| between each pair of
 * adjacent rows, 4 pixels per inner step.  Used as an intra activity
 * measure.  (No comments inside: the macro body uses line continuations.)
 * NOTE(review): some macro lines are elided in this excerpt.
 */
2365 #define VSAD_INTRA(size) \
2366 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2370 for(y=1; y<h; y++){ \
2371 for(x=0; x<size; x+=4){ \
2372 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2373 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/*
 * vsad16_c(): vertical SAD of the difference signal — for each column x
 * of a 16-wide block, accumulates |(s1-s2)[x] - (s1-s2)[x+stride]|.
 * NOTE(review): outer loop / return are elided in this excerpt.
 */
2383 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2388 for(x=0; x<16; x++){
2389 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ(a): square helper for the vertical SSE metrics below. */
2398 #define SQ(a) ((a)*(a))
/*
 * VSSE_INTRA(size): generates vsse_intra<size>_c(), the squared-error
 * analogue of VSAD_INTRA — sums (s[x] - s[x+stride])^2 over adjacent
 * rows.  (No comments inside: macro body uses line continuations.)
 * NOTE(review): some macro lines are elided in this excerpt.
 */
2399 #define VSSE_INTRA(size) \
2400 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2404 for(y=1; y<h; y++){ \
2405 for(x=0; x<size; x+=4){ \
2406 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2407 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/*
 * vsse16_c(): vertical SSE of the difference signal — squared-error
 * version of vsad16_c() for 16-wide blocks.
 * NOTE(review): outer loop / return are elided in this excerpt.
 */
2417 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2422 for(x=0; x<16; x++){
2423 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/*
 * ssd_int8_vs_int16_c(): sum of squared differences between an int8
 * vector and an int16 vector of the given size.
 * NOTE(review): declaration tail / return are elided in this excerpt.
 */
2432 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2436 for(i=0; i<size; i++)
2437 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/*
 * WRAPPER8_16_SQ(name8, name16): builds a 16x16 comparison function
 * from an 8x8 one by summing the scores of the four 8x8 quadrants
 * (visible calls cover the two top quadrants; the pointer advance and
 * bottom quadrants are elided in this excerpt).
 * (No comments inside: macro body uses line continuations.)
 */
2441 #define WRAPPER8_16_SQ(name8, name16)\
2442 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
2444 score +=name8(s, dst , src , stride, 8);\
2445 score +=name8(s, dst+8 , src+8 , stride, 8);\
2449 score +=name8(s, dst , src , stride, 8);\
2450 score +=name8(s, dst+8 , src+8 , stride, 8);\
/* Instantiate the 16x16 variants of the 8x8 comparison metrics. */
2455 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2456 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2457 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2459 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2461 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2462 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2463 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2464 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/*
 * clipf_c_one(): clip one float, operating on its IEEE-754 bit pattern
 * as a uint32_t.  'a', 'mini', 'maxi' are float bit patterns; maxisign
 * is maxi with the sign bit flipped, used to compare against negative
 * values.  Caller (vector_clipf_c_opposite_sign) guarantees min < 0 < max.
 * NOTE(review): the final "return a" path is elided in this excerpt.
 */
2466 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2467 uint32_t maxi, uint32_t maxisign)
/* positive bit patterns compare like their float values */
2470 if(a > mini) return mini;
2471 else if((a^(1U<<31)) > maxisign) return maxi;
/*
 * vector_clipf_c_opposite_sign(): clip a float vector to [min, max]
 * where min < 0 < max, reinterpreting the floats as uint32_t bit
 * patterns so the clip is done with integer compares (clipf_c_one).
 * len is processed 8 elements per iteration; the caller is expected
 * to provide a suitably padded length.
 * NOTE(review): reinterpreting float storage through uint32_t* relies
 * on the project's aliasing settings — kept as-is.
 */
2475 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2477 uint32_t mini = *(uint32_t*)min;
2478 uint32_t maxi = *(uint32_t*)max;
/* maxi with flipped sign bit, for comparing negative inputs */
2479 uint32_t maxisign = maxi ^ (1U<<31);
2480 uint32_t *dsti = (uint32_t*)dst;
2481 const uint32_t *srci = (const uint32_t*)src;
/* manually unrolled x8 */
2482 for(i=0; i<len; i+=8) {
2483 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2484 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2485 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2486 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2487 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2488 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2489 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2490 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/*
 * vector_clipf_c(): clip each float in src to [min, max] into dst,
 * 8 elements per iteration.  When the bounds straddle zero the
 * bit-pattern fast path (vector_clipf_c_opposite_sign) is used;
 * otherwise a plain av_clipf loop (the else branch is elided in
 * this excerpt).
 */
2493 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2495 if(min < 0 && max > 0) {
2496 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
/* generic unrolled clip path */
2498 for(i=0; i < len; i+=8) {
2499 dst[i ] = av_clipf(src[i ], min, max);
2500 dst[i + 1] = av_clipf(src[i + 1], min, max);
2501 dst[i + 2] = av_clipf(src[i + 2], min, max);
2502 dst[i + 3] = av_clipf(src[i + 3], min, max);
2503 dst[i + 4] = av_clipf(src[i + 4], min, max);
2504 dst[i + 5] = av_clipf(src[i + 5], min, max);
2505 dst[i + 6] = av_clipf(src[i + 6], min, max);
2506 dst[i + 7] = av_clipf(src[i + 7], min, max);
/*
 * scalarproduct_int16_c(): dot product of two int16 vectors of length
 * 'order', accumulated into an int32 result.
 * NOTE(review): accumulator declaration / return are elided here.
 */
2511 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2516 res += *v1++ * *v2++;
/*
 * scalarproduct_and_madd_int16_c(): in the same pass, adds mul*v3 into
 * v1 (visible line) and accumulates the v1·v2 dot product (elided in
 * this excerpt).  Returns the accumulated product.
 */
2521 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2526 *v1++ += mul * *v3++;
/*
 * apply_window_int16_c(): multiply a length-len int16 signal by a
 * symmetric window given as its first half: sample i and its mirror
 * len-i-1 share window[i].  Q15 multiply with round-to-nearest
 * (+1<<14 before the >>15 shift).
 */
2531 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2532 const int16_t *window, unsigned int len)
2535 int len2 = len >> 1;
2537 for (i = 0; i < len2; i++) {
2538 int16_t w = window[i];
2539 output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2540 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/*
 * vector_clip_int32_c(): clamp each int32 in src to [min, max] into
 * dst.  Manually unrolled 8 elements per iteration (the surrounding
 * loop over len is elided in this excerpt); len is expected to be a
 * multiple of 8.
 */
2544 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2545 int32_t max, unsigned int len)
2548 *dst++ = av_clip(*src++, min, max);
2549 *dst++ = av_clip(*src++, min, max);
2550 *dst++ = av_clip(*src++, min, max);
2551 *dst++ = av_clip(*src++, min, max);
2552 *dst++ = av_clip(*src++, min, max);
2553 *dst++ = av_clip(*src++, min, max);
2554 *dst++ = av_clip(*src++, min, max);
2555 *dst++ = av_clip(*src++, min, max);
/*
 * jref IDCT wrappers: run the libjpeg-style reverse DCT (full 8x8, or
 * the reduced 4x4 / 2x2 / 1x1 variants used for lowres decoding), then
 * either store (put) or accumulate (add) the clamped result into the
 * destination picture.  Closing braces of each wrapper are elided in
 * this excerpt.
 */
/* full 8x8 jref IDCT, store clamped */
2560 static void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2562 ff_j_rev_dct (block);
2563 put_pixels_clamped_c(block, dest, line_size);
/* full 8x8 jref IDCT, add clamped */
2565 static void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2567 ff_j_rev_dct (block);
2568 add_pixels_clamped_c(block, dest, line_size);
/* 4x4 lowres variant, store */
2571 static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
2573 ff_j_rev_dct4 (block);
2574 put_pixels_clamped4_c(block, dest, line_size);
/* 4x4 lowres variant, add */
2576 static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
2578 ff_j_rev_dct4 (block);
2579 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 lowres variant, store */
2582 static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
2584 ff_j_rev_dct2 (block);
2585 put_pixels_clamped2_c(block, dest, line_size);
/* 2x2 lowres variant, add */
2587 static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
2589 ff_j_rev_dct2 (block);
2590 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1: only the DC term survives — scale, round, clamp, store */
2593 static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
2595 dest[0] = av_clip_uint8((block[0] + 4)>>3);
/* 1x1 add variant */
2597 static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
2599 dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2602 /* init static data */
/*
 * ff_dsputil_static_init(): one-time fill of the global lookup tables:
 * ff_cropTbl (clamp-to-[0,255] with MAX_NEG_CROP guard bands on both
 * sides), ff_squareTbl ((i-256)^2 for SSE metrics) and the inverse
 * zigzag permutation.  Some loop bodies are elided in this excerpt.
 */
2603 av_cold void ff_dsputil_static_init(void)
/* identity in the middle of the crop table... */
2607 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
/* ...saturated to 255 above (the 0-clamp below is elided here) */
2608 for(i=0;i<MAX_NEG_CROP;i++) {
2610 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2613 for(i=0;i<512;i++) {
2614 ff_squareTbl[i] = (i - 256) * (i - 256);
/* inverse zigzag, stored 1-based */
2617 for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/*
 * ff_check_alignment(): verify that the compiler 16-byte-aligns stack
 * variables (required by the SIMD code).  On failure, logs a warning —
 * once, via the did_fail latch — on MMX/AltiVec builds.  Return paths
 * are elided in this excerpt.
 */
2620 int ff_check_alignment(void){
2621 static int did_fail=0;
2622 LOCAL_ALIGNED_16(int, aligned, [4]);
/* low 4 bits of the address must be zero for 16-byte alignment */
2624 if((intptr_t)aligned & 15){
2626 #if HAVE_MMX || HAVE_ALTIVEC
2627 av_log(NULL, AV_LOG_ERROR,
2628 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2629 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2630 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2631 "Do not report crashes to FFmpeg developers.\n");
/**
 * ff_dsputil_init(): populate a DSPContext with the portable C
 * implementations, selected per codec settings (dct_algo, idct_algo,
 * lowres, bits_per_raw_sample), then let the per-architecture init
 * functions override entries with optimized versions.
 * NOTE(review): many lines (braces, #if guards, case labels) are
 * elided in this excerpt; comments describe only the visible code.
 */
2640 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2642 ff_check_alignment();
/* ---- forward DCT selection (encoder side) ---- */
2645 if (avctx->bits_per_raw_sample == 10) {
2646 c->fdct = ff_jpeg_fdct_islow_10;
2647 c->fdct248 = ff_fdct248_islow_10;
2649 if(avctx->dct_algo==FF_DCT_FASTINT) {
2650 c->fdct = ff_fdct_ifast;
2651 c->fdct248 = ff_fdct_ifast248;
2653 else if(avctx->dct_algo==FF_DCT_FAAN) {
2654 c->fdct = ff_faandct;
2655 c->fdct248 = ff_faandct248;
2658 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2659 c->fdct248 = ff_fdct248_islow_8;
2662 #endif //CONFIG_ENCODERS
/* ---- inverse DCT selection: lowres picks reduced-size jref IDCTs ---- */
2664 if(avctx->lowres==1){
2665 c->idct_put= ff_jref_idct4_put;
2666 c->idct_add= ff_jref_idct4_add;
2667 c->idct = ff_j_rev_dct4;
2668 c->idct_permutation_type= FF_NO_IDCT_PERM;
2669 }else if(avctx->lowres==2){
2670 c->idct_put= ff_jref_idct2_put;
2671 c->idct_add= ff_jref_idct2_add;
2672 c->idct = ff_j_rev_dct2;
2673 c->idct_permutation_type= FF_NO_IDCT_PERM;
2674 }else if(avctx->lowres==3){
2675 c->idct_put= ff_jref_idct1_put;
2676 c->idct_add= ff_jref_idct1_add;
2677 c->idct = ff_j_rev_dct1;
2678 c->idct_permutation_type= FF_NO_IDCT_PERM;
/* 10-bit content uses the 10-bit simple IDCT */
2680 if (avctx->bits_per_raw_sample == 10) {
2681 c->idct_put = ff_simple_idct_put_10;
2682 c->idct_add = ff_simple_idct_add_10;
2683 c->idct = ff_simple_idct_10;
2684 c->idct_permutation_type = FF_NO_IDCT_PERM;
2686 if(avctx->idct_algo==FF_IDCT_INT){
2687 c->idct_put= ff_jref_idct_put;
2688 c->idct_add= ff_jref_idct_add;
2689 c->idct = ff_j_rev_dct;
2690 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2691 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2692 c->idct_put= ff_faanidct_put;
2693 c->idct_add= ff_faanidct_add;
2694 c->idct = ff_faanidct;
2695 c->idct_permutation_type= FF_NO_IDCT_PERM;
2696 }else{ //accurate/default
2697 c->idct_put = ff_simple_idct_put_8;
2698 c->idct_add = ff_simple_idct_add_8;
2699 c->idct = ff_simple_idct_8;
2700 c->idct_permutation_type= FF_NO_IDCT_PERM;
/* ---- scalar pixel helpers ---- */
2705 c->diff_pixels = diff_pixels_c;
2706 c->put_pixels_clamped = put_pixels_clamped_c;
2707 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2708 c->add_pixels_clamped = add_pixels_clamped_c;
2709 c->sum_abs_dctelem = sum_abs_dctelem_c;
2712 c->pix_sum = pix_sum_c;
2713 c->pix_norm1 = pix_norm1_c;
2715 c->fill_block_tab[0] = fill_block16_c;
2716 c->fill_block_tab[1] = fill_block8_c;
2718 /* TODO [0] 16 [1] 8 */
2719 c->pix_abs[0][0] = pix_abs16_c;
2720 c->pix_abs[0][1] = pix_abs16_x2_c;
2721 c->pix_abs[0][2] = pix_abs16_y2_c;
2722 c->pix_abs[0][3] = pix_abs16_xy2_c;
2723 c->pix_abs[1][0] = pix_abs8_c;
2724 c->pix_abs[1][1] = pix_abs8_x2_c;
2725 c->pix_abs[1][2] = pix_abs8_y2_c;
2726 c->pix_abs[1][3] = pix_abs8_xy2_c;
/* ---- thirdpel motion compensation table ---- */
2728 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2729 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2730 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2731 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2732 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2733 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2734 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2735 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2736 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2738 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2739 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2740 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2741 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2742 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2743 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2744 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2745 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2746 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* ---- quarterpel MC: table-filling macro (continuation lines —
 * no comments may be inserted inside) ---- */
2748 #define dspfunc(PFX, IDX, NUM) \
2749 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2750 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2751 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2752 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2753 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2754 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2755 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2756 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2757 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2758 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2759 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2760 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2761 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2762 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2763 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2764 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
/* index 0 = 16x16 blocks, index 1 = 8x8 blocks */
2766 dspfunc(put_qpel, 0, 16);
2767 dspfunc(put_no_rnd_qpel, 0, 16);
2769 dspfunc(avg_qpel, 0, 16);
2770 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2772 dspfunc(put_qpel, 1, 8);
2773 dspfunc(put_no_rnd_qpel, 1, 8);
2775 dspfunc(avg_qpel, 1, 8);
2776 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* ---- WMV2 mspel MC table ---- */
2780 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2781 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2782 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2783 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2784 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2785 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2786 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2787 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2789 #define SET_CMP_FUNC(name) \
2790 c->name[0]= name ## 16_c;\
2791 c->name[1]= name ## 8x8_c;
/* ---- comparison metrics for motion estimation / RD decisions ---- */
2793 SET_CMP_FUNC(hadamard8_diff)
2794 c->hadamard8_diff[4]= hadamard8_intra16_c;
2795 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2796 SET_CMP_FUNC(dct_sad)
2797 SET_CMP_FUNC(dct_max)
2799 SET_CMP_FUNC(dct264_sad)
2801 c->sad[0]= pix_abs16_c;
2802 c->sad[1]= pix_abs8_c;
2806 SET_CMP_FUNC(quant_psnr)
2809 c->vsad[0]= vsad16_c;
2810 c->vsad[4]= vsad_intra16_c;
2811 c->vsad[5]= vsad_intra8_c;
2812 c->vsse[0]= vsse16_c;
2813 c->vsse[4]= vsse_intra16_c;
2814 c->vsse[5]= vsse_intra8_c;
2815 c->nsse[0]= nsse16_c;
2816 c->nsse[1]= nsse8_c;
2817 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2818 ff_dsputil_init_dwt(c);
2821 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* ---- huffyuv / lossless prediction helpers ---- */
2823 c->add_bytes= add_bytes_c;
2824 c->diff_bytes= diff_bytes_c;
2825 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2826 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2827 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2828 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2829 c->bswap_buf= bswap_buf;
2830 c->bswap16_buf = bswap16_buf;
/* ---- loop filters ---- */
2832 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2833 c->h263_h_loop_filter= h263_h_loop_filter_c;
2834 c->h263_v_loop_filter= h263_v_loop_filter_c;
2837 c->h261_loop_filter= h261_loop_filter_c;
2839 c->try_8x8basis= try_8x8basis_c;
2840 c->add_8x8basis= add_8x8basis_c;
/* ---- generic vector helpers ---- */
2842 c->vector_clipf = vector_clipf_c;
2843 c->scalarproduct_int16 = scalarproduct_int16_c;
2844 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2845 c->apply_window_int16 = apply_window_int16_c;
2846 c->vector_clip_int32 = vector_clip_int32_c;
/* plane shrinkers: 1:1, 2:1, 4:1, 8:1 */
2848 c->shrink[0]= av_image_copy_plane;
2849 c->shrink[1]= ff_shrink22;
2850 c->shrink[2]= ff_shrink44;
2851 c->shrink[3]= ff_shrink88;
/* ---- halfpel MC tables (continuation lines — no comments inside) ---- */
2853 #define hpel_funcs(prefix, idx, num) \
2854 c->prefix ## _pixels_tab idx [0] = prefix ## _pixels ## num ## _8_c; \
2855 c->prefix ## _pixels_tab idx [1] = prefix ## _pixels ## num ## _x2_8_c; \
2856 c->prefix ## _pixels_tab idx [2] = prefix ## _pixels ## num ## _y2_8_c; \
2857 c->prefix ## _pixels_tab idx [3] = prefix ## _pixels ## num ## _xy2_8_c
2859 hpel_funcs(put, [0], 16);
2860 hpel_funcs(put, [1], 8);
2861 hpel_funcs(put, [2], 4);
2862 hpel_funcs(put, [3], 2);
2863 hpel_funcs(put_no_rnd, [0], 16);
2864 hpel_funcs(put_no_rnd, [1], 8);
2865 hpel_funcs(avg, [0], 16);
2866 hpel_funcs(avg, [1], 8);
2867 hpel_funcs(avg, [2], 4);
2868 hpel_funcs(avg, [3], 2);
2869 hpel_funcs(avg_no_rnd,, 16);
/* ---- bit-depth dependent pixel functions ---- */
2873 #define FUNC(f, depth) f ## _ ## depth
2874 #define FUNCC(f, depth) f ## _ ## depth ## _c
2876 #define BIT_DEPTH_FUNCS(depth, dct)\
2877 c->get_pixels = FUNCC(get_pixels ## dct , depth);\
2878 c->draw_edges = FUNCC(draw_edges , depth);\
2879 c->clear_block = FUNCC(clear_block ## dct , depth);\
2880 c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\
2881 c->add_pixels8 = FUNCC(add_pixels8 ## dct , depth);\
2882 c->add_pixels4 = FUNCC(add_pixels4 ## dct , depth);\
2884 switch (avctx->bits_per_raw_sample) {
2886 if (c->dct_bits == 32) {
2887 BIT_DEPTH_FUNCS(9, _32);
2889 BIT_DEPTH_FUNCS(9, _16);
2893 if (c->dct_bits == 32) {
2894 BIT_DEPTH_FUNCS(10, _32);
2896 BIT_DEPTH_FUNCS(10, _16);
2900 if (c->dct_bits == 32) {
2901 BIT_DEPTH_FUNCS(12, _32);
2903 BIT_DEPTH_FUNCS(12, _16);
2907 if (c->dct_bits == 32) {
2908 BIT_DEPTH_FUNCS(14, _32);
2910 BIT_DEPTH_FUNCS(14, _16);
2914 if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
2915 BIT_DEPTH_FUNCS(8, _16);
/* ---- architecture-specific overrides ---- */
2921 if (HAVE_MMX) ff_dsputil_init_mmx (c, avctx);
2922 if (ARCH_ARM) ff_dsputil_init_arm (c, avctx);
2923 if (HAVE_VIS) ff_dsputil_init_vis (c, avctx);
2924 if (ARCH_ALPHA) ff_dsputil_init_alpha (c, avctx);
2925 if (ARCH_PPC) ff_dsputil_init_ppc (c, avctx);
2926 if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
2927 if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
/* build the scan permutation for whichever IDCT was chosen */
2929 ff_init_scantable_permutation(c->idct_permutation,
2930 c->idct_permutation_type);
2933 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2935 ff_dsputil_init(c, avctx);