 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/attributes.h"
#include "libavutil/imgutils.h"
#include "libavutil/internal.h"
#include "copy_block.h"
#include "simple_idct.h"
#include "imgconvert.h"
#include "mpegvideo.h"
uint32_t ff_squareTbl[512] = {0, };
#define BIT_DEPTH 16
#include "dsputil_template.c"
#undef BIT_DEPTH

#define BIT_DEPTH 8
#include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f, depending on the CPU's native word size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
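
/* A note on the constants above (illustrative, not part of the original
 * source): ~0UL/255 replicates the byte 0x01 across every byte of an
 * unsigned long, because the all-ones word divided by 255 is 0x0101...01
 * at any width. Multiplying by 0x7f therefore yields 0x7f7f...7f and by
 * 0x80 yields 0x8080...80, e.g. with a 32-bit long:
 *
 *     ~0UL/255          == 0xffffffff/255 == 0x01010101
 *     0x01010101 * 0x7f == 0x7f7f7f7f
 *
 * These masks drive the SWAR (SIMD-within-a-register) byte arithmetic in
 * add_bytes_c()/diff_bytes_c() further down in this file. */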
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
                               const uint8_t *src_scantable)
{
    int i, end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        int j = src_scantable[i];
        st->permutated[i] = permutation[j];
    }

    end= -1;
    for(i=0; i<64; i++){
        int j = st->permutated[i];
        if(j > end) end= j;
        st->raster_end[i]= end;
    }
}
av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
                                           int idct_permutation_type)
{
    int i;

    switch(idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
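
/* Worked example (added for clarity, not in the original source):
 * FF_TRANSPOSE_IDCT_PERM maps coefficient i = 10 (row 1, column 2) to
 * ((10&7)<<3) | (10>>3) = (2<<3) | 1 = 17 (row 2, column 1), i.e. it
 * transposes the 8x8 block so an IDCT that consumes transposed input
 * can read coefficients in its natural order. */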
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0] + pix[1] + pix[2] + pix[3] +
                 pix[4] + pix[5] + pix[6] + pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if HAVE_FAST_64BIT
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
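
/* Implementation note (added): sq points 256 entries into ff_squareTbl,
 * which is filled at init time (outside this excerpt) with (i-256)^2 for
 * i in [0,512). So sq[v] == v*v for unsigned pixel values here, and
 * sq[d] stays valid for the signed differences d in [-255,255] that the
 * sse*_c functions below feed it: squaring becomes a pure table lookup. */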
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
    }
    for(; i<w; i++){
        dst[i+0]= av_bswap32(src[i+0]);
    }
}
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0; i<8; i++){
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        block += 8;
        s1 += stride;
        s2 += stride;
    }
}
static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
                                 int line_size)
{
    int i;

    /* read the pixels */
    for(i=0; i<8; i++){
        pixels[0] = av_clip_uint8(block[0]);
        pixels[1] = av_clip_uint8(block[1]);
        pixels[2] = av_clip_uint8(block[2]);
        pixels[3] = av_clip_uint8(block[3]);
        pixels[4] = av_clip_uint8(block[4]);
        pixels[5] = av_clip_uint8(block[5]);
        pixels[6] = av_clip_uint8(block[6]);
        pixels[7] = av_clip_uint8(block[7]);
        pixels += line_size;
        block += 8;
    }
}
static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
                                  int line_size)
{
    int i;

    /* read the pixels */
    for(i=0; i<4; i++){
        pixels[0] = av_clip_uint8(block[0]);
        pixels[1] = av_clip_uint8(block[1]);
        pixels[2] = av_clip_uint8(block[2]);
        pixels[3] = av_clip_uint8(block[3]);
        pixels += line_size;
        block += 8;
    }
}
static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
                                  int line_size)
{
    int i;

    /* read the pixels */
    for(i=0; i<2; i++){
        pixels[0] = av_clip_uint8(block[0]);
        pixels[1] = av_clip_uint8(block[1]);
        pixels += line_size;
        block += 8;
    }
}
static void put_signed_pixels_clamped_c(const int16_t *block,
                                        uint8_t *av_restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++; pixels++;
        }
        pixels += (line_size - 8);
    }
}
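
/* Note (added): "signed" here means the IDCT output is centered on 0
 * rather than 128, so the +128 bias converts it back to the unsigned
 * 8-bit pixel range before clamping: a value of -128 maps to 0 and
 * +127 maps to 255. */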
static void add_pixels8_c(uint8_t *av_restrict pixels,
                          int16_t *block,
                          int line_size)
{
    int i;

    for(i=0; i<8; i++){
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
                                 int line_size)
{
    int i;

    /* read the pixels */
    for(i=0; i<8; i++){
        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
        pixels[4] = av_clip_uint8(pixels[4] + block[4]);
        pixels[5] = av_clip_uint8(pixels[5] + block[5]);
        pixels[6] = av_clip_uint8(pixels[6] + block[6]);
        pixels[7] = av_clip_uint8(pixels[7] + block[7]);
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
                                  int line_size)
{
    int i;

    /* read the pixels */
    for(i=0; i<4; i++){
        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
        pixels[2] = av_clip_uint8(pixels[2] + block[2]);
        pixels[3] = av_clip_uint8(pixels[3] + block[3]);
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
                                  int line_size)
{
    int i;

    /* read the pixels */
    for(i=0; i<2; i++){
        pixels[0] = av_clip_uint8(pixels[0] + block[0]);
        pixels[1] = av_clip_uint8(pixels[1] + block[1]);
        pixels += line_size;
        block += 8;
    }
}
static int sum_abs_dctelem_c(int16_t *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;
    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}

static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;
    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
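
/* Added note: avg2()/avg4() round *up* on ties because of the +1 / +2
 * bias, e.g. avg2(1,2) == (1+2+1)>>1 == 2, matching the rounding rule
 * MPEG-style half-pel interpolation expects. The arguments are assumed
 * to be non-negative pixel values, so the right shifts are safe. */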
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
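
/* Added note: the four bilinear weights always satisfy
 * A+B+C+D == (16-x16+x16)*(16-y16+y16) == 256, so the >>8 renormalizes
 * exactly. For x16 == y16 == 8 every weight is 64 and the kernel
 * degenerates to a plain average of the 2x2 neighbourhood. */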
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index  ]*(s-frac_x)
                                          + src[index+1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index       ]*(s-frac_y)
                                          + src[index+stride]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
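
/* Added note: ff_gmc_c() implements MPEG-4 global motion compensation.
 * (ox,oy) is the affine-transformed source position of the block origin
 * in 16.16 fixed point; dxx/dyx advance it per pixel and dxy/dyy per
 * line. frac_x/frac_y are the sub-pel fractions used for bilinear
 * weighting, and the av_clip() branches replicate edge pixels when the
 * transform points outside the picture. */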
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_8_c (dst, src, stride, height); break;
    case 4: put_pixels4_8_c (dst, src, stride, height); break;
    case 8: put_pixels8_8_c (dst, src, stride, height); break;
    case 16:put_pixels16_8_c(dst, src, stride, height); break;
    }
}
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
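
/* Added note: these are the third-pel ("tpel") interpolators used by the
 * SVQ3 decoder. The magic constants are fixed-point reciprocals:
 * 683 ~= 2^11/3 and 2731 ~= 2^15/12, so e.g. (683*(2*a + b + 1))>>11
 * computes (2*a + b)/3 with rounding, and the mc11..mc22 kernels average
 * four neighbours with weights that sum to 12. */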
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_8_c (dst, src, stride, height); break;
    case 4: avg_pixels4_8_c (dst, src, stride, height); break;
    case 8: avg_pixels8_8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_8_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<8; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<16; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
}\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
}\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
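
/* Added note: each OP() second argument is an unnormalized filter sum
 * whose taps (20,-6,3,-1), applied symmetrically, add up to
 * 2*(20+3) - 2*(6+1) = 32. op_put's cm[((b) + 16)>>5] therefore divides
 * by 32 with round-to-nearest and clamps through the ff_cropTbl lookup,
 * while the *_no_rnd variants bias by 15 to round down. */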
QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)

#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}

void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}

void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}

void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}
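
/* Added note: this is WMV2's 4-tap half-pel filter (-1, 9, 9, -1)/16;
 * the +8 term rounds to nearest before the >>4, and cm (ff_cropTbl
 * offset by MAX_NEG_CROP) clamps the result to [0,255] without a
 * branch. */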
#if CONFIG_RV40_DECODER
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
#if CONFIG_DIRAC_DECODER
#define DIRAC_MC(OPNAME)\
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
    OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
    OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
}

DIRAC_MC(put)
DIRAC_MC(avg)
#endif /* CONFIG_DIRAC_DECODER */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}
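
/* Added note: d is the filtered gradient across the block edge; d1
 * applies full correction for small |d| and ramps back to zero once |d|
 * exceeds 2*strength, so real edges survive while small blocking
 * discontinuities are smoothed. The p&256 tests are a branch-light
 * clamp: they catch both overflow (>255) and underflow (<0) after d1
 * has been applied. */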
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
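
/* Added note: the _x2/_y2/_xy2 variants compute the SAD against a
 * half-pel interpolated reference on the fly: avg2()/avg4() average the
 * neighbouring reference pixels (rounding up), so the motion search can
 * score (x+0.5, y+0.5) candidate positions without first writing an
 * interpolated block to memory. */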
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0, score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0, score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
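
/* Added note: NSSE ("noise preserving sum of squared errors") combines
 * plain SSE (score1) with the difference in local texture between the
 * two blocks (score2, a sum of 2x2 gradient magnitudes). Weighting
 * score2 by avctx->nsse_weight (8 when no context is available)
 * penalizes candidates that smooth away noise the source actually
 * contained. */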
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        av_assert2(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
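
/* Added note: rem[] holds the residual at RECON_SHIFT fixed-point
 * precision and basis[] a transform basis vector at BASIS_SHIFT
 * precision; the expression adds "scale" times the basis vector to the
 * residual with rounding, then accumulates a perceptually weighted
 * squared error. This lets the encoder's rate-distortion quantization
 * search try coefficient changes without a full inverse transform. */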
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
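
/* Added worked example (single byte lane, not in the original source):
 * the SWAR add computes (a&0x7f)+(b&0x7f), which can never carry out of
 * a byte because both operands are <= 0x7f, then restores the top bit of
 * each lane with ((a^b)&0x80). For a=0xff, b=0x01:
 *
 *     (0x7f + 0x01) ^ ((0xff^0x01)&0x80) = 0x80 ^ 0x80 = 0x00
 *
 * which is the correct modulo-256 sum, with no carry leaking into the
 * neighbouring byte lanes of the long. */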
static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    {
        for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
            long a = *(long*)(src1+i);
            long b = *(long*)(src2+i);
            *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
        }
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l= *left, lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l= *left, lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r= *red, g= *green, b= *blue, a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B]; g+= src[4*i+G]; r+= src[4*i+R]; a+= src[4*i+A];
        dst[4*i+B]= b;  dst[4*i+G]= g;  dst[4*i+R]= r;  dst[4*i+A]= a;
    }

    *red= r; *green= g; *blue= b; *alpha= a;
}
#undef B
#undef G
#undef R
#undef A
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
    {\
        int a,b;\
        a= x;\
        b= y;\
        x= a+b;\
        y= a-b;\
    }

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

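/* SATD via an 8x8 Hadamard transform: three butterfly stages along the rows,
 * then three along the columns, with the last column stage folded into the
 * absolute-value accumulation (BUTTERFLYA). The _diff variant transforms
 * src-dst, the _intra variant the source itself (minus the DC term). */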
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    av_assert2(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    av_assert2(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);

    av_assert2(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}

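/* One 1-D pass of the 8x8 integer transform used by H.264 (hence the
 * CONFIG_GPL guard below). SRC/DST are macros so the same body can first
 * transform the rows in place and then, on the column pass, accumulate
 * FFABS(coefficient) directly instead of storing. */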
#if CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    int16_t dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif

static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    int sum=0, i;

    av_assert2(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}

static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
    int16_t * const bak = temp+64;
    int sum=0, i;

    av_assert2(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(int16_t));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct_8(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}

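/* Rate-distortion metric (FF_CMP_RD): transform and quantize the residual,
 * count its coded cost with the real VLC length tables, then dequantize,
 * reconstruct and measure the SSE against the source. The return value folds
 * rate into distortion with a qscale^2-proportional factor:
 * distortion + ((bits*qscale*qscale*109 + 64) >> 7). */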
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    av_assert2(h==8);

    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        av_assert2(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}

static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    av_assert2(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        av_assert2(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}

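/* Vertical SAD/SSE: score vertical high-frequency energy by comparing every
 * line with the one below it -- on the residual of two blocks for
 * vsad16/vsse16 (difference of the differences), or within a single block
 * for the _intra variants. Useful as an interlace/field-coding heuristic. */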
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])                           \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);                          \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSAD_INTRA(8)
VSAD_INTRA(16)

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

#define SQ(a) ((a)*(a))
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                                            \
    int x,y;                                                                                                \
                                                                                                            \
    for(y=1; y<h; y++){                                                                                     \
        for(x=0; x<size; x+=4){                                                                             \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])                                 \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);                                \
        }                                                                                                   \
        s+= stride;                                                                                         \
    }                                                                                                       \
                                                                                                            \
    return score;                                                                                           \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}

static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;

    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);

    return score;
}

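/* Build a 16-wide comparison from an 8x8 kernel: the two horizontal halves
 * are always scored, and the two bottom quadrants are added when h==16. */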
#define WRAPPER8_16_SQ(name8, name16)\
static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
    int score=0;\
    score +=name8(s, dst           , src           , stride, 8);\
    score +=name8(s, dst+8         , src+8         , stride, 8);\
    if(h==16){\
        dst += 8*stride;\
        src += 8*stride;\
        score +=name8(s, dst           , src           , stride, 8);\
        score +=name8(s, dst+8         , src+8         , stride, 8);\
    }\
    return score;\
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)

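/* Float clipping in the integer domain: IEEE-754 floats of equal sign order
 * the same way as their bit patterns, so when min < 0 < max each value needs
 * only unsigned compares on its raw bits. Negative inputs (sign bit set, so
 * "large" as unsigned) are clipped against mini; flipping the sign bit with
 * a ^ (1U<<31) lets positive inputs be checked against maxisign the same way. */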
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}

static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}

static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
{
    int res = 0;

    while (order--)
        res += *v1++ * *v2++;

    return res;
}

static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;

    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }

    return res;
}

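/* Apply the first half of a symmetric window to both ends of the input:
 * window[i] scales input[i] and input[len-1-i]. Products are Q15 fixed point
 * and rounded with +(1<<14) before the >>15. */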
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w       = window[i];
        output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
    }
}

static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        *dst++ = av_clip(*src++, min, max);
        len -= 8;
    } while (len > 0);
}

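/* Wrappers binding the JPEG reference IDCT, and its 4/2/1-point reduced
 * versions used for lowres decoding, to the idct_put/idct_add interface:
 * _put stores the clamped result, _add sums it onto the prediction. */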
static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
{
    ff_j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
{
    dest[0] = av_clip_uint8((block[0] + 4)>>3);
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
{
    dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
}

/* init static data */
av_cold void ff_dsputil_static_init(void)
{
    int i;

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}

int ff_check_alignment(void){
    static int did_fail=0;
    LOCAL_ALIGNED_16(int, aligned, [4]);

    if((intptr_t)aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}

av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    ff_check_alignment();

#if CONFIG_ENCODERS
    if (avctx->bits_per_raw_sample == 10) {
        c->fdct    = ff_jpeg_fdct_islow_10;
        c->fdct248 = ff_fdct248_islow_10;
    } else {
        if(avctx->dct_algo==FF_DCT_FASTINT) {
            c->fdct    = ff_fdct_ifast;
            c->fdct248 = ff_fdct_ifast248;
        }
        else if(avctx->dct_algo==FF_DCT_FAAN) {
            c->fdct    = ff_faandct;
            c->fdct248 = ff_faandct248;
        }
        else {
            c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
            c->fdct248 = ff_fdct248_islow_8;
        }
    }
#endif //CONFIG_ENCODERS

    if(avctx->lowres==1){
        c->idct_put= ff_jref_idct4_put;
        c->idct_add= ff_jref_idct4_add;
        c->idct    = ff_j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = ff_j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = ff_j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if (avctx->bits_per_raw_sample == 10) {
            c->idct_put              = ff_simple_idct_put_10;
            c->idct_add              = ff_simple_idct_add_10;
            c->idct                  = ff_simple_idct_10;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        } else {
            if(avctx->idct_algo==FF_IDCT_INT){
                c->idct_put= jref_idct_put;
                c->idct_add= jref_idct_add;
                c->idct    = ff_j_rev_dct;
                c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
            }else if(avctx->idct_algo==FF_IDCT_FAAN){
                c->idct_put= ff_faanidct_put;
                c->idct_add= ff_faanidct_add;
                c->idct    = ff_faanidct;
                c->idct_permutation_type= FF_NO_IDCT_PERM;
            }else{ //accurate/default
                c->idct_put = ff_simple_idct_put_8;
                c->idct_add = ff_simple_idct_add_8;
                c->idct     = ff_simple_idct_8;
                c->idct_permutation_type= FF_NO_IDCT_PERM;
            }
        }
    }

    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;

    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;

    /* TODO [0] 16 [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

#undef dspfunc

    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
    ff_dsputil_init_dwt(c);
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->vector_clip_int32 = vector_clip_int32_c;

    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->add_pixels8 = add_pixels8_c;

#define FUNC(f, depth) f ## _ ## depth
#define FUNCC(f, depth) f ## _ ## depth ## _c

    c->draw_edges = FUNCC(draw_edges, 8);
    c->clear_block = FUNCC(clear_block, 8);
    c->clear_blocks = FUNCC(clear_blocks, 8);

#define BIT_DEPTH_FUNCS(depth) \
    c->get_pixels = FUNCC(get_pixels, depth);

    switch (avctx->bits_per_raw_sample) {
    case 9:
    case 10:
    case 12:
    case 14:
        BIT_DEPTH_FUNCS(16);
        break;
    default:
        if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
            BIT_DEPTH_FUNCS(8);
        }
        break;
    }

    if (HAVE_MMX)        ff_dsputil_init_mmx  (c, avctx);
    if (ARCH_ARM)        ff_dsputil_init_arm  (c, avctx);
    if (HAVE_VIS)        ff_dsputil_init_vis  (c, avctx);
    if (ARCH_ALPHA)      ff_dsputil_init_alpha(c, avctx);
    if (ARCH_PPC)        ff_dsputil_init_ppc  (c, avctx);
    if (ARCH_SH4)        ff_dsputil_init_sh4  (c, avctx);
    if (ARCH_BFIN)       ff_dsputil_init_bfin (c, avctx);

    ff_init_scantable_permutation(c->idct_permutation,
                                  c->idct_permutation_type);
}

av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    ff_dsputil_init(c, avctx);
}

av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
{
    ff_dsputil_init(c, avctx);
}