3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "libavutil/internal.h"
34 #include "copy_block.h"
37 #include "simple_idct.h"
40 #include "imgconvert.h"
42 #include "mpegvideo.h"
/* Lookup table of squares: users below index it as sq = ff_squareTbl + 256,
 * so sq[d] == d*d for differences d in [-256, 255].  Declared zeroed here;
 * presumably filled at init time elsewhere -- TODO confirm against init code. */
uint32_t ff_squareTbl[512] = {0, };
49 #include "dsputil_template.c"
53 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 evaluates to 0x0101...01: one copy of the byte replicated into
//  every byte lane of an unsigned long)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer
 * (16-byte aligned; zero at load time, presumably filled at runtime by the
 *  init code -- confirm against the dsputil init path) */
DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (coefficient index for each scan position). */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (coefficient index for each scan position). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Per-row column permutation used when building the FF_SSE2_IDCT_PERM
 * permutation table (see ff_init_scantable_permutation). */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
111 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
112 const uint8_t *src_scantable)
117 st->scantable= src_scantable;
121 j = src_scantable[i];
122 st->permutated[i] = permutation[j];
128 j = st->permutated[i];
130 st->raster_end[i]= end;
134 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
135 int idct_permutation_type)
139 switch(idct_permutation_type){
140 case FF_NO_IDCT_PERM:
142 idct_permutation[i]= i;
144 case FF_LIBMPEG2_IDCT_PERM:
146 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
148 case FF_SIMPLE_IDCT_PERM:
150 idct_permutation[i]= simple_mmx_permutation[i];
152 case FF_TRANSPOSE_IDCT_PERM:
154 idct_permutation[i]= ((i&7)<<3) | (i>>3);
156 case FF_PARTTRANS_IDCT_PERM:
158 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
160 case FF_SSE2_IDCT_PERM:
162 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
165 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/**
 * Sum of all 256 pixels of a 16x16 block.
 * @param pix        top-left pixel of the block
 * @param line_size  stride between rows in bytes
 * @return sum of the pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            /* unrolled by 8 to match the original's access pattern */
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;      /* advance to the next row */
    }
    return s;
}
191 static int pix_norm1_c(uint8_t * pix, int line_size)
194 uint32_t *sq = ff_squareTbl + 256;
197 for (i = 0; i < 16; i++) {
198 for (j = 0; j < 16; j += 8) {
210 register uint64_t x=*(uint64_t*)pix;
212 s += sq[(x>>8)&0xff];
213 s += sq[(x>>16)&0xff];
214 s += sq[(x>>24)&0xff];
215 s += sq[(x>>32)&0xff];
216 s += sq[(x>>40)&0xff];
217 s += sq[(x>>48)&0xff];
218 s += sq[(x>>56)&0xff];
220 register uint32_t x=*(uint32_t*)pix;
222 s += sq[(x>>8)&0xff];
223 s += sq[(x>>16)&0xff];
224 s += sq[(x>>24)&0xff];
225 x=*(uint32_t*)(pix+4);
227 s += sq[(x>>8)&0xff];
228 s += sq[(x>>16)&0xff];
229 s += sq[(x>>24)&0xff];
234 pix += line_size - 16;
/**
 * Byte-swap a buffer of 32-bit words (dst may equal src).
 * Main loop is unrolled by 8; the tail loop handles the remaining w % 8 words.
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int i;

    for (i = 0; i + 8 <= w; i += 8) {
        dst[i + 0] = av_bswap32(src[i + 0]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    for (; i < w; i++)
        dst[i + 0] = av_bswap32(src[i + 0]);
}
/** Byte-swap a buffer of len 16-bit values (dst may equal src). */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
263 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
266 uint32_t *sq = ff_squareTbl + 256;
269 for (i = 0; i < h; i++) {
270 s += sq[pix1[0] - pix2[0]];
271 s += sq[pix1[1] - pix2[1]];
272 s += sq[pix1[2] - pix2[2]];
273 s += sq[pix1[3] - pix2[3]];
280 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
283 uint32_t *sq = ff_squareTbl + 256;
286 for (i = 0; i < h; i++) {
287 s += sq[pix1[0] - pix2[0]];
288 s += sq[pix1[1] - pix2[1]];
289 s += sq[pix1[2] - pix2[2]];
290 s += sq[pix1[3] - pix2[3]];
291 s += sq[pix1[4] - pix2[4]];
292 s += sq[pix1[5] - pix2[5]];
293 s += sq[pix1[6] - pix2[6]];
294 s += sq[pix1[7] - pix2[7]];
301 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
304 uint32_t *sq = ff_squareTbl + 256;
307 for (i = 0; i < h; i++) {
308 s += sq[pix1[ 0] - pix2[ 0]];
309 s += sq[pix1[ 1] - pix2[ 1]];
310 s += sq[pix1[ 2] - pix2[ 2]];
311 s += sq[pix1[ 3] - pix2[ 3]];
312 s += sq[pix1[ 4] - pix2[ 4]];
313 s += sq[pix1[ 5] - pix2[ 5]];
314 s += sq[pix1[ 6] - pix2[ 6]];
315 s += sq[pix1[ 7] - pix2[ 7]];
316 s += sq[pix1[ 8] - pix2[ 8]];
317 s += sq[pix1[ 9] - pix2[ 9]];
318 s += sq[pix1[10] - pix2[10]];
319 s += sq[pix1[11] - pix2[11]];
320 s += sq[pix1[12] - pix2[12]];
321 s += sq[pix1[13] - pix2[13]];
322 s += sq[pix1[14] - pix2[14]];
323 s += sq[pix1[15] - pix2[15]];
331 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
332 const uint8_t *s2, int stride){
335 /* read the pixels */
337 block[0] = s1[0] - s2[0];
338 block[1] = s1[1] - s2[1];
339 block[2] = s1[2] - s2[2];
340 block[3] = s1[3] - s2[3];
341 block[4] = s1[4] - s2[4];
342 block[5] = s1[5] - s2[5];
343 block[6] = s1[6] - s2[6];
344 block[7] = s1[7] - s2[7];
351 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
356 /* read the pixels */
358 pixels[0] = av_clip_uint8(block[0]);
359 pixels[1] = av_clip_uint8(block[1]);
360 pixels[2] = av_clip_uint8(block[2]);
361 pixels[3] = av_clip_uint8(block[3]);
362 pixels[4] = av_clip_uint8(block[4]);
363 pixels[5] = av_clip_uint8(block[5]);
364 pixels[6] = av_clip_uint8(block[6]);
365 pixels[7] = av_clip_uint8(block[7]);
372 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
377 /* read the pixels */
379 pixels[0] = av_clip_uint8(block[0]);
380 pixels[1] = av_clip_uint8(block[1]);
381 pixels[2] = av_clip_uint8(block[2]);
382 pixels[3] = av_clip_uint8(block[3]);
389 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
394 /* read the pixels */
396 pixels[0] = av_clip_uint8(block[0]);
397 pixels[1] = av_clip_uint8(block[1]);
404 static void put_signed_pixels_clamped_c(const int16_t *block,
405 uint8_t *av_restrict pixels,
410 for (i = 0; i < 8; i++) {
411 for (j = 0; j < 8; j++) {
414 else if (*block > 127)
417 *pixels = (uint8_t)(*block + 128);
421 pixels += (line_size - 8);
425 static void add_pixels8_c(uint8_t *av_restrict pixels,
432 pixels[0] += block[0];
433 pixels[1] += block[1];
434 pixels[2] += block[2];
435 pixels[3] += block[3];
436 pixels[4] += block[4];
437 pixels[5] += block[5];
438 pixels[6] += block[6];
439 pixels[7] += block[7];
445 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
450 /* read the pixels */
452 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
453 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
454 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
455 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
456 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
457 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
458 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
459 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
465 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
470 /* read the pixels */
472 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
473 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
474 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
475 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
481 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
486 /* read the pixels */
488 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
489 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
/** Sum of absolute values of all 64 coefficients of a DCT block. */
static int sum_abs_dctelem_c(int16_t *block)
{
    int sum = 0, i;

    for (i = 0; i < 64; i++)
        sum += FFABS(block[i]);
    return sum;
}
/** Fill a 16-wide block of height h with a constant byte value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/** Fill an 8-wide block of height h with a constant byte value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* Rounded averages of 2 / 4 values.  Arguments and the whole expansion are
 * parenthesized so expressions with lower-precedence operators (e.g.
 * avg2(a|b, c)) expand correctly -- the previous definitions left the
 * arguments bare. */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/**
 * One-point global motion compensation: bilinear interpolation of an
 * 8-wide block at 1/16-pel position (x16, y16), h rows.
 * Weights A..D sum to 256, so the >>8 normalizes; rounder is added first.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int i;

    for (i = 0; i < h; i++) {
        dst[0] = (A * src[0] + B * src[1] + C * src[stride + 0] + D * src[stride + 1] + rounder) >> 8;
        dst[1] = (A * src[1] + B * src[2] + C * src[stride + 1] + D * src[stride + 2] + rounder) >> 8;
        dst[2] = (A * src[2] + B * src[3] + C * src[stride + 2] + D * src[stride + 3] + rounder) >> 8;
        dst[3] = (A * src[3] + B * src[4] + C * src[stride + 3] + D * src[stride + 4] + rounder) >> 8;
        dst[4] = (A * src[4] + B * src[5] + C * src[stride + 4] + D * src[stride + 5] + rounder) >> 8;
        dst[5] = (A * src[5] + B * src[6] + C * src[stride + 5] + D * src[stride + 6] + rounder) >> 8;
        dst[6] = (A * src[6] + B * src[7] + C * src[stride + 6] + D * src[stride + 7] + rounder) >> 8;
        dst[7] = (A * src[7] + B * src[8] + C * src[stride + 7] + D * src[stride + 8] + rounder) >> 8;

        dst += stride;
        src += stride;
    }
}
/**
 * Global motion compensation with an affine motion field: for each of the
 * 8 pixels of h rows, the sub-pel source position advances by (dxx,dyx)
 * per column and (dxy,dyy) per row, starting at (ox,oy).  Positions outside
 * the (width x height) source are edge-clamped; r is the rounding constant
 * and shift the sub-pel precision.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s = 1 << shift;

    width--;                        /* convert to last valid coordinate */
    height--;

    for (y = 0; y < h; y++) {
        int x;

        vx = ox;
        vy = oy;
        for (x = 0; x < 8; x++) { // XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x = vx >> 16;
            src_y = vy >> 16;
            frac_x = src_x & (s - 1);
            frac_y = src_y & (s - 1);
            src_x >>= shift;
            src_y >>= shift;

            if ((unsigned)src_x < width) {
                if ((unsigned)src_y < height) {
                    /* fully inside: bilinear interpolation */
                    index = src_x + src_y * stride;
                    dst[y * stride + x] = ((  src[index         ] * (s - frac_x)
                                            + src[index       +1] *      frac_x) * (s - frac_y)
                                           + (src[index+stride  ] * (s - frac_x)
                                            + src[index+stride+1] *      frac_x) *      frac_y
                                           + r) >> (shift * 2);
                } else {
                    /* vertically outside: clamp row, interpolate horizontally */
                    index = src_x + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = ((  src[index  ] * (s - frac_x)
                                            + src[index+1] *      frac_x) * s
                                           + r) >> (shift * 2);
                }
            } else {
                if ((unsigned)src_y < height) {
                    /* horizontally outside: clamp column, interpolate vertically */
                    index = av_clip(src_x, 0, width) + src_y * stride;
                    dst[y * stride + x] = ((  src[index       ] * (s - frac_y)
                                            + src[index+stride] *      frac_y) * s
                                           + r) >> (shift * 2);
                } else {
                    /* both outside: nearest clamped pixel */
                    index = av_clip(src_x, 0, width) + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = src[index];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* Third-pel MC, no sub-pel offset: plain copy dispatched on block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    switch (width) {
    case 2:  put_pixels2_8_c (dst, src, stride, height); break;
    case 4:  put_pixels4_8_c (dst, src, stride, height); break;
    case 8:  put_pixels8_8_c (dst, src, stride, height); break;
    case 16: put_pixels16_8_c(dst, src, stride, height); break;
    }
}
/* Third-pel MC, horizontal offset 1/3: dst = round((2*a + b) / 3) via the
 * 683/2048 fixed-point approximation of 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683 * (2 * src[j] + src[j + 1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, horizontal offset 2/3: dst = round((a + 2*b) / 3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683 * (src[j] + 2 * src[j + 1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, vertical offset 1/3: dst = round((2*top + bottom) / 3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683 * (2 * src[j] + src[j + stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, offset (1/3, 1/3): bilinear with weights 4:3:3:2 over 12,
 * using the 2731/32768 fixed-point approximation of 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731 * (4 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 2 * src[j + stride + 1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, offset (1/3, 2/3): bilinear with weights 3:2:4:3 over 12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731 * (3 * src[j] + 2 * src[j + 1] + 4 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, vertical offset 2/3: dst = round((top + 2*bottom) / 3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683 * (src[j] + 2 * src[j + stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, offset (2/3, 1/3): bilinear with weights 3:4:2:3 over 12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731 * (3 * src[j] + 4 * src[j + 1] + 2 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, offset (2/3, 2/3): bilinear with weights 2:3:3:4 over 12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731 * (2 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 4 * src[j + stride + 1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC averaging variant, no sub-pel offset: dispatch on width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    switch (width) {
    case 2:  avg_pixels2_8_c (dst, src, stride, height); break;
    case 4:  avg_pixels4_8_c (dst, src, stride, height); break;
    case 8:  avg_pixels8_8_c (dst, src, stride, height); break;
    case 16: avg_pixels16_8_c(dst, src, stride, height); break;
    }
}
/* Averaging third-pel MC, horizontal offset 1/3: rounded average of dst
 * with the mc10 interpolation result. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683 * (2 * src[j] + src[j + 1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging third-pel MC, horizontal offset 2/3. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683 * (src[j] + 2 * src[j + 1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging third-pel MC, vertical offset 1/3. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683 * (2 * src[j] + src[j + stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging third-pel MC, offset (1/3, 1/3): weights 4:3:3:2 over 12. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731 * (4 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 2 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging third-pel MC, offset (1/3, 2/3): weights 3:2:4:3 over 12. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731 * (3 * src[j] + 2 * src[j + 1] + 4 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging third-pel MC, vertical offset 2/3. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683 * (src[j] + 2 * src[j + stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging third-pel MC, offset (2/3, 1/3): weights 3:4:2:3 over 12. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731 * (3 * src[j] + 4 * src[j + 1] + 2 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging third-pel MC, offset (2/3, 2/3): weights 2:3:3:4 over 12. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731 * (2 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 4 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
801 #define QPEL_MC(r, OPNAME, RND, OP) \
802 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
803 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
807 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
808 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
809 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
810 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
811 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
812 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
813 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
814 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
820 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
822 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
826 const int src0= src[0*srcStride];\
827 const int src1= src[1*srcStride];\
828 const int src2= src[2*srcStride];\
829 const int src3= src[3*srcStride];\
830 const int src4= src[4*srcStride];\
831 const int src5= src[5*srcStride];\
832 const int src6= src[6*srcStride];\
833 const int src7= src[7*srcStride];\
834 const int src8= src[8*srcStride];\
835 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
836 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
837 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
838 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
839 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
840 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
841 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
842 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
848 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
849 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
854 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
855 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
856 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
857 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
858 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
859 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
860 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
861 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
862 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
863 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
864 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
865 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
866 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
867 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
868 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
869 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
875 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
876 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
881 const int src0= src[0*srcStride];\
882 const int src1= src[1*srcStride];\
883 const int src2= src[2*srcStride];\
884 const int src3= src[3*srcStride];\
885 const int src4= src[4*srcStride];\
886 const int src5= src[5*srcStride];\
887 const int src6= src[6*srcStride];\
888 const int src7= src[7*srcStride];\
889 const int src8= src[8*srcStride];\
890 const int src9= src[9*srcStride];\
891 const int src10= src[10*srcStride];\
892 const int src11= src[11*srcStride];\
893 const int src12= src[12*srcStride];\
894 const int src13= src[13*srcStride];\
895 const int src14= src[14*srcStride];\
896 const int src15= src[15*srcStride];\
897 const int src16= src[16*srcStride];\
898 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
899 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
900 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
901 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
902 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
903 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
904 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
905 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
906 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
907 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
908 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
909 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
910 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
911 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
912 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
913 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
919 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
922 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
923 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
926 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
928 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
931 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
934 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
935 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
938 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
942 copy_block9(full, src, 16, stride, 9);\
943 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
944 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
947 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
950 copy_block9(full, src, 16, stride, 9);\
951 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
954 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
958 copy_block9(full, src, 16, stride, 9);\
959 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
960 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
962 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
968 copy_block9(full, src, 16, stride, 9);\
969 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
970 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
971 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
972 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
974 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
979 copy_block9(full, src, 16, stride, 9);\
980 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
981 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
982 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
983 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
985 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
991 copy_block9(full, src, 16, stride, 9);\
992 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
993 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
994 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
995 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
997 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1001 uint8_t halfHV[64];\
1002 copy_block9(full, src, 16, stride, 9);\
1003 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1004 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1005 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1006 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1008 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1010 uint8_t full[16*9];\
1013 uint8_t halfHV[64];\
1014 copy_block9(full, src, 16, stride, 9);\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1016 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1020 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1022 uint8_t full[16*9];\
1024 uint8_t halfHV[64];\
1025 copy_block9(full, src, 16, stride, 9);\
1026 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1027 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1028 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1031 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1033 uint8_t full[16*9];\
1036 uint8_t halfHV[64];\
1037 copy_block9(full, src, 16, stride, 9);\
1038 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1039 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1040 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1041 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1043 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1045 uint8_t full[16*9];\
1047 uint8_t halfHV[64];\
1048 copy_block9(full, src, 16, stride, 9);\
1049 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1050 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1051 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1052 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1054 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1057 uint8_t halfHV[64];\
1058 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1059 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1060 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1062 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1065 uint8_t halfHV[64];\
1066 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1067 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1068 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1070 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1072 uint8_t full[16*9];\
1075 uint8_t halfHV[64];\
1076 copy_block9(full, src, 16, stride, 9);\
1077 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1078 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1079 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1080 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1082 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1084 uint8_t full[16*9];\
1086 copy_block9(full, src, 16, stride, 9);\
1087 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1088 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1089 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1091 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1093 uint8_t full[16*9];\
1096 uint8_t halfHV[64];\
1097 copy_block9(full, src, 16, stride, 9);\
1098 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1099 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1100 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1101 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1103 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1105 uint8_t full[16*9];\
1107 copy_block9(full, src, 16, stride, 9);\
1108 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1109 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1110 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1112 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1115 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1116 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1119 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1122 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1123 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1126 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1128 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1131 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1134 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1135 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1138 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1140 uint8_t full[24*17];\
1142 copy_block17(full, src, 24, stride, 17);\
1143 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1144 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1147 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1149 uint8_t full[24*17];\
1150 copy_block17(full, src, 24, stride, 17);\
1151 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1154 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1156 uint8_t full[24*17];\
1158 copy_block17(full, src, 24, stride, 17);\
1159 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1160 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1162 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1164 uint8_t full[24*17];\
1165 uint8_t halfH[272];\
1166 uint8_t halfV[256];\
1167 uint8_t halfHV[256];\
1168 copy_block17(full, src, 24, stride, 17);\
1169 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1170 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1171 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1172 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1174 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1176 uint8_t full[24*17];\
1177 uint8_t halfH[272];\
1178 uint8_t halfHV[256];\
1179 copy_block17(full, src, 24, stride, 17);\
1180 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1181 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1182 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1183 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1185 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1187 uint8_t full[24*17];\
1188 uint8_t halfH[272];\
1189 uint8_t halfV[256];\
1190 uint8_t halfHV[256];\
1191 copy_block17(full, src, 24, stride, 17);\
1192 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1193 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1194 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1195 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1197 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1199 uint8_t full[24*17];\
1200 uint8_t halfH[272];\
1201 uint8_t halfHV[256];\
1202 copy_block17(full, src, 24, stride, 17);\
1203 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1204 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1205 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1206 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1208 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1210 uint8_t full[24*17];\
1211 uint8_t halfH[272];\
1212 uint8_t halfV[256];\
1213 uint8_t halfHV[256];\
1214 copy_block17(full, src, 24, stride, 17);\
1215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1216 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1217 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1218 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1220 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1222 uint8_t full[24*17];\
1223 uint8_t halfH[272];\
1224 uint8_t halfHV[256];\
1225 copy_block17(full, src, 24, stride, 17);\
1226 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1227 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1228 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1229 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1231 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1233 uint8_t full[24*17];\
1234 uint8_t halfH[272];\
1235 uint8_t halfV[256];\
1236 uint8_t halfHV[256];\
1237 copy_block17(full, src, 24, stride, 17);\
1238 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1239 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1240 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1241 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1243 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1245 uint8_t full[24*17];\
1246 uint8_t halfH[272];\
1247 uint8_t halfHV[256];\
1248 copy_block17(full, src, 24, stride, 17);\
1249 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1250 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1251 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1252 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1254 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1256 uint8_t halfH[272];\
1257 uint8_t halfHV[256];\
1258 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1259 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1260 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1262 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1264 uint8_t halfH[272];\
1265 uint8_t halfHV[256];\
1266 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1267 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1268 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1270 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1272 uint8_t full[24*17];\
1273 uint8_t halfH[272];\
1274 uint8_t halfV[256];\
1275 uint8_t halfHV[256];\
1276 copy_block17(full, src, 24, stride, 17);\
1277 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1278 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1279 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1280 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1282 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1284 uint8_t full[24*17];\
1285 uint8_t halfH[272];\
1286 copy_block17(full, src, 24, stride, 17);\
1287 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1288 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1289 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1291 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1293 uint8_t full[24*17];\
1294 uint8_t halfH[272];\
1295 uint8_t halfV[256];\
1296 uint8_t halfHV[256];\
1297 copy_block17(full, src, 24, stride, 17);\
1298 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1299 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1300 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1301 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1303 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1305 uint8_t full[24*17];\
1306 uint8_t halfH[272];\
1307 copy_block17(full, src, 24, stride, 17);\
1308 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1309 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1310 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1312 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1314 uint8_t halfH[272];\
1315 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1316 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel-store operators plugged into QPEL_MC: each takes the raw lowpass
 * filter output (b), clips it through the crop table cm, and either stores
 * it (put) or averages it with the existing destination pixel (avg).
 * The *_no_rnd variants add 15 instead of 16 before the >>5, i.e. they
 * round toward zero (no-rounding mode used by some MPEG-4 modes). */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

/* Instantiate the quarter-pel MC function families for the three
 * store modes (rounding put, no-rounding put, rounding avg). */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)

#undef op_avg_no_rnd
#undef op_put_no_rnd
/**
 * Copy one 8x8 pixel block from src to dst.
 * Fixed-size wrapper around the variable-height 8-bit pixels template.
 * (Restores the brace-delimited body lost in this truncated listing.)
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}
/**
 * Average one 8x8 pixel block from src into dst (rounded mean of the two).
 * Fixed-size wrapper around the variable-height 8-bit pixels template.
 */
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}
/**
 * Copy one 16x16 pixel block from src to dst.
 * Fixed-size wrapper around the variable-height 8-bit pixels template.
 */
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}
/**
 * Average one 16x16 pixel block from src into dst.
 * Fixed-size wrapper around the variable-height 8-bit pixels template.
 */
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
/* The (0,0) quarter-pel position needs no interpolation at all, so the
 * mc00 cases map directly onto the plain block copy/average helpers. */
#define put_qpel8_mc00_c ff_put_pixels8x8_c
#define avg_qpel8_mc00_c ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1357 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1358 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1362 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1363 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1364 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1365 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1366 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1367 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1368 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1369 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1375 #if CONFIG_RV40_DECODER
/** RV40 (3,3) chroma-style position: plain center-of-four-pixels copy. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
/** RV40 (3,3) position, averaging variant. */
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
/** RV40 (3,3) position, 8x8 copy variant. */
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
/** RV40 (3,3) position, 8x8 averaging variant. */
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
1392 #endif /* CONFIG_RV40_DECODER */
1394 #if CONFIG_DIRAC_DECODER
/* Dirac pixel-op wrappers: forward each width (8/16/32) and each blend
 * depth (plain copy, 2-way and 4-way linear blend of reference planes)
 * to the generic 8-bit pixel templates. The 32-wide cases are done as two
 * side-by-side 16-wide calls. src[] carries up to four reference pointers.
 * (The brace continuation lines were restored from the truncated listing.) */
#define DIRAC_MC(OPNAME)\
void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_8_c(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l2_8(dst   , src[0]   , src[1]   , stride, stride, stride, h);\
    OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_l4_8(dst   , src[0]   , src[1]   , src[2]   , src[3]   , stride, stride, stride, stride, stride, h);\
    OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
}
1439 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1440 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1444 const int src_1= src[ -srcStride];
1445 const int src0 = src[0 ];
1446 const int src1 = src[ srcStride];
1447 const int src2 = src[2*srcStride];
1448 const int src3 = src[3*srcStride];
1449 const int src4 = src[4*srcStride];
1450 const int src5 = src[5*srcStride];
1451 const int src6 = src[6*srcStride];
1452 const int src7 = src[7*srcStride];
1453 const int src8 = src[8*srcStride];
1454 const int src9 = src[9*srcStride];
1455 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1456 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1457 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1458 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1459 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1460 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1461 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1462 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/** WMV2 mspel (1,0): horizontal lowpass, then average with the source. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}
/** WMV2 mspel (2,0): pure horizontal half-pel lowpass, straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/** WMV2 mspel (3,0): horizontal lowpass averaged with the pixel to the right. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
}
/** WMV2 mspel (0,2): pure vertical half-pel lowpass, straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/**
 * WMV2 mspel (1,2): horizontal pass over 11 rows (one above, two below),
 * a vertical pass on the source, a vertical pass on the horizontal result,
 * then the average of the two vertical results.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/** WMV2 mspel (3,2): like mc12 but the plain vertical pass starts at src+1. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/** WMV2 mspel (2,2): horizontal pass over 11 rows, then a vertical pass. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];

    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
/**
 * Sum of absolute differences over a 16-wide block of h rows (SAD).
 * @param v unused context pointer (me_cmp_func signature)
 * @return the accumulated SAD
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of pix1 against pix2 shifted half a pixel right (avg2 of horizontal
 * neighbours); reads pix2[16] on each row.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD of pix1 against pix2 shifted half a pixel down (avg2 of vertical
 * neighbours); reads one extra row below the block.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD of pix1 against pix2 shifted half a pixel diagonally (avg4 of the
 * 2x2 neighbourhood); reads one extra row and one extra column.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * Sum of absolute differences over an 8-wide block of h rows (SAD).
 * @param v unused context pointer (me_cmp_func signature)
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/** 8-wide SAD against the half-pel right-shifted reference (avg2). */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/** 8-wide SAD against the half-pel down-shifted reference (avg2). */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/** 8-wide SAD against the half-pel diagonally shifted reference (avg4). */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
1719 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1720 MpegEncContext *c = v;
1726 for(x=0; x<16; x++){
1727 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1730 for(x=0; x<15; x++){
1731 score2+= FFABS( s1[x ] - s1[x +stride]
1732 - s1[x+1] + s1[x+1+stride])
1733 -FFABS( s2[x ] - s2[x +stride]
1734 - s2[x+1] + s2[x+1+stride]);
1741 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1742 else return score1 + FFABS(score2)*8;
1745 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1746 MpegEncContext *c = v;
1753 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1757 score2+= FFABS( s1[x ] - s1[x +stride]
1758 - s1[x+1] + s1[x+1+stride])
1759 -FFABS( s2[x ] - s2[x +stride]
1760 - s2[x+1] + s2[x+1+stride]);
1767 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1768 else return score1 + FFABS(score2)*8;
1771 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1775 for(i=0; i<8*8; i++){
1776 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1779 av_assert2(-512<b && b<512);
1781 sum += (w*b)*(w*b)>>4;
1786 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1789 for(i=0; i<8*8; i++){
1790 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
/** Dummy compare function: every candidate scores 0 (disables the metric). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
/**
 * Fill cmp[0..5] (one slot per block size) with the compare function
 * selected by 'type' from the DSPContext tables.
 * NOTE(review): this excerpt is truncated -- the loop over i, the switch
 * on the metric type and most of its case labels are missing here; only a
 * few table selections and the error path are visible.
 */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
/* clear all six slots before selection */
memset(cmp, 0, sizeof(void*)*6);
/* per-metric table lookups (partial in this excerpt) */
cmp[i]= c->hadamard8_diff[i];
cmp[i]= c->dct_sad[i];
cmp[i]= c->dct264_sad[i];
cmp[i]= c->dct_max[i];
cmp[i]= c->quant_psnr[i];
/* reached when 'type' matches no known metric */
av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1858 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1860 for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
1861 long a = *(long*)(src+i);
1862 long b = *(long*)(dst+i);
1863 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1866 dst[i+0] += src[i+0];
1869 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
1871 #if !HAVE_FAST_UNALIGNED
1872 if((long)src2 & (sizeof(long)-1)){
1873 for(i=0; i+7<w; i+=8){
1874 dst[i+0] = src1[i+0]-src2[i+0];
1875 dst[i+1] = src1[i+1]-src2[i+1];
1876 dst[i+2] = src1[i+2]-src2[i+2];
1877 dst[i+3] = src1[i+3]-src2[i+3];
1878 dst[i+4] = src1[i+4]-src2[i+4];
1879 dst[i+5] = src1[i+5]-src2[i+5];
1880 dst[i+6] = src1[i+6]-src2[i+6];
1881 dst[i+7] = src1[i+7]-src2[i+7];
1885 for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
1886 long a = *(long*)(src1+i);
1887 long b = *(long*)(src2+i);
1888 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1891 dst[i+0] = src1[i+0]-src2[i+0];
/**
 * HuffYUV median-prediction decode: reconstruct one row by adding the
 * residual 'diff' to the median of (left, above, left+above-aboveleft),
 * all mod 256. *left/*left_top carry the running state across calls.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l  = *left;
    lt = *left_top;

    for (i = 0; i < w; i++) {
        l = mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt = src1[i];   /* current 'above' becomes next 'above-left' */
        dst[i] = l;
    }

    *left     = l;
    *left_top = lt;
}
/**
 * HuffYUV median-prediction encode: emit the residual of src2 against the
 * median predictor built from src1 (row above) and the running left/
 * left_top state. Inverse of add_hfyu_median_prediction_c.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l  = *left;
    lt = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt = src1[i];
        l  = src2[i];
        dst[i] = l - pred;
    }

    *left     = l;
    *left_top = lt;
}
/**
 * HuffYUV left-prediction decode: running sum of src into dst (each output
 * byte is the accumulated total mod 256). The main loop is unrolled by two;
 * the tail loop finishes an odd-length row.
 * @param acc accumulator carried in from the previous row/segment
 * @return the updated accumulator
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for (i = 0; i < w-1; i++) {
        acc += src[i];
        dst[i] = acc;
        i++;
        acc += src[i];
        dst[i] = acc;
    }

    for (; i < w; i++) {
        acc += src[i];
        dst[i] = acc;
    }

    return acc;
}
/* HuffYUV left-prediction decode for packed BGR32: running per-channel
 * sums carried through *red/*green/*blue/*alpha.
 * NOTE(review): the function body (≈30 lines) is missing from this
 * truncated excerpt -- only the signature survives. */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard-transform building blocks:
 * BUTTERFLY2 writes sum/difference of two inputs to two outputs,
 * BUTTERFLY1 replaces (x,y) in place with (x+y, x-y),
 * BUTTERFLYA yields |x+y| + |x-y| (final accumulation stage). */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/**
 * SATD: sum of absolute values of the 8x8 Hadamard transform of the
 * difference block src - dst. Rows are transformed first (with the
 * subtraction fused into the first butterfly stage), then columns; the
 * final column stage is folded into the BUTTERFLYA accumulation.
 * (Declarations, loop headers and the return were restored from the
 * truncated listing.)
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum = 0;

    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for (i = 0; i < 8; i++) {
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
/**
 * Intra SATD: Hadamard transform of the source block itself (no reference
 * subtraction), with the DC-proportional term removed at the end so the
 * score reflects AC energy only. 'dummy' exists to match me_cmp_func.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum = 0;

    for (i = 0; i < 8; i++) {
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for (i = 0; i < 8; i++) {
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2097 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2098 MpegEncContext * const s= (MpegEncContext *)c;
2099 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2103 s->dsp.diff_pixels(temp, src1, src2, stride);
2105 return s->dsp.sum_abs_dctelem(temp);
/* One pass of the H.264 8x8 integer forward transform over eight samples
 * accessed through the SRC()/DST() macros (defined at each use site so the
 * same body serves both the row and the column pass). a0..a3 come from the
 * even (sum) half, a4..a7 from the odd (difference) half.
 * (The #define header and the DST(0)/DST(4) lines were restored from the
 * truncated listing.) */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
2136 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2137 MpegEncContext * const s= (MpegEncContext *)c;
2142 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2144 #define SRC(x) dct[i][x]
2145 #define DST(x,v) dct[i][x]= v
2146 for( i = 0; i < 8; i++ )
2151 #define SRC(x) dct[x][i]
2152 #define DST(x,v) sum += FFABS(v)
2153 for( i = 0; i < 8; i++ )
2161 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2162 MpegEncContext * const s= (MpegEncContext *)c;
2163 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2168 s->dsp.diff_pixels(temp, src1, src2, stride);
2172 sum= FFMAX(sum, FFABS(temp[i]));
2177 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2178 MpegEncContext * const s= (MpegEncContext *)c;
2179 LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2180 int16_t * const bak = temp+64;
2186 s->dsp.diff_pixels(temp, src1, src2, stride);
2188 memcpy(bak, temp, 64*sizeof(int16_t));
2190 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2191 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2192 ff_simple_idct_8(temp); //FIXME
2195 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/**
 * Rate-distortion metric: quantize the 8x8 difference, estimate the bit
 * cost of coding the run/level pairs with the active VLC tables, then
 * dequantize + IDCT and measure the reconstruction SSE. Returns
 * distortion plus a lambda-scaled bit cost.
 * NOTE(review): this excerpt is truncated -- the intra/inter branch
 * structure, the run/level scan loop bodies and several closing braces
 * are missing; only the surviving lines are shown below.
 */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
MpegEncContext * const s= (MpegEncContext *)c;
const uint8_t *scantable= s->intra_scantable.permutated;
LOCAL_ALIGNED_16(int16_t, temp, [64]);
LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
int i, last, run, bits, level, distortion, start_i;
const int esc_length= s->ac_esc_length;
uint8_t * last_length;
/* work on local copies so the reconstruction can be compared in place */
copy_block8(lsrc1, src1, 8, stride, 8);
copy_block8(lsrc2, src2, 8, stride, 8);
s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: separate DC VLC table */
length = s->intra_ac_vlc_length;
last_length= s->intra_ac_vlc_last_length;
bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter path */
length = s->inter_ac_vlc_length;
last_length= s->inter_ac_vlc_last_length;
/* run/level bit-cost estimation over the scan order */
for(i=start_i; i<last; i++){
int j= scantable[i];
if((level&(~127)) == 0){
bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* cost of the final (last) coefficient */
level= temp[i] + 64;
av_assert2(level - 64);
if((level&(~127)) == 0){
bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* reconstruct and measure distortion */
s->dct_unquantize_intra(s, temp, 0, s->qscale);
s->dct_unquantize_inter(s, temp, 0, s->qscale);
s->dsp.idct_add(lsrc2, 8, temp);
distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/**
 * Bit-cost metric: quantize the 8x8 difference and return only the
 * estimated number of bits needed to code the run/level pairs (no
 * distortion term, unlike rd8x8_c).
 * NOTE(review): this excerpt is truncated -- the intra/inter branch, the
 * scan-loop bodies and the closing return are missing.
 */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
MpegEncContext * const s= (MpegEncContext *)c;
const uint8_t *scantable= s->intra_scantable.permutated;
LOCAL_ALIGNED_16(int16_t, temp, [64]);
int i, last, run, bits, level, start_i;
const int esc_length= s->ac_esc_length;
uint8_t * last_length;
s->dsp.diff_pixels(temp, src1, src2, stride);
s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: separate DC VLC table */
length = s->intra_ac_vlc_length;
last_length= s->intra_ac_vlc_last_length;
bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter path */
length = s->inter_ac_vlc_length;
last_length= s->inter_ac_vlc_last_length;
/* run/level bit-cost estimation over the scan order */
for(i=start_i; i<last; i++){
int j= scantable[i];
if((level&(~127)) == 0){
bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* cost of the final (last) coefficient */
level= temp[i] + 64;
av_assert2(level - 64);
if((level&(~127)) == 0){
bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Intra vertical SAD: sum of absolute differences between each row and
 * the row above it (a measure of vertical activity within one block).
 * Generates vsad_intra8_c / vsad_intra16_c depending on 'size'. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
 \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
 \
    return score; \
}
/**
 * Inter vertical SAD: absolute difference between the vertical gradients
 * of the two blocks, summed over 16 columns and h-1 row pairs.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int x, y;

    for (y = 1; y < h; y++) {
        for (x = 0; x < 16; x++) {
            score+= FFABS(s1[x] - s2[x] - s1[x+stride] + s2[x+stride]);
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
/* SQ squares its argument once (argument fully parenthesized). */
#define SQ(a) ((a)*(a))

/* Intra vertical SSE: squared difference between each row and the row
 * above it. Generates vsse_intra8_c / vsse_intra16_c via 'size'. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
 \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
 \
    return score; \
}
/* Vertical sum of squared errors of the difference image s1 - s2,
 * 16-wide: squared counterpart of vsad16_c(). */
2387 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2392         for(x=0; x<16; x++){
2393             score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 array and an int16 array of
 * the same length (used e.g. for comparing quantized vs. reference data). */
2402 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2406     for(i=0; i<size; i++)
2407         score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Generates a 16x16 cmp function from an 8x8 one by summing the 8x8
 * function over the four quadrants (top pair shown; bottom pair and the
 * h==16 handling are presumably in the omitted macro lines).
 * No comments inside the macro body: lines are backslash-continued. */
2411 #define WRAPPER8_16_SQ(name8, name16)\
2412 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
2414     score +=name8(s, dst           , src           , stride, 8);\
2415     score +=name8(s, dst+8         , src+8         , stride, 8);\
2419     score +=name8(s, dst           , src           , stride, 8);\
2420     score +=name8(s, dst+8         , src+8         , stride, 8);\
2425 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2426 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2427 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2429 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2431 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2432 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2433 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2434 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Clip one float, reinterpreted as its IEEE-754 bit pattern, against a
 * negative min and positive max: for the bit patterns involved, a plain
 * unsigned compare orders negative floats above positives, so 'a > mini'
 * catches values below min, and XOR-ing the sign bit lets the second
 * compare catch values above max.  Only valid when min < 0 < max (see
 * vector_clipf_c).  The fall-through "return a" is presumably in an
 * omitted line. */
2436 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2437                         uint32_t maxi, uint32_t maxisign)
2440     if(a > mini) return mini;
2441     else if((a^(1U<<31)) > maxisign) return maxi;
/* Clip a float vector to [*min, *max] where min < 0 < max, operating on
 * the raw IEEE-754 bit patterns (integer compares, no FP ops).
 * NOTE(review): the float->uint32_t pointer casts violate strict aliasing;
 * kept as-is since this long predates aliasing-clean FFmpeg code.
 * len is presumably a multiple of 8 (loop is unrolled by 8, no tail). */
2445 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2447     uint32_t mini = *(uint32_t*)min;
2448     uint32_t maxi = *(uint32_t*)max;
2449     uint32_t maxisign = maxi ^ (1U<<31);
2450     uint32_t *dsti = (uint32_t*)dst;
2451     const uint32_t *srci = (const uint32_t*)src;
2452     for(i=0; i<len; i+=8) {
2453         dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2454         dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2455         dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2456         dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2457         dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2458         dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2459         dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2460         dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip each element of src to [min, max], 8 elements per iteration.
 * When min and max straddle zero, dispatch to the integer-compare fast
 * path; otherwise use av_clipf directly (else branch presumably in an
 * omitted line).  len is presumably a multiple of 8. */
2463 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2465     if(min < 0 && max > 0) {
2466         vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2468         for(i=0; i < len; i+=8) {
2469             dst[i    ] = av_clipf(src[i    ], min, max);
2470             dst[i + 1] = av_clipf(src[i + 1], min, max);
2471             dst[i + 2] = av_clipf(src[i + 2], min, max);
2472             dst[i + 3] = av_clipf(src[i + 3], min, max);
2473             dst[i + 4] = av_clipf(src[i + 4], min, max);
2474             dst[i + 5] = av_clipf(src[i + 5], min, max);
2475             dst[i + 6] = av_clipf(src[i + 6], min, max);
2476             dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Dot product of two int16 vectors of length 'order'; result accumulated
 * in 32 bits. */
2481 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2486         res += *v1++ * *v2++;
/* Fused op: accumulate v1.v2 while also updating v1 in place with
 * v1[i] += mul * v3[i] (the accumulate statement is in an omitted line). */
2491 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2496         *v1++ += mul * *v3++;
/* Apply a symmetric int16 window to an int16 signal: only the first half
 * of 'window' is stored; element i is applied to both input[i] and its
 * mirror input[len-i-1].  Products are Q15 with round-to-nearest
 * (+ 1<<14 before the >>15 shift). */
2501 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2502                                  const int16_t *window, unsigned int len)
2505     int len2 = len >> 1;
2507     for (i = 0; i < len2; i++) {
2508         int16_t w       = window[i];
2509         output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
2510         output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/* Clip each int32 element of src to [min, max]; unrolled by 8, so len is
 * presumably a multiple of 8 (the enclosing do/while loop appears to be
 * in omitted lines). */
2514 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2515                                 int32_t max, unsigned int len)
2518         *dst++ = av_clip(*src++, min, max);
2519         *dst++ = av_clip(*src++, min, max);
2520         *dst++ = av_clip(*src++, min, max);
2521         *dst++ = av_clip(*src++, min, max);
2522         *dst++ = av_clip(*src++, min, max);
2523         *dst++ = av_clip(*src++, min, max);
2524         *dst++ = av_clip(*src++, min, max);
2525         *dst++ = av_clip(*src++, min, max);
/* 8x8 jpeg-reference IDCT followed by clamped store: put overwrites dest,
 * add accumulates into it.  These adapt ff_j_rev_dct's in-place interface
 * to the DSPContext idct_put/idct_add signatures. */
2530 static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2532     ff_j_rev_dct (block);
2533     put_pixels_clamped_c(block, dest, line_size);
2535 static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2537     ff_j_rev_dct (block);
2538     add_pixels_clamped_c(block, dest, line_size);
/* 4x4 reduced-size IDCT wrappers used for lowres==1 decoding. */
2541 static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
2543     ff_j_rev_dct4 (block);
2544     put_pixels_clamped4_c(block, dest, line_size);
2546 static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
2548     ff_j_rev_dct4 (block);
2549     add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 reduced-size IDCT wrappers used for lowres==2 decoding. */
2552 static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
2554     ff_j_rev_dct2 (block);
2555     put_pixels_clamped2_c(block, dest, line_size);
2557 static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
2559     ff_j_rev_dct2 (block);
2560     add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 "IDCT" for lowres==3: only the DC coefficient survives, so the
 * transform degenerates to a rounded shift of block[0]. */
2563 static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
2565     dest[0] = av_clip_uint8((block[0] + 4)>>3);
2567 static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
2569     dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2572 /* init static data */
/* Fill the process-wide lookup tables: ff_squareTbl[i] = (i-256)^2 for
 * fast squared-error computation, and the MMX-quantizer inverse zigzag
 * table (stored as index+1 so 0 can mean "absent"). */
2573 av_cold void ff_dsputil_static_init(void)
2577     for(i=0;i<512;i++) {
2578         ff_squareTbl[i] = (i - 256) * (i - 256);
2581     for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Sanity-check that the compiler honors 16-byte stack alignment for
 * LOCAL_ALIGNED_16 variables; logs a one-time warning on MMX/AltiVec
 * builds if it does not (did_fail presumably guards repeat logging). */
2584 int ff_check_alignment(void){
2585     static int did_fail=0;
2586     LOCAL_ALIGNED_16(int, aligned, [4]);
2588     if((intptr_t)aligned & 15){
2590 #if HAVE_MMX || HAVE_ALTIVEC
2591             av_log(NULL, AV_LOG_ERROR,
2592                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2593                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2594                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2595                 "Do not report crashes to FFmpeg developers.\n");
/**
 * Fill a DSPContext with the C reference implementations, selecting FDCT
 * and IDCT variants from avctx settings (dct_algo, idct_algo, lowres,
 * bits_per_raw_sample), then let per-architecture init functions override
 * entries with optimized versions.
 */
2604 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2606     ff_check_alignment();
/* ---- forward DCT selection (encoder only; #if presumably omitted) ---- */
2609     if (avctx->bits_per_raw_sample == 10) {
2610         c->fdct    = ff_jpeg_fdct_islow_10;
2611         c->fdct248 = ff_fdct248_islow_10;
2613         if(avctx->dct_algo==FF_DCT_FASTINT) {
2614             c->fdct    = ff_fdct_ifast;
2615             c->fdct248 = ff_fdct_ifast248;
2617         else if(avctx->dct_algo==FF_DCT_FAAN) {
2618             c->fdct    = ff_faandct;
2619             c->fdct248 = ff_faandct248;
2622             c->fdct    = ff_jpeg_fdct_islow_8; //slow/accurate/default
2623             c->fdct248 = ff_fdct248_islow_8;
2626 #endif //CONFIG_ENCODERS
/* ---- inverse DCT selection: lowres uses reduced-size transforms ---- */
2628     if(avctx->lowres==1){
2629         c->idct_put= ff_jref_idct4_put;
2630         c->idct_add= ff_jref_idct4_add;
2631         c->idct    = ff_j_rev_dct4;
2632         c->idct_permutation_type= FF_NO_IDCT_PERM;
2633     }else if(avctx->lowres==2){
2634         c->idct_put= ff_jref_idct2_put;
2635         c->idct_add= ff_jref_idct2_add;
2636         c->idct    = ff_j_rev_dct2;
2637         c->idct_permutation_type= FF_NO_IDCT_PERM;
2638     }else if(avctx->lowres==3){
2639         c->idct_put= ff_jref_idct1_put;
2640         c->idct_add= ff_jref_idct1_add;
2641         c->idct    = ff_j_rev_dct1;
2642         c->idct_permutation_type= FF_NO_IDCT_PERM;
/* high bit depth uses the simple_idct family regardless of idct_algo */
2644         if (avctx->bits_per_raw_sample == 10) {
2645             c->idct_put              = ff_simple_idct_put_10;
2646             c->idct_add              = ff_simple_idct_add_10;
2647             c->idct                  = ff_simple_idct_10;
2648             c->idct_permutation_type = FF_NO_IDCT_PERM;
2649         } else if (avctx->bits_per_raw_sample == 12) {
2650             c->idct_put              = ff_simple_idct_put_12;
2651             c->idct_add              = ff_simple_idct_add_12;
2652             c->idct                  = ff_simple_idct_12;
2653             c->idct_permutation_type = FF_NO_IDCT_PERM;
2655         if(avctx->idct_algo==FF_IDCT_INT){
2656             c->idct_put= jref_idct_put;
2657             c->idct_add= jref_idct_add;
2658             c->idct    = ff_j_rev_dct;
2659             c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2660         }else if(avctx->idct_algo==FF_IDCT_FAAN){
2661             c->idct_put= ff_faanidct_put;
2662             c->idct_add= ff_faanidct_add;
2663             c->idct    = ff_faanidct;
2664             c->idct_permutation_type= FF_NO_IDCT_PERM;
2665         }else{ //accurate/default
2666             c->idct_put = ff_simple_idct_put_8;
2667             c->idct_add = ff_simple_idct_add_8;
2668             c->idct     = ff_simple_idct_8;
2669             c->idct_permutation_type= FF_NO_IDCT_PERM;
/* ---- basic pixel block operations ---- */
2674     c->diff_pixels = diff_pixels_c;
2675     c->put_pixels_clamped = put_pixels_clamped_c;
2676     c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2677     c->add_pixels_clamped = add_pixels_clamped_c;
2678     c->sum_abs_dctelem = sum_abs_dctelem_c;
2681     c->pix_sum = pix_sum_c;
2682     c->pix_norm1 = pix_norm1_c;
2684     c->fill_block_tab[0] = fill_block16_c;
2685     c->fill_block_tab[1] = fill_block8_c;
/* ---- SAD with half-pel variants; [0]=16x16, [1]=8x8 ---- */
2687     /* TODO [0] 16  [1] 8 */
2688     c->pix_abs[0][0] = pix_abs16_c;
2689     c->pix_abs[0][1] = pix_abs16_x2_c;
2690     c->pix_abs[0][2] = pix_abs16_y2_c;
2691     c->pix_abs[0][3] = pix_abs16_xy2_c;
2692     c->pix_abs[1][0] = pix_abs8_c;
2693     c->pix_abs[1][1] = pix_abs8_x2_c;
2694     c->pix_abs[1][2] = pix_abs8_y2_c;
2695     c->pix_abs[1][3] = pix_abs8_xy2_c;
/* ---- third-pel motion compensation (indices encode x/y subpel) ---- */
2697     c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2698     c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2699     c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2700     c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2701     c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2702     c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2703     c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2704     c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2705     c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2707     c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2708     c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2709     c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2710     c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2711     c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2712     c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2713     c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2714     c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2715     c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* Fill a 16-entry quarter-pel table for one prefix/size; the index is
 * y*4 + x subpel position.  No comments inside: backslash-continued. */
2717 #define dspfunc(PFX, IDX, NUM) \
2718     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2719     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2720     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2721     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2722     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2723     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2724     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2725     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2726     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2727     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2728     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2729     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2730     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2731     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2732     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2733     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2735     dspfunc(put_qpel, 0, 16);
2736     dspfunc(put_no_rnd_qpel, 0, 16);
2738     dspfunc(avg_qpel, 0, 16);
2739     /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2741     dspfunc(put_qpel, 1, 8);
2742     dspfunc(put_no_rnd_qpel, 1, 8);
2744     dspfunc(avg_qpel, 1, 8);
2745     /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* ---- WMV2 mspel motion compensation ---- */
2749     c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2750     c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2751     c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2752     c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2753     c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2754     c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2755     c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2756     c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* Sets [0] to the 16x16 and [1] to the 8x8 variant of a cmp function. */
2758 #define SET_CMP_FUNC(name) \
2759     c->name[0]= name ## 16_c;\
2760     c->name[1]= name ## 8x8_c;
2762     SET_CMP_FUNC(hadamard8_diff)
2763     c->hadamard8_diff[4]= hadamard8_intra16_c;
2764     c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2765     SET_CMP_FUNC(dct_sad)
2766     SET_CMP_FUNC(dct_max)
2768     SET_CMP_FUNC(dct264_sad)
2770     c->sad[0]= pix_abs16_c;
2771     c->sad[1]= pix_abs8_c;
2775     SET_CMP_FUNC(quant_psnr)
/* vertical-activity based cmp functions ([4]/[5] are intra variants) */
2778     c->vsad[0]= vsad16_c;
2779     c->vsad[4]= vsad_intra16_c;
2780     c->vsad[5]= vsad_intra8_c;
2781     c->vsse[0]= vsse16_c;
2782     c->vsse[4]= vsse_intra16_c;
2783     c->vsse[5]= vsse_intra8_c;
2784     c->nsse[0]= nsse16_c;
2785     c->nsse[1]= nsse8_c;
2786 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2787     ff_dsputil_init_dwt(c);
2790     c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* ---- huffyuv / lossless prediction helpers ---- */
2792     c->add_bytes= add_bytes_c;
2793     c->diff_bytes= diff_bytes_c;
2794     c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2795     c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2796     c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
2797     c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2798     c->bswap_buf= bswap_buf;
2799     c->bswap16_buf = bswap16_buf;
2801         c->try_8x8basis= try_8x8basis_c;
2802         c->add_8x8basis= add_8x8basis_c;
/* ---- generic vector helpers ---- */
2804     c->vector_clipf = vector_clipf_c;
2805     c->scalarproduct_int16 = scalarproduct_int16_c;
2806     c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2807     c->apply_window_int16 = apply_window_int16_c;
2808     c->vector_clip_int32 = vector_clip_int32_c;
/* shrink[i]: plane downscale by 2^i (0 = plain copy) */
2810     c->shrink[0]= av_image_copy_plane;
2811     c->shrink[1]= ff_shrink22;
2812     c->shrink[2]= ff_shrink44;
2813     c->shrink[3]= ff_shrink88;
2815     c->add_pixels8 = add_pixels8_c;
/* ---- bit-depth-parametrized functions ---- */
2819 #define FUNC(f, depth) f ## _ ## depth
2820 #define FUNCC(f, depth) f ## _ ## depth ## _c
2822     c->draw_edges = FUNCC(draw_edges, 8);
2823     c->clear_block = FUNCC(clear_block, 8);
2824     c->clear_blocks = FUNCC(clear_blocks, 8);
2826 #define BIT_DEPTH_FUNCS(depth) \
2827     c->get_pixels = FUNCC(get_pixels, depth);
2829     switch (avctx->bits_per_raw_sample) {
2834         BIT_DEPTH_FUNCS(16);
2837     if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
/* ---- per-architecture overrides (each guarded by ARCH_* presumably) ---- */
2845         ff_dsputil_init_alpha(c, avctx);
2847         ff_dsputil_init_arm(c, avctx);
2849         ff_dsputil_init_bfin(c, avctx);
2851         ff_dsputil_init_ppc(c, avctx);
2853         ff_dsputil_init_sh4(c, avctx);
2855         ff_dsputil_init_vis(c, avctx);
2857         ff_dsputil_init_x86(c, avctx);
/* build the scantable permutation matching the chosen IDCT */
2859     ff_init_scantable_permutation(c->idct_permutation,
2860                                   c->idct_permutation_type);
/* Old public entry point kept for ABI compatibility; simply forwards to
 * ff_dsputil_init(). */
2863 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2865     ff_dsputil_init(c, avctx);
2868 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2870 ff_dsputil_init(c, avctx);