/*
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/imgutils.h"
#include "libavutil/internal.h"
#include "copy_block.h"
#include "imgconvert.h"
#include "mpegvideo.h"
#include "simple_idct.h"
44 uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_template.c"
51 #include "dsputil_template.c"
53 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
54 #define pb_7f (~0UL/255 * 0x7f)
55 #define pb_80 (~0UL/255 * 0x80)
57 /* Specific zigzag scan for 248 idct. NOTE that unlike the
58 specification, we interleave the fields */
59 const uint8_t ff_zigzag248_direct[64] = {
60 0, 8, 1, 9, 16, 24, 2, 10,
61 17, 25, 32, 40, 48, 56, 33, 41,
62 18, 26, 3, 11, 4, 12, 19, 27,
63 34, 42, 49, 57, 50, 58, 35, 43,
64 20, 28, 5, 13, 6, 14, 21, 29,
65 36, 44, 51, 59, 52, 60, 37, 45,
66 22, 30, 7, 15, 23, 31, 38, 46,
67 53, 61, 54, 62, 39, 47, 55, 63,
70 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
71 DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
73 const uint8_t ff_alternate_horizontal_scan[64] = {
74 0, 1, 2, 3, 8, 9, 16, 17,
75 10, 11, 4, 5, 6, 7, 15, 14,
76 13, 12, 19, 18, 24, 25, 32, 33,
77 26, 27, 20, 21, 22, 23, 28, 29,
78 30, 31, 34, 35, 40, 41, 48, 49,
79 42, 43, 36, 37, 38, 39, 44, 45,
80 46, 47, 50, 51, 56, 57, 58, 59,
81 52, 53, 54, 55, 60, 61, 62, 63,
84 const uint8_t ff_alternate_vertical_scan[64] = {
85 0, 8, 16, 24, 1, 9, 2, 10,
86 17, 25, 32, 40, 48, 56, 57, 49,
87 41, 33, 26, 18, 3, 11, 4, 12,
88 19, 27, 34, 42, 50, 58, 35, 43,
89 51, 59, 20, 28, 5, 13, 6, 14,
90 21, 29, 36, 44, 52, 60, 37, 45,
91 53, 61, 22, 30, 7, 15, 23, 31,
92 38, 46, 54, 62, 39, 47, 55, 63,
95 /* Input permutation for the simple_idct_mmx */
96 static const uint8_t simple_mmx_permutation[64]={
97 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
98 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
99 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
100 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
101 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
102 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
103 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
104 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
107 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
109 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
113 st->scantable= src_scantable;
117 j = src_scantable[i];
118 st->permutated[i] = permutation[j];
124 j = st->permutated[i];
126 st->raster_end[i]= end;
130 void ff_init_scantable_permutation(uint8_t *idct_permutation,
131 int idct_permutation_type)
135 switch(idct_permutation_type){
136 case FF_NO_IDCT_PERM:
138 idct_permutation[i]= i;
140 case FF_LIBMPEG2_IDCT_PERM:
142 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
144 case FF_SIMPLE_IDCT_PERM:
146 idct_permutation[i]= simple_mmx_permutation[i];
148 case FF_TRANSPOSE_IDCT_PERM:
150 idct_permutation[i]= ((i&7)<<3) | (i>>3);
152 case FF_PARTTRANS_IDCT_PERM:
154 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
156 case FF_SSE2_IDCT_PERM:
158 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
161 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
165 static int pix_sum_c(uint8_t * pix, int line_size)
170 for (i = 0; i < 16; i++) {
171 for (j = 0; j < 16; j += 8) {
182 pix += line_size - 16;
187 static int pix_norm1_c(uint8_t * pix, int line_size)
190 uint32_t *sq = ff_squareTbl + 256;
193 for (i = 0; i < 16; i++) {
194 for (j = 0; j < 16; j += 8) {
206 register uint64_t x=*(uint64_t*)pix;
208 s += sq[(x>>8)&0xff];
209 s += sq[(x>>16)&0xff];
210 s += sq[(x>>24)&0xff];
211 s += sq[(x>>32)&0xff];
212 s += sq[(x>>40)&0xff];
213 s += sq[(x>>48)&0xff];
214 s += sq[(x>>56)&0xff];
216 register uint32_t x=*(uint32_t*)pix;
218 s += sq[(x>>8)&0xff];
219 s += sq[(x>>16)&0xff];
220 s += sq[(x>>24)&0xff];
221 x=*(uint32_t*)(pix+4);
223 s += sq[(x>>8)&0xff];
224 s += sq[(x>>16)&0xff];
225 s += sq[(x>>24)&0xff];
230 pix += line_size - 16;
235 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
238 for(i=0; i+8<=w; i+=8){
239 dst[i+0]= av_bswap32(src[i+0]);
240 dst[i+1]= av_bswap32(src[i+1]);
241 dst[i+2]= av_bswap32(src[i+2]);
242 dst[i+3]= av_bswap32(src[i+3]);
243 dst[i+4]= av_bswap32(src[i+4]);
244 dst[i+5]= av_bswap32(src[i+5]);
245 dst[i+6]= av_bswap32(src[i+6]);
246 dst[i+7]= av_bswap32(src[i+7]);
249 dst[i+0]= av_bswap32(src[i+0]);
253 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
256 *dst++ = av_bswap16(*src++);
259 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
262 uint32_t *sq = ff_squareTbl + 256;
265 for (i = 0; i < h; i++) {
266 s += sq[pix1[0] - pix2[0]];
267 s += sq[pix1[1] - pix2[1]];
268 s += sq[pix1[2] - pix2[2]];
269 s += sq[pix1[3] - pix2[3]];
276 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
279 uint32_t *sq = ff_squareTbl + 256;
282 for (i = 0; i < h; i++) {
283 s += sq[pix1[0] - pix2[0]];
284 s += sq[pix1[1] - pix2[1]];
285 s += sq[pix1[2] - pix2[2]];
286 s += sq[pix1[3] - pix2[3]];
287 s += sq[pix1[4] - pix2[4]];
288 s += sq[pix1[5] - pix2[5]];
289 s += sq[pix1[6] - pix2[6]];
290 s += sq[pix1[7] - pix2[7]];
297 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
300 uint32_t *sq = ff_squareTbl + 256;
303 for (i = 0; i < h; i++) {
304 s += sq[pix1[ 0] - pix2[ 0]];
305 s += sq[pix1[ 1] - pix2[ 1]];
306 s += sq[pix1[ 2] - pix2[ 2]];
307 s += sq[pix1[ 3] - pix2[ 3]];
308 s += sq[pix1[ 4] - pix2[ 4]];
309 s += sq[pix1[ 5] - pix2[ 5]];
310 s += sq[pix1[ 6] - pix2[ 6]];
311 s += sq[pix1[ 7] - pix2[ 7]];
312 s += sq[pix1[ 8] - pix2[ 8]];
313 s += sq[pix1[ 9] - pix2[ 9]];
314 s += sq[pix1[10] - pix2[10]];
315 s += sq[pix1[11] - pix2[11]];
316 s += sq[pix1[12] - pix2[12]];
317 s += sq[pix1[13] - pix2[13]];
318 s += sq[pix1[14] - pix2[14]];
319 s += sq[pix1[15] - pix2[15]];
327 static void diff_pixels_c(int16_t *restrict block, const uint8_t *s1,
328 const uint8_t *s2, int stride){
331 /* read the pixels */
333 block[0] = s1[0] - s2[0];
334 block[1] = s1[1] - s2[1];
335 block[2] = s1[2] - s2[2];
336 block[3] = s1[3] - s2[3];
337 block[4] = s1[4] - s2[4];
338 block[5] = s1[5] - s2[5];
339 block[6] = s1[6] - s2[6];
340 block[7] = s1[7] - s2[7];
348 static void put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
353 /* read the pixels */
355 pixels[0] = av_clip_uint8(block[0]);
356 pixels[1] = av_clip_uint8(block[1]);
357 pixels[2] = av_clip_uint8(block[2]);
358 pixels[3] = av_clip_uint8(block[3]);
359 pixels[4] = av_clip_uint8(block[4]);
360 pixels[5] = av_clip_uint8(block[5]);
361 pixels[6] = av_clip_uint8(block[6]);
362 pixels[7] = av_clip_uint8(block[7]);
369 static void put_signed_pixels_clamped_c(const int16_t *block,
370 uint8_t *restrict pixels,
375 for (i = 0; i < 8; i++) {
376 for (j = 0; j < 8; j++) {
379 else if (*block > 127)
382 *pixels = (uint8_t)(*block + 128);
386 pixels += (line_size - 8);
390 static void add_pixels8_c(uint8_t *restrict pixels,
397 pixels[0] += block[0];
398 pixels[1] += block[1];
399 pixels[2] += block[2];
400 pixels[3] += block[3];
401 pixels[4] += block[4];
402 pixels[5] += block[5];
403 pixels[6] += block[6];
404 pixels[7] += block[7];
410 static void add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels,
415 /* read the pixels */
417 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
418 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
419 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
420 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
421 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
422 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
423 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
424 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
430 static int sum_abs_dctelem_c(int16_t *block)
434 sum+= FFABS(block[i]);
438 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
442 for (i = 0; i < h; i++) {
443 memset(block, value, 16);
448 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
452 for (i = 0; i < h; i++) {
453 memset(block, value, 8);
458 #define avg2(a,b) ((a+b+1)>>1)
459 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
461 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
463 const int A=(16-x16)*(16-y16);
464 const int B=( x16)*(16-y16);
465 const int C=(16-x16)*( y16);
466 const int D=( x16)*( y16);
471 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
472 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
473 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
474 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
475 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
476 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
477 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
478 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
484 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
485 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
488 const int s= 1<<shift;
498 for(x=0; x<8; x++){ //XXX FIXME optimize
499 int src_x, src_y, frac_x, frac_y, index;
508 if((unsigned)src_x < width){
509 if((unsigned)src_y < height){
510 index= src_x + src_y*stride;
511 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
512 + src[index +1]* frac_x )*(s-frac_y)
513 + ( src[index+stride ]*(s-frac_x)
514 + src[index+stride+1]* frac_x )* frac_y
517 index= src_x + av_clip(src_y, 0, height)*stride;
518 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
519 + src[index +1]* frac_x )*s
523 if((unsigned)src_y < height){
524 index= av_clip(src_x, 0, width) + src_y*stride;
525 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
526 + src[index+stride ]* frac_y )*s
529 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
530 dst[y*stride + x]= src[index ];
542 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
544 case 2: put_pixels2_8_c (dst, src, stride, height); break;
545 case 4: put_pixels4_8_c (dst, src, stride, height); break;
546 case 8: put_pixels8_8_c (dst, src, stride, height); break;
547 case 16:put_pixels16_8_c(dst, src, stride, height); break;
551 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
553 for (i=0; i < height; i++) {
554 for (j=0; j < width; j++) {
555 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
562 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
564 for (i=0; i < height; i++) {
565 for (j=0; j < width; j++) {
566 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
573 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
575 for (i=0; i < height; i++) {
576 for (j=0; j < width; j++) {
577 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
584 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
586 for (i=0; i < height; i++) {
587 for (j=0; j < width; j++) {
588 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
595 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
597 for (i=0; i < height; i++) {
598 for (j=0; j < width; j++) {
599 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
606 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
608 for (i=0; i < height; i++) {
609 for (j=0; j < width; j++) {
610 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
617 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
619 for (i=0; i < height; i++) {
620 for (j=0; j < width; j++) {
621 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
628 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
630 for (i=0; i < height; i++) {
631 for (j=0; j < width; j++) {
632 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
639 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
641 case 2: avg_pixels2_8_c (dst, src, stride, height); break;
642 case 4: avg_pixels4_8_c (dst, src, stride, height); break;
643 case 8: avg_pixels8_8_c (dst, src, stride, height); break;
644 case 16:avg_pixels16_8_c(dst, src, stride, height); break;
648 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
650 for (i=0; i < height; i++) {
651 for (j=0; j < width; j++) {
652 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
659 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
661 for (i=0; i < height; i++) {
662 for (j=0; j < width; j++) {
663 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
670 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
672 for (i=0; i < height; i++) {
673 for (j=0; j < width; j++) {
674 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
681 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
683 for (i=0; i < height; i++) {
684 for (j=0; j < width; j++) {
685 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
692 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
694 for (i=0; i < height; i++) {
695 for (j=0; j < width; j++) {
696 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
703 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
705 for (i=0; i < height; i++) {
706 for (j=0; j < width; j++) {
707 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
714 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
716 for (i=0; i < height; i++) {
717 for (j=0; j < width; j++) {
718 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
725 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
727 for (i=0; i < height; i++) {
728 for (j=0; j < width; j++) {
729 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
736 #define QPEL_MC(r, OPNAME, RND, OP) \
737 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
738 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
742 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
743 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
744 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
745 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
746 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
747 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
748 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
749 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
755 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
757 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
761 const int src0= src[0*srcStride];\
762 const int src1= src[1*srcStride];\
763 const int src2= src[2*srcStride];\
764 const int src3= src[3*srcStride];\
765 const int src4= src[4*srcStride];\
766 const int src5= src[5*srcStride];\
767 const int src6= src[6*srcStride];\
768 const int src7= src[7*srcStride];\
769 const int src8= src[8*srcStride];\
770 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
771 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
772 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
773 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
774 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
775 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
776 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
777 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
783 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
784 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
789 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
790 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
791 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
792 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
793 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
794 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
795 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
796 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
797 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
798 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
799 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
800 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
801 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
802 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
803 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
804 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
810 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
811 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
816 const int src0= src[0*srcStride];\
817 const int src1= src[1*srcStride];\
818 const int src2= src[2*srcStride];\
819 const int src3= src[3*srcStride];\
820 const int src4= src[4*srcStride];\
821 const int src5= src[5*srcStride];\
822 const int src6= src[6*srcStride];\
823 const int src7= src[7*srcStride];\
824 const int src8= src[8*srcStride];\
825 const int src9= src[9*srcStride];\
826 const int src10= src[10*srcStride];\
827 const int src11= src[11*srcStride];\
828 const int src12= src[12*srcStride];\
829 const int src13= src[13*srcStride];\
830 const int src14= src[14*srcStride];\
831 const int src15= src[15*srcStride];\
832 const int src16= src[16*srcStride];\
833 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
834 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
835 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
836 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
837 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
838 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
839 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
840 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
841 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
842 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
843 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
844 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
845 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
846 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
847 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
848 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
854 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
857 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
858 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
861 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
863 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
866 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
869 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
870 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
873 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
877 copy_block9(full, src, 16, stride, 9);\
878 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
879 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
882 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
885 copy_block9(full, src, 16, stride, 9);\
886 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
889 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
893 copy_block9(full, src, 16, stride, 9);\
894 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
895 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
897 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
903 copy_block9(full, src, 16, stride, 9);\
904 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
905 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
906 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
907 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
909 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
914 copy_block9(full, src, 16, stride, 9);\
915 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
916 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
917 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
918 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
920 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
926 copy_block9(full, src, 16, stride, 9);\
927 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
928 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
929 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
930 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
932 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
937 copy_block9(full, src, 16, stride, 9);\
938 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
939 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
940 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
941 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
943 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
949 copy_block9(full, src, 16, stride, 9);\
950 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
951 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
952 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
953 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
955 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
960 copy_block9(full, src, 16, stride, 9);\
961 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
962 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
963 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
964 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
966 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
972 copy_block9(full, src, 16, stride, 9);\
973 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
974 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
975 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
976 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
978 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
983 copy_block9(full, src, 16, stride, 9);\
984 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
985 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
986 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
987 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
989 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
993 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
994 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
995 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
997 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1000 uint8_t halfHV[64];\
1001 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1002 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1003 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1005 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1007 uint8_t full[16*9];\
1010 uint8_t halfHV[64];\
1011 copy_block9(full, src, 16, stride, 9);\
1012 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1013 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1014 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1015 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1017 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1019 uint8_t full[16*9];\
1021 copy_block9(full, src, 16, stride, 9);\
1022 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1023 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1024 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1026 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1028 uint8_t full[16*9];\
1031 uint8_t halfHV[64];\
1032 copy_block9(full, src, 16, stride, 9);\
1033 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1034 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1035 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1036 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1038 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1040 uint8_t full[16*9];\
1042 copy_block9(full, src, 16, stride, 9);\
1043 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1044 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1045 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1047 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1050 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1051 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1054 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1057 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1058 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1061 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1063 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1066 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1069 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1070 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1073 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1075 uint8_t full[24*17];\
1077 copy_block17(full, src, 24, stride, 17);\
1078 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1079 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1082 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1084 uint8_t full[24*17];\
1085 copy_block17(full, src, 24, stride, 17);\
1086 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1089 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1091 uint8_t full[24*17];\
1093 copy_block17(full, src, 24, stride, 17);\
1094 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1095 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1097 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1099 uint8_t full[24*17];\
1100 uint8_t halfH[272];\
1101 uint8_t halfV[256];\
1102 uint8_t halfHV[256];\
1103 copy_block17(full, src, 24, stride, 17);\
1104 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1105 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1106 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1107 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1109 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1111 uint8_t full[24*17];\
1112 uint8_t halfH[272];\
1113 uint8_t halfHV[256];\
1114 copy_block17(full, src, 24, stride, 17);\
1115 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1116 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1117 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1118 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1120 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1122 uint8_t full[24*17];\
1123 uint8_t halfH[272];\
1124 uint8_t halfV[256];\
1125 uint8_t halfHV[256];\
1126 copy_block17(full, src, 24, stride, 17);\
1127 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1128 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1129 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1130 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1132 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1134 uint8_t full[24*17];\
1135 uint8_t halfH[272];\
1136 uint8_t halfHV[256];\
1137 copy_block17(full, src, 24, stride, 17);\
1138 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1139 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1140 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1141 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1143 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1145 uint8_t full[24*17];\
1146 uint8_t halfH[272];\
1147 uint8_t halfV[256];\
1148 uint8_t halfHV[256];\
1149 copy_block17(full, src, 24, stride, 17);\
1150 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1151 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1152 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1153 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1155 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1157 uint8_t full[24*17];\
1158 uint8_t halfH[272];\
1159 uint8_t halfHV[256];\
1160 copy_block17(full, src, 24, stride, 17);\
1161 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1162 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1163 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1164 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1166 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1168 uint8_t full[24*17];\
1169 uint8_t halfH[272];\
1170 uint8_t halfV[256];\
1171 uint8_t halfHV[256];\
1172 copy_block17(full, src, 24, stride, 17);\
1173 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1174 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1175 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1176 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1178 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1180 uint8_t full[24*17];\
1181 uint8_t halfH[272];\
1182 uint8_t halfHV[256];\
1183 copy_block17(full, src, 24, stride, 17);\
1184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1185 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1186 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1187 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1189 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1191 uint8_t halfH[272];\
1192 uint8_t halfHV[256];\
1193 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1194 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1195 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1197 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1199 uint8_t halfH[272];\
1200 uint8_t halfHV[256];\
1201 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1202 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1203 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1205 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1207 uint8_t full[24*17];\
1208 uint8_t halfH[272];\
1209 uint8_t halfV[256];\
1210 uint8_t halfHV[256];\
1211 copy_block17(full, src, 24, stride, 17);\
1212 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1213 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1214 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1215 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1217 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1219 uint8_t full[24*17];\
1220 uint8_t halfH[272];\
1221 copy_block17(full, src, 24, stride, 17);\
1222 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1223 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1224 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1226 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1228 uint8_t full[24*17];\
1229 uint8_t halfH[272];\
1230 uint8_t halfV[256];\
1231 uint8_t halfHV[256];\
1232 copy_block17(full, src, 24, stride, 17);\
1233 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1234 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1235 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1236 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1238 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1240 uint8_t full[24*17];\
1241 uint8_t halfH[272];\
1242 copy_block17(full, src, 24, stride, 17);\
1243 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1244 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1245 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1247 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1249 uint8_t halfH[272];\
1250 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1251 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1254 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1255 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1256 #define op_put(a, b) a = cm[((b) + 16)>>5]
1257 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1259 QPEL_MC(0, put_ , _ , op_put)
1260 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1261 QPEL_MC(0, avg_ , _ , op_avg)
1262 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1264 #undef op_avg_no_rnd
1266 #undef op_put_no_rnd
/* Fixed-size copy/average wrappers around the width-parameterized template
 * functions; used as the qpel mc00 (integer-pel) cases.
 * Defect fixed: braces and bodies were lost in this chunk. */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}

void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}

void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}

void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
1285 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1286 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1287 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1288 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1289 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1290 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1292 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1293 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1297 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1298 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1299 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1300 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1301 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1302 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1303 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1304 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
#if CONFIG_RV40_DECODER
/* RV40 (3,3) quarter-pel positions are plain xy2 (center half-pel)
 * averages; these wrappers reuse the template pixel functions.
 * Defect fixed: braces and bodies were lost in this chunk. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1329 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1330 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1334 const int src_1= src[ -srcStride];
1335 const int src0 = src[0 ];
1336 const int src1 = src[ srcStride];
1337 const int src2 = src[2*srcStride];
1338 const int src3 = src[3*srcStride];
1339 const int src4 = src[4*srcStride];
1340 const int src5 = src[5*srcStride];
1341 const int src6 = src[6*srcStride];
1342 const int src7 = src[7*srcStride];
1343 const int src8 = src[8*srcStride];
1344 const int src9 = src[9*srcStride];
1345 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1346 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1347 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1348 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1349 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1350 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1351 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1352 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel (half-pel) motion-compensation positions for 8x8 blocks;
 * combinations of the h/v lowpass filters and the l2 averaging helper.
 * Buffer sizes: half/halfV/halfHV are 8x8 (64); halfH is 8x11 (88)
 * because the vertical filter needs rows -1..9.
 * Defect fixed: opening braces and local buffer declarations were lost. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
1409 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1410 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1412 const int strength= ff_h263_loop_filter_strength[qscale];
1416 int p0= src[x-2*stride];
1417 int p1= src[x-1*stride];
1418 int p2= src[x+0*stride];
1419 int p3= src[x+1*stride];
1420 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1422 if (d<-2*strength) d1= 0;
1423 else if(d<- strength) d1=-2*strength - d;
1424 else if(d< strength) d1= d;
1425 else if(d< 2*strength) d1= 2*strength - d;
1430 if(p1&256) p1= ~(p1>>31);
1431 if(p2&256) p2= ~(p2>>31);
1433 src[x-1*stride] = p1;
1434 src[x+0*stride] = p2;
1438 d2= av_clip((p0-p3)/4, -ad1, ad1);
1440 src[x-2*stride] = p0 - d2;
1441 src[x+ stride] = p3 + d2;
1446 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1447 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1449 const int strength= ff_h263_loop_filter_strength[qscale];
1453 int p0= src[y*stride-2];
1454 int p1= src[y*stride-1];
1455 int p2= src[y*stride+0];
1456 int p3= src[y*stride+1];
1457 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1459 if (d<-2*strength) d1= 0;
1460 else if(d<- strength) d1=-2*strength - d;
1461 else if(d< strength) d1= d;
1462 else if(d< 2*strength) d1= 2*strength - d;
1467 if(p1&256) p1= ~(p1>>31);
1468 if(p2&256) p2= ~(p2>>31);
1470 src[y*stride-1] = p1;
1471 src[y*stride+0] = p2;
1475 d2= av_clip((p0-p3)/4, -ad1, ad1);
1477 src[y*stride-2] = p0 - d2;
1478 src[y*stride+1] = p3 + d2;
/* Sum of absolute differences over a 16-wide block of h rows (SAD).
 * v is the unused MpegEncContext slot of the me_cmp_func signature.
 * Defect fixed: the row loop and the return were lost in this chunk. */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* SAD of pix1 against the half-pel-interpolated pix2: x2 averages
 * horizontally adjacent pixels, y2 vertically adjacent rows, xy2 the
 * 2x2 neighbourhood; avg2/avg4 are rounding-average macros defined
 * earlier in this file.
 * Defect fixed: row loops, pointer advances and returns were lost. */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* Sum of absolute differences over an 8-wide block of h rows (SAD).
 * Defect fixed: the row loop and the return were lost in this chunk. */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* 8-wide half-pel SAD variants (horizontal, vertical, diagonal), using the
 * file-level avg2/avg4 rounding-average macros.
 * Defect fixed: row loops, pointer advances and returns were lost. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
1683 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1684 MpegEncContext *c = v;
1690 for(x=0; x<16; x++){
1691 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1694 for(x=0; x<15; x++){
1695 score2+= FFABS( s1[x ] - s1[x +stride]
1696 - s1[x+1] + s1[x+1+stride])
1697 -FFABS( s2[x ] - s2[x +stride]
1698 - s2[x+1] + s2[x+1+stride]);
1705 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1706 else return score1 + FFABS(score2)*8;
1709 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1710 MpegEncContext *c = v;
1717 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1721 score2+= FFABS( s1[x ] - s1[x +stride]
1722 - s1[x+1] + s1[x+1+stride])
1723 -FFABS( s2[x ] - s2[x +stride]
1724 - s2[x+1] + s2[x+1+stride]);
1731 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1732 else return score1 + FFABS(score2)*8;
1735 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1739 for(i=0; i<8*8; i++){
1740 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1743 assert(-512<b && b<512);
1745 sum += (w*b)*(w*b)>>4;
1750 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1753 for(i=0; i<8*8; i++){
1754 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1758 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
1762 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1765 memset(cmp, 0, sizeof(void*)*6);
1773 cmp[i]= c->hadamard8_diff[i];
1779 cmp[i]= c->dct_sad[i];
1782 cmp[i]= c->dct264_sad[i];
1785 cmp[i]= c->dct_max[i];
1788 cmp[i]= c->quant_psnr[i];
1809 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1814 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1816 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1817 long a = *(long*)(src+i);
1818 long b = *(long*)(dst+i);
1819 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1822 dst[i+0] += src[i+0];
1825 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1827 #if !HAVE_FAST_UNALIGNED
1828 if((long)src2 & (sizeof(long)-1)){
1829 for(i=0; i+7<w; i+=8){
1830 dst[i+0] = src1[i+0]-src2[i+0];
1831 dst[i+1] = src1[i+1]-src2[i+1];
1832 dst[i+2] = src1[i+2]-src2[i+2];
1833 dst[i+3] = src1[i+3]-src2[i+3];
1834 dst[i+4] = src1[i+4]-src2[i+4];
1835 dst[i+5] = src1[i+5]-src2[i+5];
1836 dst[i+6] = src1[i+6]-src2[i+6];
1837 dst[i+7] = src1[i+7]-src2[i+7];
1841 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1842 long a = *(long*)(src1+i);
1843 long b = *(long*)(src2+i);
1844 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1847 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction: reconstruct (add) or form (sub) the residual
 * against mid_pred(left, top, left+top-topleft); left/left_top carry the
 * running predictor state across calls and are updated on return.
 * Defect fixed: state load/store, loop headers and braces were lost. */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
/* HuffYUV left prediction: running prefix sum of src into dst (each byte
 * truncated to 8 bits on store), starting from acc; the main loop is
 * unrolled by two with a single-step tail. Returns the final accumulator
 * for the next call.
 * Defect fixed: the unrolled loop body, tail loop and return were lost. */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}
1915 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers: BUTTERFLY2 writes sum/difference of two
 * inputs into two outputs; BUTTERFLY1 does the same in place; BUTTERFLYA
 * returns |x+y| + |x-y|.
 * Defect fixed: the macro bodies were lost in this chunk. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the src-dst difference, returning the
 * sum of absolute transform coefficients. First pass transforms each row
 * of differences, second pass transforms columns and accumulates.
 * Defect fixed: locals, loop headers, the sum accumulation header and the
 * return were lost in this chunk. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
/* Intra SATD: 8x8 Hadamard transform of the source block itself, with the
 * DC coefficient (mean) subtracted from the total so only AC energy is
 * scored. Layout mirrors hadamard8_diff8x8_c.
 * Defect fixed: locals, loop headers, the sum accumulation header and the
 * return were lost in this chunk. */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2053 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2054 MpegEncContext * const s= (MpegEncContext *)c;
2055 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2059 s->dsp.diff_pixels(temp, src1, src2, stride);
2061 return s->dsp.sum_abs_dctelem(temp);
/* One-dimensional 8-point H.264 high-profile integer DCT, expressed over
 * user-supplied SRC(x)/DST(x,v) macros so it can run on rows or columns.
 * Defect fixed: the #define header, the DST(0)/DST(4) lines and the
 * closing brace were lost in this chunk. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
2092 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2093 MpegEncContext * const s= (MpegEncContext *)c;
2098 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2100 #define SRC(x) dct[i][x]
2101 #define DST(x,v) dct[i][x]= v
2102 for( i = 0; i < 8; i++ )
2107 #define SRC(x) dct[x][i]
2108 #define DST(x,v) sum += FFABS(v)
2109 for( i = 0; i < 8; i++ )
2117 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2118 MpegEncContext * const s= (MpegEncContext *)c;
2119 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2124 s->dsp.diff_pixels(temp, src1, src2, stride);
2128 sum= FFMAX(sum, FFABS(temp[i]));
2133 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2134 MpegEncContext * const s= (MpegEncContext *)c;
2135 LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2136 int16_t * const bak = temp+64;
2142 s->dsp.diff_pixels(temp, src1, src2, stride);
2144 memcpy(bak, temp, 64*sizeof(int16_t));
2146 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2147 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2148 ff_simple_idct_8(temp); //FIXME
2151 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2156 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2157 MpegEncContext * const s= (MpegEncContext *)c;
2158 const uint8_t *scantable= s->intra_scantable.permutated;
2159 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2160 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2161 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2162 int i, last, run, bits, level, distortion, start_i;
2163 const int esc_length= s->ac_esc_length;
2165 uint8_t * last_length;
2169 copy_block8(lsrc1, src1, 8, stride, 8);
2170 copy_block8(lsrc2, src2, 8, stride, 8);
2172 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2174 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2180 length = s->intra_ac_vlc_length;
2181 last_length= s->intra_ac_vlc_last_length;
2182 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2185 length = s->inter_ac_vlc_length;
2186 last_length= s->inter_ac_vlc_last_length;
2191 for(i=start_i; i<last; i++){
2192 int j= scantable[i];
2197 if((level&(~127)) == 0){
2198 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2207 level= temp[i] + 64;
2211 if((level&(~127)) == 0){
2212 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2220 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2222 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2225 s->dsp.idct_add(lsrc2, 8, temp);
2227 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2229 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2232 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2233 MpegEncContext * const s= (MpegEncContext *)c;
2234 const uint8_t *scantable= s->intra_scantable.permutated;
2235 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2236 int i, last, run, bits, level, start_i;
2237 const int esc_length= s->ac_esc_length;
2239 uint8_t * last_length;
2243 s->dsp.diff_pixels(temp, src1, src2, stride);
2245 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2251 length = s->intra_ac_vlc_length;
2252 last_length= s->intra_ac_vlc_last_length;
2253 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2256 length = s->inter_ac_vlc_length;
2257 last_length= s->inter_ac_vlc_last_length;
2262 for(i=start_i; i<last; i++){
2263 int j= scantable[i];
2268 if((level&(~127)) == 0){
2269 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2278 level= temp[i] + 64;
2282 if((level&(~127)) == 0){
2283 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-gradient SAD comparators. vsad_intra{8,16}_c score the vertical
 * activity of a single block; vsad16_c scores the vertical gradient of the
 * difference between two blocks. All start at y=1 and compare each row to
 * the previous one.
 * Defect fixed: the macro body tail, instantiations and vsad16_c skeleton
 * were lost in this chunk. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSAD_INTRA(8)
VSAD_INTRA(16)

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/* Vertical-gradient SSE comparators (squared-difference analogues of the
 * VSAD family). SQ(a) squares its argument; vsse_intra{8,16}_c score one
 * block's vertical activity, vsse16_c the vertical gradient of the
 * difference between two blocks.
 * Defect fixed: the macro body tail, instantiations and vsse16_c skeleton
 * were lost in this chunk. */
#define SQ(a) ((a)*(a))

#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/* Sum of squared differences between an int8 and an int16 vector.
 * Defect fixed: the opening brace, score declaration and return were lost
 * in this chunk. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int score=0;
    int i;

    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);

    return score;
}
2367 #define WRAPPER8_16_SQ(name8, name16)\
2368 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
2370 score +=name8(s, dst , src , stride, 8);\
2371 score +=name8(s, dst+8 , src+8 , stride, 8);\
2375 score +=name8(s, dst , src , stride, 8);\
2376 score +=name8(s, dst+8 , src+8 , stride, 8);\
2381 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2382 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2383 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2385 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2387 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2388 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2389 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2390 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Clip one float, given as its IEEE-754 bit pattern, against a negative
 * min (mini) and positive max (maxi), using only integer compares: for
 * negative floats a larger bit pattern means a more negative value, so
 * a > mini detects underflow; flipping the sign bit of positive patterns
 * lets one compare against maxisign (= maxi ^ sign bit) detect overflow.
 * Only valid on the opposite-sign path (min < 0 < max).
 * Defect fixed: the final else branch and closing brace were lost. */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
/* Clip an array of floats to [*min,*max] for the min < 0 < max case,
 * working purely on the IEEE-754 bit patterns (integer compares, no FPU).
 * Processes 8 elements per iteration, so len must be a multiple of 8.
 * NOTE(review): the pointer casts type-pun float<->uint32_t, which
 * formally violates strict aliasing — kept as-is (long-standing project
 * idiom); a memcpy-based pun would be the conforming alternative. */
2401 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2403 uint32_t mini = *(uint32_t*)min;
2404 uint32_t maxi = *(uint32_t*)max;
2405 uint32_t maxisign = maxi ^ (1U<<31); /* max with the sign bit flipped, pre-computed once */
2406 uint32_t *dsti = (uint32_t*)dst;
2407 const uint32_t *srci = (const uint32_t*)src;
2408 for(i=0; i<len; i+=8) {
2409 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2410 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2411 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2412 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2413 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2414 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2415 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2416 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip each float in src to [min,max] into dst. When the range straddles
 * zero it dispatches to the bit-pattern fast path above; otherwise it
 * falls back to av_clipf in an 8-way unrolled loop (len must be a
 * multiple of 8). NOTE(review): excerpt — the "} else {" between the two
 * branches is not visible in this view. */
2419 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2421 if(min < 0 && max > 0) {
2422 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2424 for(i=0; i < len; i+=8) {
2425 dst[i ] = av_clipf(src[i ], min, max);
2426 dst[i + 1] = av_clipf(src[i + 1], min, max);
2427 dst[i + 2] = av_clipf(src[i + 2], min, max);
2428 dst[i + 3] = av_clipf(src[i + 3], min, max);
2429 dst[i + 4] = av_clipf(src[i + 4], min, max);
2430 dst[i + 5] = av_clipf(src[i + 5], min, max);
2431 dst[i + 6] = av_clipf(src[i + 6], min, max);
2432 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Plain dot product of two int16 vectors over 'order' elements.
 * NOTE(review): excerpt — the accumulator declaration, loop header and
 * return are outside the visible lines. */
2437 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2442 res += *v1++ * *v2++;
/* Fused pass used by audio codecs: accumulates the v1.v2 dot product
 * while simultaneously doing v1[i] += mul * v3[i] in place.
 * NOTE(review): excerpt — only the madd statement of the loop body is
 * visible here; the dot-product accumulation line is missing from view. */
2447 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2452 *v1++ += mul * *v3++;
/* Apply a symmetric int16 window: sample i and its mirror len-1-i share
 * the same coefficient window[i]. Products are Q15 fixed point, rounded
 * with +(1<<14) before the >>15. */
2457 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2458 const int16_t *window, unsigned int len)
2461 int len2 = len >> 1; /* only the first half of the window is stored */
2463 for (i = 0; i < len2; i++) {
2464 int16_t w = window[i];
2465 output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2466 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/* Clamp each int32 in src into [min,max], 8 elements per iteration, so
 * callers are expected to pass len as a multiple of 8.
 * NOTE(review): excerpt — the loop wrapper around these eight statements
 * is not visible in this view. */
2470 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2471 int32_t max, unsigned int len)
2474 *dst++ = av_clip(*src++, min, max);
2475 *dst++ = av_clip(*src++, min, max);
2476 *dst++ = av_clip(*src++, min, max);
2477 *dst++ = av_clip(*src++, min, max);
2478 *dst++ = av_clip(*src++, min, max);
2479 *dst++ = av_clip(*src++, min, max);
2480 *dst++ = av_clip(*src++, min, max);
2481 *dst++ = av_clip(*src++, min, max);
/* Wrappers pairing the jpeg reference inverse DCT (ff_j_rev_dct) with the
 * clamped pixel writers, so they match the DSPContext idct_put/idct_add
 * function-pointer signatures used below in ff_dsputil_init(). */
2486 static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2488 ff_j_rev_dct (block);
2489 put_pixels_clamped_c(block, dest, line_size); /* overwrite destination */
/* Same, but accumulating into the existing destination pixels. */
2491 static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2493 ff_j_rev_dct (block);
2494 add_pixels_clamped_c(block, dest, line_size); /* add to destination */
2497 /* init static data */
2498 av_cold void ff_dsputil_static_init(void)
2502 for(i=0;i<512;i++) {
2503 ff_squareTbl[i] = (i - 256) * (i - 256); /* squareTbl[x+256] == x*x for x in [-256,255] */
/* Inverse zigzag, stored as position+1 (the MMX quantizer needs a non-zero table). */
2506 for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Verify that the compiler actually 16-byte aligns stack variables; the
 * SIMD code paths depend on it. Logs a one-time warning on miscompiled
 * builds. NOTE(review): excerpt — the did_fail latch update and the
 * return statement are outside the visible lines. */
2509 int ff_check_alignment(void){
2510 static int did_fail=0; /* warn only once per process */
2511 LOCAL_ALIGNED_16(int, aligned, [4]);
2513 if((intptr_t)aligned & 15){
2515 #if HAVE_MMX || HAVE_ALTIVEC
2516 av_log(NULL, AV_LOG_ERROR,
2517 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2518 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2519 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2520 "Do not report crashes to Libav developers.\n");
/* Populate a DSPContext with the portable C implementations, then let the
 * per-architecture init functions override entries with SIMD versions.
 * Selection of DCT/IDCT variants follows avctx (bits_per_raw_sample,
 * dct_algo, idct_algo). NOTE(review): this chunk is an excerpt — many
 * closing braces and some #if/#else lines are not visible here. */
2529 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2531 ff_check_alignment();
/* --- forward DCT selection (encoder side) --- */
2534 if (avctx->bits_per_raw_sample == 10) {
2535 c->fdct = ff_jpeg_fdct_islow_10;
2536 c->fdct248 = ff_fdct248_islow_10;
2538 if(avctx->dct_algo==FF_DCT_FASTINT) {
2539 c->fdct = ff_fdct_ifast;
2540 c->fdct248 = ff_fdct_ifast248;
2542 else if(avctx->dct_algo==FF_DCT_FAAN) {
2543 c->fdct = ff_faandct;
2544 c->fdct248 = ff_faandct248;
2547 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2548 c->fdct248 = ff_fdct248_islow_8;
2551 #endif //CONFIG_ENCODERS
/* --- inverse DCT selection --- */
2553 if (avctx->bits_per_raw_sample == 10) {
2554 c->idct_put = ff_simple_idct_put_10;
2555 c->idct_add = ff_simple_idct_add_10;
2556 c->idct = ff_simple_idct_10;
2557 c->idct_permutation_type = FF_NO_IDCT_PERM;
2559 if(avctx->idct_algo==FF_IDCT_INT){
2560 c->idct_put= jref_idct_put;
2561 c->idct_add= jref_idct_add;
2562 c->idct = ff_j_rev_dct;
2563 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2564 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2565 c->idct_put= ff_faanidct_put;
2566 c->idct_add= ff_faanidct_add;
2567 c->idct = ff_faanidct;
2568 c->idct_permutation_type= FF_NO_IDCT_PERM;
2569 }else{ //accurate/default
2570 c->idct_put = ff_simple_idct_put_8;
2571 c->idct_add = ff_simple_idct_add_8;
2572 c->idct = ff_simple_idct_8;
2573 c->idct_permutation_type= FF_NO_IDCT_PERM;
/* --- basic pixel/block helpers --- */
2577 c->diff_pixels = diff_pixels_c;
2578 c->put_pixels_clamped = put_pixels_clamped_c;
2579 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2580 c->add_pixels_clamped = add_pixels_clamped_c;
2581 c->sum_abs_dctelem = sum_abs_dctelem_c;
2584 c->pix_sum = pix_sum_c;
2585 c->pix_norm1 = pix_norm1_c;
2587 c->fill_block_tab[0] = fill_block16_c;
2588 c->fill_block_tab[1] = fill_block8_c;
2590 /* TODO [0] 16 [1] 8 */
/* SAD variants: [0]=16-wide, [1]=8-wide; second index selects h/v/diag half-pel. */
2591 c->pix_abs[0][0] = pix_abs16_c;
2592 c->pix_abs[0][1] = pix_abs16_x2_c;
2593 c->pix_abs[0][2] = pix_abs16_y2_c;
2594 c->pix_abs[0][3] = pix_abs16_xy2_c;
2595 c->pix_abs[1][0] = pix_abs8_c;
2596 c->pix_abs[1][1] = pix_abs8_x2_c;
2597 c->pix_abs[1][2] = pix_abs8_y2_c;
2598 c->pix_abs[1][3] = pix_abs8_xy2_c;
/* Third-pel motion compensation tables (index encodes the 1/3-pel offsets). */
2600 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2601 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2602 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2603 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2604 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2605 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2606 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2607 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2608 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2610 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2611 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2612 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2613 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2614 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2615 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2616 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2617 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2618 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* Fill one 16-entry quarter-pel MC table: index = 4*dy + dx. */
2620 #define dspfunc(PFX, IDX, NUM) \
2621 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2622 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2623 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2624 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2625 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2626 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2627 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2628 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2629 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2630 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2631 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2632 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2633 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2634 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2635 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2636 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2638 dspfunc(put_qpel, 0, 16);
2639 dspfunc(put_no_rnd_qpel, 0, 16);
2641 dspfunc(avg_qpel, 0, 16);
2642 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2644 dspfunc(put_qpel, 1, 8);
2645 dspfunc(put_no_rnd_qpel, 1, 8);
2647 dspfunc(avg_qpel, 1, 8);
2648 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* WMV2-style mspel motion compensation. */
2652 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2653 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2654 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2655 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2656 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2657 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2658 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2659 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* Install the 16-wide and 8x8 variants of a comparison function pair. */
2661 #define SET_CMP_FUNC(name) \
2662 c->name[0]= name ## 16_c;\
2663 c->name[1]= name ## 8x8_c;
2665 SET_CMP_FUNC(hadamard8_diff)
2666 c->hadamard8_diff[4]= hadamard8_intra16_c;
2667 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2668 SET_CMP_FUNC(dct_sad)
2669 SET_CMP_FUNC(dct_max)
2671 SET_CMP_FUNC(dct264_sad)
2673 c->sad[0]= pix_abs16_c;
2674 c->sad[1]= pix_abs8_c;
2678 SET_CMP_FUNC(quant_psnr)
/* Vertical SAD/SSE and noise-shaped SSE comparison functions. */
2681 c->vsad[0]= vsad16_c;
2682 c->vsad[4]= vsad_intra16_c;
2683 c->vsad[5]= vsad_intra8_c;
2684 c->vsse[0]= vsse16_c;
2685 c->vsse[4]= vsse_intra16_c;
2686 c->vsse[5]= vsse_intra8_c;
2687 c->nsse[0]= nsse16_c;
2688 c->nsse[1]= nsse8_c;
2690 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* Byte-wise helpers used by lossless codecs (huffyuv etc.). */
2692 c->add_bytes= add_bytes_c;
2693 c->diff_bytes= diff_bytes_c;
2694 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2695 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2696 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2697 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2698 c->bswap_buf= bswap_buf;
2699 c->bswap16_buf = bswap16_buf;
2701 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2702 c->h263_h_loop_filter= h263_h_loop_filter_c;
2703 c->h263_v_loop_filter= h263_v_loop_filter_c;
2706 c->try_8x8basis= try_8x8basis_c;
2707 c->add_8x8basis= add_8x8basis_c;
/* Generic vector helpers (audio paths). */
2709 c->vector_clipf = vector_clipf_c;
2710 c->scalarproduct_int16 = scalarproduct_int16_c;
2711 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2712 c->apply_window_int16 = apply_window_int16_c;
2713 c->vector_clip_int32 = vector_clip_int32_c;
/* Plane shrinkers: [n] halves each dimension n times (index 0 = copy). */
2715 c->shrink[0]= av_image_copy_plane;
2716 c->shrink[1]= ff_shrink22;
2717 c->shrink[2]= ff_shrink44;
2718 c->shrink[3]= ff_shrink88;
2720 c->add_pixels8 = add_pixels8_c;
/* Paste bit-depth-suffixed names from dsputil_template.c. */
2724 #define FUNC(f, depth) f ## _ ## depth
2725 #define FUNCC(f, depth) f ## _ ## depth ## _c
2727 c->draw_edges = FUNCC(draw_edges, 8);
2728 c->clear_block = FUNCC(clear_block, 8);
2729 c->clear_blocks = FUNCC(clear_blocks, 8);
2731 #define BIT_DEPTH_FUNCS(depth) \
2732 c->get_pixels = FUNCC(get_pixels, depth);
/* Pick depth-specific functions; >8-bit samples use the 16-bit loaders. */
2734 switch (avctx->bits_per_raw_sample) {
2737 BIT_DEPTH_FUNCS(16);
/* Architecture-specific overrides replace C pointers with SIMD versions. */
2745 if (HAVE_MMX) ff_dsputil_init_mmx (c, avctx);
2746 if (ARCH_ARM) ff_dsputil_init_arm (c, avctx);
2747 if (HAVE_VIS) ff_dsputil_init_vis (c, avctx);
2748 if (ARCH_ALPHA) ff_dsputil_init_alpha (c, avctx);
2749 if (ARCH_PPC) ff_dsputil_init_ppc (c, avctx);
2750 if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
2751 if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
/* Build the scantable permutation matching the chosen IDCT's coefficient order. */
2753 ff_init_scantable_permutation(c->idct_permutation,
2754 c->idct_permutation_type);