3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
31 #include "libavutil/internal.h"
33 #include "copy_block.h"
36 #include "simple_idct.h"
39 #include "imgconvert.h"
41 #include "mpegvideo.h"
/* Squares lookup table, indexed elsewhere as ff_squareTbl[256 + x] for
 * x in [-255, 255]; zero here, presumably filled at init time elsewhere
 * in the file — TODO confirm against the init code. */
uint32_t ff_squareTbl[512] = {0, };
48 #include "dsputil_template.c"
52 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
71 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
72 DECLARE_ALIGNED(16, uint16_t, ff_inv_zigzag_direct16)[64];
/* Alternate horizontal coefficient scan order (MPEG-2 style). */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical coefficient scan order (MPEG-2 style). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Row permutation applied per 8-row group by the SSE2 IDCT path. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
110 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
114 st->scantable= src_scantable;
118 j = src_scantable[i];
119 st->permutated[i] = permutation[j];
125 j = st->permutated[i];
127 st->raster_end[i]= end;
131 void ff_init_scantable_permutation(uint8_t *idct_permutation,
132 int idct_permutation_type)
136 switch(idct_permutation_type){
137 case FF_NO_IDCT_PERM:
139 idct_permutation[i]= i;
141 case FF_LIBMPEG2_IDCT_PERM:
143 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
145 case FF_SIMPLE_IDCT_PERM:
147 idct_permutation[i]= simple_mmx_permutation[i];
149 case FF_TRANSPOSE_IDCT_PERM:
151 idct_permutation[i]= ((i&7)<<3) | (i>>3);
153 case FF_PARTTRANS_IDCT_PERM:
155 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
157 case FF_SSE2_IDCT_PERM:
159 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
162 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/**
 * Sum all 256 pixel values of a 16x16 block.
 * @param pix       top-left of the block
 * @param line_size stride in bytes between rows
 * @return sum of the pixels (fits easily in an int: max 16*16*255)
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
188 static int pix_norm1_c(uint8_t * pix, int line_size)
191 uint32_t *sq = ff_squareTbl + 256;
194 for (i = 0; i < 16; i++) {
195 for (j = 0; j < 16; j += 8) {
207 register uint64_t x=*(uint64_t*)pix;
209 s += sq[(x>>8)&0xff];
210 s += sq[(x>>16)&0xff];
211 s += sq[(x>>24)&0xff];
212 s += sq[(x>>32)&0xff];
213 s += sq[(x>>40)&0xff];
214 s += sq[(x>>48)&0xff];
215 s += sq[(x>>56)&0xff];
217 register uint32_t x=*(uint32_t*)pix;
219 s += sq[(x>>8)&0xff];
220 s += sq[(x>>16)&0xff];
221 s += sq[(x>>24)&0xff];
222 x=*(uint32_t*)(pix+4);
224 s += sq[(x>>8)&0xff];
225 s += sq[(x>>16)&0xff];
226 s += sq[(x>>24)&0xff];
231 pix += line_size - 16;
/**
 * Byte-swap a buffer of 32-bit words, 8 at a time (manually unrolled),
 * then handle the remaining tail words one by one.
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w)
{
    int i;

    for (i = 0; i + 8 <= w; i += 8) {
        dst[i + 0] = av_bswap32(src[i + 0]);
        dst[i + 1] = av_bswap32(src[i + 1]);
        dst[i + 2] = av_bswap32(src[i + 2]);
        dst[i + 3] = av_bswap32(src[i + 3]);
        dst[i + 4] = av_bswap32(src[i + 4]);
        dst[i + 5] = av_bswap32(src[i + 5]);
        dst[i + 6] = av_bswap32(src[i + 6]);
        dst[i + 7] = av_bswap32(src[i + 7]);
    }
    for (; i < w; i++)
        dst[i + 0] = av_bswap32(src[i + 0]);
}
/* Byte-swap a buffer of 16-bit words. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
260 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
263 uint32_t *sq = ff_squareTbl + 256;
266 for (i = 0; i < h; i++) {
267 s += sq[pix1[0] - pix2[0]];
268 s += sq[pix1[1] - pix2[1]];
269 s += sq[pix1[2] - pix2[2]];
270 s += sq[pix1[3] - pix2[3]];
277 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
280 uint32_t *sq = ff_squareTbl + 256;
283 for (i = 0; i < h; i++) {
284 s += sq[pix1[0] - pix2[0]];
285 s += sq[pix1[1] - pix2[1]];
286 s += sq[pix1[2] - pix2[2]];
287 s += sq[pix1[3] - pix2[3]];
288 s += sq[pix1[4] - pix2[4]];
289 s += sq[pix1[5] - pix2[5]];
290 s += sq[pix1[6] - pix2[6]];
291 s += sq[pix1[7] - pix2[7]];
298 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
301 uint32_t *sq = ff_squareTbl + 256;
304 for (i = 0; i < h; i++) {
305 s += sq[pix1[ 0] - pix2[ 0]];
306 s += sq[pix1[ 1] - pix2[ 1]];
307 s += sq[pix1[ 2] - pix2[ 2]];
308 s += sq[pix1[ 3] - pix2[ 3]];
309 s += sq[pix1[ 4] - pix2[ 4]];
310 s += sq[pix1[ 5] - pix2[ 5]];
311 s += sq[pix1[ 6] - pix2[ 6]];
312 s += sq[pix1[ 7] - pix2[ 7]];
313 s += sq[pix1[ 8] - pix2[ 8]];
314 s += sq[pix1[ 9] - pix2[ 9]];
315 s += sq[pix1[10] - pix2[10]];
316 s += sq[pix1[11] - pix2[11]];
317 s += sq[pix1[12] - pix2[12]];
318 s += sq[pix1[13] - pix2[13]];
319 s += sq[pix1[14] - pix2[14]];
320 s += sq[pix1[15] - pix2[15]];
328 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
329 const uint8_t *s2, int stride){
332 /* read the pixels */
334 block[0] = s1[0] - s2[0];
335 block[1] = s1[1] - s2[1];
336 block[2] = s1[2] - s2[2];
337 block[3] = s1[3] - s2[3];
338 block[4] = s1[4] - s2[4];
339 block[5] = s1[5] - s2[5];
340 block[6] = s1[6] - s2[6];
341 block[7] = s1[7] - s2[7];
348 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
353 /* read the pixels */
355 pixels[0] = av_clip_uint8(block[0]);
356 pixels[1] = av_clip_uint8(block[1]);
357 pixels[2] = av_clip_uint8(block[2]);
358 pixels[3] = av_clip_uint8(block[3]);
359 pixels[4] = av_clip_uint8(block[4]);
360 pixels[5] = av_clip_uint8(block[5]);
361 pixels[6] = av_clip_uint8(block[6]);
362 pixels[7] = av_clip_uint8(block[7]);
369 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
374 /* read the pixels */
376 pixels[0] = av_clip_uint8(block[0]);
377 pixels[1] = av_clip_uint8(block[1]);
378 pixels[2] = av_clip_uint8(block[2]);
379 pixels[3] = av_clip_uint8(block[3]);
386 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
391 /* read the pixels */
393 pixels[0] = av_clip_uint8(block[0]);
394 pixels[1] = av_clip_uint8(block[1]);
401 static void put_signed_pixels_clamped_c(const int16_t *block,
402 uint8_t *av_restrict pixels,
407 for (i = 0; i < 8; i++) {
408 for (j = 0; j < 8; j++) {
411 else if (*block > 127)
414 *pixels = (uint8_t)(*block + 128);
418 pixels += (line_size - 8);
422 static void add_pixels8_c(uint8_t *av_restrict pixels,
429 pixels[0] += block[0];
430 pixels[1] += block[1];
431 pixels[2] += block[2];
432 pixels[3] += block[3];
433 pixels[4] += block[4];
434 pixels[5] += block[5];
435 pixels[6] += block[6];
436 pixels[7] += block[7];
442 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
447 /* read the pixels */
449 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
450 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
451 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
452 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
453 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
454 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
455 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
456 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
462 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
467 /* read the pixels */
469 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
470 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
471 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
472 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
478 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
483 /* read the pixels */
485 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
486 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
/* Sum of absolute values of all 64 coefficients of a DCT block. */
static int sum_abs_dctelem_c(int16_t *block)
{
    int sum = 0, i;

    for (i = 0; i < 64; i++)
        sum += FFABS(block[i]);
    return sum;
}
/* Fill a 16-wide block of height h with a constant byte value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/* Fill an 8-wide block of height h with a constant byte value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* Rounded averages of 2 and 4 values, used by the MC helpers below. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/**
 * GMC (global motion compensation) with one motion vector: bilinear
 * interpolation of an 8-wide block with 1/16-pel fractional offsets
 * (x16, y16 in [0, 15]); weights A..D sum to 256, hence the >>8.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int i;

    for (i = 0; i < h; i++) {
        dst[0] = (A * src[0] + B * src[1] + C * src[stride + 0] + D * src[stride + 1] + rounder) >> 8;
        dst[1] = (A * src[1] + B * src[2] + C * src[stride + 1] + D * src[stride + 2] + rounder) >> 8;
        dst[2] = (A * src[2] + B * src[3] + C * src[stride + 2] + D * src[stride + 3] + rounder) >> 8;
        dst[3] = (A * src[3] + B * src[4] + C * src[stride + 3] + D * src[stride + 4] + rounder) >> 8;
        dst[4] = (A * src[4] + B * src[5] + C * src[stride + 4] + D * src[stride + 5] + rounder) >> 8;
        dst[5] = (A * src[5] + B * src[6] + C * src[stride + 5] + D * src[stride + 6] + rounder) >> 8;
        dst[6] = (A * src[6] + B * src[7] + C * src[stride + 6] + D * src[stride + 7] + rounder) >> 8;
        dst[7] = (A * src[7] + B * src[8] + C * src[stride + 7] + D * src[stride + 8] + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
/**
 * General GMC: per-pixel affine motion with s = 1<<shift sub-pel accuracy.
 * (ox, oy) is the 16.16-style accumulated position of the first pixel;
 * (dxx, dyx) step per output column, (dxy, dyy) per output row; r is the
 * rounder added before the final >> (shift*2).
 *
 * NOTE(review): the boundary handling distinguishes fully-inside pixels
 * (bilinear), one-axis-outside (linear along the in-range axis with the
 * out-of-range coordinate clamped), and fully-outside (nearest clamped
 * sample) — reconstructed from the visible branches; confirm against the
 * reference implementation.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s = 1 << shift;

    /* switch to inclusive maxima for the (unsigned) range checks below */
    width--;
    height--;

    for (y = 0; y < h; y++) {
        int x;

        vx = ox;
        vy = oy;
        for (x = 0; x < 8; x++) { //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x  = vx >> 16;
            src_y  = vy >> 16;
            frac_x = src_x & (s - 1);
            frac_y = src_y & (s - 1);
            src_x >>= shift;
            src_y >>= shift;

            /* (unsigned) compare rejects negative coordinates too */
            if ((unsigned)src_x < width) {
                if ((unsigned)src_y < height) {
                    index = src_x + src_y * stride;
                    dst[y * stride + x] = ((src[index             ] * (s - frac_x)
                                          + src[index           + 1] *      frac_x) * (s - frac_y)
                                         + (src[index + stride    ] * (s - frac_x)
                                          + src[index + stride + 1] *      frac_x) *      frac_y
                                         + r) >> (shift * 2);
                } else {
                    index = src_x + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = ((src[index    ] * (s - frac_x)
                                          + src[index + 1] *      frac_x) * s
                                         + r) >> (shift * 2);
                }
            } else {
                if ((unsigned)src_y < height) {
                    index = av_clip(src_x, 0, width) + src_y * stride;
                    dst[y * stride + x] = ((src[index         ] * (s - frac_y)
                                          + src[index + stride] *      frac_y) * s
                                         + r) >> (shift * 2);
                } else {
                    index = av_clip(src_x, 0, width) + av_clip(src_y, 0, height) * stride;
                    dst[y * stride + x] = src[index];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* Third-pel MC, no fractional offset: plain copy, dispatched on width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    switch (width) {
    case 2:  put_pixels2_8_c (dst, src, stride, height); break;
    case 4:  put_pixels4_8_c (dst, src, stride, height); break;
    case 8:  put_pixels8_8_c (dst, src, stride, height); break;
    case 16: put_pixels16_8_c(dst, src, stride, height); break;
    }
}
/* Third-pel MC, 1/3-pel horizontal offset (weights 2:1, 683/2048 ~ 1/3). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683 * (2 * src[j] + src[j + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, 2/3-pel horizontal offset (weights 1:2). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683 * (src[j] + 2 * src[j + 1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, 1/3-pel vertical offset (weights 2:1). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683 * (2 * src[j] + src[j + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (1/3, 1/3) offset: bilinear, weights 4:3:3:2 (2731/32768 ~ 1/12). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (2731 * (4 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 2 * src[j + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (1/3, 2/3) offset: bilinear, weights 3:2:4:3. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (2731 * (3 * src[j] + 2 * src[j + 1] + 4 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, 2/3-pel vertical offset (weights 1:2). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683 * (src[j] + 2 * src[j + stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (2/3, 1/3) offset: bilinear, weights 3:4:2:3. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (2731 * (3 * src[j] + 4 * src[j + 1] + 2 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (2/3, 2/3) offset: bilinear, weights 2:3:3:4. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (2731 * (2 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 4 * src[j + stride + 1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC averaging, no fractional offset: dispatch on width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    switch (width) {
    case 2:  avg_pixels2_8_c (dst, src, stride, height); break;
    case 4:  avg_pixels4_8_c (dst, src, stride, height); break;
    case 8:  avg_pixels8_8_c (dst, src, stride, height); break;
    case 16: avg_pixels16_8_c(dst, src, stride, height); break;
    }
}
/* Averaging variant of put_tpel_pixels_mc10_c: rounded average with dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] + ((683 * (2 * src[j] + src[j + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc20_c. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] + ((683 * (src[j] + 2 * src[j + 1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc01_c. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] + ((683 * (2 * src[j] + src[j + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc11_c. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] + ((2731 * (4 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 2 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc12_c. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] + ((2731 * (3 * src[j] + 2 * src[j + 1] + 4 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc02_c. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] + ((683 * (src[j] + 2 * src[j + stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc21_c. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] + ((2731 * (3 * src[j] + 4 * src[j + 1] + 2 * src[j + stride] + 3 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of put_tpel_pixels_mc22_c. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height)
{
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (dst[j] + ((2731 * (2 * src[j] + 3 * src[j + 1] + 3 * src[j + stride] + 4 * src[j + stride + 1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
798 #define QPEL_MC(r, OPNAME, RND, OP) \
799 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
800 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
804 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
805 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
806 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
807 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
808 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
809 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
810 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
811 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
817 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
819 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
823 const int src0= src[0*srcStride];\
824 const int src1= src[1*srcStride];\
825 const int src2= src[2*srcStride];\
826 const int src3= src[3*srcStride];\
827 const int src4= src[4*srcStride];\
828 const int src5= src[5*srcStride];\
829 const int src6= src[6*srcStride];\
830 const int src7= src[7*srcStride];\
831 const int src8= src[8*srcStride];\
832 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
833 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
834 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
835 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
836 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
837 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
838 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
839 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
845 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
846 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
851 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
852 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
853 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
854 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
855 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
856 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
857 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
858 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
859 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
860 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
861 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
862 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
863 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
864 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
865 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
866 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
872 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
873 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
878 const int src0= src[0*srcStride];\
879 const int src1= src[1*srcStride];\
880 const int src2= src[2*srcStride];\
881 const int src3= src[3*srcStride];\
882 const int src4= src[4*srcStride];\
883 const int src5= src[5*srcStride];\
884 const int src6= src[6*srcStride];\
885 const int src7= src[7*srcStride];\
886 const int src8= src[8*srcStride];\
887 const int src9= src[9*srcStride];\
888 const int src10= src[10*srcStride];\
889 const int src11= src[11*srcStride];\
890 const int src12= src[12*srcStride];\
891 const int src13= src[13*srcStride];\
892 const int src14= src[14*srcStride];\
893 const int src15= src[15*srcStride];\
894 const int src16= src[16*srcStride];\
895 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
896 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
897 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
898 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
899 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
900 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
901 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
902 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
903 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
904 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
905 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
906 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
907 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
908 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
909 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
910 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
916 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
919 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
920 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
923 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
925 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
928 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
931 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
932 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
935 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
939 copy_block9(full, src, 16, stride, 9);\
940 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
941 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
944 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
947 copy_block9(full, src, 16, stride, 9);\
948 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
951 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
955 copy_block9(full, src, 16, stride, 9);\
956 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
957 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
959 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
965 copy_block9(full, src, 16, stride, 9);\
966 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
967 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
968 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
969 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
971 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
976 copy_block9(full, src, 16, stride, 9);\
977 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
978 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
979 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
980 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
982 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
988 copy_block9(full, src, 16, stride, 9);\
989 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
990 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
991 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
992 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
994 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
999 copy_block9(full, src, 16, stride, 9);\
1000 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1001 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1002 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1003 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1005 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1007 uint8_t full[16*9];\
1010 uint8_t halfHV[64];\
1011 copy_block9(full, src, 16, stride, 9);\
1012 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1013 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1014 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1015 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1017 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1019 uint8_t full[16*9];\
1021 uint8_t halfHV[64];\
1022 copy_block9(full, src, 16, stride, 9);\
1023 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1024 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1025 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1026 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1028 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1030 uint8_t full[16*9];\
1033 uint8_t halfHV[64];\
1034 copy_block9(full, src, 16, stride, 9);\
1035 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1036 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1037 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1038 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1040 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1042 uint8_t full[16*9];\
1044 uint8_t halfHV[64];\
1045 copy_block9(full, src, 16, stride, 9);\
1046 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1047 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1048 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1049 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1051 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1054 uint8_t halfHV[64];\
1055 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1056 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1057 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1059 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1062 uint8_t halfHV[64];\
1063 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1064 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1065 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1067 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1069 uint8_t full[16*9];\
1072 uint8_t halfHV[64];\
1073 copy_block9(full, src, 16, stride, 9);\
1074 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1075 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1076 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1077 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1079 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1081 uint8_t full[16*9];\
1083 copy_block9(full, src, 16, stride, 9);\
1084 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1085 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1086 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1088 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1090 uint8_t full[16*9];\
1093 uint8_t halfHV[64];\
1094 copy_block9(full, src, 16, stride, 9);\
1095 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1096 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
/* NOTE(review): numbered listing excerpt of the QPEL_MC macro body from
 * FFmpeg dsputil.c; blank/brace/declaration lines are missing from this
 * capture, so the code is annotated only and left byte-identical. */\
1097 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1098 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
/* qpel8_mc32: h-lowpass averaged with the right-shifted source column,
 * then vertical lowpass straight into dst. */\
1100 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1102 uint8_t full[16*9];\
1104 copy_block9(full, src, 16, stride, 9);\
1105 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1106 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1107 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* qpel8_mc22: half-pel centre — h-lowpass then v-lowpass, no averaging. */\
1109 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1112 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1113 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* qpel16_mc10: quarter-pel x: average source with the h-lowpass half. */\
1116 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1119 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1120 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
/* qpel16_mc20: pure horizontal half-pel lowpass into dst. */\
1123 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1125 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
/* qpel16_mc30: three-quarter-pel x: average src+1 with the half. */\
1128 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1131 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1132 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
/* qpel16_mc01: quarter-pel y: average source with the v-lowpass half.
 * The 24x17 "full" buffer stages the block plus filter margin rows. */\
1135 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1137 uint8_t full[24*17];\
1139 copy_block17(full, src, 24, stride, 17);\
1140 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1141 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
/* qpel16_mc02: pure vertical half-pel lowpass. */\
1144 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1146 uint8_t full[24*17];\
1147 copy_block17(full, src, 24, stride, 17);\
1148 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
/* qpel16_mc03: three-quarter-pel y: average the row-shifted source
 * (full+24 = next row) with the v-lowpass half. */\
1151 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1153 uint8_t full[24*17];\
1155 copy_block17(full, src, 24, stride, 17);\
1156 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1157 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
/* Diagonal quarter-pel cases. The "_old" variants (exported with an ff_
 * prefix) build all three half-pel planes and take a 4-way average; the
 * current variants fold the source average into halfH first and then
 * need only a 2-way average — fewer passes, same interface. Brace and
 * declaration lines are missing from this listing capture. */\
1159 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1161 uint8_t full[24*17];\
1162 uint8_t halfH[272];\
1163 uint8_t halfV[256];\
1164 uint8_t halfHV[256];\
1165 copy_block17(full, src, 24, stride, 17);\
1166 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1167 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1168 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1169 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
/* mc11 (top-left quarter): halfH pre-averaged with full, then v-pass. */\
1171 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1173 uint8_t full[24*17];\
1174 uint8_t halfH[272];\
1175 uint8_t halfHV[256];\
1176 copy_block17(full, src, 24, stride, 17);\
1177 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1178 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1179 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1180 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1182 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1184 uint8_t full[24*17];\
1185 uint8_t halfH[272];\
1186 uint8_t halfV[256];\
1187 uint8_t halfHV[256];\
1188 copy_block17(full, src, 24, stride, 17);\
1189 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1190 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1191 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1192 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
/* mc31 (top-right): like mc11 but averages halfH with full+1 (x shift). */\
1194 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1196 uint8_t full[24*17];\
1197 uint8_t halfH[272];\
1198 uint8_t halfHV[256];\
1199 copy_block17(full, src, 24, stride, 17);\
1200 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1201 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1202 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1203 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1205 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1207 uint8_t full[24*17];\
1208 uint8_t halfH[272];\
1209 uint8_t halfV[256];\
1210 uint8_t halfHV[256];\
1211 copy_block17(full, src, 24, stride, 17);\
1212 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1213 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1214 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1215 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
/* mc13 (bottom-left): final average uses halfH+16 (one row down). */\
1217 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1219 uint8_t full[24*17];\
1220 uint8_t halfH[272];\
1221 uint8_t halfHV[256];\
1222 copy_block17(full, src, 24, stride, 17);\
1223 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1224 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1225 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1226 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1228 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1230 uint8_t full[24*17];\
1231 uint8_t halfH[272];\
1232 uint8_t halfV[256];\
1233 uint8_t halfHV[256];\
1234 copy_block17(full, src, 24, stride, 17);\
1235 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1236 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1237 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1238 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
/* mc33 (bottom-right): halfH averaged with full+1, result read at +16. */\
1240 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1242 uint8_t full[24*17];\
1243 uint8_t halfH[272];\
1244 uint8_t halfHV[256];\
1245 copy_block17(full, src, 24, stride, 17);\
1246 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1247 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1248 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1249 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* Mixed half/quarter-pel cases for the 16x16 block. mc21/mc23 need no
 * staging buffer (x is exactly half-pel); mc12/mc32 average halfH with
 * the (possibly shifted) source before the vertical pass; mc22 is the
 * plain half-pel centre. Brace/declaration lines missing from capture. */\
1251 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1253 uint8_t halfH[272];\
1254 uint8_t halfHV[256];\
1255 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1256 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1257 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
/* mc23: same planes as mc21, averaged one row down (halfH+16). */\
1259 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1261 uint8_t halfH[272];\
1262 uint8_t halfHV[256];\
1263 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1264 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1265 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1267 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1269 uint8_t full[24*17];\
1270 uint8_t halfH[272];\
1271 uint8_t halfV[256];\
1272 uint8_t halfHV[256];\
1273 copy_block17(full, src, 24, stride, 17);\
1274 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1275 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1276 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1277 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
/* mc12: fold the source average into halfH, then one vertical pass. */\
1279 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1281 uint8_t full[24*17];\
1282 uint8_t halfH[272];\
1283 copy_block17(full, src, 24, stride, 17);\
1284 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1285 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1286 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1288 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1290 uint8_t full[24*17];\
1291 uint8_t halfH[272];\
1292 uint8_t halfV[256];\
1293 uint8_t halfHV[256];\
1294 copy_block17(full, src, 24, stride, 17);\
1295 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1296 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1297 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1298 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
/* mc32: like mc12 but halfH is averaged with the x-shifted full+1. */\
1300 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1302 uint8_t full[24*17];\
1303 uint8_t halfH[272];\
1304 copy_block17(full, src, 24, stride, 17);\
1305 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1306 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1307 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* mc22: half-pel centre — h-lowpass then v-lowpass, no averaging. */\
1309 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1311 uint8_t halfH[272];\
1312 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1313 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Output ops plugged into QPEL_MC. (b+16)>>5 normalises the 5-bit-scaled
 * filter sum through the clip table cm; the *_no_rnd variants add only
 * 15 so ties round down. op_avg additionally averages with dst. */
1316 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1317 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1318 #define op_put(a, b) a = cm[((b) + 16)>>5]
1319 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the three qpel function families: put, put_no_rnd, avg. */
1321 QPEL_MC(0, put_ , _ , op_put)
1322 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1323 QPEL_MC(0, avg_ , _ , op_avg)
1324 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1326 #undef op_avg_no_rnd
1328 #undef op_put_no_rnd
/* Fixed-size wrappers binding the h-parameterised copy/average
 * primitives to 8x8 and 16x16; brace lines missing from this capture. */
1330 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1332 put_pixels8_8_c(dst, src, stride, 8);
1334 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1336 avg_pixels8_8_c(dst, src, stride, 8);
1338 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1340 put_pixels16_8_c(dst, src, stride, 16);
1342 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1344 avg_pixels16_8_c(dst, src, stride, 16);
/* mc00 (integer-pel) needs no filtering: alias to the plain copies. */
1347 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1348 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1349 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1350 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1351 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1352 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/* WMV2 horizontal half-pel filter: 4-tap (-1, 9, 9, -1)/16 with +8
 * rounding, clipped via the crop table. The h-row loop and the
 * dst/src pointer advance lines are missing from this capture. */
1354 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1355 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1359 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1360 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1361 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1362 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1363 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1364 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1365 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1366 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* RV40 (3,3) quarter-pel cases: these resolve to the plain 2x2
 * pixel average (xy2), not the RV40 filter bank. */
1372 #if CONFIG_RV40_DECODER
1373 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1375 put_pixels16_xy2_8_c(dst, src, stride, 16);
1377 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1379 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1381 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1383 put_pixels8_xy2_8_c(dst, src, stride, 8);
1385 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1387 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1389 #endif /* CONFIG_RV40_DECODER */
/* Dirac MC wrappers: src[] carries up to four reference planes; the
 * plain/_l2/_l4 variants average 1, 2 or 4 of them. 32-wide blocks are
 * done as two 16-wide calls. The macro's closing lines (instantiations
 * and #endif) fall outside this listing capture. */
1391 #if CONFIG_DIRAC_DECODER
1392 #define DIRAC_MC(OPNAME)\
1393 void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1395 OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
1397 void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1399 OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
1401 void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1403 OPNAME ## _pixels16_8_c(dst , src[0] , stride, h);\
1404 OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
1406 void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1408 OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1410 void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1412 OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1414 void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1416 OPNAME ## _pixels16_l2_8(dst , src[0] , src[1] , stride, stride, stride, h);\
1417 OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
1419 void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1421 OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1423 void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1425 OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1427 void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1429 OPNAME ## _pixels16_l4_8(dst , src[0] , src[1] , src[2] , src[3] , stride, stride, stride, stride, stride, h);\
1430 OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
/* WMV2 vertical half-pel filter: same 4-tap (-1, 9, 9, -1)/16 kernel as
 * the horizontal variant, applied down a column; w iterates columns.
 * The column loop header and pointer-advance lines are missing here. */
1436 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1437 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1441 const int src_1= src[ -srcStride];
1442 const int src0 = src[0 ];
1443 const int src1 = src[ srcStride];
1444 const int src2 = src[2*srcStride];
1445 const int src3 = src[3*srcStride];
1446 const int src4 = src[4*srcStride];
1447 const int src5 = src[5*srcStride];
1448 const int src6 = src[6*srcStride];
1449 const int src7 = src[7*srcStride];
1450 const int src8 = src[8*srcStride];
1451 const int src9 = src[9*srcStride];
1452 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1453 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1454 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1455 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1456 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1457 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1458 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1459 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 8x8 half-pel MC entry points built on the mspel lowpass filters.
 * mcXY naming: X = horizontal, Y = vertical sub-position. Local buffer
 * declarations and brace lines are missing from this listing capture. */
1465 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1468 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1469 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1472 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1474 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1477 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1480 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1481 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1484 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1486 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* mc12/mc32: h-filter 11 rows starting one row up, then v-filter both
 * the source column and the h-filtered plane, and average. */
1489 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1494 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1495 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1496 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1497 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1499 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1504 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1505 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1506 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1507 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1509 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1512 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1513 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 in-loop deblocking across a horizontal block edge. d1 maps the
 * raw edge delta d through the standard piecewise-linear ramp capped by
 * the per-qscale strength; (p&256) followed by ~(p>>31) clamps values in
 * [-256, 511] back to 0..255 without a table. The x loop header and
 * several assignment lines are missing from this listing capture. */
1516 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1517 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1519 const int strength= ff_h263_loop_filter_strength[qscale];
1523 int p0= src[x-2*stride];
1524 int p1= src[x-1*stride];
1525 int p2= src[x+0*stride];
1526 int p3= src[x+1*stride];
1527 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1529 if (d<-2*strength) d1= 0;
1530 else if(d<- strength) d1=-2*strength - d;
1531 else if(d< strength) d1= d;
1532 else if(d< 2*strength) d1= 2*strength - d;
1537 if(p1&256) p1= ~(p1>>31);
1538 if(p2&256) p2= ~(p2>>31);
1540 src[x-1*stride] = p1;
1541 src[x+0*stride] = p2;
1545 d2= av_clip((p0-p3)/4, -ad1, ad1);
1547 src[x-2*stride] = p0 - d2;
1548 src[x+ stride] = p3 + d2;
/* H.263 deblocking across a vertical block edge: identical math to the
 * vertical-edge filter but indexing along rows (y*stride +/- column).
 * The y loop header and several assignment lines are missing here. */
1553 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1554 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1556 const int strength= ff_h263_loop_filter_strength[qscale];
1560 int p0= src[y*stride-2];
1561 int p1= src[y*stride-1];
1562 int p2= src[y*stride+0];
1563 int p3= src[y*stride+1];
1564 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1566 if (d<-2*strength) d1= 0;
1567 else if(d<- strength) d1=-2*strength - d;
1568 else if(d< strength) d1= d;
1569 else if(d< 2*strength) d1= 2*strength - d;
1574 if(p1&256) p1= ~(p1>>31);
1575 if(p2&256) p2= ~(p2>>31);
1577 src[y*stride-1] = p1;
1578 src[y*stride+0] = p2;
1582 d2= av_clip((p0-p3)/4, -ad1, ad1);
1584 src[y*stride-2] = p0 - d2;
1585 src[y*stride+1] = p3 + d2;
/* SAD (sum of absolute differences) over a 16-wide, h-tall block.
 * Variants compare pix1 against a half-pel interpolated reference:
 * _x2 horizontal (avg2 of adjacent columns), _y2 vertical (avg2 of
 * adjacent rows via pix3), _xy2 both (avg4 of the 2x2 neighbourhood).
 * The per-row loop headers, pointer advances and return statements are
 * missing from this listing capture. */
1590 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1596 s += abs(pix1[0] - pix2[0]);
1597 s += abs(pix1[1] - pix2[1]);
1598 s += abs(pix1[2] - pix2[2]);
1599 s += abs(pix1[3] - pix2[3]);
1600 s += abs(pix1[4] - pix2[4]);
1601 s += abs(pix1[5] - pix2[5]);
1602 s += abs(pix1[6] - pix2[6]);
1603 s += abs(pix1[7] - pix2[7]);
1604 s += abs(pix1[8] - pix2[8]);
1605 s += abs(pix1[9] - pix2[9]);
1606 s += abs(pix1[10] - pix2[10]);
1607 s += abs(pix1[11] - pix2[11]);
1608 s += abs(pix1[12] - pix2[12]);
1609 s += abs(pix1[13] - pix2[13]);
1610 s += abs(pix1[14] - pix2[14]);
1611 s += abs(pix1[15] - pix2[15]);
/* Horizontal half-pel reference: avg2(pix2[i], pix2[i+1]). */
1618 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1624 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1625 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1626 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1627 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1628 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1629 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1630 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1631 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1632 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1633 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1634 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1635 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1636 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1637 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1638 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1639 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* Vertical half-pel reference: pix3 is the next row of pix2. */
1646 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1649 uint8_t *pix3 = pix2 + line_size;
1653 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1654 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1655 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1656 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1657 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1658 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1659 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1660 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1661 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1662 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1663 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1664 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1665 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1666 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1667 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1668 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* Diagonal half-pel reference: 4-sample average of the 2x2 patch. */
1676 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1679 uint8_t *pix3 = pix2 + line_size;
1683 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1684 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1685 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1686 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1687 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1688 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1689 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1690 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1691 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1692 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1693 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1694 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1695 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1696 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1697 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1698 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide counterparts of the pix_abs16 SAD family: plain, horizontal
 * half-pel (_x2), vertical half-pel (_y2) and diagonal (_xy2). Loop
 * headers, pointer advances and returns are missing from this capture. */
1706 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1712 s += abs(pix1[0] - pix2[0]);
1713 s += abs(pix1[1] - pix2[1]);
1714 s += abs(pix1[2] - pix2[2]);
1715 s += abs(pix1[3] - pix2[3]);
1716 s += abs(pix1[4] - pix2[4]);
1717 s += abs(pix1[5] - pix2[5]);
1718 s += abs(pix1[6] - pix2[6]);
1719 s += abs(pix1[7] - pix2[7]);
1726 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1732 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1733 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1734 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1735 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1736 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1737 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1738 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1739 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1746 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1749 uint8_t *pix3 = pix2 + line_size;
1753 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1754 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1755 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1756 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1757 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1758 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1759 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1760 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1768 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1771 uint8_t *pix3 = pix2 + line_size;
1775 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1776 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1777 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1778 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1779 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1780 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1781 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1782 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE metric: score1 is the plain SSE, score2 the
 * difference of local 2x2 gradient energies between the two blocks.
 * The gradient term is weighted by avctx->nsse_weight, or by 8 when
 * called without a context. Loop headers/returns missing from capture. */
1790 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1791 MpegEncContext *c = v;
1797 for(x=0; x<16; x++){
1798 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1801 for(x=0; x<15; x++){
1802 score2+= FFABS( s1[x ] - s1[x +stride]
1803 - s1[x+1] + s1[x+1+stride])
1804 -FFABS( s2[x ] - s2[x +stride]
1805 - s2[x+1] + s2[x+1+stride]);
1812 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1813 else return score1 + FFABS(score2)*8;
/* 8-wide variant; the inner x loop headers are missing from capture. */
1816 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1817 MpegEncContext *c = v;
1824 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1828 score2+= FFABS( s1[x ] - s1[x +stride]
1829 - s1[x+1] + s1[x+1+stride])
1830 -FFABS( s2[x ] - s2[x +stride]
1831 - s2[x+1] + s2[x+1+stride]);
1838 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1839 else return score1 + FFABS(score2)*8;
/* try_8x8basis: cost of adding a scaled basis vector to the residual —
 * b is the candidate coefficient (rounded shift from BASIS to RECON
 * scale), accumulated as weighted squares (>>4 keeps the sum in range). */
1842 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1846 for(i=0; i<8*8; i++){
1847 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1850 av_assert2(-512<b && b<512);
1852 sum += (w*b)*(w*b)>>4;
/* add_8x8basis: actually fold the scaled basis into the residual. */
1857 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1860 for(i=0; i<8*8; i++){
1861 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
/* zero_cmp: dummy comparator for cmp slots; body not visible in capture. */
1865 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fill the six cmp[] slots with the compare functions for the requested
 * metric type (hadamard, dct_sad, dct264_sad, dct_max, quant_psnr, ...);
 * unrecognised types reach the error log. Most of the switch/case lines
 * are missing from this listing capture. */
1869 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1872 memset(cmp, 0, sizeof(void*)*6);
1880 cmp[i]= c->hadamard8_diff[i];
1886 cmp[i]= c->dct_sad[i];
1889 cmp[i]= c->dct264_sad[i];
1892 cmp[i]= c->dct_max[i];
1895 cmp[i]= c->quant_psnr[i];
1924 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* add_bytes: SWAR byte-wise addition one native long at a time — the
 * pb_7f/pb_80 masks (defined near the top of the file) keep each byte's
 * carry out of its top bit; a scalar loop handles the tail bytes. */
1929 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1931 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1932 long a = *(long*)(src+i);
1933 long b = *(long*)(dst+i);
1934 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1937 dst[i+0] += src[i+0];
/* diff_bytes: SWAR byte-wise subtraction; falls back to plain byte code
 * when src2 is misaligned and unaligned long loads are slow. */
1940 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
1942 #if !HAVE_FAST_UNALIGNED
1943 if((long)src2 & (sizeof(long)-1)){
1944 for(i=0; i+7<w; i+=8){
1945 dst[i+0] = src1[i+0]-src2[i+0];
1946 dst[i+1] = src1[i+1]-src2[i+1];
1947 dst[i+2] = src1[i+2]-src2[i+2];
1948 dst[i+3] = src1[i+3]-src2[i+3];
1949 dst[i+4] = src1[i+4]-src2[i+4];
1950 dst[i+5] = src1[i+5]-src2[i+5];
1951 dst[i+6] = src1[i+6]-src2[i+6];
1952 dst[i+7] = src1[i+7]-src2[i+7];
1956 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1957 long a = *(long*)(src1+i);
1958 long b = *(long*)(src2+i);
1959 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1962 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median predictor: each byte is predicted from mid_pred(left,
 * top, left+top-topleft) mod 256, and the residual added/subtracted.
 * Loop headers and the left/left_top bookkeeping are missing here. */
1965 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1973 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1982 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1990 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Left-prediction accumulators; bodies mostly missing from capture. */
2000 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
2003 for(i=0; i<w-1; i++){
2030 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Butterfly helpers for the 8x8 Hadamard transform; BUTTERFLYA folds
 * the last stage directly into a sum of absolute values. The macro
 * bodies (lines 2061-2072) are missing from this listing capture. */
2060 #define BUTTERFLY2(o1,o2,i1,i2) \
2064 #define BUTTERFLY1(x,y) \
2073 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of (src - dst): row-wise then column-wise 8-point Hadamard
 * transform of the pixel difference, summing |coefficients|. Loop
 * headers, temp declaration and return are missing from this capture. */
2075 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2083 //FIXME try pointer walks
2084 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2085 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2086 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2087 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2089 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2090 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2091 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2092 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2094 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2095 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2096 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2097 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2101 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2102 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2103 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2104 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2106 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2107 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2108 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2109 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2112 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2113 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2114 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2115 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: same 8x8 Hadamard pipeline applied to raw pixels instead
 * of a difference; the final subtraction removes the DC (mean) term so
 * the score measures texture only. Loop headers and return missing. */
2120 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2128 //FIXME try pointer walks
2129 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2130 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2131 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2132 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2134 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2135 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2136 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2137 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2139 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2140 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2141 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2142 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2146 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2147 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2148 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2149 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2151 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2152 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2153 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2154 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2157 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2158 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2159 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2160 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2163 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: diff the two blocks, transform, and return the sum of
 * absolute DCT coefficients. NOTE(review): the forward-transform call
 * between lines 2174 and 2176 is one of the lines missing from this
 * listing capture — confirm against the upstream file. */
2168 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2169 MpegEncContext * const s= (MpegEncContext *)c;
2170 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2174 s->dsp.diff_pixels(temp, src1, src2, stride);
2176 return s->dsp.sum_abs_dctelem(temp);
/* One 8-point 1-D integer DCT pass (used by dct264_sad below). SRC/DST
 * are macros bound by the caller, so the same body serves both row and
 * column passes. s* are even-part sums, d* odd-part differences; the
 * >>1 / >>2 terms are the scaled taps of the integer approximation.
 * The macro's opening line and some DST lines fall outside this capture. */\
2181 const int s07 = SRC(0) + SRC(7);\
2182 const int s16 = SRC(1) + SRC(6);\
2183 const int s25 = SRC(2) + SRC(5);\
2184 const int s34 = SRC(3) + SRC(4);\
2185 const int a0 = s07 + s34;\
2186 const int a1 = s16 + s25;\
2187 const int a2 = s07 - s34;\
2188 const int a3 = s16 - s25;\
2189 const int d07 = SRC(0) - SRC(7);\
2190 const int d16 = SRC(1) - SRC(6);\
2191 const int d25 = SRC(2) - SRC(5);\
2192 const int d34 = SRC(3) - SRC(4);\
2193 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2194 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2195 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2196 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2198 DST(1, a4 + (a7>>2)) ;\
2199 DST(2, a2 + (a3>>1)) ;\
2200 DST(3, a5 + (a6>>2)) ;\
2202 DST(5, a6 - (a5>>2)) ;\
2203 DST(6, (a2>>1) - a3 ) ;\
2204 DST(7, (a4>>2) - a7 ) ;\
/* H.264-style DCT SAD: row pass rewrites dct[] in place, then SRC/DST
 * are rebound so the column pass accumulates |coefficient| into sum.
 * The dct declaration and return are missing from this capture. */
2207 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2208 MpegEncContext * const s= (MpegEncContext *)c;
2213 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2215 #define SRC(x) dct[i][x]
2216 #define DST(x,v) dct[i][x]= v
2217 for( i = 0; i < 8; i++ )
2222 #define SRC(x) dct[x][i]
2223 #define DST(x,v) sum += FFABS(v)
2224 for( i = 0; i < 8; i++ )
/* Largest |DCT coefficient| of the block difference. NOTE(review): the
 * forward-transform call and the coefficient loop header are among the
 * lines missing from this listing capture. */
2232 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2233 MpegEncContext * const s= (MpegEncContext *)c;
2234 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2239 s->dsp.diff_pixels(temp, src1, src2, stride);
2243 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantisation-noise metric: diff the blocks, save a copy (bak), run
 * quantise -> dequantise -> IDCT on the working copy, and return the
 * SSE between reconstruction and the saved original coefficients. */
2248 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2249 MpegEncContext * const s= (MpegEncContext *)c;
2250 LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2251 int16_t * const bak = temp+64;
2257 s->dsp.diff_pixels(temp, src1, src2, stride);
2259 memcpy(bak, temp, 64*sizeof(int16_t));
2261 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2262 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2263 ff_simple_idct_8(temp); //FIXME
2266 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for one 8x8 block: quantise the difference,
 * price the coefficients with the codec's VLC length tables (escape
 * cost for levels outside +/-127), reconstruct, and combine SSE
 * distortion with the bit cost scaled by ~qscale^2 (109/128 lambda).
 * Loop headers, run/level bookkeeping and intra/inter branch lines are
 * missing from this listing capture. */
2271 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2272 MpegEncContext * const s= (MpegEncContext *)c;
2273 const uint8_t *scantable= s->intra_scantable.permutated;
2274 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2275 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2276 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2277 int i, last, run, bits, level, distortion, start_i;
2278 const int esc_length= s->ac_esc_length;
2280 uint8_t * last_length;
2284 copy_block8(lsrc1, src1, 8, stride, 8);
2285 copy_block8(lsrc2, src2, 8, stride, 8);
2287 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2289 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2295 length = s->intra_ac_vlc_length;
2296 last_length= s->intra_ac_vlc_last_length;
2297 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2300 length = s->inter_ac_vlc_length;
2301 last_length= s->inter_ac_vlc_last_length;
2306 for(i=start_i; i<last; i++){
2307 int j= scantable[i];
2312 if((level&(~127)) == 0){
2313 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2322 level= temp[i] + 64;
2324 av_assert2(level - 64);
2326 if((level&(~127)) == 0){
2327 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2335 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2337 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2340 s->dsp.idct_add(lsrc2, 8, temp);
2342 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2344 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-cost-only variant of rd8x8: quantise the difference and price the
 * coefficients with the VLC length tables, without reconstructing or
 * measuring distortion. Loop headers, run/level bookkeeping and the
 * return are missing from this listing capture. */
2347 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2348 MpegEncContext * const s= (MpegEncContext *)c;
2349 const uint8_t *scantable= s->intra_scantable.permutated;
2350 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2351 int i, last, run, bits, level, start_i;
2352 const int esc_length= s->ac_esc_length;
2354 uint8_t * last_length;
2358 s->dsp.diff_pixels(temp, src1, src2, stride);
2360 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2366 length = s->intra_ac_vlc_length;
2367 last_length= s->intra_ac_vlc_last_length;
2368 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2371 length = s->inter_ac_vlc_length;
2372 last_length= s->inter_ac_vlc_last_length;
2377 for(i=start_i; i<last; i++){
2378 int j= scantable[i];
2383 if((level&(~127)) == 0){
2384 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2393 level= temp[i] + 64;
2395 av_assert2(level - 64);
2397 if((level&(~127)) == 0){
2398 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* VSAD_INTRA(size): generates vsad_intra{size}_c — vertical SAD within one
 * block: sum of |s[x] - s[x+stride]| over rows 1..h-1, four columns per
 * inner step. Measures vertical activity of a block against itself
 * (second source pointer is unused, hence 'dummy'). */
2406 #define VSAD_INTRA(size) \
2407 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2411 for(y=1; y<h; y++){ \
2412 for(x=0; x<size; x+=4){ \
2413 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2414 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* vsad16_c(): vertical SAD of the 16-wide *difference* s1-s2 — sums
 * |(s1[x]-s2[x]) - (s1[x+stride]-s2[x+stride])| over rows.
 * NOTE(review): loop header/return lines are missing from this excerpt. */
2424 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2429 for(x=0; x<16; x++){
2430 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ(a): squared value — argument fully parenthesized against precedence. */
2439 #define SQ(a) ((a)*(a))
/* VSSE_INTRA(size): generates vsse_intra{size}_c — like VSAD_INTRA above
 * but accumulating squared vertical differences instead of absolute ones. */
2440 #define VSSE_INTRA(size) \
2441 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2445 for(y=1; y<h; y++){ \
2446 for(x=0; x<size; x+=4){ \
2447 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2448 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* vsse16_c(): squared-error counterpart of vsad16_c — vertical SSE of the
 * 16-wide difference s1-s2 between adjacent rows. */
2458 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2463 for(x=0; x<16; x++){
2464 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* ssd_int8_vs_int16_c(): sum of squared differences between an int8 array
 * and an int16 array of 'size' elements. */
2473 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2477 for(i=0; i<size; i++)
2478 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* WRAPPER8_16_SQ(name8, name16): builds a 16x16 compare function from an
 * 8x8 one by summing the four 8x8 quadrant scores (top-left, top-right,
 * then — after advancing dst/src, not shown in this excerpt — the bottom
 * two quadrants). */
2482 #define WRAPPER8_16_SQ(name8, name16)\
2483 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
2485 score +=name8(s, dst , src , stride, 8);\
2486 score +=name8(s, dst+8 , src+8 , stride, 8);\
2490 score +=name8(s, dst , src , stride, 8);\
2491 score +=name8(s, dst+8 , src+8 , stride, 8);\
/* instantiate 16x16 variants of each 8x8 comparison function */
2496 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2497 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2498 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2500 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2502 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2503 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2504 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2505 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* clipf_c_one(): clip one float (viewed as its uint32 bit pattern) for the
 * opposite-sign case, i.e. min < 0 < max. Negative floats compare inverted
 * as unsigned ints, so: a > mini means a is a negative value below min ->
 * return min; XOR-ing the sign bit lets positives be compared against max. */
2507 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2508 uint32_t maxi, uint32_t maxisign)
2511 if(a > mini) return mini;
2512 else if((a^(1U<<31)) > maxisign) return maxi;
/* vector_clipf_c_opposite_sign(): clip a float vector when min < 0 < max,
 * operating on the raw bit patterns via clipf_c_one(). Unrolled 8x; len is
 * assumed to be a multiple of 8.
 * NOTE(review): float<->uint32 punning via pointer casts violates strict
 * aliasing — kept as-is here (pre-existing code, doc-only pass). */
2516 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2518 uint32_t mini = *(uint32_t*)min;
2519 uint32_t maxi = *(uint32_t*)max;
2520 uint32_t maxisign = maxi ^ (1U<<31);
2521 uint32_t *dsti = (uint32_t*)dst;
2522 const uint32_t *srci = (const uint32_t*)src;
2523 for(i=0; i<len; i+=8) {
2524 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2525 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2526 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2527 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2528 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2529 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2530 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2531 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* vector_clipf_c(): clip each float in src to [min, max], 8x unrolled
 * (len assumed a multiple of 8). Delegates to the bit-pattern fast path
 * when min and max straddle zero; otherwise uses plain av_clipf. */
2534 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2536 if(min < 0 && max > 0) {
2537 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2539 for(i=0; i < len; i+=8) {
2540 dst[i ] = av_clipf(src[i ], min, max);
2541 dst[i + 1] = av_clipf(src[i + 1], min, max);
2542 dst[i + 2] = av_clipf(src[i + 2], min, max);
2543 dst[i + 3] = av_clipf(src[i + 3], min, max);
2544 dst[i + 4] = av_clipf(src[i + 4], min, max);
2545 dst[i + 5] = av_clipf(src[i + 5], min, max);
2546 dst[i + 6] = av_clipf(src[i + 6], min, max);
2547 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* scalarproduct_int16_c(): dot product of two int16 vectors of length
 * 'order', accumulated in int32. */
2552 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2557 res += *v1++ * *v2++;
/* scalarproduct_and_madd_int16_c(): dot product of v1,v2 while also doing
 * v1 += mul * v3 in the same pass (only the madd line is visible here). */
2562 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2567 *v1++ += mul * *v3++;
/* apply_window_int16_c(): multiply input by a symmetric int16 window with
 * Q15 rounding ((x*w + 2^14) >> 15). Exploits symmetry: window[i] is
 * applied to both input[i] and its mirror input[len-1-i]. */
2572 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2573 const int16_t *window, unsigned int len)
2576 int len2 = len >> 1;
2578 for (i = 0; i < len2; i++) {
2579 int16_t w = window[i];
2580 output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2581 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/* vector_clip_int32_c(): clip each int32 in src to [min, max], unrolled 8x
 * (len assumed a multiple of 8; the enclosing loop header is not visible
 * in this excerpt). */
2585 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2586 int32_t max, unsigned int len)
2589 *dst++ = av_clip(*src++, min, max);
2590 *dst++ = av_clip(*src++, min, max);
2591 *dst++ = av_clip(*src++, min, max);
2592 *dst++ = av_clip(*src++, min, max);
2593 *dst++ = av_clip(*src++, min, max);
2594 *dst++ = av_clip(*src++, min, max);
2595 *dst++ = av_clip(*src++, min, max);
2596 *dst++ = av_clip(*src++, min, max);
/* jref IDCT wrappers: each pairs the reference inverse DCT (full-size,
 * 4-point, 2-point, 1-point variants for lowres decoding) with a clamped
 * put (overwrite dest) or add (accumulate into dest) of the result. */
2601 static void ff_jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2603 ff_j_rev_dct (block);
2604 put_pixels_clamped_c(block, dest, line_size);
2606 static void ff_jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2608 ff_j_rev_dct (block);
2609 add_pixels_clamped_c(block, dest, line_size);
/* 4x4 variants, used at lowres==1 */
2612 static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
2614 ff_j_rev_dct4 (block);
2615 put_pixels_clamped4_c(block, dest, line_size);
2617 static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
2619 ff_j_rev_dct4 (block);
2620 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 variants, used at lowres==2 */
2623 static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
2625 ff_j_rev_dct2 (block);
2626 put_pixels_clamped2_c(block, dest, line_size);
2628 static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
2630 ff_j_rev_dct2 (block);
2631 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 variants (lowres==3): only the DC term survives, so the "IDCT" is
 * just a rounded shift of block[0] */
2634 static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
2636 dest[0] = av_clip_uint8((block[0] + 4)>>3);
2638 static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
2640 dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2643 /* init static data */
/* Fills ff_squareTbl so that squareTbl[x+256] == x*x for x in [-256,255],
 * and builds the non-permutated inverse zigzag (+1 bias) used by the MMX
 * quantizer. Called once at codec registration time. */
2644 av_cold void ff_dsputil_static_init(void)
2648 for(i=0;i<512;i++) {
2649 ff_squareTbl[i] = (i - 256) * (i - 256);
2652 for(i=0; i<64; i++) ff_inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* ff_check_alignment(): sanity check that the compiler honors 16-byte
 * stack alignment (required by SIMD code). Logs a one-time warning on
 * MMX/AltiVec builds if a LOCAL_ALIGNED_16 variable is misaligned;
 * 'did_fail' suppresses repeated reports. */
2655 int ff_check_alignment(void){
2656 static int did_fail=0;
2657 LOCAL_ALIGNED_16(int, aligned, [4]);
2659 if((intptr_t)aligned & 15){
2661 #if HAVE_MMX || HAVE_ALTIVEC
2662 av_log(NULL, AV_LOG_ERROR,
2663 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2664 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2665 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2666 "Do not report crashes to FFmpeg developers.\n");
/* ff_dsputil_init(): populate a DSPContext with the C reference
 * implementations of every DSP routine, selecting DCT/IDCT variants from
 * avctx settings (dct_algo, idct_algo, lowres, bits_per_raw_sample), then
 * let per-architecture init functions override entries with SIMD versions.
 * NOTE(review): this excerpt omits many lines (braces, some assignments);
 * comments describe only the visible structure. */
2675 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2677 ff_check_alignment();
/* ---- forward DCT selection (encoder side) ---- */
2680 if (avctx->bits_per_raw_sample == 10) {
2681 c->fdct = ff_jpeg_fdct_islow_10;
2682 c->fdct248 = ff_fdct248_islow_10;
2684 if(avctx->dct_algo==FF_DCT_FASTINT) {
2685 c->fdct = ff_fdct_ifast;
2686 c->fdct248 = ff_fdct_ifast248;
2688 else if(avctx->dct_algo==FF_DCT_FAAN) {
2689 c->fdct = ff_faandct;
2690 c->fdct248 = ff_faandct248;
2693 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2694 c->fdct248 = ff_fdct248_islow_8;
2697 #endif //CONFIG_ENCODERS
/* ---- inverse DCT selection: reduced-size jref IDCTs for lowres ---- */
2699 if(avctx->lowres==1){
2700 c->idct_put= ff_jref_idct4_put;
2701 c->idct_add= ff_jref_idct4_add;
2702 c->idct = ff_j_rev_dct4;
2703 c->idct_permutation_type= FF_NO_IDCT_PERM;
2704 }else if(avctx->lowres==2){
2705 c->idct_put= ff_jref_idct2_put;
2706 c->idct_add= ff_jref_idct2_add;
2707 c->idct = ff_j_rev_dct2;
2708 c->idct_permutation_type= FF_NO_IDCT_PERM;
2709 }else if(avctx->lowres==3){
2710 c->idct_put= ff_jref_idct1_put;
2711 c->idct_add= ff_jref_idct1_add;
2712 c->idct = ff_j_rev_dct1;
2713 c->idct_permutation_type= FF_NO_IDCT_PERM;
/* full-resolution: 10-bit simple IDCT, or algo chosen by idct_algo */
2715 if (avctx->bits_per_raw_sample == 10) {
2716 c->idct_put = ff_simple_idct_put_10;
2717 c->idct_add = ff_simple_idct_add_10;
2718 c->idct = ff_simple_idct_10;
2719 c->idct_permutation_type = FF_NO_IDCT_PERM;
2721 if(avctx->idct_algo==FF_IDCT_INT){
2722 c->idct_put= ff_jref_idct_put;
2723 c->idct_add= ff_jref_idct_add;
2724 c->idct = ff_j_rev_dct;
2725 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2726 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2727 c->idct_put= ff_faanidct_put;
2728 c->idct_add= ff_faanidct_add;
2729 c->idct = ff_faanidct;
2730 c->idct_permutation_type= FF_NO_IDCT_PERM;
2731 }else{ //accurate/default
2732 c->idct_put = ff_simple_idct_put_8;
2733 c->idct_add = ff_simple_idct_add_8;
2734 c->idct = ff_simple_idct_8;
2735 c->idct_permutation_type= FF_NO_IDCT_PERM;
/* ---- basic pixel helpers ---- */
2740 c->diff_pixels = diff_pixels_c;
2741 c->put_pixels_clamped = put_pixels_clamped_c;
2742 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2743 c->add_pixels_clamped = add_pixels_clamped_c;
2744 c->sum_abs_dctelem = sum_abs_dctelem_c;
2747 c->pix_sum = pix_sum_c;
2748 c->pix_norm1 = pix_norm1_c;
2750 c->fill_block_tab[0] = fill_block16_c;
2751 c->fill_block_tab[1] = fill_block8_c;
/* ---- SAD with half-pel interpolation: [0]=16x16, [1]=8x8 ---- */
2753 /* TODO [0] 16 [1] 8 */
2754 c->pix_abs[0][0] = pix_abs16_c;
2755 c->pix_abs[0][1] = pix_abs16_x2_c;
2756 c->pix_abs[0][2] = pix_abs16_y2_c;
2757 c->pix_abs[0][3] = pix_abs16_xy2_c;
2758 c->pix_abs[1][0] = pix_abs8_c;
2759 c->pix_abs[1][1] = pix_abs8_x2_c;
2760 c->pix_abs[1][2] = pix_abs8_y2_c;
2761 c->pix_abs[1][3] = pix_abs8_xy2_c;
/* ---- third-pel motion compensation (SVQ3); index = y*4 + x ---- */
2763 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2764 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2765 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2766 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2767 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2768 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2769 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2770 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2771 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2773 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2774 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2775 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2776 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2777 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2778 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2779 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2780 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2781 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* dspfunc: fill all 16 quarter-pel positions (mcXY, X/Y in 0..3) of one
 * qpel table row from the generated PFX##NUM##_mcXY_c functions */
2783 #define dspfunc(PFX, IDX, NUM) \
2784 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2785 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2786 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2787 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2788 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2789 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2790 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2791 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2792 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2793 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2794 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2795 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2796 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2797 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2798 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2799 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2801 dspfunc(put_qpel, 0, 16);
2802 dspfunc(put_no_rnd_qpel, 0, 16);
2804 dspfunc(avg_qpel, 0, 16);
2805 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2807 dspfunc(put_qpel, 1, 8);
2808 dspfunc(put_no_rnd_qpel, 1, 8);
2810 dspfunc(avg_qpel, 1, 8);
2811 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* ---- WMV2 mspel motion compensation ---- */
2815 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2816 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2817 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2818 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2819 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2820 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2821 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2822 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* SET_CMP_FUNC: wire [0]=16x16 and [1]=8x8 variants of a compare fn */
2824 #define SET_CMP_FUNC(name) \
2825 c->name[0]= name ## 16_c;\
2826 c->name[1]= name ## 8x8_c;
2828 SET_CMP_FUNC(hadamard8_diff)
2829 c->hadamard8_diff[4]= hadamard8_intra16_c;
2830 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2831 SET_CMP_FUNC(dct_sad)
2832 SET_CMP_FUNC(dct_max)
2834 SET_CMP_FUNC(dct264_sad)
2836 c->sad[0]= pix_abs16_c;
2837 c->sad[1]= pix_abs8_c;
2841 SET_CMP_FUNC(quant_psnr)
/* vsad/vsse: indices [4]/[5] are the intra (self-referencing) variants */
2844 c->vsad[0]= vsad16_c;
2845 c->vsad[4]= vsad_intra16_c;
2846 c->vsad[5]= vsad_intra8_c;
2847 c->vsse[0]= vsse16_c;
2848 c->vsse[4]= vsse_intra16_c;
2849 c->vsse[5]= vsse_intra8_c;
2850 c->nsse[0]= nsse16_c;
2851 c->nsse[1]= nsse8_c;
2852 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2853 ff_dsputil_init_dwt(c);
2856 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* ---- lossless/HuffYUV helpers and byte-swapping ---- */
2858 c->add_bytes= add_bytes_c;
2859 c->diff_bytes= diff_bytes_c;
2860 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2861 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2862 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2863 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2864 c->bswap_buf= bswap_buf;
2865 c->bswap16_buf = bswap16_buf;
2867 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2868 c->h263_h_loop_filter= h263_h_loop_filter_c;
2869 c->h263_v_loop_filter= h263_v_loop_filter_c;
2872 c->try_8x8basis= try_8x8basis_c;
2873 c->add_8x8basis= add_8x8basis_c;
/* ---- generic vector utilities ---- */
2875 c->vector_clipf = vector_clipf_c;
2876 c->scalarproduct_int16 = scalarproduct_int16_c;
2877 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2878 c->apply_window_int16 = apply_window_int16_c;
2879 c->vector_clip_int32 = vector_clip_int32_c;
/* shrink[n]: downscale a plane by 2^n in each dimension (0 = plain copy) */
2881 c->shrink[0]= av_image_copy_plane;
2882 c->shrink[1]= ff_shrink22;
2883 c->shrink[2]= ff_shrink44;
2884 c->shrink[3]= ff_shrink88;
2886 c->add_pixels8 = add_pixels8_c;
/* ---- bit-depth-dependent entry points ---- */
2890 #define FUNC(f, depth) f ## _ ## depth
2891 #define FUNCC(f, depth) f ## _ ## depth ## _c
2893 #define BIT_DEPTH_FUNCS(depth) \
2894 c->get_pixels = FUNCC(get_pixels, depth);
2896 c->draw_edges = FUNCC(draw_edges, 8);
2897 c->clear_block = FUNCC(clear_block, 8);
2898 c->clear_blocks = FUNCC(clear_blocks, 8);
2900 switch (avctx->bits_per_raw_sample) {
2905 BIT_DEPTH_FUNCS(16);
2908 if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
/* ---- let each architecture override entries with optimized versions ---- */
2915 if (HAVE_MMX) ff_dsputil_init_mmx (c, avctx);
2916 if (ARCH_ARM) ff_dsputil_init_arm (c, avctx);
2917 if (HAVE_VIS) ff_dsputil_init_vis (c, avctx);
2918 if (ARCH_ALPHA) ff_dsputil_init_alpha (c, avctx);
2919 if (ARCH_PPC) ff_dsputil_init_ppc (c, avctx);
2920 if (ARCH_SH4) ff_dsputil_init_sh4 (c, avctx);
2921 if (ARCH_BFIN) ff_dsputil_init_bfin (c, avctx);
/* build the coefficient permutation matching the chosen IDCT */
2923 ff_init_scantable_permutation(c->idct_permutation,
2924 c->idct_permutation_type);
2927 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2929 ff_dsputil_init(c, avctx);