3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/attributes.h"
31 #include "libavutil/imgutils.h"
32 #include "libavutil/internal.h"
34 #include "copy_block.h"
37 #include "simple_idct.h"
40 #include "imgconvert.h"
42 #include "mpegvideo.h"
/* Table of squares addressed as sq[-255..255] through a +256 offset pointer
 * (see pix_norm1_c / sse*_c below); zero here, presumably filled by an init
 * function not present in this listing — TODO confirm. */
46 uint32_t ff_squareTbl[512] = {0, };
49 #include "dsputil_template.c"
53 #include "dsputil_template.c"
55 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 is 0x0101...01 (one 0x01 per byte of unsigned long), so the
 * multiplication replicates the constant into every byte lane. */
56 #define pb_7f (~0UL/255 * 0x7f)
57 #define pb_80 (~0UL/255 * 0x80)
59 /* Specific zigzag scan for 248 idct. NOTE that unlike the
60 specification, we interleave the fields */
/* Maps scan position -> coefficient index for the 2x4x8 (248) IDCT.
 * NOTE(review): the initializer's closing line is elided in this listing. */
61 const uint8_t ff_zigzag248_direct[64] = {
62 0, 8, 1, 9, 16, 24, 2, 10,
63 17, 25, 32, 40, 48, 56, 33, 41,
64 18, 26, 3, 11, 4, 12, 19, 27,
65 34, 42, 49, 57, 50, 58, 35, 43,
66 20, 28, 5, 13, 6, 14, 21, 29,
67 36, 44, 51, 59, 52, 60, 37, 45,
68 22, 30, 7, 15, 23, 31, 38, 46,
69 53, 61, 54, 62, 39, 47, 55, 63,
/* Alternate horizontal scan order (scan position -> coefficient index).
 * NOTE(review): closing line of the initializer is elided in this listing. */
72 const uint8_t ff_alternate_horizontal_scan[64] = {
73 0, 1, 2, 3, 8, 9, 16, 17,
74 10, 11, 4, 5, 6, 7, 15, 14,
75 13, 12, 19, 18, 24, 25, 32, 33,
76 26, 27, 20, 21, 22, 23, 28, 29,
77 30, 31, 34, 35, 40, 41, 48, 49,
78 42, 43, 36, 37, 38, 39, 44, 45,
79 46, 47, 50, 51, 56, 57, 58, 59,
80 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order (scan position -> coefficient index).
 * NOTE(review): closing line of the initializer is elided in this listing. */
83 const uint8_t ff_alternate_vertical_scan[64] = {
84 0, 8, 16, 24, 1, 9, 2, 10,
85 17, 25, 32, 40, 48, 56, 57, 49,
86 41, 33, 26, 18, 3, 11, 4, 12,
87 19, 27, 34, 42, 50, 58, 35, 43,
88 51, 59, 20, 28, 5, 13, 6, 14,
89 21, 29, 36, 44, 52, 60, 37, 45,
90 53, 61, 22, 30, 7, 15, 23, 31,
91 38, 46, 54, 62, 39, 47, 55, 63,
94 /* Input permutation for the simple_idct_mmx */
/* Entries are coefficient indices written in hex (row = high nibble-ish
 * bits, column = low bits).  Closing line elided in this listing. */
95 static const uint8_t simple_mmx_permutation[64]={
96 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
97 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
98 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
99 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
100 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
101 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
102 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
103 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Within-row column permutation for the SSE2 IDCT input layout:
 * interleaves even and odd columns (0,4,1,5,2,6,3,7). */
106 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/*
 * Initialize a ScanTable: keep a pointer to the source scan order and build
 * st->permutated[] by routing each scan entry through the IDCT coefficient
 * permutation.  NOTE(review): the loop headers, the raster_end computation
 * and other interior lines are elided in this listing; the comments below
 * cover only the visible statements.
 */
108 av_cold void ff_init_scantable(uint8_t *permutation, ScanTable *st,
109 const uint8_t *src_scantable)
114 st->scantable= src_scantable;
/* permutated[i] = permutation[src_scantable[i]]: scan pos -> permuted index */
118 j = src_scantable[i];
119 st->permutated[i] = permutation[j];
/* raster_end[] is derived from the permuted order (computation elided) */
125 j = st->permutated[i];
127 st->raster_end[i]= end;
/*
 * Fill idct_permutation[0..63] for the requested permutation type.  Each
 * case maps a natural coefficient index i (row = i >> 3, column = i & 7) to
 * the layout expected by the corresponding IDCT implementation.
 * NOTE(review): the per-case for-loop headers and break statements are
 * elided in this listing.
 */
131 av_cold void ff_init_scantable_permutation(uint8_t *idct_permutation,
132 int idct_permutation_type)
136 switch(idct_permutation_type){
137 case FF_NO_IDCT_PERM:
/* identity mapping */
139 idct_permutation[i]= i;
141 case FF_LIBMPEG2_IDCT_PERM:
/* keep the row (i & 0x38), rotate the three column bits */
143 idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
145 case FF_SIMPLE_IDCT_PERM:
/* table-driven layout for the simple MMX IDCT (see simple_mmx_permutation) */
147 idct_permutation[i]= simple_mmx_permutation[i];
149 case FF_TRANSPOSE_IDCT_PERM:
/* full 8x8 transpose: swap row and column */
151 idct_permutation[i]= ((i&7)<<3) | (i>>3);
153 case FF_PARTTRANS_IDCT_PERM:
/* partial transpose: swap the low 2 bits of row and column, keep bits 2 and 5 */
155 idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
157 case FF_SSE2_IDCT_PERM:
/* keep the row, permute within the row via idct_sse2_row_perm */
159 idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
/* unknown type: report the internal error, table is left unset */
162 av_log(NULL, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
/*
 * Sum of all pixel values in a 16x16 block.  NOTE(review): the per-pixel
 * accumulation inside the inner loop and the return are elided in this
 * listing.
 */
166 static int pix_sum_c(uint8_t * pix, int line_size)
171 for (i = 0; i < 16; i++) {
172 for (j = 0; j < 16; j += 8) {
/* the inner loop advanced pix by 16; step to the start of the next row */
183 pix += line_size - 16;
/*
 * Sum of squared pixel values over a 16x16 block, squaring via the
 * ff_squareTbl lookup (sq points at the table's zero offset).  Two visible
 * variants read 8 pixels per iteration as one 64-bit load or two 32-bit
 * loads; the #if/#else lines selecting between them and the surrounding
 * declarations are elided in this listing.  NOTE(review): the casted integer
 * loads assume the platform tolerates this access pattern.
 */
188 static int pix_norm1_c(uint8_t * pix, int line_size)
191 uint32_t *sq = ff_squareTbl + 256;
194 for (i = 0; i < 16; i++) {
195 for (j = 0; j < 16; j += 8) {
/* 64-bit path: square each of the 8 bytes via table lookup */
207 register uint64_t x=*(uint64_t*)pix;
209 s += sq[(x>>8)&0xff];
210 s += sq[(x>>16)&0xff];
211 s += sq[(x>>24)&0xff];
212 s += sq[(x>>32)&0xff];
213 s += sq[(x>>40)&0xff];
214 s += sq[(x>>48)&0xff];
215 s += sq[(x>>56)&0xff];
/* 32-bit path: the same 8 bytes via two 4-byte loads */
217 register uint32_t x=*(uint32_t*)pix;
219 s += sq[(x>>8)&0xff];
220 s += sq[(x>>16)&0xff];
221 s += sq[(x>>24)&0xff];
222 x=*(uint32_t*)(pix+4);
224 s += sq[(x>>8)&0xff];
225 s += sq[(x>>16)&0xff];
226 s += sq[(x>>24)&0xff];
/* step to the start of the next row */
231 pix += line_size - 16;
/*
 * Byte-swap w 32-bit words from src into dst.  Main loop is unrolled 8x;
 * the trailing line handles leftover words (its loop header is elided in
 * this listing).
 */
236 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
239 for(i=0; i+8<=w; i+=8){
240 dst[i+0]= av_bswap32(src[i+0]);
241 dst[i+1]= av_bswap32(src[i+1]);
242 dst[i+2]= av_bswap32(src[i+2]);
243 dst[i+3]= av_bswap32(src[i+3]);
244 dst[i+4]= av_bswap32(src[i+4]);
245 dst[i+5]= av_bswap32(src[i+5]);
246 dst[i+6]= av_bswap32(src[i+6]);
247 dst[i+7]= av_bswap32(src[i+7]);
/* tail: remaining (w & 7) words, one per iteration */
250 dst[i+0]= av_bswap32(src[i+0]);
/* Byte-swap len 16-bit values from src to dst (loop header elided). */
254 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
257 *dst++ = av_bswap16(*src++);
/*
 * Sum of squared errors over a 4-pixel-wide block of height h; differences
 * in -255..255 are squared through ff_squareTbl (sq[-255..255] via the +256
 * offset).  NOTE(review): the per-row pointer advances and the return are
 * elided in this listing.
 */
260 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
263 uint32_t *sq = ff_squareTbl + 256;
266 for (i = 0; i < h; i++) {
267 s += sq[pix1[0] - pix2[0]];
268 s += sq[pix1[1] - pix2[1]];
269 s += sq[pix1[2] - pix2[2]];
270 s += sq[pix1[3] - pix2[3]];
/*
 * Sum of squared errors over an 8-pixel-wide block of height h; same
 * table-lookup squaring as sse4_c.  Row advances and return elided in this
 * listing.
 */
277 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
280 uint32_t *sq = ff_squareTbl + 256;
283 for (i = 0; i < h; i++) {
284 s += sq[pix1[0] - pix2[0]];
285 s += sq[pix1[1] - pix2[1]];
286 s += sq[pix1[2] - pix2[2]];
287 s += sq[pix1[3] - pix2[3]];
288 s += sq[pix1[4] - pix2[4]];
289 s += sq[pix1[5] - pix2[5]];
290 s += sq[pix1[6] - pix2[6]];
291 s += sq[pix1[7] - pix2[7]];
/*
 * Sum of squared errors over a 16-pixel-wide block of height h; same
 * table-lookup squaring as sse4_c/sse8_c.  Row advances and return elided
 * in this listing.
 */
298 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
301 uint32_t *sq = ff_squareTbl + 256;
304 for (i = 0; i < h; i++) {
305 s += sq[pix1[ 0] - pix2[ 0]];
306 s += sq[pix1[ 1] - pix2[ 1]];
307 s += sq[pix1[ 2] - pix2[ 2]];
308 s += sq[pix1[ 3] - pix2[ 3]];
309 s += sq[pix1[ 4] - pix2[ 4]];
310 s += sq[pix1[ 5] - pix2[ 5]];
311 s += sq[pix1[ 6] - pix2[ 6]];
312 s += sq[pix1[ 7] - pix2[ 7]];
313 s += sq[pix1[ 8] - pix2[ 8]];
314 s += sq[pix1[ 9] - pix2[ 9]];
315 s += sq[pix1[10] - pix2[10]];
316 s += sq[pix1[11] - pix2[11]];
317 s += sq[pix1[12] - pix2[12]];
318 s += sq[pix1[13] - pix2[13]];
319 s += sq[pix1[14] - pix2[14]];
320 s += sq[pix1[15] - pix2[15]];
/*
 * One row of block[i] = s1[i] - s2[i] (int16_t holds the full -255..255
 * range).  NOTE(review): the enclosing 8-row loop and the block/s1/s2
 * stride advances are elided in this listing.
 */
328 static void diff_pixels_c(int16_t *av_restrict block, const uint8_t *s1,
329 const uint8_t *s2, int stride){
332 /* read the pixels */
334 block[0] = s1[0] - s2[0];
335 block[1] = s1[1] - s2[1];
336 block[2] = s1[2] - s2[2];
337 block[3] = s1[3] - s2[3];
338 block[4] = s1[4] - s2[4];
339 block[5] = s1[5] - s2[5];
340 block[6] = s1[6] - s2[6];
341 block[7] = s1[7] - s2[7];
/*
 * Store one row of 8 coefficients into pixels, clamped to 0..255.
 * NOTE(review): the 8-row loop and pointer advances are elided in this
 * listing.
 */
348 static void put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
353 /* read the pixels */
355 pixels[0] = av_clip_uint8(block[0]);
356 pixels[1] = av_clip_uint8(block[1]);
357 pixels[2] = av_clip_uint8(block[2]);
358 pixels[3] = av_clip_uint8(block[3]);
359 pixels[4] = av_clip_uint8(block[4]);
360 pixels[5] = av_clip_uint8(block[5]);
361 pixels[6] = av_clip_uint8(block[6]);
362 pixels[7] = av_clip_uint8(block[7]);
/* 4-pixel-wide variant of put_pixels_clamped_c (row loop elided). */
369 static void put_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
374 /* read the pixels */
376 pixels[0] = av_clip_uint8(block[0]);
377 pixels[1] = av_clip_uint8(block[1]);
378 pixels[2] = av_clip_uint8(block[2]);
379 pixels[3] = av_clip_uint8(block[3]);
/* 2-pixel-wide variant of put_pixels_clamped_c (row loop elided). */
386 static void put_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
391 /* read the pixels */
393 pixels[0] = av_clip_uint8(block[0]);
394 pixels[1] = av_clip_uint8(block[1]);
/*
 * Write signed coefficients as unsigned pixels: clamp to -128..127, then
 * bias by +128.  NOTE(review): the clamp-low branch (*block < -128) and the
 * per-pixel pointer increments are elided between the visible lines.
 */
401 static void put_signed_pixels_clamped_c(const int16_t *block,
402 uint8_t *av_restrict pixels,
407 for (i = 0; i < 8; i++) {
408 for (j = 0; j < 8; j++) {
411 else if (*block > 127)
414 *pixels = (uint8_t)(*block + 128);
/* move to the start of the next output row */
418 pixels += (line_size - 8);
/*
 * Add one row of 8 values to pixels WITHOUT clamping — presumably callers
 * guarantee the sums stay in 0..255 (TODO confirm).  Remaining parameters,
 * the 8-row loop and stride advances are elided in this listing.
 */
422 static void add_pixels8_c(uint8_t *av_restrict pixels,
429 pixels[0] += block[0];
430 pixels[1] += block[1];
431 pixels[2] += block[2];
432 pixels[3] += block[3];
433 pixels[4] += block[4];
434 pixels[5] += block[5];
435 pixels[6] += block[6];
436 pixels[7] += block[7];
/*
 * Add one row of 8 residuals to pixels, clamping the result to 0..255.
 * NOTE(review): the 8-row loop and pointer advances are elided in this
 * listing.
 */
442 static void add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels,
447 /* read the pixels */
449 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
450 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
451 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
452 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
453 pixels[4] = av_clip_uint8(pixels[4] + block[4]);
454 pixels[5] = av_clip_uint8(pixels[5] + block[5]);
455 pixels[6] = av_clip_uint8(pixels[6] + block[6]);
456 pixels[7] = av_clip_uint8(pixels[7] + block[7]);
/* 4-pixel-wide variant of add_pixels_clamped_c (row loop elided). */
462 static void add_pixels_clamped4_c(const int16_t *block, uint8_t *av_restrict pixels,
467 /* read the pixels */
469 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
470 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
471 pixels[2] = av_clip_uint8(pixels[2] + block[2]);
472 pixels[3] = av_clip_uint8(pixels[3] + block[3]);
/* 2-pixel-wide variant of add_pixels_clamped_c (row loop elided). */
478 static void add_pixels_clamped2_c(const int16_t *block, uint8_t *av_restrict pixels,
483 /* read the pixels */
485 pixels[0] = av_clip_uint8(pixels[0] + block[0]);
486 pixels[1] = av_clip_uint8(pixels[1] + block[1]);
/* Sum of absolute values of the DCT coefficients (loop header, accumulator
 * declaration and return elided in this listing). */
492 static int sum_abs_dctelem_c(int16_t *block)
496 sum+= FFABS(block[i]);
/* Fill a 16-wide block of height h with a constant byte; the per-row
 * `block += line_size` advance is elided in this listing. */
500 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
504 for (i = 0; i < h; i++) {
505 memset(block, value, 16);
/* 8-wide variant of fill_block16_c (per-row advance elided). */
510 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
514 for (i = 0; i < h; i++) {
515 memset(block, value, 8);
/*
 * Rounded pixel averages used by the motion-compensation code below.
 *   avg2: (a + b + 1) >> 1          -- round-half-up average of 2 values
 *   avg4: (a + b + c + d + 2) >> 2  -- rounded average of 4 values
 * Arguments are fully parenthesized (CERT PRE01-C) so that expression
 * arguments such as `x + 1` or `x | y` expand correctly; for the simple
 * variable arguments used elsewhere in this file the expansion is identical
 * to the previous, unparenthesized form.
 */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/*
 * 1/16-pel bilinear ("GMC1") interpolation of one 8-pixel row: each dst[i]
 * is the weighted sum of the 2x2 source neighbourhood with weights A..D
 * derived from the fractional position (x16, y16); A+B+C+D == 256, hence
 * the >> 8 normalization after adding `rounder`.  NOTE(review): the h-row
 * loop and the dst/src stride advances are elided in this listing.
 */
523 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
525 const int A=(16-x16)*(16-y16);
526 const int B=( x16)*(16-y16);
527 const int C=(16-x16)*( y16);
528 const int D=( x16)*( y16);
533 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
534 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
535 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
536 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
537 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
538 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
539 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
540 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/*
 * Global motion compensation with an affine per-pixel motion field: the
 * source coordinate for (x, y) is derived from ox/oy advanced by
 * dxx/dxy/dyx/dyy, in 1/(1<<shift)-pel units (s is the full-pel step).
 * Each destination pixel is a bilinear blend of a 2x2 source neighbourhood,
 * with the source coordinate clipped against width/height when it leaves
 * the picture.  NOTE(review): the y loop, the coordinate/fraction updates
 * and the rounding with r are elided in this listing.
 */
546 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
547 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
550 const int s= 1<<shift;
560 for(x=0; x<8; x++){ //XXX FIXME optimize
561 int src_x, src_y, frac_x, frac_y, index;
/* fully inside the picture: plain 2x2 bilinear interpolation */
570 if((unsigned)src_x < width){
571 if((unsigned)src_y < height){
572 index= src_x + src_y*stride;
573 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
574 + src[index +1]* frac_x )*(s-frac_y)
575 + ( src[index+stride ]*(s-frac_x)
576 + src[index+stride+1]* frac_x )* frac_y
/* x inside, y outside: clip y, interpolate horizontally only */
579 index= src_x + av_clip(src_y, 0, height)*stride;
580 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
581 + src[index +1]* frac_x )*s
/* y inside, x outside: clip x, interpolate vertically only */
585 if((unsigned)src_y < height){
586 index= av_clip(src_x, 0, width) + src_y*stride;
587 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
588 + src[index+stride ]* frac_y )*s
/* both outside: clip both coordinates and copy the nearest pixel */
591 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
592 dst[y*stride + x]= src[index ];
/* Third-pel MC, full-pel (0,0) position: plain block copy dispatched on
 * width (any default case is elided in this listing). */
604 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
606 case 2: put_pixels2_8_c (dst, src, stride, height); break;
607 case 4: put_pixels4_8_c (dst, src, stride, height); break;
608 case 8: put_pixels8_8_c (dst, src, stride, height); break;
609 case 16:put_pixels16_8_c(dst, src, stride, height); break;
/*
 * Third-pel interpolation, "put" variants.  mcXY computes the sample at
 * horizontal third-pel offset X and vertical third-pel offset Y as a
 * fixed-point weighted average of 2 or 4 neighbours:
 *   683  ~= 2^11/3  (shift 11, tap weights summing to 3)
 *   2731 ~= 2^15/12 (shift 15, tap weights summing to 12)
 * with +1 / +6 as the rounding terms.  NOTE(review): the dst/src row-stride
 * advances at the end of each outer loop are elided throughout this listing.
 */
613 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
615 for (i=0; i < height; i++) {
616 for (j=0; j < width; j++) {
617 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
624 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
626 for (i=0; i < height; i++) {
627 for (j=0; j < width; j++) {
628 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
635 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
637 for (i=0; i < height; i++) {
638 for (j=0; j < width; j++) {
639 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
646 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
648 for (i=0; i < height; i++) {
649 for (j=0; j < width; j++) {
650 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
657 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
659 for (i=0; i < height; i++) {
660 for (j=0; j < width; j++) {
661 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
668 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
670 for (i=0; i < height; i++) {
671 for (j=0; j < width; j++) {
672 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
679 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
681 for (i=0; i < height; i++) {
682 for (j=0; j < width; j++) {
683 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
690 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
692 for (i=0; i < height; i++) {
693 for (j=0; j < width; j++) {
694 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, full-pel (0,0) position, averaging with dst: dispatched on
 * width (any default case is elided in this listing). */
701 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
703 case 2: avg_pixels2_8_c (dst, src, stride, height); break;
704 case 4: avg_pixels4_8_c (dst, src, stride, height); break;
705 case 8: avg_pixels8_8_c (dst, src, stride, height); break;
706 case 16:avg_pixels16_8_c(dst, src, stride, height); break;
/*
 * Third-pel interpolation, "avg" variants: compute the same fixed-point
 * interpolation as the corresponding put_tpel_pixels_mcXY_c, then average
 * with the existing dst pixel, rounding up ((dst + interp + 1) >> 1).
 * NOTE(review): the dst/src row-stride advances are elided throughout this
 * listing.
 */
710 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
712 for (i=0; i < height; i++) {
713 for (j=0; j < width; j++) {
714 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
721 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
723 for (i=0; i < height; i++) {
724 for (j=0; j < width; j++) {
725 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
732 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
734 for (i=0; i < height; i++) {
735 for (j=0; j < width; j++) {
736 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
743 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
745 for (i=0; i < height; i++) {
746 for (j=0; j < width; j++) {
747 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
754 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
756 for (i=0; i < height; i++) {
757 for (j=0; j < width; j++) {
758 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
765 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
767 for (i=0; i < height; i++) {
768 for (j=0; j < width; j++) {
769 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
776 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
778 for (i=0; i < height; i++) {
779 for (j=0; j < width; j++) {
780 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
787 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
789 for (i=0; i < height; i++) {
790 for (j=0; j < width; j++) {
791 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
798 #define QPEL_MC(r, OPNAME, RND, OP) \
799 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
800 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
804 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
805 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
806 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
807 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
808 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
809 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
810 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
811 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
817 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
819 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
823 const int src0= src[0*srcStride];\
824 const int src1= src[1*srcStride];\
825 const int src2= src[2*srcStride];\
826 const int src3= src[3*srcStride];\
827 const int src4= src[4*srcStride];\
828 const int src5= src[5*srcStride];\
829 const int src6= src[6*srcStride];\
830 const int src7= src[7*srcStride];\
831 const int src8= src[8*srcStride];\
832 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
833 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
834 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
835 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
836 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
837 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
838 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
839 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
845 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
846 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
851 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
852 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
853 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
854 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
855 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
856 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
857 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
858 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
859 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
860 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
861 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
862 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
863 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
864 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
865 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
866 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
872 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
873 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
878 const int src0= src[0*srcStride];\
879 const int src1= src[1*srcStride];\
880 const int src2= src[2*srcStride];\
881 const int src3= src[3*srcStride];\
882 const int src4= src[4*srcStride];\
883 const int src5= src[5*srcStride];\
884 const int src6= src[6*srcStride];\
885 const int src7= src[7*srcStride];\
886 const int src8= src[8*srcStride];\
887 const int src9= src[9*srcStride];\
888 const int src10= src[10*srcStride];\
889 const int src11= src[11*srcStride];\
890 const int src12= src[12*srcStride];\
891 const int src13= src[13*srcStride];\
892 const int src14= src[14*srcStride];\
893 const int src15= src[15*srcStride];\
894 const int src16= src[16*srcStride];\
895 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
896 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
897 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
898 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
899 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
900 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
901 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
902 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
903 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
904 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
905 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
906 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
907 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
908 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
909 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
910 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
916 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
919 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
920 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
923 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
925 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
928 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
931 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
932 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
935 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
939 copy_block9(full, src, 16, stride, 9);\
940 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
941 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
944 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
947 copy_block9(full, src, 16, stride, 9);\
948 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
951 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
955 copy_block9(full, src, 16, stride, 9);\
956 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
957 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
959 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
965 copy_block9(full, src, 16, stride, 9);\
966 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
967 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
968 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
969 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
971 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
976 copy_block9(full, src, 16, stride, 9);\
977 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
978 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
979 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
980 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
982 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
988 copy_block9(full, src, 16, stride, 9);\
989 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
990 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
991 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
992 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
994 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
999 copy_block9(full, src, 16, stride, 9);\
1000 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1001 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1002 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1003 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1005 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1007 uint8_t full[16*9];\
1010 uint8_t halfHV[64];\
1011 copy_block9(full, src, 16, stride, 9);\
1012 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1013 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1014 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1015 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1017 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1019 uint8_t full[16*9];\
1021 uint8_t halfHV[64];\
1022 copy_block9(full, src, 16, stride, 9);\
1023 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1024 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1025 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1026 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1028 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1030 uint8_t full[16*9];\
1033 uint8_t halfHV[64];\
1034 copy_block9(full, src, 16, stride, 9);\
1035 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1036 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1037 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1038 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1040 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1042 uint8_t full[16*9];\
1044 uint8_t halfHV[64];\
1045 copy_block9(full, src, 16, stride, 9);\
1046 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1047 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1048 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1049 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1051 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1054 uint8_t halfHV[64];\
1055 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1056 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1057 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1059 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1062 uint8_t halfHV[64];\
1063 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1064 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1065 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1067 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1069 uint8_t full[16*9];\
1072 uint8_t halfHV[64];\
1073 copy_block9(full, src, 16, stride, 9);\
1074 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1075 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1076 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1077 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1079 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1081 uint8_t full[16*9];\
1083 copy_block9(full, src, 16, stride, 9);\
1084 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1085 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1086 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1088 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1090 uint8_t full[16*9];\
1093 uint8_t halfHV[64];\
1094 copy_block9(full, src, 16, stride, 9);\
1095 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1096 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1097 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1098 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1100 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1102 uint8_t full[16*9];\
1104 copy_block9(full, src, 16, stride, 9);\
1105 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1106 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1107 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1109 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1112 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1113 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1116 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1119 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1120 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1123 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1125 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1128 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1131 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1132 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1135 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1137 uint8_t full[24*17];\
1139 copy_block17(full, src, 24, stride, 17);\
1140 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1141 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1144 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1146 uint8_t full[24*17];\
1147 copy_block17(full, src, 24, stride, 17);\
1148 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1151 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1153 uint8_t full[24*17];\
1155 copy_block17(full, src, 24, stride, 17);\
1156 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1157 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1159 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1161 uint8_t full[24*17];\
1162 uint8_t halfH[272];\
1163 uint8_t halfV[256];\
1164 uint8_t halfHV[256];\
1165 copy_block17(full, src, 24, stride, 17);\
1166 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1167 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1168 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1169 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1171 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1173 uint8_t full[24*17];\
1174 uint8_t halfH[272];\
1175 uint8_t halfHV[256];\
1176 copy_block17(full, src, 24, stride, 17);\
1177 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1178 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1179 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1180 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1182 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1184 uint8_t full[24*17];\
1185 uint8_t halfH[272];\
1186 uint8_t halfV[256];\
1187 uint8_t halfHV[256];\
1188 copy_block17(full, src, 24, stride, 17);\
1189 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1190 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1191 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1192 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1194 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1196 uint8_t full[24*17];\
1197 uint8_t halfH[272];\
1198 uint8_t halfHV[256];\
1199 copy_block17(full, src, 24, stride, 17);\
1200 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1201 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1202 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1203 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1205 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1207 uint8_t full[24*17];\
1208 uint8_t halfH[272];\
1209 uint8_t halfV[256];\
1210 uint8_t halfHV[256];\
1211 copy_block17(full, src, 24, stride, 17);\
1212 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1213 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1214 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1215 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1217 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1219 uint8_t full[24*17];\
1220 uint8_t halfH[272];\
1221 uint8_t halfHV[256];\
1222 copy_block17(full, src, 24, stride, 17);\
1223 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1224 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1225 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1226 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1228 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1230 uint8_t full[24*17];\
1231 uint8_t halfH[272];\
1232 uint8_t halfV[256];\
1233 uint8_t halfHV[256];\
1234 copy_block17(full, src, 24, stride, 17);\
1235 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1236 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1237 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1238 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1240 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1242 uint8_t full[24*17];\
1243 uint8_t halfH[272];\
1244 uint8_t halfHV[256];\
1245 copy_block17(full, src, 24, stride, 17);\
1246 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1247 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1248 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1249 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1251 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1253 uint8_t halfH[272];\
1254 uint8_t halfHV[256];\
1255 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1256 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1257 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1259 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1261 uint8_t halfH[272];\
1262 uint8_t halfHV[256];\
1263 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1264 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1265 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1267 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1269 uint8_t full[24*17];\
1270 uint8_t halfH[272];\
1271 uint8_t halfV[256];\
1272 uint8_t halfHV[256];\
1273 copy_block17(full, src, 24, stride, 17);\
1274 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1275 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1276 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1277 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1279 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1281 uint8_t full[24*17];\
1282 uint8_t halfH[272];\
1283 copy_block17(full, src, 24, stride, 17);\
1284 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1285 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1286 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1288 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1290 uint8_t full[24*17];\
1291 uint8_t halfH[272];\
1292 uint8_t halfV[256];\
1293 uint8_t halfHV[256];\
1294 copy_block17(full, src, 24, stride, 17);\
1295 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1296 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1297 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1298 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1300 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1302 uint8_t full[24*17];\
1303 uint8_t halfH[272];\
1304 copy_block17(full, src, 24, stride, 17);\
1305 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1306 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1307 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1309 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
1311 uint8_t halfH[272];\
1312 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1313 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/*
 * Pixel write primitives plugged into QPEL_MC above.  'cm' is the clip
 * table (ff_cropTbl + MAX_NEG_CROP) in the expansion context; the filter
 * sum b is rescaled by (b+16)>>5 with rounding.  The _no_rnd variants
 * bias by 15 instead of 16 (round toward zero).
 */
1316 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1317 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1318 #define op_put(a, b) a = cm[((b) + 16)>>5]
1319 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate put / no-round put / avg flavours of all qpel MC functions. */
1321 QPEL_MC(0, put_ , _ , op_put)
1322 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1323 QPEL_MC(0, avg_ , _ , op_avg)
1324 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1326 #undef op_avg_no_rnd
1328 #undef op_put_no_rnd
/**
 * Fixed-size (8x8 / 16x16) whole-block copy and average wrappers around
 * the 8-bit template routines, for use where a function pointer with a
 * (dst, src, stride) signature is required.
 */
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_8_c(dst, src, stride, 8);
}

void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_8_c(dst, src, stride, 8);
}

void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_8_c(dst, src, stride, 16);
}

void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_8_c(dst, src, stride, 16);
}
/* The full-pel (mc00) qpel positions need no filtering: alias them to
 * the plain block copy/average wrappers above. */
1347 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1348 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1349 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1350 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1351 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1352 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1354 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
/* WMV2 horizontal half-pel 4-tap filter (-1, 9, 9, -1)/16 with rounding
 * (+8 before >>4); cm clips the result to 0..255.  Reads src[-1]..src[9].
 * NOTE(review): the per-row loop and the dst/src pointer-advance lines
 * appear elided from this listing. */
1355 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1359 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1360 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1361 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1362 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1363 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1364 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1365 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1366 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
#if CONFIG_RV40_DECODER
/* RV40's (3,3) sub-pel position is served by the plain xy2 (four-pixel)
 * averagers from the 8-bit template. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}

void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}

void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1391 #if CONFIG_DIRAC_DECODER
/*
 * DIRAC_MC(OPNAME): generates plain, 2-source-average (_l2) and
 * 4-source-average (_l4) pixel functions for 8/16/32-wide blocks on top
 * of the 8-bit template routines; the 32-wide variants are composed of
 * two 16-wide calls at dst and dst+16.
 * NOTE(review): the macro's closing lines and its instantiations appear
 * elided from this listing.
 */
1392 #define DIRAC_MC(OPNAME)\
1393 void ff_ ## OPNAME ## _dirac_pixels8_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1395 OPNAME ## _pixels8_8_c(dst, src[0], stride, h);\
1397 void ff_ ## OPNAME ## _dirac_pixels16_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1399 OPNAME ## _pixels16_8_c(dst, src[0], stride, h);\
1401 void ff_ ## OPNAME ## _dirac_pixels32_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1403 OPNAME ## _pixels16_8_c(dst , src[0] , stride, h);\
1404 OPNAME ## _pixels16_8_c(dst+16, src[0]+16, stride, h);\
1406 void ff_ ## OPNAME ## _dirac_pixels8_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1408 OPNAME ## _pixels8_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1410 void ff_ ## OPNAME ## _dirac_pixels16_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1412 OPNAME ## _pixels16_l2_8(dst, src[0], src[1], stride, stride, stride, h);\
1414 void ff_ ## OPNAME ## _dirac_pixels32_l2_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1416 OPNAME ## _pixels16_l2_8(dst , src[0] , src[1] , stride, stride, stride, h);\
1417 OPNAME ## _pixels16_l2_8(dst+16, src[0]+16, src[1]+16, stride, stride, stride, h);\
1419 void ff_ ## OPNAME ## _dirac_pixels8_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1421 OPNAME ## _pixels8_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1423 void ff_ ## OPNAME ## _dirac_pixels16_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1425 OPNAME ## _pixels16_l4_8(dst, src[0], src[1], src[2], src[3], stride, stride, stride, stride, stride, h);\
1427 void ff_ ## OPNAME ## _dirac_pixels32_l4_c(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1429 OPNAME ## _pixels16_l4_8(dst , src[0] , src[1] , src[2] , src[3] , stride, stride, stride, stride, stride, h);\
1430 OPNAME ## _pixels16_l4_8(dst+16, src[0]+16, src[1]+16, src[2]+16, src[3]+16, stride, stride, stride, stride, stride, h);\
1436 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
/* WMV2 vertical half-pel 4-tap filter (-1, 9, 9, -1)/16 with rounding,
 * applied down one column per iteration; reads src[-srcStride] through
 * src[9*srcStride].
 * NOTE(review): the per-column loop over w and the src/dst advance lines
 * appear elided from this listing. */
1437 const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1441 const int src_1= src[ -srcStride];
1442 const int src0 = src[0 ];
1443 const int src1 = src[ srcStride];
1444 const int src2 = src[2*srcStride];
1445 const int src3 = src[3*srcStride];
1446 const int src4 = src[4*srcStride];
1447 const int src5 = src[5*srcStride];
1448 const int src6 = src[6*srcStride];
1449 const int src7 = src[7*srcStride];
1450 const int src8 = src[8*srcStride];
1451 const int src9 = src[9*srcStride];
1452 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1453 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1454 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1455 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1456 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1457 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1458 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1459 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/*
 * WMV2 8x8 mspel motion compensation, one function per used sub-pel
 * position, built from the h/v lowpass filters above plus 2-source
 * averaging.
 * NOTE(review): local temp-buffer declarations (half/halfH/halfV/halfHV)
 * and braces appear elided from this listing.
 */
1465 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
/* (1,0): average of source and horizontally filtered block. */
1468 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1469 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1472 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
/* (2,0): pure horizontal half-pel filter straight into dst. */
1474 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1477 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
/* (3,0): like mc10 but averaged with src+1 (right neighbour). */
1480 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1481 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1484 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
/* (0,2): pure vertical half-pel filter straight into dst. */
1486 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1489 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
/* (1,2): horizontal filter over 11 rows starting one row above, then
 * vertical filter on both source and intermediate, then average. */
1494 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1495 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1496 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1497 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1499 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
/* (3,2): as mc12 but the vertical source pass uses src+1. */
1504 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1505 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1506 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1507 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1509 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
/* (2,2): horizontal then vertical filter (true half-pel centre). */
1512 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1513 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/*
 * 16-wide SAD (sum of absolute differences) between pix1 and pix2 over
 * h rows; one unrolled row shown per iteration.
 * NOTE(review): the accumulator init, per-row loop, pointer advance and
 * return appear elided from this listing.
 */
1516 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1522 s += abs(pix1[0] - pix2[0]);
1523 s += abs(pix1[1] - pix2[1]);
1524 s += abs(pix1[2] - pix2[2]);
1525 s += abs(pix1[3] - pix2[3]);
1526 s += abs(pix1[4] - pix2[4]);
1527 s += abs(pix1[5] - pix2[5]);
1528 s += abs(pix1[6] - pix2[6]);
1529 s += abs(pix1[7] - pix2[7]);
1530 s += abs(pix1[8] - pix2[8]);
1531 s += abs(pix1[9] - pix2[9]);
1532 s += abs(pix1[10] - pix2[10]);
1533 s += abs(pix1[11] - pix2[11]);
1534 s += abs(pix1[12] - pix2[12]);
1535 s += abs(pix1[13] - pix2[13]);
1536 s += abs(pix1[14] - pix2[14]);
1537 s += abs(pix1[15] - pix2[15]);
/*
 * 16-wide SAD against the horizontal half-pel interpolation of pix2
 * (avg2 of each pixel and its right neighbour); reads pix2[0..16].
 * NOTE(review): loop/accumulator/return lines appear elided.
 */
1544 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1550 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1551 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1552 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1553 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1554 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1555 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1556 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1557 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1558 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1559 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1560 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1561 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1562 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1563 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1564 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1565 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/*
 * 16-wide SAD against the vertical half-pel interpolation of pix2
 * (avg2 of each pixel and the one a row below, via pix3).
 * NOTE(review): loop/accumulator/return lines appear elided.
 */
1572 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1575 uint8_t *pix3 = pix2 + line_size;
1579 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1580 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1581 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1582 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1583 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1584 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1585 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1586 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1587 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1588 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1589 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1590 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1591 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1592 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1593 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1594 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/*
 * 16-wide SAD against the centre half-pel interpolation of pix2
 * (avg4 of the 2x2 neighbourhood spanning pix2/pix3 and columns x,x+1).
 * NOTE(review): loop/accumulator/return lines appear elided.
 */
1602 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1605 uint8_t *pix3 = pix2 + line_size;
1609 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1610 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1611 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1612 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1613 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1614 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1615 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1616 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1617 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1618 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1619 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1620 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1621 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1622 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1623 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1624 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/*
 * 8-wide SAD between pix1 and pix2 over h rows.
 * NOTE(review): loop/accumulator/return lines appear elided.
 */
1632 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1638 s += abs(pix1[0] - pix2[0]);
1639 s += abs(pix1[1] - pix2[1]);
1640 s += abs(pix1[2] - pix2[2]);
1641 s += abs(pix1[3] - pix2[3]);
1642 s += abs(pix1[4] - pix2[4]);
1643 s += abs(pix1[5] - pix2[5]);
1644 s += abs(pix1[6] - pix2[6]);
1645 s += abs(pix1[7] - pix2[7]);
/*
 * 8-wide SAD against the horizontal half-pel interpolation of pix2.
 * NOTE(review): loop/accumulator/return lines appear elided.
 */
1652 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1658 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1659 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1660 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1661 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1662 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1663 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1664 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1665 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/*
 * 8-wide SAD against the vertical half-pel interpolation of pix2.
 * NOTE(review): loop/accumulator/return lines appear elided.
 */
1672 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1675 uint8_t *pix3 = pix2 + line_size;
1679 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1680 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1681 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1682 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1683 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1684 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1685 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1686 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/*
 * 8-wide SAD against the centre half-pel interpolation of pix2 (avg4).
 * NOTE(review): loop/accumulator/return lines appear elided.
 */
1694 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1697 uint8_t *pix3 = pix2 + line_size;
1701 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1702 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1703 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1704 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1705 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1706 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1707 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1708 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1716 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
/* Noise-shaped SSE, 16 wide: score1 is the plain SSD, score2 compares
 * the 2x2 gradient structure of the two blocks; the gradient term is
 * weighted by avctx->nsse_weight (8 if no context is supplied). */
1717 MpegEncContext *c = v;
1723 for(x=0; x<16; x++){
1724 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1727 for(x=0; x<15; x++){
1728 score2+= FFABS( s1[x ] - s1[x +stride]
1729 - s1[x+1] + s1[x+1+stride])
1730 -FFABS( s2[x ] - s2[x +stride]
1731 - s2[x+1] + s2[x+1+stride]);
1738 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1739 else return score1 + FFABS(score2)*8;
1742 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
/* 8-wide variant of nsse16_c: SSD plus weighted gradient-difference
 * term (see nsse16_c); loop bound lines appear elided in this listing. */
1743 MpegEncContext *c = v;
1750 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1754 score2+= FFABS( s1[x ] - s1[x +stride]
1755 - s1[x+1] + s1[x+1+stride])
1756 -FFABS( s2[x ] - s2[x +stride]
1757 - s2[x+1] + s2[x+1+stride]);
1764 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1765 else return score1 + FFABS(score2)*8;
1768 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
/* Evaluate the weighted squared error that would result from adding
 * 'scale' times 'basis' to the residual 'rem' (used by the trellis /
 * basis-search in the encoder).  b is the candidate residual value
 * rounded from BASIS_SHIFT to RECON_SHIFT precision.
 * NOTE(review): the declaration of 'w' (presumably weight[i]) and the
 * accumulator/return lines appear elided from this listing. */
1772 for(i=0; i<8*8; i++){
1773 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1776 av_assert2(-512<b && b<512);
1778 sum += (w*b)*(w*b)>>4;
1783 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
/* Commit the change evaluated by try_8x8basis_c: add the scaled,
 * rounded basis vector into the residual in place. */
1786 for(i=0; i<8*8; i++){
1787 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
/* Dummy compare function matching the me_cmp_func signature; body is
 * elided in this listing (presumably returns a constant — TODO confirm). */
1791 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
1795 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
/* Fill the cmp[0..5] function-pointer array from the DSPContext tables
 * selected by 'type' (hadamard8_diff, dct_sad, dct264_sad, dct_max,
 * quant_psnr, ...); unknown types hit the error log below.
 * NOTE(review): most switch/case lines appear elided from this listing. */
1798 memset(cmp, 0, sizeof(void*)*6);
1806 cmp[i]= c->hadamard8_diff[i];
1812 cmp[i]= c->dct_sad[i];
1815 cmp[i]= c->dct264_sad[i];
1818 cmp[i]= c->dct_max[i];
1821 cmp[i]= c->quant_psnr[i];
1850 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1855 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
/* dst[i] += src[i] for w bytes, done one machine word at a time via the
 * classic SWAR trick: add the low 7 bits of each byte lane with pb_7f
 * masks, then patch the carry/top bit in with XOR against pb_80, so no
 * carries cross byte boundaries.  The scalar line handles the tail. */
1857 for (i = 0; i <= w - (int)sizeof(long); i += sizeof(long)) {
1858 long a = *(long*)(src+i);
1859 long b = *(long*)(dst+i);
1860 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1863 dst[i+0] += src[i+0];
1866 static void diff_bytes_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
/* dst[i] = src1[i] - src2[i] for w bytes, word-at-a-time with a SWAR
 * borrow-isolating trick (pb_7f/pb_80 masks) mirroring add_bytes_c.
 * On targets without fast unaligned loads, an unaligned src2 falls back
 * to a byte-wise unrolled loop.  The last scalar line handles the tail. */
1868 #if !HAVE_FAST_UNALIGNED
1869 if((long)src2 & (sizeof(long)-1)){
1870 for(i=0; i+7<w; i+=8){
1871 dst[i+0] = src1[i+0]-src2[i+0];
1872 dst[i+1] = src1[i+1]-src2[i+1];
1873 dst[i+2] = src1[i+2]-src2[i+2];
1874 dst[i+3] = src1[i+3]-src2[i+3];
1875 dst[i+4] = src1[i+4]-src2[i+4];
1876 dst[i+5] = src1[i+5]-src2[i+5];
1877 dst[i+6] = src1[i+6]-src2[i+6];
1878 dst[i+7] = src1[i+7]-src2[i+7];
1882 for (i = 0; i <= w - (int)sizeof(long); i += sizeof(long)) {
1883 long a = *(long*)(src1+i);
1884 long b = *(long*)(src2+i);
1885 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1888 dst[i+0] = src1[i+0]-src2[i+0];
/*
 * HuffYUV prediction helpers.  The median predictor takes the median of
 * left (l), top (src1[i]) and left+top-topleft (lt) — via mid_pred —
 * and adds/subtracts the coded difference.
 * NOTE(review): loop bodies, the l/lt state updates and the *left /
 * *left_top write-back lines appear elided from this listing.
 */
1891 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1899 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* Inverse of the above: emit prediction residuals for encoding. */
1908 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1916 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Left (previous-pixel) prediction; returns the running accumulator. */
1926 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1929 for(i=0; i<w-1; i++){
/* Left prediction on packed BGR32, one running value per channel. */
1956 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers for the SATD functions below.  BUTTERFLY2
 * writes sum/difference of two inputs to two outputs, BUTTERFLY1 does it
 * in place, BUTTERFLYA folds the final |x+y| + |x-y| stage into the sum
 * (their multi-line bodies appear elided from this listing). */
1986 #define BUTTERFLY2(o1,o2,i1,i2) \
1990 #define BUTTERFLY1(x,y) \
1999 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/*
 * SATD: 8x8 Hadamard transform of the src-dst difference, summed as
 * absolute values.  Rows are transformed first (three butterfly stages
 * into temp[]), then columns, with the last column stage folded into
 * the absolute-value accumulation via BUTTERFLYA.
 * NOTE(review): the temp[] declaration, loop headers and return appear
 * elided from this listing.
 */
2001 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2009 //FIXME try pointer walks
/* Row transform: stage 1 on the pixel differences... */
2010 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2011 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2012 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2013 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
/* ...stage 2 (distance 2)... */
2015 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2016 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2017 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2018 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
/* ...stage 3 (distance 4). */
2020 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2021 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2022 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2023 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column transform: stages 1 and 2... */
2027 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2028 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2029 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2030 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2032 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2033 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2034 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2035 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* ...final stage merged with |.| accumulation. */
2038 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2039 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2040 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2041 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/*
 * Intra SATD: identical transform pipeline to hadamard8_diff8x8_c but
 * applied to the source pixels directly (no reference block), with the
 * DC-related term subtracted at the end (see the "-mean" line).
 * NOTE(review): temp[] declaration, loop headers and return appear
 * elided from this listing.
 */
2046 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2054 //FIXME try pointer walks
/* Row transform stages 1-3. */
2055 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2056 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2057 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2058 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2060 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2061 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2062 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2063 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2065 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2066 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2067 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2068 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column transform stages 1-2, final stage folded into the sum. */
2072 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2073 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2074 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2075 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2077 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2078 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2079 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2080 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2083 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2084 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2085 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2086 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2089 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
2094 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
/* DCT-domain SAD: diff the blocks, transform, sum absolute coefficients.
 * NOTE(review): the forward-DCT call between diff_pixels and
 * sum_abs_dctelem appears elided from this listing. */
2095 MpegEncContext * const s= (MpegEncContext *)c;
2096 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2100 s->dsp.diff_pixels(temp, src1, src2, stride);
2102 return s->dsp.sum_abs_dctelem(temp);
2107 const int s07 = SRC(0) + SRC(7); /* even-part input sums */\
2108 const int s16 = SRC(1) + SRC(6);\
2109 const int s25 = SRC(2) + SRC(5);\
2110 const int s34 = SRC(3) + SRC(4);\
2111 const int a0 = s07 + s34; /* even-part butterflies */\
2112 const int a1 = s16 + s25;\
2113 const int a2 = s07 - s34;\
2114 const int a3 = s16 - s25;\
2115 const int d07 = SRC(0) - SRC(7); /* odd-part input differences */\
2116 const int d16 = SRC(1) - SRC(6);\
2117 const int d25 = SRC(2) - SRC(5);\
2118 const int d34 = SRC(3) - SRC(4);\
2119 const int a4 = d16 + d25 + (d07 + (d07>>1)); /* odd-part rotations (integer, H.264-8x8 style) */\
2120 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2121 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2122 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2124 DST(1, a4 + (a7>>2)) ; /* write transformed outputs */\
2125 DST(2, a2 + (a3>>1)) ;\
2126 DST(3, a5 + (a6>>2)) ;\
2128 DST(5, a6 - (a5>>2)) ;\
2129 DST(6, (a2>>1) - a3 ) ;\
2130 DST(7, (a4>>2) - a7 ) ;\
2133 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
/* SAD in the H.264-style 8x8 integer transform domain: apply the 1-D
 * transform macro above to the rows (SRC/DST redefined to index dct[][]
 * by row), then to the columns with DST redefined to accumulate |v|.
 * NOTE(review): the dct[][] declaration, DCT8_1D invocations, #undef
 * lines and return appear elided from this listing. */
2134 MpegEncContext * const s= (MpegEncContext *)c;
2139 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2141 #define SRC(x) dct[i][x]
2142 #define DST(x,v) dct[i][x]= v
2143 for( i = 0; i < 8; i++ )
2148 #define SRC(x) dct[x][i]
2149 #define DST(x,v) sum += FFABS(v)
2150 for( i = 0; i < 8; i++ )
2158 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
/* Maximum absolute DCT coefficient of the block difference.
 * NOTE(review): the forward-DCT call, loop header and return appear
 * elided from this listing. */
2159 MpegEncContext * const s= (MpegEncContext *)c;
2160 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2165 s->dsp.diff_pixels(temp, src1, src2, stride);
2169 sum= FFMAX(sum, FFABS(temp[i]));
2174 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
/* Quantization-noise metric: diff the blocks, keep a copy (bak), run
 * quantize -> dequantize -> IDCT, and return the squared error between
 * the round-tripped and original coefficients.
 * NOTE(review): the forward-DCT call, loop header and return appear
 * elided from this listing. */
2175 MpegEncContext * const s= (MpegEncContext *)c;
2176 LOCAL_ALIGNED_16(int16_t, temp, [64*2]);
2177 int16_t * const bak = temp+64;
2183 s->dsp.diff_pixels(temp, src1, src2, stride);
2185 memcpy(bak, temp, 64*sizeof(int16_t));
2187 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2188 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2189 ff_simple_idct_8(temp); //FIXME
2192 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2197 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
/* True rate-distortion compare: quantize the block difference, count
 * the VLC bits needed to code the coefficients (run/level tables, with
 * escapes costing esc_length), reconstruct via dequant + idct_add, and
 * return distortion plus a lambda-scaled bit cost.
 * NOTE(review): several lines (bits init, intra/inter branch headers,
 * run/level bookkeeping, escape paths, loop closers) appear elided. */
2198 MpegEncContext * const s= (MpegEncContext *)c;
2199 const uint8_t *scantable= s->intra_scantable.permutated;
2200 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2201 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2202 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2203 int i, last, run, bits, level, distortion, start_i;
2204 const int esc_length= s->ac_esc_length;
2206 uint8_t * last_length;
/* Work on compact 8x8 copies so reconstruction can be done in place. */
2210 copy_block8(lsrc1, src1, 8, stride, 8);
2211 copy_block8(lsrc2, src2, 8, stride, 8);
2213 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2215 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* Intra blocks: separate DC VLC plus intra AC tables... */
2221 length = s->intra_ac_vlc_length;
2222 last_length= s->intra_ac_vlc_last_length;
2223 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* ...inter blocks use the inter AC tables. */
2226 length = s->inter_ac_vlc_length;
2227 last_length= s->inter_ac_vlc_last_length;
/* Bit-count the coefficients in scan order. */
2232 for(i=start_i; i<last; i++){
2233 int j= scantable[i];
2238 if((level&(~127)) == 0){
2239 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* Last coefficient uses the dedicated "last" table. */
2248 level= temp[i] + 64;
2250 av_assert2(level - 64);
2252 if((level&(~127)) == 0){
2253 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Reconstruct and measure distortion against the original copy. */
2261 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2263 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2266 s->dsp.idct_add(lsrc2, 8, temp);
2268 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2270 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2273 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
/* Rate-only compare: same VLC bit-counting as rd8x8_c but without the
 * reconstruction/distortion step — returns (a value derived from) the
 * bit count alone.
 * NOTE(review): bits init, branch headers, run/level bookkeeping,
 * escape paths and the return line appear elided from this listing. */
2274 MpegEncContext * const s= (MpegEncContext *)c;
2275 const uint8_t *scantable= s->intra_scantable.permutated;
2276 LOCAL_ALIGNED_16(int16_t, temp, [64]);
2277 int i, last, run, bits, level, start_i;
2278 const int esc_length= s->ac_esc_length;
2280 uint8_t * last_length;
2284 s->dsp.diff_pixels(temp, src1, src2, stride);
2286 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* Intra: DC VLC plus intra AC tables; inter: inter AC tables. */
2292 length = s->intra_ac_vlc_length;
2293 last_length= s->intra_ac_vlc_last_length;
2294 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2297 length = s->inter_ac_vlc_length;
2298 last_length= s->inter_ac_vlc_last_length;
2303 for(i=start_i; i<last; i++){
2304 int j= scantable[i];
2309 if((level&(~127)) == 0){
2310 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2319 level= temp[i] + 64;
2321 av_assert2(level - 64);
2323 if((level&(~127)) == 0){
2324 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/*
 * VSAD_INTRA(size): sum of absolute vertical gradients within one block
 * (each pixel vs the pixel one row down), a cheap activity measure.
 * NOTE(review): the macro's tail and its instantiations appear elided
 * from this listing.
 */
2332 #define VSAD_INTRA(size) \
2333 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2337 for(y=1; y<h; y++){ \
2338 for(x=0; x<size; x+=4){ \
2339 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2340 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
2350 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
/* Inter variant: SAD of the vertical gradients of the *difference*
 * between s1 and s2 (loop headers/return elided in this listing). */
2355 for(x=0; x<16; x++){
2356 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2365 #define SQ(a) ((a)*(a))
/* Generates vsse_intra{size}_c: like VSAD_INTRA above but summing
 * squared row-to-row differences ('dummy' is unused; loop setup and
 * macro tail elided in this excerpt). */
2366 #define VSSE_INTRA(size) \
2367 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2371 for(y=1; y<h; y++){ \
2372 for(x=0; x<size; x+=4){ \
2373 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2374 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* Vertical SSE between two 16-wide blocks: squared version of
 * vsad16_c, comparing the vertical gradients of s1 and s2. */
2384 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2389 for(x=0; x<16; x++){
2390 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 vector and an int16
 * vector ('size' elements; declaration tail and return elided). */
2399 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2403 for(i=0; i<size; i++)
2404 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Builds a 16x16 comparison function from an 8x8 one by summing the
 * four 8x8 quadrants; the elided middle of the macro presumably
 * advances dst/src to the lower half before the second pair of calls
 * (TODO confirm against the full source). */
2408 #define WRAPPER8_16_SQ(name8, name16)\
2409 static int name16(void /*MpegEncContext*/ *s, uint8_t *dst, uint8_t *src, int stride, int h){\
2411 score +=name8(s, dst , src , stride, 8);\
2412 score +=name8(s, dst+8 , src+8 , stride, 8);\
2416 score +=name8(s, dst , src , stride, 8);\
2417 score +=name8(s, dst+8 , src+8 , stride, 8);\
/* Instantiate the 16x16 variant of each 8x8 comparison metric. */
2422 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2423 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2424 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
/* dct264_sad is presumably inside a config #if (guard elided). */
2426 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2428 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2429 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2430 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2431 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Clip one float, given as its raw IEEE-754 bit pattern, for the case
 * min < 0 < max.  Negative floats compare as large unsigned integers,
 * so 'a > mini' catches values below the (negative) minimum; XOR-ing
 * the sign bit makes positive floats unsigned-comparable against
 * 'maxisign' for the upper bound.  The final pass-through return is
 * elided in this excerpt. */
2433 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2434 uint32_t maxi, uint32_t maxisign)
2437 if(a > mini) return mini;
2438 else if((a^(1U<<31)) > maxisign) return maxi;
/* Fast path of vector_clipf() for bounds that straddle zero: clips by
 * comparing raw float bit patterns, unrolled by 8.
 * NOTE(review): the float* -> uint32_t* casts type-pun through
 * incompatible pointer types (strict-aliasing UB by the letter of the
 * standard); left as-is since it is long-standing practice here.
 * Assumes len is a multiple of 8 -- TODO confirm against callers. */
2442 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2444 uint32_t mini = *(uint32_t*)min;
2445 uint32_t maxi = *(uint32_t*)max;
2446 uint32_t maxisign = maxi ^ (1U<<31);
2447 uint32_t *dsti = (uint32_t*)dst;
2448 const uint32_t *srci = (const uint32_t*)src;
2449 for(i=0; i<len; i+=8) {
2450 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2451 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2452 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2453 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2454 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2455 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2456 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2457 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clamp each float in src[] to [min, max] into dst[], unrolled by 8
 * (assumes len is a multiple of 8 -- TODO confirm).  When the bounds
 * straddle zero the integer bit-pattern fast path above is taken; the
 * av_clipf loop presumably sits in an elided else branch. */
2460 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2462 if(min < 0 && max > 0) {
2463 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2465 for(i=0; i < len; i+=8) {
2466 dst[i ] = av_clipf(src[i ], min, max);
2467 dst[i + 1] = av_clipf(src[i + 1], min, max);
2468 dst[i + 2] = av_clipf(src[i + 2], min, max);
2469 dst[i + 3] = av_clipf(src[i + 3], min, max);
2470 dst[i + 4] = av_clipf(src[i + 4], min, max);
2471 dst[i + 5] = av_clipf(src[i + 5], min, max);
2472 dst[i + 6] = av_clipf(src[i + 6], min, max);
2473 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Dot product of two int16 vectors of 'order' elements (loop header
 * and return elided in this excerpt). */
2478 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order)
2483 res += *v1++ * *v2++;
/* Fused op over 'order' elements: accumulates a v1.v2 dot product
 * (accumulation line elided) while updating v1 += mul * v3 in the
 * same pass. */
2488 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2493 *v1++ += mul * *v3++;
/* Clamp each int32 in src[] to [min, max] into dst[], eight elements
 * per loop iteration (the loop construct is elided; presumably len is
 * required to be a multiple of 8 -- TODO confirm). */
2498 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2499 int32_t max, unsigned int len)
2502 *dst++ = av_clip(*src++, min, max);
2503 *dst++ = av_clip(*src++, min, max);
2504 *dst++ = av_clip(*src++, min, max);
2505 *dst++ = av_clip(*src++, min, max);
2506 *dst++ = av_clip(*src++, min, max);
2507 *dst++ = av_clip(*src++, min, max);
2508 *dst++ = av_clip(*src++, min, max);
2509 *dst++ = av_clip(*src++, min, max);
/* 8x8 integer (JPEG-reference) IDCT, then store the result clamped to
 * [0,255]. */
2514 static void jref_idct_put(uint8_t *dest, int line_size, int16_t *block)
2516 ff_j_rev_dct (block);
2517 put_pixels_clamped_c(block, dest, line_size);
/* Same IDCT, but add to the existing pixels with clamping. */
2519 static void jref_idct_add(uint8_t *dest, int line_size, int16_t *block)
2521 ff_j_rev_dct (block);
2522 add_pixels_clamped_c(block, dest, line_size);
/* 4x4 reduced jref IDCT (used for lowres==1 decoding): put variant. */
2525 static void ff_jref_idct4_put(uint8_t *dest, int line_size, int16_t *block)
2527 ff_j_rev_dct4 (block);
2528 put_pixels_clamped4_c(block, dest, line_size);
/* 4x4 reduced jref IDCT: add-with-clamp variant. */
2530 static void ff_jref_idct4_add(uint8_t *dest, int line_size, int16_t *block)
2532 ff_j_rev_dct4 (block);
2533 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 reduced jref IDCT (used for lowres==2 decoding): put variant. */
2536 static void ff_jref_idct2_put(uint8_t *dest, int line_size, int16_t *block)
2538 ff_j_rev_dct2 (block);
2539 put_pixels_clamped2_c(block, dest, line_size);
/* 2x2 reduced jref IDCT: add-with-clamp variant. */
2541 static void ff_jref_idct2_add(uint8_t *dest, int line_size, int16_t *block)
2543 ff_j_rev_dct2 (block);
2544 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 "IDCT" (lowres==3): only the DC coefficient survives;
 * (dc + 4) >> 3 is the descaling with rounding. */
2547 static void ff_jref_idct1_put(uint8_t *dest, int line_size, int16_t *block)
2549 dest[0] = av_clip_uint8((block[0] + 4)>>3);
/* Add-with-clamp variant of the 1x1 DC-only IDCT. */
2551 static void ff_jref_idct1_add(uint8_t *dest, int line_size, int16_t *block)
2553 dest[0] = av_clip_uint8(dest[0] + ((block[0] + 4)>>3));
2556 /* init static data */
/* Fill ff_squareTbl so that ff_squareTbl[x + 256] == x*x for
 * x in [-256, 255]; a squared-difference lookup indexed by a biased
 * pixel difference. */
2557 av_cold void ff_dsputil_static_init(void)
2561 for(i=0;i<512;i++) {
2562 ff_squareTbl[i] = (i - 256) * (i - 256);
/* Verify that the compiler 16-byte-aligns stack variables, which the
 * SIMD (MMX/AltiVec) code paths rely on.  'did_fail' presumably makes
 * the warning one-shot (its update/check is elided here, as is the
 * return value). */
2566 int ff_check_alignment(void){
2567 static int did_fail=0;
2568 LOCAL_ALIGNED_16(int, aligned, [4]);
/* Low 4 address bits nonzero => the 16-byte alignment request was
 * not honoured by the compiler. */
2570 if((intptr_t)aligned & 15){
2572 #if HAVE_MMX || HAVE_ALTIVEC
2573 av_log(NULL, AV_LOG_ERROR,
2574 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2575 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2576 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2577 "Do not report crashes to FFmpeg developers.\n");
/* Populate the DSPContext function-pointer table with the C reference
 * implementations, honouring avctx settings (dct_algo, idct_algo,
 * lowres, bits_per_raw_sample).  Architecture-specific init functions
 * at the bottom may then override entries with optimised versions.
 * Note: many #if/#endif guards, braces and else branches are elided
 * in this excerpt. */
2586 av_cold void ff_dsputil_init(DSPContext* c, AVCodecContext *avctx)
2588 ff_check_alignment();
/* --- forward DCT (encoder side): chosen by bit depth, then by
 * avctx->dct_algo --- */
2591 if (avctx->bits_per_raw_sample == 10) {
2592 c->fdct = ff_jpeg_fdct_islow_10;
2593 c->fdct248 = ff_fdct248_islow_10;
2595 if(avctx->dct_algo==FF_DCT_FASTINT) {
2596 c->fdct = ff_fdct_ifast;
2597 c->fdct248 = ff_fdct_ifast248;
2599 else if(avctx->dct_algo==FF_DCT_FAAN) {
2600 c->fdct = ff_faandct;
2601 c->fdct248 = ff_faandct248;
2604 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2605 c->fdct248 = ff_fdct248_islow_8;
2608 #endif //CONFIG_ENCODERS
/* --- inverse DCT: low-resolution decoding uses the scaled-down jref
 * IDCT wrappers defined above --- */
2610 if(avctx->lowres==1){
2611 c->idct_put= ff_jref_idct4_put;
2612 c->idct_add= ff_jref_idct4_add;
2613 c->idct = ff_j_rev_dct4;
2614 c->idct_permutation_type= FF_NO_IDCT_PERM;
2615 }else if(avctx->lowres==2){
2616 c->idct_put= ff_jref_idct2_put;
2617 c->idct_add= ff_jref_idct2_add;
2618 c->idct = ff_j_rev_dct2;
2619 c->idct_permutation_type= FF_NO_IDCT_PERM;
2620 }else if(avctx->lowres==3){
2621 c->idct_put= ff_jref_idct1_put;
2622 c->idct_add= ff_jref_idct1_add;
2623 c->idct = ff_j_rev_dct1;
2624 c->idct_permutation_type= FF_NO_IDCT_PERM;
/* full resolution: pick by bit depth first, then by idct_algo */
2626 if (avctx->bits_per_raw_sample == 10) {
2627 c->idct_put = ff_simple_idct_put_10;
2628 c->idct_add = ff_simple_idct_add_10;
2629 c->idct = ff_simple_idct_10;
2630 c->idct_permutation_type = FF_NO_IDCT_PERM;
2631 } else if (avctx->bits_per_raw_sample == 12) {
2632 c->idct_put = ff_simple_idct_put_12;
2633 c->idct_add = ff_simple_idct_add_12;
2634 c->idct = ff_simple_idct_12;
2635 c->idct_permutation_type = FF_NO_IDCT_PERM;
2637 if(avctx->idct_algo==FF_IDCT_INT){
2638 c->idct_put= jref_idct_put;
2639 c->idct_add= jref_idct_add;
2640 c->idct = ff_j_rev_dct;
2641 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2642 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2643 c->idct_put= ff_faanidct_put;
2644 c->idct_add= ff_faanidct_add;
2645 c->idct = ff_faanidct;
2646 c->idct_permutation_type= FF_NO_IDCT_PERM;
2647 }else{ //accurate/default
2648 c->idct_put = ff_simple_idct_put_8;
2649 c->idct_add = ff_simple_idct_add_8;
2650 c->idct = ff_simple_idct_8;
2651 c->idct_permutation_type= FF_NO_IDCT_PERM;
/* --- basic block/pixel helpers --- */
2656 c->diff_pixels = diff_pixels_c;
2657 c->put_pixels_clamped = put_pixels_clamped_c;
2658 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
2659 c->add_pixels_clamped = add_pixels_clamped_c;
2660 c->sum_abs_dctelem = sum_abs_dctelem_c;
2663 c->pix_sum = pix_sum_c;
2664 c->pix_norm1 = pix_norm1_c;
2666 c->fill_block_tab[0] = fill_block16_c;
2667 c->fill_block_tab[1] = fill_block8_c;
/* pix_abs (SAD): first index 0 = 16x16, 1 = 8x8; second index
 * 0-3 = full / x-half / y-half / xy-half pel. */
2669 /* TODO [0] 16 [1] 8 */
2670 c->pix_abs[0][0] = pix_abs16_c;
2671 c->pix_abs[0][1] = pix_abs16_x2_c;
2672 c->pix_abs[0][2] = pix_abs16_y2_c;
2673 c->pix_abs[0][3] = pix_abs16_xy2_c;
2674 c->pix_abs[1][0] = pix_abs8_c;
2675 c->pix_abs[1][1] = pix_abs8_x2_c;
2676 c->pix_abs[1][2] = pix_abs8_y2_c;
2677 c->pix_abs[1][3] = pix_abs8_xy2_c;
/* third-pel motion compensation tables; table index = x + 4*y with
 * x,y in 0..2 (thirds of a pel), hence indices 3, 7 and 11+ unused. */
2679 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2680 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2681 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2682 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2683 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2684 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2685 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2686 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2687 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2689 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2690 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2691 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2692 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2693 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2694 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2695 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2696 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2697 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* Expands to the 16 quarter-pel MC table entries for one table;
 * entry index = x + 4*y (quarter-pel offsets). */
2699 #define dspfunc(PFX, IDX, NUM) \
2700 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2701 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2702 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2703 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2704 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2705 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2706 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2707 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2708 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2709 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2710 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2711 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2712 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2713 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2714 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2715 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
/* quarter-pel MC tables: index 0 = 16x16 functions, 1 = 8x8 */
2717 dspfunc(put_qpel, 0, 16);
2718 dspfunc(put_no_rnd_qpel, 0, 16);
2720 dspfunc(avg_qpel, 0, 16);
2721 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2723 dspfunc(put_qpel, 1, 8);
2724 dspfunc(put_no_rnd_qpel, 1, 8);
2726 dspfunc(avg_qpel, 1, 8);
2727 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* 8x8 mspel MC table -- presumably the WMV2-style half-pel filter set
 * (only x-offsets 0..3 at y=0 and y=2 are populated). */
2731 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2732 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2733 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2734 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2735 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2736 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2737 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2738 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* comparison functions: entry [0] = 16x16 variant, [1] = 8x8 */
2740 #define SET_CMP_FUNC(name) \
2741 c->name[0]= name ## 16_c;\
2742 c->name[1]= name ## 8x8_c;
2744 SET_CMP_FUNC(hadamard8_diff)
2745 c->hadamard8_diff[4]= hadamard8_intra16_c;
2746 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2747 SET_CMP_FUNC(dct_sad)
2748 SET_CMP_FUNC(dct_max)
2750 SET_CMP_FUNC(dct264_sad)
2752 c->sad[0]= pix_abs16_c;
2753 c->sad[1]= pix_abs8_c;
2757 SET_CMP_FUNC(quant_psnr)
2760 c->vsad[0]= vsad16_c;
2761 c->vsad[4]= vsad_intra16_c;
2762 c->vsad[5]= vsad_intra8_c;
2763 c->vsse[0]= vsse16_c;
2764 c->vsse[4]= vsse_intra16_c;
2765 c->vsse[5]= vsse_intra8_c;
2766 c->nsse[0]= nsse16_c;
2767 c->nsse[1]= nsse8_c;
2768 #if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
2769 ff_dsputil_init_dwt(c);
2772 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* byte-wise helpers (HuffYUV prediction, byte swapping, ...) */
2774 c->add_bytes= add_bytes_c;
2775 c->diff_bytes= diff_bytes_c;
2776 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2777 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2778 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2779 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2780 c->bswap_buf= bswap_buf;
2781 c->bswap16_buf = bswap16_buf;
2783 c->try_8x8basis= try_8x8basis_c;
2784 c->add_8x8basis= add_8x8basis_c;
/* scalar vector operations defined above in this file */
2786 c->vector_clipf = vector_clipf_c;
2787 c->scalarproduct_int16 = scalarproduct_int16_c;
2788 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2789 c->vector_clip_int32 = vector_clip_int32_c;
/* plane shrinking: shrink[0] is a plain copy; the others presumably
 * downscale by 2/4/8 per the ff_shrink22/44/88 naming. */
2791 c->shrink[0]= av_image_copy_plane;
2792 c->shrink[1]= ff_shrink22;
2793 c->shrink[2]= ff_shrink44;
2794 c->shrink[3]= ff_shrink88;
2796 c->add_pixels8 = add_pixels8_c;
/* --- bit-depth dependent entries, built from the templated _8/_16
 * implementations via token pasting --- */
2800 #define FUNC(f, depth) f ## _ ## depth
2801 #define FUNCC(f, depth) f ## _ ## depth ## _c
2803 c->draw_edges = FUNCC(draw_edges, 8);
2804 c->clear_block = FUNCC(clear_block, 8);
2805 c->clear_blocks = FUNCC(clear_blocks, 8);
2807 #define BIT_DEPTH_FUNCS(depth) \
2808 c->get_pixels = FUNCC(get_pixels, depth);
2810 switch (avctx->bits_per_raw_sample) {
2815 BIT_DEPTH_FUNCS(16);
2818 if(avctx->bits_per_raw_sample<=8 || avctx->codec_type != AVMEDIA_TYPE_VIDEO) {
/* --- architecture-specific overrides (each call presumably guarded
 * by an ARCH_*/HAVE_* #if in the elided lines) --- */
2826 ff_dsputil_init_alpha(c, avctx);
2828 ff_dsputil_init_arm(c, avctx);
2830 ff_dsputil_init_bfin(c, avctx);
2832 ff_dsputil_init_ppc(c, avctx);
2834 ff_dsputil_init_sh4(c, avctx);
2836 ff_dsputil_init_vis(c, avctx);
2838 ff_dsputil_init_x86(c, avctx);
/* build the coefficient scan permutation matching the selected IDCT */
2840 ff_init_scantable_permutation(c->idct_permutation,
2841 c->idct_permutation_type);
/* Compatibility wrapper: simply forwards to ff_dsputil_init()
 * (presumably kept for older public API users -- verify). */
2844 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2846 ff_dsputil_init(c, avctx);
/* avpriv_-prefixed wrapper (visible across FFmpeg libraries); simply
 * forwards to ff_dsputil_init(). */
2849 av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
2851 ff_dsputil_init(c, avctx);