3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* byte-clipping table: ff_cropTbl[MAX_NEG_CROP + i] clamps i to [0,255];
   zeroed here and presumably filled at init time — confirm init site */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* squares table centered at index 256 (sq[256+i] == i*i); filled at init — confirm */
uint32_t ff_squareTbl[512] = {0, };
46 #include "dsputil_template.c"
50 #include "dsputil_template.c"
54 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 is 0x0101...01 at native width, so the multiply replicates the byte into every lane)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
/* Classic JPEG/MPEG zigzag scan: entry i is the natural (raster) position
   of the i-th coefficient in scan order. Truncated table re-completed. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields. Truncated table re-completed. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): zero here; presumably filled at init time elsewhere — confirm.
   16-byte aligned so SIMD code can load it directly. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (MPEG-2 style). Truncated table re-completed. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (MPEG-2 style, interlaced-friendly).
   Truncated table re-completed. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx — maps natural coefficient
   order to the layout that IDCT expects. Truncated table re-completed. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
121 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
123 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
127 st->scantable= src_scantable;
131 j = src_scantable[i];
132 st->permutated[i] = permutation[j];
141 j = st->permutated[i];
143 st->raster_end[i]= end;
/**
 * Sum of all 256 pixels of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size stride between rows in bytes
 * @return sum of the pixel values (max 255*256, fits easily in int)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j];
        pix += line_size;
    }
    return s;
}
/**
 * Sum of squares of all pixels of a 16x16 block (its L2 norm squared).
 * @param pix       top-left pixel of the block
 * @param line_size stride between rows in bytes
 * @return sum of pix[k]^2 over the block (max 256*255^2 ≈ 16.6M, fits in int)
 *
 * The historical implementation read 4/8 bytes at a time and looked each
 * byte up in ff_squareTbl; a direct multiply is equivalent (sq[v] == v*v
 * for v >= 0), removes the table dependency, and lets the compiler
 * vectorize freely.
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j] * pix[j];
        pix += line_size;
    }
    return s;
}
/**
 * Byte-swap a buffer of 32-bit words.
 * @param dst destination (may equal src)
 * @param src source words
 * @param w   number of 32-bit words
 * Main loop is unrolled by 8; a scalar tail handles the remainder.
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= av_bswap32(src[i+0]);
    }
}
/**
 * Byte-swap a buffer of 16-bit words.
 * @param dst destination (may equal src)
 * @param src source words
 * @param len number of 16-bit words
 */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
/**
 * Sum of squared errors between two 4-wide pixel columns.
 * @param v         unused context pointer (pix_op signature)
 * @param pix1,pix2 blocks to compare
 * @param line_size stride in bytes
 * @param h         number of rows
 * @return sum over h rows of (pix1[j]-pix2[j])^2 for j in 0..3
 * (direct multiply replaces the historical ff_squareTbl lookup; sq[d] == d*d)
 */
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 4; j++) {
            const int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * Sum of squared errors between two 8-wide pixel columns.
 * @param v         unused context pointer (pix_op signature)
 * @param pix1,pix2 blocks to compare
 * @param line_size stride in bytes
 * @param h         number of rows
 * @return sum over h rows of (pix1[j]-pix2[j])^2 for j in 0..7
 * (direct multiply replaces the historical ff_squareTbl lookup; sq[d] == d*d)
 */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++) {
            const int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * Sum of squared errors between two 16-wide pixel columns.
 * @param v         unused context pointer (pix_op signature)
 * @param pix1,pix2 blocks to compare
 * @param line_size stride in bytes
 * @param h         number of rows
 * @return sum over h rows of (pix1[j]-pix2[j])^2 for j in 0..15
 * (direct multiply replaces the historical ff_squareTbl lookup; sq[d] == d*d)
 */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++) {
            const int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
309 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
310 const uint8_t *s2, int stride){
313 /* read the pixels */
315 block[0] = s1[0] - s2[0];
316 block[1] = s1[1] - s2[1];
317 block[2] = s1[2] - s2[2];
318 block[3] = s1[3] - s2[3];
319 block[4] = s1[4] - s2[4];
320 block[5] = s1[5] - s2[5];
321 block[6] = s1[6] - s2[6];
322 block[7] = s1[7] - s2[7];
330 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
334 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
336 /* read the pixels */
338 pixels[0] = cm[block[0]];
339 pixels[1] = cm[block[1]];
340 pixels[2] = cm[block[2]];
341 pixels[3] = cm[block[3]];
342 pixels[4] = cm[block[4]];
343 pixels[5] = cm[block[5]];
344 pixels[6] = cm[block[6]];
345 pixels[7] = cm[block[7]];
352 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
356 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
358 /* read the pixels */
360 pixels[0] = cm[block[0]];
361 pixels[1] = cm[block[1]];
362 pixels[2] = cm[block[2]];
363 pixels[3] = cm[block[3]];
370 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
374 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
376 /* read the pixels */
378 pixels[0] = cm[block[0]];
379 pixels[1] = cm[block[1]];
386 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
387 uint8_t *restrict pixels,
392 for (i = 0; i < 8; i++) {
393 for (j = 0; j < 8; j++) {
396 else if (*block > 127)
399 *pixels = (uint8_t)(*block + 128);
403 pixels += (line_size - 8);
407 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
411 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
413 /* read the pixels */
415 pixels[0] = cm[pixels[0] + block[0]];
416 pixels[1] = cm[pixels[1] + block[1]];
417 pixels[2] = cm[pixels[2] + block[2]];
418 pixels[3] = cm[pixels[3] + block[3]];
419 pixels[4] = cm[pixels[4] + block[4]];
420 pixels[5] = cm[pixels[5] + block[5]];
421 pixels[6] = cm[pixels[6] + block[6]];
422 pixels[7] = cm[pixels[7] + block[7]];
428 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
432 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
434 /* read the pixels */
436 pixels[0] = cm[pixels[0] + block[0]];
437 pixels[1] = cm[pixels[1] + block[1]];
438 pixels[2] = cm[pixels[2] + block[2]];
439 pixels[3] = cm[pixels[3] + block[3]];
445 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
449 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
451 /* read the pixels */
453 pixels[0] = cm[pixels[0] + block[0]];
454 pixels[1] = cm[pixels[1] + block[1]];
460 static int sum_abs_dctelem_c(DCTELEM *block)
464 sum+= FFABS(block[i]);
/**
 * Fill a 16-wide block with a constant byte value.
 * @param block     top-left of the destination
 * @param value     fill byte
 * @param line_size stride in bytes
 * @param h         number of rows
 */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
/**
 * Fill an 8-wide block with a constant byte value.
 * @param block     top-left of the destination
 * @param value     fill byte
 * @param line_size stride in bytes
 * @param h         number of rows
 */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* rounded averages of 2 and 4 values; arguments fully parenthesized to be
   safe against expression arguments (macro-hygiene fix, same results) */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/**
 * 1/16-pel bilinear interpolation for an 8-wide block (GMC with one motion
 * vector). The four corner weights A..D sum to 256, so the >>8 normalizes.
 * @param x16,y16 fractional position in 1/16 pel, each in [0,15]
 * @param rounder rounding constant added before the shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1]
                    + C * src[stride + j] + D * src[stride + j + 1]
                    + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
/**
 * Global motion compensation with an affine motion field for an 8-wide block.
 * Per destination pixel a source position is derived from (ox,oy) advanced by
 * (dxx,dyx) per column and (dxy,dyy) per row (16.16-style fixed point, with
 * 'shift' fractional bits kept for bilinear weighting). Positions outside the
 * picture are clamped edge-wise; fully outside pixels are plain edge reads.
 * NOTE(review): body reconstructed from a truncated dump — verify against the
 * canonical FFmpeg implementation before relying on edge behavior.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;    /* last valid column index */
    height--;   /* last valid row index */

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);   /* sub-pel fraction, 'shift' bits */
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear blend of the 2x2 neighborhood */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( ( src[index         ]*(s-frac_x)
                                         + src[index       +1]*   frac_x )*(s-frac_y)
                                       + ( src[index+stride  ]*(s-frac_x)
                                         + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp row, interpolate horizontally only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( ( src[index         ]*(s-frac_x)
                                         + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp column, interpolate vertically only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( ( src[index         ]*(s-frac_y)
                                         + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* fully outside: nearest edge pixel */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* integer-position third-pel copy: dispatch to the plain copy of the given width */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_8_c (dst, src, stride, height); break;
    case 4: put_pixels4_8_c (dst, src, stride, height); break;
    case 8: put_pixels8_8_c (dst, src, stride, height); break;
    case 16:put_pixels16_8_c(dst, src, stride, height); break;
    }
}
/* third-pel MC, horizontal 1/3: out ≈ (2*a + b)/3 rounded (683/2048 ≈ 1/3) */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* third-pel MC, horizontal 2/3: out ≈ (a + 2*b)/3 rounded */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* third-pel MC, vertical 1/3: out ≈ (2*a + below)/3 rounded */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* third-pel MC, (1/3,1/3): bilinear-ish blend of the 2x2 neighborhood, 2731/32768 ≈ 1/12 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* third-pel MC, (1/3,2/3) blend of the 2x2 neighborhood */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* third-pel MC, vertical 2/3: out ≈ (a + 2*below)/3 rounded */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* third-pel MC, (2/3,1/3) blend of the 2x2 neighborhood */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* third-pel MC, (2/3,2/3) blend of the 2x2 neighborhood */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* integer-position third-pel average: dispatch to the plain averager of the given width */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_8_c (dst, src, stride, height); break;
    case 4: avg_pixels4_8_c (dst, src, stride, height); break;
    case 8: avg_pixels8_8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_8_c(dst, src, stride, height); break;
    }
}
/* averaging variant of mc10: result is rounded-up mean of old dst and the interpolation */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* averaging variant of mc20 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* averaging variant of mc01 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* averaging variant of mc11 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* averaging variant of mc12 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* averaging variant of mc02 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* averaging variant of mc21 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* averaging variant of mc22 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
766 #define QPEL_MC(r, OPNAME, RND, OP) \
767 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
768 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
772 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
773 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
774 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
775 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
776 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
777 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
778 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
779 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
785 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
787 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
791 const int src0= src[0*srcStride];\
792 const int src1= src[1*srcStride];\
793 const int src2= src[2*srcStride];\
794 const int src3= src[3*srcStride];\
795 const int src4= src[4*srcStride];\
796 const int src5= src[5*srcStride];\
797 const int src6= src[6*srcStride];\
798 const int src7= src[7*srcStride];\
799 const int src8= src[8*srcStride];\
800 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
801 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
802 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
803 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
804 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
805 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
806 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
807 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
813 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
814 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
819 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
820 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
821 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
822 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
823 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
824 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
825 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
826 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
827 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
828 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
829 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
830 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
831 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
832 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
833 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
834 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
840 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
841 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
846 const int src0= src[0*srcStride];\
847 const int src1= src[1*srcStride];\
848 const int src2= src[2*srcStride];\
849 const int src3= src[3*srcStride];\
850 const int src4= src[4*srcStride];\
851 const int src5= src[5*srcStride];\
852 const int src6= src[6*srcStride];\
853 const int src7= src[7*srcStride];\
854 const int src8= src[8*srcStride];\
855 const int src9= src[9*srcStride];\
856 const int src10= src[10*srcStride];\
857 const int src11= src[11*srcStride];\
858 const int src12= src[12*srcStride];\
859 const int src13= src[13*srcStride];\
860 const int src14= src[14*srcStride];\
861 const int src15= src[15*srcStride];\
862 const int src16= src[16*srcStride];\
863 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
864 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
865 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
866 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
867 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
868 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
869 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
870 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
871 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
872 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
873 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
874 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
875 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
876 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
877 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
878 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
884 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
886 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
887 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
890 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
891 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
894 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
896 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
897 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
900 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
903 copy_block9(full, src, 16, stride, 9);\
904 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
905 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
908 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
910 copy_block9(full, src, 16, stride, 9);\
911 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
914 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
917 copy_block9(full, src, 16, stride, 9);\
918 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
919 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
921 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
926 copy_block9(full, src, 16, stride, 9);\
927 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
928 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
929 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
930 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
932 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
936 copy_block9(full, src, 16, stride, 9);\
937 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
938 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
939 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
940 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
942 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
947 copy_block9(full, src, 16, stride, 9);\
948 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
949 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
950 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
951 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
953 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
957 copy_block9(full, src, 16, stride, 9);\
958 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
959 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
960 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
961 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
963 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
968 copy_block9(full, src, 16, stride, 9);\
969 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
970 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
971 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
972 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
974 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
978 copy_block9(full, src, 16, stride, 9);\
979 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
980 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
981 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
982 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
984 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
989 copy_block9(full, src, 16, stride, 9);\
990 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
991 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
992 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
993 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
995 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
999 copy_block9(full, src, 16, stride, 9);\
1000 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1001 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1002 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1003 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1005 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1007 uint8_t halfHV[64];\
1008 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1009 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1010 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1012 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1014 uint8_t halfHV[64];\
1015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1016 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1017 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1019 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1020 uint8_t full[16*9];\
1023 uint8_t halfHV[64];\
1024 copy_block9(full, src, 16, stride, 9);\
1025 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1026 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1028 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1030 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1031 uint8_t full[16*9];\
1033 copy_block9(full, src, 16, stride, 9);\
1034 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1035 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1036 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1038 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1039 uint8_t full[16*9];\
1042 uint8_t halfHV[64];\
1043 copy_block9(full, src, 16, stride, 9);\
1044 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1045 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1046 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1047 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1049 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1050 uint8_t full[16*9];\
1052 copy_block9(full, src, 16, stride, 9);\
1053 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1054 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1055 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1057 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1059 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1060 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1063 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1065 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1066 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1069 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1070 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1073 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1075 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1076 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1079 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1080 uint8_t full[24*17];\
1082 copy_block17(full, src, 24, stride, 17);\
1083 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1084 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1087 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1088 uint8_t full[24*17];\
1089 copy_block17(full, src, 24, stride, 17);\
1090 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1093 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1094 uint8_t full[24*17];\
1096 copy_block17(full, src, 24, stride, 17);\
1097 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1098 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1100 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1101 uint8_t full[24*17];\
1102 uint8_t halfH[272];\
1103 uint8_t halfV[256];\
1104 uint8_t halfHV[256];\
1105 copy_block17(full, src, 24, stride, 17);\
1106 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1107 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1108 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1109 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1111 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1112 uint8_t full[24*17];\
1113 uint8_t halfH[272];\
1114 uint8_t halfHV[256];\
1115 copy_block17(full, src, 24, stride, 17);\
1116 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1117 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1118 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1119 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1121 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1122 uint8_t full[24*17];\
1123 uint8_t halfH[272];\
1124 uint8_t halfV[256];\
1125 uint8_t halfHV[256];\
1126 copy_block17(full, src, 24, stride, 17);\
1127 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1128 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1129 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1130 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1132 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1133 uint8_t full[24*17];\
1134 uint8_t halfH[272];\
1135 uint8_t halfHV[256];\
1136 copy_block17(full, src, 24, stride, 17);\
1137 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1138 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1139 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1140 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1142 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1143 uint8_t full[24*17];\
1144 uint8_t halfH[272];\
1145 uint8_t halfV[256];\
1146 uint8_t halfHV[256];\
1147 copy_block17(full, src, 24, stride, 17);\
1148 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1149 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1150 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1151 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1153 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1154 uint8_t full[24*17];\
1155 uint8_t halfH[272];\
1156 uint8_t halfHV[256];\
1157 copy_block17(full, src, 24, stride, 17);\
1158 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1159 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1160 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1161 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1163 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1164 uint8_t full[24*17];\
1165 uint8_t halfH[272];\
1166 uint8_t halfV[256];\
1167 uint8_t halfHV[256];\
1168 copy_block17(full, src, 24, stride, 17);\
1169 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1170 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1171 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1172 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1174 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1175 uint8_t full[24*17];\
1176 uint8_t halfH[272];\
1177 uint8_t halfHV[256];\
1178 copy_block17(full, src, 24, stride, 17);\
1179 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1180 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1181 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1182 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1184 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1185 uint8_t halfH[272];\
1186 uint8_t halfHV[256];\
1187 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1188 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1189 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1191 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1192 uint8_t halfH[272];\
1193 uint8_t halfHV[256];\
1194 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1195 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1196 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1198 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1199 uint8_t full[24*17];\
1200 uint8_t halfH[272];\
1201 uint8_t halfV[256];\
1202 uint8_t halfHV[256];\
1203 copy_block17(full, src, 24, stride, 17);\
1204 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1205 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1207 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1209 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1210 uint8_t full[24*17];\
1211 uint8_t halfH[272];\
1212 copy_block17(full, src, 24, stride, 17);\
1213 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1214 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1215 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1217 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1218 uint8_t full[24*17];\
1219 uint8_t halfH[272];\
1220 uint8_t halfV[256];\
1221 uint8_t halfHV[256];\
1222 copy_block17(full, src, 24, stride, 17);\
1223 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1224 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1225 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1226 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1228 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1229 uint8_t full[24*17];\
1230 uint8_t halfH[272];\
1231 copy_block17(full, src, 24, stride, 17);\
1232 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1233 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1234 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1236 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1237 uint8_t halfH[272];\
1238 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1239 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Rounding primitives plugged into QPEL_MC: b is the raw interpolation sum
 * (scaled by 32); cm clips it to 0..255.  The *_no_rnd variants bias by 15
 * instead of 16, i.e. round toward zero. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the quarter-pel MC families: put, put_no_rnd and avg. */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* NOTE(review): only the *_no_rnd ops are #undef'd in this extract; confirm
 * op_avg/op_put are also undefined in the full file. */
#undef op_avg_no_rnd
#undef op_put_no_rnd
/* Full-pel (0,0) qpel cases are plain copies/averages, so alias them to the
 * generic 8-bit pixel helpers. */
#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
/* Fixed: was ff_put_pixels16x16_8_c, inconsistent with put_qpel16_mc00_c
 * above — the public wrapper is ff_put_pixels16x16_c. */
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
1263 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1264 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1268 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1269 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1270 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1271 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1272 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1273 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1274 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1275 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
#if CONFIG_RV40_DECODER
/* RV40 (3,3) quarter-pel positions degrade to the plain xy2 half-pel
 * average, so forward to the template-generated 8-bit helpers. */
void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_8_c(dst, src, stride, 16);
}
void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_8_c(dst, src, stride, 16);
}
void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_8_c(dst, src, stride, 8);
}
void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_8_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
1296 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1297 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1301 const int src_1= src[ -srcStride];
1302 const int src0 = src[0 ];
1303 const int src1 = src[ srcStride];
1304 const int src2 = src[2*srcStride];
1305 const int src3 = src[3*srcStride];
1306 const int src4 = src[4*srcStride];
1307 const int src5 = src[5*srcStride];
1308 const int src6 = src[6*srcStride];
1309 const int src7 = src[7*srcStride];
1310 const int src8 = src[8*srcStride];
1311 const int src9 = src[9*srcStride];
1312 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1313 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1314 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1315 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1316 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1317 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1318 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1319 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel (1,0): average of the source and the horizontal half-pel. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
}
/* WMV2 mspel (2,0): pure horizontal half-pel, filtered straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/* WMV2 mspel (3,0): average of src+1 and the horizontal half-pel. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
}
/* WMV2 mspel (0,2): pure vertical half-pel, filtered straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/* WMV2 mspel (1,2): average of vertical half-pel and HV half-pel.
 * halfH is 8x11 (one row above, two below) so the vertical pass on it
 * has the context it needs. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 mspel (3,2): like mc12 but the vertical half-pel is taken one
 * pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 mspel (2,2): horizontal then vertical half-pel (center position). */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
1369 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1370 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1372 const int strength= ff_h263_loop_filter_strength[qscale];
1376 int p0= src[x-2*stride];
1377 int p1= src[x-1*stride];
1378 int p2= src[x+0*stride];
1379 int p3= src[x+1*stride];
1380 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1382 if (d<-2*strength) d1= 0;
1383 else if(d<- strength) d1=-2*strength - d;
1384 else if(d< strength) d1= d;
1385 else if(d< 2*strength) d1= 2*strength - d;
1390 if(p1&256) p1= ~(p1>>31);
1391 if(p2&256) p2= ~(p2>>31);
1393 src[x-1*stride] = p1;
1394 src[x+0*stride] = p2;
1398 d2= av_clip((p0-p3)/4, -ad1, ad1);
1400 src[x-2*stride] = p0 - d2;
1401 src[x+ stride] = p3 + d2;
1406 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1407 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1409 const int strength= ff_h263_loop_filter_strength[qscale];
1413 int p0= src[y*stride-2];
1414 int p1= src[y*stride-1];
1415 int p2= src[y*stride+0];
1416 int p3= src[y*stride+1];
1417 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1419 if (d<-2*strength) d1= 0;
1420 else if(d<- strength) d1=-2*strength - d;
1421 else if(d< strength) d1= d;
1422 else if(d< 2*strength) d1= 2*strength - d;
1427 if(p1&256) p1= ~(p1>>31);
1428 if(p2&256) p2= ~(p2>>31);
1430 src[y*stride-1] = p1;
1431 src[y*stride+0] = p2;
1435 d2= av_clip((p0-p3)/4, -ad1, ad1);
1437 src[y*stride-2] = p0 - d2;
1438 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1 2 1)/4 smoothing of an 8x8 block,
 * with edge rows/columns passed through unfiltered (scaled by 4 in temp so
 * every entry carries the same fixed-point factor). */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* vertical pass into temp (values scaled by 4) */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal pass back into src with rounding */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/* Sum of absolute differences over a 16-wide block of h rows (plain SAD). */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for(i=0; i<h; i++){
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* SAD against the horizontal half-pel interpolation of pix2 (avg2 of each
 * pixel and its right neighbour; reads pix2[16]). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for(i=0; i<h; i++){
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* SAD against the vertical half-pel interpolation of pix2 (avg2 of each
 * pixel and the one below it; reads one extra row). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for(i=0; i<h; i++){
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* SAD against the diagonal half-pel interpolation of pix2 (avg4 of the 2x2
 * neighbourhood; reads pix2[16] and one extra row). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for(i=0; i<h; i++){
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* Sum of absolute differences over an 8-wide block of h rows (plain SAD). */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for(i=0; i<h; i++){
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* 8-wide SAD against the horizontal half-pel interpolation of pix2. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for(i=0; i<h; i++){
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* 8-wide SAD against the vertical half-pel interpolation of pix2. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for(i=0; i<h; i++){
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* 8-wide SAD against the diagonal half-pel interpolation of pix2. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for(i=0; i<h; i++){
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
1670 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1671 MpegEncContext *c = v;
1677 for(x=0; x<16; x++){
1678 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1681 for(x=0; x<15; x++){
1682 score2+= FFABS( s1[x ] - s1[x +stride]
1683 - s1[x+1] + s1[x+1+stride])
1684 -FFABS( s2[x ] - s2[x +stride]
1685 - s2[x+1] + s2[x+1+stride]);
1692 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1693 else return score1 + FFABS(score2)*8;
1696 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1697 MpegEncContext *c = v;
1704 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1708 score2+= FFABS( s1[x ] - s1[x +stride]
1709 - s1[x+1] + s1[x+1+stride])
1710 -FFABS( s2[x ] - s2[x +stride]
1711 - s2[x+1] + s2[x+1+stride]);
1718 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1719 else return score1 + FFABS(score2)*8;
1722 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1726 for(i=0; i<8*8; i++){
1727 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1730 assert(-512<b && b<512);
1732 sum += (w*b)*(w*b)>>4;
1737 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1740 for(i=0; i<8*8; i++){
1741 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1746 * permutes an 8x8 block.
1747 * @param block the block which will be permuted according to the given permutation vector
1748 * @param permutation the permutation vector
1749 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1750 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1751 * (inverse) permutated to scantable order!
1753 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1759 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
1761 for(i=0; i<=last; i++){
1762 const int j= scantable[i];
1767 for(i=0; i<=last; i++){
1768 const int j= scantable[i];
1769 const int perm_j= permutation[j];
1770 block[perm_j]= temp[j];
1774 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Select the 6 compare functions (one per block size/plane slot) for the
 * requested metric.  NOTE(review): the switch over `type` and most case
 * labels are missing from this extract — only a few assignments remain. */
void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
/* clear all 6 slots before filling the ones the metric defines */
memset(cmp, 0, sizeof(void*)*6);
cmp[i]= c->hadamard8_diff[i];
cmp[i]= c->dct_sad[i];
cmp[i]= c->dct264_sad[i];
cmp[i]= c->dct_max[i];
cmp[i]= c->quant_psnr[i];
/* fallback for an unknown comparison type */
av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
1838 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1840 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1841 long a = *(long*)(src+i);
1842 long b = *(long*)(dst+i);
1843 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1846 dst[i+0] += src[i+0];
1849 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1851 #if !HAVE_FAST_UNALIGNED
1852 if((long)src2 & (sizeof(long)-1)){
1853 for(i=0; i+7<w; i+=8){
1854 dst[i+0] = src1[i+0]-src2[i+0];
1855 dst[i+1] = src1[i+1]-src2[i+1];
1856 dst[i+2] = src1[i+2]-src2[i+2];
1857 dst[i+3] = src1[i+3]-src2[i+3];
1858 dst[i+4] = src1[i+4]-src2[i+4];
1859 dst[i+5] = src1[i+5]-src2[i+5];
1860 dst[i+6] = src1[i+6]-src2[i+6];
1861 dst[i+7] = src1[i+7]-src2[i+7];
1865 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1866 long a = *(long*)(src1+i);
1867 long b = *(long*)(src2+i);
1868 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1871 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median/left prediction helpers.  NOTE(review): most body lines of
 * these four functions are missing from this extract; only fragments remain. */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
/* running left value: median of left, top, left+top-topleft, plus residual */
l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
/* encoder side: same median predictor, residual is actual minus prediction */
const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
for(i=0; i<w-1; i++){
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard transform butterflies: BUTTERFLY2 writes sum/difference of two
 * inputs to two outputs, BUTTERFLY1 does it in place, and BUTTERFLYA yields
 * |x+y| + |x-y| for the final accumulation. */
#define BUTTERFLY2(o1,o2,i1,i2) \
#define BUTTERFLY1(x,y) \
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of (src - dst): 8x8 Hadamard transform of the pixel difference,
 * summing absolute transform coefficients.  NOTE(review): the declarations,
 * loop headers and accumulation braces are missing from this extract. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
//FIXME try pointer walks
/* horizontal pass: butterflies along each row of the difference */
BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical pass: butterflies along each column */
BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* final stage folded into the absolute-value accumulation */
BUTTERFLYA(temp[8*0+i], temp[8*4+i])
+BUTTERFLYA(temp[8*1+i], temp[8*5+i])
+BUTTERFLYA(temp[8*2+i], temp[8*6+i])
+BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: Hadamard transform of the source pixels themselves, with the
 * DC term subtracted at the end so the score is mean-free.  NOTE(review):
 * declarations, loop headers and braces are missing from this extract. */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
//FIXME try pointer walks
/* horizontal pass on raw source rows */
BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical pass */
BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
BUTTERFLYA(temp[8*0+i], temp[8*4+i])
+BUTTERFLYA(temp[8*1+i], temp[8*5+i])
+BUTTERFLYA(temp[8*2+i], temp[8*6+i])
+BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
2077 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2078 MpegEncContext * const s= (MpegEncContext *)c;
2079 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2083 s->dsp.diff_pixels(temp, src1, src2, stride);
2085 return s->dsp.sum_abs_dctelem(temp);
/* Interior of the H.264-style 8-point 1-D integer transform macro (DCT8_1D),
 * parameterised over SRC/DST macros; its "#define DCT8_1D ... \" header is
 * not visible in this extract.  Do not insert lines below — every line is a
 * '\\' continuation. */
const int s07 = SRC(0) + SRC(7);\
const int s16 = SRC(1) + SRC(6);\
const int s25 = SRC(2) + SRC(5);\
const int s34 = SRC(3) + SRC(4);\
const int a0 = s07 + s34;\
const int a1 = s16 + s25;\
const int a2 = s07 - s34;\
const int a3 = s16 - s25;\
const int d07 = SRC(0) - SRC(7);\
const int d16 = SRC(1) - SRC(6);\
const int d25 = SRC(2) - SRC(5);\
const int d34 = SRC(3) - SRC(4);\
const int a4 = d16 + d25 + (d07 + (d07>>1));\
const int a5 = d07 - d34 - (d25 + (d25>>1));\
const int a6 = d07 + d34 - (d16 + (d16>>1));\
const int a7 = d16 - d25 + (d34 + (d34>>1));\
DST(1, a4 + (a7>>2)) ;\
DST(2, a2 + (a3>>1)) ;\
DST(3, a5 + (a6>>2)) ;\
DST(5, a6 - (a5>>2)) ;\
DST(6, (a2>>1) - a3 ) ;\
DST(7, (a4>>2) - a7 ) ;\
/* Compare metric using the H.264 8x8 integer transform: DCT8_1D is applied
 * first over rows (writing back into dct) and then over columns, where DST
 * accumulates absolute values into sum.  NOTE(review): declarations, #undef
 * lines and the return are missing from this extract. */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
MpegEncContext * const s= (MpegEncContext *)c;
s->dsp.diff_pixels(dct[0], src1, src2, stride);
/* row pass: in-place transform of each row */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
for( i = 0; i < 8; i++ )
/* column pass: transform columns, folding |v| into sum */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
for( i = 0; i < 8; i++ )
2141 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2142 MpegEncContext * const s= (MpegEncContext *)c;
2143 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2148 s->dsp.diff_pixels(temp, src1, src2, stride);
2152 sum= FFMAX(sum, FFABS(temp[i]));
/* Compare metric: squared DCT-domain error introduced by quantize +
 * dequantize of the difference block (bak keeps the pre-quantization
 * coefficients).  NOTE(review): declarations, the fdct call, loop headers
 * and the return are missing from this extract. */
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
MpegEncContext * const s= (MpegEncContext *)c;
LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
DCTELEM * const bak = temp+64;
s->dsp.diff_pixels(temp, src1, src2, stride);
/* keep an unquantized copy to diff against */
memcpy(bak, temp, 64*sizeof(DCTELEM));
s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
s->dct_unquantize_inter(s, temp, 0, s->qscale);
ff_simple_idct_8(temp); //FIXME
sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion compare metric: quantizes the difference block, counts the
 * VLC bits of the resulting run/level pairs, reconstructs, and returns
 * distortion plus a lambda-weighted bit cost.  NOTE(review): several lines
 * (fdct call, loop bodies, run/level bookkeeping, braces) are missing from
 * this extract. */
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
MpegEncContext * const s= (MpegEncContext *)c;
const uint8_t *scantable= s->intra_scantable.permutated;
LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
int i, last, run, bits, level, distortion, start_i;
const int esc_length= s->ac_esc_length;
uint8_t * last_length;
/* work on local copies so reconstruction can overwrite lsrc2 */
copy_block8(lsrc1, src1, 8, stride, 8);
copy_block8(lsrc2, src2, 8, stride, 8);
s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra: DC coded separately via the luma DC VLC table */
length     = s->intra_ac_vlc_length;
last_length= s->intra_ac_vlc_last_length;
bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
length     = s->inter_ac_vlc_length;
last_length= s->inter_ac_vlc_last_length;
/* count bits for all run/level pairs before the last coefficient */
for(i=start_i; i<last; i++){
int j= scantable[i];
/* level fits the uni VLC table (|level| < 128 after +64 bias)? */
if((level&(~127)) == 0){
bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the last coefficient uses the "last" VLC table */
level= temp[i] + 64;
if((level&(~127)) == 0){
bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
s->dct_unquantize_intra(s, temp, 0, s->qscale);
s->dct_unquantize_inter(s, temp, 0, s->qscale);
s->dsp.idct_add(lsrc2, 8, temp);
distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
/* 109/128 ~= lambda scaling of the bit cost */
return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Rate-only compare metric: like rd8x8_c but returns just the VLC bit count
 * of the quantized difference block, with no reconstruction/distortion.
 * NOTE(review): fdct call, loop bodies, run/level bookkeeping and the
 * return are missing from this extract. */
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
MpegEncContext * const s= (MpegEncContext *)c;
const uint8_t *scantable= s->intra_scantable.permutated;
LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
int i, last, run, bits, level, start_i;
const int esc_length= s->ac_esc_length;
uint8_t * last_length;
s->dsp.diff_pixels(temp, src1, src2, stride);
s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
length     = s->intra_ac_vlc_length;
last_length= s->intra_ac_vlc_last_length;
bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
length     = s->inter_ac_vlc_length;
last_length= s->inter_ac_vlc_last_length;
for(i=start_i; i<last; i++){
int j= scantable[i];
if((level&(~127)) == 0){
bits+= length[UNI_AC_ENC_INDEX(run, level)];
level= temp[i] + 64;
if((level&(~127)) == 0){
bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* VSAD_INTRA(size): generates vsad_intra{8,16}_c — sum of absolute vertical
 * pixel gradients within a single block (intra "vertical SAD").
 * NOTE(review): interior lines of this macro are missing from this extract;
 * every line below is a '\\' continuation — do not insert lines inside. */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
for(y=1; y<h; y++){ \
for(x=0; x<size; x+=4){ \
score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
+FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* Inter vertical SAD: sum of absolute vertical gradient differences between
 * the two 16-wide blocks. */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/* SQ(a): square helper for the VSSE metrics. */
#define SQ(a) ((a)*(a))
/* VSSE_INTRA(size): generates vsse_intra{8,16}_c — sum of squared vertical
 * pixel gradients within a single block.  NOTE(review): interior lines of
 * this macro are missing; every line below is a '\\' continuation. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
for(y=1; y<h; y++){ \
for(x=0; x<size; x+=4){ \
score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
+SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* Inter vertical SSE: sum of squared vertical gradient differences between
 * the two 16-wide blocks. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/* Sum of squared differences between an int8 and an int16 vector. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int score=0;
    int i;

    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);

    return score;
}
/* Expand each 8x8 compare function into a 16x16 variant that sums the
 * scores of the four 8x8 quadrants (WRAPPER8_16_SQ defined elsewhere). */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Element-wise product: dst[i] = src0[i] * src1[i] for len elements. */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i];
}
/* dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read backwards. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}
/* Fused multiply-add over vectors: dst[i] = src0[i]*src1[i] + src2[i].
 * NOTE(review): loop-index declaration and closing brace are elided. */
2415 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
2417 for(i=0; i<len; i++)
2418 dst[i] = src0[i] * src1[i] + src2[i];
/* Overlap-add windowing used by MDCT-based codecs: walks the two halves of
 * the window symmetrically (i from -len up to 0, j from len-1 down) and
 * writes the rotated pair
 *   dst[i] = s0*wj - s1*wi;  dst[j] = s0*wi + s1*wj;
 * NOTE(review): the declarations of i/j/s0/s1/wi/wj, the loads from
 * src0/src1/win, and any pointer offsetting (dst/win are typically biased by
 * +len before the loop) are on elided lines — confirm against the full file. */
2421 static void vector_fmul_window_c(float *dst, const float *src0,
2422 const float *src1, const float *win, int len)
2428 for(i=-len, j=len-1; i<0; i++, j--) {
2433 dst[i] = s0*wj - s1*wi;
2434 dst[j] = s0*wi + s1*wj;
/* Scale a float vector by a scalar: dst[i] = src[i] * mul for i in [0,len).
 * NOTE(review): remaining parameters of the signature, the index declaration
 * and closing brace are on elided lines. */
2438 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
2442 for (i = 0; i < len; i++)
2443 dst[i] = src[i] * mul;
/* In-place butterfly over two non-aliasing float vectors: computes the
 * difference t = v1[i] - v2[i] per element. The stores that complete the
 * butterfly (sum into v1, t into v2) are on elided lines of this listing —
 * NOTE(review): confirm against the full file. restrict asserts v1 and v2
 * do not alias. */
2446 static void butterflies_float_c(float *restrict v1, float *restrict v2,
2450 for (i = 0; i < len; i++) {
2451 float t = v1[i] - v2[i];
/* Dot product of two float vectors of length len.
 * NOTE(review): the accumulator declaration, the per-element
 * multiply-accumulate statement and the return are on elided lines. */
2457 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2462 for (i = 0; i < len; i++)
/* Clip one float (carried in its IEEE-754 bit pattern) against a negative
 * minimum and positive maximum using only integer compares. For the min/max
 * sign split handled by the caller, unsigned comparison of the raw bits
 * orders negative floats correctly; flipping the sign bit (a ^ (1U<<31))
 * maps the other half-range for the max compare.
 * NOTE(review): at least one branch (the pass-through return of `a`) is on
 * an elided line — confirm the full decision tree against the file. */
2468 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2469 uint32_t maxi, uint32_t maxisign)
2472 if(a > mini) return mini;
2473 else if((a^(1U<<31)) > maxisign) return maxi;
/* Fast path of vector_clipf for min < 0 < max: reinterprets the float data
 * as uint32_t bit patterns and clips 8 elements per iteration through
 * clipf_c_one. len is assumed to be a multiple of 8 (the unroll factor).
 * NOTE(review): the uint32_t* casts of float buffers are type punning that
 * violates C strict aliasing; it survives here only by compiler flags or
 * luck — a memcpy-based pun would be the strictly conforming form. */
2477 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2479 uint32_t mini = *(uint32_t*)min;
2480 uint32_t maxi = *(uint32_t*)max;
2481 uint32_t maxisign = maxi ^ (1U<<31);
2482 uint32_t *dsti = (uint32_t*)dst;
2483 const uint32_t *srci = (const uint32_t*)src;
2484 for(i=0; i<len; i+=8) {
2485 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2486 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2487 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2488 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2489 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2490 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2491 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2492 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip each element of src into [min, max], 8 elements per iteration
 * (len is assumed to be a multiple of 8). When min < 0 < max it dispatches
 * to the integer-compare fast path above; otherwise it uses av_clipf
 * directly. NOTE(review): the else keyword joining the two branches and the
 * closing braces are on elided lines of this listing. */
2495 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2497 if(min < 0 && max > 0) {
2498 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2500 for(i=0; i < len; i+=8) {
2501 dst[i ] = av_clipf(src[i ], min, max);
2502 dst[i + 1] = av_clipf(src[i + 1], min, max);
2503 dst[i + 2] = av_clipf(src[i + 2], min, max);
2504 dst[i + 3] = av_clipf(src[i + 3], min, max);
2505 dst[i + 4] = av_clipf(src[i + 4], min, max);
2506 dst[i + 5] = av_clipf(src[i + 5], min, max);
2507 dst[i + 6] = av_clipf(src[i + 6], min, max);
2508 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* int16 dot product with per-product right shift: accumulates
 * (v1[i]*v2[i]) >> shift over `order` elements. Shifting each product
 * (rather than the sum) bounds intermediate magnitude.
 * NOTE(review): accumulator declaration, loop header and return are on
 * elided lines. */
2513 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
2518 res += (*v1++ * *v2++) >> shift;
/* Combined op: updates v1 in place with v1[i] += mul*v3[i], and (on elided
 * lines) accumulates the v1·v2 dot product that is returned. Used by lossless
 * audio codecs' adaptive FIR filters.
 * NOTE(review): the dot-product accumulation, loop header and return are on
 * elided lines — confirm the order of accumulate vs. update in the full file. */
2523 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2528 *v1++ += mul * *v3++;
/* Apply a symmetric Q15 window to int16 samples: for each i in the first
 * half, the same coefficient w = window[i] scales both input[i] and its
 * mirror input[len-i-1], with round-to-nearest ((x*w + 1<<14) >> 15).
 * len must be even (len2 = len >> 1 covers exactly half). */
2533 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2534 const int16_t *window, unsigned int len)
2537 int len2 = len >> 1;
2539 for (i = 0; i < len2; i++) {
2540 int16_t w = window[i];
2541 output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
2542 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/* Clip each int32 element into [min, max], unrolled 8 elements per loop
 * iteration; len is assumed to be a multiple of 8.
 * NOTE(review): the surrounding loop header (do/while or for) is on elided
 * lines of this listing. */
2546 static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
2547 int32_t max, unsigned int len)
2550 *dst++ = av_clip(*src++, min, max);
2551 *dst++ = av_clip(*src++, min, max);
2552 *dst++ = av_clip(*src++, min, max);
2553 *dst++ = av_clip(*src++, min, max);
2554 *dst++ = av_clip(*src++, min, max);
2555 *dst++ = av_clip(*src++, min, max);
2556 *dst++ = av_clip(*src++, min, max);
2557 *dst++ = av_clip(*src++, min, max);
/* Fixed-point cosine constants for the WMV2 8-point IDCT below:
 * Wk = round(2048*sqrt(2)*cos(k*pi/16)), i.e. 11 fractional bits.
 * NOTE(review): the IDCT code also references W0 — its #define is on an
 * elided line of this listing (presumably 2048, matching the DC scale). */
2563 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2564 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2565 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2566 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2567 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2568 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2569 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/* One horizontal pass of the WMV2 8-point IDCT, in place on 8 coefficients.
 * Odd inputs (b[1],b[3],b[5],b[7]) form the a1/a3/a5/a7 butterflies, even
 * inputs the a0/a2/a4/a6 ones; 181 ~= 128*sqrt(2) gives the cos(pi/4)
 * cross terms s1/s2. Results are rounded (+(1<<7)) and scaled down by >>8
 * to undo the 11-bit Wk scaling minus the 3 bits kept for the column pass.
 * NOTE(review): the declarations of s1/s2 are on elided lines. */
2571 static void wmv2_idct_row(short * b)
2574 int a0,a1,a2,a3,a4,a5,a6,a7;
2576 a1 = W1*b[1]+W7*b[7];
2577 a7 = W7*b[1]-W1*b[7];
2578 a5 = W5*b[5]+W3*b[3];
2579 a3 = W3*b[5]-W5*b[3];
2580 a2 = W2*b[2]+W6*b[6];
2581 a6 = W6*b[2]-W2*b[6];
2582 a0 = W0*b[0]+W0*b[4];
2583 a4 = W0*b[0]-W0*b[4];
2585 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2586 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2588 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2589 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2590 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2591 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2592 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2593 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2594 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2595 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One vertical pass of the WMV2 IDCT, in place on a column (stride 8).
 * Same butterfly structure as the row pass, but each partial keeps 3 extra
 * bits of precision (rounded >>3 instead of the row's deferred scaling) and
 * the final outputs are rounded with +(1<<13) and normalized by >>14.
 * NOTE(review): the declarations of s1/s2 are on elided lines. */
2597 static void wmv2_idct_col(short * b)
2600 int a0,a1,a2,a3,a4,a5,a6,a7;
2601 /*step 1, with extended precision*/
2602 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2603 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2604 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2605 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2606 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2607 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2608 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2609 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
2611 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2612 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2614 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2615 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2616 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2617 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2619 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2620 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2621 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2622 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 8x8 WMV2 IDCT: applies the row transform to each of the 8 rows,
 * then the column transform to each of the 8 columns, in place.
 * NOTE(review): the two for-loop headers stepping i are on elided lines. */
2624 void ff_wmv2_idct_c(short * block){
2628 wmv2_idct_row(block+i);
2631 wmv2_idct_col(block+i);
2634 /* XXX: those functions should be suppressed ASAP when all IDCTs are converted to the new API */
/* WMV2 IDCT + store: transforms the block, then clamps it to [0,255] and
 * writes it to dest (overwriting, not adding). */
2636 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2638 ff_wmv2_idct_c(block);
2639 ff_put_pixels_clamped_c(block, dest, line_size);
/* WMV2 IDCT + accumulate: transforms the block, then adds the clamped
 * result onto the existing pixels at dest (residual path). */
2641 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2643 ff_wmv2_idct_c(block);
2644 ff_add_pixels_clamped_c(block, dest, line_size);
/* Reference (jpeg) IDCT wrappers: put overwrites dest with the clamped
 * result, add accumulates onto dest.
 * NOTE(review): the IDCT call itself (presumably j_rev_dct(block)) is on an
 * elided line of each body. */
2646 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2649 ff_put_pixels_clamped_c(block, dest, line_size);
2651 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2654 ff_add_pixels_clamped_c(block, dest, line_size);
/* 4x4 (lowres=1) variants of the reference IDCT wrappers.
 * NOTE(review): the j_rev_dct4(block) call is on an elided line of each body. */
2657 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2660 put_pixels_clamped4_c(block, dest, line_size);
2662 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2665 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 (lowres=2) variants of the reference IDCT wrappers.
 * NOTE(review): the j_rev_dct2(block) call is on an elided line of each body. */
2668 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2671 put_pixels_clamped2_c(block, dest, line_size);
2673 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2676 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 (lowres=3) IDCT put: the transform degenerates to scaling the DC
 * coefficient, (block[0]+4)>>3, clamped through the crop table. */
2679 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2681 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2683 dest[0] = cm[(block[0] + 4)>>3];
/* 1x1 (lowres=3) IDCT add: adds the scaled DC, (block[0]+4)>>3, onto the
 * existing pixel and clamps via the crop table. */
2685 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2687 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2689 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2692 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2694 /* init static data */
/* One-time table initialization shared by all codec instances:
 *  - ff_cropTbl: identity for [0,255] with saturating margins on both sides
 *    (the low-margin fill, presumably zeros, is on an elided line);
 *  - ff_squareTbl: (i-256)^2 lookup for values in [-256,255];
 *  - inv_zigzag_direct16: inverse of ff_zigzag_direct, stored 1-based. */
2695 av_cold void dsputil_static_init(void)
2699 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2700 for(i=0;i<MAX_NEG_CROP;i++) {
2702 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2705 for(i=0;i<512;i++) {
2706 ff_squareTbl[i] = (i - 256) * (i - 256);
2709 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Runtime sanity check that the compiler honors 16-byte stack alignment
 * (required by SIMD code). If a LOCAL_ALIGNED_16 variable is misaligned,
 * logs a one-time warning on MMX/AltiVec builds; did_fail presumably
 * suppresses repeats (the guard and return are on elided lines). */
2712 int ff_check_alignment(void){
2713 static int did_fail=0;
2714 LOCAL_ALIGNED_16(int, aligned, [4]);
2716 if((intptr_t)aligned & 15){
2718 #if HAVE_MMX || HAVE_ALTIVEC
2719 av_log(NULL, AV_LOG_ERROR,
2720 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2721 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2722 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2723 "Do not report crashes to FFmpeg developers.\n");
2732 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2736 ff_check_alignment();
2739 if (avctx->bits_per_raw_sample == 10) {
2740 c->fdct = ff_jpeg_fdct_islow_10;
2741 c->fdct248 = ff_fdct248_islow_10;
2743 if(avctx->dct_algo==FF_DCT_FASTINT) {
2744 c->fdct = fdct_ifast;
2745 c->fdct248 = fdct_ifast248;
2747 else if(avctx->dct_algo==FF_DCT_FAAN) {
2748 c->fdct = ff_faandct;
2749 c->fdct248 = ff_faandct248;
2752 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2753 c->fdct248 = ff_fdct248_islow_8;
2756 #endif //CONFIG_ENCODERS
2758 if(avctx->lowres==1){
2759 c->idct_put= ff_jref_idct4_put;
2760 c->idct_add= ff_jref_idct4_add;
2761 c->idct = j_rev_dct4;
2762 c->idct_permutation_type= FF_NO_IDCT_PERM;
2763 }else if(avctx->lowres==2){
2764 c->idct_put= ff_jref_idct2_put;
2765 c->idct_add= ff_jref_idct2_add;
2766 c->idct = j_rev_dct2;
2767 c->idct_permutation_type= FF_NO_IDCT_PERM;
2768 }else if(avctx->lowres==3){
2769 c->idct_put= ff_jref_idct1_put;
2770 c->idct_add= ff_jref_idct1_add;
2771 c->idct = j_rev_dct1;
2772 c->idct_permutation_type= FF_NO_IDCT_PERM;
2774 if (avctx->bits_per_raw_sample == 10) {
2775 c->idct_put = ff_simple_idct_put_10;
2776 c->idct_add = ff_simple_idct_add_10;
2777 c->idct = ff_simple_idct_10;
2778 c->idct_permutation_type = FF_NO_IDCT_PERM;
2780 if(avctx->idct_algo==FF_IDCT_INT){
2781 c->idct_put= ff_jref_idct_put;
2782 c->idct_add= ff_jref_idct_add;
2783 c->idct = j_rev_dct;
2784 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2785 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2786 avctx->idct_algo==FF_IDCT_VP3){
2787 c->idct_put= ff_vp3_idct_put_c;
2788 c->idct_add= ff_vp3_idct_add_c;
2789 c->idct = ff_vp3_idct_c;
2790 c->idct_permutation_type= FF_NO_IDCT_PERM;
2791 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2792 c->idct_put= ff_wmv2_idct_put_c;
2793 c->idct_add= ff_wmv2_idct_add_c;
2794 c->idct = ff_wmv2_idct_c;
2795 c->idct_permutation_type= FF_NO_IDCT_PERM;
2796 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2797 c->idct_put= ff_faanidct_put;
2798 c->idct_add= ff_faanidct_add;
2799 c->idct = ff_faanidct;
2800 c->idct_permutation_type= FF_NO_IDCT_PERM;
2801 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2802 c->idct_put= ff_ea_idct_put_c;
2803 c->idct_permutation_type= FF_NO_IDCT_PERM;
2804 }else{ //accurate/default
2805 c->idct_put = ff_simple_idct_put_8;
2806 c->idct_add = ff_simple_idct_add_8;
2807 c->idct = ff_simple_idct_8;
2808 c->idct_permutation_type= FF_NO_IDCT_PERM;
2813 c->diff_pixels = diff_pixels_c;
2814 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2815 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2816 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2817 c->sum_abs_dctelem = sum_abs_dctelem_c;
2820 c->pix_sum = pix_sum_c;
2821 c->pix_norm1 = pix_norm1_c;
2823 c->fill_block_tab[0] = fill_block16_c;
2824 c->fill_block_tab[1] = fill_block8_c;
2826 /* TODO [0] 16 [1] 8 */
2827 c->pix_abs[0][0] = pix_abs16_c;
2828 c->pix_abs[0][1] = pix_abs16_x2_c;
2829 c->pix_abs[0][2] = pix_abs16_y2_c;
2830 c->pix_abs[0][3] = pix_abs16_xy2_c;
2831 c->pix_abs[1][0] = pix_abs8_c;
2832 c->pix_abs[1][1] = pix_abs8_x2_c;
2833 c->pix_abs[1][2] = pix_abs8_y2_c;
2834 c->pix_abs[1][3] = pix_abs8_xy2_c;
2836 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2837 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2838 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2839 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2840 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2841 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2842 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2843 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2844 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2846 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2847 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2848 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2849 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2850 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2851 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2852 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2853 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2854 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2856 #define dspfunc(PFX, IDX, NUM) \
2857 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2858 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2859 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2860 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2861 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2862 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2863 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2864 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2865 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2866 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2867 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2868 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2869 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2870 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2871 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2872 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2874 dspfunc(put_qpel, 0, 16);
2875 dspfunc(put_no_rnd_qpel, 0, 16);
2877 dspfunc(avg_qpel, 0, 16);
2878 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2880 dspfunc(put_qpel, 1, 8);
2881 dspfunc(put_no_rnd_qpel, 1, 8);
2883 dspfunc(avg_qpel, 1, 8);
2884 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2888 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2889 ff_mlp_init(c, avctx);
2891 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2892 ff_intrax8dsp_init(c,avctx);
2895 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2896 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2897 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2898 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2899 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2900 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2901 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2902 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2904 #define SET_CMP_FUNC(name) \
2905 c->name[0]= name ## 16_c;\
2906 c->name[1]= name ## 8x8_c;
2908 SET_CMP_FUNC(hadamard8_diff)
2909 c->hadamard8_diff[4]= hadamard8_intra16_c;
2910 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2911 SET_CMP_FUNC(dct_sad)
2912 SET_CMP_FUNC(dct_max)
2914 SET_CMP_FUNC(dct264_sad)
2916 c->sad[0]= pix_abs16_c;
2917 c->sad[1]= pix_abs8_c;
2921 SET_CMP_FUNC(quant_psnr)
2924 c->vsad[0]= vsad16_c;
2925 c->vsad[4]= vsad_intra16_c;
2926 c->vsad[5]= vsad_intra8_c;
2927 c->vsse[0]= vsse16_c;
2928 c->vsse[4]= vsse_intra16_c;
2929 c->vsse[5]= vsse_intra8_c;
2930 c->nsse[0]= nsse16_c;
2931 c->nsse[1]= nsse8_c;
2933 ff_dsputil_init_dwt(c);
2936 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2938 c->add_bytes= add_bytes_c;
2939 c->diff_bytes= diff_bytes_c;
2940 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2941 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2942 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2943 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2944 c->bswap_buf= bswap_buf;
2945 c->bswap16_buf = bswap16_buf;
2947 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2948 c->h263_h_loop_filter= h263_h_loop_filter_c;
2949 c->h263_v_loop_filter= h263_v_loop_filter_c;
2952 if (CONFIG_VP3_DECODER) {
2953 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
2954 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
2955 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
2958 c->h261_loop_filter= h261_loop_filter_c;
2960 c->try_8x8basis= try_8x8basis_c;
2961 c->add_8x8basis= add_8x8basis_c;
2963 #if CONFIG_VORBIS_DECODER
2964 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
2966 #if CONFIG_AC3_DECODER
2967 c->ac3_downmix = ff_ac3_downmix_c;
2969 c->vector_fmul = vector_fmul_c;
2970 c->vector_fmul_reverse = vector_fmul_reverse_c;
2971 c->vector_fmul_add = vector_fmul_add_c;
2972 c->vector_fmul_window = vector_fmul_window_c;
2973 c->vector_clipf = vector_clipf_c;
2974 c->scalarproduct_int16 = scalarproduct_int16_c;
2975 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2976 c->apply_window_int16 = apply_window_int16_c;
2977 c->vector_clip_int32 = vector_clip_int32_c;
2978 c->scalarproduct_float = scalarproduct_float_c;
2979 c->butterflies_float = butterflies_float_c;
2980 c->vector_fmul_scalar = vector_fmul_scalar_c;
2982 c->shrink[0]= av_image_copy_plane;
2983 c->shrink[1]= ff_shrink22;
2984 c->shrink[2]= ff_shrink44;
2985 c->shrink[3]= ff_shrink88;
2987 c->prefetch= just_return;
2989 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
2990 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
2994 #define FUNC(f, depth) f ## _ ## depth
2995 #define FUNCC(f, depth) f ## _ ## depth ## _c
2997 #define dspfunc1(PFX, IDX, NUM, depth)\
2998 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
2999 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3000 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3001 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3003 #define dspfunc2(PFX, IDX, NUM, depth)\
3004 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3005 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3006 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3007 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3008 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3009 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3010 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3011 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3012 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3013 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3014 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3015 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3016 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3017 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3018 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3019 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3022 #define BIT_DEPTH_FUNCS(depth, dct)\
3023 c->get_pixels = FUNCC(get_pixels ## dct , depth);\
3024 c->draw_edges = FUNCC(draw_edges , depth);\
3025 c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
3026 c->clear_block = FUNCC(clear_block ## dct , depth);\
3027 c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\
3028 c->add_pixels8 = FUNCC(add_pixels8 ## dct , depth);\
3029 c->add_pixels4 = FUNCC(add_pixels4 ## dct , depth);\
3030 c->put_no_rnd_pixels_l2[0] = FUNCC(put_no_rnd_pixels16_l2, depth);\
3031 c->put_no_rnd_pixels_l2[1] = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3033 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3034 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3035 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3036 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3037 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3038 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3040 dspfunc1(put , 0, 16, depth);\
3041 dspfunc1(put , 1, 8, depth);\
3042 dspfunc1(put , 2, 4, depth);\
3043 dspfunc1(put , 3, 2, depth);\
3044 dspfunc1(put_no_rnd, 0, 16, depth);\
3045 dspfunc1(put_no_rnd, 1, 8, depth);\
3046 dspfunc1(avg , 0, 16, depth);\
3047 dspfunc1(avg , 1, 8, depth);\
3048 dspfunc1(avg , 2, 4, depth);\
3049 dspfunc1(avg , 3, 2, depth);\
3050 dspfunc1(avg_no_rnd, 0, 16, depth);\
3051 dspfunc1(avg_no_rnd, 1, 8, depth);\
3053 dspfunc2(put_h264_qpel, 0, 16, depth);\
3054 dspfunc2(put_h264_qpel, 1, 8, depth);\
3055 dspfunc2(put_h264_qpel, 2, 4, depth);\
3056 dspfunc2(put_h264_qpel, 3, 2, depth);\
3057 dspfunc2(avg_h264_qpel, 0, 16, depth);\
3058 dspfunc2(avg_h264_qpel, 1, 8, depth);\
3059 dspfunc2(avg_h264_qpel, 2, 4, depth);
3061 switch (avctx->bits_per_raw_sample) {
3063 if (c->dct_bits == 32) {
3064 BIT_DEPTH_FUNCS(9, _32);
3066 BIT_DEPTH_FUNCS(9, _16);
3070 if (c->dct_bits == 32) {
3071 BIT_DEPTH_FUNCS(10, _32);
3073 BIT_DEPTH_FUNCS(10, _16);
3077 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3079 BIT_DEPTH_FUNCS(8, _16);
3084 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
3085 if (ARCH_ARM) dsputil_init_arm (c, avctx);
3086 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
3087 if (HAVE_VIS) dsputil_init_vis (c, avctx);
3088 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
3089 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
3090 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
3091 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
3092 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
3094 for(i=0; i<64; i++){
3095 if(!c->put_2tap_qpel_pixels_tab[0][i])
3096 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3097 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3098 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3101 switch(c->idct_permutation_type){
3102 case FF_NO_IDCT_PERM:
3104 c->idct_permutation[i]= i;
3106 case FF_LIBMPEG2_IDCT_PERM:
3108 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3110 case FF_SIMPLE_IDCT_PERM:
3112 c->idct_permutation[i]= simple_mmx_permutation[i];
3114 case FF_TRANSPOSE_IDCT_PERM:
3116 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3118 case FF_PARTTRANS_IDCT_PERM:
3120 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3122 case FF_SSE2_IDCT_PERM:
3124 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3127 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");