3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* 8-bit clipping table; indexed through a +MAX_NEG_CROP bias so that
   moderately out-of-range values clamp to 0/255.  Zero-initialized here;
   presumably filled by the DSP init code — TODO confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Squares table, accessed through a pointer biased by +256 (see the
   `sq = ff_squareTbl + 256` uses below) so signed differences in
   [-256,255] can be used directly as indices.  Filled at init time,
   presumably — TODO confirm. */
uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_template.c"
51 #include "dsputil_template.c"
55 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 evaluates to 0x0101...01 (one byte set per lane of unsigned
   long); multiplying by a byte value replicates that byte across every
   lane, giving SWAR constants sized to the native word. */
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
/* Classic JPEG/MPEG zigzag scan order: maps scan position to raster
   index within an 8x8 coefficient block. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* 16-byte aligned; presumably filled at init from ff_zigzag_direct — TODO confirm. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate (horizontal-biased) scan order used instead of the plain
   zigzag for some coding modes. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate (vertical-biased) scan order, the transpose-flavoured
   counterpart of the horizontal alternate scan above. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
/* Input permutation for the simple_idct_mmx */
/* Values are 0xRC-style packed coordinates consumed by the MMX IDCT's
   expected coefficient layout. */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Row permutation applied for the SSE2 IDCT's interleaved row layout. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Initialize a ScanTable: keep the raw scan order, build the permutated
   scan by remapping each entry through the IDCT coefficient permutation,
   and record per-position raster end markers.
   NOTE(review): the loop headers around the indexed statements are not
   visible in this excerpt; comments describe only the visible lines. */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    st->scantable= src_scantable;           /* pointer to the unpermuted scan */
        j = src_scantable[i];               /* raster index at scan position i */
        st->permutated[i] = permutation[j]; /* apply IDCT permutation */
        j = st->permutated[i];
    st->raster_end[i]= end;                 /* last raster index reached so far */
/* Sum the 256 pixel values of a 16x16 block (rows are line_size apart).
   NOTE(review): the accumulation statements and return are missing from
   this excerpt. */
static int pix_sum_c(uint8_t * pix, int line_size)
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
    pix += line_size - 16;   /* step to the start of the next row */
/* Sum of squares of the 256 pixels of a 16x16 block, using the biased
   squares table.  Pixels are loaded 8 at a time as one 64-bit word or
   two 32-bit words; presumably selected by a HAVE_FAST_64BIT-style
   #if that is not visible in this excerpt — TODO confirm. */
static int pix_norm1_c(uint8_t * pix, int line_size)
    uint32_t *sq = ff_squareTbl + 256;   /* bias so bytes index directly */
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            /* 64-bit path: extract and square all 8 bytes of the word */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
            /* 32-bit path: two 4-byte loads */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
    pix += line_size - 16;   /* step to the start of the next row */
/* Byte-swap w 32-bit words from src into dst.  Main loop is unrolled by
   8; the trailing statement handles the remaining (<8) words.
   NOTE(review): the tail-loop header is not visible in this excerpt. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
        /* tail: one word per iteration */
        dst[i+0]= av_bswap32(src[i+0]);
/* Byte-swap len 16-bit values from src into dst.
   NOTE(review): the loop header is missing from this excerpt. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
        *dst++ = av_bswap16(*src++);
/* Sum of squared errors over a 4-pixel-wide block, h rows.  Uses the
   +256-biased squares table so signed byte differences index directly.
   NOTE(review): accumulator init, per-row pointer advances and the
   return are missing from this excerpt. */
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
    uint32_t *sq = ff_squareTbl + 256;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors over an 8-pixel-wide block, h rows; unrolled
   across the row.  NOTE(review): accumulator init, pointer advances and
   return are missing from this excerpt. */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
    uint32_t *sq = ff_squareTbl + 256;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors over a 16-pixel-wide block, h rows; fully
   unrolled across the row.  NOTE(review): accumulator init, pointer
   advances and return are missing from this excerpt. */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    uint32_t *sq = ff_squareTbl + 256;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];
/* Store the per-pixel difference s1 - s2 of an 8-wide block into a
   DCTELEM block (input to the forward DCT for inter coding).
   NOTE(review): the row loop and pointer advances are missing from this
   excerpt. */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    /* read the pixels */
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
/* Copy an 8-wide block of DCTELEM values to 8-bit pixels, clamping each
   value to [0,255] via the biased crop table.
   NOTE(review): the row loop and pointer advances are missing from this
   excerpt. */
void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* read the pixels */
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];
/* 4-wide variant of ff_put_pixels_clamped_c: clamp DCTELEMs to bytes.
   NOTE(review): row loop/advances missing from this excerpt. */
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* read the pixels */
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
/* 2-wide variant of ff_put_pixels_clamped_c: clamp DCTELEMs to bytes.
   NOTE(review): row loop/advances missing from this excerpt. */
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* read the pixels */
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
/* Copy an 8x8 block of signed DCTELEM values to pixels, mapping the
   signed range to unsigned by adding 128 and clamping to [-128,127]
   beforehand.  NOTE(review): the clamp-low branch and pointer stepping
   lines are missing from this excerpt. */
void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
                                    uint8_t *restrict pixels,
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            else if (*block > 127)
                *pixels = (uint8_t)(*block + 128);  /* re-center into 0..255 */
        pixels += (line_size - 8);   /* advance to next row */
/* Add an 8-wide block of DCTELEM residuals to existing pixels, clamping
   the result to [0,255] (IDCT add for inter blocks).
   NOTE(review): row loop and pointer advances missing from this excerpt. */
void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* read the pixels */
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of ff_add_pixels_clamped_c.
   NOTE(review): row loop/advances missing from this excerpt. */
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* read the pixels */
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of ff_add_pixels_clamped_c.
   NOTE(review): row loop/advances missing from this excerpt. */
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* read the pixels */
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
/* Sum of absolute values of the block's DCT coefficients.
   NOTE(review): loop header, accumulator init and return missing from
   this excerpt. */
static int sum_abs_dctelem_c(DCTELEM *block)
        sum+= FFABS(block[i]);
/* Fill a 16-wide, h-tall block with a constant byte value.
   NOTE(review): the per-row `block += line_size` advance is missing
   from this excerpt. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
    for (i = 0; i < h; i++) {
        memset(block, value, 16);
/* Fill an 8-wide, h-tall block with a constant byte value.
   NOTE(review): the per-row pointer advance is missing from this
   excerpt. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
    for (i = 0; i < h; i++) {
        memset(block, value, 8);
/* Rounded averages of 2 and 4 values.  Arguments and the whole
   expansion are fully parenthesized so call sites may pass compound
   expressions (e.g. avg2(a & b, c)) without precedence surprises;
   each argument is still evaluated exactly once, as before. */
#define avg2(a,b) (((a) + (b) + 1) >> 1)
#define avg4(a,b,c,d) (((a) + (b) + (c) + (d) + 2) >> 2)
/* One-point GMC: bilinear interpolation of an 8-wide block at a 1/16-pel
   position (x16, y16).  The four weights A..D sum to 256, hence the >>8
   after adding `rounder`.  NOTE(review): the row loop and src/dst
   stride advances are missing from this excerpt. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
    const int A=(16-x16)*(16-y16);   /* top-left weight */
    const int B=( x16)*(16-y16);     /* top-right weight */
    const int C=(16-x16)*( y16);     /* bottom-left weight */
    const int D=( x16)*( y16);       /* bottom-right weight */
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General global motion compensation: for each destination pixel of an
   8-wide block, compute an affine source position (via dxx/dxy/dyx/dyy,
   not all update lines visible here), split it into integer and
   fractional parts, and bilinearly interpolate.  Pixels whose source
   coordinate falls outside the picture are clamped to the nearest edge,
   degenerating to 1-D interpolation or a plain copy.
   NOTE(review): the coordinate-update statements, rounding terms and
   several closing braces are missing from this excerpt. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
    const int s= 1<<shift;   /* one full pel in fixed-point units */
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                    /* y outside: clamp row, interpolate horizontally only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                if((unsigned)src_y < height){
                    /* x outside: clamp column, interpolate vertically only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                    /* both outside: copy the clamped corner pixel */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
/* Third-pel ("tpel") motion-compensation put primitives.  The name
   suffix mcXY encodes the fractional position (X thirds horizontally,
   Y thirds vertically).  683 ~= 2048/3 and 2731 ~= 32768/3, so the
   >>11 / >>15 shifts implement rounded division by 3 of the weighted
   neighbour sums.  NOTE(review): loop closing braces and src/dst
   stride advances are missing from this excerpt. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* integer position: plain copy, dispatched on block width */
    case 2: put_pixels2_8_c (dst, src, stride, height); break;
    case 4: put_pixels4_8_c (dst, src, stride, height); break;
    case 8: put_pixels8_8_c (dst, src, stride, height); break;
    case 16:put_pixels16_8_c(dst, src, stride, height); break;

/* (1/3, 0): 2/3 current + 1/3 right neighbour */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;

/* (2/3, 0): 1/3 current + 2/3 right neighbour */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;

/* (0, 1/3): 2/3 current + 1/3 below */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;

/* (1/3, 1/3): 2-D weighted average of the 2x2 neighbourhood */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;

/* (1/3, 2/3) */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;

/* (0, 2/3): 1/3 current + 2/3 below */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;

/* (2/3, 1/3) */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;

/* (2/3, 2/3) */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Third-pel motion-compensation averaging primitives: compute the same
   interpolated value as the corresponding put_tpel_* function, then
   round-average it with the value already in dst.
   NOTE(review): loop closing braces and stride advances are missing
   from this excerpt. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    /* integer position: plain averaging copy, dispatched on width */
    case 2: avg_pixels2_8_c (dst, src, stride, height); break;
    case 4: avg_pixels4_8_c (dst, src, stride, height); break;
    case 8: avg_pixels8_8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_8_c(dst, src, stride, height); break;

/* (1/3, 0) with rounding average into dst */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;

/* (2/3, 0) */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;

/* (0, 1/3) */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;

/* (1/3, 1/3) */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;

/* (1/3, 2/3) */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;

/* (0, 2/3) */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;

/* (2/3, 1/3) */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;

/* (2/3, 2/3) */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
767 #define QPEL_MC(r, OPNAME, RND, OP) \
768 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
769 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
773 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
774 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
775 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
776 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
777 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
778 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
779 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
780 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
786 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
788 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
792 const int src0= src[0*srcStride];\
793 const int src1= src[1*srcStride];\
794 const int src2= src[2*srcStride];\
795 const int src3= src[3*srcStride];\
796 const int src4= src[4*srcStride];\
797 const int src5= src[5*srcStride];\
798 const int src6= src[6*srcStride];\
799 const int src7= src[7*srcStride];\
800 const int src8= src[8*srcStride];\
801 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
802 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
803 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
804 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
805 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
806 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
807 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
808 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
814 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
815 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
820 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
821 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
822 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
823 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
824 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
825 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
826 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
827 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
828 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
829 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
830 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
831 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
832 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
833 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
834 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
835 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
841 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
842 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
847 const int src0= src[0*srcStride];\
848 const int src1= src[1*srcStride];\
849 const int src2= src[2*srcStride];\
850 const int src3= src[3*srcStride];\
851 const int src4= src[4*srcStride];\
852 const int src5= src[5*srcStride];\
853 const int src6= src[6*srcStride];\
854 const int src7= src[7*srcStride];\
855 const int src8= src[8*srcStride];\
856 const int src9= src[9*srcStride];\
857 const int src10= src[10*srcStride];\
858 const int src11= src[11*srcStride];\
859 const int src12= src[12*srcStride];\
860 const int src13= src[13*srcStride];\
861 const int src14= src[14*srcStride];\
862 const int src15= src[15*srcStride];\
863 const int src16= src[16*srcStride];\
864 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
865 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
866 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
867 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
868 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
869 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
870 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
871 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
872 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
873 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
874 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
875 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
876 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
877 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
878 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
879 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
885 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
887 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
888 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
891 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
892 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
895 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
897 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
898 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
901 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
904 copy_block9(full, src, 16, stride, 9);\
905 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
906 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
909 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
911 copy_block9(full, src, 16, stride, 9);\
912 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
915 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
918 copy_block9(full, src, 16, stride, 9);\
919 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
920 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
922 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
927 copy_block9(full, src, 16, stride, 9);\
928 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
929 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
930 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
931 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
933 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
937 copy_block9(full, src, 16, stride, 9);\
938 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
939 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
940 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
941 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
943 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
948 copy_block9(full, src, 16, stride, 9);\
949 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
950 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
951 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
952 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
954 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
958 copy_block9(full, src, 16, stride, 9);\
959 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
960 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
961 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
962 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
964 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
969 copy_block9(full, src, 16, stride, 9);\
970 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
971 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
972 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
973 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
975 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
979 copy_block9(full, src, 16, stride, 9);\
980 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
981 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
982 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
983 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
985 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
990 copy_block9(full, src, 16, stride, 9);\
991 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
992 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
993 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
994 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
996 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1000 copy_block9(full, src, 16, stride, 9);\
1001 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1002 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1003 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1004 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1006 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1008 uint8_t halfHV[64];\
1009 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1010 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1011 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1013 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1015 uint8_t halfHV[64];\
1016 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1020 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021 uint8_t full[16*9];\
1024 uint8_t halfHV[64];\
1025 copy_block9(full, src, 16, stride, 9);\
1026 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1028 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1031 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1032 uint8_t full[16*9];\
1034 copy_block9(full, src, 16, stride, 9);\
1035 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1036 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1037 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1039 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1040 uint8_t full[16*9];\
1043 uint8_t halfHV[64];\
1044 copy_block9(full, src, 16, stride, 9);\
1045 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1046 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1047 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1048 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1050 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1051 uint8_t full[16*9];\
1053 copy_block9(full, src, 16, stride, 9);\
1054 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1055 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1056 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1058 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1060 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1061 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1064 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1066 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1067 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1070 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1071 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1074 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1076 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1077 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1080 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1081 uint8_t full[24*17];\
1083 copy_block17(full, src, 24, stride, 17);\
1084 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1085 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1088 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1089 uint8_t full[24*17];\
1090 copy_block17(full, src, 24, stride, 17);\
1091 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1094 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1095 uint8_t full[24*17];\
1097 copy_block17(full, src, 24, stride, 17);\
1098 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1099 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1101 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1102 uint8_t full[24*17];\
1103 uint8_t halfH[272];\
1104 uint8_t halfV[256];\
1105 uint8_t halfHV[256];\
1106 copy_block17(full, src, 24, stride, 17);\
1107 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1108 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1109 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1110 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1112 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1113 uint8_t full[24*17];\
1114 uint8_t halfH[272];\
1115 uint8_t halfHV[256];\
1116 copy_block17(full, src, 24, stride, 17);\
1117 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1118 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1119 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1120 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1122 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1123 uint8_t full[24*17];\
1124 uint8_t halfH[272];\
1125 uint8_t halfV[256];\
1126 uint8_t halfHV[256];\
1127 copy_block17(full, src, 24, stride, 17);\
1128 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1129 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1130 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1131 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1133 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1134 uint8_t full[24*17];\
1135 uint8_t halfH[272];\
1136 uint8_t halfHV[256];\
1137 copy_block17(full, src, 24, stride, 17);\
1138 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1139 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1140 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1141 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1143 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1144 uint8_t full[24*17];\
1145 uint8_t halfH[272];\
1146 uint8_t halfV[256];\
1147 uint8_t halfHV[256];\
1148 copy_block17(full, src, 24, stride, 17);\
1149 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1150 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1151 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1152 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1154 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1155 uint8_t full[24*17];\
1156 uint8_t halfH[272];\
1157 uint8_t halfHV[256];\
1158 copy_block17(full, src, 24, stride, 17);\
1159 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1160 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1161 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1162 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1164 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1165 uint8_t full[24*17];\
1166 uint8_t halfH[272];\
1167 uint8_t halfV[256];\
1168 uint8_t halfHV[256];\
1169 copy_block17(full, src, 24, stride, 17);\
1170 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1171 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1172 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1173 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1175 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1176 uint8_t full[24*17];\
1177 uint8_t halfH[272];\
1178 uint8_t halfHV[256];\
1179 copy_block17(full, src, 24, stride, 17);\
1180 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1181 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1182 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1183 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1185 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1186 uint8_t halfH[272];\
1187 uint8_t halfHV[256];\
1188 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1189 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1190 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1192 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1193 uint8_t halfH[272];\
1194 uint8_t halfHV[256];\
1195 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1196 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1199 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200 uint8_t full[24*17];\
1201 uint8_t halfH[272];\
1202 uint8_t halfV[256];\
1203 uint8_t halfHV[256];\
1204 copy_block17(full, src, 24, stride, 17);\
1205 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1207 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1210 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1211 uint8_t full[24*17];\
1212 uint8_t halfH[272];\
1213 copy_block17(full, src, 24, stride, 17);\
1214 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1215 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1216 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1218 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1219 uint8_t full[24*17];\
1220 uint8_t halfH[272];\
1221 uint8_t halfV[256];\
1222 uint8_t halfHV[256];\
1223 copy_block17(full, src, 24, stride, 17);\
1224 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1225 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1226 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1227 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1229 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1230 uint8_t full[24*17];\
1231 uint8_t halfH[272];\
1232 copy_block17(full, src, 24, stride, 17);\
1233 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1234 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1235 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1237 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1238 uint8_t halfH[272];\
1239 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1240 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Rounding/non-rounding pixel write primitives used by QPEL_MC: each takes
 * an unclipped 20-bit-ish filter sum `b`, adds the rounding constant (16 for
 * rounded, 15 for no-rnd), shifts down by 5 and clips via the crop table. */
1243 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1244 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1245 #define op_put(a, b) a = cm[((b) + 16)>>5]
1246 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the full set of 16 quarter-pel MC functions for put, put_no_rnd
 * and avg flavours; avg_no_rnd is intentionally left disabled. */
1248 QPEL_MC(0, put_ , _ , op_put)
1249 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1250 QPEL_MC(0, avg_ , _ , op_avg)
1251 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* NOTE(review): line numbers jump (1251 -> 1253 -> 1255); the #undef lines for
 * op_avg and op_put appear to be missing from this chunk. */
1253 #undef op_avg_no_rnd
1255 #undef op_put_no_rnd
/* The (0,0) quarter-pel cases are plain block copies, so alias them to the
 * shared pixel-copy helpers instead of instantiating dedicated functions.
 * Fix: the no-rnd 16x16 alias pointed at ff_put_pixels16x16_8_c while the
 * rounded 16x16 alias used ff_put_pixels16x16_c for the very same copy —
 * unify on ff_put_pixels16x16_c for consistency. */
#define put_qpel8_mc00_c         ff_put_pixels8x8_c
#define avg_qpel8_mc00_c         ff_avg_pixels8x8_c
#define put_qpel16_mc00_c        ff_put_pixels16x16_c
#define avg_qpel16_mc00_c        ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/*
 * WMV2 horizontal mspel filter: 4-tap (-1, 9, 9, -1)/16 kernel with +8
 * rounding, clipped through the crop table, applied to one 8-pixel row.
 * NOTE(review): the per-row loop header, the dst/src stride advances and the
 * closing braces are missing here (line numbers jump 1265 -> 1269 and stop at
 * 1276) — restore from upstream before compiling.
 */
1264 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1265 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1269 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1270 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1271 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1272 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1273 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1274 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1275 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1276 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* RV40 (3,3) quarter-pel MC: RV40 defines this position as the plain
 * half-pel xy2 average, so these wrappers just forward to the generic
 * 8-bit xy2 copy/average helpers.
 * NOTE(review): the closing brace of each wrapper is missing in this chunk
 * (line numbers 1285/1288/1291/1294 absent). */
1282 #if CONFIG_RV40_DECODER
1283 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1284 put_pixels16_xy2_8_c(dst, src, stride, 16);
1286 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1287 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1289 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1290 put_pixels8_xy2_8_c(dst, src, stride, 8);
1292 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1293 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1295 #endif /* CONFIG_RV40_DECODER */
/*
 * WMV2 vertical mspel filter: same (-1, 9, 9, -1)/16 kernel as the
 * horizontal variant, applied down one column of 8 output pixels.
 * NOTE(review): the per-column loop header, src/dst advances and closing
 * braces are missing (line numbers jump 1298 -> 1302 and stop at 1320).
 */
1297 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1298 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* Pre-load the 11 source samples the 8 filtered outputs depend on. */
1302 const int src_1= src[ -srcStride];
1303 const int src0 = src[0 ];
1304 const int src1 = src[ srcStride];
1305 const int src2 = src[2*srcStride];
1306 const int src3 = src[3*srcStride];
1307 const int src4 = src[4*srcStride];
1308 const int src5 = src[5*srcStride];
1309 const int src6 = src[6*srcStride];
1310 const int src7 = src[7*srcStride];
1311 const int src8 = src[8*srcStride];
1312 const int src9 = src[9*srcStride];
1313 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1314 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1315 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1316 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1317 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1318 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1319 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1320 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/*
 * WMV2 mspel motion-compensation positions, built from the h/v lowpass
 * filters above: mcXY names encode the sub-pel position (X horizontal,
 * Y vertical; 1 = quarter, 2 = half, 3 = three-quarter).
 * NOTE(review): local half/halfH/halfV/halfHV buffer declarations and
 * closing braces are missing throughout this run (gaps in the embedded
 * line numbers, e.g. 1326 -> 1328, 1346 -> 1350).
 */
1326 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1328 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1329 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1332 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1333 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1336 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1338 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1339 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1342 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1343 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1346 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1350 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1351 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1352 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1353 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1355 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1359 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1360 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1361 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1362 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1364 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1366 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1367 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/*
 * H.263 deblocking filter across a horizontal block edge (vertical
 * filtering): for each column x, reads the 2 pixels on either side of the
 * edge (p0..p3), computes the strength-clipped correction d1 and applies it
 * to p1/p2, plus a smaller clipped correction d2 to p0/p3.
 * NOTE(review): the column loop header, the d1/d2/ad1 declarations and the
 * p1/p2 +-d1 updates are missing from this chunk (line-number gaps
 * 1373 -> 1377, 1386 -> 1391); restore from upstream before compiling.
 */
1370 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1371 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1373 const int strength= ff_h263_loop_filter_strength[qscale];
1377 int p0= src[x-2*stride];
1378 int p1= src[x-1*stride];
1379 int p2= src[x+0*stride];
1380 int p3= src[x+1*stride];
1381 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* Piecewise-linear clipping of the edge delta against the filter strength. */
1383 if (d<-2*strength) d1= 0;
1384 else if(d<- strength) d1=-2*strength - d;
1385 else if(d< strength) d1= d;
1386 else if(d< 2*strength) d1= 2*strength - d;
/* Branch-free clamp of p1/p2 to 0..255 after the +-d1 update. */
1391 if(p1&256) p1= ~(p1>>31);
1392 if(p2&256) p2= ~(p2>>31);
1394 src[x-1*stride] = p1;
1395 src[x+0*stride] = p2;
1399 d2= av_clip((p0-p3)/4, -ad1, ad1);
1401 src[x-2*stride] = p0 - d2;
1402 src[x+ stride] = p3 + d2;
/*
 * H.263 deblocking filter across a vertical block edge (horizontal
 * filtering): identical math to h263_v_loop_filter_c but walking rows y and
 * indexing pixels left/right of the edge instead of above/below.
 * NOTE(review): same truncation as the vertical variant — row loop header,
 * d1/d2/ad1 declarations and the p1/p2 +-d1 updates are missing here.
 */
1407 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1408 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1410 const int strength= ff_h263_loop_filter_strength[qscale];
1414 int p0= src[y*stride-2];
1415 int p1= src[y*stride-1];
1416 int p2= src[y*stride+0];
1417 int p3= src[y*stride+1];
1418 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1420 if (d<-2*strength) d1= 0;
1421 else if(d<- strength) d1=-2*strength - d;
1422 else if(d< strength) d1= d;
1423 else if(d< 2*strength) d1= 2*strength - d;
1428 if(p1&256) p1= ~(p1>>31);
1429 if(p2&256) p2= ~(p2>>31);
1431 src[y*stride-1] = p1;
1432 src[y*stride+0] = p2;
1436 d2= av_clip((p0-p3)/4, -ad1, ad1);
1438 src[y*stride-2] = p0 - d2;
1439 src[y*stride+1] = p3 + d2;
/*
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block using
 * a temp buffer, with the border rows/columns passed through with rounding
 * only.
 * NOTE(review): the loop headers, `temp`/`xy`/`yz` declarations and closing
 * braces are missing (large gaps in the embedded line numbers).
 */
1444 static void h261_loop_filter_c(uint8_t *src, int stride){
1449 temp[x ] = 4*src[x ];
1450 temp[x + 7*8] = 4*src[x + 7*stride];
1454 xy = y * stride + x;
1456 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1461 src[ y*stride] = (temp[ y*8] + 2)>>2;
1462 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1464 xy = y * stride + x;
1466 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/*
 * 16-wide SAD (sum of absolute differences) motion-estimation comparators.
 * Variants: plain, _x2 (pix2 half-pel averaged horizontally), _y2 (averaged
 * vertically with the next row pix3) and _xy2 (4-tap diagonal average).
 * The unused void* first argument matches the me_cmp_func signature.
 * NOTE(review): in every function here the accumulator init, the `for(h)`
 * loop header, the pointer advances and the `return s;` are missing
 * (line-number gaps such as 1471 -> 1477 and 1492 -> 1499).
 */
1471 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1477 s += abs(pix1[0] - pix2[0]);
1478 s += abs(pix1[1] - pix2[1]);
1479 s += abs(pix1[2] - pix2[2]);
1480 s += abs(pix1[3] - pix2[3]);
1481 s += abs(pix1[4] - pix2[4]);
1482 s += abs(pix1[5] - pix2[5]);
1483 s += abs(pix1[6] - pix2[6]);
1484 s += abs(pix1[7] - pix2[7]);
1485 s += abs(pix1[8] - pix2[8]);
1486 s += abs(pix1[9] - pix2[9]);
1487 s += abs(pix1[10] - pix2[10]);
1488 s += abs(pix1[11] - pix2[11]);
1489 s += abs(pix1[12] - pix2[12]);
1490 s += abs(pix1[13] - pix2[13]);
1491 s += abs(pix1[14] - pix2[14]);
1492 s += abs(pix1[15] - pix2[15]);
/* Half-pel horizontal: compare against the average of adjacent pix2 pixels. */
1499 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1505 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1506 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1507 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1508 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1509 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1510 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1511 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1512 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1513 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1514 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1515 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1516 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1517 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1518 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1519 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1520 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* Half-pel vertical: pix3 points at the row below pix2. */
1527 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1530 uint8_t *pix3 = pix2 + line_size;
1534 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1535 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1536 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1537 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1538 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1539 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1540 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1541 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1542 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1543 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1544 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1545 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1546 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1547 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1548 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1549 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* Half-pel diagonal: 4-point average of the 2x2 pix2/pix3 neighbourhood. */
1557 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1560 uint8_t *pix3 = pix2 + line_size;
1564 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1565 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1566 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1567 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1568 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1569 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1570 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1571 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1572 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1573 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1574 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1575 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1576 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1577 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1578 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1579 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/*
 * 8-wide SAD comparators — same four half-pel variants as the 16-wide
 * family above, over 8 columns per row.
 * NOTE(review): accumulator init, row loop, pointer advances and returns
 * are missing here as well (gaps in the embedded line numbers).
 */
1587 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1593 s += abs(pix1[0] - pix2[0]);
1594 s += abs(pix1[1] - pix2[1]);
1595 s += abs(pix1[2] - pix2[2]);
1596 s += abs(pix1[3] - pix2[3]);
1597 s += abs(pix1[4] - pix2[4]);
1598 s += abs(pix1[5] - pix2[5]);
1599 s += abs(pix1[6] - pix2[6]);
1600 s += abs(pix1[7] - pix2[7]);
1607 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1613 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1614 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1615 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1616 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1617 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1618 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1619 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1620 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1627 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1630 uint8_t *pix3 = pix2 + line_size;
1634 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1635 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1636 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1637 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1638 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1639 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1640 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1641 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1649 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1652 uint8_t *pix3 = pix2 + line_size;
1656 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1657 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1658 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1659 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1660 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1661 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1662 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1663 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/*
 * Noise-preserving SSE comparators: score1 is plain SSE, score2 measures
 * the difference in local gradient structure between s1 and s2; the final
 * score weights |score2| by avctx->nsse_weight (8 when no context given).
 * NOTE(review): loop headers over x/y, score initialisation and the stride
 * advances are missing in both functions (line-number gaps).
 */
1671 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1672 MpegEncContext *c = v;
1678 for(x=0; x<16; x++){
1679 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1682 for(x=0; x<15; x++){
1683 score2+= FFABS( s1[x ] - s1[x +stride]
1684 - s1[x+1] + s1[x+1+stride])
1685 -FFABS( s2[x ] - s2[x +stride]
1686 - s2[x+1] + s2[x+1+stride]);
1693 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1694 else return score1 + FFABS(score2)*8;
/* 8-wide variant of the same metric. */
1697 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1698 MpegEncContext *c = v;
1705 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1709 score2+= FFABS( s1[x ] - s1[x +stride]
1710 - s1[x+1] + s1[x+1+stride])
1711 -FFABS( s2[x ] - s2[x +stride]
1712 - s2[x+1] + s2[x+1+stride]);
1719 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1720 else return score1 + FFABS(score2)*8;
/*
 * Trellis/rate-distortion helpers: try_8x8basis_c evaluates the weighted
 * squared error of adding `basis`, scaled by `scale`, to the residual `rem`;
 * add_8x8basis_c applies that scaled basis to `rem` in place. Both round via
 * the (BASIS_SHIFT - RECON_SHIFT) fixed-point shift.
 * NOTE(review): `sum`/`w` setup, the `return` and closing braces are missing.
 */
1723 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1727 for(i=0; i<8*8; i++){
1728 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1731 assert(-512<b && b<512);
1733 sum += (w*b)*(w*b)>>4;
1738 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1741 for(i=0; i<8*8; i++){
1742 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1747 * permutes an 8x8 block.
1748 * @param block the block which will be permuted according to the given permutation vector
1749 * @param permutation the permutation vector
1750 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1751 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1752 * (inverse) permutated to scantable order!
1754 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1760 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* First pass: stash the nonzero coefficients into temp (the copy-and-clear
 * lines are missing from this chunk — line numbers jump 1763 -> 1768). */
1762 for(i=0; i<=last; i++){
1763 const int j= scantable[i];
/* Second pass: write them back at their permuted positions. */
1768 for(i=0; i<=last; i++){
1769 const int j= scantable[i];
1770 const int perm_j= permutation[j];
1771 block[perm_j]= temp[j];
/* Dummy comparator that always reports a zero cost (body truncated here;
 * presumably just `return 0;` — confirm against upstream). */
1775 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/*
 * Fill the 6-entry comparison-function array `cmp` with the DSPContext
 * implementations selected by `type` (SAD, hadamard, dct_sad, ...).
 * NOTE(review): the switch statement and most case labels are missing
 * here (large gaps between the embedded line numbers 1782..1834); only a
 * few assignment lines and the error path survived the truncation.
 */
1779 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1782 memset(cmp, 0, sizeof(void*)*6);
1790 cmp[i]= c->hadamard8_diff[i];
1796 cmp[i]= c->dct_sad[i];
1799 cmp[i]= c->dct264_sad[i];
1802 cmp[i]= c->dct_max[i];
1805 cmp[i]= c->quant_psnr[i];
1834 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/*
 * SWAR byte additions: add `long`-sized groups of bytes in parallel using
 * the pb_7f/pb_80 masks (7-bit add plus carry-bit fixup via XOR), then a
 * scalar tail loop for the remaining bytes.
 * NOTE(review): the `int i;` declarations, tail-loop headers and closing
 * braces are missing in both functions.
 */
1839 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1841 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1842 long a = *(long*)(src+i);
1843 long b = *(long*)(dst+i);
1844 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1847 dst[i+0] += src[i+0];
/* Same trick for dst = src1 + src2. */
1850 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1852 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1853 long a = *(long*)(src1+i);
1854 long b = *(long*)(src2+i);
1855 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1858 dst[i] = src1[i]+src2[i];
/*
 * SWAR byte subtraction dst = src1 - src2: unrolled scalar path when src2 is
 * misaligned on targets without fast unaligned access, otherwise long-at-a-
 * time borrow-masked subtraction, plus a scalar tail.
 * NOTE(review): `int i;`, the `}else{`/closing braces and the tail-loop
 * header are missing from this chunk.
 */
1861 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1863 #if !HAVE_FAST_UNALIGNED
1864 if((long)src2 & (sizeof(long)-1)){
1865 for(i=0; i+7<w; i+=8){
1866 dst[i+0] = src1[i+0]-src2[i+0];
1867 dst[i+1] = src1[i+1]-src2[i+1];
1868 dst[i+2] = src1[i+2]-src2[i+2];
1869 dst[i+3] = src1[i+3]-src2[i+3];
1870 dst[i+4] = src1[i+4]-src2[i+4];
1871 dst[i+5] = src1[i+5]-src2[i+5];
1872 dst[i+6] = src1[i+6]-src2[i+6];
1873 dst[i+7] = src1[i+7]-src2[i+7];
1877 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1878 long a = *(long*)(src1+i);
1879 long b = *(long*)(src2+i);
1880 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1883 dst[i+0] = src1[i+0]-src2[i+0];
/*
 * HuffYUV spatial predictors (median and left prediction, add and subtract
 * directions). Only the signatures and one representative line of each body
 * survived the truncation; the l/lt state handling, loops and returns are
 * missing — restore from upstream before compiling.
 */
1886 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1894 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1903 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1911 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
1921 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1924 for(i=0; i<w-1; i++){
1951 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers used by the hadamard8_* comparators below.
 * NOTE(review): the macro bodies (the continuation lines after each `\`)
 * are missing from this chunk; only BUTTERFLYA survived intact. */
1981 #define BUTTERFLY2(o1,o2,i1,i2) \
1985 #define BUTTERFLY1(x,y) \
1994 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/*
 * 8x8 SATD: Hadamard transform of the src-dst residual (row butterflies,
 * then column butterflies), summing absolute transform coefficients.
 * NOTE(review): the `temp`/`sum`/loop declarations, loop headers and the
 * final `return sum;` are missing from this chunk.
 */
1996 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2004 //FIXME try pointer walks
2005 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2006 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2007 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2008 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2010 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2011 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2012 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2013 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2015 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2016 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2017 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2018 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column pass across the 8 rows produced above. */
2022 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2023 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2024 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2025 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2027 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2028 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2029 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2030 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2033 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2034 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2035 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2036 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/*
 * Intra variant of the 8x8 SATD: transforms src directly (no reference
 * block) and subtracts the DC term (|temp[0]+temp[8*4]| line below) so the
 * score measures AC energy only.
 * NOTE(review): same truncation as hadamard8_diff8x8_c — declarations,
 * loop headers and the return are missing.
 */
2041 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2049 //FIXME try pointer walks
2050 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2051 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2052 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2053 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2055 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2056 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2057 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2058 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2060 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2061 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2062 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2063 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2067 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2068 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2069 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2070 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2072 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2073 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2074 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2075 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2078 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2079 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2080 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2081 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2084 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/*
 * DCT-domain SAD: forward-DCT the 8x8 residual and sum the absolute
 * coefficient values via the DSPContext helpers.
 * NOTE(review): the assert(h==8) and fdct call between lines 2095 and 2097
 * are missing from this chunk.
 */
2089 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2090 MpegEncContext * const s= (MpegEncContext *)c;
2091 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2095 s->dsp.diff_pixels(temp, src1, src2, stride);
2097 return s->dsp.sum_abs_dctelem(temp);
/* NOTE(review): continuation body of the H.264-style DCT8_1D 1-D transform
 * macro whose `#define` header (and the DST(0)/DST(4) output lines) fall
 * outside this chunk — the 8-point butterfly below computes the even
 * (a0..a3) and odd (a4..a7) halves from sums/differences of mirrored
 * sample pairs. Restore the header before compiling. */
2102 const int s07 = SRC(0) + SRC(7);\
2103 const int s16 = SRC(1) + SRC(6);\
2104 const int s25 = SRC(2) + SRC(5);\
2105 const int s34 = SRC(3) + SRC(4);\
2106 const int a0 = s07 + s34;\
2107 const int a1 = s16 + s25;\
2108 const int a2 = s07 - s34;\
2109 const int a3 = s16 - s25;\
2110 const int d07 = SRC(0) - SRC(7);\
2111 const int d16 = SRC(1) - SRC(6);\
2112 const int d25 = SRC(2) - SRC(5);\
2113 const int d34 = SRC(3) - SRC(4);\
2114 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2115 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2116 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2117 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2119 DST(1, a4 + (a7>>2)) ;\
2120 DST(2, a2 + (a3>>1)) ;\
2121 DST(3, a5 + (a6>>2)) ;\
2123 DST(5, a6 - (a5>>2)) ;\
2124 DST(6, (a2>>1) - a3 ) ;\
2125 DST(7, (a4>>2) - a7 ) ;\
/*
 * H.264-transform SAD: applies DCT8_1D to the residual rows in place, then
 * to the columns while accumulating |coefficient| via the redefined DST.
 * NOTE(review): the dct[][] declaration, sum init/return and #undef lines
 * are missing from this chunk.
 */
2128 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2129 MpegEncContext * const s= (MpegEncContext *)c;
2134 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2136 #define SRC(x) dct[i][x]
2137 #define DST(x,v) dct[i][x]= v
2138 for( i = 0; i < 8; i++ )
2143 #define SRC(x) dct[x][i]
2144 #define DST(x,v) sum += FFABS(v)
2145 for( i = 0; i < 8; i++ )
/*
 * DCT-max metric: forward-DCT the residual and return the largest absolute
 * coefficient.
 * NOTE(review): the fdct call, sum init, loop header and return are missing
 * (line numbers jump 2160 -> 2164).
 */
2153 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2154 MpegEncContext * const s= (MpegEncContext *)c;
2155 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2160 s->dsp.diff_pixels(temp, src1, src2, stride);
2164 sum= FFMAX(sum, FFABS(temp[i]));
/*
 * Quantisation-PSNR metric: DCT the residual, keep a copy, quantise +
 * dequantise + IDCT it, and return the squared error against the saved
 * copy — i.e. the distortion introduced by quantisation alone.
 * NOTE(review): the fdct call, sum init/return and loop header are missing
 * from this chunk.
 */
2169 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2170 MpegEncContext * const s= (MpegEncContext *)c;
2171 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2172 DCTELEM * const bak = temp+64;
2178 s->dsp.diff_pixels(temp, src1, src2, stride);
2180 memcpy(bak, temp, 64*sizeof(DCTELEM));
2182 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2183 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2184 ff_simple_idct_8(temp); //FIXME
2187 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/*
 * Rate-distortion metric: quantises the 8x8 residual, counts the VLC bits
 * needed to code the coefficients (run/level walk over the scantable, with
 * escape costs for out-of-range levels), reconstructs the block and
 * returns distortion + lambda-weighted bit cost.
 * NOTE(review): several interior lines are missing (run/level bookkeeping,
 * `if(last>=start_i)` guards, intra/inter branch headers, dc init) — the
 * surviving lines below show only the skeleton of the walk.
 */
2192 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2193 MpegEncContext * const s= (MpegEncContext *)c;
2194 const uint8_t *scantable= s->intra_scantable.permutated;
2195 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2196 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2197 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2198 int i, last, run, bits, level, distortion, start_i;
2199 const int esc_length= s->ac_esc_length;
2201 uint8_t * last_length;
/* Work on local aligned copies so the reconstruction can write in place. */
2205 copy_block8(lsrc1, src1, 8, stride, 8);
2206 copy_block8(lsrc2, src2, 8, stride, 8);
2208 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2210 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2216 length = s->intra_ac_vlc_length;
2217 last_length= s->intra_ac_vlc_last_length;
2218 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2221 length = s->inter_ac_vlc_length;
2222 last_length= s->inter_ac_vlc_last_length;
/* Count the bits of all but the final coefficient. */
2227 for(i=start_i; i<last; i++){
2228 int j= scantable[i];
2233 if((level&(~127)) == 0){
2234 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* The final coefficient uses the "last" VLC table. */
2243 level= temp[i] + 64;
2247 if((level&(~127)) == 0){
2248 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2256 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2258 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2261 s->dsp.idct_add(lsrc2, 8, temp);
2263 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2265 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/*
 * Bit-cost metric: same VLC bit-counting walk as rd8x8_c but without the
 * reconstruction/distortion step — returns only the estimated bit count.
 * NOTE(review): same interior truncation as rd8x8_c (run/level updates,
 * branch headers, final return missing).
 */
2268 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2269 MpegEncContext * const s= (MpegEncContext *)c;
2270 const uint8_t *scantable= s->intra_scantable.permutated;
2271 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2272 int i, last, run, bits, level, start_i;
2273 const int esc_length= s->ac_esc_length;
2275 uint8_t * last_length;
2279 s->dsp.diff_pixels(temp, src1, src2, stride);
2281 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2287 length = s->intra_ac_vlc_length;
2288 last_length= s->intra_ac_vlc_last_length;
2289 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2292 length = s->inter_ac_vlc_length;
2293 last_length= s->inter_ac_vlc_last_length;
2298 for(i=start_i; i<last; i++){
2299 int j= scantable[i];
2304 if((level&(~127)) == 0){
2305 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2314 level= temp[i] + 64;
2318 if((level&(~127)) == 0){
2319 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical SAD metrics: sum of absolute differences between vertically
 * adjacent rows. VSAD_INTRA generates the intra (single-plane) variant for
 * a given width; vsad16_c compares the row-gradient of two planes.
 * NOTE(review): macro continuation lines (score init, loop close, return)
 * and the vsad16_c body framing are missing from this chunk. */
2327 #define VSAD_INTRA(size) \
2328 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2332 for(y=1; y<h; y++){ \
2333 for(x=0; x<size; x+=4){ \
2334 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2335 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
2345 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2350 for(x=0; x<16; x++){
2351 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Squared-difference counterparts of the VSAD metrics above, plus a small
 * int8-vs-int16 SSE helper used elsewhere in the codebase.
 * NOTE(review): the macro/function framing lines (score init, loop closes,
 * returns) are missing from this chunk. */
2360 #define SQ(a) ((a)*(a))
2361 #define VSSE_INTRA(size) \
2362 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2366 for(y=1; y<h; y++){ \
2367 for(x=0; x<size; x+=4){ \
2368 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2369 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
2379 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2384 for(x=0; x<16; x++){
2385 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2394 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2398 for(i=0; i<size; i++)
2399 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Build 16x16 comparators from the 8x8 kernels above by summing the four
 * 8x8 quadrant scores (WRAPPER8_16_SQ is defined elsewhere in this file).
 * NOTE(review): the #if/#endif guards around dct264_sad (lines 2406/2408)
 * are missing from this chunk. */
2403 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2404 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2405 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2407 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2409 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2410 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2411 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2412 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/**
 * Element-wise product of two float vectors: dst[i] = src0[i] * src1[i].
 * (Loop-variable declaration and closing brace lost in the excerpt; restored.)
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[i];
}
/**
 * Multiply src0 element-wise with src1 read backwards:
 * dst[i] = src0[i] * src1[len-1-i].
 * NOTE(review): the excerpt shows the original indexing src1[-i], which
 * implies a lost "src1 += len-1;" preamble; rewritten with non-negative
 * indices for the identical result.
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - 1 - i];
}
/**
 * Fused multiply-add over float vectors: dst[i] = src0[i]*src1[i] + src2[i].
 * (Loop-variable declaration and closing brace lost in the excerpt; restored.)
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}
/**
 * Overlap-and-window two float half-buffers (MDCT overlap-add):
 * win holds 2*len coefficients; for k in [0,len):
 *   dst[k]           = src0[k]*win[2*len-1-k] - src1[len-1-k]*win[k]
 *   dst[2*len-1-k]   = src0[k]*win[k]        + src1[len-1-k]*win[2*len-1-k]
 * NOTE(review): the original iterated with negative indices after advancing
 * dst/win/src0 by len (those preamble lines were lost in the excerpt); this
 * index-from-zero form computes the identical values.
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int k;
    for (k = 0; k < len; k++) {
        float s0 = src0[k];
        float s1 = src1[len - 1 - k];
        float wi = win[k];
        float wj = win[2 * len - 1 - k];
        dst[k]               = s0 * wj - s1 * wi;
        dst[2 * len - 1 - k] = s0 * wi + s1 * wj;
    }
}
/**
 * Scale a float vector by a scalar: dst[i] = src[i] * mul.
 * (Signature continuation and loop declaration lost in the excerpt; restored.)
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}
/**
 * In-place butterfly over two float vectors:
 * v1[i] becomes v1[i]+v2[i], v2[i] becomes the old v1[i]-v2[i].
 * restrict: caller guarantees the vectors do not alias.
 * (Missing sum line and closing braces restored from the excerpt's gaps.)
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i]  = t;
    }
}
/**
 * Dot product of two float vectors.
 *
 * @return sum over i of v1[i]*v2[i]
 * (Accumulator declaration and return lost in the excerpt; restored.)
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}
/**
 * Clip one float (given as its IEEE-754 bit pattern) to [min, max] where
 * min < 0 < max, using only unsigned integer compares:
 * - negative floats have the sign bit set, so as unsigned ints they are all
 *   larger than any positive pattern; a > mini means "a is a negative float
 *   below min" -> clamp to min.
 * - flipping the sign bit of a positive pattern and comparing against
 *   maxi^signbit detects "positive float above max" -> clamp to max.
 * (The final "return a;" line was lost in the excerpt; restored.)
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if (a > mini)
        return mini;
    if ((a ^ (1U << 31)) > maxisign)
        return maxi;
    return a;
}
/**
 * Clip a float vector to [*min, *max] where *min < 0 < *max, operating on
 * IEEE-754 bit patterns (see clipf_c_one). len must be a multiple of 8.
 * NOTE(review): type-puns float<->uint32_t via pointer casts, as the
 * original did — kept for bit-exactness, though it technically violates
 * strict aliasing. The 8x manual unroll is expressed as an inner loop.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U << 31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;

    for (i = 0; i < len; i += 8) {
        int k;
        for (k = 0; k < 8; k++)
            dsti[i + k] = clipf_c_one(srci[i + k], mini, maxi, maxisign);
    }
}
/**
 * Clip a float vector to [min, max]. len must be a multiple of 8.
 * A range straddling zero takes the integer bit-pattern fast path;
 * otherwise each element is clamped with av_clipf. The original 8x
 * unroll is expressed as an inner loop (same element order).
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for (i = 0; i < len; i += 8) {
            int k;
            for (k = 0; k < 8; k++)
                dst[i + k] = av_clipf(src[i + k], min, max);
        }
    }
}
/**
 * Dot product of two int16 vectors, with each partial product arithmetically
 * shifted right by 'shift' before accumulation.
 * (Accumulator, loop header and return lost in the excerpt; restored.)
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}
/**
 * Dot product of v1 and v2 while simultaneously updating v1 in place:
 * res += v1[i]*v2[i]; v1[i] += mul*v3[i]  (v1 read before the update).
 * (Accumulator, loop header and return lost in the excerpt; restored.)
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;

    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}
/**
 * Apply a symmetric Q15 window to an int16 buffer: window[] holds the first
 * half (len/2 coefficients); coefficient i windows both input[i] and its
 * mirror input[len-1-i], with round-to-nearest on the >>15 descale.
 * NOTE(review): the original used MUL16(); expanded here to the plain
 * product, which is what the generic macro evaluates to.
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w = window[i];
        output[i]           = (input[i]           * w + (1 << 14)) >> 15;
        output[len - i - 1] = (input[len - i - 1] * w + (1 << 14)) >> 15;
    }
}
/**
 * Clamp an int32 vector to [min, max], 8 elements per iteration.
 * len must be a non-zero multiple of 8 (the do/while processes at least 8).
 * The original 8x manual unroll is expressed as an inner loop, and av_clip
 * is expanded to an explicit clamp (identical result for min <= max).
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;
        for (k = 0; k < 8; k++) {
            int32_t v = *src++;
            if (v < min)
                v = min;
            else if (v > max)
                v = max;
            *dst++ = v;
        }
        len -= 8;
    } while (len > 0);
}
/* 12-bit fixed-point constants for the WMV2 IDCT below:
 * W1..W7 = round(2048 * sqrt(2) * cos(k*pi/16)); W0 = 2048 is the plain
 * DC weight used for the b[0]/b[4] pair (the overall scaling works out in
 * the >>8 / >>14 descales of the row and column passes).
 * NOTE(review): the W0 define sat on a line elided from this excerpt
 * (it is used by wmv2_idct_row/col); restored here — an identical
 * duplicate definition is harmless in C. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
2583 static void wmv2_idct_row(short * b)
2586 int a0,a1,a2,a3,a4,a5,a6,a7;
2588 a1 = W1*b[1]+W7*b[7];
2589 a7 = W7*b[1]-W1*b[7];
2590 a5 = W5*b[5]+W3*b[3];
2591 a3 = W3*b[5]-W5*b[3];
2592 a2 = W2*b[2]+W6*b[6];
2593 a6 = W6*b[2]-W2*b[6];
2594 a0 = W0*b[0]+W0*b[4];
2595 a4 = W0*b[0]-W0*b[4];
2597 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2598 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2600 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2601 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2602 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2603 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2604 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2605 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2606 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2607 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2609 static void wmv2_idct_col(short * b)
2612 int a0,a1,a2,a3,a4,a5,a6,a7;
2613 /*step 1, with extended precision*/
2614 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2615 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2616 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2617 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2618 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2619 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2620 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2621 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
2623 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2624 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2626 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2627 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2628 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2629 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2631 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2632 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2633 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2634 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/**
 * Full 2-D WMV2 IDCT of an 8x8 block, in place: a 1-D pass over each of
 * the 8 rows, then over each of the 8 columns.
 * (The two for-loop headers were lost in the excerpt; restored.)
 */
void ff_wmv2_idct_c(short * block){
    int i;

    for (i = 0; i < 64; i += 8)
        wmv2_idct_row(block + i);
    for (i = 0; i < 8; i++)
        wmv2_idct_col(block + i);
}
2646 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2648 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2650 ff_wmv2_idct_c(block);
2651 ff_put_pixels_clamped_c(block, dest, line_size);
2653 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2655 ff_wmv2_idct_c(block);
2656 ff_add_pixels_clamped_c(block, dest, line_size);
2658 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2661 ff_put_pixels_clamped_c(block, dest, line_size);
2663 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2666 ff_add_pixels_clamped_c(block, dest, line_size);
2669 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2672 put_pixels_clamped4_c(block, dest, line_size);
2674 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2677 add_pixels_clamped4_c(block, dest, line_size);
2680 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2683 put_pixels_clamped2_c(block, dest, line_size);
2685 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2688 add_pixels_clamped2_c(block, dest, line_size);
2691 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2693 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2695 dest[0] = cm[(block[0] + 4)>>3];
2697 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2699 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2701 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2704 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2706 /* init static data */
2707 av_cold void dsputil_static_init(void)
2711 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2712 for(i=0;i<MAX_NEG_CROP;i++) {
2714 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2717 for(i=0;i<512;i++) {
2718 ff_squareTbl[i] = (i - 256) * (i - 256);
2721 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2724 int ff_check_alignment(void){
2725 static int did_fail=0;
2726 LOCAL_ALIGNED_16(int, aligned, [4]);
2728 if((intptr_t)aligned & 15){
2730 #if HAVE_MMX || HAVE_ALTIVEC
2731 av_log(NULL, AV_LOG_ERROR,
2732 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2733 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2734 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2735 "Do not report crashes to Libav developers.\n");
2744 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2748 ff_check_alignment();
2751 if (avctx->bits_per_raw_sample == 10) {
2752 c->fdct = ff_jpeg_fdct_islow_10;
2753 c->fdct248 = ff_fdct248_islow_10;
2755 if(avctx->dct_algo==FF_DCT_FASTINT) {
2756 c->fdct = fdct_ifast;
2757 c->fdct248 = fdct_ifast248;
2759 else if(avctx->dct_algo==FF_DCT_FAAN) {
2760 c->fdct = ff_faandct;
2761 c->fdct248 = ff_faandct248;
2764 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2765 c->fdct248 = ff_fdct248_islow_8;
2768 #endif //CONFIG_ENCODERS
2770 if(avctx->lowres==1){
2771 c->idct_put= ff_jref_idct4_put;
2772 c->idct_add= ff_jref_idct4_add;
2773 c->idct = j_rev_dct4;
2774 c->idct_permutation_type= FF_NO_IDCT_PERM;
2775 }else if(avctx->lowres==2){
2776 c->idct_put= ff_jref_idct2_put;
2777 c->idct_add= ff_jref_idct2_add;
2778 c->idct = j_rev_dct2;
2779 c->idct_permutation_type= FF_NO_IDCT_PERM;
2780 }else if(avctx->lowres==3){
2781 c->idct_put= ff_jref_idct1_put;
2782 c->idct_add= ff_jref_idct1_add;
2783 c->idct = j_rev_dct1;
2784 c->idct_permutation_type= FF_NO_IDCT_PERM;
2786 if (avctx->bits_per_raw_sample == 10) {
2787 c->idct_put = ff_simple_idct_put_10;
2788 c->idct_add = ff_simple_idct_add_10;
2789 c->idct = ff_simple_idct_10;
2790 c->idct_permutation_type = FF_NO_IDCT_PERM;
2792 if(avctx->idct_algo==FF_IDCT_INT){
2793 c->idct_put= ff_jref_idct_put;
2794 c->idct_add= ff_jref_idct_add;
2795 c->idct = j_rev_dct;
2796 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2797 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2798 avctx->idct_algo==FF_IDCT_VP3){
2799 c->idct_put= ff_vp3_idct_put_c;
2800 c->idct_add= ff_vp3_idct_add_c;
2801 c->idct = ff_vp3_idct_c;
2802 c->idct_permutation_type= FF_NO_IDCT_PERM;
2803 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2804 c->idct_put= ff_wmv2_idct_put_c;
2805 c->idct_add= ff_wmv2_idct_add_c;
2806 c->idct = ff_wmv2_idct_c;
2807 c->idct_permutation_type= FF_NO_IDCT_PERM;
2808 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2809 c->idct_put= ff_faanidct_put;
2810 c->idct_add= ff_faanidct_add;
2811 c->idct = ff_faanidct;
2812 c->idct_permutation_type= FF_NO_IDCT_PERM;
2813 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2814 c->idct_put= ff_ea_idct_put_c;
2815 c->idct_permutation_type= FF_NO_IDCT_PERM;
2816 }else{ //accurate/default
2817 c->idct_put = ff_simple_idct_put_8;
2818 c->idct_add = ff_simple_idct_add_8;
2819 c->idct = ff_simple_idct_8;
2820 c->idct_permutation_type= FF_NO_IDCT_PERM;
2825 c->diff_pixels = diff_pixels_c;
2826 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2827 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2828 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2829 c->sum_abs_dctelem = sum_abs_dctelem_c;
2832 c->pix_sum = pix_sum_c;
2833 c->pix_norm1 = pix_norm1_c;
2835 c->fill_block_tab[0] = fill_block16_c;
2836 c->fill_block_tab[1] = fill_block8_c;
2838 /* TODO [0] 16 [1] 8 */
2839 c->pix_abs[0][0] = pix_abs16_c;
2840 c->pix_abs[0][1] = pix_abs16_x2_c;
2841 c->pix_abs[0][2] = pix_abs16_y2_c;
2842 c->pix_abs[0][3] = pix_abs16_xy2_c;
2843 c->pix_abs[1][0] = pix_abs8_c;
2844 c->pix_abs[1][1] = pix_abs8_x2_c;
2845 c->pix_abs[1][2] = pix_abs8_y2_c;
2846 c->pix_abs[1][3] = pix_abs8_xy2_c;
2848 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2849 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2850 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2851 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2852 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2853 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2854 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2855 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2856 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2858 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2859 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2860 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2861 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2862 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2863 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2864 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2865 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2866 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2868 #define dspfunc(PFX, IDX, NUM) \
2869 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2870 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2871 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2872 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2873 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2874 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2875 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2876 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2877 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2878 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2879 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2880 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2881 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2882 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2883 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2884 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2886 dspfunc(put_qpel, 0, 16);
2887 dspfunc(put_no_rnd_qpel, 0, 16);
2889 dspfunc(avg_qpel, 0, 16);
2890 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2892 dspfunc(put_qpel, 1, 8);
2893 dspfunc(put_no_rnd_qpel, 1, 8);
2895 dspfunc(avg_qpel, 1, 8);
2896 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2900 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2901 ff_mlp_init(c, avctx);
2903 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2904 ff_intrax8dsp_init(c,avctx);
2907 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2908 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2909 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2910 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2911 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2912 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2913 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2914 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2916 #define SET_CMP_FUNC(name) \
2917 c->name[0]= name ## 16_c;\
2918 c->name[1]= name ## 8x8_c;
2920 SET_CMP_FUNC(hadamard8_diff)
2921 c->hadamard8_diff[4]= hadamard8_intra16_c;
2922 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2923 SET_CMP_FUNC(dct_sad)
2924 SET_CMP_FUNC(dct_max)
2926 SET_CMP_FUNC(dct264_sad)
2928 c->sad[0]= pix_abs16_c;
2929 c->sad[1]= pix_abs8_c;
2933 SET_CMP_FUNC(quant_psnr)
2936 c->vsad[0]= vsad16_c;
2937 c->vsad[4]= vsad_intra16_c;
2938 c->vsad[5]= vsad_intra8_c;
2939 c->vsse[0]= vsse16_c;
2940 c->vsse[4]= vsse_intra16_c;
2941 c->vsse[5]= vsse_intra8_c;
2942 c->nsse[0]= nsse16_c;
2943 c->nsse[1]= nsse8_c;
2945 ff_dsputil_init_dwt(c);
2948 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2950 c->add_bytes= add_bytes_c;
2951 c->add_bytes_l2= add_bytes_l2_c;
2952 c->diff_bytes= diff_bytes_c;
2953 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2954 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2955 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2956 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2957 c->bswap_buf= bswap_buf;
2958 c->bswap16_buf = bswap16_buf;
2959 #if CONFIG_PNG_DECODER
2960 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
2963 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2964 c->h263_h_loop_filter= h263_h_loop_filter_c;
2965 c->h263_v_loop_filter= h263_v_loop_filter_c;
2968 if (CONFIG_VP3_DECODER) {
2969 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
2970 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
2971 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
2974 c->h261_loop_filter= h261_loop_filter_c;
2976 c->try_8x8basis= try_8x8basis_c;
2977 c->add_8x8basis= add_8x8basis_c;
2979 #if CONFIG_VORBIS_DECODER
2980 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
2982 #if CONFIG_AC3_DECODER
2983 c->ac3_downmix = ff_ac3_downmix_c;
2985 c->vector_fmul = vector_fmul_c;
2986 c->vector_fmul_reverse = vector_fmul_reverse_c;
2987 c->vector_fmul_add = vector_fmul_add_c;
2988 c->vector_fmul_window = vector_fmul_window_c;
2989 c->vector_clipf = vector_clipf_c;
2990 c->scalarproduct_int16 = scalarproduct_int16_c;
2991 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
2992 c->apply_window_int16 = apply_window_int16_c;
2993 c->vector_clip_int32 = vector_clip_int32_c;
2994 c->scalarproduct_float = scalarproduct_float_c;
2995 c->butterflies_float = butterflies_float_c;
2996 c->vector_fmul_scalar = vector_fmul_scalar_c;
2998 c->shrink[0]= av_image_copy_plane;
2999 c->shrink[1]= ff_shrink22;
3000 c->shrink[2]= ff_shrink44;
3001 c->shrink[3]= ff_shrink88;
3003 c->prefetch= just_return;
3005 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3006 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3010 #define FUNC(f, depth) f ## _ ## depth
3011 #define FUNCC(f, depth) f ## _ ## depth ## _c
3013 #define dspfunc1(PFX, IDX, NUM, depth)\
3014 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
3015 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3016 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3017 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3019 #define dspfunc2(PFX, IDX, NUM, depth)\
3020 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3021 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3022 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3023 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3024 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3025 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3026 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3027 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3028 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3029 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3030 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3031 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3032 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3033 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3034 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3035 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3038 #define BIT_DEPTH_FUNCS(depth, dct)\
3039 c->get_pixels = FUNCC(get_pixels ## dct , depth);\
3040 c->draw_edges = FUNCC(draw_edges , depth);\
3041 c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
3042 c->clear_block = FUNCC(clear_block ## dct , depth);\
3043 c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\
3044 c->add_pixels8 = FUNCC(add_pixels8 ## dct , depth);\
3045 c->add_pixels4 = FUNCC(add_pixels4 ## dct , depth);\
3046 c->put_no_rnd_pixels_l2[0] = FUNCC(put_no_rnd_pixels16_l2, depth);\
3047 c->put_no_rnd_pixels_l2[1] = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3049 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3050 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3051 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3052 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3053 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3054 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3056 dspfunc1(put , 0, 16, depth);\
3057 dspfunc1(put , 1, 8, depth);\
3058 dspfunc1(put , 2, 4, depth);\
3059 dspfunc1(put , 3, 2, depth);\
3060 dspfunc1(put_no_rnd, 0, 16, depth);\
3061 dspfunc1(put_no_rnd, 1, 8, depth);\
3062 dspfunc1(avg , 0, 16, depth);\
3063 dspfunc1(avg , 1, 8, depth);\
3064 dspfunc1(avg , 2, 4, depth);\
3065 dspfunc1(avg , 3, 2, depth);\
3066 dspfunc1(avg_no_rnd, 0, 16, depth);\
3067 dspfunc1(avg_no_rnd, 1, 8, depth);\
3069 dspfunc2(put_h264_qpel, 0, 16, depth);\
3070 dspfunc2(put_h264_qpel, 1, 8, depth);\
3071 dspfunc2(put_h264_qpel, 2, 4, depth);\
3072 dspfunc2(put_h264_qpel, 3, 2, depth);\
3073 dspfunc2(avg_h264_qpel, 0, 16, depth);\
3074 dspfunc2(avg_h264_qpel, 1, 8, depth);\
3075 dspfunc2(avg_h264_qpel, 2, 4, depth);
3077 switch (avctx->bits_per_raw_sample) {
3079 if (c->dct_bits == 32) {
3080 BIT_DEPTH_FUNCS(9, _32);
3082 BIT_DEPTH_FUNCS(9, _16);
3086 if (c->dct_bits == 32) {
3087 BIT_DEPTH_FUNCS(10, _32);
3089 BIT_DEPTH_FUNCS(10, _16);
3093 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3095 BIT_DEPTH_FUNCS(8, _16);
3100 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
3101 if (ARCH_ARM) dsputil_init_arm (c, avctx);
3102 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
3103 if (HAVE_VIS) dsputil_init_vis (c, avctx);
3104 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
3105 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
3106 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
3107 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
3108 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
3110 for(i=0; i<64; i++){
3111 if(!c->put_2tap_qpel_pixels_tab[0][i])
3112 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3113 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3114 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3117 switch(c->idct_permutation_type){
3118 case FF_NO_IDCT_PERM:
3120 c->idct_permutation[i]= i;
3122 case FF_LIBMPEG2_IDCT_PERM:
3124 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3126 case FF_SIMPLE_IDCT_PERM:
3128 c->idct_permutation[i]= simple_mmx_permutation[i];
3130 case FF_TRANSPOSE_IDCT_PERM:
3132 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3134 case FF_PARTTRANS_IDCT_PERM:
3136 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3138 case FF_SSE2_IDCT_PERM:
3140 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3143 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");