3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Clip lookup table: index with (value + MAX_NEG_CROP) to clamp a possibly
 * out-of-range value to 0..255. Zero-initialized here; filled at init time. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Indexed as (ff_squareTbl + 256)[d] for d in [-256,255]; presumably d*d
 * (see its use in sse*_c / pix_norm1_c) -- filled at init time. */
uint32_t ff_squareTbl[512] = {0, };
47 #include "dsputil_template.c"
51 #include "dsputil_template.c"
55 #include "dsputil_template.c"
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 == 0x0101...01, so multiplying broadcasts the byte 0x7f / 0x80
 * into every byte of the native unsigned long. */
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
/* Classic JPEG/MPEG zigzag scan: entry i is the raster-order index of the
 * i-th coefficient visited. */
const uint8_t ff_zigzag_direct[64] = {
0, 1, 8, 16, 9, 2, 3, 10,
17, 24, 32, 25, 18, 11, 4, 5,
12, 19, 26, 33, 40, 48, 41, 34,
27, 20, 13, 6, 7, 14, 21, 28,
35, 42, 49, 56, 57, 50, 43, 36,
29, 22, 15, 23, 30, 37, 44, 51,
58, 59, 52, 45, 38, 31, 39, 46,
53, 60, 61, 54, 47, 55, 62, 63
/* Specific zigzag scan for the 2-4-8 (248) IDCT. NOTE that, unlike the
   specification, we interleave the fields. */
const uint8_t ff_zigzag248_direct[64] = {
0, 8, 1, 9, 16, 24, 2, 10,
17, 25, 32, 40, 48, 56, 33, 41,
18, 26, 3, 11, 4, 12, 19, 27,
34, 42, 49, 57, 50, 58, 35, 43,
20, 28, 5, 13, 6, 14, 21, 29,
36, 44, 51, 59, 52, 60, 37, 45,
22, 30, 7, 15, 23, 31, 38, 46,
53, 61, 54, 62, 39, 47, 55, 63,
/* Not permuted: inverse of zigzag_direct, plus 1, for the MMX quantizer.
 * Filled at init time. */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate horizontal-first coefficient scan pattern. */
const uint8_t ff_alternate_horizontal_scan[64] = {
0, 1, 2, 3, 8, 9, 16, 17,
10, 11, 4, 5, 6, 7, 15, 14,
13, 12, 19, 18, 24, 25, 32, 33,
26, 27, 20, 21, 22, 23, 28, 29,
30, 31, 34, 35, 40, 41, 48, 49,
42, 43, 36, 37, 38, 39, 44, 45,
46, 47, 50, 51, 56, 57, 58, 59,
52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical-first coefficient scan pattern (presumably the
 * interlace-oriented alternate scan -- confirm against the spec). */
const uint8_t ff_alternate_vertical_scan[64] = {
0, 8, 16, 24, 1, 9, 2, 10,
17, 25, 32, 40, 48, 56, 57, 49,
41, 33, 26, 18, 3, 11, 4, 12,
19, 27, 34, 42, 50, 58, 35, 43,
51, 59, 20, 28, 5, 13, 6, 14,
21, 29, 36, 44, 52, 60, 37, 45,
53, 61, 22, 30, 7, 15, 23, 31,
38, 46, 54, 62, 39, 47, 55, 63,
/* Input permutation for the simple_idct_mmx; entries are raster indices
 * written in hex (0x00..0x3F, i.e. row*8+col). */
static const uint8_t simple_mmx_permutation[64]={
0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
122 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Initialize a ScanTable from a raw scan order: keep a reference to the
 * source table and build CPU-specific variants by applying @permutation
 * (the IDCT's input permutation) to each scan entry. raster_end appears to
 * cache a per-position "end" bound -- NOTE(review): loop details are not
 * fully visible here; confirm against the full source. */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
st->scantable= src_scantable;
j = src_scantable[i];                 /* scan position i -> raster index j */
st->permutated[i] = permutation[j];   /* apply the IDCT input permutation */
j = st->permutated[i];
st->raster_end[i]= end;
/* Sum of all pixel values of a 16x16 block (16 rows, processed 8 pixels
 * per inner-loop step). */
static int pix_sum_c(uint8_t * pix, int line_size)
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
pix += line_size - 16;   /* step to the next row (16 pixels consumed) */
/* Sum of squares of all pixels of a 16x16 block. Pixels are loaded a word
 * at a time and each byte's square is looked up in ff_squareTbl.
 * NOTE(review): the casted uint64_t/uint32_t loads assume suitable
 * alignment and ignore strict aliasing -- kept as-is (long-standing code). */
static int pix_norm1_c(uint8_t * pix, int line_size)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
register uint64_t x=*(uint64_t*)pix;   /* 64-bit path: 8 pixels per load */
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
s += sq[(x>>32)&0xff];
s += sq[(x>>40)&0xff];
s += sq[(x>>48)&0xff];
s += sq[(x>>56)&0xff];
register uint32_t x=*(uint32_t*)pix;   /* 32-bit path: two 4-pixel loads */
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
x=*(uint32_t*)(pix+4);
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
pix += line_size - 16;   /* step to the next row */
/* Byte-swap w 32-bit words from src into dst. Main loop is unrolled by 8;
 * the trailing words are handled one at a time. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
for(i=0; i+8<=w; i+=8){
dst[i+0]= av_bswap32(src[i+0]);
dst[i+1]= av_bswap32(src[i+1]);
dst[i+2]= av_bswap32(src[i+2]);
dst[i+3]= av_bswap32(src[i+3]);
dst[i+4]= av_bswap32(src[i+4]);
dst[i+5]= av_bswap32(src[i+5]);
dst[i+6]= av_bswap32(src[i+6]);
dst[i+7]= av_bswap32(src[i+7]);
dst[i+0]= av_bswap32(src[i+0]);   /* tail: remaining words, one per step */
/* Byte-swap len 16-bit values from src into dst. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
*dst++ = av_bswap16(*src++);
/* Sum of squared differences over a 4-pixel-wide block, h rows.
 * The unused void* first argument matches the compare-function-pointer
 * signature used by the motion estimation code. */
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
uint32_t *sq = ff_squareTbl + 256;   /* centered so negative diffs index correctly */
for (i = 0; i < h; i++) {
s += sq[pix1[0] - pix2[0]];
s += sq[pix1[1] - pix2[1]];
s += sq[pix1[2] - pix2[2]];
s += sq[pix1[3] - pix2[3]];
/* Sum of squared differences over an 8-pixel-wide block, h rows
 * (see sse4_c for the calling convention). */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < h; i++) {
s += sq[pix1[0] - pix2[0]];
s += sq[pix1[1] - pix2[1]];
s += sq[pix1[2] - pix2[2]];
s += sq[pix1[3] - pix2[3]];
s += sq[pix1[4] - pix2[4]];
s += sq[pix1[5] - pix2[5]];
s += sq[pix1[6] - pix2[6]];
s += sq[pix1[7] - pix2[7]];
/* Sum of squared differences over a 16-pixel-wide block, h rows
 * (see sse4_c for the calling convention). */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < h; i++) {
s += sq[pix1[ 0] - pix2[ 0]];
s += sq[pix1[ 1] - pix2[ 1]];
s += sq[pix1[ 2] - pix2[ 2]];
s += sq[pix1[ 3] - pix2[ 3]];
s += sq[pix1[ 4] - pix2[ 4]];
s += sq[pix1[ 5] - pix2[ 5]];
s += sq[pix1[ 6] - pix2[ 6]];
s += sq[pix1[ 7] - pix2[ 7]];
s += sq[pix1[ 8] - pix2[ 8]];
s += sq[pix1[ 9] - pix2[ 9]];
s += sq[pix1[10] - pix2[10]];
s += sq[pix1[11] - pix2[11]];
s += sq[pix1[12] - pix2[12]];
s += sq[pix1[13] - pix2[13]];
s += sq[pix1[14] - pix2[14]];
s += sq[pix1[15] - pix2[15]];
/* Compute the residual block[i] = s1[i] - s2[i] for an 8-pixel-wide row
 * (DCT input for inter coding). */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
const uint8_t *s2, int stride){
/* read the pixels */
block[0] = s1[0] - s2[0];
block[1] = s1[1] - s2[1];
block[2] = s1[2] - s2[2];
block[3] = s1[3] - s2[3];
block[4] = s1[4] - s2[4];
block[5] = s1[5] - s2[5];
block[6] = s1[6] - s2[6];
block[7] = s1[7] - s2[7];
/* Store one 8-wide row of DCT coefficients as pixels, clamped to 0..255
 * through the crop table (cm). */
void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
pixels[2] = cm[block[2]];
pixels[3] = cm[block[3]];
pixels[4] = cm[block[4]];
pixels[5] = cm[block[5]];
pixels[6] = cm[block[6]];
pixels[7] = cm[block[7]];
/* 4-pixel-wide variant of ff_put_pixels_clamped_c. */
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
pixels[2] = cm[block[2]];
pixels[3] = cm[block[3]];
/* 2-pixel-wide variant of ff_put_pixels_clamped_c. */
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
/* Store an 8x8 block of signed coefficients as pixels: values are clamped
 * to [-128,127] then biased by +128 into the 0..255 range. */
void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
uint8_t *restrict pixels,
for (i = 0; i < 8; i++) {
for (j = 0; j < 8; j++) {
else if (*block > 127)
*pixels = (uint8_t)(*block + 128);   /* in-range case: bias to unsigned */
pixels += (line_size - 8);   /* step to the next row */
/* Add one 8-wide row of DCT coefficients to existing pixels, clamping the
 * result to 0..255 through the crop table (cm). */
void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
pixels[2] = cm[pixels[2] + block[2]];
pixels[3] = cm[pixels[3] + block[3]];
pixels[4] = cm[pixels[4] + block[4]];
pixels[5] = cm[pixels[5] + block[5]];
pixels[6] = cm[pixels[6] + block[6]];
pixels[7] = cm[pixels[7] + block[7]];
/* 4-pixel-wide variant of ff_add_pixels_clamped_c. */
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
pixels[2] = cm[pixels[2] + block[2]];
pixels[3] = cm[pixels[3] + block[3]];
/* 2-pixel-wide variant of ff_add_pixels_clamped_c. */
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
/* Sum of absolute values of the block's DCT coefficients. */
static int sum_abs_dctelem_c(DCTELEM *block)
sum+= FFABS(block[i]);
/* Fill an h-row, 16-pixel-wide block with a constant byte value. */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
for (i = 0; i < h; i++) {
memset(block, value, 16);
/* Fill an h-row, 8-pixel-wide block with a constant byte value. */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
for (i = 0; i < h; i++) {
memset(block, value, 8);
/* Rounding averages of 2 and 4 values.
 * NOTE(review): arguments are unparenthesized and may be evaluated more
 * than once -- callers must pass simple side-effect-free expressions. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* One-motion-point GMC: bilinear interpolation with 1/16-pel fractional
 * offsets (x16, y16). A..D are the four corner weights and sum to 256,
 * hence the >>8 after adding the rounder. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
const int A=(16-x16)*(16-y16);
const int B=( x16)*(16-y16);
const int C=(16-x16)*( y16);
const int D=( x16)*( y16);
dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* Global motion compensation: for each destination pixel, sample the source
 * at a transformed position (dxx/dxy/dyx/dyy matrix, ox/oy offset, 1<<shift
 * subpel denominator) with bilinear interpolation. Samples falling outside
 * the image are clamped to the nearest edge pixel; the four branches below
 * cover in/out of bounds for x and y independently. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
const int s= 1<<shift;   /* subpel scale: frac_x/frac_y are in [0, s) */
for(x=0; x<8; x++){ //XXX FIXME optimize
int src_x, src_y, frac_x, frac_y, index;
if((unsigned)src_x < width){   /* unsigned compare also rejects negatives */
if((unsigned)src_y < height){
index= src_x + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*(s-frac_y)
+ ( src[index+stride ]*(s-frac_x)
+ src[index+stride+1]* frac_x )* frac_y
index= src_x + av_clip(src_y, 0, height)*stride;   /* y out: clamp row */
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*s
if((unsigned)src_y < height){
index= av_clip(src_x, 0, width) + src_y*stride;    /* x out: clamp column */
dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
+ src[index+stride ]* frac_y )*s
index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;   /* both out */
dst[y*stride + x]= src[index ];
/* Third-pel MC, no sub-pel offset: plain block copy, dispatched on width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
case 2: put_pixels2_8_c (dst, src, stride, height); break;
case 4: put_pixels4_8_c (dst, src, stride, height); break;
case 8: put_pixels8_8_c (dst, src, stride, height); break;
case 16:put_pixels16_8_c(dst, src, stride, height); break;
/* Third-pel MC, horizontal offset 1/3: dst = round((2*a + b)/3),
 * computed as 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Third-pel MC, horizontal offset 2/3: dst = round((a + 2*b)/3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Third-pel MC, vertical offset 1/3: dst = round((2*a + below)/3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Third-pel MC, offset (1/3, 1/3): weighted 2x2 average, 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, offset (1/3, 2/3): weighted 2x2 average. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, vertical offset 2/3: dst = round((a + 2*below)/3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Third-pel MC, offset (2/3, 1/3): weighted 2x2 average. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, offset (2/3, 2/3): weighted 2x2 average. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Averaging third-pel MC, no sub-pel offset: average src into dst,
 * dispatched on width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
case 2: avg_pixels2_8_c (dst, src, stride, height); break;
case 4: avg_pixels4_8_c (dst, src, stride, height); break;
case 8: avg_pixels8_8_c (dst, src, stride, height); break;
case 16:avg_pixels16_8_c(dst, src, stride, height); break;
/* Averaging variant of put_tpel_pixels_mc10_c: interpolated value is
 * rounding-averaged with the existing dst pixel. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc20_c. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc01_c. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc11_c. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc12_c. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc02_c. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc21_c. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc22_c. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
767 #define QPEL_MC(r, OPNAME, RND, OP) \
768 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
769 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
773 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
774 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
775 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
776 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
777 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
778 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
779 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
780 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
786 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
788 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
792 const int src0= src[0*srcStride];\
793 const int src1= src[1*srcStride];\
794 const int src2= src[2*srcStride];\
795 const int src3= src[3*srcStride];\
796 const int src4= src[4*srcStride];\
797 const int src5= src[5*srcStride];\
798 const int src6= src[6*srcStride];\
799 const int src7= src[7*srcStride];\
800 const int src8= src[8*srcStride];\
801 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
802 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
803 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
804 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
805 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
806 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
807 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
808 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
814 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
815 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
820 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
821 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
822 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
823 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
824 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
825 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
826 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
827 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
828 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
829 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
830 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
831 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
832 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
833 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
834 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
835 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
841 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
842 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
847 const int src0= src[0*srcStride];\
848 const int src1= src[1*srcStride];\
849 const int src2= src[2*srcStride];\
850 const int src3= src[3*srcStride];\
851 const int src4= src[4*srcStride];\
852 const int src5= src[5*srcStride];\
853 const int src6= src[6*srcStride];\
854 const int src7= src[7*srcStride];\
855 const int src8= src[8*srcStride];\
856 const int src9= src[9*srcStride];\
857 const int src10= src[10*srcStride];\
858 const int src11= src[11*srcStride];\
859 const int src12= src[12*srcStride];\
860 const int src13= src[13*srcStride];\
861 const int src14= src[14*srcStride];\
862 const int src15= src[15*srcStride];\
863 const int src16= src[16*srcStride];\
864 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
865 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
866 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
867 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
868 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
869 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
870 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
871 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
872 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
873 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
874 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
875 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
876 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
877 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
878 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
879 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
885 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
887 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
888 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
891 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
892 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
895 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
897 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
898 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
901 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
904 copy_block9(full, src, 16, stride, 9);\
905 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
906 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
909 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
911 copy_block9(full, src, 16, stride, 9);\
912 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
915 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
918 copy_block9(full, src, 16, stride, 9);\
919 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
920 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
922 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
927 copy_block9(full, src, 16, stride, 9);\
928 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
929 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
930 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
931 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
933 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
937 copy_block9(full, src, 16, stride, 9);\
938 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
939 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
940 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
941 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
943 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
948 copy_block9(full, src, 16, stride, 9);\
949 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
950 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
951 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
952 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
954 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
958 copy_block9(full, src, 16, stride, 9);\
959 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
960 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
961 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
962 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
964 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
969 copy_block9(full, src, 16, stride, 9);\
970 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
971 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
972 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
973 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
975 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
979 copy_block9(full, src, 16, stride, 9);\
980 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
981 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
982 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
983 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
985 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
990 copy_block9(full, src, 16, stride, 9);\
991 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
992 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
993 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
994 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
996 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1000 copy_block9(full, src, 16, stride, 9);\
1001 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1002 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1003 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1004 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1006 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1008 uint8_t halfHV[64];\
1009 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1010 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1011 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1013 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1015 uint8_t halfHV[64];\
1016 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1018 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1020 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1021 uint8_t full[16*9];\
1024 uint8_t halfHV[64];\
1025 copy_block9(full, src, 16, stride, 9);\
1026 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1028 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1029 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1031 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1032 uint8_t full[16*9];\
1034 copy_block9(full, src, 16, stride, 9);\
1035 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1036 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1037 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1039 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1040 uint8_t full[16*9];\
1043 uint8_t halfHV[64];\
1044 copy_block9(full, src, 16, stride, 9);\
1045 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1046 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1047 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1048 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1050 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1051 uint8_t full[16*9];\
1053 copy_block9(full, src, 16, stride, 9);\
1054 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1055 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1056 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1058 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1060 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1061 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1064 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1066 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1067 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1070 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1071 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1074 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1076 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1077 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1080 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1081 uint8_t full[24*17];\
1083 copy_block17(full, src, 24, stride, 17);\
1084 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1085 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1088 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1089 uint8_t full[24*17];\
1090 copy_block17(full, src, 24, stride, 17);\
1091 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1094 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1095 uint8_t full[24*17];\
1097 copy_block17(full, src, 24, stride, 17);\
1098 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1099 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1101 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1102 uint8_t full[24*17];\
1103 uint8_t halfH[272];\
1104 uint8_t halfV[256];\
1105 uint8_t halfHV[256];\
1106 copy_block17(full, src, 24, stride, 17);\
1107 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1108 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1109 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1110 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1112 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1113 uint8_t full[24*17];\
1114 uint8_t halfH[272];\
1115 uint8_t halfHV[256];\
1116 copy_block17(full, src, 24, stride, 17);\
1117 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1118 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1119 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1120 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1122 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1123 uint8_t full[24*17];\
1124 uint8_t halfH[272];\
1125 uint8_t halfV[256];\
1126 uint8_t halfHV[256];\
1127 copy_block17(full, src, 24, stride, 17);\
1128 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1129 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1130 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1131 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1133 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1134 uint8_t full[24*17];\
1135 uint8_t halfH[272];\
1136 uint8_t halfHV[256];\
1137 copy_block17(full, src, 24, stride, 17);\
1138 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1139 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1140 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1141 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1143 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1144 uint8_t full[24*17];\
1145 uint8_t halfH[272];\
1146 uint8_t halfV[256];\
1147 uint8_t halfHV[256];\
1148 copy_block17(full, src, 24, stride, 17);\
1149 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1150 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1151 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1152 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1154 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1155 uint8_t full[24*17];\
1156 uint8_t halfH[272];\
1157 uint8_t halfHV[256];\
1158 copy_block17(full, src, 24, stride, 17);\
1159 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1160 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1161 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1162 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1164 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1165 uint8_t full[24*17];\
1166 uint8_t halfH[272];\
1167 uint8_t halfV[256];\
1168 uint8_t halfHV[256];\
1169 copy_block17(full, src, 24, stride, 17);\
1170 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1171 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1172 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1173 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1175 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1176 uint8_t full[24*17];\
1177 uint8_t halfH[272];\
1178 uint8_t halfHV[256];\
1179 copy_block17(full, src, 24, stride, 17);\
1180 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1181 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1182 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1183 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1185 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1186 uint8_t halfH[272];\
1187 uint8_t halfHV[256];\
1188 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1189 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1190 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1192 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1193 uint8_t halfH[272];\
1194 uint8_t halfHV[256];\
1195 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1196 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1197 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1199 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1200 uint8_t full[24*17];\
1201 uint8_t halfH[272];\
1202 uint8_t halfV[256];\
1203 uint8_t halfHV[256];\
1204 copy_block17(full, src, 24, stride, 17);\
1205 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1207 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1208 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1210 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1211 uint8_t full[24*17];\
1212 uint8_t halfH[272];\
1213 copy_block17(full, src, 24, stride, 17);\
1214 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1215 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1216 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1218 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1219 uint8_t full[24*17];\
1220 uint8_t halfH[272];\
1221 uint8_t halfV[256];\
1222 uint8_t halfHV[256];\
1223 copy_block17(full, src, 24, stride, 17);\
1224 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1225 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1226 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1227 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1229 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1230 uint8_t full[24*17];\
1231 uint8_t halfH[272];\
1232 copy_block17(full, src, 24, stride, 17);\
1233 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1234 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1235 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1237 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1238 uint8_t halfH[272];\
1239 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1240 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel-store primitives plugged into QPEL_MC.  cm is the 0..255 clipping
 * table (ff_cropTbl + MAX_NEG_CROP); "+16 >> 5" implements rounded
 * normalization of the 6-tap qpel filter sum, while the no_rnd variants
 * use +15 so ties round down (required by the no-rounding MC mode). */
1243 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1244 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1245 #define op_put(a, b) a = cm[((b) + 16)>>5]
1246 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the three qpel MC function families: put, put_no_rnd, avg.
 * The avg_no_rnd family is intentionally not generated (see comment). */
1248 QPEL_MC(0, put_ , _ , op_put)
1249 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1250 QPEL_MC(0, avg_ , _ , op_avg)
1251 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1253 #undef op_avg_no_rnd
1255 #undef op_put_no_rnd
/* The (0,0) quarter-pel cases need no filtering: alias them straight to
 * the plain block copy/average helpers. */
1257 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1258 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1259 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1260 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1261 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
/* Fixed: previously aliased to ff_put_pixels16x16_8_c, which does not match
 * the public ff_-prefixed wrapper naming used by every other alias in this
 * group (cf. put_qpel16_mc00_c above).  The *_8_c names are the static
 * per-bit-depth template functions, not the exported wrappers. */
1262 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/* WMV2 half-pel horizontal lowpass: 4-tap (-1, 9, 9, -1)/16 filter across
 * one 8-pixel row; cm clips each result into 0..255.
 * NOTE(review): the embedded original line numbers jump (1265 -> 1269 and
 * 1276 -> 1282), so the per-row loop header, the dst/src stride advance and
 * the closing braces are missing from this copy of the file -- restore them
 * before this translation unit can compile. */
1264 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1265 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1269 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1270 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1271 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1272 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1273 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1274 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1275 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1276 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* RV40 luma (3,3) qpel positions are implemented via the generic 8-bit
 * half-pel xy2 average helpers from the dsputil template.
 * NOTE(review): the closing brace of each wrapper is elided in this copy. */
1282 #if CONFIG_RV40_DECODER
1283 void ff_put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1284 put_pixels16_xy2_8_c(dst, src, stride, 16);
1286 void ff_avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1287 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1289 void ff_put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1290 put_pixels8_xy2_8_c(dst, src, stride, 8);
1292 void ff_avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1293 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1295 #endif /* CONFIG_RV40_DECODER */
/* WMV2 half-pel vertical lowpass: the same (-1, 9, 9, -1)/16 filter applied
 * down one column, writing 8 output rows per column; cm clips to 0..255.
 * NOTE(review): the column loop header and the trailing src++/dst++ advance
 * plus closing braces are elided in this copy (line numbers jump 1298 ->
 * 1302 and 1320 -> 1326). */
1297 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1298 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1302 const int src_1= src[ -srcStride];
1303 const int src0 = src[0 ];
1304 const int src1 = src[ srcStride];
1305 const int src2 = src[2*srcStride];
1306 const int src3 = src[3*srcStride];
1307 const int src4 = src[4*srcStride];
1308 const int src5 = src[5*srcStride];
1309 const int src6 = src[6*srcStride];
1310 const int src7 = src[7*srcStride];
1311 const int src8 = src[8*srcStride];
1312 const int src9 = src[9*srcStride];
1313 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1314 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1315 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1316 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1317 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1318 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1319 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1320 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation entry points, one per fractional position
 * (mcXY: X = horizontal, Y = vertical fraction in half-pel units).
 * NOTE(review): the local temp-buffer declarations (half/halfH/halfV/halfHV)
 * and closing braces are elided in this copy of the file. */
/* (1,0): average of the source and the horizontally filtered row. */
1326 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1328 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1329 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
/* (2,0): pure horizontal lowpass written straight to dst. */
1332 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1333 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* (3,0): like mc10 but averaged with src shifted one pixel right. */
1336 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1338 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1339 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
/* (0,2): pure vertical lowpass. */
1342 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1343 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* (1,2): average of the vertical filter and the separable H-then-V filter;
 * halfH is built from 11 rows (one above, two below) so the V pass has the
 * extra context it needs. */
1346 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1350 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1351 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1352 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1353 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* (3,2): same as mc12 but the vertical-only tap starts at src+1. */
1355 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1359 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1360 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1361 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1362 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* (2,2): full separable H then V lowpass. */
1364 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1366 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1367 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 Annex-J style deblocking across a horizontal block edge.
 * p0..p3 are the 4 pixels straddling the edge in one column; d is the edge
 * gradient, d1 a ramp-limited correction driven by the per-qscale strength
 * table, and d2 a smaller correction for the outer pixels.
 * NOTE(review): loop headers, the d1/ad1/d2 declarations and closing braces
 * are elided in this copy (embedded line numbers jump). */
1370 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1371 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1373 const int strength= ff_h263_loop_filter_strength[qscale];
1377 int p0= src[x-2*stride];
1378 int p1= src[x-1*stride];
1379 int p2= src[x+0*stride];
1380 int p3= src[x+1*stride];
1381 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* Ramp: correction grows with |d| up to +-strength, then falls back to 0
 * for strong edges (assumed to be real image edges, not blocking). */
1383 if (d<-2*strength) d1= 0;
1384 else if(d<- strength) d1=-2*strength - d;
1385 else if(d< strength) d1= d;
1386 else if(d< 2*strength) d1= 2*strength - d;
/* Branchless clip to 0..255: any value outside [0,255] has bit 8 set;
 * ~(p>>31) yields 0 for negatives and 255 for overflows. */
1391 if(p1&256) p1= ~(p1>>31);
1392 if(p2&256) p2= ~(p2>>31);
1394 src[x-1*stride] = p1;
1395 src[x+0*stride] = p2;
1399 d2= av_clip((p0-p3)/4, -ad1, ad1);
1401 src[x-2*stride] = p0 - d2;
1402 src[x+ stride] = p3 + d2;
/* Same filter rotated 90 degrees: deblocks across a vertical block edge,
 * walking rows instead of columns.  Same truncation caveat as above. */
1407 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1408 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1410 const int strength= ff_h263_loop_filter_strength[qscale];
1414 int p0= src[y*stride-2];
1415 int p1= src[y*stride-1];
1416 int p2= src[y*stride+0];
1417 int p3= src[y*stride+1];
1418 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1420 if (d<-2*strength) d1= 0;
1421 else if(d<- strength) d1=-2*strength - d;
1422 else if(d< strength) d1= d;
1423 else if(d< 2*strength) d1= 2*strength - d;
1428 if(p1&256) p1= ~(p1>>31);
1429 if(p2&256) p2= ~(p2>>31);
1431 src[y*stride-1] = p1;
1432 src[y*stride+0] = p2;
1436 d2= av_clip((p0-p3)/4, -ad1, ad1);
1438 src[y*stride-2] = p0 - d2;
1439 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing over an 8x8 block.
 * Pass 1 fills temp[] with vertically filtered values (edge rows are copied
 * scaled by 4 so they carry the same weight); pass 2 applies the horizontal
 * (1,2,1) tap with combined rounding (+8 >> 4).  Edge columns are written
 * back with only the vertical normalization (+2 >> 2).
 * NOTE(review): loop headers, the temp[]/yz declarations and closing braces
 * are elided in this copy. */
1444 static void h261_loop_filter_c(uint8_t *src, int stride){
1449 temp[x ] = 4*src[x ];
1450 temp[x + 7*8] = 4*src[x + 7*stride];
1454 xy = y * stride + x;
1456 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1461 src[ y*stride] = (temp[ y*8] + 2)>>2;
1462 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1464 xy = y * stride + x;
1466 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* SAD of a 16-wide block over h rows: sum of |pix1 - pix2| per pixel,
 * manually unrolled across the row.
 * NOTE(review): for this and the three variants below, the row loop, the
 * pointer advance by line_size and the final "return s;" are elided in this
 * copy (embedded line numbers jump inside each body). */
1471 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1477 s += abs(pix1[0] - pix2[0]);
1478 s += abs(pix1[1] - pix2[1]);
1479 s += abs(pix1[2] - pix2[2]);
1480 s += abs(pix1[3] - pix2[3]);
1481 s += abs(pix1[4] - pix2[4]);
1482 s += abs(pix1[5] - pix2[5]);
1483 s += abs(pix1[6] - pix2[6]);
1484 s += abs(pix1[7] - pix2[7]);
1485 s += abs(pix1[8] - pix2[8]);
1486 s += abs(pix1[9] - pix2[9]);
1487 s += abs(pix1[10] - pix2[10]);
1488 s += abs(pix1[11] - pix2[11]);
1489 s += abs(pix1[12] - pix2[12]);
1490 s += abs(pix1[13] - pix2[13]);
1491 s += abs(pix1[14] - pix2[14]);
1492 s += abs(pix1[15] - pix2[15]);
/* SAD against the horizontally half-pel interpolated reference:
 * avg2(a,b) averages each pixel with its right neighbour. */
1499 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1505 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1506 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1507 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1508 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1509 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1510 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1511 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1512 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1513 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1514 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1515 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1516 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1517 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1518 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1519 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1520 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD against the vertically half-pel interpolated reference:
 * pix3 is the next source row. */
1527 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1530 uint8_t *pix3 = pix2 + line_size;
1534 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1535 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1536 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1537 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1538 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1539 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1540 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1541 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1542 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1543 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1544 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1545 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1546 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1547 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1548 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1549 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD against the 2D (x+y) half-pel interpolation: avg4 combines the 2x2
 * neighbourhood spanning the current and next row. */
1557 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1560 uint8_t *pix3 = pix2 + line_size;
1564 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1565 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1566 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1567 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1568 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1569 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1570 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1571 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1572 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1573 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1574 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1575 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1576 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1577 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1578 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1579 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide counterparts of the pix_abs16_* SAD functions above: plain SAD
 * plus the x2 / y2 / xy2 half-pel-interpolated variants.
 * NOTE(review): the same lines are elided here as in the 16-wide family
 * (row loops, pointer advances, "return s;" and closing braces). */
1593 s += abs(pix1[0] - pix2[0]);
1594 s += abs(pix1[1] - pix2[1]);
1595 s += abs(pix1[2] - pix2[2]);
1596 s += abs(pix1[3] - pix2[3]);
1597 s += abs(pix1[4] - pix2[4]);
1598 s += abs(pix1[5] - pix2[5]);
1599 s += abs(pix1[6] - pix2[6]);
1600 s += abs(pix1[7] - pix2[7]);
/* Horizontal half-pel reference. */
1607 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1613 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1614 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1615 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1616 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1617 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1618 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1619 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1620 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* Vertical half-pel reference (pix3 = next row). */
1627 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1630 uint8_t *pix3 = pix2 + line_size;
1634 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1635 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1636 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1637 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1638 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1639 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1640 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1641 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 2D half-pel reference (2x2 neighbourhood average). */
1649 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1652 uint8_t *pix3 = pix2 + line_size;
1656 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1657 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1658 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1659 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1660 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1661 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1662 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1663 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE ("nsse") over a 16-wide block: score1 is the plain
 * sum of squared errors, score2 the difference of the two blocks' local
 * 2x2 gradient energy, so textured noise is penalised less than blur.
 * The weight comes from avctx->nsse_weight, defaulting to 8 when no
 * encoder context is supplied.
 * NOTE(review): the row loops, score declarations, pointer advances and
 * closing braces are elided in this copy. */
1671 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1672 MpegEncContext *c = v;
1678 for(x=0; x<16; x++){
1679 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1682 for(x=0; x<15; x++){
1683 score2+= FFABS( s1[x ] - s1[x +stride]
1684 - s1[x+1] + s1[x+1+stride])
1685 -FFABS( s2[x ] - s2[x +stride]
1686 - s2[x+1] + s2[x+1+stride]);
1693 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1694 else return score1 + FFABS(score2)*8;
/* 8-wide variant of the same metric (loop bounds elided in this copy). */
1697 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1698 MpegEncContext *c = v;
1705 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1709 score2+= FFABS( s1[x ] - s1[x +stride]
1710 - s1[x+1] + s1[x+1+stride])
1711 -FFABS( s2[x ] - s2[x +stride]
1712 - s2[x+1] + s2[x+1+stride]);
1719 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1720 else return score1 + FFABS(score2)*8;
/* Trellis/basis search helpers: try_8x8basis_c computes the weighted
 * squared error that would remain after adding `scale` times a DCT basis
 * function to the residual; b is the candidate reconstructed residual
 * sample (rounded shift from BASIS_SHIFT down to RECON_SHIFT precision).
 * NOTE(review): the sum/w declarations, RECON_SHIFT normalisation of b and
 * the returns are elided in this copy. */
1723 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1727 for(i=0; i<8*8; i++){
1728 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1731 assert(-512<b && b<512);
1733 sum += (w*b)*(w*b)>>4;
/* add_8x8basis_c commits the same scaled basis function into rem[]. */
1738 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1741 for(i=0; i<8*8; i++){
1742 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
/**
1747 * permutes an 8x8 block.
1748 * @param block the block which will be permuted according to the given permutation vector
1749 * @param permutation the permutation vector
1750 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1751 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1752 * (inverse) permutated to scantable order!
 */
/* Copies the nonzero coefficients (in scan order) into temp[], then writes
 * them back through the permutation vector.
 * NOTE(review): the temp[] declaration, the early "last<0" bail-out, the
 * copy into temp and the closing braces are elided in this copy. */
1754 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1760 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* First pass (copy phase, body elided here): stash block[j] into temp[j]. */
1762 for(i=0; i<=last; i++){
1763 const int j= scantable[i];
/* Second pass: scatter temp[] back through the permutation. */
1768 for(i=0; i<=last; i++){
1769 const int j= scantable[i];
1770 const int perm_j= permutation[j];
1771 block[perm_j]= temp[j];
/* zero_cmp: dummy compare function that always scores 0 (body elided in
 * this copy); used when a cmp slot must be callable but irrelevant. */
1775 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fills the 6-entry cmp[] table with the comparison functions matching the
 * requested FF_CMP_* type; NOTE(review): the surrounding for-loop, switch
 * statement and most case labels are elided in this copy -- only a few
 * assignments and the error path remain visible. */
1779 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1782 memset(cmp, 0, sizeof(void*)*6);
1790 cmp[i]= c->hadamard8_diff[i];
1796 cmp[i]= c->dct_sad[i];
1799 cmp[i]= c->dct264_sad[i];
1802 cmp[i]= c->dct_max[i];
1805 cmp[i]= c->quant_psnr[i];
1834 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Byte-wise dst[i] += src[i] over w bytes, vectorised with SWAR: the low 7
 * bits of every byte are added carry-free via pb_7f masks, and the top bit
 * is restored with an xor (pb_80).  The word loop handles sizeof(long)
 * bytes per iteration; the scalar tail loop's header is elided in this
 * copy (line numbers jump 1844 -> 1847). */
1839 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1841 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1842 long a = *(long*)(src+i);
1843 long b = *(long*)(dst+i);
1844 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1847 dst[i+0] += src[i+0];
/* Same SWAR trick for dst[i] = src1[i] + src2[i]. */
1850 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1852 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1853 long a = *(long*)(src1+i);
1854 long b = *(long*)(src2+i);
1855 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1858 dst[i] = src1[i]+src2[i];
/* Byte-wise dst[i] = src1[i] - src2[i] over w bytes.  On targets without
 * fast unaligned loads, a fully scalar unrolled path is taken when src2 is
 * misaligned; otherwise the SWAR path subtracts sizeof(long) bytes at a
 * time: (a|0x80..) - (b&0x7f..) does a borrow-free per-byte subtract and
 * the xor restores the true sign bit of each byte.
 * NOTE(review): the else branch, #endif and scalar tail loop header are
 * elided in this copy. */
1861 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1863 #if !HAVE_FAST_UNALIGNED
1864 if((long)src2 & (sizeof(long)-1)){
1865 for(i=0; i+7<w; i+=8){
1866 dst[i+0] = src1[i+0]-src2[i+0];
1867 dst[i+1] = src1[i+1]-src2[i+1];
1868 dst[i+2] = src1[i+2]-src2[i+2];
1869 dst[i+3] = src1[i+3]-src2[i+3];
1870 dst[i+4] = src1[i+4]-src2[i+4];
1871 dst[i+5] = src1[i+5]-src2[i+5];
1872 dst[i+6] = src1[i+6]-src2[i+6];
1873 dst[i+7] = src1[i+7]-src2[i+7];
1877 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1878 long a = *(long*)(src1+i);
1879 long b = *(long*)(src2+i);
1880 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1883 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction: each output byte is the stored difference
 * plus mid_pred(left, top, left + top - topleft).  l/lt track the running
 * left and top-left values; their initialisation from *left/*left_top, the
 * loop headers and the writeback of the final l/lt are elided in this copy
 * (embedded line numbers jump inside every body below). */
1886 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1894 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* Inverse operation used by the encoder: emit src2 minus the predictor. */
1903 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1911 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Left prediction: running accumulator acc over the row (body elided). */
1921 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1924 for(i=0; i<w-1; i++){
/* BGR32 left prediction with separate per-channel accumulators
 * (entire body elided in this copy). */
1951 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Butterfly helpers for the 8x8 Hadamard transform (macro bodies are elided
 * in this copy -- note both #define lines end in a continuation backslash
 * with their expansions missing).  BUTTERFLY2 writes sum/difference of two
 * inputs, BUTTERFLY1 does it in place, BUTTERFLYA returns |x+y| + |x-y|. */
1981 #define BUTTERFLY2(o1,o2,i1,i2) \
1985 #define BUTTERFLY1(x,y) \
1994 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the src-dst difference, then sum of
 * absolute transform coefficients.  Rows are transformed first (three
 * butterfly stages), then columns, accumulating via BUTTERFLYA.
 * NOTE(review): temp[]/i/sum declarations, loop headers and the final
 * return are elided in this copy. */
1996 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2004 //FIXME try pointer walks
2005 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2006 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2007 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2008 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2010 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2011 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2012 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2013 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2015 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2016 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2017 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2018 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2022 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2023 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2024 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2025 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2027 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2028 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2029 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2030 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2033 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2034 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2035 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2036 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: identical Hadamard pipeline to hadamard8_diff8x8_c but fed
 * with src pixels directly (no reference block); the DC term |temp[0] +
 * temp[32]|... is subtracted at the end so the mean does not dominate.
 * NOTE(review): declarations, loop headers and the return are elided in
 * this copy, same as the diff variant above. */
2041 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2049 //FIXME try pointer walks
2050 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2051 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2052 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2053 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2055 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2056 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2057 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2058 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2060 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2061 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2062 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2063 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2067 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2068 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2069 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2070 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2072 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2073 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2074 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2075 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2078 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2079 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2080 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2081 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2084 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: take the pixel difference of the two 8x8 blocks, run the
 * forward DCT (fdct call elided in this copy -- line numbers jump 2095 ->
 * 2097) and return the sum of absolute DCT coefficients. */
2089 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2090 MpegEncContext * const s= (MpegEncContext *)c;
2091 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2095 s->dsp.diff_pixels(temp, src1, src2, stride);
2097 return s->dsp.sum_abs_dctelem(temp);
/* One-dimensional 8-point H.264-style integer DCT butterfly used by
 * dct264_sad8x8_c below.  Even outputs come from the pairwise sums
 * (s07..s34 -> a0..a3), odd outputs from the pairwise differences
 * (d07..d34 -> a4..a7) with the characteristic x + (x>>1) taps.
 * NOTE(review): the "#define DCT8_1D(...)" header line and the DST(0,...)
 * / DST(4,...) lines are elided in this copy (every remaining line still
 * ends with a macro-continuation backslash); no comments can be inserted
 * inside the continuation without breaking it further. */
2102 const int s07 = SRC(0) + SRC(7);\
2103 const int s16 = SRC(1) + SRC(6);\
2104 const int s25 = SRC(2) + SRC(5);\
2105 const int s34 = SRC(3) + SRC(4);\
2106 const int a0 = s07 + s34;\
2107 const int a1 = s16 + s25;\
2108 const int a2 = s07 - s34;\
2109 const int a3 = s16 - s25;\
2110 const int d07 = SRC(0) - SRC(7);\
2111 const int d16 = SRC(1) - SRC(6);\
2112 const int d25 = SRC(2) - SRC(5);\
2113 const int d34 = SRC(3) - SRC(4);\
2114 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2115 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2116 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2117 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2119 DST(1, a4 + (a7>>2)) ;\
2120 DST(2, a2 + (a3>>1)) ;\
2121 DST(3, a5 + (a6>>2)) ;\
2123 DST(5, a6 - (a5>>2)) ;\
2124 DST(6, (a2>>1) - a3 ) ;\
2125 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: apply DCT8_1D to the rows of the pixel difference,
 * then to the columns, summing |coefficient| via the second DST macro.
 * NOTE(review): the dct[][] declaration, the per-row/column macro calls and
 * #undef/return lines are elided in this copy. */
2128 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2129 MpegEncContext * const s= (MpegEncContext *)c;
2134 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2136 #define SRC(x) dct[i][x]
2137 #define DST(x,v) dct[i][x]= v
2138 for( i = 0; i < 8; i++ )
2143 #define SRC(x) dct[x][i]
2144 #define DST(x,v) sum += FFABS(v)
2145 for( i = 0; i < 8; i++ )
/* Maximum absolute DCT coefficient of the difference block (fdct call,
 * loop header and return elided in this copy). */
2153 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2154 MpegEncContext * const s= (MpegEncContext *)c;
2155 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2160 s->dsp.diff_pixels(temp, src1, src2, stride);
2164 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantisation-noise metric: forward-DCT the difference block (fdct call
 * elided), keep a copy in bak, quantise + dequantise + IDCT temp, then sum
 * the squared coefficient differences between temp and bak (the //FIXME
 * marks the known mismatch of comparing post-IDCT samples against DCT
 * coefficients). */
2169 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2170 MpegEncContext * const s= (MpegEncContext *)c;
2171 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2172 DCTELEM * const bak = temp+64;
2178 s->dsp.diff_pixels(temp, src1, src2, stride);
2180 memcpy(bak, temp, 64*sizeof(DCTELEM));
2182 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2183 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2184 ff_simple_idct_8(temp); //FIXME
2187 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for one 8x8 block: DCT+quantise the difference,
 * estimate the VLC bit cost of the quantised coefficients (run/level pairs
 * against the intra/inter AC tables, esc_length for escapes), reconstruct
 * via dequantise + idct_add, measure SSE distortion against the original,
 * and return distortion + lambda-scaled bits.
 * NOTE(review): the intra/inter branch headers, run/level bookkeeping,
 * escape fallbacks and several closing braces are elided in this copy. */
2192 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2193 MpegEncContext * const s= (MpegEncContext *)c;
2194 const uint8_t *scantable= s->intra_scantable.permutated;
2195 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2196 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2197 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2198 int i, last, run, bits, level, distortion, start_i;
2199 const int esc_length= s->ac_esc_length;
2201 uint8_t * last_length;
2205 copy_block8(lsrc1, src1, 8, stride, 8);
2206 copy_block8(lsrc2, src2, 8, stride, 8);
2208 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2210 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2216 length = s->intra_ac_vlc_length;
2217 last_length= s->intra_ac_vlc_last_length;
2218 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2221 length = s->inter_ac_vlc_length;
2222 last_length= s->inter_ac_vlc_last_length;
2227 for(i=start_i; i<last; i++){
2228 int j= scantable[i];
2233 if((level&(~127)) == 0){
2234 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2243 level= temp[i] + 64;
2247 if((level&(~127)) == 0){
2248 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2256 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2258 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2261 s->dsp.idct_add(lsrc2, 8, temp);
2263 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2265 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-cost metric: same DCT + quantise + VLC length estimation as rd8x8_c
 * but without the reconstruction/distortion part -- only the estimated
 * number of bits is returned.
 * NOTE(review): branch headers, run/level bookkeeping and the final return
 * are elided in this copy, mirroring the elisions in rd8x8_c above. */
2268 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2269 MpegEncContext * const s= (MpegEncContext *)c;
2270 const uint8_t *scantable= s->intra_scantable.permutated;
2271 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2272 int i, last, run, bits, level, start_i;
2273 const int esc_length= s->ac_esc_length;
2275 uint8_t * last_length;
2279 s->dsp.diff_pixels(temp, src1, src2, stride);
2281 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2287 length = s->intra_ac_vlc_length;
2288 last_length= s->intra_ac_vlc_last_length;
2289 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2292 length = s->inter_ac_vlc_length;
2293 last_length= s->inter_ac_vlc_last_length;
2298 for(i=start_i; i<last; i++){
2299 int j= scantable[i];
2304 if((level&(~127)) == 0){
2305 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2314 level= temp[i] + 64;
2318 if((level&(~127)) == 0){
2319 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-activity metrics used by the encoder's cmp framework.
 * VSAD_INTRA(size): sum of |s[x] - s[x+stride]| (vertical gradient SAD of
 * a single block); vsad16_c: vertical gradient SAD of the difference of
 * two blocks; VSSE_INTRA / vsse16_c: squared-gradient (SSE) versions.
 * NOTE(review): both macro definitions below end in continuation
 * backslashes with their score/loop scaffolding lines elided, and the
 * VSAD/VSSE instantiation lines plus the functions' declarations, outer
 * loops and returns are also missing from this copy -- no further comments
 * can be placed inside without breaking the continuations. */
2327 #define VSAD_INTRA(size) \
2328 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2332 for(y=1; y<h; y++){ \
2333 for(x=0; x<size; x+=4){ \
2334 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2335 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
2345 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2350 for(x=0; x<16; x++){
2351 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
2360 #define SQ(a) ((a)*(a))
2361 #define VSSE_INTRA(size) \
2362 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2366 for(y=1; y<h; y++){ \
2367 for(x=0; x<size; x+=4){ \
2368 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2369 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
2379 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2384 for(x=0; x<16; x++){
2385 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 vector and an int16 vector
 * (used e.g. for codebook distortion); the return statement is elided in
 * this copy. */
2394 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2398 for(i=0; i<size; i++)
2399 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Instantiate 16x16 wrappers for the 8x8 comparison functions above
 * (WRAPPER8_16_SQ splits the 16x16 block into four 8x8 calls). */
2403 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2404 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2405 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2407 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2409 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2410 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2411 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2412 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Element-wise product of two float vectors: dst[i] = src0[i] * src1[i]. */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int idx;

    for (idx = 0; idx < len; idx++) {
        const float a = src0[idx];
        const float b = src1[idx];
        dst[idx] = a * b;
    }
}
/*
 * Multiply src0 (walked forward) by src1 (walked backward from its last
 * element): dst[i] = src0[i] * src1[len - 1 - i].
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;

    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - 1 - i];
}
/* Fused multiply-add over float vectors: dst[i] = src0[i]*src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int n;

    for (n = 0; n < len; n++) {
        const float prod = src0[n] * src1[n];
        dst[n] = prod + src2[n];
    }
}
/*
 * Overlap-add windowing used by MDCT-based audio codecs: src0 is walked
 * forward while src1 is walked backward under the window win, emitting a
 * symmetric pair of outputs (dst[i], dst[j]) per iteration.
 * NOTE(review): the pointer biasing (dst/win/src0 offset by len) and the
 * s0/s1/wi/wj loads sit outside this excerpt -- presumably s0 = src0[i],
 * s1 = src1[j], wi = win[i], wj = win[j]; confirm against the full file.
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
    for(i=-len, j=len-1; i<0; i++, j--) {
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
/* Scale a float vector by a constant: dst[k] = src[k] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src[k] * mul;
}
/* Multiply-accumulate a scaled vector into dst: dst[k] += src[k] * mul. */
static void vector_fmac_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k;

    for (k = 0; k < len; k++)
        dst[k] += src[k] * mul;
}
/*
 * In-place element-wise butterfly:
 *   (v1[i], v2[i]) <- (v1[i] + v2[i], v1[i] - v2[i]).
 * The restrict qualifiers promise the two vectors do not alias.
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;

    for (i = 0; i < len; i++) {
        const float sum  = v1[i] + v2[i];
        const float diff = v1[i] - v2[i];
        v1[i] = sum;
        v2[i] = diff;
    }
}
/* Dot product of two float vectors, accumulated in forward order. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int i;

    for (i = 0; i < len; i++)
        acc += v1[i] * v2[i];
    return acc;
}
/*
 * Clip one float, manipulated as its IEEE-754 bit pattern, assuming
 * min < 0 < max (so `mini` has the sign bit set and `maxi` does not):
 *  - patterns above `mini` are negative values below min -> clamp to min;
 *  - otherwise, flipping the sign bit orders non-negative patterns, so a
 *    result above `maxisign` means the value exceeds max -> clamp to max;
 *  - everything else is already inside [min, max].
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if (a > mini)
        return mini;
    if ((a ^ (1U << 31)) > maxisign)
        return maxi;
    return a;
}
/*
 * Fast clipping path for bounds straddling zero (min < 0 < max):
 * reinterprets the floats as uint32 bit patterns and clips with unsigned
 * comparisons via clipf_c_one, eight elements per outer step.
 * len must be a multiple of 8.
 * NOTE(review): the pointer casts type-pun float as uint32_t, which
 * formally violates strict aliasing -- kept as-is to preserve behavior.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i, k;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U << 31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;

    for (i = 0; i < len; i += 8)
        for (k = 0; k < 8; k++)
            dsti[i + k] = clipf_c_one(srci[i + k], mini, maxi, maxisign);
}
/*
 * Clip a float vector into [min, max], eight elements per iteration
 * (len must be a multiple of 8).  When the bounds straddle zero the
 * bit-pattern fast path is taken; otherwise a plain av_clipf loop runs.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for (i = 0; i < len; i += 8) {
            int k;
            for (k = 0; k < 8; k++)
                dst[i + k] = av_clipf(src[i + k], min, max);
        }
    }
}
/*
 * Sum of (v1[i] * v2[i]) >> shift over the first `order` elements.
 * The per-term right shift matches the SIMD implementations; note that
 * shifting a negative product right is implementation-defined in C.
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int32_t res = 0;
    int i;

    for (i = 0; i < order; i++)
        res += (v1[i] * v2[i]) >> shift;
    return res;
}
/*
 * Combined dot product and multiply-accumulate: returns dot(v1, v2)
 * computed from the ORIGINAL contents of v1, while simultaneously
 * updating v1[i] += mul * v3[i] in the same pass.
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int32_t res = 0;
    int i;

    for (i = 0; i < order; i++) {
        res   += v1[i] * v2[i];   /* read v1 before it is rewritten */
        v1[i] += mul * v3[i];
    }
    return res;
}
/*
 * Apply a symmetric window to int16 samples with Q15 rounding:
 * sample i and its mirror (len-1-i) are both scaled by window[i],
 * each as (x * w + 2^14) >> 15.  Only len/2 window values are read.
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int half = len >> 1;

    for (i = 0; i < half; i++) {
        const int w = window[i];
        const unsigned int mirror = len - i - 1;

        output[i]      = (input[i]      * w + (1 << 14)) >> 15;
        output[mirror] = (input[mirror] * w + (1 << 14)) >> 15;
    }
}
/*
 * Clip len int32 samples into [min, max], eight samples per pass.
 * len must be a positive multiple of 8.
 */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
{
    do {
        int k;

        for (k = 0; k < 8; k++) {
            const int32_t v = *src++;
            *dst++ = v < min ? min : (v > max ? max : v);
        }
        len -= 8;
    } while (len > 0);
}
/* 11-bit fixed-point cosine constants for the WMV2 IDCT below:
 * Wk = round(2048 * sqrt(2) * cos(k*pi/16)). */
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
/*
 * One row of the WMV2 8-point inverse DCT, fixed point with the W1..W7
 * cosine constants above.  Stage 1 builds the even/odd butterfly terms
 * a0..a7, stage 2 the shared sqrt(1/2) rotations s1/s2, stage 3 writes
 * the eight outputs back with rounding (>>8 to drop coefficient scale).
 * NOTE(review): W0 is defined alongside W1..W7 but outside this excerpt.
 */
static void wmv2_idct_row(short * b)
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /* stage 1: odd-part rotations */
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    /* stage 1: even-part rotations */
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /* stage 2: 181/256 ~= 1/sqrt(2), shared by outputs 1,2,5,6 */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /* stage 3: recombine with rounding */
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/*
 * One column (stride 8) of the WMV2 8-point inverse DCT.  Same structure
 * as wmv2_idct_row, but stage 1 keeps three extra bits of precision
 * ((x+4)>>3 instead of no shift) and the final stage descales by >>14 to
 * reach pixel-domain values.
 */
static void wmv2_idct_col(short * b)
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /* step 1, with extended precision: odd-part rotations */
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    /* step 1: even-part rotations */
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /* step 2: shared 1/sqrt(2) rotations */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /* step 3: recombine and descale to output range */
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/*
 * Full 8x8 WMV2 inverse DCT on a 64-element block: transform each of the
 * eight rows in place, then each of the eight columns.
 */
void ff_wmv2_idct_c(short * block){
    int i;

    for (i = 0; i < 64; i += 8)
        wmv2_idct_row(block + i);
    for (i = 0; i < 8; i++)
        wmv2_idct_col(block + i);
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted to the new API */
/* Inverse-transform `block` in place, then store it clamped to 8 bits. */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
    ff_wmv2_idct_c(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
/* Inverse-transform `block` in place, then add it to `dest` with clamping. */
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
    ff_wmv2_idct_c(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
/* jpeg-reference 8x8 IDCT, "put" flavour: the inverse-transform call is
 * outside this excerpt; the result is stored clamped to 8 bits. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
    ff_put_pixels_clamped_c(block, dest, line_size);
/* jpeg-reference 8x8 IDCT, "add" flavour: the inverse-transform call is
 * outside this excerpt; the result is added to dest with clamping. */
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
    ff_add_pixels_clamped_c(block, dest, line_size);
/* Lowres (4x4) variant of ff_jref_idct_put; the j_rev_dct4 call is
 * outside this excerpt. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
    put_pixels_clamped4_c(block, dest, line_size);
/* Lowres (4x4) variant of ff_jref_idct_add; the j_rev_dct4 call is
 * outside this excerpt. */
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
    add_pixels_clamped4_c(block, dest, line_size);
/* Lowres (2x2) variant of ff_jref_idct_put; the j_rev_dct2 call is
 * outside this excerpt. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
    put_pixels_clamped2_c(block, dest, line_size);
/* Lowres (2x2) variant of ff_jref_idct_add; the j_rev_dct2 call is
 * outside this excerpt. */
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
    add_pixels_clamped2_c(block, dest, line_size);
2699 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2701 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2703 dest[0] = cm[(block[0] + 4)>>3];
2705 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2707 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2709 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2712 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
/* init static data */
/*
 * One-time initialisation of global lookup tables shared by all codec
 * instances:
 *  - ff_cropTbl: identity on [0,255] with guard bands of MAX_NEG_CROP
 *    entries on each side, so cm[x + MAX_NEG_CROP] clamps x to [0,255]
 *    (the upper band saturates to 255 below; the lower band is handled
 *    by code outside this excerpt);
 *  - ff_squareTbl: maps index i to (i-256)^2;
 *  - inv_zigzag_direct16: inverse of the zig-zag scan, stored 1-based.
 */
av_cold void dsputil_static_init(void)
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;   /* identity core */
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;          /* overflow band */
    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/*
 * Verify that the compiler honours 16-byte alignment of stack variables,
 * which the SIMD (MMX/AltiVec) code paths rely on; logs a prominent
 * error when a miscompiled build is detected.
 * NOTE(review): the use of did_fail and the return statement fall
 * outside this excerpt -- presumably the warning is emitted only once.
 */
int ff_check_alignment(void){
    static int did_fail=0;
    LOCAL_ALIGNED_16(int, aligned, [4]);

    /* a misaligned address has one of its low four bits set */
    if((intptr_t)aligned & 15){
#if HAVE_MMX || HAVE_ALTIVEC
        av_log(NULL, AV_LOG_ERROR,
            "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
            "and may be very slow or crash. This is not a bug in libavcodec,\n"
            "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
            "Do not report crashes to Libav developers.\n");
2752 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2756 ff_check_alignment();
2759 if (avctx->bits_per_raw_sample == 10) {
2760 c->fdct = ff_jpeg_fdct_islow_10;
2761 c->fdct248 = ff_fdct248_islow_10;
2763 if(avctx->dct_algo==FF_DCT_FASTINT) {
2764 c->fdct = fdct_ifast;
2765 c->fdct248 = fdct_ifast248;
2767 else if(avctx->dct_algo==FF_DCT_FAAN) {
2768 c->fdct = ff_faandct;
2769 c->fdct248 = ff_faandct248;
2772 c->fdct = ff_jpeg_fdct_islow_8; //slow/accurate/default
2773 c->fdct248 = ff_fdct248_islow_8;
2776 #endif //CONFIG_ENCODERS
2778 if(avctx->lowres==1){
2779 c->idct_put= ff_jref_idct4_put;
2780 c->idct_add= ff_jref_idct4_add;
2781 c->idct = j_rev_dct4;
2782 c->idct_permutation_type= FF_NO_IDCT_PERM;
2783 }else if(avctx->lowres==2){
2784 c->idct_put= ff_jref_idct2_put;
2785 c->idct_add= ff_jref_idct2_add;
2786 c->idct = j_rev_dct2;
2787 c->idct_permutation_type= FF_NO_IDCT_PERM;
2788 }else if(avctx->lowres==3){
2789 c->idct_put= ff_jref_idct1_put;
2790 c->idct_add= ff_jref_idct1_add;
2791 c->idct = j_rev_dct1;
2792 c->idct_permutation_type= FF_NO_IDCT_PERM;
2794 if (avctx->bits_per_raw_sample == 10) {
2795 c->idct_put = ff_simple_idct_put_10;
2796 c->idct_add = ff_simple_idct_add_10;
2797 c->idct = ff_simple_idct_10;
2798 c->idct_permutation_type = FF_NO_IDCT_PERM;
2800 if(avctx->idct_algo==FF_IDCT_INT){
2801 c->idct_put= ff_jref_idct_put;
2802 c->idct_add= ff_jref_idct_add;
2803 c->idct = j_rev_dct;
2804 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2805 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2806 avctx->idct_algo==FF_IDCT_VP3){
2807 c->idct_put= ff_vp3_idct_put_c;
2808 c->idct_add= ff_vp3_idct_add_c;
2809 c->idct = ff_vp3_idct_c;
2810 c->idct_permutation_type= FF_NO_IDCT_PERM;
2811 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2812 c->idct_put= ff_wmv2_idct_put_c;
2813 c->idct_add= ff_wmv2_idct_add_c;
2814 c->idct = ff_wmv2_idct_c;
2815 c->idct_permutation_type= FF_NO_IDCT_PERM;
2816 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2817 c->idct_put= ff_faanidct_put;
2818 c->idct_add= ff_faanidct_add;
2819 c->idct = ff_faanidct;
2820 c->idct_permutation_type= FF_NO_IDCT_PERM;
2821 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2822 c->idct_put= ff_ea_idct_put_c;
2823 c->idct_permutation_type= FF_NO_IDCT_PERM;
2824 }else{ //accurate/default
2825 c->idct_put = ff_simple_idct_put_8;
2826 c->idct_add = ff_simple_idct_add_8;
2827 c->idct = ff_simple_idct_8;
2828 c->idct_permutation_type= FF_NO_IDCT_PERM;
2833 c->diff_pixels = diff_pixels_c;
2834 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2835 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2836 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2837 c->sum_abs_dctelem = sum_abs_dctelem_c;
2840 c->pix_sum = pix_sum_c;
2841 c->pix_norm1 = pix_norm1_c;
2843 c->fill_block_tab[0] = fill_block16_c;
2844 c->fill_block_tab[1] = fill_block8_c;
2846 /* TODO [0] 16 [1] 8 */
2847 c->pix_abs[0][0] = pix_abs16_c;
2848 c->pix_abs[0][1] = pix_abs16_x2_c;
2849 c->pix_abs[0][2] = pix_abs16_y2_c;
2850 c->pix_abs[0][3] = pix_abs16_xy2_c;
2851 c->pix_abs[1][0] = pix_abs8_c;
2852 c->pix_abs[1][1] = pix_abs8_x2_c;
2853 c->pix_abs[1][2] = pix_abs8_y2_c;
2854 c->pix_abs[1][3] = pix_abs8_xy2_c;
2856 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2857 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2858 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2859 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2860 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2861 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2862 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2863 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2864 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2866 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2867 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2868 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2869 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2870 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2871 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2872 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2873 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2874 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2876 #define dspfunc(PFX, IDX, NUM) \
2877 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2878 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2879 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2880 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2881 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2882 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2883 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2884 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2885 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2886 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2887 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2888 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2889 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2890 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2891 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2892 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2894 dspfunc(put_qpel, 0, 16);
2895 dspfunc(put_no_rnd_qpel, 0, 16);
2897 dspfunc(avg_qpel, 0, 16);
2898 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2900 dspfunc(put_qpel, 1, 8);
2901 dspfunc(put_no_rnd_qpel, 1, 8);
2903 dspfunc(avg_qpel, 1, 8);
2904 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2908 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
2909 ff_mlp_init(c, avctx);
2911 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
2912 ff_intrax8dsp_init(c,avctx);
2915 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
2916 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2917 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2918 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2919 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2920 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2921 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2922 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2924 #define SET_CMP_FUNC(name) \
2925 c->name[0]= name ## 16_c;\
2926 c->name[1]= name ## 8x8_c;
2928 SET_CMP_FUNC(hadamard8_diff)
2929 c->hadamard8_diff[4]= hadamard8_intra16_c;
2930 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
2931 SET_CMP_FUNC(dct_sad)
2932 SET_CMP_FUNC(dct_max)
2934 SET_CMP_FUNC(dct264_sad)
2936 c->sad[0]= pix_abs16_c;
2937 c->sad[1]= pix_abs8_c;
2941 SET_CMP_FUNC(quant_psnr)
2944 c->vsad[0]= vsad16_c;
2945 c->vsad[4]= vsad_intra16_c;
2946 c->vsad[5]= vsad_intra8_c;
2947 c->vsse[0]= vsse16_c;
2948 c->vsse[4]= vsse_intra16_c;
2949 c->vsse[5]= vsse_intra8_c;
2950 c->nsse[0]= nsse16_c;
2951 c->nsse[1]= nsse8_c;
2953 ff_dsputil_init_dwt(c);
2956 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
2958 c->add_bytes= add_bytes_c;
2959 c->add_bytes_l2= add_bytes_l2_c;
2960 c->diff_bytes= diff_bytes_c;
2961 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
2962 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
2963 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
2964 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
2965 c->bswap_buf= bswap_buf;
2966 c->bswap16_buf = bswap16_buf;
2967 #if CONFIG_PNG_DECODER
2968 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
2971 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2972 c->h263_h_loop_filter= h263_h_loop_filter_c;
2973 c->h263_v_loop_filter= h263_v_loop_filter_c;
2976 if (CONFIG_VP3_DECODER) {
2977 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
2978 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
2979 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
2982 c->h261_loop_filter= h261_loop_filter_c;
2984 c->try_8x8basis= try_8x8basis_c;
2985 c->add_8x8basis= add_8x8basis_c;
2987 #if CONFIG_VORBIS_DECODER
2988 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
2990 #if CONFIG_AC3_DECODER
2991 c->ac3_downmix = ff_ac3_downmix_c;
2993 c->vector_fmul = vector_fmul_c;
2994 c->vector_fmul_reverse = vector_fmul_reverse_c;
2995 c->vector_fmul_add = vector_fmul_add_c;
2996 c->vector_fmul_window = vector_fmul_window_c;
2997 c->vector_clipf = vector_clipf_c;
2998 c->scalarproduct_int16 = scalarproduct_int16_c;
2999 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3000 c->apply_window_int16 = apply_window_int16_c;
3001 c->vector_clip_int32 = vector_clip_int32_c;
3002 c->scalarproduct_float = scalarproduct_float_c;
3003 c->butterflies_float = butterflies_float_c;
3004 c->vector_fmul_scalar = vector_fmul_scalar_c;
3005 c->vector_fmac_scalar = vector_fmac_scalar_c;
3007 c->shrink[0]= av_image_copy_plane;
3008 c->shrink[1]= ff_shrink22;
3009 c->shrink[2]= ff_shrink44;
3010 c->shrink[3]= ff_shrink88;
3012 c->prefetch= just_return;
3014 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3015 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3019 #define FUNC(f, depth) f ## _ ## depth
3020 #define FUNCC(f, depth) f ## _ ## depth ## _c
3022 #define dspfunc1(PFX, IDX, NUM, depth)\
3023 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
3024 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3025 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3026 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3028 #define dspfunc2(PFX, IDX, NUM, depth)\
3029 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3030 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3031 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3032 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3033 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3034 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3035 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3036 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3037 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3038 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3039 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3040 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3041 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3042 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3043 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3044 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3047 #define BIT_DEPTH_FUNCS(depth, dct)\
3048 c->get_pixels = FUNCC(get_pixels ## dct , depth);\
3049 c->draw_edges = FUNCC(draw_edges , depth);\
3050 c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
3051 c->clear_block = FUNCC(clear_block ## dct , depth);\
3052 c->clear_blocks = FUNCC(clear_blocks ## dct , depth);\
3053 c->add_pixels8 = FUNCC(add_pixels8 ## dct , depth);\
3054 c->add_pixels4 = FUNCC(add_pixels4 ## dct , depth);\
3055 c->put_no_rnd_pixels_l2[0] = FUNCC(put_no_rnd_pixels16_l2, depth);\
3056 c->put_no_rnd_pixels_l2[1] = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3058 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3059 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3060 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3061 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3062 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3063 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3065 dspfunc1(put , 0, 16, depth);\
3066 dspfunc1(put , 1, 8, depth);\
3067 dspfunc1(put , 2, 4, depth);\
3068 dspfunc1(put , 3, 2, depth);\
3069 dspfunc1(put_no_rnd, 0, 16, depth);\
3070 dspfunc1(put_no_rnd, 1, 8, depth);\
3071 dspfunc1(avg , 0, 16, depth);\
3072 dspfunc1(avg , 1, 8, depth);\
3073 dspfunc1(avg , 2, 4, depth);\
3074 dspfunc1(avg , 3, 2, depth);\
3075 dspfunc1(avg_no_rnd, 0, 16, depth);\
3076 dspfunc1(avg_no_rnd, 1, 8, depth);\
3078 dspfunc2(put_h264_qpel, 0, 16, depth);\
3079 dspfunc2(put_h264_qpel, 1, 8, depth);\
3080 dspfunc2(put_h264_qpel, 2, 4, depth);\
3081 dspfunc2(put_h264_qpel, 3, 2, depth);\
3082 dspfunc2(avg_h264_qpel, 0, 16, depth);\
3083 dspfunc2(avg_h264_qpel, 1, 8, depth);\
3084 dspfunc2(avg_h264_qpel, 2, 4, depth);
3086 switch (avctx->bits_per_raw_sample) {
3088 if (c->dct_bits == 32) {
3089 BIT_DEPTH_FUNCS(9, _32);
3091 BIT_DEPTH_FUNCS(9, _16);
3095 if (c->dct_bits == 32) {
3096 BIT_DEPTH_FUNCS(10, _32);
3098 BIT_DEPTH_FUNCS(10, _16);
3102 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3104 BIT_DEPTH_FUNCS(8, _16);
3109 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
3110 if (ARCH_ARM) dsputil_init_arm (c, avctx);
3111 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
3112 if (HAVE_VIS) dsputil_init_vis (c, avctx);
3113 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
3114 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
3115 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
3116 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
3117 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
3119 for(i=0; i<64; i++){
3120 if(!c->put_2tap_qpel_pixels_tab[0][i])
3121 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3122 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3123 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3126 switch(c->idct_permutation_type){
3127 case FF_NO_IDCT_PERM:
3129 c->idct_permutation[i]= i;
3131 case FF_LIBMPEG2_IDCT_PERM:
3133 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3135 case FF_SIMPLE_IDCT_PERM:
3137 c->idct_permutation[i]= simple_mmx_permutation[i];
3139 case FF_TRANSPOSE_IDCT_PERM:
3141 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3143 case FF_PARTTRANS_IDCT_PERM:
3145 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3147 case FF_SSE2_IDCT_PERM:
3149 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3152 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");