3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Lookup tables shared by the DSP helpers below. Zero-initialized here and
 * presumably filled by the dsputil init code (not in this excerpt).
 * ff_cropTbl is used as cm[MAX_NEG_CROP + v] to clamp v to 0..255;
 * ff_squareTbl is used as sq[256 + d] to square a signed difference d. */
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44 uint32_t ff_squareTbl[512] = {0, };
46 #include "dsputil_internal.h"
48 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 evaluates to 0x01 repeated in every byte of unsigned long, so
 * multiplying by 0x7f / 0x80 replicates that byte into each lane (SWAR). */
49 #define pb_7f (~0UL/255 * 0x7f)
50 #define pb_80 (~0UL/255 * 0x80)
/* Classic JPEG/MPEG zigzag scan: maps scan position -> raster (row-major)
 * coefficient index. (Closing brace of the table lies outside this excerpt.) */
52 const uint8_t ff_zigzag_direct[64] = {
53 0, 1, 8, 16, 9, 2, 3, 10,
54 17, 24, 32, 25, 18, 11, 4, 5,
55 12, 19, 26, 33, 40, 48, 41, 34,
56 27, 20, 13, 6, 7, 14, 21, 28,
57 35, 42, 49, 56, 57, 50, 43, 36,
58 29, 22, 15, 23, 30, 37, 44, 51,
59 58, 59, 52, 45, 38, 31, 39, 46,
60 53, 60, 61, 54, 47, 55, 62, 63
/* Scan order for the 2-4-8 (field-interleaved) IDCT; scan position ->
 * raster coefficient index, with the two fields interleaved as noted. */
63 /* Specific zigzag scan for 248 idct. NOTE that unlike the
64 specification, we interleave the fields */
65 const uint8_t ff_zigzag248_direct[64] = {
66 0, 8, 1, 9, 16, 24, 2, 10,
67 17, 25, 32, 40, 48, 56, 33, 41,
68 18, 26, 3, 11, 4, 12, 19, 27,
69 34, 42, 49, 57, 50, 58, 35, 43,
70 20, 28, 5, 13, 6, 14, 21, 29,
71 36, 44, 51, 59, 52, 60, 37, 45,
72 22, 30, 7, 15, 23, 31, 38, 46,
73 53, 61, 54, 62, 39, 47, 55, 63,
76 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Storage only; filled at runtime by init code not visible in this excerpt. */
77 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (scan position -> raster index);
 * NOTE(review): consumer not visible in this excerpt — presumably used
 * for alternate-scan coded pictures. */
79 const uint8_t ff_alternate_horizontal_scan[64] = {
80 0, 1, 2, 3, 8, 9, 16, 17,
81 10, 11, 4, 5, 6, 7, 15, 14,
82 13, 12, 19, 18, 24, 25, 32, 33,
83 26, 27, 20, 21, 22, 23, 28, 29,
84 30, 31, 34, 35, 40, 41, 48, 49,
85 42, 43, 36, 37, 38, 39, 44, 45,
86 46, 47, 50, 51, 56, 57, 58, 59,
87 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order (scan position -> raster index);
 * NOTE(review): consumer not visible in this excerpt — presumably used
 * for interlaced/alternate-scan coded pictures. */
90 const uint8_t ff_alternate_vertical_scan[64] = {
91 0, 8, 16, 24, 1, 9, 2, 10,
92 17, 25, 32, 40, 48, 56, 57, 49,
93 41, 33, 26, 18, 3, 11, 4, 12,
94 19, 27, 34, 42, 50, 58, 35, 43,
95 51, 59, 20, 28, 5, 13, 6, 14,
96 21, 29, 36, 44, 52, 60, 37, 45,
97 53, 61, 22, 30, 7, 15, 23, 31,
98 38, 46, 54, 62, 39, 47, 55, 63,
101 /* Input permutation for the simple_idct_mmx */
/* Each entry is a 6-bit coefficient index (0x00-0x3F) giving the input
 * reordering the MMX simple IDCT expects. */
102 static const uint8_t simple_mmx_permutation[64]={
103 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
104 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
105 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
106 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
107 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
108 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
109 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
110 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Row permutation for the SSE2 IDCT: interleaves rows 0-3 with rows 4-7. */
113 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Initialize a ScanTable: store the raw scan order and fill
 * st->permutated[] with the IDCT permutation applied to every scan entry;
 * st->raster_end[] is filled from a running 'end' value — presumably the
 * last scan position per raster row; confirm against the full source.
 * NOTE(review): loop headers and the computation of 'end' are missing
 * from this excerpt. */
115 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
119 st->scantable= src_scantable;
123 j = src_scantable[i];
124 st->permutated[i] = permutation[j];
133 j = st->permutated[i];
135 st->raster_end[i]= end;
/* Sum of all pixel values in a 16x16 block; the inner loop accumulates
 * 8 pixels per iteration (its body is missing from this excerpt). */
139 static int pix_sum_c(uint8_t * pix, int line_size)
144 for (i = 0; i < 16; i++) {
145 for (j = 0; j < 16; j += 8) {
156 pix += line_size - 16;
/* Sum of squared pixel values over a 16x16 block, using the biased
 * square table (sq = ff_squareTbl + 256). The preprocessor branch reads
 * 8 pixels at a time as one 64-bit word or two 32-bit words.
 * NOTE(review): the (uint64_t*)/(uint32_t*) casts of 'pix' violate strict
 * aliasing and alignment rules in portable C — relies on platform behavior. */
161 static int pix_norm1_c(uint8_t * pix, int line_size)
164 uint32_t *sq = ff_squareTbl + 256;
167 for (i = 0; i < 16; i++) {
168 for (j = 0; j < 16; j += 8) {
/* 64-bit path: LONG_MAX check selects machines with 64-bit long. */
179 #if LONG_MAX > 2147483647
180 register uint64_t x=*(uint64_t*)pix;
182 s += sq[(x>>8)&0xff];
183 s += sq[(x>>16)&0xff];
184 s += sq[(x>>24)&0xff];
185 s += sq[(x>>32)&0xff];
186 s += sq[(x>>40)&0xff];
187 s += sq[(x>>48)&0xff];
188 s += sq[(x>>56)&0xff];
/* 32-bit path (the #else is outside this excerpt): two 4-byte loads. */
190 register uint32_t x=*(uint32_t*)pix;
192 s += sq[(x>>8)&0xff];
193 s += sq[(x>>16)&0xff];
194 s += sq[(x>>24)&0xff];
195 x=*(uint32_t*)(pix+4);
197 s += sq[(x>>8)&0xff];
198 s += sq[(x>>16)&0xff];
199 s += sq[(x>>24)&0xff];
204 pix += line_size - 16;
/* Byte-swap w 32-bit words from src to dst: main loop unrolled by 8,
 * followed by a scalar tail (the tail loop header is not visible here). */
209 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
212 for(i=0; i+8<=w; i+=8){
213 dst[i+0]= av_bswap32(src[i+0]);
214 dst[i+1]= av_bswap32(src[i+1]);
215 dst[i+2]= av_bswap32(src[i+2]);
216 dst[i+3]= av_bswap32(src[i+3]);
217 dst[i+4]= av_bswap32(src[i+4]);
218 dst[i+5]= av_bswap32(src[i+5]);
219 dst[i+6]= av_bswap32(src[i+6]);
220 dst[i+7]= av_bswap32(src[i+7]);
/* tail: remaining 0..7 words */
223 dst[i+0]= av_bswap32(src[i+0]);
/* Byte-swap len 16-bit values (loop header not visible in this excerpt). */
227 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
230 *dst++ = av_bswap16(*src++);
/* Sum of squared errors between two 4-pixel-wide blocks over h rows;
 * differences (-255..255) index ff_squareTbl through the +256-biased
 * pointer. Row-advance lines are outside this excerpt. */
233 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
236 uint32_t *sq = ff_squareTbl + 256;
239 for (i = 0; i < h; i++) {
240 s += sq[pix1[0] - pix2[0]];
241 s += sq[pix1[1] - pix2[1]];
242 s += sq[pix1[2] - pix2[2]];
243 s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors between two 8-pixel-wide blocks over h rows
 * (same scheme as sse4_c; row advances outside this excerpt). */
250 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
253 uint32_t *sq = ff_squareTbl + 256;
256 for (i = 0; i < h; i++) {
257 s += sq[pix1[0] - pix2[0]];
258 s += sq[pix1[1] - pix2[1]];
259 s += sq[pix1[2] - pix2[2]];
260 s += sq[pix1[3] - pix2[3]];
261 s += sq[pix1[4] - pix2[4]];
262 s += sq[pix1[5] - pix2[5]];
263 s += sq[pix1[6] - pix2[6]];
264 s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors between two 16-pixel-wide blocks over h rows
 * (same scheme as sse4_c/sse8_c; row advances outside this excerpt). */
271 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
274 uint32_t *sq = ff_squareTbl + 256;
277 for (i = 0; i < h; i++) {
278 s += sq[pix1[ 0] - pix2[ 0]];
279 s += sq[pix1[ 1] - pix2[ 1]];
280 s += sq[pix1[ 2] - pix2[ 2]];
281 s += sq[pix1[ 3] - pix2[ 3]];
282 s += sq[pix1[ 4] - pix2[ 4]];
283 s += sq[pix1[ 5] - pix2[ 5]];
284 s += sq[pix1[ 6] - pix2[ 6]];
285 s += sq[pix1[ 7] - pix2[ 7]];
286 s += sq[pix1[ 8] - pix2[ 8]];
287 s += sq[pix1[ 9] - pix2[ 9]];
288 s += sq[pix1[10] - pix2[10]];
289 s += sq[pix1[11] - pix2[11]];
290 s += sq[pix1[12] - pix2[12]];
291 s += sq[pix1[13] - pix2[13]];
292 s += sq[pix1[14] - pix2[14]];
293 s += sq[pix1[15] - pix2[15]];
/* Widen one 8-pixel row of an 8x8 block into DCTELEMs; the enclosing
 * row loop and pointer advances are outside this excerpt. */
301 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
305 /* read the pixels */
307 block[0] = pixels[0];
308 block[1] = pixels[1];
309 block[2] = pixels[2];
310 block[3] = pixels[3];
311 block[4] = pixels[4];
312 block[5] = pixels[5];
313 block[6] = pixels[6];
314 block[7] = pixels[7];
/* Per-pixel difference s1 - s2 of one 8-pixel row into DCTELEMs; the
 * enclosing row loop and pointer advances are outside this excerpt. */
320 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
321 const uint8_t *s2, int stride){
324 /* read the pixels */
326 block[0] = s1[0] - s2[0];
327 block[1] = s1[1] - s2[1];
328 block[2] = s1[2] - s2[2];
329 block[3] = s1[3] - s2[3];
330 block[4] = s1[4] - s2[4];
331 block[5] = s1[5] - s2[5];
332 block[6] = s1[6] - s2[6];
333 block[7] = s1[7] - s2[7];
/* Store one 8-wide row of DCTELEMs to pixels, clamped to 0..255 via the
 * crop table; row loop / pointer advances outside this excerpt. */
341 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
345 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
347 /* read the pixels */
349 pixels[0] = cm[block[0]];
350 pixels[1] = cm[block[1]];
351 pixels[2] = cm[block[2]];
352 pixels[3] = cm[block[3]];
353 pixels[4] = cm[block[4]];
354 pixels[5] = cm[block[5]];
355 pixels[6] = cm[block[6]];
356 pixels[7] = cm[block[7]];
/* 4-wide variant of ff_put_pixels_clamped_c (clamped store of one row;
 * loop/advance lines outside this excerpt). */
363 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
367 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
369 /* read the pixels */
371 pixels[0] = cm[block[0]];
372 pixels[1] = cm[block[1]];
373 pixels[2] = cm[block[2]];
374 pixels[3] = cm[block[3]];
/* 2-wide variant of ff_put_pixels_clamped_c (clamped store of one row;
 * loop/advance lines outside this excerpt). */
381 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
385 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
387 /* read the pixels */
389 pixels[0] = cm[block[0]];
390 pixels[1] = cm[block[1]];
/* Store an 8x8 block of signed DCTELEMs as unsigned pixels, biased by
 * +128 and saturated; the "< -128" branch, the 255 clamp and the pixel
 * increments are not visible in this excerpt. */
397 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
398 uint8_t *restrict pixels,
403 for (i = 0; i < 8; i++) {
404 for (j = 0; j < 8; j++) {
407 else if (*block > 127)
410 *pixels = (uint8_t)(*block + 128);
414 pixels += (line_size - 8);
/* Store one 8-wide row of DCTELEMs to pixels WITHOUT clamping (values
 * are truncated to uint8_t); loop/advance lines outside this excerpt. */
418 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
423 /* read the pixels */
425 pixels[0] = block[0];
426 pixels[1] = block[1];
427 pixels[2] = block[2];
428 pixels[3] = block[3];
429 pixels[4] = block[4];
430 pixels[5] = block[5];
431 pixels[6] = block[6];
432 pixels[7] = block[7];
/* Add one 8-wide row of DCTELEMs onto existing pixels, clamping the sum
 * to 0..255 via the crop table; loop/advances outside this excerpt. */
439 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
443 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
445 /* read the pixels */
447 pixels[0] = cm[pixels[0] + block[0]];
448 pixels[1] = cm[pixels[1] + block[1]];
449 pixels[2] = cm[pixels[2] + block[2]];
450 pixels[3] = cm[pixels[3] + block[3]];
451 pixels[4] = cm[pixels[4] + block[4]];
452 pixels[5] = cm[pixels[5] + block[5]];
453 pixels[6] = cm[pixels[6] + block[6]];
454 pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of ff_add_pixels_clamped_c (clamped add of one row;
 * loop/advance lines outside this excerpt). */
460 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
464 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
466 /* read the pixels */
468 pixels[0] = cm[pixels[0] + block[0]];
469 pixels[1] = cm[pixels[1] + block[1]];
470 pixels[2] = cm[pixels[2] + block[2]];
471 pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of ff_add_pixels_clamped_c (clamped add of one row;
 * loop/advance lines outside this excerpt). */
477 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
481 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
483 /* read the pixels */
485 pixels[0] = cm[pixels[0] + block[0]];
486 pixels[1] = cm[pixels[1] + block[1]];
/* Sum of absolute values of the DCT coefficients (loop bounds not
 * visible here — presumably 64 entries; confirm against full source). */
492 static int sum_abs_dctelem_c(DCTELEM *block)
496 sum+= FFABS(block[i]);
/* Fill a 16-wide block of h rows with a constant value (per-row stride
 * advance is outside this excerpt). */
500 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
504 for (i = 0; i < h; i++) {
505 memset(block, value, 16);
/* Fill an 8-wide block of h rows with a constant value (per-row stride
 * advance is outside this excerpt). */
510 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
514 for (i = 0; i < h; i++) {
515 memset(block, value, 8);
/* 2x upscale of an 8x8 block: src[i]*0x0101 duplicates the byte into a
 * uint16_t (horizontal doubling), written to two adjacent dst rows
 * (vertical doubling). Pointer advances are outside this excerpt. */
520 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
523 uint16_t *dst1 = (uint16_t *) dst;
524 uint16_t *dst2 = (uint16_t *)(dst + linesize);
526 for (j = 0; j < 8; j++) {
527 for (i = 0; i < 8; i++) {
528 dst1[i] = dst2[i] = src[i] * 0x0101;
/* Rounding averages used by the motion-compensation helpers.
 * avg2: round((a+b)/2); avg4: round((a+b+c+d)/4).
 * Fix: arguments are now fully parenthesized so expression arguments
 * (e.g. shifts, additions) expand with the intended precedence; each
 * argument is still evaluated exactly once, so no multiple-evaluation
 * hazard is introduced. */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/* 8-wide bilinear interpolation with 1/16-pel fractional offsets
 * (x16, y16): A..D are the four corner weights (summing to 256), and
 * 'rounder' is added before the >>8 normalization. The row loop and
 * pointer advances are outside this excerpt. */
539 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
541 const int A=(16-x16)*(16-y16);
542 const int B=( x16)*(16-y16);
543 const int C=(16-x16)*( y16);
544 const int D=( x16)*( y16);
549 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
550 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
551 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
552 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
553 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
554 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
555 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
556 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* Global motion compensation: for each destination pixel, a source
 * coordinate with 'shift'-bit fractional precision is derived from the
 * dxx/dxy/dyx/dyy transform (the per-row vx/vy setup lines are missing
 * from this excerpt). Fully in-bounds pixels get bilinear interpolation;
 * pixels past an edge fall back to 1-D interpolation or plain edge
 * replication via av_clip. */
562 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
563 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
566 const int s= 1<<shift;
576 for(x=0; x<8; x++){ //XXX FIXME optimize
577 int src_x, src_y, frac_x, frac_y, index;
/* unsigned compare does the 0 <= coord < limit test in one branch */
586 if((unsigned)src_x < width){
587 if((unsigned)src_y < height){
588 index= src_x + src_y*stride;
589 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
590 + src[index +1]* frac_x )*(s-frac_y)
591 + ( src[index+stride ]*(s-frac_x)
592 + src[index+stride+1]* frac_x )* frac_y
/* y out of range: clamp the row, interpolate horizontally only */
595 index= src_x + av_clip(src_y, 0, height)*stride;
596 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
597 + src[index +1]* frac_x )*s
/* x out of range: clamp the column, interpolate vertically only */
601 if((unsigned)src_y < height){
602 index= av_clip(src_x, 0, width) + src_y*stride;
603 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
604 + src[index+stride ]* frac_y )*s
/* both out of range: plain edge replication */
607 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
608 dst[y*stride + x]= src[index ];
/* Thirds-pel (0,0): plain copy, dispatched on width (switch header and
 * closing braces are outside this excerpt). */
620 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
622 case 2: put_pixels2_c (dst, src, stride, height); break;
623 case 4: put_pixels4_c (dst, src, stride, height); break;
624 case 8: put_pixels8_c (dst, src, stride, height); break;
625 case 16:put_pixels16_c(dst, src, stride, height); break;
/* Thirds-pel horizontal 1/3: round((2*a + b)/3) in fixed point
 * (683/2048 ~= 1/3). Pointer-advance lines outside this excerpt. */
629 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
631 for (i=0; i < height; i++) {
632 for (j=0; j < width; j++) {
633 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Thirds-pel horizontal 2/3: round((a + 2*b)/3) in fixed point. */
640 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
642 for (i=0; i < height; i++) {
643 for (j=0; j < width; j++) {
644 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Thirds-pel vertical 1/3: round((2*top + bottom)/3) in fixed point. */
651 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
653 for (i=0; i < height; i++) {
654 for (j=0; j < width; j++) {
655 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Thirds-pel (1/3,1/3): 2-D weights 4,3,3,2 (sum 12), 2731/32768 ~= 1/12. */
662 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
664 for (i=0; i < height; i++) {
665 for (j=0; j < width; j++) {
666 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Thirds-pel (1/3,2/3): 2-D weights 3,2,4,3 (sum 12). */
673 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
675 for (i=0; i < height; i++) {
676 for (j=0; j < width; j++) {
677 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirds-pel vertical 2/3: round((top + 2*bottom)/3) in fixed point. */
684 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
686 for (i=0; i < height; i++) {
687 for (j=0; j < width; j++) {
688 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Thirds-pel (2/3,1/3): 2-D weights 3,4,2,3 (sum 12). */
695 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
697 for (i=0; i < height; i++) {
698 for (j=0; j < width; j++) {
699 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirds-pel (2/3,2/3): 2-D weights 2,3,3,4 (sum 12). */
706 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
708 for (i=0; i < height; i++) {
709 for (j=0; j < width; j++) {
710 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Averaging thirds-pel (0,0): average src into dst, dispatched on width
 * (switch header and closing braces outside this excerpt). */
717 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
719 case 2: avg_pixels2_c (dst, src, stride, height); break;
720 case 4: avg_pixels4_c (dst, src, stride, height); break;
721 case 8: avg_pixels8_c (dst, src, stride, height); break;
722 case 16:avg_pixels16_c(dst, src, stride, height); break;
/* Averaging variant of put_tpel_pixels_mc10_c: interpolated value is
 * rounded-up averaged with the existing dst pixel. */
726 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
728 for (i=0; i < height; i++) {
729 for (j=0; j < width; j++) {
730 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc20_c. */
737 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
739 for (i=0; i < height; i++) {
740 for (j=0; j < width; j++) {
741 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc01_c. */
748 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
750 for (i=0; i < height; i++) {
751 for (j=0; j < width; j++) {
752 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc11_c. */
759 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
761 for (i=0; i < height; i++) {
762 for (j=0; j < width; j++) {
763 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc12_c. */
770 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
772 for (i=0; i < height; i++) {
773 for (j=0; j < width; j++) {
774 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc02_c. */
781 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
783 for (i=0; i < height; i++) {
784 for (j=0; j < width; j++) {
785 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc21_c. */
792 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
794 for (i=0; i < height; i++) {
795 for (j=0; j < width; j++) {
796 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc22_c. */
803 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
805 for (i=0; i < height; i++) {
806 for (j=0; j < width; j++) {
807 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Generate fixed-width thirds-pel wrappers around the variable-width
 * put_tpel_pixels_mcXY_c() helpers above.
 * Fix: each wrapper body previously read
 *     void put_tpel_pixels_mcXY_c(dst, src, stride, width, height);
 * The stray "void " turned the intended call into a (K&R-style) function
 * declaration with an identifier list — a constraint violation outside a
 * definition — so the wrappers would not compile (or do nothing) if the
 * macro were ever instantiated. Removing "void " makes each wrapper
 * actually forward to its worker. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
835 #define QPEL_MC(r, OPNAME, RND, OP) \
836 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
837 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
841 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
842 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
843 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
844 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
845 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
846 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
847 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
848 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
854 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
856 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
860 const int src0= src[0*srcStride];\
861 const int src1= src[1*srcStride];\
862 const int src2= src[2*srcStride];\
863 const int src3= src[3*srcStride];\
864 const int src4= src[4*srcStride];\
865 const int src5= src[5*srcStride];\
866 const int src6= src[6*srcStride];\
867 const int src7= src[7*srcStride];\
868 const int src8= src[8*srcStride];\
869 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
870 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
871 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
872 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
873 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
874 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
875 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
876 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
882 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
883 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
888 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
889 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
890 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
891 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
892 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
893 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
894 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
895 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
896 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
897 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
898 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
899 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
900 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
901 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
902 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
903 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
909 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
910 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
915 const int src0= src[0*srcStride];\
916 const int src1= src[1*srcStride];\
917 const int src2= src[2*srcStride];\
918 const int src3= src[3*srcStride];\
919 const int src4= src[4*srcStride];\
920 const int src5= src[5*srcStride];\
921 const int src6= src[6*srcStride];\
922 const int src7= src[7*srcStride];\
923 const int src8= src[8*srcStride];\
924 const int src9= src[9*srcStride];\
925 const int src10= src[10*srcStride];\
926 const int src11= src[11*srcStride];\
927 const int src12= src[12*srcStride];\
928 const int src13= src[13*srcStride];\
929 const int src14= src[14*srcStride];\
930 const int src15= src[15*srcStride];\
931 const int src16= src[16*srcStride];\
932 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
933 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
934 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
935 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
936 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
937 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
938 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
939 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
940 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
941 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
942 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
943 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
944 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
945 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
946 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
947 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
953 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
955 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
956 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
959 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
960 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
963 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
965 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
966 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
969 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
972 copy_block9(full, src, 16, stride, 9);\
973 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
974 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
977 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
979 copy_block9(full, src, 16, stride, 9);\
980 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
983 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
986 copy_block9(full, src, 16, stride, 9);\
987 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
988 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
990 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
995 copy_block9(full, src, 16, stride, 9);\
996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
997 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
998 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
999 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1001 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1002 uint8_t full[16*9];\
1004 uint8_t halfHV[64];\
1005 copy_block9(full, src, 16, stride, 9);\
1006 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1007 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1008 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1009 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1011 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1012 uint8_t full[16*9];\
1015 uint8_t halfHV[64];\
1016 copy_block9(full, src, 16, stride, 9);\
1017 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1018 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1019 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1020 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1022 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1023 uint8_t full[16*9];\
1025 uint8_t halfHV[64];\
1026 copy_block9(full, src, 16, stride, 9);\
1027 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1028 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1029 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1030 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1032 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1033 uint8_t full[16*9];\
1036 uint8_t halfHV[64];\
1037 copy_block9(full, src, 16, stride, 9);\
1038 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1039 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1040 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1041 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1043 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1044 uint8_t full[16*9];\
1046 uint8_t halfHV[64];\
1047 copy_block9(full, src, 16, stride, 9);\
1048 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1049 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1050 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1051 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1053 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1054 uint8_t full[16*9];\
1057 uint8_t halfHV[64];\
1058 copy_block9(full, src, 16, stride, 9);\
1059 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1060 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1061 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1062 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1064 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1065 uint8_t full[16*9];\
1067 uint8_t halfHV[64];\
1068 copy_block9(full, src, 16, stride, 9);\
1069 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1070 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1071 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1072 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1074 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1076 uint8_t halfHV[64];\
1077 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1078 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1079 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1081 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1083 uint8_t halfHV[64];\
1084 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1085 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1086 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1088 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1089 uint8_t full[16*9];\
1092 uint8_t halfHV[64];\
1093 copy_block9(full, src, 16, stride, 9);\
1094 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1095 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1096 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1097 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1099 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1100 uint8_t full[16*9];\
1102 copy_block9(full, src, 16, stride, 9);\
1103 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1104 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1105 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1107 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1108 uint8_t full[16*9];\
1111 uint8_t halfHV[64];\
1112 copy_block9(full, src, 16, stride, 9);\
1113 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1114 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1115 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1116 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1118 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1119 uint8_t full[16*9];\
1121 copy_block9(full, src, 16, stride, 9);\
1122 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1123 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1124 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1126 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1128 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1129 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1132 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1134 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1135 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1138 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1139 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1142 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1144 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1145 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1148 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1149 uint8_t full[24*17];\
1151 copy_block17(full, src, 24, stride, 17);\
1152 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1153 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1156 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1157 uint8_t full[24*17];\
1158 copy_block17(full, src, 24, stride, 17);\
1159 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1162 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1163 uint8_t full[24*17];\
1165 copy_block17(full, src, 24, stride, 17);\
1166 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1167 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1169 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1170 uint8_t full[24*17];\
1171 uint8_t halfH[272];\
1172 uint8_t halfV[256];\
1173 uint8_t halfHV[256];\
1174 copy_block17(full, src, 24, stride, 17);\
1175 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1176 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1177 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1178 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1180 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1181 uint8_t full[24*17];\
1182 uint8_t halfH[272];\
1183 uint8_t halfHV[256];\
1184 copy_block17(full, src, 24, stride, 17);\
1185 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1186 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1187 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1188 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1190 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1191 uint8_t full[24*17];\
1192 uint8_t halfH[272];\
1193 uint8_t halfV[256];\
1194 uint8_t halfHV[256];\
1195 copy_block17(full, src, 24, stride, 17);\
1196 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1197 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1198 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1199 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1201 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1202 uint8_t full[24*17];\
1203 uint8_t halfH[272];\
1204 uint8_t halfHV[256];\
1205 copy_block17(full, src, 24, stride, 17);\
1206 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1207 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1208 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1209 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1211 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1212 uint8_t full[24*17];\
1213 uint8_t halfH[272];\
1214 uint8_t halfV[256];\
1215 uint8_t halfHV[256];\
1216 copy_block17(full, src, 24, stride, 17);\
1217 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1218 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1219 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1220 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1222 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1223 uint8_t full[24*17];\
1224 uint8_t halfH[272];\
1225 uint8_t halfHV[256];\
1226 copy_block17(full, src, 24, stride, 17);\
1227 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1228 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1229 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1230 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1232 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1233 uint8_t full[24*17];\
1234 uint8_t halfH[272];\
1235 uint8_t halfV[256];\
1236 uint8_t halfHV[256];\
1237 copy_block17(full, src, 24, stride, 17);\
1238 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1239 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1240 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1241 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1243 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1244 uint8_t full[24*17];\
1245 uint8_t halfH[272];\
1246 uint8_t halfHV[256];\
1247 copy_block17(full, src, 24, stride, 17);\
1248 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1249 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1250 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1251 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1253 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1254 uint8_t halfH[272];\
1255 uint8_t halfHV[256];\
1256 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1257 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1258 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1260 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1261 uint8_t halfH[272];\
1262 uint8_t halfHV[256];\
1263 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1264 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1265 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1267 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1268 uint8_t full[24*17];\
1269 uint8_t halfH[272];\
1270 uint8_t halfV[256];\
1271 uint8_t halfHV[256];\
1272 copy_block17(full, src, 24, stride, 17);\
1273 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1274 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1275 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1276 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1278 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1279 uint8_t full[24*17];\
1280 uint8_t halfH[272];\
1281 copy_block17(full, src, 24, stride, 17);\
1282 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1283 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1284 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1286 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1287 uint8_t full[24*17];\
1288 uint8_t halfH[272];\
1289 uint8_t halfV[256];\
1290 uint8_t halfHV[256];\
1291 copy_block17(full, src, 24, stride, 17);\
1292 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1293 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1294 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1295 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1297 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1298 uint8_t full[24*17];\
1299 uint8_t halfH[272];\
1300 copy_block17(full, src, 24, stride, 17);\
1301 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1302 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1303 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1305 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1306 uint8_t halfH[272];\
1307 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1308 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Rounding/non-rounding pixel store operators used by the QPEL_MC macro:
 * (b) is a 20-bit-ish filter accumulator; +16>>5 rounds, +15>>5 truncates
 * ("no_rnd"), and cm[] clips the result to 0..255. */
1311 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1312 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1313 #define op_put(a, b) a = cm[((b) + 16)>>5]
1314 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the full qpel MC function families (put, put_no_rnd, avg). */
1316 QPEL_MC(0, put_ , _ , op_put)
1317 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1318 QPEL_MC(0, avg_ , _ , op_avg)
1319 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1321 #undef op_avg_no_rnd
1323 #undef op_put_no_rnd
/* The (0,0) qpel cases are plain block copies/averages, so alias them to
 * the generic pixel helpers instead of generating dedicated functions. */
1325 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1326 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1327 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1328 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1329 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1330 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/*
 * WMV2 mspel horizontal lowpass: each output pixel is the 4-tap filter
 * (-1, 9, 9, -1)/16 of its horizontal neighbours, rounded (+8>>4) and
 * clipped via the crop table. Reads src[-1]..src[9] per row.
 * NOTE(review): the source dump is missing the per-row loop and the
 * dst/src stride-advance lines for this function — comments only added.
 */
1332 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1333 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* 8 unrolled taps for one row of the 8-pixel-wide block */
1337 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1338 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1339 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1340 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1341 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1342 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1343 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1344 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1350 #if CONFIG_RV40_DECODER
/* RV40 treats motion vector position (3,3) as a plain half-pel (xy2)
 * interpolation, so these wrappers just forward to the xy2 helpers.
 * NOTE(review): closing braces of each wrapper are elided in this dump. */
1351 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1352 put_pixels16_xy2_c(dst, src, stride, 16);
1354 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1355 avg_pixels16_xy2_c(dst, src, stride, 16);
1357 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1358 put_pixels8_xy2_c(dst, src, stride, 8);
1360 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1361 avg_pixels8_xy2_c(dst, src, stride, 8);
1363 #endif /* CONFIG_RV40_DECODER */
/*
 * WMV2 mspel vertical lowpass: same (-1, 9, 9, -1)/16 filter as the
 * horizontal version, applied down a column (w columns wide).
 * Reads rows -1..9 relative to src for each column.
 * NOTE(review): the per-column loop and the src++/dst++ advance lines
 * are missing from this dump — comments only added.
 */
1365 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1366 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* load the 11 vertically adjacent samples needed for 8 outputs */
1370 const int src_1= src[ -srcStride];
1371 const int src0 = src[0 ];
1372 const int src1 = src[ srcStride];
1373 const int src2 = src[2*srcStride];
1374 const int src3 = src[3*srcStride];
1375 const int src4 = src[4*srcStride];
1376 const int src5 = src[5*srcStride];
1377 const int src6 = src[6*srcStride];
1378 const int src7 = src[7*srcStride];
1379 const int src8 = src[8*srcStride];
1380 const int src9 = src[9*srcStride];
/* 8 filtered outputs for this column */
1381 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1382 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1383 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1384 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1385 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1386 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1387 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1388 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation wrappers. The mcXY suffix encodes the
 * sub-pel position: X = horizontal phase, Y = vertical phase. Positions
 * with both phases set combine an h-lowpass intermediate with a v-lowpass
 * pass and average the two candidates via put_pixels8_l2().
 * NOTE(review): local half/halfH/halfV/halfHV array declarations and the
 * closing braces were elided from this dump — comments only added. */
1394 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1396 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1397 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
/* (2,0): pure horizontal filter straight into dst */
1400 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1401 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* (3,0): like (1,0) but averaged with the pixel to the right */
1404 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1406 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1407 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
/* (0,2): pure vertical filter straight into dst */
1410 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1411 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* (1,2): h-filter 11 rows starting one above, v-filter both, then average */
1414 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1418 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1419 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1420 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1421 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* (3,2): same as (1,2) but with the right-hand column (src+1) */
1423 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1427 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1428 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1429 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1430 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* (2,2): h-filter then v-filter, no averaging needed */
1432 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1434 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1435 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/*
 * H.263 in-loop deblocking filter across a horizontal block edge
 * (filters vertically: rows -2..+1 around the edge at 'src').
 * NOTE(review): the x loop, the d1/d2/ad1 declarations, the p1-=d1/p2+=d1
 * updates and closing braces are missing from this dump — comments only.
 */
1438 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1439 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1441 const int strength= ff_h263_loop_filter_strength[qscale];
/* p0..p3: the four pixels straddling the edge in this column */
1445 int p0= src[x-2*stride];
1446 int p1= src[x-1*stride];
1447 int p2= src[x+0*stride];
1448 int p3= src[x+1*stride];
1449 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* piecewise-linear "tent" clipping of the correction d into d1:
 * zero outside [-2s, 2s], ramping up to +/-s in the middle */
1451 if (d<-2*strength) d1= 0;
1452 else if(d<- strength) d1=-2*strength - d;
1453 else if(d< strength) d1= d;
1454 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clamp of p1/p2 to 0..255 after applying d1 */
1459 if(p1&256) p1= ~(p1>>31);
1460 if(p2&256) p2= ~(p2>>31);
1462 src[x-1*stride] = p1;
1463 src[x+0*stride] = p2;
/* secondary, weaker correction of the outer pixels p0/p3 */
1467 d2= av_clip((p0-p3)/4, -ad1, ad1);
1469 src[x-2*stride] = p0 - d2;
1470 src[x+ stride] = p3 + d2;
/*
 * H.263 in-loop deblocking filter across a vertical block edge
 * (filters horizontally: columns -2..+1 around the edge at 'src').
 * Mirror image of h263_v_loop_filter_c; see the comments there.
 * NOTE(review): the y loop, d1/d2/ad1 declarations, p1/p2 update lines
 * and closing braces are missing from this dump — comments only.
 */
1475 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1476 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1478 const int strength= ff_h263_loop_filter_strength[qscale];
1482 int p0= src[y*stride-2];
1483 int p1= src[y*stride-1];
1484 int p2= src[y*stride+0];
1485 int p3= src[y*stride+1];
1486 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* tent-shaped clipping of the correction, as in the vertical filter */
1488 if (d<-2*strength) d1= 0;
1489 else if(d<- strength) d1=-2*strength - d;
1490 else if(d< strength) d1= d;
1491 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clamp to 0..255 */
1496 if(p1&256) p1= ~(p1>>31);
1497 if(p2&256) p2= ~(p2>>31);
1499 src[y*stride-1] = p1;
1500 src[y*stride+0] = p2;
1504 d2= av_clip((p0-p3)/4, -ad1, ad1);
1506 src[y*stride-2] = p0 - d2;
1507 src[y*stride+1] = p3 + d2;
/*
 * H.261 loop filter: separable (1,2,1)/4 smoothing of an 8x8 block,
 * with the border rows/columns passed through unfiltered (hence the
 * 4*src scaling of row 0/7 into temp and the +2>>2 / +8>>4 rounding).
 * NOTE(review): the loop headers, yz bookkeeping and closing braces are
 * missing from this dump — comments only added.
 */
1512 static void h261_loop_filter_c(uint8_t *src, int stride){
/* top and bottom rows copied (pre-scaled by 4 to match filter gain) */
1517 temp[x ] = 4*src[x ];
1518 temp[x + 7*8] = 4*src[x + 7*stride];
/* vertical (1,2,1) pass into temp */
1522 xy = y * stride + x;
1524 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
/* left/right columns written back with only the vertical pass applied */
1529 src[ y*stride] = (temp[ y*8] + 2)>>2;
1530 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
/* horizontal (1,2,1) pass and final rounding for the interior */
1532 xy = y * stride + x;
1534 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/**
 * Sum of absolute differences (SAD) between two 16-pixel-wide blocks.
 *
 * @param v         unused context pointer (me_cmp_func signature)
 * @param pix1,pix2 top-left corners of the two blocks
 * @param line_size byte stride between rows
 * @param h         number of rows to compare
 * @return the SAD over the 16 x h region
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        /* one 16-pixel row; the original unrolled this by hand */
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/*
 * SAD of pix1 against pix2 interpolated a half pixel to the right
 * (avg2 of each pixel with its right neighbour; reads pix2[0..16]).
 * NOTE(review): the s accumulator declaration, the row loop and the
 * pointer-advance/return lines are missing from this dump.
 */
1567 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1573 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1574 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1575 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1576 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1577 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1578 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1579 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1580 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1581 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1582 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1583 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1584 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1585 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1586 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1587 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1588 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/*
 * SAD of pix1 against pix2 interpolated a half pixel downwards
 * (avg2 of each pixel with the one in the next row, via pix3).
 * NOTE(review): accumulator declaration, row loop and pointer-advance/
 * return lines are missing from this dump.
 */
1595 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1598 uint8_t *pix3 = pix2 + line_size;
1602 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1603 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1604 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1605 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1606 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1607 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1608 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1609 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1610 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1611 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1612 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1613 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1614 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1615 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1616 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1617 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/*
 * SAD of pix1 against pix2 interpolated half a pixel both right and down
 * (avg4 of a 2x2 neighbourhood; reads pix2/pix3[0..16]).
 * NOTE(review): accumulator declaration, row loop and pointer-advance/
 * return lines are missing from this dump.
 */
1625 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1628 uint8_t *pix3 = pix2 + line_size;
1632 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1633 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1634 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1635 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1636 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1637 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1638 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1639 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1640 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1641 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1642 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1643 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1644 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1645 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1646 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1647 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/**
 * Sum of absolute differences (SAD) between two 8-pixel-wide blocks.
 *
 * @param v         unused context pointer (me_cmp_func signature)
 * @param pix1,pix2 top-left corners of the two blocks
 * @param line_size byte stride between rows
 * @param h         number of rows to compare
 * @return the SAD over the 8 x h region
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        /* one 8-pixel row; the original unrolled this by hand */
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/*
 * 8-wide SAD against half-pel-right interpolated reference (avg2 with the
 * right neighbour). NOTE(review): accumulator declaration, row loop and
 * return lines are missing from this dump.
 */
1675 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1681 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1682 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1683 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1684 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1685 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1686 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1687 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1688 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/*
 * 8-wide SAD against half-pel-down interpolated reference (avg2 with the
 * pixel in the next row, via pix3). Same elided structure as above.
 */
1695 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1698 uint8_t *pix3 = pix2 + line_size;
1702 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1703 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1704 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1705 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1706 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1707 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1708 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1709 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/*
 * 8-wide SAD against the reference interpolated half a pixel right and
 * down (avg4 of a 2x2 neighbourhood). NOTE(review): accumulator
 * declaration, row loop and return lines are missing from this dump.
 */
1717 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1720 uint8_t *pix3 = pix2 + line_size;
1724 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1725 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1726 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1727 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1728 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1729 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1730 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1731 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/*
 * Noise-preserving SSE, 16 wide: plain SSE (score1) minus a weighted
 * difference of local 2x2 gradients (score2), so that matching noise
 * texture is penalised less than blurring it away.
 * NOTE(review): score/loop declarations, the row loops and the stride
 * advance lines are missing from this dump — comments only added.
 */
1739 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1740 MpegEncContext *c = v;
/* squared pixel differences */
1746 for(x=0; x<16; x++){
1747 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ])
/* difference of the 2x2 gradient magnitudes of the two blocks */
1750 for(x=0; x<15; x++){
1751 score2+= FFABS( s1[x ] - s1[x +stride]
1752 - s1[x+1] + s1[x+1+stride])
1753 -FFABS( s2[x ] - s2[x +stride]
1754 - s2[x+1] + s2[x+1+stride]);
/* weight of the texture term: per-context nsse_weight, default 8 */
1761 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1762 else return score1 + FFABS(score2)*8;
/*
 * Noise-preserving SSE, 8-wide variant of nsse16_c; see comments there.
 * NOTE(review): the loop headers and declarations are elided in this dump.
 */
1765 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1766 MpegEncContext *c = v;
1773 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1777 score2+= FFABS( s1[x ] - s1[x +stride]
1778 - s1[x+1] + s1[x+1+stride])
1779 -FFABS( s2[x ] - s2[x +stride]
1780 - s2[x+1] + s2[x+1+stride]);
1787 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1788 else return score1 + FFABS(score2)*8;
/*
 * Evaluate the weighted squared error that would remain if 'basis' scaled
 * by 'scale' were added to the residual 'rem' (used by the trellis-like
 * dct coefficient refinement in the encoder).
 * NOTE(review): the declarations of i/sum and the return are elided here.
 */
1791 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1795 for(i=0; i<8*8; i++){
/* rounded fixed-point add of the scaled basis function */
1796 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1799 assert(-512<b && b<512);
/* accumulate perceptually weighted squared error */
1801 sum += (w*b)*(w*b)>>4;
/*
 * Add 'basis' scaled by 'scale' into the residual 'rem', with the same
 * rounded fixed-point arithmetic as try_8x8basis_c.
 */
1806 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1809 for(i=0; i<8*8; i++){
1810 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
/* NOTE(review): the opening line of this doxygen comment was dropped by
 * the extraction; the body below is otherwise intact. */
1815 * permutes an 8x8 block.
1816 * @param block the block which will be permuted according to the given permutation vector
1817 * @param permutation the permutation vector
1818 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1819 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1820 * (inverse) permutated to scantable order!
1822 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1828 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* first pass: save the coefficients that will be moved (into temp) —
 * NOTE(review): the temp declaration and copy/clear lines are elided */
1830 for(i=0; i<=last; i++){
1831 const int j= scantable[i];
/* second pass: scatter the saved coefficients to their permuted slots */
1836 for(i=0; i<=last; i++){
1837 const int j= scantable[i];
1838 const int perm_j= permutation[j];
1839 block[perm_j]= temp[j];
/* Dummy comparison function that always returns 0 (FF_CMP_ZERO). */
1843 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/*
 * Fill the cmp[0..5] function-pointer array (16x16, 8x8, ... sizes) with
 * the comparison functions selected by 'type' (FF_CMP_*).
 * NOTE(review): the switch statement and most case labels are elided in
 * this dump; only a few assignments remain visible.
 */
1847 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1850 memset(cmp, 0, sizeof(void*)*6);
1858 cmp[i]= c->hadamard8_diff[i];
1864 cmp[i]= c->dct_sad[i];
1867 cmp[i]= c->dct264_sad[i];
1870 cmp[i]= c->dct_max[i];
1873 cmp[i]= c->quant_psnr[i];
/* unknown FF_CMP_* value: report and leave cmp[] zeroed */
1902 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/**
 * Bytewise dst[i] += src[i] (mod 256) for i in [0, w).
 *
 * Bulk of the work is done one machine word at a time with a SWAR trick:
 * masking each byte to 7 bits, adding, then restoring the high bits from
 * the xor keeps the per-byte carries from leaking into neighbours.
 *
 * Fixes vs the original:
 *  - the loop bound `i <= w - sizeof(long)` promoted w to unsigned, so
 *    w < sizeof(long) underflowed and read/wrote out of bounds; the
 *    rewritten bound `i + (int)sizeof(long) <= w` cannot underflow;
 *  - word access goes through memcpy instead of casting uint8_t* to
 *    long*, avoiding unaligned loads and strict-aliasing UB.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    /* SWAR masks: 0x7f / 0x80 replicated into every byte of a long */
    const unsigned long m7f = ~0UL / 255 * 0x7f;
    const unsigned long m80 = ~0UL / 255 * 0x80;
    int i;
    for (i = 0; i + (int)sizeof(long) <= w; i += sizeof(long)) {
        unsigned long a, b, r;
        memcpy(&a, src + i, sizeof a);
        memcpy(&b, dst + i, sizeof b);
        r = ((a & m7f) + (b & m7f)) ^ ((a ^ b) & m80);
        memcpy(dst + i, &r, sizeof r);
    }
    /* scalar tail for the remaining w % sizeof(long) bytes */
    for (; i < w; i++)
        dst[i] += src[i];
}
/**
 * Bytewise dst[i] = src1[i] + src2[i] (mod 256) for i in [0, w).
 *
 * Word-at-a-time SWAR addition as in add_bytes_c; see the comments there.
 * Fixes vs the original: overflow-safe loop bound (the old
 * `i <= w - sizeof(long)` underflowed for w < sizeof(long)) and
 * memcpy-based word access instead of unaligned long* casts.
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    const unsigned long m7f = ~0UL / 255 * 0x7f; /* 0x7f in every byte */
    const unsigned long m80 = ~0UL / 255 * 0x80; /* 0x80 in every byte */
    int i;
    for (i = 0; i + (int)sizeof(long) <= w; i += sizeof(long)) {
        unsigned long a, b, r;
        memcpy(&a, src1 + i, sizeof a);
        memcpy(&b, src2 + i, sizeof b);
        r = ((a & m7f) + (b & m7f)) ^ ((a ^ b) & m80);
        memcpy(dst + i, &r, sizeof r);
    }
    /* scalar tail */
    for (; i < w; i++)
        dst[i] = src1[i] + src2[i];
}
/**
 * Bytewise dst[i] = src1[i] - src2[i] (mod 256) for i in [0, w).
 *
 * On targets without fast unaligned access, an unaligned src2 falls back
 * to an unrolled byte loop; otherwise the subtraction is done a machine
 * word at a time with the SWAR borrow-isolation trick
 * ((a|0x80..) - (b&0x7f..)) ^ ((a^b^0x80..)&0x80..).
 *
 * Fixes vs the original:
 *  - `i <= w - sizeof(long)` promoted w to unsigned and underflowed for
 *    w < sizeof(long), running the word loop out of bounds; replaced by
 *    the overflow-safe `i + (int)sizeof(long) <= w`;
 *  - word access via memcpy instead of long* casts (alignment/aliasing).
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i = 0;
#if !HAVE_FAST_UNALIGNED
    if ((intptr_t)src2 & (sizeof(long) - 1)) {
        /* unaligned src2: 8x-unrolled scalar loop */
        for (; i + 7 < w; i += 8) {
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    } else
#endif
    {
        const unsigned long m7f = ~0UL / 255 * 0x7f; /* 0x7f per byte */
        const unsigned long m80 = ~0UL / 255 * 0x80; /* 0x80 per byte */
        for (; i + (int)sizeof(long) <= w; i += sizeof(long)) {
            unsigned long a, b, r;
            memcpy(&a, src1 + i, sizeof a);
            memcpy(&b, src2 + i, sizeof b);
            r = ((a | m80) - (b & m7f)) ^ ((a ^ b ^ m80) & m80);
            memcpy(dst + i, &r, sizeof r);
        }
    }
    /* scalar tail */
    for (; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
/*
 * HuffYUV median predictor, decode side: reconstruct each byte as
 * median(left, above, left+above-above_left) + residual.
 * NOTE(review): loop header, l/lt bookkeeping, dst store and the *left/
 * *left_top write-back lines are missing from this dump.
 */
1954 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1962 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* Encode-side counterpart: emit residual = pixel - median prediction.
 * Same elided structure as above. */
1971 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1979 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Left-prediction decode: running sum of residuals; returns the new
 * accumulator. NOTE(review): body largely elided in this dump. */
1989 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1992 for(i=0; i<w-1; i++){
/* BGR32 variant carrying separate r/g/b/a accumulators — body elided. */
2019 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard transform building blocks. BUTTERFLY2 writes sum/difference of
 * two inputs into o1/o2; BUTTERFLY1 does the same in place on x/y;
 * BUTTERFLYA returns |x+y| + |x-y|.
 * NOTE(review): the continuation bodies of BUTTERFLY2/BUTTERFLY1 were
 * dropped by the extraction (both lines end in '\'); no comment lines are
 * inserted between them to avoid altering macro continuation. */
2049 #define BUTTERFLY2(o1,o2,i1,i2) \
2053 #define BUTTERFLY1(x,y) \
2062 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/*
 * SATD: 8x8 Hadamard transform of the difference src-dst, summing the
 * absolute transform coefficients. Rows are transformed first (three
 * butterfly stages over temp[8*i+...]), then columns, with the last
 * column stage folded into the BUTTERFLYA absolute-sum.
 * NOTE(review): the temp/sum declarations, loop headers and the trailing
 * debug/return lines are partially elided in this dump.
 */
2064 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2072 //FIXME try pointer walks
/* stage 1: pairwise sums/differences of the difference row */
2073 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2074 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2075 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2076 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
/* stage 2 (distance 2) */
2078 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2079 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2080 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2081 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
/* stage 3 (distance 4) completes the row transform */
2083 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2084 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2085 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2086 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* column transform, stages 1 and 2 */
2090 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2091 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2092 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2093 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2095 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2096 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2097 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2098 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* last column stage fused with the absolute-value accumulation */
2101 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2102 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2103 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2104 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2110 printf("MAX:%d\n", maxi);
/*
 * Intra SATD: identical Hadamard pipeline to hadamard8_diff8x8_c but fed
 * with raw src pixels (no reference), and with the DC term subtracted at
 * the end so the score measures AC energy only.
 * NOTE(review): declarations, loop headers and the return are partially
 * elided in this dump.
 */
2116 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2124 //FIXME try pointer walks
/* row transform, three butterfly stages */
2125 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2126 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2127 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2128 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2130 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2131 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2132 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2133 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2135 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2136 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2137 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2138 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* column transform */
2142 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2143 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2144 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2145 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2147 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2148 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2149 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2150 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2153 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2154 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2155 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2156 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* remove the DC coefficient so intra blocks compare on AC energy */
2159 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/*
 * DCT-domain SAD: forward-DCT the 8x8 pixel difference and sum the
 * absolute coefficient values.
 * NOTE(review): the assert/fdct call and closing brace are elided here.
 */
2164 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2165 MpegEncContext * const s= (MpegEncContext *)c;
2166 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2170 s->dsp.diff_pixels(temp, src1, src2, stride);
2172 return s->dsp.sum_abs_dctelem(temp);
/* Interior of the DCT8_1D macro: one 8-point H.264-style integer DCT pass
 * (even half from pairwise sums s07..s34, odd half from differences
 * d07..d34 with the >>1 scaled cross terms).
 * NOTE(review): the '#define DCT8_1D ... \' opening line and the DST(0)/
 * DST(4) lines were dropped by the extraction; every remaining line ends
 * in '\', so no comments are inserted between them. */
2177 const int s07 = SRC(0) + SRC(7);\
2178 const int s16 = SRC(1) + SRC(6);\
2179 const int s25 = SRC(2) + SRC(5);\
2180 const int s34 = SRC(3) + SRC(4);\
2181 const int a0 = s07 + s34;\
2182 const int a1 = s16 + s25;\
2183 const int a2 = s07 - s34;\
2184 const int a3 = s16 - s25;\
2185 const int d07 = SRC(0) - SRC(7);\
2186 const int d16 = SRC(1) - SRC(6);\
2187 const int d25 = SRC(2) - SRC(5);\
2188 const int d34 = SRC(3) - SRC(4);\
2189 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2190 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2191 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2192 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2194 DST(1, a4 + (a7>>2)) ;\
2195 DST(2, a2 + (a3>>1)) ;\
2196 DST(3, a5 + (a6>>2)) ;\
2198 DST(5, a6 - (a5>>2)) ;\
2199 DST(6, (a2>>1) - a3 ) ;\
2200 DST(7, (a4>>2) - a7 ) ;\
/*
 * H.264-transform SAD: apply DCT8_1D to the rows (SRC/DST over dct[i][x])
 * then to the columns, summing |coefficient| in the second pass via the
 * redefined DST macro.
 * NOTE(review): the dct[][] declaration, the #undef lines and the return
 * are elided in this dump.
 */
2203 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2204 MpegEncContext * const s= (MpegEncContext *)c;
2209 s->dsp.diff_pixels(dct[0], src1, src2, stride);
/* row pass: transform in place */
2211 #define SRC(x) dct[i][x]
2212 #define DST(x,v) dct[i][x]= v
2213 for( i = 0; i < 8; i++ )
/* column pass: DST now accumulates absolute values instead of storing */
2218 #define SRC(x) dct[x][i]
2219 #define DST(x,v) sum += FFABS(v)
2220 for( i = 0; i < 8; i++ )
/*
 * DCT-domain maximum: forward-DCT the difference block and return the
 * largest absolute coefficient.
 * NOTE(review): the fdct call, the i loop and the return are elided here.
 */
2228 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2229 MpegEncContext * const s= (MpegEncContext *)c;
2230 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2235 s->dsp.diff_pixels(temp, src1, src2, stride);
2239 sum= FFMAX(sum, FFABS(temp[i]));
/*
 * Quantization-noise metric: DCT the difference, keep a copy, run the
 * encoder's quantize/dequantize/idct round trip, and return the squared
 * error between the round-tripped and original coefficients.
 * NOTE(review): the dct/loop lines and return are partially elided here.
 */
2244 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2245 MpegEncContext * const s= (MpegEncContext *)c;
2246 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2247 DCTELEM * const bak = temp+64;
2253 s->dsp.diff_pixels(temp, src1, src2, stride);
/* keep the untouched coefficients for comparison */
2255 memcpy(bak, temp, 64*sizeof(DCTELEM));
2257 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2258 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2259 ff_simple_idct(temp); //FIXME
/* accumulate squared coefficient error */
2262 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/*
 * Rate-distortion comparison: quantize the difference block, estimate the
 * bit cost of coding it with the appropriate AC VLC tables, reconstruct
 * it, and combine reconstruction SSE with a lambda-weighted bit cost.
 * NOTE(review): several lines (start_i/bits setup, run/level loop
 * internals, closing braces) are elided from this dump.
 */
2267 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2268 MpegEncContext * const s= (MpegEncContext *)c;
2269 const uint8_t *scantable= s->intra_scantable.permutated;
2270 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2271 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2272 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2273 int i, last, run, bits, level, distortion, start_i;
2274 const int esc_length= s->ac_esc_length;
2276 uint8_t * last_length;
/* work on local stride-8 copies so the idct_add below cannot write back
 * into the caller's frame */
2280 copy_block8(lsrc1, src1, 8, stride, 8);
2281 copy_block8(lsrc2, src2, 8, stride, 8);
2283 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2285 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: DC coded separately with the luma DC VLC */
2291 length = s->intra_ac_vlc_length;
2292 last_length= s->intra_ac_vlc_last_length;
2293 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2296 length = s->inter_ac_vlc_length;
2297 last_length= s->inter_ac_vlc_last_length;
/* run/level bit-cost estimation over the scan order; levels outside
 * -64..63 fall back to the escape-code length */
2302 for(i=start_i; i<last; i++){
2303 int j= scantable[i];
2308 if((level&(~127)) == 0){
2309 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the final coefficient uses the "last" VLC table */
2318 level= temp[i] + 64;
2322 if((level&(~127)) == 0){
2323 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* reconstruct and measure distortion against the original pixels */
2331 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2333 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2336 s->dsp.idct_add(lsrc2, 8, temp);
2338 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
/* lambda ~= qscale^2 * 109/128 per bit */
2340 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Compare metric: pure VLC bit cost of coding the quantized 8x8
 * difference block (same rate model as rd8x8_c, without the distortion
 * term). (Some lines elided.) */
2343 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2344 MpegEncContext * const s= (MpegEncContext *)c;
2345 const uint8_t *scantable= s->intra_scantable.permutated;
2346 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2347 int i, last, run, bits, level, start_i;
2348 const int esc_length= s->ac_esc_length;
2350 uint8_t * last_length;
2354 s->dsp.diff_pixels(temp, src1, src2, stride);
2356 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: DC coded separately via luma DC table */
2362 length = s->intra_ac_vlc_length;
2363 last_length= s->intra_ac_vlc_last_length;
2364 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter path */
2367 length = s->inter_ac_vlc_length;
2368 last_length= s->inter_ac_vlc_last_length;
/* sum VLC lengths over coefficients in scan order */
2373 for(i=start_i; i<last; i++){
2374 int j= scantable[i];
2379 if((level&(~127)) == 0){ /* level fits the unified VLC table */
2380 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2389 level= temp[i] + 64; /* bias for symmetric table indexing */
2393 if((level&(~127)) == 0){
2394 bits+= last_length[UNI_AC_ENC_INDEX(run, level)]; /* terminal coefficient */
/* Generates vsad_intra<size>_c: vertical SAD within one picture — sum of
 * absolute differences between each row and the row below it, processed
 * four pixels at a time. (Macro body partially elided.) */
2402 #define VSAD_INTRA(size) \
2403 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2407 for(y=1; y<h; y++){ \
2408 for(x=0; x<size; x+=4){ \
2409 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2410 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* Vertical SAD of the residual s1-s2: measures how much the prediction
 * error changes from one row to the next (16 pixels wide). */
2420 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2425 for(x=0; x<16; x++){
2426 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ(a): square of a (argument fully parenthesized). */
2435 #define SQ(a) ((a)*(a))
/* Generates vsse_intra<size>_c: vertical sum of squared differences
 * between adjacent rows of one picture. (Macro body partially elided.) */
2436 #define VSSE_INTRA(size) \
2437 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2441 for(y=1; y<h; y++){ \
2442 for(x=0; x<size; x+=4){ \
2443 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
2444 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* Vertical SSE of the residual s1-s2 (16 pixels wide): squared version
 * of vsad16_c. */
2454 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2459 for(x=0; x<16; x++){
2460 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 and an int16 vector of the
 * given size. */
2469 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
2473 for(i=0; i<size; i++)
2474 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Build 16x16 versions of each 8x8 compare function by summing the
 * metric over the four 8x8 quadrants (WRAPPER8_16_SQ is defined earlier
 * in this file). */
2478 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2479 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2480 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2482 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2484 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2485 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2486 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2487 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Element-wise float multiply: dst[i] = src0[i] * src1[i]. */
2489 static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
2491 for(i=0; i<len; i++)
2492 dst[i] = src0[i] * src1[i];
/* Element-wise multiply with src1 read backwards: dst[i] = src0[i] *
 * src1[-i]. NOTE(review): callers are expected to pass src1 pointing at
 * the last element of its buffer — confirm at call sites. */
2495 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
2498 for(i=0; i<len; i++)
2499 dst[i] = src0[i] * src1[-i];
/* Fused multiply-add over vectors: dst[i] = src0[i]*src1[i] + src2[i]. */
2502 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
2504 for(i=0; i<len; i++)
2505 dst[i] = src0[i] * src1[i] + src2[i];
/* Overlap-add windowing (MDCT-style): combines src0/src1 with the
 * symmetric window win, writing mirrored output pairs dst[i]/dst[j].
 * (Pointer-offset setup lines are elided in this listing.) */
2508 static void vector_fmul_window_c(float *dst, const float *src0,
2509 const float *src1, const float *win, int len)
2515 for(i=-len, j=len-1; i<0; i++, j--) {
2520 dst[i] = s0*wj - s1*wi; /* first half: windowed difference */
2521 dst[j] = s0*wi + s1*wj; /* mirrored half: windowed sum */
/* Scale a float vector by a scalar: dst[i] = src[i] * mul. */
2525 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
2529 for (i = 0; i < len; i++)
2530 dst[i] = src[i] * mul;
/* Multiply src by per-pair vectors from sv[] and a scalar: for each pair
 * of outputs, one sv entry supplies two coefficients, then sv advances. */
2533 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
2534 const float **sv, float mul, int len)
2537 for (i = 0; i < len; i += 2, sv++) {
2538 dst[i ] = src[i ] * sv[0][0] * mul;
2539 dst[i+1] = src[i+1] * sv[0][1] * mul;
/* Same as vector_fmul_sv_scalar_2_c but in groups of four: each sv entry
 * supplies four coefficients per output quad. */
2543 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
2544 const float **sv, float mul, int len)
2547 for (i = 0; i < len; i += 4, sv++) {
2548 dst[i ] = src[i ] * sv[0][0] * mul;
2549 dst[i+1] = src[i+1] * sv[0][1] * mul;
2550 dst[i+2] = src[i+2] * sv[0][2] * mul;
2551 dst[i+3] = src[i+3] * sv[0][3] * mul;
/* Scale per-pair vectors from sv[] by mul (no src input): each sv entry
 * yields two scaled outputs. */
2555 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
2559 for (i = 0; i < len; i += 2, sv++) {
2560 dst[i ] = sv[0][0] * mul;
2561 dst[i+1] = sv[0][1] * mul;
/* Four-wide variant of sv_fmul_scalar_2_c: each sv entry yields four
 * scaled outputs. */
2565 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
2569 for (i = 0; i < len; i += 4, sv++) {
2570 dst[i ] = sv[0][0] * mul;
2571 dst[i+1] = sv[0][1] * mul;
2572 dst[i+2] = sv[0][2] * mul;
2573 dst[i+3] = sv[0][3] * mul;
/* In-place butterfly: replaces (v1[i], v2[i]) with their sum and
 * difference; restrict promises the two vectors do not alias. (Store
 * lines elided in this listing.) */
2577 static void butterflies_float_c(float *restrict v1, float *restrict v2,
2581 for (i = 0; i < len; i++) {
2582 float t = v1[i] - v2[i];
/* Dot product of two float vectors (accumulation/return elided in this
 * listing). */
2588 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
2593 for (i = 0; i < len; i++)
/* Clip one float (carried in its uint32_t bit pattern) against a min
 * with positive sign and a max with negative sign; the sign-flipped
 * comparison exploits the IEEE-754 bit layout.
 * NOTE(review): 1<<31 left-shifts into the sign bit of a signed int,
 * which is undefined in strict C — 1U<<31 would be well-defined; confirm
 * and fix alongside the identical pattern in the caller. */
2599 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
2600 uint32_t maxi, uint32_t maxisign)
2603 if(a > mini) return mini;
2604 else if((a^(1<<31)) > maxisign) return maxi;
/* Clip a float vector when min < 0 < max, operating on the raw IEEE-754
 * bit patterns via clipf_c_one; unrolled 8x, so len is assumed to be a
 * multiple of 8 (TODO confirm with callers).
 * NOTE(review): the uint32_t* casts of float buffers type-pun through
 * incompatible pointers (strict-aliasing concern in portable C). */
2608 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
2610 uint32_t mini = *(uint32_t*)min;
2611 uint32_t maxi = *(uint32_t*)max;
2612 uint32_t maxisign = maxi ^ (1<<31); /* max with sign bit flipped, see clipf_c_one */
2613 uint32_t *dsti = (uint32_t*)dst;
2614 const uint32_t *srci = (const uint32_t*)src;
2615 for(i=0; i<len; i+=8) {
2616 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
2617 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
2618 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
2619 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
2620 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
2621 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
2622 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
2623 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip every element of src into [min, max], 8x unrolled. When min and
 * max straddle zero the bit-pattern fast path is used; otherwise plain
 * av_clipf. len is assumed to be a multiple of 8 (TODO confirm). */
2626 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
2628 if(min < 0 && max > 0) {
2629 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
2631 for(i=0; i < len; i+=8) {
2632 dst[i ] = av_clipf(src[i ], min, max);
2633 dst[i + 1] = av_clipf(src[i + 1], min, max);
2634 dst[i + 2] = av_clipf(src[i + 2], min, max);
2635 dst[i + 3] = av_clipf(src[i + 3], min, max);
2636 dst[i + 4] = av_clipf(src[i + 4], min, max);
2637 dst[i + 5] = av_clipf(src[i + 5], min, max);
2638 dst[i + 6] = av_clipf(src[i + 6], min, max);
2639 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* int16 dot product with each partial product right-shifted by `shift`
 * before accumulation. (Loop header and return elided in this listing.) */
2644 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
2649 res += (*v1++ * *v2++) >> shift;
/* Dot product of v1 and v2 while simultaneously updating v1 in place:
 * v1[i] += mul * v3[i]. (Accumulation line elided in this listing.) */
2654 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
2659 *v1++ += mul * *v3++;
/* Apply a symmetric int16 window: the first half of window[] is used for
 * both ends of input[], with Q15 rounding ((x + 2^14) >> 15). */
2664 static void apply_window_int16_c(int16_t *output, const int16_t *input,
2665 const int16_t *window, unsigned int len)
2668 int len2 = len >> 1;
2670 for (i = 0; i < len2; i++) {
2671 int16_t w = window[i];
2672 output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15; /* front sample */
2673 output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15; /* mirrored back sample */
/* Fixed-point IDCT cosine constants for the WMV2 IDCT below:
 * Wk = round(2048 * sqrt(2) * cos(k*pi/16)). (W0 is defined outside this
 * excerpt.) */
2678 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
2679 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
2680 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
2681 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
2682 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
2683 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
2684 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/* One row of the WMV2 8-point inverse DCT, in place on b[0..7].
 * Even/odd decomposition: a0/a4 from the even inputs, a1..a7 butterflies
 * from the odd inputs; results rounded back with +128 >> 8. */
2686 static void wmv2_idct_row(short * b)
2689 int a0,a1,a2,a3,a4,a5,a6,a7;
2691 a1 = W1*b[1]+W7*b[7];
2692 a7 = W7*b[1]-W1*b[7];
2693 a5 = W5*b[5]+W3*b[3];
2694 a3 = W3*b[5]-W5*b[3];
2695 a2 = W2*b[2]+W6*b[6];
2696 a6 = W6*b[2]-W2*b[6];
2697 a0 = W0*b[0]+W0*b[4];
2698 a4 = W0*b[0]-W0*b[4];
/* 181/256 ~ 1/sqrt(2) rotation of the odd terms */
2700 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2701 s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* final butterflies with rounding */
2703 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2704 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2705 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2706 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2707 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2708 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2709 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2710 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One column of the WMV2 inverse DCT, in place with stride 8. Same
 * structure as wmv2_idct_row but keeps 3 extra bits of precision in the
 * butterflies (>>3 with rounding) and normalizes by >>14 at the end. */
2712 static void wmv2_idct_col(short * b)
2715 int a0,a1,a2,a3,a4,a5,a6,a7;
2716 /*step 1, with extended precision*/
2717 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2718 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2719 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2720 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2721 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2722 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2723 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2724 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
/* 181/256 ~ 1/sqrt(2) rotation of the odd terms */
2726 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2727 s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* final butterflies, rounded and normalized by >>14 */
2729 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2730 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2731 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2732 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2734 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2735 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2736 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2737 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 8x8 WMV2 inverse DCT: all rows first, then all columns, in place.
 * (Loop headers elided in this listing.) */
2739 void ff_wmv2_idct_c(short * block){
2743 wmv2_idct_row(block+i);
2746 wmv2_idct_col(block+i);
2749 /* XXX: those functions should be suppressed ASAP when all IDCTs are
 converted to the new type. */
/* WMV2 IDCT then clamped store of the result into dest. */
2751 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2753 ff_wmv2_idct_c(block);
2754 ff_put_pixels_clamped_c(block, dest, line_size);
/* WMV2 IDCT then clamped add of the result onto dest. */
2756 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2758 ff_wmv2_idct_c(block);
2759 ff_add_pixels_clamped_c(block, dest, line_size);
/* JPEG-reference IDCT (call elided in listing) then clamped store. */
2761 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2764 ff_put_pixels_clamped_c(block, dest, line_size);
/* JPEG-reference IDCT (call elided in listing) then clamped add. */
2766 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2769 ff_add_pixels_clamped_c(block, dest, line_size);
/* 4x4 reduced-resolution (lowres=1) IDCT put wrapper. */
2772 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2775 put_pixels_clamped4_c(block, dest, line_size);
/* 4x4 reduced-resolution (lowres=1) IDCT add wrapper. */
2777 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2780 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 reduced-resolution (lowres=2) IDCT put wrapper. */
2783 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2786 put_pixels_clamped2_c(block, dest, line_size);
/* 2x2 reduced-resolution (lowres=2) IDCT add wrapper. */
2788 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2791 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 (DC-only, lowres=3) IDCT put: scale the DC coefficient and clamp
 * it into a single output pixel via the crop table. */
2794 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2796 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* centered clip-to-[0,255] LUT */
2798 dest[0] = cm[(block[0] + 4)>>3];
/* 1x1 (DC-only, lowres=3) IDCT add: add the scaled DC to the existing
 * pixel, clamped via the crop table. */
2800 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2802 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* centered clip-to-[0,255] LUT */
2804 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
/* No-op used as the default prefetch hook when no arch-specific
 * implementation is installed. */
2807 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2809 /* init static data */
/* One-time init of shared lookup tables: ff_cropTbl clamps
 * [-MAX_NEG_CROP, 255+MAX_NEG_CROP] to [0,255]; ff_squareTbl maps i to
 * (i-256)^2; inv_zigzag_direct16 is the inverse zigzag scan (+1 biased). */
2810 av_cold void dsputil_static_init(void)
2814 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i; /* identity in range */
2815 for(i=0;i<MAX_NEG_CROP;i++) {
2817 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255; /* saturate above (below-range fill elided) */
2820 for(i=0;i<512;i++) {
2821 ff_squareTbl[i] = (i - 256) * (i - 256);
2824 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Verify the compiler honors 16-byte stack alignment (required by the
 * SIMD code paths); logs a one-time warning on miscompiled builds.
 * (Return paths elided in this listing.) */
2827 int ff_check_alignment(void){
2828 static int did_fail=0; /* warn only once */
2829 DECLARE_ALIGNED(16, int, aligned);
2831 if((intptr_t)&aligned & 15){ /* address not 16-byte aligned */
2833 #if HAVE_MMX || HAVE_ALTIVEC
2834 av_log(NULL, AV_LOG_ERROR,
2835 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2836 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2837 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2838 "Do not report crashes to FFmpeg developers.\n");
2847 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2851 ff_check_alignment();
2854 if(avctx->dct_algo==FF_DCT_FASTINT) {
2855 c->fdct = fdct_ifast;
2856 c->fdct248 = fdct_ifast248;
2858 else if(avctx->dct_algo==FF_DCT_FAAN) {
2859 c->fdct = ff_faandct;
2860 c->fdct248 = ff_faandct248;
2863 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2864 c->fdct248 = ff_fdct248_islow;
2866 #endif //CONFIG_ENCODERS
2868 if(avctx->lowres==1){
2869 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
2870 c->idct_put= ff_jref_idct4_put;
2871 c->idct_add= ff_jref_idct4_add;
2873 c->idct_put= ff_h264_lowres_idct_put_c;
2874 c->idct_add= ff_h264_lowres_idct_add_c;
2876 c->idct = j_rev_dct4;
2877 c->idct_permutation_type= FF_NO_IDCT_PERM;
2878 }else if(avctx->lowres==2){
2879 c->idct_put= ff_jref_idct2_put;
2880 c->idct_add= ff_jref_idct2_add;
2881 c->idct = j_rev_dct2;
2882 c->idct_permutation_type= FF_NO_IDCT_PERM;
2883 }else if(avctx->lowres==3){
2884 c->idct_put= ff_jref_idct1_put;
2885 c->idct_add= ff_jref_idct1_add;
2886 c->idct = j_rev_dct1;
2887 c->idct_permutation_type= FF_NO_IDCT_PERM;
2889 if(avctx->idct_algo==FF_IDCT_INT){
2890 c->idct_put= ff_jref_idct_put;
2891 c->idct_add= ff_jref_idct_add;
2892 c->idct = j_rev_dct;
2893 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2894 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2895 avctx->idct_algo==FF_IDCT_VP3){
2896 c->idct_put= ff_vp3_idct_put_c;
2897 c->idct_add= ff_vp3_idct_add_c;
2898 c->idct = ff_vp3_idct_c;
2899 c->idct_permutation_type= FF_NO_IDCT_PERM;
2900 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2901 c->idct_put= ff_wmv2_idct_put_c;
2902 c->idct_add= ff_wmv2_idct_add_c;
2903 c->idct = ff_wmv2_idct_c;
2904 c->idct_permutation_type= FF_NO_IDCT_PERM;
2905 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2906 c->idct_put= ff_faanidct_put;
2907 c->idct_add= ff_faanidct_add;
2908 c->idct = ff_faanidct;
2909 c->idct_permutation_type= FF_NO_IDCT_PERM;
2910 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2911 c->idct_put= ff_ea_idct_put_c;
2912 c->idct_permutation_type= FF_NO_IDCT_PERM;
2913 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
2914 c->idct = ff_bink_idct_c;
2915 c->idct_add = ff_bink_idct_add_c;
2916 c->idct_put = ff_bink_idct_put_c;
2917 c->idct_permutation_type = FF_NO_IDCT_PERM;
2918 }else{ //accurate/default
2919 c->idct_put= ff_simple_idct_put;
2920 c->idct_add= ff_simple_idct_add;
2921 c->idct = ff_simple_idct;
2922 c->idct_permutation_type= FF_NO_IDCT_PERM;
2926 c->get_pixels = get_pixels_c;
2927 c->diff_pixels = diff_pixels_c;
2928 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2929 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2930 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
2931 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2932 c->add_pixels8 = add_pixels8_c;
2933 c->add_pixels4 = add_pixels4_c;
2934 c->sum_abs_dctelem = sum_abs_dctelem_c;
2935 c->emulated_edge_mc = ff_emulated_edge_mc;
2938 c->clear_block = clear_block_c;
2939 c->clear_blocks = clear_blocks_c;
2940 c->pix_sum = pix_sum_c;
2941 c->pix_norm1 = pix_norm1_c;
2943 c->fill_block_tab[0] = fill_block16_c;
2944 c->fill_block_tab[1] = fill_block8_c;
2945 c->scale_block = scale_block_c;
2947 /* TODO [0] 16 [1] 8 */
2948 c->pix_abs[0][0] = pix_abs16_c;
2949 c->pix_abs[0][1] = pix_abs16_x2_c;
2950 c->pix_abs[0][2] = pix_abs16_y2_c;
2951 c->pix_abs[0][3] = pix_abs16_xy2_c;
2952 c->pix_abs[1][0] = pix_abs8_c;
2953 c->pix_abs[1][1] = pix_abs8_x2_c;
2954 c->pix_abs[1][2] = pix_abs8_y2_c;
2955 c->pix_abs[1][3] = pix_abs8_xy2_c;
2957 #define dspfunc(PFX, IDX, NUM) \
2958 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
2959 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
2960 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
2961 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
2963 dspfunc(put, 0, 16);
2964 dspfunc(put_no_rnd, 0, 16);
2966 dspfunc(put_no_rnd, 1, 8);
2970 dspfunc(avg, 0, 16);
2971 dspfunc(avg_no_rnd, 0, 16);
2973 dspfunc(avg_no_rnd, 1, 8);
2978 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
2979 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
2981 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2982 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2983 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2984 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2985 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2986 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2987 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2988 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2989 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2991 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2992 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2993 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2994 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2995 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2996 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2997 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2998 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2999 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3001 #define dspfunc(PFX, IDX, NUM) \
3002 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3003 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3004 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3005 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3006 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3007 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3008 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3009 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3010 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3011 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3012 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3013 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3014 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3015 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3016 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3017 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3019 dspfunc(put_qpel, 0, 16);
3020 dspfunc(put_no_rnd_qpel, 0, 16);
3022 dspfunc(avg_qpel, 0, 16);
3023 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3025 dspfunc(put_qpel, 1, 8);
3026 dspfunc(put_no_rnd_qpel, 1, 8);
3028 dspfunc(avg_qpel, 1, 8);
3029 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3031 dspfunc(put_h264_qpel, 0, 16);
3032 dspfunc(put_h264_qpel, 1, 8);
3033 dspfunc(put_h264_qpel, 2, 4);
3034 dspfunc(put_h264_qpel, 3, 2);
3035 dspfunc(avg_h264_qpel, 0, 16);
3036 dspfunc(avg_h264_qpel, 1, 8);
3037 dspfunc(avg_h264_qpel, 2, 4);
3040 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3041 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3042 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3043 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3044 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3045 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3047 c->draw_edges = draw_edges_c;
3049 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
3050 ff_mlp_init(c, avctx);
3052 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
3053 ff_intrax8dsp_init(c,avctx);
3055 #if CONFIG_RV30_DECODER
3056 ff_rv30dsp_init(c,avctx);
3058 #if CONFIG_RV40_DECODER
3059 ff_rv40dsp_init(c,avctx);
3060 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
3061 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
3062 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
3063 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
3066 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
3067 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3068 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3069 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3070 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3071 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3072 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3073 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3075 #define SET_CMP_FUNC(name) \
3076 c->name[0]= name ## 16_c;\
3077 c->name[1]= name ## 8x8_c;
3079 SET_CMP_FUNC(hadamard8_diff)
3080 c->hadamard8_diff[4]= hadamard8_intra16_c;
3081 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3082 SET_CMP_FUNC(dct_sad)
3083 SET_CMP_FUNC(dct_max)
3085 SET_CMP_FUNC(dct264_sad)
3087 c->sad[0]= pix_abs16_c;
3088 c->sad[1]= pix_abs8_c;
3092 SET_CMP_FUNC(quant_psnr)
3095 c->vsad[0]= vsad16_c;
3096 c->vsad[4]= vsad_intra16_c;
3097 c->vsad[5]= vsad_intra8_c;
3098 c->vsse[0]= vsse16_c;
3099 c->vsse[4]= vsse_intra16_c;
3100 c->vsse[5]= vsse_intra8_c;
3101 c->nsse[0]= nsse16_c;
3102 c->nsse[1]= nsse8_c;
3104 ff_dsputil_init_dwt(c);
3107 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3109 c->add_bytes= add_bytes_c;
3110 c->add_bytes_l2= add_bytes_l2_c;
3111 c->diff_bytes= diff_bytes_c;
3112 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3113 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3114 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
3115 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3116 c->bswap_buf= bswap_buf;
3117 c->bswap16_buf = bswap16_buf;
3118 #if CONFIG_PNG_DECODER
3119 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
3122 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3123 c->h263_h_loop_filter= h263_h_loop_filter_c;
3124 c->h263_v_loop_filter= h263_v_loop_filter_c;
3127 if (CONFIG_VP3_DECODER) {
3128 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3129 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3130 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3133 c->h261_loop_filter= h261_loop_filter_c;
3135 c->try_8x8basis= try_8x8basis_c;
3136 c->add_8x8basis= add_8x8basis_c;
3138 #if CONFIG_VORBIS_DECODER
3139 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3141 #if CONFIG_AC3_DECODER
3142 c->ac3_downmix = ff_ac3_downmix_c;
3144 c->vector_fmul = vector_fmul_c;
3145 c->vector_fmul_reverse = vector_fmul_reverse_c;
3146 c->vector_fmul_add = vector_fmul_add_c;
3147 c->vector_fmul_window = vector_fmul_window_c;
3148 c->vector_clipf = vector_clipf_c;
3149 c->scalarproduct_int16 = scalarproduct_int16_c;
3150 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3151 c->apply_window_int16 = apply_window_int16_c;
3152 c->scalarproduct_float = scalarproduct_float_c;
3153 c->butterflies_float = butterflies_float_c;
3154 c->vector_fmul_scalar = vector_fmul_scalar_c;
3156 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
3157 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
3159 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
3160 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
3162 c->shrink[0]= av_image_copy_plane;
3163 c->shrink[1]= ff_shrink22;
3164 c->shrink[2]= ff_shrink44;
3165 c->shrink[3]= ff_shrink88;
3167 c->prefetch= just_return;
3169 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3170 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3172 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
3173 if (ARCH_ARM) dsputil_init_arm (c, avctx);
3174 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
3175 if (HAVE_VIS) dsputil_init_vis (c, avctx);
3176 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
3177 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
3178 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
3179 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
3180 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
3182 for(i=0; i<64; i++){
3183 if(!c->put_2tap_qpel_pixels_tab[0][i])
3184 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3185 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3186 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3189 c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3190 c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3191 c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3192 c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3194 c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3195 c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3196 c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3197 c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3199 switch(c->idct_permutation_type){
3200 case FF_NO_IDCT_PERM:
3202 c->idct_permutation[i]= i;
3204 case FF_LIBMPEG2_IDCT_PERM:
3206 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3208 case FF_SIMPLE_IDCT_PERM:
3210 c->idct_permutation[i]= simple_mmx_permutation[i];
3212 case FF_TRANSPOSE_IDCT_PERM:
3214 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3216 case FF_PARTTRANS_IDCT_PERM:
3218 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3220 case FF_SSE2_IDCT_PERM:
3222 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3225 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");