3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
42 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
43 uint32_t ff_squareTbl[512] = {0, };
46 #include "dsputil_template.c"
50 #include "dsputil_template.c"
54 #include "dsputil_template.c"
56 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
57 #define pb_7f (~0UL/255 * 0x7f)
58 #define pb_80 (~0UL/255 * 0x80)
/* Classic JPEG/MPEG zigzag scan order: entry i is the raster (row-major)
 * index of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
71 /* Specific zigzag scan for 248 idct. NOTE that unlike the
72 specification, we interleave the fields */
/* Zigzag scan for the 2x4x8 (248) IDCT, with the two fields interleaved. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
84 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
85 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate horizontal scan pattern (raster index per scan position). */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan pattern (raster index per scan position). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
109 /* Input permutation for the simple_idct_mmx */
/* Input coefficient permutation used by the simple_idct_mmx implementation. */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
121 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
123 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
127 st->scantable= src_scantable;
131 j = src_scantable[i];
132 st->permutated[i] = permutation[j];
141 j = st->permutated[i];
143 st->raster_end[i]= end;
147 static int pix_sum_c(uint8_t * pix, int line_size)
152 for (i = 0; i < 16; i++) {
153 for (j = 0; j < 16; j += 8) {
164 pix += line_size - 16;
169 static int pix_norm1_c(uint8_t * pix, int line_size)
172 uint32_t *sq = ff_squareTbl + 256;
175 for (i = 0; i < 16; i++) {
176 for (j = 0; j < 16; j += 8) {
188 register uint64_t x=*(uint64_t*)pix;
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 s += sq[(x>>32)&0xff];
194 s += sq[(x>>40)&0xff];
195 s += sq[(x>>48)&0xff];
196 s += sq[(x>>56)&0xff];
198 register uint32_t x=*(uint32_t*)pix;
200 s += sq[(x>>8)&0xff];
201 s += sq[(x>>16)&0xff];
202 s += sq[(x>>24)&0xff];
203 x=*(uint32_t*)(pix+4);
205 s += sq[(x>>8)&0xff];
206 s += sq[(x>>16)&0xff];
207 s += sq[(x>>24)&0xff];
212 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst. The original unrolled the
 * loop by 8 with a scalar tail; a single loop produces the same result. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for (i = 0; i < w; i++)
        dst[i] = av_bswap32(src[i]);
}
/* Byte-swap len 16-bit words from src into dst. */
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    int i;

    for (i = 0; i < len; i++)
        dst[i] = av_bswap16(src[i]);
}
241 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
244 uint32_t *sq = ff_squareTbl + 256;
247 for (i = 0; i < h; i++) {
248 s += sq[pix1[0] - pix2[0]];
249 s += sq[pix1[1] - pix2[1]];
250 s += sq[pix1[2] - pix2[2]];
251 s += sq[pix1[3] - pix2[3]];
258 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
261 uint32_t *sq = ff_squareTbl + 256;
264 for (i = 0; i < h; i++) {
265 s += sq[pix1[0] - pix2[0]];
266 s += sq[pix1[1] - pix2[1]];
267 s += sq[pix1[2] - pix2[2]];
268 s += sq[pix1[3] - pix2[3]];
269 s += sq[pix1[4] - pix2[4]];
270 s += sq[pix1[5] - pix2[5]];
271 s += sq[pix1[6] - pix2[6]];
272 s += sq[pix1[7] - pix2[7]];
279 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
282 uint32_t *sq = ff_squareTbl + 256;
285 for (i = 0; i < h; i++) {
286 s += sq[pix1[ 0] - pix2[ 0]];
287 s += sq[pix1[ 1] - pix2[ 1]];
288 s += sq[pix1[ 2] - pix2[ 2]];
289 s += sq[pix1[ 3] - pix2[ 3]];
290 s += sq[pix1[ 4] - pix2[ 4]];
291 s += sq[pix1[ 5] - pix2[ 5]];
292 s += sq[pix1[ 6] - pix2[ 6]];
293 s += sq[pix1[ 7] - pix2[ 7]];
294 s += sq[pix1[ 8] - pix2[ 8]];
295 s += sq[pix1[ 9] - pix2[ 9]];
296 s += sq[pix1[10] - pix2[10]];
297 s += sq[pix1[11] - pix2[11]];
298 s += sq[pix1[12] - pix2[12]];
299 s += sq[pix1[13] - pix2[13]];
300 s += sq[pix1[14] - pix2[14]];
301 s += sq[pix1[15] - pix2[15]];
309 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
313 /* read the pixels */
315 block[0] = pixels[0];
316 block[1] = pixels[1];
317 block[2] = pixels[2];
318 block[3] = pixels[3];
319 block[4] = pixels[4];
320 block[5] = pixels[5];
321 block[6] = pixels[6];
322 block[7] = pixels[7];
328 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
329 const uint8_t *s2, int stride){
332 /* read the pixels */
334 block[0] = s1[0] - s2[0];
335 block[1] = s1[1] - s2[1];
336 block[2] = s1[2] - s2[2];
337 block[3] = s1[3] - s2[3];
338 block[4] = s1[4] - s2[4];
339 block[5] = s1[5] - s2[5];
340 block[6] = s1[6] - s2[6];
341 block[7] = s1[7] - s2[7];
349 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
353 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
355 /* read the pixels */
357 pixels[0] = cm[block[0]];
358 pixels[1] = cm[block[1]];
359 pixels[2] = cm[block[2]];
360 pixels[3] = cm[block[3]];
361 pixels[4] = cm[block[4]];
362 pixels[5] = cm[block[5]];
363 pixels[6] = cm[block[6]];
364 pixels[7] = cm[block[7]];
371 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
375 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
377 /* read the pixels */
379 pixels[0] = cm[block[0]];
380 pixels[1] = cm[block[1]];
381 pixels[2] = cm[block[2]];
382 pixels[3] = cm[block[3]];
389 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
393 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
395 /* read the pixels */
397 pixels[0] = cm[block[0]];
398 pixels[1] = cm[block[1]];
405 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
406 uint8_t *restrict pixels,
411 for (i = 0; i < 8; i++) {
412 for (j = 0; j < 8; j++) {
415 else if (*block > 127)
418 *pixels = (uint8_t)(*block + 128);
422 pixels += (line_size - 8);
426 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
431 /* read the pixels */
433 pixels[0] = block[0];
434 pixels[1] = block[1];
435 pixels[2] = block[2];
436 pixels[3] = block[3];
437 pixels[4] = block[4];
438 pixels[5] = block[5];
439 pixels[6] = block[6];
440 pixels[7] = block[7];
447 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
451 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
453 /* read the pixels */
455 pixels[0] = cm[pixels[0] + block[0]];
456 pixels[1] = cm[pixels[1] + block[1]];
457 pixels[2] = cm[pixels[2] + block[2]];
458 pixels[3] = cm[pixels[3] + block[3]];
459 pixels[4] = cm[pixels[4] + block[4]];
460 pixels[5] = cm[pixels[5] + block[5]];
461 pixels[6] = cm[pixels[6] + block[6]];
462 pixels[7] = cm[pixels[7] + block[7]];
468 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
472 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
474 /* read the pixels */
476 pixels[0] = cm[pixels[0] + block[0]];
477 pixels[1] = cm[pixels[1] + block[1]];
478 pixels[2] = cm[pixels[2] + block[2]];
479 pixels[3] = cm[pixels[3] + block[3]];
485 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
489 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
491 /* read the pixels */
493 pixels[0] = cm[pixels[0] + block[0]];
494 pixels[1] = cm[pixels[1] + block[1]];
500 static int sum_abs_dctelem_c(DCTELEM *block)
504 sum+= FFABS(block[i]);
/* Fill h rows of 16 bytes each with a constant value (row stride line_size). */
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 16);
        block += line_size;
    }
}
/* Fill h rows of 8 bytes each with a constant value (row stride line_size). */
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    while (h-- > 0) {
        memset(block, value, 8);
        block += line_size;
    }
}
/* Upscale an 8x8 block to 16x16: each source pixel is duplicated 2x2 by
 * writing value*0x0101 as one 16-bit store per horizontal pair, into two
 * consecutive destination rows. */
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;

    for (j = 0; j < 8; j++) {
        uint16_t *row0 = (uint16_t *)(dst +  2 * j      * linesize);
        uint16_t *row1 = (uint16_t *)(dst + (2 * j + 1) * linesize);
        for (i = 0; i < 8; i++)
            row0[i] = row1[i] = src[8 * j + i] * 0x0101;
    }
}
544 #define avg2(a,b) ((a+b+1)>>1)
545 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* One-point GMC: 2x2 bilinear interpolation with 1/16-pel weights over an
 * 8-pixel-wide block of height h; weights sum to 256, result >> 8. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int i, x;

    for (i = 0; i < h; i++) {
        for (x = 0; x < 8; x++)
            dst[x] = (A * src[x]          + B * src[x + 1] +
                      C * src[stride + x] + D * src[stride + x + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
/* Global motion compensation for one 8-pixel-wide block of height h.
 * (ox,oy) is the 16.16 fixed-point source position of the first pixel;
 * (dxx,dyx) step per output column and (dxy,dyy) per output row. Samples
 * with both coordinates inside the image are bilinearly interpolated with
 * 1/s-pel fractions; when one coordinate is outside, that axis is clamped
 * and only the in-range axis is interpolated; fully outside falls back to
 * the clamped nearest sample. r is the rounding constant for >>(2*shift).
 * NOTE(review): interior lines were missing from the extraction; this body
 * follows the canonical upstream structure — confirm against the repo. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, x;
    const int s = 1 << shift;

    width--;    /* last valid coordinate on each axis */
    height--;

    for (y = 0; y < h; y++) {
        int vx = ox;
        int vy = oy;
        for (x = 0; x < 8; x++) { //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x  = vx >> 16;
            src_y  = vy >> 16;
            frac_x = src_x & (s - 1);
            frac_y = src_y & (s - 1);
            src_x >>= shift;
            src_y >>= shift;

            if ((unsigned)src_x < width) {
                if ((unsigned)src_y < height) {
                    /* fully inside: bilinear interpolation */
                    index = src_x + src_y * stride;
                    dst[y*stride + x] = ( (  src[index         ] * (s - frac_x)
                                           + src[index       +1] *      frac_x ) * (s - frac_y)
                                        + (  src[index+stride  ] * (s - frac_x)
                                           + src[index+stride+1] *      frac_x ) *      frac_y
                                        + r) >> (shift * 2);
                } else {
                    /* y outside: clamp y, interpolate only in x */
                    index = src_x + av_clip(src_y, 0, height) * stride;
                    dst[y*stride + x] = ( (  src[index         ] * (s - frac_x)
                                           + src[index       +1] *      frac_x ) * s
                                        + r) >> (shift * 2);
                }
            } else {
                if ((unsigned)src_y < height) {
                    /* x outside: clamp x, interpolate only in y */
                    index = av_clip(src_x, 0, width) + src_y * stride;
                    dst[y*stride + x] = ( (  src[index         ] * (s - frac_y)
                                           + src[index+stride  ] *      frac_y ) * s
                                        + r) >> (shift * 2);
                } else {
                    /* both outside: clamped nearest sample */
                    index = av_clip(src_x, 0, width) + av_clip(src_y, 0, height) * stride;
                    dst[y*stride + x] = src[index];
                }
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* Full-pel thirdpel copy: dispatch on block width to the plain copy helpers. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) put_pixels2_8_c (dst, src, stride, height);
    else if (width ==  4) put_pixels4_8_c (dst, src, stride, height);
    else if (width ==  8) put_pixels8_8_c (dst, src, stride, height);
    else if (width == 16) put_pixels16_8_c(dst, src, stride, height);
}
/* Thirdpel MC, horizontal 1/3 phase: out = round((2a + b)/3) via 683/2048. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (683 * (2 * s[j] + s[j + 1] + 1)) >> 11;
    }
}
/* Thirdpel MC, horizontal 2/3 phase: out = round((a + 2b)/3) via 683/2048. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (683 * (s[j] + 2 * s[j + 1] + 1)) >> 11;
    }
}
/* Thirdpel MC, vertical 1/3 phase: out = round((2a + b)/3), b one row below. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (683 * (2 * s[j] + s[j + stride] + 1)) >> 11;
    }
}
/* Thirdpel MC, (1/3, 1/3) phase: bilinear-ish 4/3/3/2 weights via 2731/32768. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (2731 * (4 * s[j] + 3 * s[j + 1] +
                            3 * s[j + stride] + 2 * s[j + stride + 1] + 6)) >> 15;
    }
}
/* Thirdpel MC, (1/3, 2/3) phase: 3/2/4/3 weights via 2731/32768. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (2731 * (3 * s[j] + 2 * s[j + 1] +
                            4 * s[j + stride] + 3 * s[j + stride + 1] + 6)) >> 15;
    }
}
/* Thirdpel MC, vertical 2/3 phase: out = round((a + 2b)/3), b one row below. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (683 * (s[j] + 2 * s[j + stride] + 1)) >> 11;
    }
}
/* Thirdpel MC, (2/3, 1/3) phase: 3/4/2/3 weights via 2731/32768. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (2731 * (3 * s[j] + 4 * s[j + 1] +
                            2 * s[j + stride] + 3 * s[j + stride + 1] + 6)) >> 15;
    }
}
/* Thirdpel MC, (2/3, 2/3) phase: 2/3/3/4 weights via 2731/32768. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (2731 * (2 * s[j] + 3 * s[j + 1] +
                            3 * s[j + stride] + 4 * s[j + stride + 1] + 6)) >> 15;
    }
}
/* Full-pel thirdpel average: dispatch on block width to the averaging helpers. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if      (width ==  2) avg_pixels2_8_c (dst, src, stride, height);
    else if (width ==  4) avg_pixels4_8_c (dst, src, stride, height);
    else if (width ==  8) avg_pixels8_8_c (dst, src, stride, height);
    else if (width == 16) avg_pixels16_8_c(dst, src, stride, height);
}
/* Averaging variant of mc10: rounded average of dst with the interpolated value. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (d[j] + ((683 * (2 * s[j] + s[j + 1] + 1)) >> 11) + 1) >> 1;
    }
}
/* Averaging variant of mc20: rounded average of dst with the interpolated value. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (d[j] + ((683 * (s[j] + 2 * s[j + 1] + 1)) >> 11) + 1) >> 1;
    }
}
/* Averaging variant of mc01: rounded average of dst with the interpolated value. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (d[j] + ((683 * (2 * s[j] + s[j + stride] + 1)) >> 11) + 1) >> 1;
    }
}
/* Averaging variant of mc11: rounded average of dst with the interpolated value. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (d[j] + ((2731 * (4 * s[j] + 3 * s[j + 1] +
                                     3 * s[j + stride] + 2 * s[j + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
/* Averaging variant of mc12: rounded average of dst with the interpolated value. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (d[j] + ((2731 * (3 * s[j] + 2 * s[j + 1] +
                                     4 * s[j + stride] + 3 * s[j + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
/* Averaging variant of mc02: rounded average of dst with the interpolated value. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (d[j] + ((683 * (s[j] + 2 * s[j + stride] + 1)) >> 11) + 1) >> 1;
    }
}
/* Averaging variant of mc21: rounded average of dst with the interpolated value. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (d[j] + ((2731 * (3 * s[j] + 4 * s[j + 1] +
                                     2 * s[j + stride] + 3 * s[j + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
/* Averaging variant of mc22: rounded average of dst with the interpolated value. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        const uint8_t *s = src + i * stride;
        uint8_t       *d = dst + i * stride;
        for (j = 0; j < width; j++)
            d[j] = (d[j] + ((2731 * (2 * s[j] + 3 * s[j + 1] +
                                     3 * s[j + stride] + 4 * s[j + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
822 #define QPEL_MC(r, OPNAME, RND, OP) \
823 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
824 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
828 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
829 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
830 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
831 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
832 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
833 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
834 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
835 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
841 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
843 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
847 const int src0= src[0*srcStride];\
848 const int src1= src[1*srcStride];\
849 const int src2= src[2*srcStride];\
850 const int src3= src[3*srcStride];\
851 const int src4= src[4*srcStride];\
852 const int src5= src[5*srcStride];\
853 const int src6= src[6*srcStride];\
854 const int src7= src[7*srcStride];\
855 const int src8= src[8*srcStride];\
856 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
857 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
858 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
859 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
860 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
861 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
862 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
863 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
869 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
870 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
875 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
876 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
877 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
878 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
879 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
880 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
881 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
882 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
883 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
884 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
885 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
886 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
887 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
888 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
889 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
890 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
896 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
897 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
902 const int src0= src[0*srcStride];\
903 const int src1= src[1*srcStride];\
904 const int src2= src[2*srcStride];\
905 const int src3= src[3*srcStride];\
906 const int src4= src[4*srcStride];\
907 const int src5= src[5*srcStride];\
908 const int src6= src[6*srcStride];\
909 const int src7= src[7*srcStride];\
910 const int src8= src[8*srcStride];\
911 const int src9= src[9*srcStride];\
912 const int src10= src[10*srcStride];\
913 const int src11= src[11*srcStride];\
914 const int src12= src[12*srcStride];\
915 const int src13= src[13*srcStride];\
916 const int src14= src[14*srcStride];\
917 const int src15= src[15*srcStride];\
918 const int src16= src[16*srcStride];\
919 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
920 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
921 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
922 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
923 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
924 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
925 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
926 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
927 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
928 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
929 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
930 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
931 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
932 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
933 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
934 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
940 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
942 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
943 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
946 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
947 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
950 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
952 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
953 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
956 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
959 copy_block9(full, src, 16, stride, 9);\
960 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
961 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
964 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
966 copy_block9(full, src, 16, stride, 9);\
967 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
970 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
973 copy_block9(full, src, 16, stride, 9);\
974 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
975 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
977 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
982 copy_block9(full, src, 16, stride, 9);\
983 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
984 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
985 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
986 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
988 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
992 copy_block9(full, src, 16, stride, 9);\
993 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
994 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
995 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
996 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
998 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1002 uint8_t halfHV[64];\
1003 copy_block9(full, src, 16, stride, 9);\
1004 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1005 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1006 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1007 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1009 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1010 uint8_t full[16*9];\
1012 uint8_t halfHV[64];\
1013 copy_block9(full, src, 16, stride, 9);\
1014 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1015 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1016 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1017 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1019 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1020 uint8_t full[16*9];\
1023 uint8_t halfHV[64];\
1024 copy_block9(full, src, 16, stride, 9);\
1025 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1026 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1028 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1030 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1031 uint8_t full[16*9];\
1033 uint8_t halfHV[64];\
1034 copy_block9(full, src, 16, stride, 9);\
1035 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1036 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1037 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1038 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1040 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1041 uint8_t full[16*9];\
1044 uint8_t halfHV[64];\
1045 copy_block9(full, src, 16, stride, 9);\
1046 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1047 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1048 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1049 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1051 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1052 uint8_t full[16*9];\
1054 uint8_t halfHV[64];\
1055 copy_block9(full, src, 16, stride, 9);\
1056 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1057 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1058 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1059 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1061 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1063 uint8_t halfHV[64];\
1064 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1065 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1066 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1068 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1070 uint8_t halfHV[64];\
1071 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1072 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1073 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1075 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1076 uint8_t full[16*9];\
1079 uint8_t halfHV[64];\
1080 copy_block9(full, src, 16, stride, 9);\
1081 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1082 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1083 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1084 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1086 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1087 uint8_t full[16*9];\
1089 copy_block9(full, src, 16, stride, 9);\
1090 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1091 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1092 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1094 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1095 uint8_t full[16*9];\
1098 uint8_t halfHV[64];\
1099 copy_block9(full, src, 16, stride, 9);\
1100 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1101 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1102 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1103 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1105 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1106 uint8_t full[16*9];\
1108 copy_block9(full, src, 16, stride, 9);\
1109 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1110 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1111 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1113 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1115 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1116 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1119 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1121 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1122 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1125 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1126 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1129 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1131 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1132 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1135 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1136 uint8_t full[24*17];\
1138 copy_block17(full, src, 24, stride, 17);\
1139 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1140 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1143 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1144 uint8_t full[24*17];\
1145 copy_block17(full, src, 24, stride, 17);\
1146 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1149 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1150 uint8_t full[24*17];\
1152 copy_block17(full, src, 24, stride, 17);\
1153 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1154 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1156 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1157 uint8_t full[24*17];\
1158 uint8_t halfH[272];\
1159 uint8_t halfV[256];\
1160 uint8_t halfHV[256];\
1161 copy_block17(full, src, 24, stride, 17);\
1162 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1163 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1164 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1165 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1167 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1168 uint8_t full[24*17];\
1169 uint8_t halfH[272];\
1170 uint8_t halfHV[256];\
1171 copy_block17(full, src, 24, stride, 17);\
1172 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1173 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1174 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1175 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1177 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1178 uint8_t full[24*17];\
1179 uint8_t halfH[272];\
1180 uint8_t halfV[256];\
1181 uint8_t halfHV[256];\
1182 copy_block17(full, src, 24, stride, 17);\
1183 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1184 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1185 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1186 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1188 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1189 uint8_t full[24*17];\
1190 uint8_t halfH[272];\
1191 uint8_t halfHV[256];\
1192 copy_block17(full, src, 24, stride, 17);\
1193 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1194 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1195 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1196 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1198 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1199 uint8_t full[24*17];\
1200 uint8_t halfH[272];\
1201 uint8_t halfV[256];\
1202 uint8_t halfHV[256];\
1203 copy_block17(full, src, 24, stride, 17);\
1204 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1205 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1207 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1209 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1210 uint8_t full[24*17];\
1211 uint8_t halfH[272];\
1212 uint8_t halfHV[256];\
1213 copy_block17(full, src, 24, stride, 17);\
1214 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1215 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1216 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1217 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1219 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1220 uint8_t full[24*17];\
1221 uint8_t halfH[272];\
1222 uint8_t halfV[256];\
1223 uint8_t halfHV[256];\
1224 copy_block17(full, src, 24, stride, 17);\
1225 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1226 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1227 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1228 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1230 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1231 uint8_t full[24*17];\
1232 uint8_t halfH[272];\
1233 uint8_t halfHV[256];\
1234 copy_block17(full, src, 24, stride, 17);\
1235 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1236 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1237 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1238 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1240 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1241 uint8_t halfH[272];\
1242 uint8_t halfHV[256];\
1243 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1244 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1245 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1247 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1248 uint8_t halfH[272];\
1249 uint8_t halfHV[256];\
1250 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1251 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1252 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1254 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1255 uint8_t full[24*17];\
1256 uint8_t halfH[272];\
1257 uint8_t halfV[256];\
1258 uint8_t halfHV[256];\
1259 copy_block17(full, src, 24, stride, 17);\
1260 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1261 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1262 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1263 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1265 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1266 uint8_t full[24*17];\
1267 uint8_t halfH[272];\
1268 copy_block17(full, src, 24, stride, 17);\
1269 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1270 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1271 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1273 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1274 uint8_t full[24*17];\
1275 uint8_t halfH[272];\
1276 uint8_t halfV[256];\
1277 uint8_t halfHV[256];\
1278 copy_block17(full, src, 24, stride, 17);\
1279 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1280 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1281 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1282 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1284 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1285 uint8_t full[24*17];\
1286 uint8_t halfH[272];\
1287 copy_block17(full, src, 24, stride, 17);\
1288 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1289 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1290 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1292 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1293 uint8_t halfH[272];\
1294 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1295 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1298 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1299 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1300 #define op_put(a, b) a = cm[((b) + 16)>>5]
1301 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1303 QPEL_MC(0, put_ , _ , op_put)
1304 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1305 QPEL_MC(0, avg_ , _ , op_avg)
1306 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1308 #undef op_avg_no_rnd
1310 #undef op_put_no_rnd
/* The mc00 (zero motion) qpel cases are plain pixel copies/averages, so they
 * alias the generic copy/average helpers instead of going through QPEL_MC.
 * Fix: the no_rnd 16x16 put aliased "ff_put_pixels16x16_8_c", inconsistent
 * with the rounded 16x16 put above (ff_put_pixels16x16_c) — a full-pel copy
 * is identical with and without rounding, so both must use the same helper. */
1312 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1313 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1314 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1315 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1316 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
1317 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/* Horizontal half-pel lowpass for WMV2 mspel: out = clip((9*(a+b) - (c+d) + 8) >> 4)
 * over an 8-pixel row. cm is the clipping LUT (ff_cropTbl biased by MAX_NEG_CROP).
 * NOTE(review): this listing elides the per-row loop and the dst/src advance lines. */
1319 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1320 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1324 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1325 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1326 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1327 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1328 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1329 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1330 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1331 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1337 #if CONFIG_RV40_DECODER
/* RV40 (3,3) quarter-pel positions are defined as the plain xy2 (center half-pel)
 * average, so these just forward to the generic xy2 helpers. */
1338 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1339 put_pixels16_xy2_8_c(dst, src, stride, 16);
1341 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1342 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1344 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1345 put_pixels8_xy2_8_c(dst, src, stride, 8);
1347 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1348 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1350 #endif /* CONFIG_RV40_DECODER */
/* Vertical half-pel lowpass for WMV2 mspel: same (9,9,-1,-1)/16 kernel as the
 * horizontal variant, applied down a column of 8 output pixels.
 * NOTE(review): the per-column loop and pointer advance are elided in this listing. */
1352 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1353 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1357 const int src_1= src[ -srcStride];
1358 const int src0 = src[0 ];
1359 const int src1 = src[ srcStride];
1360 const int src2 = src[2*srcStride];
1361 const int src3 = src[3*srcStride];
1362 const int src4 = src[4*srcStride];
1363 const int src5 = src[5*srcStride];
1364 const int src6 = src[6*srcStride];
1365 const int src7 = src[7*srcStride];
1366 const int src8 = src[8*srcStride];
1367 const int src9 = src[9*srcStride];
1368 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1369 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1370 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1371 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1372 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1373 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1374 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1375 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation positions, built from the h/v lowpass filters.
 * Naming: mcXY = X horizontal / Y vertical sub-pel position.
 * NOTE(review): the local buffer declarations (half/halfH/halfV/halfHV) are
 * elided from this listing; halfH holds 11 filtered rows (src-stride .. +9). */
1381 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1383 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1384 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
1387 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1388 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1391 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1393 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1394 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
1397 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1398 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1401 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1405 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1406 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1407 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1408 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1410 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1414 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1415 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1416 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1417 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
1419 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1421 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1422 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 Annex J deblocking across a horizontal edge: filters the four pixels
 * p0..p3 straddling the edge (rows -2..+1 relative to it), for each column x.
 * d1 implements the Annex J ramp clamp of d against qscale-derived strength;
 * d2 is a secondary correction on the outer pixels.
 * NOTE(review): the column loop, d1/ad1/d2 declarations and the p1/p2 update
 * lines that precede the 256-overflow fixups are elided from this listing. */
1425 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1426 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1428 const int strength= ff_h263_loop_filter_strength[qscale];
1432 int p0= src[x-2*stride];
1433 int p1= src[x-1*stride];
1434 int p2= src[x+0*stride];
1435 int p3= src[x+1*stride];
1436 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1438 if (d<-2*strength) d1= 0;
1439 else if(d<- strength) d1=-2*strength - d;
1440 else if(d< strength) d1= d;
1441 else if(d< 2*strength) d1= 2*strength - d;
/* Branchless saturation: if the adjusted value left 0..255, clamp via sign bit. */
1446 if(p1&256) p1= ~(p1>>31);
1447 if(p2&256) p2= ~(p2>>31);
1449 src[x-1*stride] = p1;
1450 src[x+0*stride] = p2;
1454 d2= av_clip((p0-p3)/4, -ad1, ad1);
1456 src[x-2*stride] = p0 - d2;
1457 src[x+ stride] = p3 + d2;
/* H.263 Annex J deblocking across a vertical edge: mirror of
 * h263_v_loop_filter_c with the roles of rows and columns swapped
 * (p0..p3 are the pixels at columns -2..+1 of each row y).
 * NOTE(review): loop header and d1/ad1/d2 declarations elided in this listing. */
1462 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1463 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1465 const int strength= ff_h263_loop_filter_strength[qscale];
1469 int p0= src[y*stride-2];
1470 int p1= src[y*stride-1];
1471 int p2= src[y*stride+0];
1472 int p3= src[y*stride+1];
1473 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1475 if (d<-2*strength) d1= 0;
1476 else if(d<- strength) d1=-2*strength - d;
1477 else if(d< strength) d1= d;
1478 else if(d< 2*strength) d1= 2*strength - d;
/* Branchless saturation to 0..255 via the sign bit, as in the vertical filter. */
1483 if(p1&256) p1= ~(p1>>31);
1484 if(p2&256) p2= ~(p2>>31);
1486 src[y*stride-1] = p1;
1487 src[y*stride+0] = p2;
1491 d2= av_clip((p0-p3)/4, -ad1, ad1);
1493 src[y*stride-2] = p0 - d2;
1494 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block.
 * temp holds the vertically filtered block scaled by 4 (edge rows copied as
 * 4*src so the final >>2 / >>4 normalization is uniform); the second pass
 * filters horizontally and writes back with rounding.
 * NOTE(review): loop headers, yz computation and temp declaration are elided. */
1499 static void h261_loop_filter_c(uint8_t *src, int stride){
1504 temp[x ] = 4*src[x ];
1505 temp[x + 7*8] = 4*src[x + 7*stride];
1509 xy = y * stride + x;
1511 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
1516 src[ y*stride] = (temp[ y*8] + 2)>>2;
1517 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1519 xy = y * stride + x;
1521 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* SAD of a 16-wide block: sum of absolute differences pix1 vs pix2, unrolled
 * per row. NOTE(review): the h-row loop, pointer advances and return are elided. */
1526 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1532 s += abs(pix1[0] - pix2[0]);
1533 s += abs(pix1[1] - pix2[1]);
1534 s += abs(pix1[2] - pix2[2]);
1535 s += abs(pix1[3] - pix2[3]);
1536 s += abs(pix1[4] - pix2[4]);
1537 s += abs(pix1[5] - pix2[5]);
1538 s += abs(pix1[6] - pix2[6]);
1539 s += abs(pix1[7] - pix2[7]);
1540 s += abs(pix1[8] - pix2[8]);
1541 s += abs(pix1[9] - pix2[9]);
1542 s += abs(pix1[10] - pix2[10]);
1543 s += abs(pix1[11] - pix2[11]);
1544 s += abs(pix1[12] - pix2[12]);
1545 s += abs(pix1[13] - pix2[13]);
1546 s += abs(pix1[14] - pix2[14]);
1547 s += abs(pix1[15] - pix2[15]);
/* SAD vs the horizontal half-pel reference: pix2 averaged with its right
 * neighbour (avg2) before differencing. Reads pix2[16] by design.
 * NOTE(review): row loop / pointer advance / return are elided in this listing. */
1554 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1560 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1561 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1562 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1563 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1564 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1565 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1566 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1567 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1568 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1569 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1570 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1571 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1572 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1573 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1574 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1575 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD vs the vertical half-pel reference: pix2 averaged with the row below
 * (pix3 = pix2 + line_size). NOTE(review): row loop and return are elided. */
1582 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1585 uint8_t *pix3 = pix2 + line_size;
1589 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1590 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1591 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1592 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1593 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1594 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1595 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1596 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1597 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1598 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1599 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1600 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1601 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1602 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1603 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1604 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD vs the diagonal half-pel reference: 4-tap average (avg4) of the 2x2
 * neighbourhood spanning pix2/pix3 and the right neighbours (reads column 16).
 * NOTE(review): row loop and return are elided in this listing. */
1612 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1615 uint8_t *pix3 = pix2 + line_size;
1619 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1620 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1621 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1622 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1623 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1624 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1625 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1626 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1627 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1628 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1629 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1630 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1631 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1632 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1633 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1634 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD variants: same four sub-pel positions as the pix_abs16_* family
 * (full-pel, x half-pel, y half-pel, xy half-pel), unrolled over one 8-pixel
 * row each. NOTE(review): row loops, pointer advances and returns are elided. */
1642 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1648 s += abs(pix1[0] - pix2[0]);
1649 s += abs(pix1[1] - pix2[1]);
1650 s += abs(pix1[2] - pix2[2]);
1651 s += abs(pix1[3] - pix2[3]);
1652 s += abs(pix1[4] - pix2[4]);
1653 s += abs(pix1[5] - pix2[5]);
1654 s += abs(pix1[6] - pix2[6]);
1655 s += abs(pix1[7] - pix2[7]);
1662 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1668 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1669 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1670 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1671 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1672 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1673 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1674 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1675 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1682 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1685 uint8_t *pix3 = pix2 + line_size;
1689 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1690 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1691 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1692 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1693 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1694 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1695 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1696 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1704 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1707 uint8_t *pix3 = pix2 + line_size;
1711 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1712 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1713 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1714 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1715 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1716 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1717 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1718 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE (16-wide): score1 is plain SSE, score2 accumulates the
 * difference in local 2x2 gradient energy between s1 and s2; the gradient term
 * is weighted by avctx->nsse_weight (8 if no context is given).
 * NOTE(review): outer row loop and pointer advances are elided in this listing. */
1726 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1727 MpegEncContext *c = v;
1733 for(x=0; x<16; x++){
1734 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1737 for(x=0; x<15; x++){
1738 score2+= FFABS( s1[x ] - s1[x +stride]
1739 - s1[x+1] + s1[x+1+stride])
1740 -FFABS( s2[x ] - s2[x +stride]
1741 - s2[x+1] + s2[x+1+stride]);
1748 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1749 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c; identical formula over an 8-pixel row
 * (loop headers elided in this listing). */
1752 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1753 MpegEncContext *c = v;
1760 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1764 score2+= FFABS( s1[x ] - s1[x +stride]
1765 - s1[x+1] + s1[x+1+stride])
1766 -FFABS( s2[x ] - s2[x +stride]
1767 - s2[x+1] + s2[x+1+stride]);
1774 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1775 else return score1 + FFABS(score2)*8;
/* Trellis helper: evaluates the weighted squared error of adding basis*scale
 * (rounded from BASIS_SHIFT down to RECON_SHIFT precision) to the residual. */
1778 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1782 for(i=0; i<8*8; i++){
1783 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1786 assert(-512<b && b<512);
1788 sum += (w*b)*(w*b)>>4;
/* Commits the same scaled basis function into the residual in place. */
1793 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1796 for(i=0; i<8*8; i++){
1797 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1802 * permutes an 8x8 block.
1803 * @param block the block which will be permuted according to the given permutation vector
1804 * @param permutation the permutation vector
1805 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1806 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1807 * (inverse) permutated to scantable order!
1809 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1815 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* First pass: stash the (at most last+1) live coefficients into temp,
 * visiting them in scantable order. */
1817 for(i=0; i<=last; i++){
1818 const int j= scantable[i];
/* Second pass: scatter them back through the permutation vector. */
1823 for(i=0; i<=last; i++){
1824 const int j= scantable[i];
1825 const int perm_j= permutation[j];
1826 block[perm_j]= temp[j];
/* Dummy comparator that always scores 0 (used when comparisons are disabled). */
1830 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fills the 6-entry cmp[] function table according to the FF_CMP_* 'type'.
 * NOTE(review): the switch over 'type' is elided from this listing; only a few
 * of its case assignments are visible below. */
1834 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1837 memset(cmp, 0, sizeof(void*)*6);
1845 cmp[i]= c->hadamard8_diff[i];
1851 cmp[i]= c->dct_sad[i];
1854 cmp[i]= c->dct264_sad[i];
1857 cmp[i]= c->dct_max[i];
1860 cmp[i]= c->quant_psnr[i];
1889 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Byte-wise dst[i] += src[i] without carry between lanes, done one machine
 * word at a time: low-7-bit sums plus the xor-recovered top bits (pb_7f/pb_80
 * are per-byte 0x7f/0x80 masks). The scalar tail handles the remainder. */
1894 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1896 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1897 long a = *(long*)(src+i);
1898 long b = *(long*)(dst+i);
1899 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1902 dst[i+0] += src[i+0];
/* Byte-wise dst[i] = src1[i] - src2[i], word-at-a-time with the same SWAR
 * trick (borrow-free subtraction); falls back to a byte loop when src2 is
 * misaligned and unaligned loads are slow. */
1905 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1907 #if !HAVE_FAST_UNALIGNED
1908 if((long)src2 & (sizeof(long)-1)){
1909 for(i=0; i+7<w; i+=8){
1910 dst[i+0] = src1[i+0]-src2[i+0];
1911 dst[i+1] = src1[i+1]-src2[i+1];
1912 dst[i+2] = src1[i+2]-src2[i+2];
1913 dst[i+3] = src1[i+3]-src2[i+3];
1914 dst[i+4] = src1[i+4]-src2[i+4];
1915 dst[i+5] = src1[i+5]-src2[i+5];
1916 dst[i+6] = src1[i+6]-src2[i+6];
1917 dst[i+7] = src1[i+7]-src2[i+7];
1921 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1922 long a = *(long*)(src1+i);
1923 long b = *(long*)(src2+i);
1924 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1927 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction: reconstruct/compute residuals against
 * mid_pred(left, top, left+top-topleft) per pixel.
 * NOTE(review): most of each body (loops, l/lt bookkeeping, *left/*left_top
 * write-back) is elided from this listing — only the core lines remain. */
1930 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1938 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
1947 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1955 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Left prediction: running accumulator 'acc' added to each sample; returns the
 * final accumulator (body largely elided in this listing). */
1965 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1968 for(i=0; i<w-1; i++){
/* BGR32 variant carrying separate running values per channel (body elided). */
1995 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
2025 #define BUTTERFLY2(o1,o2,i1,i2) \
2029 #define BUTTERFLY1(x,y) \
2038 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the src-dst difference, then sum of absolute
 * transform coefficients. Rows are transformed first (BUTTERFLY2/BUTTERFLY1
 * stages), then columns, with the last column stage folded into the absolute
 * sum via BUTTERFLYA. NOTE(review): loop headers and sum init are elided. */
2040 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2048 //FIXME try pointer walks
2049 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2050 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2051 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2052 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2054 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2055 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2056 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2057 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2059 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2060 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2061 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2062 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2066 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2067 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2068 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2069 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2071 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2072 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2073 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2074 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2077 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2078 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2079 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2080 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but on raw src pixels
 * (no reference subtraction); the DC term is removed at the end so the score
 * measures AC energy only. NOTE(review): loop headers and sum init elided. */
2085 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2093 //FIXME try pointer walks
2094 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2095 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2096 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2097 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2099 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2100 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2101 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2102 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2104 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2105 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2106 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2107 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2111 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2112 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2113 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2114 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2116 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2117 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2118 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2119 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2122 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2123 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2124 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2125 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2128 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: difference the two blocks, forward-transform, and sum the
 * absolute coefficients. NOTE(review): the fdct call between diff_pixels and
 * sum_abs_dctelem is elided from this listing. */
2133 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2134 MpegEncContext * const s= (MpegEncContext *)c;
2135 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2139 s->dsp.diff_pixels(temp, src1, src2, stride);
2141 return s->dsp.sum_abs_dctelem(temp);
2146 const int s07 = SRC(0) + SRC(7);\
2147 const int s16 = SRC(1) + SRC(6);\
2148 const int s25 = SRC(2) + SRC(5);\
2149 const int s34 = SRC(3) + SRC(4);\
2150 const int a0 = s07 + s34;\
2151 const int a1 = s16 + s25;\
2152 const int a2 = s07 - s34;\
2153 const int a3 = s16 - s25;\
2154 const int d07 = SRC(0) - SRC(7);\
2155 const int d16 = SRC(1) - SRC(6);\
2156 const int d25 = SRC(2) - SRC(5);\
2157 const int d34 = SRC(3) - SRC(4);\
2158 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2159 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2160 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2161 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2163 DST(1, a4 + (a7>>2)) ;\
2164 DST(2, a2 + (a3>>1)) ;\
2165 DST(3, a5 + (a6>>2)) ;\
2167 DST(5, a6 - (a5>>2)) ;\
2168 DST(6, (a2>>1) - a3 ) ;\
2169 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: applies the separable 8x8 1-D transform (DCT8_1D macro)
 * first to rows (in place) and then to columns, where DST is redefined to
 * accumulate absolute coefficient values directly into 'sum'. */
2172 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2173 MpegEncContext * const s= (MpegEncContext *)c;
2178 s->dsp.diff_pixels(dct[0], src1, src2, stride);
2180 #define SRC(x) dct[i][x]
2181 #define DST(x,v) dct[i][x]= v
2182 for( i = 0; i < 8; i++ )
2187 #define SRC(x) dct[x][i]
2188 #define DST(x,v) sum += FFABS(v)
2189 for( i = 0; i < 8; i++ )
/* DCT-max metric: forward-transforms the difference block and returns the
 * largest absolute coefficient (fdct call elided in this listing). */
2197 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2198 MpegEncContext * const s= (MpegEncContext *)c;
2199 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2204 s->dsp.diff_pixels(temp, src1, src2, stride);
2208 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-noise metric: round-trips the difference block through
 * quantize -> dequantize -> IDCT and returns the squared error against the
 * saved pre-quantization copy (bak). */
2213 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2214 MpegEncContext * const s= (MpegEncContext *)c;
2215 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2216 DCTELEM * const bak = temp+64;
2222 s->dsp.diff_pixels(temp, src1, src2, stride);
2224 memcpy(bak, temp, 64*sizeof(DCTELEM));
2226 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2227 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2228 ff_simple_idct_8(temp); //FIXME
2231 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric: quantizes the difference block, estimates the VLC
 * bit cost from the run/level tables, reconstructs, and combines distortion
 * with a lambda-scaled bit cost. NOTE(review): several branches (intra/inter
 * selection, run/level bookkeeping, escape handling) are elided here. */
2236 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2237 MpegEncContext * const s= (MpegEncContext *)c;
2238 const uint8_t *scantable= s->intra_scantable.permutated;
2239 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2240 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2241 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2242 int i, last, run, bits, level, distortion, start_i;
2243 const int esc_length= s->ac_esc_length;
2245 uint8_t * last_length;
2249 copy_block8(lsrc1, src1, 8, stride, 8);
2250 copy_block8(lsrc2, src2, 8, stride, 8);
2252 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2254 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2260 length = s->intra_ac_vlc_length;
2261 last_length= s->intra_ac_vlc_last_length;
2262 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2265 length = s->inter_ac_vlc_length;
2266 last_length= s->inter_ac_vlc_last_length;
/* Bit cost of the non-last coefficients in scan order. */
2271 for(i=start_i; i<last; i++){
2272 int j= scantable[i];
2277 if((level&(~127)) == 0){
2278 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* Bias by 64 so the table index for the last coefficient is non-negative. */
2287 level= temp[i] + 64;
2291 if((level&(~127)) == 0){
2292 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2300 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2302 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2305 s->dsp.idct_add(lsrc2, 8, temp);
2307 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
2309 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Rate-only metric: same quantize + VLC-length accounting as rd8x8_c but
 * returns only the estimated bit count, without reconstruction/distortion.
 * NOTE(review): intra/inter branch structure and run bookkeeping are elided. */
2312 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2313 MpegEncContext * const s= (MpegEncContext *)c;
2314 const uint8_t *scantable= s->intra_scantable.permutated;
2315 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2316 int i, last, run, bits, level, start_i;
2317 const int esc_length= s->ac_esc_length;
2319 uint8_t * last_length;
2323 s->dsp.diff_pixels(temp, src1, src2, stride);
2325 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2331 length = s->intra_ac_vlc_length;
2332 last_length= s->intra_ac_vlc_last_length;
2333 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2336 length = s->inter_ac_vlc_length;
2337 last_length= s->inter_ac_vlc_last_length;
2342 for(i=start_i; i<last; i++){
2343 int j= scantable[i];
2348 if((level&(~127)) == 0){
2349 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2358 level= temp[i] + 64;
2362 if((level&(~127)) == 0){
2363 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2371 #define VSAD_INTRA(size) \
2372 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
2376 for(y=1; y<h; y++){ \
2377 for(x=0; x<size; x+=4){ \
2378 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
2379 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
2389 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2394 for(x=0; x<16; x++){
2395 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* squared value helper for the SSE-style metrics below */
#define SQ(a) ((a)*(a))

/* Vertical intra SSE: like VSAD_INTRA but accumulating squared
 * differences between adjacent rows.
 * NOTE(review): macro tail and the VSSE_INTRA(8)/VSSE_INTRA(16)
 * instantiations are missing from this excerpt. */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* Vertical SSE of the residual (s1 - s2) between adjacent rows.
 * NOTE(review): outer row loop, score init and return are missing from
 * this excerpt. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    for(x=0; x<16; x++){
        score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 vector and an int16 vector.
 * NOTE(review): the second parameter line ('int size'), the local
 * declarations and the return statement are missing from this excerpt. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Instantiate 16x16 comparison wrappers that apply each 8x8 function to
 * the four quadrants of a 16x16 block and sum the results. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/**
 * Element-wise product of two float vectors: dst[i] = src0[i] * src1[i].
 * @param len number of elements to process
 * (This excerpt was missing the 'int i;' declaration and the closing
 * brace; restored here.)
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i];
}
/**
 * Multiply src0 by src1 read backwards: dst[i] = src0[i] * src1[len-1-i].
 * @param len number of elements to process
 * (This excerpt was missing the 'src1 += len-1;' setup, without which
 * 'src1[-i]' reads before the buffer; restored per upstream.)
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;  /* point at the last element so src1[-i] walks backwards */
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}
/**
 * Fused multiply-add on float vectors: dst[i] = src0[i]*src1[i] + src2[i].
 * @param len number of elements to process
 * (This excerpt was missing the 'int i;' declaration and the closing
 * brace; restored here.)
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}
/**
 * Overlap-add windowing (MDCT-style): writes 2*len outputs from len
 * samples of src0 and src1 and a 2*len window:
 *   dst[len+i] = src0[len+i]*win[len+j] - src1[j]*win[len+i]
 *   dst[len+j] = src0[len+i]*win[len+i] + src1[j]*win[len+j]
 * for i in [-len, 0), j = -i-1.
 * dst and win must hold 2*len floats; src0 and src1 hold len floats.
 * (This excerpt was missing the pointer advances and the s0/s1/wi/wj
 * locals; restored per upstream.)
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i,j;
    /* work from the middle outwards */
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
    }
}
/**
 * Scale a float vector by a scalar: dst[i] = src[i] * mul.
 * @param len number of elements to process
 * (This excerpt was missing the 'int len)' parameter line, the 'int i;'
 * declaration and the braces; restored here.)
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}
/**
 * Multiply src by a scalar and by per-pair scale vectors: for each pair
 * of outputs, sv advances by one entry and supplies two scale factors.
 * Assumes len is a multiple of 2 (no remainder handling) — matches the
 * visible loop structure.
 * (This excerpt was missing the 'int i;' declaration and closing braces;
 * restored here.)
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}
/**
 * Like vector_fmul_sv_scalar_2_c but with 4-element scale vectors: sv
 * advances by one entry per group of four outputs.
 * Assumes len is a multiple of 4 (no remainder handling).
 * (This excerpt was missing the 'int i;' declaration and closing braces;
 * restored here.)
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}
/**
 * Expand per-pair scale vectors into dst, scaled by mul: each sv entry
 * supplies two consecutive outputs. Assumes len is a multiple of 2.
 * (This excerpt was missing the 'int len)' parameter line, 'int i;' and
 * the braces; restored here.)
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}
/**
 * Expand per-quad scale vectors into dst, scaled by mul: each sv entry
 * supplies four consecutive outputs. Assumes len is a multiple of 4.
 * (This excerpt was missing the 'int len)' parameter line, 'int i;' and
 * the braces; restored here.)
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}
/**
 * In-place butterfly: v1[i] becomes v1[i]+v2[i], v2[i] becomes the old
 * v1[i]-v2[i]. The restrict qualifiers promise the vectors don't alias.
 * (This excerpt was missing the second half of the butterfly and the
 * braces; restored per upstream.)
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}
/**
 * Dot product of two float vectors.
 * @return sum over i of v1[i]*v2[i]
 * (This excerpt was missing the accumulator declaration, the loop body
 * and the return; restored here.)
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}
/**
 * Clip one float, handled as its IEEE-754 bit pattern, against [min, max]
 * where min < 0 < max. 'mini' is the bit pattern of min (sign bit set, so
 * any a > mini is a more-negative value); 'maxisign' is the bit pattern
 * of max with the sign bit flipped, letting one unsigned compare catch
 * values above max.
 * (This excerpt was missing the final 'else return a;' and the braces;
 * restored here.)
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
/* Clip floats by comparing their IEEE-754 bit patterns as unsigned ints
 * via clipf_c_one(); valid only when min < 0 < max (opposite signs).
 * The loop is unrolled by 8 with no remainder handling — presumably len
 * is a multiple of 8; verify against callers.
 * NOTE(review): the 'int i;' declaration and closing braces are missing
 * from this excerpt.
 * NOTE(review): the uint32_t* casts of float buffers violate strict
 * aliasing; upstream builds with flags that permit this. */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip each float in src to [min, max], 8 elements per iteration.
 * When min < 0 < max the integer bit-pattern fast path is taken;
 * otherwise av_clipf is applied element-wise. Presumably len is a
 * multiple of 8 — verify against callers.
 * NOTE(review): the 'int i;' declaration, the 'else' introducing the
 * generic branch and the closing braces are missing from this excerpt. */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
/**
 * Dot product of two int16 vectors with each partial product shifted
 * right by 'shift' before accumulation.
 * @param order number of elements
 * @return the accumulated (shifted) products as int32
 * (This excerpt was missing the accumulator, loop header and return;
 * restored here.)
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}
/**
 * Computes the dot product of v1 and v2 while simultaneously updating
 * v1 in place: v1[i] += mul * v3[i]. The returned dot product uses the
 * values of v1 from *before* each update.
 * @param order number of elements
 * (This excerpt was missing the accumulator, the 'res += *v1 * *v2++;'
 * line, loop structure and return; restored per upstream.)
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}
/* Apply the first half of a symmetric int16 window to both ends of the
 * input simultaneously: output[i] and output[len-1-i] are both scaled by
 * window[i] with Q15 rounding ((x*w + 2^14) >> 15).
 * NOTE(review): MUL16 is presumably the project's 16x16->32 multiply
 * macro (mathops.h) — not visible in this excerpt; the 'int i;'
 * declaration and closing brace are also missing here. */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w = window[i];
        output[i]       = (MUL16(input[i],       w) + (1 << 14)) >> 15;
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
/* Clamp int32 samples to [min, max], unrolled by 8 (av_clip comes from
 * libavutil/common.h).
 * NOTE(review): the loop construct wrapping these eight statements is
 * missing from this excerpt (upstream iterates in steps of 8 over len)
 * — verify against upstream before building. */
static void vector_clip_int32_c(int32_t *dst, const int32_t *src, int32_t min,
                                int32_t max, unsigned int len)
    *dst++ = av_clip(*src++, min, max);
    *dst++ = av_clip(*src++, min, max);
    *dst++ = av_clip(*src++, min, max);
    *dst++ = av_clip(*src++, min, max);
    *dst++ = av_clip(*src++, min, max);
    *dst++ = av_clip(*src++, min, max);
    *dst++ = av_clip(*src++, min, max);
    *dst++ = av_clip(*src++, min, max);
/* Fixed-point cosine constants for the WMV2 IDCT butterflies.
 * W0 is referenced by wmv2_idct_row()/wmv2_idct_col() below but its
 * define was not visible in this excerpt; added here (an identical
 * token-for-token redefinition elsewhere is harmless in C). */
#define W0 2048 /* same value as W4 (2048*sqrt (2)*cos (4*pi/16)) */
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
2671 static void wmv2_idct_row(short * b)
2674 int a0,a1,a2,a3,a4,a5,a6,a7;
2676 a1 = W1*b[1]+W7*b[7];
2677 a7 = W7*b[1]-W1*b[7];
2678 a5 = W5*b[5]+W3*b[3];
2679 a3 = W3*b[5]-W5*b[3];
2680 a2 = W2*b[2]+W6*b[6];
2681 a6 = W6*b[2]-W2*b[6];
2682 a0 = W0*b[0]+W0*b[4];
2683 a4 = W0*b[0]-W0*b[4];
2685 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2686 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2688 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2689 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2690 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2691 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2692 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2693 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2694 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2695 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2697 static void wmv2_idct_col(short * b)
2700 int a0,a1,a2,a3,a4,a5,a6,a7;
2701 /*step 1, with extended precision*/
2702 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2703 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2704 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2705 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2706 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2707 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2708 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2709 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
2711 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2712 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2714 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2715 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2716 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2717 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2719 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2720 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2721 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2722 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/**
 * In-place 8x8 WMV2 inverse DCT: row pass over all 8 rows, then column
 * pass over all 8 columns.
 * (This excerpt was missing the 'int i;' declaration and the loop
 * headers; restored per upstream.)
 */
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){        /* one 8-coefficient row per iteration */
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){          /* one column (stride 8) per iteration */
        wmv2_idct_col(block+i);
    }
}
2734 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2736 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2738 ff_wmv2_idct_c(block);
2739 ff_put_pixels_clamped_c(block, dest, line_size);
2741 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2743 ff_wmv2_idct_c(block);
2744 ff_add_pixels_clamped_c(block, dest, line_size);
/* Reference ("jref") IDCT wrappers: upstream each _put/_add first runs
 * the matching j_rev_dct* transform in place, then moves the clamped
 * result into dest.
 * NOTE(review): the j_rev_dct()/j_rev_dct4()/j_rev_dct2() call lines and
 * several braces appear to be missing from this excerpt — verify against
 * upstream dsputil.c before building. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
    ff_put_pixels_clamped_c(block, dest, line_size);
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
    ff_add_pixels_clamped_c(block, dest, line_size);

/* 4x4 lowres variants */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
    put_pixels_clamped4_c(block, dest, line_size);
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
    add_pixels_clamped4_c(block, dest, line_size);

/* 2x2 lowres variants */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
    put_pixels_clamped2_c(block, dest, line_size);
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
    add_pixels_clamped2_c(block, dest, line_size);

/* 1x1 "IDCT": just descale and clamp the DC coefficient */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    dest[0] = cm[(block[0] + 4)>>3];
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2792 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
/* init static data */
/* Fill the static lookup tables: ff_cropTbl (clamp of value+MAX_NEG_CROP
 * into 0..255, with a saturated 255 upper border), ff_squareTbl
 * ((i-256)^2 for signed differences) and the inverse zigzag table
 * (scan index + 1 for each coefficient position).
 * NOTE(review): the 'int i;' declaration, the lower-border fill of
 * ff_cropTbl and the closing braces are missing from this excerpt —
 * verify against upstream. */
av_cold void dsputil_static_init(void)
    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Verify the compiler honors 16-byte alignment of stack variables (the
 * #if shows this matters when MMX/AltiVec code is built); logs a loud
 * error when misaligned.
 * NOTE(review): the 'did_fail' guard logic, return statements, the
 * closing #endif and braces are missing from this excerpt. */
int ff_check_alignment(void){
    static int did_fail=0;
    LOCAL_ALIGNED_16(int, aligned, [4]);

    if((intptr_t)aligned & 15){
#if HAVE_MMX || HAVE_ALTIVEC
        av_log(NULL, AV_LOG_ERROR,
            "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
            "and may be very slow or crash. This is not a bug in libavcodec,\n"
            "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
            "Do not report crashes to FFmpeg developers.\n");
/* Populate a DSPContext with the C reference implementations according
 * to the dct/idct algorithm choices, lowres and bit depth in avctx, then
 * let architecture-specific initializers override individual pointers.
 * NOTE(review): many structural lines are missing from this excerpt
 * (braces, 'else' arms, 'break;'s, several #if/#endif pairs, the
 * '#if CONFIG_ENCODERS' opener, 'int i;', loop headers in the
 * permutation switch) — verify against upstream dsputil.c before
 * building. */
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
    ff_check_alignment();

    /* forward DCT selection (encoder side) */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
#endif //CONFIG_ENCODERS

    /* inverse DCT: lowres decoding uses the reduced-size jref IDCTs */
    if(avctx->lowres==1){
        c->idct_put= ff_jref_idct4_put;
        c->idct_add= ff_jref_idct4_add;
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    /* 10-bit content gets the 10-bit simple IDCT */
    if (avctx->bits_per_raw_sample == 10) {
        c->idct_put              = ff_simple_idct_put_10;
        c->idct_add              = ff_simple_idct_add_10;
        c->idct                  = ff_simple_idct_10;
        c->idct_permutation_type = FF_NO_IDCT_PERM;
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct    = j_rev_dct;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
            avctx->idct_algo==FF_IDCT_VP3){
        c->idct_put= ff_vp3_idct_put_c;
        c->idct_add= ff_vp3_idct_add_c;
        c->idct    = ff_vp3_idct_c;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->idct_algo==FF_IDCT_WMV2){
        c->idct_put= ff_wmv2_idct_put_c;
        c->idct_add= ff_wmv2_idct_add_c;
        c->idct    = ff_wmv2_idct_c;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->idct_algo==FF_IDCT_FAAN){
        c->idct_put= ff_faanidct_put;
        c->idct_add= ff_faanidct_add;
        c->idct    = ff_faanidct;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
        c->idct_put= ff_ea_idct_put_c;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
        c->idct     = ff_bink_idct_c;
        c->idct_add = ff_bink_idct_add_c;
        c->idct_put = ff_bink_idct_put_c;
        c->idct_permutation_type = FF_NO_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put = ff_simple_idct_put_8;
        c->idct_add = ff_simple_idct_add_8;
        c->idct     = ff_simple_idct_8;
        c->idct_permutation_type= FF_NO_IDCT_PERM;

    /* basic pixel block helpers */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;
    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->scale_block = scale_block_c;

    /* TODO [0] 16 [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

    /* third-pel MC: table index is x + 4*y for the mcXY variants */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

    /* fill one 16-entry quarter-pel MC table (index = x + 4*y) */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);
    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);
    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    /* codec-specific sub-initializers */
#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;

    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

    /* comparison functions: slot [0] = 16x16 variant, [1] = 8x8 variant */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
    SET_CMP_FUNC(dct264_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    SET_CMP_FUNC(quant_psnr)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
    ff_dsputil_init_dwt(c);
    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    /* lossless / HuffYUV prediction helpers */
    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;

    /* in-loop filters */
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    c->h261_loop_filter= h261_loop_filter_c;
    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;
#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;

    /* float / integer vector primitives defined earlier in this file */
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->vector_clip_int32 = vector_clip_int32_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;
    c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
    c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
    c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
    c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;

    /* plane shrinkers: index n halves each dimension n times */
    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    /* cleared here; empty slots are backfilled after the arch inits */
    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* bit-depth templated helpers (see dsputil_template.c inclusions) */
#define FUNC(f, depth) f ## _ ## depth
#define FUNCC(f, depth) f ## _ ## depth ## _c

#define dspfunc1(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM        , depth);\
    c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
    c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
    c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)

#define dspfunc2(PFX, IDX, NUM, depth)\
    c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
    c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
    c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
    c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
    c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
    c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
    c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
    c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
    c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
    c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
    c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
    c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
    c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
    c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
    c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
    c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)

#define BIT_DEPTH_FUNCS(depth)\
    c->draw_edges                    = FUNCC(draw_edges            , depth);\
    c->emulated_edge_mc              = FUNC (ff_emulated_edge_mc   , depth);\
    c->clear_block                   = FUNCC(clear_block           , depth);\
    c->clear_blocks                  = FUNCC(clear_blocks          , depth);\
    c->add_pixels8                   = FUNCC(add_pixels8           , depth);\
    c->add_pixels4                   = FUNCC(add_pixels4           , depth);\
    c->put_no_rnd_pixels_l2[0]       = FUNCC(put_no_rnd_pixels16_l2, depth);\
    c->put_no_rnd_pixels_l2[1]       = FUNCC(put_no_rnd_pixels8_l2 , depth);\
\
    c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8   , depth);\
    c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4   , depth);\
    c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2   , depth);\
    c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8   , depth);\
    c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4   , depth);\
    c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2   , depth);\
\
    dspfunc1(put       , 0, 16, depth);\
    dspfunc1(put       , 1,  8, depth);\
    dspfunc1(put       , 2,  4, depth);\
    dspfunc1(put       , 3,  2, depth);\
    dspfunc1(put_no_rnd, 0, 16, depth);\
    dspfunc1(put_no_rnd, 1,  8, depth);\
    dspfunc1(avg       , 0, 16, depth);\
    dspfunc1(avg       , 1,  8, depth);\
    dspfunc1(avg       , 2,  4, depth);\
    dspfunc1(avg       , 3,  2, depth);\
    dspfunc1(avg_no_rnd, 0, 16, depth);\
    dspfunc1(avg_no_rnd, 1,  8, depth);\
\
    dspfunc2(put_h264_qpel, 0, 16, depth);\
    dspfunc2(put_h264_qpel, 1,  8, depth);\
    dspfunc2(put_h264_qpel, 2,  4, depth);\
    dspfunc2(put_h264_qpel, 3,  2, depth);\
    dspfunc2(avg_h264_qpel, 0, 16, depth);\
    dspfunc2(avg_h264_qpel, 1,  8, depth);\
    dspfunc2(avg_h264_qpel, 2,  4, depth);

    switch (avctx->bits_per_raw_sample) {
        BIT_DEPTH_FUNCS(10);
        av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);

    /* architecture-specific overrides (each checks its own CPU flags) */
    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);

    /* backfill 2tap qpel slots the arch inits left empty with h264 qpel */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];

    /* rv30/rv40 full-pel slots reuse the (possibly arch-optimized)
       h264 qpel mc00 */
    c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
    c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];

    /* build the coefficient permutation matching the chosen IDCT */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
            c->idct_permutation[i]= i;
    case FF_LIBMPEG2_IDCT_PERM:
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
    case FF_SIMPLE_IDCT_PERM:
            c->idct_permutation[i]= simple_mmx_permutation[i];
    case FF_TRANSPOSE_IDCT_PERM:
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
    case FF_PARTTRANS_IDCT_PERM:
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
    case FF_SSE2_IDCT_PERM:
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
    av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");