3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Clamping lookup table: MAX_NEG_CROP guard bytes on each side of a 256-entry
 * core so out-of-range indices still land inside the array. Zero-initialized
 * here; presumably filled by an init routine not visible in this chunk —
 * used below via `ff_cropTbl + MAX_NEG_CROP` to clip pixel sums to 0..255. */
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Table of squared values, indexed with a +256 bias (see `ff_squareTbl + 256`
 * in pix_norm1_c/sse*_c) so signed differences in -256..255 can be squared by
 * lookup. Zero-initialized here; presumably filled at init — confirm. */
44 uint32_t ff_squareTbl[512] = {0, };
46 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 evaluates to 0x0101...01 (one byte set per byte of unsigned long),
 * so multiplying by a byte value broadcasts it into every byte lane. */
47 #define pb_7f (~0UL/255 * 0x7f)
48 #define pb_80 (~0UL/255 * 0x80)
50 const uint8_t ff_zigzag_direct[64] = {
51 0, 1, 8, 16, 9, 2, 3, 10,
52 17, 24, 32, 25, 18, 11, 4, 5,
53 12, 19, 26, 33, 40, 48, 41, 34,
54 27, 20, 13, 6, 7, 14, 21, 28,
55 35, 42, 49, 56, 57, 50, 43, 36,
56 29, 22, 15, 23, 30, 37, 44, 51,
57 58, 59, 52, 45, 38, 31, 39, 46,
58 53, 60, 61, 54, 47, 55, 62, 63
61 /* Specific zigzag scan for 248 idct. NOTE that unlike the
62 specification, we interleave the fields */
63 const uint8_t ff_zigzag248_direct[64] = {
64 0, 8, 1, 9, 16, 24, 2, 10,
65 17, 25, 32, 40, 48, 56, 33, 41,
66 18, 26, 3, 11, 4, 12, 19, 27,
67 34, 42, 49, 57, 50, 58, 35, 43,
68 20, 28, 5, 13, 6, 14, 21, 29,
69 36, 44, 51, 59, 52, 60, 37, 45,
70 22, 30, 7, 15, 23, 31, 38, 46,
71 53, 61, 54, 62, 39, 47, 55, 63,
74 /* non-permuted inverse of zigzag_direct, each entry +1, for the MMX quantizer */
/* 16-byte aligned so SIMD code can use aligned loads on it. */
75 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
77 const uint8_t ff_alternate_horizontal_scan[64] = {
78 0, 1, 2, 3, 8, 9, 16, 17,
79 10, 11, 4, 5, 6, 7, 15, 14,
80 13, 12, 19, 18, 24, 25, 32, 33,
81 26, 27, 20, 21, 22, 23, 28, 29,
82 30, 31, 34, 35, 40, 41, 48, 49,
83 42, 43, 36, 37, 38, 39, 44, 45,
84 46, 47, 50, 51, 56, 57, 58, 59,
85 52, 53, 54, 55, 60, 61, 62, 63,
88 const uint8_t ff_alternate_vertical_scan[64] = {
89 0, 8, 16, 24, 1, 9, 2, 10,
90 17, 25, 32, 40, 48, 56, 57, 49,
91 41, 33, 26, 18, 3, 11, 4, 12,
92 19, 27, 34, 42, 50, 58, 35, 43,
93 51, 59, 20, 28, 5, 13, 6, 14,
94 21, 29, 36, 44, 52, 60, 37, 45,
95 53, 61, 22, 30, 7, 15, 23, 31,
96 38, 46, 54, 62, 39, 47, 55, 63,
99 /* Input permutation for the simple_idct_mmx */
100 static const uint8_t simple_mmx_permutation[64]={
101 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
102 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
103 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
104 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
105 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
106 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
107 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
108 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Row permutation for the SSE2 IDCT: interleaves the low and high four rows
 * (0,4,1,5,2,6,3,7) to match that implementation's internal data layout. */
111 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
113 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
117 st->scantable= src_scantable;
121 j = src_scantable[i];
122 st->permutated[i] = permutation[j];
131 j = st->permutated[i];
133 st->raster_end[i]= end;
137 static int pix_sum_c(uint8_t * pix, int line_size)
142 for (i = 0; i < 16; i++) {
143 for (j = 0; j < 16; j += 8) {
154 pix += line_size - 16;
159 static int pix_norm1_c(uint8_t * pix, int line_size)
162 uint32_t *sq = ff_squareTbl + 256;
165 for (i = 0; i < 16; i++) {
166 for (j = 0; j < 16; j += 8) {
177 #if LONG_MAX > 2147483647
178 register uint64_t x=*(uint64_t*)pix;
180 s += sq[(x>>8)&0xff];
181 s += sq[(x>>16)&0xff];
182 s += sq[(x>>24)&0xff];
183 s += sq[(x>>32)&0xff];
184 s += sq[(x>>40)&0xff];
185 s += sq[(x>>48)&0xff];
186 s += sq[(x>>56)&0xff];
188 register uint32_t x=*(uint32_t*)pix;
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 x=*(uint32_t*)(pix+4);
195 s += sq[(x>>8)&0xff];
196 s += sq[(x>>16)&0xff];
197 s += sq[(x>>24)&0xff];
202 pix += line_size - 16;
207 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
210 for(i=0; i+8<=w; i+=8){
211 dst[i+0]= av_bswap32(src[i+0]);
212 dst[i+1]= av_bswap32(src[i+1]);
213 dst[i+2]= av_bswap32(src[i+2]);
214 dst[i+3]= av_bswap32(src[i+3]);
215 dst[i+4]= av_bswap32(src[i+4]);
216 dst[i+5]= av_bswap32(src[i+5]);
217 dst[i+6]= av_bswap32(src[i+6]);
218 dst[i+7]= av_bswap32(src[i+7]);
221 dst[i+0]= av_bswap32(src[i+0]);
225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
228 uint32_t *sq = ff_squareTbl + 256;
231 for (i = 0; i < h; i++) {
232 s += sq[pix1[0] - pix2[0]];
233 s += sq[pix1[1] - pix2[1]];
234 s += sq[pix1[2] - pix2[2]];
235 s += sq[pix1[3] - pix2[3]];
242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
245 uint32_t *sq = ff_squareTbl + 256;
248 for (i = 0; i < h; i++) {
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
253 s += sq[pix1[4] - pix2[4]];
254 s += sq[pix1[5] - pix2[5]];
255 s += sq[pix1[6] - pix2[6]];
256 s += sq[pix1[7] - pix2[7]];
263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
266 uint32_t *sq = ff_squareTbl + 256;
269 for (i = 0; i < h; i++) {
270 s += sq[pix1[ 0] - pix2[ 0]];
271 s += sq[pix1[ 1] - pix2[ 1]];
272 s += sq[pix1[ 2] - pix2[ 2]];
273 s += sq[pix1[ 3] - pix2[ 3]];
274 s += sq[pix1[ 4] - pix2[ 4]];
275 s += sq[pix1[ 5] - pix2[ 5]];
276 s += sq[pix1[ 6] - pix2[ 6]];
277 s += sq[pix1[ 7] - pix2[ 7]];
278 s += sq[pix1[ 8] - pix2[ 8]];
279 s += sq[pix1[ 9] - pix2[ 9]];
280 s += sq[pix1[10] - pix2[10]];
281 s += sq[pix1[11] - pix2[11]];
282 s += sq[pix1[12] - pix2[12]];
283 s += sq[pix1[13] - pix2[13]];
284 s += sq[pix1[14] - pix2[14]];
285 s += sq[pix1[15] - pix2[15]];
293 /* draw the edges of width 'w' of an image of size width, height */
294 //FIXME check that this is ok for mpeg4 interlaced
295 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
297 uint8_t *ptr, *last_line;
300 last_line = buf + (height - 1) * wrap;
303 memcpy(buf - (i + 1) * wrap, buf, width);
304 memcpy(last_line + (i + 1) * wrap, last_line, width);
308 for(i=0;i<height;i++) {
309 memset(ptr - w, ptr[0], w);
310 memset(ptr + width, ptr[width-1], w);
315 memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
316 memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
317 memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
318 memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
323 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
324 * @param buf destination buffer
325 * @param src source buffer
326 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
327 * @param block_w width of block
328 * @param block_h height of block
329 * @param src_x x coordinate of the top left sample of the block in the source buffer
330 * @param src_y y coordinate of the top left sample of the block in the source buffer
331 * @param w width of the source buffer
332 * @param h height of the source buffer
334 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
335 int src_x, int src_y, int w, int h){
337 int start_y, start_x, end_y, end_x;
340 src+= (h-1-src_y)*linesize;
342 }else if(src_y<=-block_h){
343 src+= (1-block_h-src_y)*linesize;
349 }else if(src_x<=-block_w){
350 src+= (1-block_w-src_x);
354 start_y= FFMAX(0, -src_y);
355 start_x= FFMAX(0, -src_x);
356 end_y= FFMIN(block_h, h-src_y);
357 end_x= FFMIN(block_w, w-src_x);
358 assert(start_y < end_y && block_h);
359 assert(start_x < end_x && block_w);
362 src += start_y*linesize + start_x;
366 for(y=0; y<start_y; y++){
371 // copy existing part
380 for(; y<block_h; y++){
385 buf -= block_h * linesize + start_x;
388 for(x=0; x<start_x; x++){
389 buf[x] = buf[start_x];
393 for(x=end_x; x<block_w; x++){
394 buf[x] = buf[end_x - 1];
400 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
404 /* read the pixels */
406 block[0] = pixels[0];
407 block[1] = pixels[1];
408 block[2] = pixels[2];
409 block[3] = pixels[3];
410 block[4] = pixels[4];
411 block[5] = pixels[5];
412 block[6] = pixels[6];
413 block[7] = pixels[7];
419 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
420 const uint8_t *s2, int stride){
423 /* read the pixels */
425 block[0] = s1[0] - s2[0];
426 block[1] = s1[1] - s2[1];
427 block[2] = s1[2] - s2[2];
428 block[3] = s1[3] - s2[3];
429 block[4] = s1[4] - s2[4];
430 block[5] = s1[5] - s2[5];
431 block[6] = s1[6] - s2[6];
432 block[7] = s1[7] - s2[7];
440 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
444 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
446 /* read the pixels */
448 pixels[0] = cm[block[0]];
449 pixels[1] = cm[block[1]];
450 pixels[2] = cm[block[2]];
451 pixels[3] = cm[block[3]];
452 pixels[4] = cm[block[4]];
453 pixels[5] = cm[block[5]];
454 pixels[6] = cm[block[6]];
455 pixels[7] = cm[block[7]];
462 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
466 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
468 /* read the pixels */
470 pixels[0] = cm[block[0]];
471 pixels[1] = cm[block[1]];
472 pixels[2] = cm[block[2]];
473 pixels[3] = cm[block[3]];
480 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
484 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
486 /* read the pixels */
488 pixels[0] = cm[block[0]];
489 pixels[1] = cm[block[1]];
496 static void put_signed_pixels_clamped_c(const DCTELEM *block,
497 uint8_t *restrict pixels,
502 for (i = 0; i < 8; i++) {
503 for (j = 0; j < 8; j++) {
506 else if (*block > 127)
509 *pixels = (uint8_t)(*block + 128);
513 pixels += (line_size - 8);
517 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
522 /* read the pixels */
524 pixels[0] = block[0];
525 pixels[1] = block[1];
526 pixels[2] = block[2];
527 pixels[3] = block[3];
528 pixels[4] = block[4];
529 pixels[5] = block[5];
530 pixels[6] = block[6];
531 pixels[7] = block[7];
538 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
542 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
544 /* read the pixels */
546 pixels[0] = cm[pixels[0] + block[0]];
547 pixels[1] = cm[pixels[1] + block[1]];
548 pixels[2] = cm[pixels[2] + block[2]];
549 pixels[3] = cm[pixels[3] + block[3]];
550 pixels[4] = cm[pixels[4] + block[4]];
551 pixels[5] = cm[pixels[5] + block[5]];
552 pixels[6] = cm[pixels[6] + block[6]];
553 pixels[7] = cm[pixels[7] + block[7]];
559 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
563 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
565 /* read the pixels */
567 pixels[0] = cm[pixels[0] + block[0]];
568 pixels[1] = cm[pixels[1] + block[1]];
569 pixels[2] = cm[pixels[2] + block[2]];
570 pixels[3] = cm[pixels[3] + block[3]];
576 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
580 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
582 /* read the pixels */
584 pixels[0] = cm[pixels[0] + block[0]];
585 pixels[1] = cm[pixels[1] + block[1]];
591 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
595 pixels[0] += block[0];
596 pixels[1] += block[1];
597 pixels[2] += block[2];
598 pixels[3] += block[3];
599 pixels[4] += block[4];
600 pixels[5] += block[5];
601 pixels[6] += block[6];
602 pixels[7] += block[7];
608 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
612 pixels[0] += block[0];
613 pixels[1] += block[1];
614 pixels[2] += block[2];
615 pixels[3] += block[3];
621 static int sum_abs_dctelem_c(DCTELEM *block)
625 sum+= FFABS(block[i]);
629 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
633 for (i = 0; i < h; i++) {
634 memset(block, value, 16);
639 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
643 for (i = 0; i < h; i++) {
644 memset(block, value, 8);
649 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
652 uint16_t *dst1 = (uint16_t *) dst;
653 uint16_t *dst2 = (uint16_t *)(dst + linesize);
655 for (j = 0; j < 8; j++) {
656 for (i = 0; i < 8; i++) {
657 dst1[i] = dst2[i] = src[i] * 0x0101;
667 #define PIXOP2(OPNAME, OP) \
668 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
672 OP(*((uint64_t*)block), AV_RN64(pixels));\
678 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
682 const uint64_t a= AV_RN64(pixels );\
683 const uint64_t b= AV_RN64(pixels+1);\
684 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
690 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
694 const uint64_t a= AV_RN64(pixels );\
695 const uint64_t b= AV_RN64(pixels+1);\
696 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
702 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
706 const uint64_t a= AV_RN64(pixels );\
707 const uint64_t b= AV_RN64(pixels+line_size);\
708 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
714 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
718 const uint64_t a= AV_RN64(pixels );\
719 const uint64_t b= AV_RN64(pixels+line_size);\
720 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
726 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
729 const uint64_t a= AV_RN64(pixels );\
730 const uint64_t b= AV_RN64(pixels+1);\
731 uint64_t l0= (a&0x0303030303030303ULL)\
732 + (b&0x0303030303030303ULL)\
733 + 0x0202020202020202ULL;\
734 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
735 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
739 for(i=0; i<h; i+=2){\
740 uint64_t a= AV_RN64(pixels );\
741 uint64_t b= AV_RN64(pixels+1);\
742 l1= (a&0x0303030303030303ULL)\
743 + (b&0x0303030303030303ULL);\
744 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
745 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
746 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
749 a= AV_RN64(pixels );\
750 b= AV_RN64(pixels+1);\
751 l0= (a&0x0303030303030303ULL)\
752 + (b&0x0303030303030303ULL)\
753 + 0x0202020202020202ULL;\
754 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
755 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
756 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
762 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
765 const uint64_t a= AV_RN64(pixels );\
766 const uint64_t b= AV_RN64(pixels+1);\
767 uint64_t l0= (a&0x0303030303030303ULL)\
768 + (b&0x0303030303030303ULL)\
769 + 0x0101010101010101ULL;\
770 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
771 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
775 for(i=0; i<h; i+=2){\
776 uint64_t a= AV_RN64(pixels );\
777 uint64_t b= AV_RN64(pixels+1);\
778 l1= (a&0x0303030303030303ULL)\
779 + (b&0x0303030303030303ULL);\
780 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
781 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
782 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
785 a= AV_RN64(pixels );\
786 b= AV_RN64(pixels+1);\
787 l0= (a&0x0303030303030303ULL)\
788 + (b&0x0303030303030303ULL)\
789 + 0x0101010101010101ULL;\
790 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
791 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
792 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
798 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
799 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
800 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
801 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
802 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
803 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
804 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
806 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
807 #else // 64 bit variant
809 #define PIXOP2(OPNAME, OP) \
810 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
813 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
818 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
821 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
826 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
829 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
830 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
835 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
836 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
839 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
840 int src_stride1, int src_stride2, int h){\
844 a= AV_RN32(&src1[i*src_stride1 ]);\
845 b= AV_RN32(&src2[i*src_stride2 ]);\
846 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
847 a= AV_RN32(&src1[i*src_stride1+4]);\
848 b= AV_RN32(&src2[i*src_stride2+4]);\
849 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
853 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
854 int src_stride1, int src_stride2, int h){\
858 a= AV_RN32(&src1[i*src_stride1 ]);\
859 b= AV_RN32(&src2[i*src_stride2 ]);\
860 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
861 a= AV_RN32(&src1[i*src_stride1+4]);\
862 b= AV_RN32(&src2[i*src_stride2+4]);\
863 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
867 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
868 int src_stride1, int src_stride2, int h){\
872 a= AV_RN32(&src1[i*src_stride1 ]);\
873 b= AV_RN32(&src2[i*src_stride2 ]);\
874 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
878 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
879 int src_stride1, int src_stride2, int h){\
883 a= AV_RN16(&src1[i*src_stride1 ]);\
884 b= AV_RN16(&src2[i*src_stride2 ]);\
885 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
889 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
890 int src_stride1, int src_stride2, int h){\
891 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
892 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
895 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
896 int src_stride1, int src_stride2, int h){\
897 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
898 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
901 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
902 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
905 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
906 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
909 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
910 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
913 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
914 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
917 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
918 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
921 uint32_t a, b, c, d, l0, l1, h0, h1;\
922 a= AV_RN32(&src1[i*src_stride1]);\
923 b= AV_RN32(&src2[i*src_stride2]);\
924 c= AV_RN32(&src3[i*src_stride3]);\
925 d= AV_RN32(&src4[i*src_stride4]);\
926 l0= (a&0x03030303UL)\
929 h0= ((a&0xFCFCFCFCUL)>>2)\
930 + ((b&0xFCFCFCFCUL)>>2);\
931 l1= (c&0x03030303UL)\
933 h1= ((c&0xFCFCFCFCUL)>>2)\
934 + ((d&0xFCFCFCFCUL)>>2);\
935 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
936 a= AV_RN32(&src1[i*src_stride1+4]);\
937 b= AV_RN32(&src2[i*src_stride2+4]);\
938 c= AV_RN32(&src3[i*src_stride3+4]);\
939 d= AV_RN32(&src4[i*src_stride4+4]);\
940 l0= (a&0x03030303UL)\
943 h0= ((a&0xFCFCFCFCUL)>>2)\
944 + ((b&0xFCFCFCFCUL)>>2);\
945 l1= (c&0x03030303UL)\
947 h1= ((c&0xFCFCFCFCUL)>>2)\
948 + ((d&0xFCFCFCFCUL)>>2);\
949 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
953 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
954 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
957 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
958 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
961 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
962 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
965 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
966 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
969 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
970 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
973 uint32_t a, b, c, d, l0, l1, h0, h1;\
974 a= AV_RN32(&src1[i*src_stride1]);\
975 b= AV_RN32(&src2[i*src_stride2]);\
976 c= AV_RN32(&src3[i*src_stride3]);\
977 d= AV_RN32(&src4[i*src_stride4]);\
978 l0= (a&0x03030303UL)\
981 h0= ((a&0xFCFCFCFCUL)>>2)\
982 + ((b&0xFCFCFCFCUL)>>2);\
983 l1= (c&0x03030303UL)\
985 h1= ((c&0xFCFCFCFCUL)>>2)\
986 + ((d&0xFCFCFCFCUL)>>2);\
987 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
988 a= AV_RN32(&src1[i*src_stride1+4]);\
989 b= AV_RN32(&src2[i*src_stride2+4]);\
990 c= AV_RN32(&src3[i*src_stride3+4]);\
991 d= AV_RN32(&src4[i*src_stride4+4]);\
992 l0= (a&0x03030303UL)\
995 h0= ((a&0xFCFCFCFCUL)>>2)\
996 + ((b&0xFCFCFCFCUL)>>2);\
997 l1= (c&0x03030303UL)\
999 h1= ((c&0xFCFCFCFCUL)>>2)\
1000 + ((d&0xFCFCFCFCUL)>>2);\
1001 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1004 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1005 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1006 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1007 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1009 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1010 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1011 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1012 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1015 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1017 int i, a0, b0, a1, b1;\
1024 for(i=0; i<h; i+=2){\
1030 block[0]= (a1+a0)>>2; /* FIXME non put */\
1031 block[1]= (b1+b0)>>2;\
1041 block[0]= (a1+a0)>>2;\
1042 block[1]= (b1+b0)>>2;\
1048 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1051 const uint32_t a= AV_RN32(pixels );\
1052 const uint32_t b= AV_RN32(pixels+1);\
1053 uint32_t l0= (a&0x03030303UL)\
1056 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1057 + ((b&0xFCFCFCFCUL)>>2);\
1061 for(i=0; i<h; i+=2){\
1062 uint32_t a= AV_RN32(pixels );\
1063 uint32_t b= AV_RN32(pixels+1);\
1064 l1= (a&0x03030303UL)\
1065 + (b&0x03030303UL);\
1066 h1= ((a&0xFCFCFCFCUL)>>2)\
1067 + ((b&0xFCFCFCFCUL)>>2);\
1068 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1071 a= AV_RN32(pixels );\
1072 b= AV_RN32(pixels+1);\
1073 l0= (a&0x03030303UL)\
1076 h0= ((a&0xFCFCFCFCUL)>>2)\
1077 + ((b&0xFCFCFCFCUL)>>2);\
1078 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1084 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1087 for(j=0; j<2; j++){\
1089 const uint32_t a= AV_RN32(pixels );\
1090 const uint32_t b= AV_RN32(pixels+1);\
1091 uint32_t l0= (a&0x03030303UL)\
1094 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1095 + ((b&0xFCFCFCFCUL)>>2);\
1099 for(i=0; i<h; i+=2){\
1100 uint32_t a= AV_RN32(pixels );\
1101 uint32_t b= AV_RN32(pixels+1);\
1102 l1= (a&0x03030303UL)\
1103 + (b&0x03030303UL);\
1104 h1= ((a&0xFCFCFCFCUL)>>2)\
1105 + ((b&0xFCFCFCFCUL)>>2);\
1106 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1109 a= AV_RN32(pixels );\
1110 b= AV_RN32(pixels+1);\
1111 l0= (a&0x03030303UL)\
1114 h0= ((a&0xFCFCFCFCUL)>>2)\
1115 + ((b&0xFCFCFCFCUL)>>2);\
1116 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1120 pixels+=4-line_size*(h+1);\
1121 block +=4-line_size*h;\
1125 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1128 for(j=0; j<2; j++){\
1130 const uint32_t a= AV_RN32(pixels );\
1131 const uint32_t b= AV_RN32(pixels+1);\
1132 uint32_t l0= (a&0x03030303UL)\
1135 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1136 + ((b&0xFCFCFCFCUL)>>2);\
1140 for(i=0; i<h; i+=2){\
1141 uint32_t a= AV_RN32(pixels );\
1142 uint32_t b= AV_RN32(pixels+1);\
1143 l1= (a&0x03030303UL)\
1144 + (b&0x03030303UL);\
1145 h1= ((a&0xFCFCFCFCUL)>>2)\
1146 + ((b&0xFCFCFCFCUL)>>2);\
1147 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1150 a= AV_RN32(pixels );\
1151 b= AV_RN32(pixels+1);\
1152 l0= (a&0x03030303UL)\
1155 h0= ((a&0xFCFCFCFCUL)>>2)\
1156 + ((b&0xFCFCFCFCUL)>>2);\
1157 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1161 pixels+=4-line_size*(h+1);\
1162 block +=4-line_size*h;\
1166 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1167 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1168 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1169 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1170 av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1171 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1172 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1173 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* "avg" destination op: blend new packed-byte value into `a` with rounding
 * (rnd_avg32 averages four bytes at once). Plugged into PIXOP2 above. */
1175 #define op_avg(a, b) a = rnd_avg32(a, b)
/* "put" destination op: plain overwrite of the destination word. */
1177 #define op_put(a, b) a = b
/* A straight copy involves no averaging, so the rounding and no-rounding
 * "put" variants are identical — alias them instead of duplicating code. */
1184 #define put_no_rnd_pixels8_c put_pixels8_c
1185 #define put_no_rnd_pixels16_c put_pixels16_c
/* Scalar pixel averages: avg2 rounds the 2-tap mean upward (+1 before >>1),
 * avg4 rounds the 4-tap mean (+2 before >>2). Arguments are evaluated once
 * but unparenthesized inside the sum — callers pass simple expressions only. */
1187 #define avg2(a,b) ((a+b+1)>>1)
1188 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1190 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1191 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1194 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1195 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1198 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1200 const int A=(16-x16)*(16-y16);
1201 const int B=( x16)*(16-y16);
1202 const int C=(16-x16)*( y16);
1203 const int D=( x16)*( y16);
1208 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1209 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1210 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1211 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1212 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1213 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1214 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1215 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1221 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1222 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1225 const int s= 1<<shift;
1235 for(x=0; x<8; x++){ //XXX FIXME optimize
1236 int src_x, src_y, frac_x, frac_y, index;
1240 frac_x= src_x&(s-1);
1241 frac_y= src_y&(s-1);
1245 if((unsigned)src_x < width){
1246 if((unsigned)src_y < height){
1247 index= src_x + src_y*stride;
1248 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1249 + src[index +1]* frac_x )*(s-frac_y)
1250 + ( src[index+stride ]*(s-frac_x)
1251 + src[index+stride+1]* frac_x )* frac_y
1254 index= src_x + av_clip(src_y, 0, height)*stride;
1255 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1256 + src[index +1]* frac_x )*s
1260 if((unsigned)src_y < height){
1261 index= av_clip(src_x, 0, width) + src_y*stride;
1262 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1263 + src[index+stride ]* frac_y )*s
1266 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1267 dst[y*stride + x]= src[index ];
1279 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1281 case 2: put_pixels2_c (dst, src, stride, height); break;
1282 case 4: put_pixels4_c (dst, src, stride, height); break;
1283 case 8: put_pixels8_c (dst, src, stride, height); break;
1284 case 16:put_pixels16_c(dst, src, stride, height); break;
1288 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1290 for (i=0; i < height; i++) {
1291 for (j=0; j < width; j++) {
1292 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1299 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1301 for (i=0; i < height; i++) {
1302 for (j=0; j < width; j++) {
1303 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1310 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1312 for (i=0; i < height; i++) {
1313 for (j=0; j < width; j++) {
1314 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* 2D 1/3,1/3-pel put: weights (4,3,3,2)/12 over the 2x2 neighbourhood,
 * fixed point via 2731/32768 ~= 1/12 (2731*12 = 32772).  Loop scaffolding
 * elided in this listing. */
1321 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1323 for (i=0; i < height; i++) {
1324 for (j=0; j < width; j++) {
1325 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* 2D 1/3-h, 2/3-v put: weights (3,2,4,3)/12 over the 2x2 neighbourhood
 * (2731/2^15 ~= 1/12).  Loop scaffolding elided in this listing. */
1332 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1334 for (i=0; i < height; i++) {
1335 for (j=0; j < width; j++) {
1336 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Vertical 2/3-pel put: dst[j] = round((cur + 2*below)/3) via 683/2048.
 * Loop scaffolding elided in this listing. */
1343 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1345 for (i=0; i < height; i++) {
1346 for (j=0; j < width; j++) {
1347 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* 2D 2/3-h, 1/3-v put: weights (3,4,2,3)/12 over the 2x2 neighbourhood
 * (2731/2^15 ~= 1/12).  Loop scaffolding elided in this listing. */
1354 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1356 for (i=0; i < height; i++) {
1357 for (j=0; j < width; j++) {
1358 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* 2D 2/3,2/3-pel put: weights (2,3,3,4)/12 over the 2x2 neighbourhood
 * (2731/2^15 ~= 1/12).  Loop scaffolding elided in this listing. */
1365 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1367 for (i=0; i < height; i++) {
1368 for (j=0; j < width; j++) {
1369 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Thirdpel full-pel "avg": average the copied block into dst, dispatched on
 * `width`.  (switch(width) header and closing braces elided in this listing.) */
1376 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1378 case 2: avg_pixels2_c (dst, src, stride, height); break;
1379 case 4: avg_pixels4_c (dst, src, stride, height); break;
1380 case 8: avg_pixels8_c (dst, src, stride, height); break;
1381 case 16:avg_pixels16_c(dst, src, stride, height); break;
/* avg variant of mc10: compute the same (2a+b)/3 horizontal filter, then
 * round-average it with the existing dst pixel.  Loop scaffolding elided. */
1385 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1387 for (i=0; i < height; i++) {
1388 for (j=0; j < width; j++) {
1389 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* avg variant of mc20: (a+2b)/3 horizontal filter, round-averaged with the
 * existing dst pixel.  Loop scaffolding elided in this listing. */
1396 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1398 for (i=0; i < height; i++) {
1399 for (j=0; j < width; j++) {
1400 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* avg variant of mc01: (2*cur + below)/3 vertical filter, round-averaged
 * with the existing dst pixel.  Loop scaffolding elided in this listing. */
1407 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1409 for (i=0; i < height; i++) {
1410 for (j=0; j < width; j++) {
1411 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* avg variant of mc11: (4,3,3,2)/12 2D filter, round-averaged with the
 * existing dst pixel.  Loop scaffolding elided in this listing. */
1418 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1420 for (i=0; i < height; i++) {
1421 for (j=0; j < width; j++) {
1422 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* avg variant of mc12: (3,2,4,3)/12 2D filter, round-averaged with the
 * existing dst pixel.  Loop scaffolding elided in this listing. */
1429 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1431 for (i=0; i < height; i++) {
1432 for (j=0; j < width; j++) {
1433 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* avg variant of mc02: (cur + 2*below)/3 vertical filter, round-averaged
 * with the existing dst pixel.  Loop scaffolding elided in this listing. */
1440 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1442 for (i=0; i < height; i++) {
1443 for (j=0; j < width; j++) {
1444 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* avg variant of mc21: (3,4,2,3)/12 2D filter, round-averaged with the
 * existing dst pixel.  Loop scaffolding elided in this listing. */
1451 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1453 for (i=0; i < height; i++) {
1454 for (j=0; j < width; j++) {
1455 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* avg variant of mc22: (2,3,3,4)/12 2D filter, round-averaged with the
 * existing dst pixel.  Loop scaffolding elided in this listing. */
1462 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1464 for (i=0; i < height; i++) {
1465 for (j=0; j < width; j++) {
1466 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/*
 * Generates fixed-width thirdpel-MC wrappers (put_tpel_pixelsW_mcXY_c) that
 * forward to the variable-width helpers above with `width` baked in.
 * Fix: each body line was written as `void put_tpel_pixels_mcXY_c(args);`.
 * With the `void` keyword that is not a call — it is a block-scope,
 * K&R identifier-list function *declaration*, i.e. a statement with no
 * effect — so every generated wrapper silently did nothing.  Removing the
 * `void` turns each line back into the intended function call.
 */
1473 #define TPEL_WIDTH(width)\
1474 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1475 put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1476 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1477 put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1478 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1479 put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1480 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1481 put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1482 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1483 put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1484 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1485 put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1486 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1487 put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1488 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1489 put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1490 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1491 put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/*
 * H.264 chroma MC generator: OPNAME ## h264_chroma_mc{2,4,8}_c.
 * Bilinear interpolation at 1/8-pel precision: the 2x2 weights
 * A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy always sum to 64, and OP
 * (op_put/op_avg below) performs the (v+32)>>6 normalization.
 * When x or y is zero the filter degenerates to a 2-tap along `step`
 * (stride if C!=0, else 1); E is declared in a line elided from this
 * listing — presumably E = B+C, the single nonzero cross weight — TODO
 * confirm against upstream.  Interior control-flow lines (if/else split,
 * pointer advances, closing braces) are elided in this listing.
 */
1494 #define H264_CHROMA_MC(OPNAME, OP)\
1495 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1496 const int A=(8-x)*(8-y);\
1497 const int B=( x)*(8-y);\
1498 const int C=(8-x)*( y);\
1499 const int D=( x)*( y);\
1502 assert(x<8 && y<8 && x>=0 && y>=0);\
1505 for(i=0; i<h; i++){\
1506 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1507 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1513 const int step= C ? stride : 1;\
1514 for(i=0; i<h; i++){\
1515 OP(dst[0], (A*src[0] + E*src[step+0]));\
1516 OP(dst[1], (A*src[1] + E*src[step+1]));\
1523 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1524 const int A=(8-x)*(8-y);\
1525 const int B=( x)*(8-y);\
1526 const int C=(8-x)*( y);\
1527 const int D=( x)*( y);\
1530 assert(x<8 && y<8 && x>=0 && y>=0);\
1533 for(i=0; i<h; i++){\
1534 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1535 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1536 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1537 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1543 const int step= C ? stride : 1;\
1544 for(i=0; i<h; i++){\
1545 OP(dst[0], (A*src[0] + E*src[step+0]));\
1546 OP(dst[1], (A*src[1] + E*src[step+1]));\
1547 OP(dst[2], (A*src[2] + E*src[step+2]));\
1548 OP(dst[3], (A*src[3] + E*src[step+3]));\
1555 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1556 const int A=(8-x)*(8-y);\
1557 const int B=( x)*(8-y);\
1558 const int C=(8-x)*( y);\
1559 const int D=( x)*( y);\
1562 assert(x<8 && y<8 && x>=0 && y>=0);\
1565 for(i=0; i<h; i++){\
1566 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1567 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1568 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1569 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1570 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1571 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1572 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1573 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1579 const int step= C ? stride : 1;\
1580 for(i=0; i<h; i++){\
1581 OP(dst[0], (A*src[0] + E*src[step+0]));\
1582 OP(dst[1], (A*src[1] + E*src[step+1]));\
1583 OP(dst[2], (A*src[2] + E*src[step+2]));\
1584 OP(dst[3], (A*src[3] + E*src[step+3]));\
1585 OP(dst[4], (A*src[4] + E*src[step+4]));\
1586 OP(dst[5], (A*src[5] + E*src[step+5]));\
1587 OP(dst[6], (A*src[6] + E*src[step+6]));\
1588 OP(dst[7], (A*src[7] + E*src[step+7]));\
/* Output macros for the H.264 chroma MC expansions: the weights above sum
 * to 64, so op_put rounds with (v+32)>>6 and op_avg additionally
 * round-averages into the existing dst pixel. */
1595 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1596 #define op_put(a, b) a = (((b) + 32)>>6)
1598 H264_CHROMA_MC(put_ , op_put)
1599 H264_CHROMA_MC(avg_ , op_avg)
/* Fix: scope the op macros to the chroma expansions above.  Without these
 * #undefs, op_avg/op_put stay defined and collide with the qpel
 * redefinitions further down the file (macro redefinition with a different
 * body is a constraint violation). */
#undef op_avg
#undef op_put
/*
 * MPEG-4 quarter-pel motion-compensation generator.
 * The h/v lowpass filters implement the 8-tap halfpel kernel
 *   (a+b)*20 - (c+d)*6 + (e+f)*3 - (g+h);
 * near the block edge the outer taps fold back onto interior samples
 * (e.g. src[8]/src[7] reused in the last rows/columns) — intentional
 * MPEG-4-style edge mirroring, not a typo.
 * OP is an op_put/op_avg-style macro supplied at expansion time that
 * normalizes with (v+16)>>5 (or +15 for the no-rounding variants) and
 * clips through the `cm` crop-table pointer (ff_cropTbl + MAX_NEG_CROP).
 * Naming: OPNAME ## qpelN_mcXY_c, where X is the horizontal and Y the
 * vertical quarter-pel phase (0..3); full-pel positions are intermediate
 * `half*` buffers combined with pixelsN_l2 two-way averaging.  The
 * ff_*_old_c variants keep the historical four-way (pixelsN_l4)
 * formulation for reference.
 * NOTE(review): many scaffolding lines (local declarations, loop tails,
 * blank continuations, closing braces) are elided in this listing.
 */
1603 #define QPEL_MC(r, OPNAME, RND, OP) \
1604 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1605 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1609 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1610 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1611 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1612 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1613 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1614 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1615 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1616 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1622 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1624 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1628 const int src0= src[0*srcStride];\
1629 const int src1= src[1*srcStride];\
1630 const int src2= src[2*srcStride];\
1631 const int src3= src[3*srcStride];\
1632 const int src4= src[4*srcStride];\
1633 const int src5= src[5*srcStride];\
1634 const int src6= src[6*srcStride];\
1635 const int src7= src[7*srcStride];\
1636 const int src8= src[8*srcStride];\
1637 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1638 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1639 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1640 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1641 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1642 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1643 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1644 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1650 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1651 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1656 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1657 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1658 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1659 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1660 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1661 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1662 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1663 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1664 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1665 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1666 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1667 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1668 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1669 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1670 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1671 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1677 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1678 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1683 const int src0= src[0*srcStride];\
1684 const int src1= src[1*srcStride];\
1685 const int src2= src[2*srcStride];\
1686 const int src3= src[3*srcStride];\
1687 const int src4= src[4*srcStride];\
1688 const int src5= src[5*srcStride];\
1689 const int src6= src[6*srcStride];\
1690 const int src7= src[7*srcStride];\
1691 const int src8= src[8*srcStride];\
1692 const int src9= src[9*srcStride];\
1693 const int src10= src[10*srcStride];\
1694 const int src11= src[11*srcStride];\
1695 const int src12= src[12*srcStride];\
1696 const int src13= src[13*srcStride];\
1697 const int src14= src[14*srcStride];\
1698 const int src15= src[15*srcStride];\
1699 const int src16= src[16*srcStride];\
1700 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1701 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1702 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1703 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1704 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1705 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1706 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1707 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1708 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1709 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1710 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1711 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1712 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1713 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1714 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1715 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1721 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1723 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1724 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1727 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1728 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1731 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1733 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1734 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1737 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1738 uint8_t full[16*9];\
1740 copy_block9(full, src, 16, stride, 9);\
1741 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1742 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1745 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1746 uint8_t full[16*9];\
1747 copy_block9(full, src, 16, stride, 9);\
1748 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1751 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1752 uint8_t full[16*9];\
1754 copy_block9(full, src, 16, stride, 9);\
1755 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1756 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1758 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1759 uint8_t full[16*9];\
1762 uint8_t halfHV[64];\
1763 copy_block9(full, src, 16, stride, 9);\
1764 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1765 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1766 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1767 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1769 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1770 uint8_t full[16*9];\
1772 uint8_t halfHV[64];\
1773 copy_block9(full, src, 16, stride, 9);\
1774 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1775 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1776 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1777 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1779 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1780 uint8_t full[16*9];\
1783 uint8_t halfHV[64];\
1784 copy_block9(full, src, 16, stride, 9);\
1785 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1786 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1787 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1788 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1790 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1791 uint8_t full[16*9];\
1793 uint8_t halfHV[64];\
1794 copy_block9(full, src, 16, stride, 9);\
1795 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1796 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1797 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1798 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1800 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1801 uint8_t full[16*9];\
1804 uint8_t halfHV[64];\
1805 copy_block9(full, src, 16, stride, 9);\
1806 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1807 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1808 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1809 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1811 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1812 uint8_t full[16*9];\
1814 uint8_t halfHV[64];\
1815 copy_block9(full, src, 16, stride, 9);\
1816 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1817 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1818 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1819 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1821 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1822 uint8_t full[16*9];\
1825 uint8_t halfHV[64];\
1826 copy_block9(full, src, 16, stride, 9);\
1827 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1828 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1829 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1830 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1832 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1833 uint8_t full[16*9];\
1835 uint8_t halfHV[64];\
1836 copy_block9(full, src, 16, stride, 9);\
1837 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1838 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1839 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1840 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1842 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1844 uint8_t halfHV[64];\
1845 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1846 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1847 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1849 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1851 uint8_t halfHV[64];\
1852 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1853 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1854 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1856 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1857 uint8_t full[16*9];\
1860 uint8_t halfHV[64];\
1861 copy_block9(full, src, 16, stride, 9);\
1862 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1863 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1864 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1865 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1867 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1868 uint8_t full[16*9];\
1870 copy_block9(full, src, 16, stride, 9);\
1871 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1872 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1873 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1875 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1876 uint8_t full[16*9];\
1879 uint8_t halfHV[64];\
1880 copy_block9(full, src, 16, stride, 9);\
1881 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1882 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1883 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1884 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1886 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1887 uint8_t full[16*9];\
1889 copy_block9(full, src, 16, stride, 9);\
1890 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1891 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1892 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1894 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1896 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1897 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1900 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1902 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1903 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1906 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1907 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1910 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1912 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1913 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1916 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1917 uint8_t full[24*17];\
1919 copy_block17(full, src, 24, stride, 17);\
1920 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1921 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1924 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1925 uint8_t full[24*17];\
1926 copy_block17(full, src, 24, stride, 17);\
1927 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1930 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1931 uint8_t full[24*17];\
1933 copy_block17(full, src, 24, stride, 17);\
1934 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1935 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1937 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1938 uint8_t full[24*17];\
1939 uint8_t halfH[272];\
1940 uint8_t halfV[256];\
1941 uint8_t halfHV[256];\
1942 copy_block17(full, src, 24, stride, 17);\
1943 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1944 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1945 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1946 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1948 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1949 uint8_t full[24*17];\
1950 uint8_t halfH[272];\
1951 uint8_t halfHV[256];\
1952 copy_block17(full, src, 24, stride, 17);\
1953 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1954 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1955 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1956 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1958 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1959 uint8_t full[24*17];\
1960 uint8_t halfH[272];\
1961 uint8_t halfV[256];\
1962 uint8_t halfHV[256];\
1963 copy_block17(full, src, 24, stride, 17);\
1964 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1965 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1966 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1967 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1969 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1970 uint8_t full[24*17];\
1971 uint8_t halfH[272];\
1972 uint8_t halfHV[256];\
1973 copy_block17(full, src, 24, stride, 17);\
1974 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1975 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1976 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1977 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1979 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1980 uint8_t full[24*17];\
1981 uint8_t halfH[272];\
1982 uint8_t halfV[256];\
1983 uint8_t halfHV[256];\
1984 copy_block17(full, src, 24, stride, 17);\
1985 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1986 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1987 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1988 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1990 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1991 uint8_t full[24*17];\
1992 uint8_t halfH[272];\
1993 uint8_t halfHV[256];\
1994 copy_block17(full, src, 24, stride, 17);\
1995 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1996 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1997 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1998 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2000 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2001 uint8_t full[24*17];\
2002 uint8_t halfH[272];\
2003 uint8_t halfV[256];\
2004 uint8_t halfHV[256];\
2005 copy_block17(full, src, 24, stride, 17);\
2006 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2007 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2008 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2009 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2011 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2012 uint8_t full[24*17];\
2013 uint8_t halfH[272];\
2014 uint8_t halfHV[256];\
2015 copy_block17(full, src, 24, stride, 17);\
2016 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2017 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2018 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2019 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2021 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2022 uint8_t halfH[272];\
2023 uint8_t halfHV[256];\
2024 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2025 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2026 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2028 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2029 uint8_t halfH[272];\
2030 uint8_t halfHV[256];\
2031 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2032 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2033 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2035 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2036 uint8_t full[24*17];\
2037 uint8_t halfH[272];\
2038 uint8_t halfV[256];\
2039 uint8_t halfHV[256];\
2040 copy_block17(full, src, 24, stride, 17);\
2041 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2042 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2043 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2044 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2046 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2047 uint8_t full[24*17];\
2048 uint8_t halfH[272];\
2049 copy_block17(full, src, 24, stride, 17);\
2050 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2051 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2052 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2054 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2055 uint8_t full[24*17];\
2056 uint8_t halfH[272];\
2057 uint8_t halfV[256];\
2058 uint8_t halfHV[256];\
2059 copy_block17(full, src, 24, stride, 17);\
2060 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2061 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2062 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2063 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2065 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2066 uint8_t full[24*17];\
2067 uint8_t halfH[272];\
2068 copy_block17(full, src, 24, stride, 17);\
2069 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2070 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2071 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2073 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2074 uint8_t halfH[272];\
2075 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2076 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Rounding (op_put/op_avg) and no-rounding output macros consumed by the
 * QPEL_MC expansions below; they normalize with (v+16)>>5 (or +15) through
 * the `cm` crop-table pointer in scope at each expansion site.
 * Fix: #undef the plain op_avg/op_put first — they are still defined from
 * the H.264 chroma section above, and redefining an active macro with a
 * different body is a constraint violation. */
#undef op_avg
#undef op_put
2079 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2080 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2081 #define op_put(a, b) a = cm[((b) + 16)>>5]
2082 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2084 QPEL_MC(0, put_ , _ , op_put)
2085 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2086 QPEL_MC(0, avg_ , _ , op_avg)
2087 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* drop all four op macros symmetrically once the expansions are done */
#undef op_avg
2089 #undef op_avg_no_rnd
#undef op_put
2091 #undef op_put_no_rnd
2093 #define put_qpel8_mc00_c ff_put_pixels8x8_c
2094 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
2095 #define put_qpel16_mc00_c ff_put_pixels16x16_c
2096 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2097 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
2098 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/* H264_LOWPASS(OPNAME, OP, OP2)
 *
 * Generates the H.264 half-pel 6-tap interpolation primitives for block
 * widths 2 (av_unused), 4, 8 and — by tiling four 8x8 calls — 16:
 *   *_h_lowpass : horizontal filter, taps (1,-5,20,20,-5,1) as
 *                 (a+b)*20 - (c+d)*5 + (e+f), clipped via OP
 *   *_v_lowpass : same filter applied vertically
 *   *_hv_lowpass: horizontal pass into the int16_t 'tmp' plane (full
 *                 precision, no clipping), then vertical pass over tmp,
 *                 stored via OP2 (which divides by 1024 instead of 32)
 * OP/OP2 are the rounding/saturating store operators (op_put/op_avg and
 * op2_put/op2_avg below).
 * NOTE(review): loop headers, local declarations and closing braces were
 * elided from this extraction; only the filter bodies are visible. */
2101 #define H264_LOWPASS(OPNAME, OP, OP2) \
2102 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2104 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2108 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2109 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2115 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2117 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2121 const int srcB= src[-2*srcStride];\
2122 const int srcA= src[-1*srcStride];\
2123 const int src0= src[0 *srcStride];\
2124 const int src1= src[1 *srcStride];\
2125 const int src2= src[2 *srcStride];\
2126 const int src3= src[3 *srcStride];\
2127 const int src4= src[4 *srcStride];\
2128 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2129 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2135 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2138 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2140 src -= 2*srcStride;\
2141 for(i=0; i<h+5; i++)\
2143 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2144 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2148 tmp -= tmpStride*(h+5-2);\
2151 const int tmpB= tmp[-2*tmpStride];\
2152 const int tmpA= tmp[-1*tmpStride];\
2153 const int tmp0= tmp[0 *tmpStride];\
2154 const int tmp1= tmp[1 *tmpStride];\
2155 const int tmp2= tmp[2 *tmpStride];\
2156 const int tmp3= tmp[3 *tmpStride];\
2157 const int tmp4= tmp[4 *tmpStride];\
2158 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2159 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2164 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2166 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2170 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2171 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2172 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2173 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2179 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2181 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2185 const int srcB= src[-2*srcStride];\
2186 const int srcA= src[-1*srcStride];\
2187 const int src0= src[0 *srcStride];\
2188 const int src1= src[1 *srcStride];\
2189 const int src2= src[2 *srcStride];\
2190 const int src3= src[3 *srcStride];\
2191 const int src4= src[4 *srcStride];\
2192 const int src5= src[5 *srcStride];\
2193 const int src6= src[6 *srcStride];\
2194 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2195 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2196 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2197 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2203 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2206 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2208 src -= 2*srcStride;\
2209 for(i=0; i<h+5; i++)\
2211 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2212 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2213 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2214 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2218 tmp -= tmpStride*(h+5-2);\
2221 const int tmpB= tmp[-2*tmpStride];\
2222 const int tmpA= tmp[-1*tmpStride];\
2223 const int tmp0= tmp[0 *tmpStride];\
2224 const int tmp1= tmp[1 *tmpStride];\
2225 const int tmp2= tmp[2 *tmpStride];\
2226 const int tmp3= tmp[3 *tmpStride];\
2227 const int tmp4= tmp[4 *tmpStride];\
2228 const int tmp5= tmp[5 *tmpStride];\
2229 const int tmp6= tmp[6 *tmpStride];\
2230 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2231 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2232 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2233 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2239 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2241 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2245 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2246 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2247 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2248 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2249 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2250 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2251 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2252 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2258 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2260 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2264 const int srcB= src[-2*srcStride];\
2265 const int srcA= src[-1*srcStride];\
2266 const int src0= src[0 *srcStride];\
2267 const int src1= src[1 *srcStride];\
2268 const int src2= src[2 *srcStride];\
2269 const int src3= src[3 *srcStride];\
2270 const int src4= src[4 *srcStride];\
2271 const int src5= src[5 *srcStride];\
2272 const int src6= src[6 *srcStride];\
2273 const int src7= src[7 *srcStride];\
2274 const int src8= src[8 *srcStride];\
2275 const int src9= src[9 *srcStride];\
2276 const int src10=src[10*srcStride];\
2277 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2278 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2279 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2280 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2281 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2282 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2283 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2284 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2290 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2293 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2295 src -= 2*srcStride;\
2296 for(i=0; i<h+5; i++)\
2298 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2299 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2300 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2301 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2302 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2303 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2304 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2305 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2309 tmp -= tmpStride*(h+5-2);\
2312 const int tmpB= tmp[-2*tmpStride];\
2313 const int tmpA= tmp[-1*tmpStride];\
2314 const int tmp0= tmp[0 *tmpStride];\
2315 const int tmp1= tmp[1 *tmpStride];\
2316 const int tmp2= tmp[2 *tmpStride];\
2317 const int tmp3= tmp[3 *tmpStride];\
2318 const int tmp4= tmp[4 *tmpStride];\
2319 const int tmp5= tmp[5 *tmpStride];\
2320 const int tmp6= tmp[6 *tmpStride];\
2321 const int tmp7= tmp[7 *tmpStride];\
2322 const int tmp8= tmp[8 *tmpStride];\
2323 const int tmp9= tmp[9 *tmpStride];\
2324 const int tmp10=tmp[10*tmpStride];\
2325 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2326 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2327 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2328 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2329 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2330 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2331 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2332 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2338 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2339 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2340 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2341 src += 8*srcStride;\
2342 dst += 8*dstStride;\
2343 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2344 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2347 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2348 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2349 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2350 src += 8*srcStride;\
2351 dst += 8*dstStride;\
2352 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2353 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2356 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2357 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2358 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2359 src += 8*srcStride;\
2360 dst += 8*dstStride;\
2361 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2362 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC(OPNAME, SIZE)
 *
 * Generates the 16 H.264 quarter-pel motion-compensation entry points
 * OPNAME##h264_qpel##SIZE##_mcXY_c, where X,Y in {0..3} are the horizontal
 * and vertical quarter-pel phases. Half-pel positions call the 6-tap
 * *_lowpass helpers directly; quarter-pel positions are formed by averaging
 * two half-pel planes with pixels##SIZE##_l2. Vertical filters read 2 rows
 * above and 3 below, hence the SIZE*(SIZE+5) 'full' staging buffer copied
 * from src - 2*stride (full_mid points to the block origin inside it). */
2365 #define H264_MC(OPNAME, SIZE) \
2366 static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2367 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2370 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2371 uint8_t half[SIZE*SIZE];\
2372 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2373 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2376 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2377 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2380 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2381 uint8_t half[SIZE*SIZE];\
2382 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2383 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2386 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2387 uint8_t full[SIZE*(SIZE+5)];\
2388 uint8_t * const full_mid= full + SIZE*2;\
2389 uint8_t half[SIZE*SIZE];\
2390 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2391 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2392 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2395 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2396 uint8_t full[SIZE*(SIZE+5)];\
2397 uint8_t * const full_mid= full + SIZE*2;\
2398 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2399 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2402 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2403 uint8_t full[SIZE*(SIZE+5)];\
2404 uint8_t * const full_mid= full + SIZE*2;\
2405 uint8_t half[SIZE*SIZE];\
2406 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2407 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2408 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2411 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2412 uint8_t full[SIZE*(SIZE+5)];\
2413 uint8_t * const full_mid= full + SIZE*2;\
2414 uint8_t halfH[SIZE*SIZE];\
2415 uint8_t halfV[SIZE*SIZE];\
2416 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2417 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2418 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2419 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2422 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2423 uint8_t full[SIZE*(SIZE+5)];\
2424 uint8_t * const full_mid= full + SIZE*2;\
2425 uint8_t halfH[SIZE*SIZE];\
2426 uint8_t halfV[SIZE*SIZE];\
2427 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2428 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2429 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2430 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2433 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2434 uint8_t full[SIZE*(SIZE+5)];\
2435 uint8_t * const full_mid= full + SIZE*2;\
2436 uint8_t halfH[SIZE*SIZE];\
2437 uint8_t halfV[SIZE*SIZE];\
2438 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2439 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2440 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2441 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2444 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2445 uint8_t full[SIZE*(SIZE+5)];\
2446 uint8_t * const full_mid= full + SIZE*2;\
2447 uint8_t halfH[SIZE*SIZE];\
2448 uint8_t halfV[SIZE*SIZE];\
2449 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2450 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2451 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2452 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2455 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2456 int16_t tmp[SIZE*(SIZE+5)];\
2457 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2460 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2461 int16_t tmp[SIZE*(SIZE+5)];\
2462 uint8_t halfH[SIZE*SIZE];\
2463 uint8_t halfHV[SIZE*SIZE];\
2464 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2465 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2466 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2469 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2470 int16_t tmp[SIZE*(SIZE+5)];\
2471 uint8_t halfH[SIZE*SIZE];\
2472 uint8_t halfHV[SIZE*SIZE];\
2473 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2474 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2475 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2478 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2479 uint8_t full[SIZE*(SIZE+5)];\
2480 uint8_t * const full_mid= full + SIZE*2;\
2481 int16_t tmp[SIZE*(SIZE+5)];\
2482 uint8_t halfV[SIZE*SIZE];\
2483 uint8_t halfHV[SIZE*SIZE];\
2484 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2485 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2486 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2487 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2490 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2491 uint8_t full[SIZE*(SIZE+5)];\
2492 uint8_t * const full_mid= full + SIZE*2;\
2493 int16_t tmp[SIZE*(SIZE+5)];\
2494 uint8_t halfV[SIZE*SIZE];\
2495 uint8_t halfHV[SIZE*SIZE];\
2496 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2497 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2498 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2499 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store operators for H264_LOWPASS:
 * op_* divide the one-pass filter output by 32 with rounding via the clip
 * table; op2_* divide the two-pass (hv) output by 1024. The *_avg variants
 * average with the existing destination pixel (rounded up). */
2502 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2503 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2504 #define op_put(a, b) a = cm[((b) + 16)>>5]
2505 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2506 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put_ and avg_ families of H.264 lowpass helpers. */
2508 H264_LOWPASS(put_ , op_put, op2_put)
2509 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* The (0,0) position is a plain copy/average, so alias mc00 to the
 * block-copy helpers. */
2524 #define put_h264_qpel8_mc00_c ff_put_pixels8x8_c
2525 #define avg_h264_qpel8_mc00_c ff_avg_pixels8x8_c
2526 #define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
2527 #define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
// WMV2/MSPEL horizontal half-pel filter for an 8-wide block, h rows.
// 4-tap filter (-1,9,9,-1)/16 with rounding: (9*(a+b) - (c+d) + 8) >> 4,
// clipped to 0..255 through the crop table.
2529 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2530 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2534 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2535 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2536 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2537 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2538 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2539 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2540 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2541 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
// Public fixed-size block copy/average wrappers around the generic
// width/height helpers; exported so other files (and the mc00 aliases
// above) can use them without knowing the internal interface.
2547 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2548 put_pixels8_c(dst, src, stride, 8);
2550 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2551 avg_pixels8_c(dst, src, stride, 8);
2553 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2554 put_pixels16_c(dst, src, stride, 16);
2556 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2557 avg_pixels16_c(dst, src, stride, 16);
2560 #if CONFIG_RV40_DECODER
// RV40's (3,3) quarter-pel position is defined as the plain xy2 half-pel
// average rather than the bilinear RV40 filter, so map it to *_pixels_xy2.
2561 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2562 put_pixels16_xy2_c(dst, src, stride, 16);
2564 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2565 avg_pixels16_xy2_c(dst, src, stride, 16);
2567 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2568 put_pixels8_xy2_c(dst, src, stride, 8);
2570 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2571 avg_pixels8_xy2_c(dst, src, stride, 8);
2573 #endif /* CONFIG_RV40_DECODER */
// WMV2/MSPEL vertical half-pel filter, w columns by 8 rows.
// Same (-1,9,9,-1)/16 tap set as the horizontal variant, applied down a
// column; reads one row above and up to 9 rows below the block origin.
2575 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2576 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2580 const int src_1= src[ -srcStride];
2581 const int src0 = src[0 ];
2582 const int src1 = src[ srcStride];
2583 const int src2 = src[2*srcStride];
2584 const int src3 = src[3*srcStride];
2585 const int src4 = src[4*srcStride];
2586 const int src5 = src[5*srcStride];
2587 const int src6 = src[6*srcStride];
2588 const int src7 = src[7*srcStride];
2589 const int src8 = src[8*srcStride];
2590 const int src9 = src[9*srcStride];
2591 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2592 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2593 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2594 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2595 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2596 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2597 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2598 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
// WMV2/MSPEL motion-compensation entry points. mcXY names the half-pel
// phase: X=1/3 averages the horizontal filter output with src/src+1,
// X=2 is the pure horizontal filter, Y=2 the pure vertical filter, and
// the two-dimensional cases run H then V through an intermediate halfH
// buffer (11 rows to cover the vertical filter's one-above/two-below reach).
2604 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2606 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2607 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2610 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2611 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2614 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2616 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2617 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2620 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2621 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2624 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2628 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2629 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2630 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2631 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2633 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2637 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2638 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2639 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2640 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2642 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2644 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2645 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
// H.263 deblocking filter across a horizontal block edge (filters each
// column x over the 4 pixels p0..p3 straddling the edge). Compiled out
// to nothing unless an H.263 codec is configured.
2648 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2649 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2651 const int strength= ff_h263_loop_filter_strength[qscale];
2655 int p0= src[x-2*stride];
2656 int p1= src[x-1*stride];
2657 int p2= src[x+0*stride];
2658 int p3= src[x+1*stride];
// d measures the step across the edge; d1 is the correction, ramped up
// to |d|<strength and back down to zero at 2*strength (dead zone beyond).
2659 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2661 if (d<-2*strength) d1= 0;
2662 else if(d<- strength) d1=-2*strength - d;
2663 else if(d< strength) d1= d;
2664 else if(d< 2*strength) d1= 2*strength - d;
// saturate the corrected inner pixels to 0..255: if bit 8 is set the value
// is out of range; ~(p>>31) yields 0 for negative p and 255 for p>255.
2669 if(p1&256) p1= ~(p1>>31);
2670 if(p2&256) p2= ~(p2>>31);
2672 src[x-1*stride] = p1;
2673 src[x+0*stride] = p2;
// outer pixels get a smaller correction, clipped to +/-ad1.
2677 d2= av_clip((p0-p3)/4, -ad1, ad1);
2679 src[x-2*stride] = p0 - d2;
2680 src[x+ stride] = p3 + d2;
// H.263 deblocking filter across a vertical block edge: identical math to
// h263_v_loop_filter_c but iterating rows y and addressing the 4 pixels
// p0..p3 horizontally around the edge.
2685 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2686 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2688 const int strength= ff_h263_loop_filter_strength[qscale];
2692 int p0= src[y*stride-2];
2693 int p1= src[y*stride-1];
2694 int p2= src[y*stride+0];
2695 int p3= src[y*stride+1];
2696 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2698 if (d<-2*strength) d1= 0;
2699 else if(d<- strength) d1=-2*strength - d;
2700 else if(d< strength) d1= d;
2701 else if(d< 2*strength) d1= 2*strength - d;
// saturate out-of-range inner pixels to 0 or 255 (see vertical variant).
2706 if(p1&256) p1= ~(p1>>31);
2707 if(p2&256) p2= ~(p2>>31);
2709 src[y*stride-1] = p1;
2710 src[y*stride+0] = p2;
2714 d2= av_clip((p0-p3)/4, -ad1, ad1);
2716 src[y*stride-2] = p0 - d2;
2717 src[y*stride+1] = p3 + d2;
// H.261 in-loop filter on an 8x8 block: separable (1,2,1)/4 smoothing
// applied vertically into 'temp' (border rows passed through, scaled by 4
// to keep precision), then horizontally with final rounding (>>4 combines
// both passes' /4; border columns only undo the vertical scaling with >>2).
2722 static void h261_loop_filter_c(uint8_t *src, int stride){
2727 temp[x ] = 4*src[x ];
2728 temp[x + 7*8] = 4*src[x + 7*stride];
2732 xy = y * stride + x;
2734 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2739 src[ y*stride] = (temp[ y*8] + 2)>>2;
2740 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2742 xy = y * stride + x;
2744 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
// SAD (sum of absolute differences) of a 16-wide block, h rows.
// 'v' is an unused context pointer kept for the me_cmp_func signature.
2749 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2755 s += abs(pix1[0] - pix2[0]);
2756 s += abs(pix1[1] - pix2[1]);
2757 s += abs(pix1[2] - pix2[2]);
2758 s += abs(pix1[3] - pix2[3]);
2759 s += abs(pix1[4] - pix2[4]);
2760 s += abs(pix1[5] - pix2[5]);
2761 s += abs(pix1[6] - pix2[6]);
2762 s += abs(pix1[7] - pix2[7]);
2763 s += abs(pix1[8] - pix2[8]);
2764 s += abs(pix1[9] - pix2[9]);
2765 s += abs(pix1[10] - pix2[10]);
2766 s += abs(pix1[11] - pix2[11]);
2767 s += abs(pix1[12] - pix2[12]);
2768 s += abs(pix1[13] - pix2[13]);
2769 s += abs(pix1[14] - pix2[14]);
2770 s += abs(pix1[15] - pix2[15]);
// SAD of pix1 against pix2 interpolated half a pixel to the right
// (horizontal avg2 of adjacent reference pixels), 16-wide, h rows.
2777 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2783 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2784 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2785 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2786 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2787 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2788 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2789 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2790 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2791 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2792 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2793 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2794 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2795 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2796 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2797 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2798 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
// SAD of pix1 against pix2 interpolated half a pixel down (vertical avg2
// of each pixel with the one on the next line, pix3), 16-wide, h rows.
2805 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2808 uint8_t *pix3 = pix2 + line_size;
2812 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2813 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2814 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2815 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2816 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2817 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2818 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2819 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2820 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2821 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2822 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2823 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2824 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2825 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2826 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2827 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
// SAD of pix1 against pix2 interpolated half a pixel right AND down
// (avg4 of the 2x2 neighbourhood spanning pix2/pix3), 16-wide, h rows.
2835 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2838 uint8_t *pix3 = pix2 + line_size;
2842 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2843 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2844 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2845 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2846 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2847 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2848 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2849 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2850 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2851 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2852 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2853 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2854 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2855 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2856 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2857 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
// 8-wide SAD variants, mirroring the 16-wide family above:
// plain, half-pel right (x2), half-pel down (y2) and both (xy2).
2865 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2871 s += abs(pix1[0] - pix2[0]);
2872 s += abs(pix1[1] - pix2[1]);
2873 s += abs(pix1[2] - pix2[2]);
2874 s += abs(pix1[3] - pix2[3]);
2875 s += abs(pix1[4] - pix2[4]);
2876 s += abs(pix1[5] - pix2[5]);
2877 s += abs(pix1[6] - pix2[6]);
2878 s += abs(pix1[7] - pix2[7]);
// half-pel right: reference is the horizontal average of adjacent pixels.
2885 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2891 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2892 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2893 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2894 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2895 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2896 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2897 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2898 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
// half-pel down: reference is the vertical average with the next line.
2905 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2908 uint8_t *pix3 = pix2 + line_size;
2912 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2913 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2914 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2915 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2916 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2917 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2918 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2919 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
// half-pel right+down: reference is the 2x2 neighbourhood average.
2927 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2930 uint8_t *pix3 = pix2 + line_size;
2934 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2935 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2936 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2937 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2938 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2939 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2940 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2941 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
// "Noise-preserving" SSE comparison, 16-wide: score1 is the plain SSE,
// score2 accumulates the difference between the two blocks' local 2x2
// gradient structure (so matching noise/texture is penalized less).
// The gradient term is weighted by avctx->nsse_weight when a context is
// available, by 8 otherwise.
2949 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2950 MpegEncContext *c = v;
2956 for(x=0; x<16; x++){
2957 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2960 for(x=0; x<15; x++){
2961 score2+= FFABS( s1[x ] - s1[x +stride]
2962 - s1[x+1] + s1[x+1+stride])
2963 -FFABS( s2[x ] - s2[x +stride]
2964 - s2[x+1] + s2[x+1+stride]);
2971 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
2972 else return score1 + FFABS(score2)*8;
// 8-wide variant of the same metric.
2975 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2976 MpegEncContext *c = v;
2983 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2987 score2+= FFABS( s1[x ] - s1[x +stride]
2988 - s1[x+1] + s1[x+1+stride])
2989 -FFABS( s2[x ] - s2[x +stride]
2990 - s2[x+1] + s2[x+1+stride]);
2997 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
2998 else return score1 + FFABS(score2)*8;
// Evaluate the weighted squared error that would remain if 'basis' scaled
// by 'scale' were added to the residual 'rem' (fixed-point: basis values
// are shifted down from BASIS_SHIFT to RECON_SHIFT precision, rounded).
3001 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3005 for(i=0; i<8*8; i++){
3006 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3009 assert(-512<b && b<512);
3011 sum += (w*b)*(w*b)>>4;
// Commit the same scaled basis contribution into the residual.
3016 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3019 for(i=0; i<8*8; i++){
3020 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3025 * permutes an 8x8 block.
3026 * @param block the block which will be permuted according to the given permutation vector
3027 * @param permutation the permutation vector
3028 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3029 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3030 * (inverse) permutated to scantable order!
3032 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3038 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
// First pass: stash the coefficients that will move (only positions up to
// 'last' in scan order can be non-zero) ...
3040 for(i=0; i<=last; i++){
3041 const int j= scantable[i];
// ... second pass: write each stashed coefficient to its permuted slot.
3046 for(i=0; i<=last; i++){
3047 const int j= scantable[i];
3048 const int perm_j= permutation[j];
3049 block[perm_j]= temp[j];
// Trivial comparison function that always scores 0 (used for FF_CMP_ZERO).
3053 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
// Fill the 6-entry cmp[] table with the comparison functions selected by
// 'type' (FF_CMP_*), picking each size variant from the DSPContext.
// Only some of the selection branches are visible in this extraction.
3057 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3060 memset(cmp, 0, sizeof(void*)*6);
3068 cmp[i]= c->hadamard8_diff[i];
3074 cmp[i]= c->dct_sad[i];
3077 cmp[i]= c->dct264_sad[i];
3080 cmp[i]= c->dct_max[i];
3083 cmp[i]= c->quant_psnr[i];
// Unknown 'type' values are reported rather than silently ignored.
3112 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
// Zero a single 64-coefficient DCT block.
3117 static void clear_block_c(DCTELEM *block)
3119 memset(block, 0, sizeof(DCTELEM)*64);
3123 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
// Zero all six DCT blocks of a macroblock at once.
3125 static void clear_blocks_c(DCTELEM *blocks)
3127 memset(blocks, 0, sizeof(DCTELEM)*6*64);
// Byte-wise dst[i] += src[i], done one machine word at a time (SWAR):
// the pb_7f/pb_80 masks add the low 7 bits of every byte lane and then
// patch in the carry-free top bit, so lanes never carry into each other.
// The trailing scalar loop handles the last w % sizeof(long) bytes.
3130 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3132 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3133 long a = *(long*)(src+i);
3134 long b = *(long*)(dst+i);
3135 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3138 dst[i+0] += src[i+0];
// Same SWAR trick for dst[i] = src1[i] + src2[i].
3141 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3143 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3144 long a = *(long*)(src1+i);
3145 long b = *(long*)(src2+i);
3146 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3149 dst[i] = src1[i]+src2[i];
// Byte-wise dst[i] = src1[i] - src2[i] using the SWAR borrow-free
// subtraction (pb_80 biases each lane so borrows stay within the lane).
// On targets without fast unaligned loads, a misaligned src2 falls back
// to an unrolled scalar loop instead of word accesses.
3152 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3154 #if !HAVE_FAST_UNALIGNED
3155 if((long)src2 & (sizeof(long)-1)){
3156 for(i=0; i+7<w; i+=8){
3157 dst[i+0] = src1[i+0]-src2[i+0];
3158 dst[i+1] = src1[i+1]-src2[i+1];
3159 dst[i+2] = src1[i+2]-src2[i+2];
3160 dst[i+3] = src1[i+3]-src2[i+3];
3161 dst[i+4] = src1[i+4]-src2[i+4];
3162 dst[i+5] = src1[i+5]-src2[i+5];
3163 dst[i+6] = src1[i+6]-src2[i+6];
3164 dst[i+7] = src1[i+7]-src2[i+7];
3168 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3169 long a = *(long*)(src1+i);
3170 long b = *(long*)(src2+i);
3171 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3174 dst[i+0] = src1[i+0]-src2[i+0];
/** HuffYUV median prediction, decode side: dst[i] = median-pred + diff[i].
 *  *left / *left_top carry the running left and top-left samples between
 *  calls and are updated on return. */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}
/** HuffYUV median prediction, encode side: dst[i] = src2[i] - median-pred.
 *  *left / *left_top carry state between calls and are updated on return. */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
/** Left (previous-sample) prediction: running sum of src into dst,
 *  starting from 'acc'. Returns the final accumulator so the caller can
 *  continue on the next slice. Main loop is unrolled by two. */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    /* odd tail sample, if any */
    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}
3242 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* 2-point butterflies used by the Hadamard transforms below. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
    {\
        int a,b;\
        a= x;\
        b= y;\
        x= a+b;\
        y= a-b;\
    }

/* butterfly whose two outputs are immediately folded into a SAD term */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/** SATD of an 8x8 block: 8x8 Hadamard transform of (src - dst), then sum
 *  of absolute transformed coefficients. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass on the difference signal */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; last butterfly stage folds directly into the SAD */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
/** Intra SATD of an 8x8 block: Hadamard transform of the pixels
 *  themselves, with the DC term subtracted afterwards (mean removal). */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3387 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3388 MpegEncContext * const s= (MpegEncContext *)c;
3389 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3393 s->dsp.diff_pixels(temp, src1, src2, stride);
3395 return s->dsp.sum_abs_dctelem(temp);
/* 1D 8-point H.264-style integer DCT; SRC/DST are (re)defined by the
   caller to select row or column access and the output action. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
3426 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3427 MpegEncContext * const s= (MpegEncContext *)c;
3432 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3434 #define SRC(x) dct[i][x]
3435 #define DST(x,v) dct[i][x]= v
3436 for( i = 0; i < 8; i++ )
3441 #define SRC(x) dct[x][i]
3442 #define DST(x,v) sum += FFABS(v)
3443 for( i = 0; i < 8; i++ )
3451 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3452 MpegEncContext * const s= (MpegEncContext *)c;
3453 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3458 s->dsp.diff_pixels(temp, src1, src2, stride);
3462 sum= FFMAX(sum, FFABS(temp[i]));
3467 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3468 MpegEncContext * const s= (MpegEncContext *)c;
3469 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3470 DCTELEM * const bak = temp+64;
3476 s->dsp.diff_pixels(temp, src1, src2, stride);
3478 memcpy(bak, temp, 64*sizeof(DCTELEM));
3480 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3481 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3482 ff_simple_idct(temp); //FIXME
3485 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3490 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3491 MpegEncContext * const s= (MpegEncContext *)c;
3492 const uint8_t *scantable= s->intra_scantable.permutated;
3493 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3494 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3495 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3496 int i, last, run, bits, level, distortion, start_i;
3497 const int esc_length= s->ac_esc_length;
3499 uint8_t * last_length;
3503 copy_block8(lsrc1, src1, 8, stride, 8);
3504 copy_block8(lsrc2, src2, 8, stride, 8);
3506 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3508 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3514 length = s->intra_ac_vlc_length;
3515 last_length= s->intra_ac_vlc_last_length;
3516 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3519 length = s->inter_ac_vlc_length;
3520 last_length= s->inter_ac_vlc_last_length;
3525 for(i=start_i; i<last; i++){
3526 int j= scantable[i];
3531 if((level&(~127)) == 0){
3532 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3541 level= temp[i] + 64;
3545 if((level&(~127)) == 0){
3546 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3554 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3556 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3559 s->dsp.idct_add(lsrc2, 8, temp);
3561 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3563 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3566 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3567 MpegEncContext * const s= (MpegEncContext *)c;
3568 const uint8_t *scantable= s->intra_scantable.permutated;
3569 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3570 int i, last, run, bits, level, start_i;
3571 const int esc_length= s->ac_esc_length;
3573 uint8_t * last_length;
3577 s->dsp.diff_pixels(temp, src1, src2, stride);
3579 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3585 length = s->intra_ac_vlc_length;
3586 last_length= s->intra_ac_vlc_last_length;
3587 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3590 length = s->inter_ac_vlc_length;
3591 last_length= s->inter_ac_vlc_last_length;
3596 for(i=start_i; i<last; i++){
3597 int j= scantable[i];
3602 if((level&(~127)) == 0){
3603 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3612 level= temp[i] + 64;
3616 if((level&(~127)) == 0){
3617 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical SAD within one plane: sum of |row(y) - row(y-1)| over rows
   1..h-1, 'size' pixels wide (inner loop unrolled by 4). */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
/** Vertical SAD of the difference signal between two 16-wide planes. */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
#define SQ(a) ((a)*(a))

/* Vertical SSE within one plane: squared row-to-row differences,
   'size' pixels wide (inner loop unrolled by 4). */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

/** Vertical SSE of the difference signal between two 16-wide planes. */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/** Sum of squared differences between an int8 and an int16 vector. */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int score=0;
    int i;

    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);

    return score;
}
/* Instantiate 16x16 variants of the 8x8 compare functions.
   NOTE(review): the WRAPPER8_16_SQ macro definition is not visible in
   this copy — presumably it sums the 8x8 function over the four
   quadrants of a 16x16 block; confirm against the upstream source. */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/** Element-wise product: dst[i] = src0[i] * src1[i]. */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i];
}
/** dst[i] = src0[i] * src1[len-1-i] — multiply by src1 read backwards. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}
/** Fused multiply-add: dst[i] = src0[i] * src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}
/** Overlap-add windowing as used by MDCT-based codecs:
 *  for i in [-len,0): dst[i] = s0*win[j] - s1*win[i]
 *                     dst[j] = s0*win[i] + s1*win[j]   (j = -1-i mirrored)
 *  dst/win/src0 are advanced by len so the loop indexes symmetrically
 *  around the centre. */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
    }
}
/** Scale a vector by a scalar: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}
/** dst[i] = src[i] * sv-vector * mul, consuming one 2-element sub-vector
 *  from sv per pair of outputs. */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}
/** dst[i] = src[i] * sv-vector * mul, consuming one 4-element sub-vector
 *  from sv per group of four outputs. */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}
/** dst = concatenated 2-element sub-vectors from sv, scaled by mul. */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}
/** dst = concatenated 4-element sub-vectors from sv, scaled by mul. */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}
/** In-place butterflies: v1[i] += v2[i]; v2[i] = old v1[i] - old v2[i]. */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}
/** Dot product of two float vectors. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}
/* Clip one float (viewed as its IEEE-754 bit pattern) to [min,max] where
   min < 0 < max: negatives compare above 'mini' as unsigned ints,
   positives are checked with the sign bit flipped.
   Fixed: 1<<31 is undefined behavior on 32-bit int; use 1U<<31. */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}

/** Clip a float vector to [*min, *max] for the min<0<max case, operating
 *  on the raw bit patterns; len must be a multiple of 8. */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/** Clip a float vector to [min, max]; len must be a multiple of 8.
 *  Dispatches to the bit-pattern fast path when the range straddles 0. */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
/** Dot product of two int16 vectors, each product right-shifted by
 *  'shift' before accumulation. */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}
/** Dot product of v1 and v2, while simultaneously updating
 *  v1[i] += mul * v3[i] (the product uses the pre-update v1). */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;

    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}
/* Fixed-point IDCT constants: round(2048 * sqrt(2) * cos(k*pi/16)).
   NOTE(review): the code below also uses W0 (expected to be 2048); its
   #define is not visible in this copy of the file — confirm it exists. */
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
3896 static void wmv2_idct_row(short * b)
3899 int a0,a1,a2,a3,a4,a5,a6,a7;
3901 a1 = W1*b[1]+W7*b[7];
3902 a7 = W7*b[1]-W1*b[7];
3903 a5 = W5*b[5]+W3*b[3];
3904 a3 = W3*b[5]-W5*b[3];
3905 a2 = W2*b[2]+W6*b[6];
3906 a6 = W6*b[2]-W2*b[6];
3907 a0 = W0*b[0]+W0*b[4];
3908 a4 = W0*b[0]-W0*b[4];
3910 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3911 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3913 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3914 b[1] = (a4+a6 +s1 + (1<<7))>>8;
3915 b[2] = (a4-a6 +s2 + (1<<7))>>8;
3916 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3917 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3918 b[5] = (a4-a6 -s2 + (1<<7))>>8;
3919 b[6] = (a4+a6 -s1 + (1<<7))>>8;
3920 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
3922 static void wmv2_idct_col(short * b)
3925 int a0,a1,a2,a3,a4,a5,a6,a7;
3926 /*step 1, with extended precision*/
3927 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3928 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3929 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3930 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3931 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3932 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3933 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
3934 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
3936 s1 = (181*(a1-a5+a7-a3)+128)>>8;
3937 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3939 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3940 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
3941 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
3942 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3944 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3945 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
3946 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
3947 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/** Full 8x8 WMV2 IDCT: all rows, then all columns, in place. */
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
3959 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3961 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
3963 ff_wmv2_idct_c(block);
3964 put_pixels_clamped_c(block, dest, line_size);
3966 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
3968 ff_wmv2_idct_c(block);
3969 add_pixels_clamped_c(block, dest, line_size);
3971 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3974 put_pixels_clamped_c(block, dest, line_size);
3976 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3979 add_pixels_clamped_c(block, dest, line_size);
3982 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3985 put_pixels_clamped4_c(block, dest, line_size);
3987 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3990 add_pixels_clamped4_c(block, dest, line_size);
3993 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3996 put_pixels_clamped2_c(block, dest, line_size);
3998 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4001 add_pixels_clamped2_c(block, dest, line_size);
4004 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4006 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4008 dest[0] = cm[(block[0] + 4)>>3];
4010 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4012 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4014 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4017 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4019 /* init static data */
4020 av_cold void dsputil_static_init(void)
4024 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4025 for(i=0;i<MAX_NEG_CROP;i++) {
4027 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4030 for(i=0;i<512;i++) {
4031 ff_squareTbl[i] = (i - 256) * (i - 256);
4034 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4037 int ff_check_alignment(void){
4038 static int did_fail=0;
4039 DECLARE_ALIGNED(16, int, aligned);
4041 if((intptr_t)&aligned & 15){
4043 #if HAVE_MMX || HAVE_ALTIVEC
4044 av_log(NULL, AV_LOG_ERROR,
4045 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4046 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4047 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4048 "Do not report crashes to FFmpeg developers.\n");
4057 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4061 ff_check_alignment();
4064 if(avctx->dct_algo==FF_DCT_FASTINT) {
4065 c->fdct = fdct_ifast;
4066 c->fdct248 = fdct_ifast248;
4068 else if(avctx->dct_algo==FF_DCT_FAAN) {
4069 c->fdct = ff_faandct;
4070 c->fdct248 = ff_faandct248;
4073 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4074 c->fdct248 = ff_fdct248_islow;
4076 #endif //CONFIG_ENCODERS
4078 if(avctx->lowres==1){
4079 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4080 c->idct_put= ff_jref_idct4_put;
4081 c->idct_add= ff_jref_idct4_add;
4083 c->idct_put= ff_h264_lowres_idct_put_c;
4084 c->idct_add= ff_h264_lowres_idct_add_c;
4086 c->idct = j_rev_dct4;
4087 c->idct_permutation_type= FF_NO_IDCT_PERM;
4088 }else if(avctx->lowres==2){
4089 c->idct_put= ff_jref_idct2_put;
4090 c->idct_add= ff_jref_idct2_add;
4091 c->idct = j_rev_dct2;
4092 c->idct_permutation_type= FF_NO_IDCT_PERM;
4093 }else if(avctx->lowres==3){
4094 c->idct_put= ff_jref_idct1_put;
4095 c->idct_add= ff_jref_idct1_add;
4096 c->idct = j_rev_dct1;
4097 c->idct_permutation_type= FF_NO_IDCT_PERM;
4099 if(avctx->idct_algo==FF_IDCT_INT){
4100 c->idct_put= ff_jref_idct_put;
4101 c->idct_add= ff_jref_idct_add;
4102 c->idct = j_rev_dct;
4103 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4104 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4105 avctx->idct_algo==FF_IDCT_VP3){
4106 c->idct_put= ff_vp3_idct_put_c;
4107 c->idct_add= ff_vp3_idct_add_c;
4108 c->idct = ff_vp3_idct_c;
4109 c->idct_permutation_type= FF_NO_IDCT_PERM;
4110 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4111 c->idct_put= ff_wmv2_idct_put_c;
4112 c->idct_add= ff_wmv2_idct_add_c;
4113 c->idct = ff_wmv2_idct_c;
4114 c->idct_permutation_type= FF_NO_IDCT_PERM;
4115 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4116 c->idct_put= ff_faanidct_put;
4117 c->idct_add= ff_faanidct_add;
4118 c->idct = ff_faanidct;
4119 c->idct_permutation_type= FF_NO_IDCT_PERM;
4120 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4121 c->idct_put= ff_ea_idct_put_c;
4122 c->idct_permutation_type= FF_NO_IDCT_PERM;
4123 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4124 c->idct = ff_bink_idct_c;
4125 c->idct_add = ff_bink_idct_add_c;
4126 c->idct_put = ff_bink_idct_put_c;
4127 c->idct_permutation_type = FF_NO_IDCT_PERM;
4128 }else{ //accurate/default
4129 c->idct_put= ff_simple_idct_put;
4130 c->idct_add= ff_simple_idct_add;
4131 c->idct = ff_simple_idct;
4132 c->idct_permutation_type= FF_NO_IDCT_PERM;
4136 c->get_pixels = get_pixels_c;
4137 c->diff_pixels = diff_pixels_c;
4138 c->put_pixels_clamped = put_pixels_clamped_c;
4139 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4140 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4141 c->add_pixels_clamped = add_pixels_clamped_c;
4142 c->add_pixels8 = add_pixels8_c;
4143 c->add_pixels4 = add_pixels4_c;
4144 c->sum_abs_dctelem = sum_abs_dctelem_c;
4145 c->emulated_edge_mc = ff_emulated_edge_mc;
4148 c->clear_block = clear_block_c;
4149 c->clear_blocks = clear_blocks_c;
4150 c->pix_sum = pix_sum_c;
4151 c->pix_norm1 = pix_norm1_c;
4153 c->fill_block_tab[0] = fill_block16_c;
4154 c->fill_block_tab[1] = fill_block8_c;
4155 c->scale_block = scale_block_c;
4157 /* TODO [0] 16 [1] 8 */
4158 c->pix_abs[0][0] = pix_abs16_c;
4159 c->pix_abs[0][1] = pix_abs16_x2_c;
4160 c->pix_abs[0][2] = pix_abs16_y2_c;
4161 c->pix_abs[0][3] = pix_abs16_xy2_c;
4162 c->pix_abs[1][0] = pix_abs8_c;
4163 c->pix_abs[1][1] = pix_abs8_x2_c;
4164 c->pix_abs[1][2] = pix_abs8_y2_c;
4165 c->pix_abs[1][3] = pix_abs8_xy2_c;
4167 #define dspfunc(PFX, IDX, NUM) \
4168 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4169 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4170 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4171 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4173 dspfunc(put, 0, 16);
4174 dspfunc(put_no_rnd, 0, 16);
4176 dspfunc(put_no_rnd, 1, 8);
4180 dspfunc(avg, 0, 16);
4181 dspfunc(avg_no_rnd, 0, 16);
4183 dspfunc(avg_no_rnd, 1, 8);
4188 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4189 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4191 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4192 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4193 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4194 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4195 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4196 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4197 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4198 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4199 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4201 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4202 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4203 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4204 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4205 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4206 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4207 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4208 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4209 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4211 #define dspfunc(PFX, IDX, NUM) \
4212 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4213 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4214 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4215 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4216 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4217 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4218 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4219 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4220 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4221 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4222 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4223 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4224 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4225 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4226 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4227 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4229 dspfunc(put_qpel, 0, 16);
4230 dspfunc(put_no_rnd_qpel, 0, 16);
4232 dspfunc(avg_qpel, 0, 16);
4233 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4235 dspfunc(put_qpel, 1, 8);
4236 dspfunc(put_no_rnd_qpel, 1, 8);
4238 dspfunc(avg_qpel, 1, 8);
4239 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4241 dspfunc(put_h264_qpel, 0, 16);
4242 dspfunc(put_h264_qpel, 1, 8);
4243 dspfunc(put_h264_qpel, 2, 4);
4244 dspfunc(put_h264_qpel, 3, 2);
4245 dspfunc(avg_h264_qpel, 0, 16);
4246 dspfunc(avg_h264_qpel, 1, 8);
4247 dspfunc(avg_h264_qpel, 2, 4);
4250 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4251 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4252 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4253 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4254 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4255 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4257 c->draw_edges = draw_edges_c;
4259 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4260 ff_mlp_init(c, avctx);
4262 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4263 ff_intrax8dsp_init(c,avctx);
4265 #if CONFIG_RV30_DECODER
4266 ff_rv30dsp_init(c,avctx);
4268 #if CONFIG_RV40_DECODER
4269 ff_rv40dsp_init(c,avctx);
4270 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4271 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4272 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4273 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4276 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
4277 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4278 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4279 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4280 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4281 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4282 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4283 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4285 #define SET_CMP_FUNC(name) \
4286 c->name[0]= name ## 16_c;\
4287 c->name[1]= name ## 8x8_c;
4289 SET_CMP_FUNC(hadamard8_diff)
4290 c->hadamard8_diff[4]= hadamard8_intra16_c;
4291 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4292 SET_CMP_FUNC(dct_sad)
4293 SET_CMP_FUNC(dct_max)
4295 SET_CMP_FUNC(dct264_sad)
4297 c->sad[0]= pix_abs16_c;
4298 c->sad[1]= pix_abs8_c;
4302 SET_CMP_FUNC(quant_psnr)
4305 c->vsad[0]= vsad16_c;
4306 c->vsad[4]= vsad_intra16_c;
4307 c->vsad[5]= vsad_intra8_c;
4308 c->vsse[0]= vsse16_c;
4309 c->vsse[4]= vsse_intra16_c;
4310 c->vsse[5]= vsse_intra8_c;
4311 c->nsse[0]= nsse16_c;
4312 c->nsse[1]= nsse8_c;
4314 ff_dsputil_init_dwt(c);
4317 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4319 c->add_bytes= add_bytes_c;
4320 c->add_bytes_l2= add_bytes_l2_c;
4321 c->diff_bytes= diff_bytes_c;
4322 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4323 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4324 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
4325 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4326 c->bswap_buf= bswap_buf;
4327 #if CONFIG_PNG_DECODER
4328 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4331 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4332 c->h263_h_loop_filter= h263_h_loop_filter_c;
4333 c->h263_v_loop_filter= h263_v_loop_filter_c;
4336 if (CONFIG_VP3_DECODER) {
4337 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4338 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4339 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4342 c->h261_loop_filter= h261_loop_filter_c;
4344 c->try_8x8basis= try_8x8basis_c;
4345 c->add_8x8basis= add_8x8basis_c;
4347 #if CONFIG_VORBIS_DECODER
4348 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4350 #if CONFIG_AC3_DECODER
4351 c->ac3_downmix = ff_ac3_downmix_c;
4353 c->vector_fmul = vector_fmul_c;
4354 c->vector_fmul_reverse = vector_fmul_reverse_c;
4355 c->vector_fmul_add = vector_fmul_add_c;
4356 c->vector_fmul_window = vector_fmul_window_c;
4357 c->vector_clipf = vector_clipf_c;
4358 c->scalarproduct_int16 = scalarproduct_int16_c;
4359 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4360 c->scalarproduct_float = scalarproduct_float_c;
4361 c->butterflies_float = butterflies_float_c;
4362 c->vector_fmul_scalar = vector_fmul_scalar_c;
4364 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4365 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4367 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4368 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4370 c->shrink[0]= av_image_copy_plane;
4371 c->shrink[1]= ff_shrink22;
4372 c->shrink[2]= ff_shrink44;
4373 c->shrink[3]= ff_shrink88;
4375 c->prefetch= just_return;
4377 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4378 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4380 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
4381 if (ARCH_ARM) dsputil_init_arm (c, avctx);
4382 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
4383 if (HAVE_VIS) dsputil_init_vis (c, avctx);
4384 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
4385 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
4386 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
4387 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
4388 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
4390 for(i=0; i<64; i++){
4391 if(!c->put_2tap_qpel_pixels_tab[0][i])
4392 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4393 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4394 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4397 c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4398 c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4399 c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4400 c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4402 c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4403 c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4404 c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4405 c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4407 switch(c->idct_permutation_type){
4408 case FF_NO_IDCT_PERM:
4410 c->idct_permutation[i]= i;
4412 case FF_LIBMPEG2_IDCT_PERM:
4414 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4416 case FF_SIMPLE_IDCT_PERM:
4418 c->idct_permutation[i]= simple_mmx_permutation[i];
4420 case FF_TRANSPOSE_IDCT_PERM:
4422 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4424 case FF_PARTTRANS_IDCT_PERM:
4426 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4428 case FF_SSE2_IDCT_PERM:
4430 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4433 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");