3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44 uint32_t ff_squareTbl[512] = {0, };
46 // 0x7f replicated into every byte of the CPU's native word: 0x7f7f7f7f on 32-bit, 0x7f7f7f7f7f7f7f7f on 64-bit targets
47 #define pb_7f (~0UL/255 * 0x7f)
48 #define pb_80 (~0UL/255 * 0x80)
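// Illustration (a sketch, not part of the build): ~0UL/255 evaluates to the byte-
// replication constant 0x0101...01 sized to unsigned long, so multiplying it by a
// byte value fills every byte lane with that value, which is what pb_7f and pb_80
// rely on.
#if 0 /* standalone demo, compile separately */
#include <stdio.h>
int main(void)
{
    printf("%lx\n", ~0UL / 255 * 0x7f); /* one 7f per byte of unsigned long */
    printf("%lx\n", ~0UL / 255 * 0x80); /* one 80 per byte of unsigned long */
    return 0;
}
#endif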
50 const uint8_t ff_zigzag_direct[64] = {
51 0, 1, 8, 16, 9, 2, 3, 10,
52 17, 24, 32, 25, 18, 11, 4, 5,
53 12, 19, 26, 33, 40, 48, 41, 34,
54 27, 20, 13, 6, 7, 14, 21, 28,
55 35, 42, 49, 56, 57, 50, 43, 36,
56 29, 22, 15, 23, 30, 37, 44, 51,
57 58, 59, 52, 45, 38, 31, 39, 46,
58 53, 60, 61, 54, 47, 55, 62, 63
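// Illustration (a sketch, not part of the build): each entry of ff_zigzag_direct is
// the raster position of the i-th coefficient in zigzag scan order, so row = entry >> 3
// and column = entry & 7; e.g. scan index 2 maps to position 8, i.e. row 1, column 0.
#if 0 /* standalone demo, compile separately */
#include <stdio.h>
int main(void)
{
    static const unsigned char zz[8] = { 0, 1, 8, 16, 9, 2, 3, 10 }; /* first 8 entries above */
    int i;
    for (i = 0; i < 8; i++)
        printf("scan %d -> raster %2d (row %d, col %d)\n", i, zz[i], zz[i] >> 3, zz[i] & 7);
    return 0;
}
#endif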
61 /* Specific zigzag scan for 248 idct. NOTE that unlike the
62 specification, we interleave the fields */
63 const uint8_t ff_zigzag248_direct[64] = {
64 0, 8, 1, 9, 16, 24, 2, 10,
65 17, 25, 32, 40, 48, 56, 33, 41,
66 18, 26, 3, 11, 4, 12, 19, 27,
67 34, 42, 49, 57, 50, 58, 35, 43,
68 20, 28, 5, 13, 6, 14, 21, 29,
69 36, 44, 51, 59, 52, 60, 37, 45,
70 22, 30, 7, 15, 23, 31, 38, 46,
71 53, 61, 54, 62, 39, 47, 55, 63,
74 /* not permuted inverse zigzag_direct + 1 for the MMX quantizer */
75 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
77 const uint8_t ff_alternate_horizontal_scan[64] = {
78 0, 1, 2, 3, 8, 9, 16, 17,
79 10, 11, 4, 5, 6, 7, 15, 14,
80 13, 12, 19, 18, 24, 25, 32, 33,
81 26, 27, 20, 21, 22, 23, 28, 29,
82 30, 31, 34, 35, 40, 41, 48, 49,
83 42, 43, 36, 37, 38, 39, 44, 45,
84 46, 47, 50, 51, 56, 57, 58, 59,
85 52, 53, 54, 55, 60, 61, 62, 63,
88 const uint8_t ff_alternate_vertical_scan[64] = {
89 0, 8, 16, 24, 1, 9, 2, 10,
90 17, 25, 32, 40, 48, 56, 57, 49,
91 41, 33, 26, 18, 3, 11, 4, 12,
92 19, 27, 34, 42, 50, 58, 35, 43,
93 51, 59, 20, 28, 5, 13, 6, 14,
94 21, 29, 36, 44, 52, 60, 37, 45,
95 53, 61, 22, 30, 7, 15, 23, 31,
96 38, 46, 54, 62, 39, 47, 55, 63,
99 /* Input permutation for the simple_idct_mmx */
100 static const uint8_t simple_mmx_permutation[64]={
101 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
102 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
103 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
104 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
105 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
106 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
107 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
108 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
111 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
113 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
117 st->scantable= src_scantable;
121 j = src_scantable[i];
122 st->permutated[i] = permutation[j];
131 j = st->permutated[i];
133 st->raster_end[i]= end;
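// Usage sketch (an assumption, not taken from this file): decoders typically build
// their scan tables once after the DSP context is initialized, passing the IDCT's
// input permutation together with one of the scan orders above, e.g. in an
// MpegEncContext-style setup:
#if 0
    ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct);
    ff_init_scantable(s->dsp.idct_permutation, &s->inter_scantable, ff_zigzag_direct);
#endif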
137 static int pix_sum_c(uint8_t * pix, int line_size)
142 for (i = 0; i < 16; i++) {
143 for (j = 0; j < 16; j += 8) {
154 pix += line_size - 16;
159 static int pix_norm1_c(uint8_t * pix, int line_size)
162 uint32_t *sq = ff_squareTbl + 256;
165 for (i = 0; i < 16; i++) {
166 for (j = 0; j < 16; j += 8) {
177 #if LONG_MAX > 2147483647
178 register uint64_t x=*(uint64_t*)pix;
180 s += sq[(x>>8)&0xff];
181 s += sq[(x>>16)&0xff];
182 s += sq[(x>>24)&0xff];
183 s += sq[(x>>32)&0xff];
184 s += sq[(x>>40)&0xff];
185 s += sq[(x>>48)&0xff];
186 s += sq[(x>>56)&0xff];
188 register uint32_t x=*(uint32_t*)pix;
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 x=*(uint32_t*)(pix+4);
195 s += sq[(x>>8)&0xff];
196 s += sq[(x>>16)&0xff];
197 s += sq[(x>>24)&0xff];
202 pix += line_size - 16;
207 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
210 for(i=0; i+8<=w; i+=8){
211 dst[i+0]= av_bswap32(src[i+0]);
212 dst[i+1]= av_bswap32(src[i+1]);
213 dst[i+2]= av_bswap32(src[i+2]);
214 dst[i+3]= av_bswap32(src[i+3]);
215 dst[i+4]= av_bswap32(src[i+4]);
216 dst[i+5]= av_bswap32(src[i+5]);
217 dst[i+6]= av_bswap32(src[i+6]);
218 dst[i+7]= av_bswap32(src[i+7]);
221 dst[i+0]= av_bswap32(src[i+0]);
225 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
228 *dst++ = av_bswap16(*src++);
231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
234 uint32_t *sq = ff_squareTbl + 256;
237 for (i = 0; i < h; i++) {
238 s += sq[pix1[0] - pix2[0]];
239 s += sq[pix1[1] - pix2[1]];
240 s += sq[pix1[2] - pix2[2]];
241 s += sq[pix1[3] - pix2[3]];
248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
251 uint32_t *sq = ff_squareTbl + 256;
254 for (i = 0; i < h; i++) {
255 s += sq[pix1[0] - pix2[0]];
256 s += sq[pix1[1] - pix2[1]];
257 s += sq[pix1[2] - pix2[2]];
258 s += sq[pix1[3] - pix2[3]];
259 s += sq[pix1[4] - pix2[4]];
260 s += sq[pix1[5] - pix2[5]];
261 s += sq[pix1[6] - pix2[6]];
262 s += sq[pix1[7] - pix2[7]];
269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
272 uint32_t *sq = ff_squareTbl + 256;
275 for (i = 0; i < h; i++) {
276 s += sq[pix1[ 0] - pix2[ 0]];
277 s += sq[pix1[ 1] - pix2[ 1]];
278 s += sq[pix1[ 2] - pix2[ 2]];
279 s += sq[pix1[ 3] - pix2[ 3]];
280 s += sq[pix1[ 4] - pix2[ 4]];
281 s += sq[pix1[ 5] - pix2[ 5]];
282 s += sq[pix1[ 6] - pix2[ 6]];
283 s += sq[pix1[ 7] - pix2[ 7]];
284 s += sq[pix1[ 8] - pix2[ 8]];
285 s += sq[pix1[ 9] - pix2[ 9]];
286 s += sq[pix1[10] - pix2[10]];
287 s += sq[pix1[11] - pix2[11]];
288 s += sq[pix1[12] - pix2[12]];
289 s += sq[pix1[13] - pix2[13]];
290 s += sq[pix1[14] - pix2[14]];
291 s += sq[pix1[15] - pix2[15]];
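// Note on the "+ 256" offset used by the sse*_c functions above: sq[pix1[k] - pix2[k]]
// must be valid for any signed byte difference in [-255, 255], so sq points to the
// middle of ff_squareTbl. A sketch of the matching initialization (the real loop lives
// in the elided static init code; this form is an assumption consistent with the uses
// above):
#if 0
    for (i = 0; i < 512; i++)
        ff_squareTbl[i] = (i - 256) * (i - 256);
#endif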
299 /* draw the edges of width 'w' of an image of size width, height */
300 //FIXME check that this is ok for mpeg4 interlaced
301 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w, int sides)
303 uint8_t *ptr, *last_line;
306 last_line = buf + (height - 1) * wrap;
309 if (sides&EDGE_TOP) memcpy(buf - (i + 1) * wrap, buf, width);
310 if (sides&EDGE_BOTTOM) memcpy(last_line + (i + 1) * wrap, last_line, width);
314 for(i=0;i<height;i++) {
315 memset(ptr - w, ptr[0], w);
316 memset(ptr + width, ptr[width-1], w);
321 if (sides&EDGE_TOP) {
322 memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
323 memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
326 if (sides&EDGE_BOTTOM) {
327 memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
328 memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
334 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
335 * @param buf destination buffer
336 * @param src source buffer
337 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
338 * @param block_w width of block
339 * @param block_h height of block
340 * @param src_x x coordinate of the top left sample of the block in the source buffer
341 * @param src_y y coordinate of the top left sample of the block in the source buffer
342 * @param w width of the source buffer
343 * @param h height of the source buffer
345 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
346 int src_x, int src_y, int w, int h){
348 int start_y, start_x, end_y, end_x;
351 src+= (h-1-src_y)*linesize;
353 }else if(src_y<=-block_h){
354 src+= (1-block_h-src_y)*linesize;
360 }else if(src_x<=-block_w){
361 src+= (1-block_w-src_x);
365 start_y= FFMAX(0, -src_y);
366 start_x= FFMAX(0, -src_x);
367 end_y= FFMIN(block_h, h-src_y);
368 end_x= FFMIN(block_w, w-src_x);
369 assert(start_y < end_y && block_h);
370 assert(start_x < end_x && block_w);
373 src += start_y*linesize + start_x;
377 for(y=0; y<start_y; y++){
382 // copy existing part
391 for(; y<block_h; y++){
396 buf -= block_h * linesize + start_x;
399 for(x=0; x<start_x; x++){
400 buf[x] = buf[start_x];
404 for(x=end_x; x<block_w; x++){
405 buf[x] = buf[end_x - 1];
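// Usage sketch (field names such as edge_emu_buffer, h_edge_pos and v_edge_pos follow
// common FFmpeg practice and are assumptions here): when a motion vector points partly
// outside the reference picture, the caller routes the read through this helper and a
// scratch buffer instead of reading out of bounds, e.g. for a 17x17 source block:
#if 0
    if (src_x < 0 || src_y < 0 ||
        src_x + 17 > s->h_edge_pos || src_y + 17 > s->v_edge_pos) {
        ff_emulated_edge_mc(s->edge_emu_buffer, ptr, s->linesize, 17, 17,
                            src_x, src_y, s->h_edge_pos, s->v_edge_pos);
        ptr = s->edge_emu_buffer;
    }
#endif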
411 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
415 /* read the pixels */
417 block[0] = pixels[0];
418 block[1] = pixels[1];
419 block[2] = pixels[2];
420 block[3] = pixels[3];
421 block[4] = pixels[4];
422 block[5] = pixels[5];
423 block[6] = pixels[6];
424 block[7] = pixels[7];
430 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
431 const uint8_t *s2, int stride){
434 /* read the pixels */
436 block[0] = s1[0] - s2[0];
437 block[1] = s1[1] - s2[1];
438 block[2] = s1[2] - s2[2];
439 block[3] = s1[3] - s2[3];
440 block[4] = s1[4] - s2[4];
441 block[5] = s1[5] - s2[5];
442 block[6] = s1[6] - s2[6];
443 block[7] = s1[7] - s2[7];
451 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
455 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
457 /* write the clamped pixels */
459 pixels[0] = cm[block[0]];
460 pixels[1] = cm[block[1]];
461 pixels[2] = cm[block[2]];
462 pixels[3] = cm[block[3]];
463 pixels[4] = cm[block[4]];
464 pixels[5] = cm[block[5]];
465 pixels[6] = cm[block[6]];
466 pixels[7] = cm[block[7]];
473 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
477 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
479 /* write the clamped pixels */
481 pixels[0] = cm[block[0]];
482 pixels[1] = cm[block[1]];
483 pixels[2] = cm[block[2]];
484 pixels[3] = cm[block[3]];
491 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
495 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
497 /* write the clamped pixels */
499 pixels[0] = cm[block[0]];
500 pixels[1] = cm[block[1]];
507 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
508 uint8_t *restrict pixels,
513 for (i = 0; i < 8; i++) {
514 for (j = 0; j < 8; j++) {
517 else if (*block > 127)
520 *pixels = (uint8_t)(*block + 128);
524 pixels += (line_size - 8);
528 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
533 /* write the pixels */
535 pixels[0] = block[0];
536 pixels[1] = block[1];
537 pixels[2] = block[2];
538 pixels[3] = block[3];
539 pixels[4] = block[4];
540 pixels[5] = block[5];
541 pixels[6] = block[6];
542 pixels[7] = block[7];
549 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
553 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
555 /* add the block to the pixels and clamp */
557 pixels[0] = cm[pixels[0] + block[0]];
558 pixels[1] = cm[pixels[1] + block[1]];
559 pixels[2] = cm[pixels[2] + block[2]];
560 pixels[3] = cm[pixels[3] + block[3]];
561 pixels[4] = cm[pixels[4] + block[4]];
562 pixels[5] = cm[pixels[5] + block[5]];
563 pixels[6] = cm[pixels[6] + block[6]];
564 pixels[7] = cm[pixels[7] + block[7]];
570 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
574 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
576 /* add the block to the pixels and clamp */
578 pixels[0] = cm[pixels[0] + block[0]];
579 pixels[1] = cm[pixels[1] + block[1]];
580 pixels[2] = cm[pixels[2] + block[2]];
581 pixels[3] = cm[pixels[3] + block[3]];
587 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
591 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
593 /* add the block to the pixels and clamp */
595 pixels[0] = cm[pixels[0] + block[0]];
596 pixels[1] = cm[pixels[1] + block[1]];
602 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
606 pixels[0] += block[0];
607 pixels[1] += block[1];
608 pixels[2] += block[2];
609 pixels[3] += block[3];
610 pixels[4] += block[4];
611 pixels[5] += block[5];
612 pixels[6] += block[6];
613 pixels[7] += block[7];
619 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
623 pixels[0] += block[0];
624 pixels[1] += block[1];
625 pixels[2] += block[2];
626 pixels[3] += block[3];
632 static int sum_abs_dctelem_c(DCTELEM *block)
636 sum+= FFABS(block[i]);
640 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
644 for (i = 0; i < h; i++) {
645 memset(block, value, 16);
650 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
654 for (i = 0; i < h; i++) {
655 memset(block, value, 8);
660 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
663 uint16_t *dst1 = (uint16_t *) dst;
664 uint16_t *dst2 = (uint16_t *)(dst + linesize);
666 for (j = 0; j < 8; j++) {
667 for (i = 0; i < 8; i++) {
668 dst1[i] = dst2[i] = src[i] * 0x0101;
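// Note: multiplying a byte by 0x0101 copies it into both bytes of the uint16_t
// (e.g. 0x35 * 0x0101 == 0x3535), and the same value is written to dst1 and dst2,
// so every source sample becomes a 2x2 block and the 8x8 source is upscaled to 16x16.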
678 #define PIXOP2(OPNAME, OP) \
679 static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
683 OP(*((uint64_t*)block), AV_RN64(pixels));\
689 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
693 const uint64_t a= AV_RN64(pixels );\
694 const uint64_t b= AV_RN64(pixels+1);\
695 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
701 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
705 const uint64_t a= AV_RN64(pixels );\
706 const uint64_t b= AV_RN64(pixels+1);\
707 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
713 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
717 const uint64_t a= AV_RN64(pixels );\
718 const uint64_t b= AV_RN64(pixels+line_size);\
719 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
725 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
729 const uint64_t a= AV_RN64(pixels );\
730 const uint64_t b= AV_RN64(pixels+line_size);\
731 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
737 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
740 const uint64_t a= AV_RN64(pixels );\
741 const uint64_t b= AV_RN64(pixels+1);\
742 uint64_t l0= (a&0x0303030303030303ULL)\
743 + (b&0x0303030303030303ULL)\
744 + 0x0202020202020202ULL;\
745 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
746 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
750 for(i=0; i<h; i+=2){\
751 uint64_t a= AV_RN64(pixels );\
752 uint64_t b= AV_RN64(pixels+1);\
753 l1= (a&0x0303030303030303ULL)\
754 + (b&0x0303030303030303ULL);\
755 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
756 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
757 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
760 a= AV_RN64(pixels );\
761 b= AV_RN64(pixels+1);\
762 l0= (a&0x0303030303030303ULL)\
763 + (b&0x0303030303030303ULL)\
764 + 0x0202020202020202ULL;\
765 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
766 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
767 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
773 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
776 const uint64_t a= AV_RN64(pixels );\
777 const uint64_t b= AV_RN64(pixels+1);\
778 uint64_t l0= (a&0x0303030303030303ULL)\
779 + (b&0x0303030303030303ULL)\
780 + 0x0101010101010101ULL;\
781 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
782 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
786 for(i=0; i<h; i+=2){\
787 uint64_t a= AV_RN64(pixels );\
788 uint64_t b= AV_RN64(pixels+1);\
789 l1= (a&0x0303030303030303ULL)\
790 + (b&0x0303030303030303ULL);\
791 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
792 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
793 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
796 a= AV_RN64(pixels );\
797 b= AV_RN64(pixels+1);\
798 l0= (a&0x0303030303030303ULL)\
799 + (b&0x0303030303030303ULL)\
800 + 0x0101010101010101ULL;\
801 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
802 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
803 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
809 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
810 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
811 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
812 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
813 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
814 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
815 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
817 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
818 #else // end of the 64 bit variant; the 32 bit variant follows
820 #define PIXOP2(OPNAME, OP) \
821 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
824 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
829 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
832 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
837 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
840 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
841 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
846 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
847 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
850 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
851 int src_stride1, int src_stride2, int h){\
855 a= AV_RN32(&src1[i*src_stride1 ]);\
856 b= AV_RN32(&src2[i*src_stride2 ]);\
857 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
858 a= AV_RN32(&src1[i*src_stride1+4]);\
859 b= AV_RN32(&src2[i*src_stride2+4]);\
860 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
864 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
865 int src_stride1, int src_stride2, int h){\
869 a= AV_RN32(&src1[i*src_stride1 ]);\
870 b= AV_RN32(&src2[i*src_stride2 ]);\
871 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
872 a= AV_RN32(&src1[i*src_stride1+4]);\
873 b= AV_RN32(&src2[i*src_stride2+4]);\
874 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
878 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
879 int src_stride1, int src_stride2, int h){\
883 a= AV_RN32(&src1[i*src_stride1 ]);\
884 b= AV_RN32(&src2[i*src_stride2 ]);\
885 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
889 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
890 int src_stride1, int src_stride2, int h){\
894 a= AV_RN16(&src1[i*src_stride1 ]);\
895 b= AV_RN16(&src2[i*src_stride2 ]);\
896 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
900 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
901 int src_stride1, int src_stride2, int h){\
902 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
903 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
906 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
907 int src_stride1, int src_stride2, int h){\
908 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
909 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
912 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
913 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
916 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
917 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
920 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
921 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
924 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
925 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
928 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
929 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
932 uint32_t a, b, c, d, l0, l1, h0, h1;\
933 a= AV_RN32(&src1[i*src_stride1]);\
934 b= AV_RN32(&src2[i*src_stride2]);\
935 c= AV_RN32(&src3[i*src_stride3]);\
936 d= AV_RN32(&src4[i*src_stride4]);\
937 l0= (a&0x03030303UL)\
940 h0= ((a&0xFCFCFCFCUL)>>2)\
941 + ((b&0xFCFCFCFCUL)>>2);\
942 l1= (c&0x03030303UL)\
944 h1= ((c&0xFCFCFCFCUL)>>2)\
945 + ((d&0xFCFCFCFCUL)>>2);\
946 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
947 a= AV_RN32(&src1[i*src_stride1+4]);\
948 b= AV_RN32(&src2[i*src_stride2+4]);\
949 c= AV_RN32(&src3[i*src_stride3+4]);\
950 d= AV_RN32(&src4[i*src_stride4+4]);\
951 l0= (a&0x03030303UL)\
954 h0= ((a&0xFCFCFCFCUL)>>2)\
955 + ((b&0xFCFCFCFCUL)>>2);\
956 l1= (c&0x03030303UL)\
958 h1= ((c&0xFCFCFCFCUL)>>2)\
959 + ((d&0xFCFCFCFCUL)>>2);\
960 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
964 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
965 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
968 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
969 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
972 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
973 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
976 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
977 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
980 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
981 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
984 uint32_t a, b, c, d, l0, l1, h0, h1;\
985 a= AV_RN32(&src1[i*src_stride1]);\
986 b= AV_RN32(&src2[i*src_stride2]);\
987 c= AV_RN32(&src3[i*src_stride3]);\
988 d= AV_RN32(&src4[i*src_stride4]);\
989 l0= (a&0x03030303UL)\
992 h0= ((a&0xFCFCFCFCUL)>>2)\
993 + ((b&0xFCFCFCFCUL)>>2);\
994 l1= (c&0x03030303UL)\
996 h1= ((c&0xFCFCFCFCUL)>>2)\
997 + ((d&0xFCFCFCFCUL)>>2);\
998 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
999 a= AV_RN32(&src1[i*src_stride1+4]);\
1000 b= AV_RN32(&src2[i*src_stride2+4]);\
1001 c= AV_RN32(&src3[i*src_stride3+4]);\
1002 d= AV_RN32(&src4[i*src_stride4+4]);\
1003 l0= (a&0x03030303UL)\
1006 h0= ((a&0xFCFCFCFCUL)>>2)\
1007 + ((b&0xFCFCFCFCUL)>>2);\
1008 l1= (c&0x03030303UL)\
1009 + (d&0x03030303UL);\
1010 h1= ((c&0xFCFCFCFCUL)>>2)\
1011 + ((d&0xFCFCFCFCUL)>>2);\
1012 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1015 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1016 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1017 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1018 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1020 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1021 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1022 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1023 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1026 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1028 int i, a0, b0, a1, b1;\
1035 for(i=0; i<h; i+=2){\
1041 block[0]= (a1+a0)>>2; /* FIXME non put */\
1042 block[1]= (b1+b0)>>2;\
1052 block[0]= (a1+a0)>>2;\
1053 block[1]= (b1+b0)>>2;\
1059 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1062 const uint32_t a= AV_RN32(pixels );\
1063 const uint32_t b= AV_RN32(pixels+1);\
1064 uint32_t l0= (a&0x03030303UL)\
1067 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1068 + ((b&0xFCFCFCFCUL)>>2);\
1072 for(i=0; i<h; i+=2){\
1073 uint32_t a= AV_RN32(pixels );\
1074 uint32_t b= AV_RN32(pixels+1);\
1075 l1= (a&0x03030303UL)\
1076 + (b&0x03030303UL);\
1077 h1= ((a&0xFCFCFCFCUL)>>2)\
1078 + ((b&0xFCFCFCFCUL)>>2);\
1079 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1082 a= AV_RN32(pixels );\
1083 b= AV_RN32(pixels+1);\
1084 l0= (a&0x03030303UL)\
1087 h0= ((a&0xFCFCFCFCUL)>>2)\
1088 + ((b&0xFCFCFCFCUL)>>2);\
1089 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1095 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1098 for(j=0; j<2; j++){\
1100 const uint32_t a= AV_RN32(pixels );\
1101 const uint32_t b= AV_RN32(pixels+1);\
1102 uint32_t l0= (a&0x03030303UL)\
1105 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1106 + ((b&0xFCFCFCFCUL)>>2);\
1110 for(i=0; i<h; i+=2){\
1111 uint32_t a= AV_RN32(pixels );\
1112 uint32_t b= AV_RN32(pixels+1);\
1113 l1= (a&0x03030303UL)\
1114 + (b&0x03030303UL);\
1115 h1= ((a&0xFCFCFCFCUL)>>2)\
1116 + ((b&0xFCFCFCFCUL)>>2);\
1117 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1120 a= AV_RN32(pixels );\
1121 b= AV_RN32(pixels+1);\
1122 l0= (a&0x03030303UL)\
1125 h0= ((a&0xFCFCFCFCUL)>>2)\
1126 + ((b&0xFCFCFCFCUL)>>2);\
1127 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1131 pixels+=4-line_size*(h+1);\
1132 block +=4-line_size*h;\
1136 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1139 for(j=0; j<2; j++){\
1141 const uint32_t a= AV_RN32(pixels );\
1142 const uint32_t b= AV_RN32(pixels+1);\
1143 uint32_t l0= (a&0x03030303UL)\
1146 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1147 + ((b&0xFCFCFCFCUL)>>2);\
1151 for(i=0; i<h; i+=2){\
1152 uint32_t a= AV_RN32(pixels );\
1153 uint32_t b= AV_RN32(pixels+1);\
1154 l1= (a&0x03030303UL)\
1155 + (b&0x03030303UL);\
1156 h1= ((a&0xFCFCFCFCUL)>>2)\
1157 + ((b&0xFCFCFCFCUL)>>2);\
1158 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1161 a= AV_RN32(pixels );\
1162 b= AV_RN32(pixels+1);\
1163 l0= (a&0x03030303UL)\
1166 h0= ((a&0xFCFCFCFCUL)>>2)\
1167 + ((b&0xFCFCFCFCUL)>>2);\
1168 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1172 pixels+=4-line_size*(h+1);\
1173 block +=4-line_size*h;\
1177 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1178 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1179 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1180 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1181 av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1182 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1183 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1184 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1186 #define op_avg(a, b) a = rnd_avg32(a, b)
1188 #define op_put(a, b) a = b
1195 #define put_no_rnd_pixels8_c put_pixels8_c
1196 #define put_no_rnd_pixels16_c put_pixels16_c
1198 #define avg2(a,b) (((a)+(b)+1)>>1)
1199 #define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
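// Note on the rounding averages used here and in the PIXOP2 macros: for unsigned a, b
//     (a + b) >> 1     == (a & b) + ((a ^ b) >> 1)   (rounds down, "no_rnd")
//     (a + b + 1) >> 1 == (a | b) - ((a ^ b) >> 1)   (rounds up)
// Masking the xor with 0xFEFEFEFE before the shift keeps bits from crossing byte
// lanes, which is how rnd_avg32()/no_rnd_avg32() average four pixels per word.
// Example: a = 5, b = 8: a^b = 13, (a|b) - (13 >> 1) = 13 - 6 = 7 = (5 + 8 + 1) >> 1.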
1201 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1202 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1205 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1206 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1209 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1211 const int A=(16-x16)*(16-y16);
1212 const int B=( x16)*(16-y16);
1213 const int C=(16-x16)*( y16);
1214 const int D=( x16)*( y16);
1219 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1220 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1221 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1222 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1223 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1224 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1225 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1226 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
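// Worked example for the gmc1_c weights: A + B + C + D = (16-x16)(16-y16) + x16(16-y16)
// + (16-x16)y16 + x16*y16 = 16*16 = 256, so after adding the rounder the sum is a
// fixed-point bilinear blend normalized by the >> 8. E.g. x16 = 4, y16 = 8 gives
// A = 96, B = 32, C = 96, D = 32, which indeed sum to 256.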
1232 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1233 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1236 const int s= 1<<shift;
1246 for(x=0; x<8; x++){ //XXX FIXME optimize
1247 int src_x, src_y, frac_x, frac_y, index;
1251 frac_x= src_x&(s-1);
1252 frac_y= src_y&(s-1);
1256 if((unsigned)src_x < width){
1257 if((unsigned)src_y < height){
1258 index= src_x + src_y*stride;
1259 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1260 + src[index +1]* frac_x )*(s-frac_y)
1261 + ( src[index+stride ]*(s-frac_x)
1262 + src[index+stride+1]* frac_x )* frac_y
1265 index= src_x + av_clip(src_y, 0, height)*stride;
1266 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1267 + src[index +1]* frac_x )*s
1271 if((unsigned)src_y < height){
1272 index= av_clip(src_x, 0, width) + src_y*stride;
1273 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1274 + src[index+stride ]* frac_y )*s
1277 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1278 dst[y*stride + x]= src[index ];
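// Note on the normalization in ff_gmc_c: for each output pixel the four bilinear
// weights (s-frac_x)(s-frac_y), frac_x(s-frac_y), (s-frac_x)frac_y and frac_x*frac_y
// sum to s*s = 1 << (2*shift), which is why the function takes the rounding term r
// and the shift: the blended sum still has to be rounded with r and brought back to
// pixel range by a right shift of 2*shift.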
1290 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1292 case 2: put_pixels2_c (dst, src, stride, height); break;
1293 case 4: put_pixels4_c (dst, src, stride, height); break;
1294 case 8: put_pixels8_c (dst, src, stride, height); break;
1295 case 16:put_pixels16_c(dst, src, stride, height); break;
1299 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1301 for (i=0; i < height; i++) {
1302 for (j=0; j < width; j++) {
1303 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1310 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1312 for (i=0; i < height; i++) {
1313 for (j=0; j < width; j++) {
1314 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1321 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1323 for (i=0; i < height; i++) {
1324 for (j=0; j < width; j++) {
1325 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1332 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1334 for (i=0; i < height; i++) {
1335 for (j=0; j < width; j++) {
1336 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1343 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1345 for (i=0; i < height; i++) {
1346 for (j=0; j < width; j++) {
1347 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1354 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1356 for (i=0; i < height; i++) {
1357 for (j=0; j < width; j++) {
1358 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1365 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1367 for (i=0; i < height; i++) {
1368 for (j=0; j < width; j++) {
1369 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1376 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1378 for (i=0; i < height; i++) {
1379 for (j=0; j < width; j++) {
1380 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1387 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1389 case 2: avg_pixels2_c (dst, src, stride, height); break;
1390 case 4: avg_pixels4_c (dst, src, stride, height); break;
1391 case 8: avg_pixels8_c (dst, src, stride, height); break;
1392 case 16:avg_pixels16_c(dst, src, stride, height); break;
1396 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1398 for (i=0; i < height; i++) {
1399 for (j=0; j < width; j++) {
1400 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1407 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1409 for (i=0; i < height; i++) {
1410 for (j=0; j < width; j++) {
1411 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1418 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1420 for (i=0; i < height; i++) {
1421 for (j=0; j < width; j++) {
1422 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1429 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1431 for (i=0; i < height; i++) {
1432 for (j=0; j < width; j++) {
1433 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1440 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1442 for (i=0; i < height; i++) {
1443 for (j=0; j < width; j++) {
1444 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1451 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1453 for (i=0; i < height; i++) {
1454 for (j=0; j < width; j++) {
1455 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1462 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1464 for (i=0; i < height; i++) {
1465 for (j=0; j < width; j++) {
1466 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1473 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1475 for (i=0; i < height; i++) {
1476 for (j=0; j < width; j++) {
1477 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
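// Note on the constants in the thirdpel (tpel) interpolators above: 683 ~= 2^11 / 3 and
// 2731 ~= 2^15 / 12, so (683*(2*a + b + 1)) >> 11 approximates (2*a + b)/3 with rounding
// and (2731*(... + 6)) >> 15 divides the 12-weight bilinear sum by 12. Worked example:
// a = 90, b = 120: 683*(2*90 + 120 + 1) = 205583, and 205583 >> 11 = 100 = (2*90 + 120)/3.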
1484 #define TPEL_WIDTH(width)\
1485 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1486 put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1487 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1488 put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1489 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1490 put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1491 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1492 put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1493 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1494 put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1495 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1496 put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1497 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1498 put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1499 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1500 put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1501 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1502 put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1505 #define H264_CHROMA_MC(OPNAME, OP)\
1506 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1507 const int A=(8-x)*(8-y);\
1508 const int B=( x)*(8-y);\
1509 const int C=(8-x)*( y);\
1510 const int D=( x)*( y);\
1513 assert(x<8 && y<8 && x>=0 && y>=0);\
1516 for(i=0; i<h; i++){\
1517 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1518 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1524 const int step= C ? stride : 1;\
1525 for(i=0; i<h; i++){\
1526 OP(dst[0], (A*src[0] + E*src[step+0]));\
1527 OP(dst[1], (A*src[1] + E*src[step+1]));\
1534 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1535 const int A=(8-x)*(8-y);\
1536 const int B=( x)*(8-y);\
1537 const int C=(8-x)*( y);\
1538 const int D=( x)*( y);\
1541 assert(x<8 && y<8 && x>=0 && y>=0);\
1544 for(i=0; i<h; i++){\
1545 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1546 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1547 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1548 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1554 const int step= C ? stride : 1;\
1555 for(i=0; i<h; i++){\
1556 OP(dst[0], (A*src[0] + E*src[step+0]));\
1557 OP(dst[1], (A*src[1] + E*src[step+1]));\
1558 OP(dst[2], (A*src[2] + E*src[step+2]));\
1559 OP(dst[3], (A*src[3] + E*src[step+3]));\
1566 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1567 const int A=(8-x)*(8-y);\
1568 const int B=( x)*(8-y);\
1569 const int C=(8-x)*( y);\
1570 const int D=( x)*( y);\
1573 assert(x<8 && y<8 && x>=0 && y>=0);\
1576 for(i=0; i<h; i++){\
1577 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1578 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1579 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1580 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1581 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1582 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1583 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1584 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1590 const int step= C ? stride : 1;\
1591 for(i=0; i<h; i++){\
1592 OP(dst[0], (A*src[0] + E*src[step+0]));\
1593 OP(dst[1], (A*src[1] + E*src[step+1]));\
1594 OP(dst[2], (A*src[2] + E*src[step+2]));\
1595 OP(dst[3], (A*src[3] + E*src[step+3]));\
1596 OP(dst[4], (A*src[4] + E*src[step+4]));\
1597 OP(dst[5], (A*src[5] + E*src[step+5]));\
1598 OP(dst[6], (A*src[6] + E*src[step+6]));\
1599 OP(dst[7], (A*src[7] + E*src[step+7]));\
1606 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1607 #define op_put(a, b) a = (((b) + 32)>>6)
1609 H264_CHROMA_MC(put_ , op_put)
1610 H264_CHROMA_MC(avg_ , op_avg)
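// Note on the chroma MC weights: A + B + C + D = (8-x)(8-y) + x(8-y) + (8-x)y + xy = 64,
// so op_put's "(b + 32) >> 6" is a round-to-nearest division by 64, and op_avg then
// averages that result with the existing pixel, again with +1 rounding. E.g. x = 3,
// y = 2 gives A = 30, B = 18, C = 10, D = 6, which sum to 64.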
1614 #define QPEL_MC(r, OPNAME, RND, OP) \
1615 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1616 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1620 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1621 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1622 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1623 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1624 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1625 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1626 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1627 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1633 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1635 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1639 const int src0= src[0*srcStride];\
1640 const int src1= src[1*srcStride];\
1641 const int src2= src[2*srcStride];\
1642 const int src3= src[3*srcStride];\
1643 const int src4= src[4*srcStride];\
1644 const int src5= src[5*srcStride];\
1645 const int src6= src[6*srcStride];\
1646 const int src7= src[7*srcStride];\
1647 const int src8= src[8*srcStride];\
1648 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1649 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1650 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1651 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1652 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1653 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1654 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1655 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1661 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1662 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1667 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1668 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1669 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1670 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1671 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1672 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1673 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1674 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1675 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1676 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1677 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1678 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1679 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1680 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1681 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1682 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1688 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1689 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1694 const int src0= src[0*srcStride];\
1695 const int src1= src[1*srcStride];\
1696 const int src2= src[2*srcStride];\
1697 const int src3= src[3*srcStride];\
1698 const int src4= src[4*srcStride];\
1699 const int src5= src[5*srcStride];\
1700 const int src6= src[6*srcStride];\
1701 const int src7= src[7*srcStride];\
1702 const int src8= src[8*srcStride];\
1703 const int src9= src[9*srcStride];\
1704 const int src10= src[10*srcStride];\
1705 const int src11= src[11*srcStride];\
1706 const int src12= src[12*srcStride];\
1707 const int src13= src[13*srcStride];\
1708 const int src14= src[14*srcStride];\
1709 const int src15= src[15*srcStride];\
1710 const int src16= src[16*srcStride];\
1711 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1712 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1713 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1714 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1715 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1716 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1717 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1718 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1719 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1720 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1721 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1722 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1723 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1724 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1725 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1726 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
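// Note on the qpel lowpass taps: each output is the symmetric 8-tap filter
// (-1, 3, -6, 20, 20, -6, 3, -1) applied to the half-pel position (with the missing
// samples mirrored at the block edges), and the taps sum to 32, so a flat input of
// value v yields 2*v*(20 - 6 + 3 - 1) = 32*v; the OP macros (elided here) then shift
// the result right by 5 with rounding and clamp it through cm[] back to 8-bit range.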
1732 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1734 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1735 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1738 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1739 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1742 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1744 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1745 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1748 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1749 uint8_t full[16*9];\
1751 copy_block9(full, src, 16, stride, 9);\
1752 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1753 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1756 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1757 uint8_t full[16*9];\
1758 copy_block9(full, src, 16, stride, 9);\
1759 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1762 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1763 uint8_t full[16*9];\
1765 copy_block9(full, src, 16, stride, 9);\
1766 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1767 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1769 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1770 uint8_t full[16*9];\
1773 uint8_t halfHV[64];\
1774 copy_block9(full, src, 16, stride, 9);\
1775 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1776 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1777 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1778 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1780 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1781 uint8_t full[16*9];\
1783 uint8_t halfHV[64];\
1784 copy_block9(full, src, 16, stride, 9);\
1785 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1786 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1787 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1788 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1790 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1791 uint8_t full[16*9];\
1794 uint8_t halfHV[64];\
1795 copy_block9(full, src, 16, stride, 9);\
1796 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1797 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1798 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1799 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1801 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1802 uint8_t full[16*9];\
1804 uint8_t halfHV[64];\
1805 copy_block9(full, src, 16, stride, 9);\
1806 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1807 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1808 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1809 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1811 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1812 uint8_t full[16*9];\
1815 uint8_t halfHV[64];\
1816 copy_block9(full, src, 16, stride, 9);\
1817 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1818 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1819 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1820 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1822 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1823 uint8_t full[16*9];\
1825 uint8_t halfHV[64];\
1826 copy_block9(full, src, 16, stride, 9);\
1827 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1828 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1829 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1830 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1832 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1833 uint8_t full[16*9];\
1836 uint8_t halfHV[64];\
1837 copy_block9(full, src, 16, stride, 9);\
1838 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1839 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1840 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1841 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1843 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1844 uint8_t full[16*9];\
1846 uint8_t halfHV[64];\
1847 copy_block9(full, src, 16, stride, 9);\
1848 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1849 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1850 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1851 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1853 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1855 uint8_t halfHV[64];\
1856 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1857 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1858 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1860 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1862 uint8_t halfHV[64];\
1863 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1864 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1865 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1867 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1868 uint8_t full[16*9];\
1871 uint8_t halfHV[64];\
1872 copy_block9(full, src, 16, stride, 9);\
1873 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1874 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1875 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1876 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1878 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1879 uint8_t full[16*9];\
1881 copy_block9(full, src, 16, stride, 9);\
1882 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1883 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1884 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1886 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1887 uint8_t full[16*9];\
1890 uint8_t halfHV[64];\
1891 copy_block9(full, src, 16, stride, 9);\
1892 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1893 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1894 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1895 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1897 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1898 uint8_t full[16*9];\
1900 copy_block9(full, src, 16, stride, 9);\
1901 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1902 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1903 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1905 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1907 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1908 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1911 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1913 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1914 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1917 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1918 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1921 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1923 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1924 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1927 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1928 uint8_t full[24*17];\
1930 copy_block17(full, src, 24, stride, 17);\
1931 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1932 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1935 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1936 uint8_t full[24*17];\
1937 copy_block17(full, src, 24, stride, 17);\
1938 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1941 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1942 uint8_t full[24*17];\
1944 copy_block17(full, src, 24, stride, 17);\
1945 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1946 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1948 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1949 uint8_t full[24*17];\
1950 uint8_t halfH[272];\
1951 uint8_t halfV[256];\
1952 uint8_t halfHV[256];\
1953 copy_block17(full, src, 24, stride, 17);\
1954 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1955 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1956 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1957 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1959 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1960 uint8_t full[24*17];\
1961 uint8_t halfH[272];\
1962 uint8_t halfHV[256];\
1963 copy_block17(full, src, 24, stride, 17);\
1964 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1965 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1966 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1967 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1969 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1970 uint8_t full[24*17];\
1971 uint8_t halfH[272];\
1972 uint8_t halfV[256];\
1973 uint8_t halfHV[256];\
1974 copy_block17(full, src, 24, stride, 17);\
1975 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1976 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1977 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1978 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1980 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1981 uint8_t full[24*17];\
1982 uint8_t halfH[272];\
1983 uint8_t halfHV[256];\
1984 copy_block17(full, src, 24, stride, 17);\
1985 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1986 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1987 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1988 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1990 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1991 uint8_t full[24*17];\
1992 uint8_t halfH[272];\
1993 uint8_t halfV[256];\
1994 uint8_t halfHV[256];\
1995 copy_block17(full, src, 24, stride, 17);\
1996 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1997 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1998 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1999 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2001 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2002 uint8_t full[24*17];\
2003 uint8_t halfH[272];\
2004 uint8_t halfHV[256];\
2005 copy_block17(full, src, 24, stride, 17);\
2006 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2007 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2008 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2009 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2011 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2012 uint8_t full[24*17];\
2013 uint8_t halfH[272];\
2014 uint8_t halfV[256];\
2015 uint8_t halfHV[256];\
2016 copy_block17(full, src, 24, stride, 17);\
2017 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2018 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2019 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2020 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2022 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2023 uint8_t full[24*17];\
2024 uint8_t halfH[272];\
2025 uint8_t halfHV[256];\
2026 copy_block17(full, src, 24, stride, 17);\
2027 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2028 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2029 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2030 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2032 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2033 uint8_t halfH[272];\
2034 uint8_t halfHV[256];\
2035 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2036 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2037 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2039 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2040 uint8_t halfH[272];\
2041 uint8_t halfHV[256];\
2042 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2043 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2044 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2046 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2047 uint8_t full[24*17];\
2048 uint8_t halfH[272];\
2049 uint8_t halfV[256];\
2050 uint8_t halfHV[256];\
2051 copy_block17(full, src, 24, stride, 17);\
2052 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2053 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2054 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2055 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2057 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2058 uint8_t full[24*17];\
2059 uint8_t halfH[272];\
2060 copy_block17(full, src, 24, stride, 17);\
2061 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2062 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2063 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2065 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2066 uint8_t full[24*17];\
2067 uint8_t halfH[272];\
2068 uint8_t halfV[256];\
2069 uint8_t halfHV[256];\
2070 copy_block17(full, src, 24, stride, 17);\
2071 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2072 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2073 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2074 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2076 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2077 uint8_t full[24*17];\
2078 uint8_t halfH[272];\
2079 copy_block17(full, src, 24, stride, 17);\
2080 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2081 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2082 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2084 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2085 uint8_t halfH[272];\
2086 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2087 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
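/* Store/rounding macros plugged into QPEL_MC: the MPEG-4 qpel filters scale their
   output by 32, so op_put rounds and clips with ((b)+16)>>5; the _no_rnd variants add
   only 15 (the MPEG-4 "no rounding" mode), and op_avg additionally averages with the
   pixel already in dst. */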
2090 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2091 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2092 #define op_put(a, b) a = cm[((b) + 16)>>5]
2093 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2095 QPEL_MC(0, put_ , _ , op_put)
2096 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2097 QPEL_MC(0, avg_ , _ , op_avg)
2098 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2100 #undef op_avg_no_rnd
2102 #undef op_put_no_rnd
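/* The (0,0) position needs no filtering; it is a plain copy (or average, for avg_) of the block. */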
2104 #define put_qpel8_mc00_c ff_put_pixels8x8_c
2105 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
2106 #define put_qpel16_mc00_c ff_put_pixels16x16_c
2107 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2108 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
2109 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
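/* H264_LOWPASS generates the H.264 6-tap (1,-5,20,20,-5,1) half-sample interpolation
   filters: horizontal, vertical and the combined hv variant, which keeps its intermediate
   results in a 16-bit tmp buffer. */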
2112 #define H264_LOWPASS(OPNAME, OP, OP2) \
2113 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2115 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2119 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2120 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2126 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2128 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2132 const int srcB= src[-2*srcStride];\
2133 const int srcA= src[-1*srcStride];\
2134 const int src0= src[0 *srcStride];\
2135 const int src1= src[1 *srcStride];\
2136 const int src2= src[2 *srcStride];\
2137 const int src3= src[3 *srcStride];\
2138 const int src4= src[4 *srcStride];\
2139 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2140 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2146 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2149 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2151 src -= 2*srcStride;\
2152 for(i=0; i<h+5; i++)\
2154 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2155 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2159 tmp -= tmpStride*(h+5-2);\
2162 const int tmpB= tmp[-2*tmpStride];\
2163 const int tmpA= tmp[-1*tmpStride];\
2164 const int tmp0= tmp[0 *tmpStride];\
2165 const int tmp1= tmp[1 *tmpStride];\
2166 const int tmp2= tmp[2 *tmpStride];\
2167 const int tmp3= tmp[3 *tmpStride];\
2168 const int tmp4= tmp[4 *tmpStride];\
2169 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2170 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2175 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2177 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2181 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2182 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2183 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2184 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2190 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2192 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2196 const int srcB= src[-2*srcStride];\
2197 const int srcA= src[-1*srcStride];\
2198 const int src0= src[0 *srcStride];\
2199 const int src1= src[1 *srcStride];\
2200 const int src2= src[2 *srcStride];\
2201 const int src3= src[3 *srcStride];\
2202 const int src4= src[4 *srcStride];\
2203 const int src5= src[5 *srcStride];\
2204 const int src6= src[6 *srcStride];\
2205 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2206 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2207 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2208 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2214 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2217 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2219 src -= 2*srcStride;\
2220 for(i=0; i<h+5; i++)\
2222 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2223 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2224 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2225 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2229 tmp -= tmpStride*(h+5-2);\
2232 const int tmpB= tmp[-2*tmpStride];\
2233 const int tmpA= tmp[-1*tmpStride];\
2234 const int tmp0= tmp[0 *tmpStride];\
2235 const int tmp1= tmp[1 *tmpStride];\
2236 const int tmp2= tmp[2 *tmpStride];\
2237 const int tmp3= tmp[3 *tmpStride];\
2238 const int tmp4= tmp[4 *tmpStride];\
2239 const int tmp5= tmp[5 *tmpStride];\
2240 const int tmp6= tmp[6 *tmpStride];\
2241 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2242 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2243 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2244 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2250 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2252 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2256 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2257 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2258 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2259 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2260 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2261 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2262 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2263 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2269 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2271 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2275 const int srcB= src[-2*srcStride];\
2276 const int srcA= src[-1*srcStride];\
2277 const int src0= src[0 *srcStride];\
2278 const int src1= src[1 *srcStride];\
2279 const int src2= src[2 *srcStride];\
2280 const int src3= src[3 *srcStride];\
2281 const int src4= src[4 *srcStride];\
2282 const int src5= src[5 *srcStride];\
2283 const int src6= src[6 *srcStride];\
2284 const int src7= src[7 *srcStride];\
2285 const int src8= src[8 *srcStride];\
2286 const int src9= src[9 *srcStride];\
2287 const int src10=src[10*srcStride];\
2288 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2289 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2290 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2291 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2292 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2293 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2294 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2295 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2301 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2304 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2306 src -= 2*srcStride;\
2307 for(i=0; i<h+5; i++)\
2309 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2310 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2311 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2312 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2313 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2314 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2315 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2316 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2320 tmp -= tmpStride*(h+5-2);\
2323 const int tmpB= tmp[-2*tmpStride];\
2324 const int tmpA= tmp[-1*tmpStride];\
2325 const int tmp0= tmp[0 *tmpStride];\
2326 const int tmp1= tmp[1 *tmpStride];\
2327 const int tmp2= tmp[2 *tmpStride];\
2328 const int tmp3= tmp[3 *tmpStride];\
2329 const int tmp4= tmp[4 *tmpStride];\
2330 const int tmp5= tmp[5 *tmpStride];\
2331 const int tmp6= tmp[6 *tmpStride];\
2332 const int tmp7= tmp[7 *tmpStride];\
2333 const int tmp8= tmp[8 *tmpStride];\
2334 const int tmp9= tmp[9 *tmpStride];\
2335 const int tmp10=tmp[10*tmpStride];\
2336 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2337 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2338 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2339 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2340 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2341 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2342 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2343 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2349 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2350 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2351 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2352 src += 8*srcStride;\
2353 dst += 8*dstStride;\
2354 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2355 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2358 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2359 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2360 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2361 src += 8*srcStride;\
2362 dst += 8*dstStride;\
2363 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2364 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2367 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2368 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2369 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2370 src += 8*srcStride;\
2371 dst += 8*dstStride;\
2372 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2373 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
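/* H264_MC builds the 16 quarter-sample motion compensation functions mcXY for one block
   size; X and Y are the horizontal and vertical quarter-sample offsets, and quarter
   positions are formed by averaging (pixels_lN) neighbouring full/half-sample results. */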
2376 #define H264_MC(OPNAME, SIZE) \
2377 static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2378 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2381 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2382 uint8_t half[SIZE*SIZE];\
2383 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2384 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2387 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2388 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2391 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2392 uint8_t half[SIZE*SIZE];\
2393 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2394 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2397 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2398 uint8_t full[SIZE*(SIZE+5)];\
2399 uint8_t * const full_mid= full + SIZE*2;\
2400 uint8_t half[SIZE*SIZE];\
2401 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2402 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2403 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2406 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2407 uint8_t full[SIZE*(SIZE+5)];\
2408 uint8_t * const full_mid= full + SIZE*2;\
2409 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2410 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2413 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2414 uint8_t full[SIZE*(SIZE+5)];\
2415 uint8_t * const full_mid= full + SIZE*2;\
2416 uint8_t half[SIZE*SIZE];\
2417 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2418 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2419 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2422 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2423 uint8_t full[SIZE*(SIZE+5)];\
2424 uint8_t * const full_mid= full + SIZE*2;\
2425 uint8_t halfH[SIZE*SIZE];\
2426 uint8_t halfV[SIZE*SIZE];\
2427 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2428 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2429 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2430 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2433 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2434 uint8_t full[SIZE*(SIZE+5)];\
2435 uint8_t * const full_mid= full + SIZE*2;\
2436 uint8_t halfH[SIZE*SIZE];\
2437 uint8_t halfV[SIZE*SIZE];\
2438 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2439 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2440 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2441 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2444 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2445 uint8_t full[SIZE*(SIZE+5)];\
2446 uint8_t * const full_mid= full + SIZE*2;\
2447 uint8_t halfH[SIZE*SIZE];\
2448 uint8_t halfV[SIZE*SIZE];\
2449 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2450 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2451 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2452 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2455 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2456 uint8_t full[SIZE*(SIZE+5)];\
2457 uint8_t * const full_mid= full + SIZE*2;\
2458 uint8_t halfH[SIZE*SIZE];\
2459 uint8_t halfV[SIZE*SIZE];\
2460 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2461 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2462 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2463 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2466 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2467 int16_t tmp[SIZE*(SIZE+5)];\
2468 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2471 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2472 int16_t tmp[SIZE*(SIZE+5)];\
2473 uint8_t halfH[SIZE*SIZE];\
2474 uint8_t halfHV[SIZE*SIZE];\
2475 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2476 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2477 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2480 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2481 int16_t tmp[SIZE*(SIZE+5)];\
2482 uint8_t halfH[SIZE*SIZE];\
2483 uint8_t halfHV[SIZE*SIZE];\
2484 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2485 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2486 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2489 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2490 uint8_t full[SIZE*(SIZE+5)];\
2491 uint8_t * const full_mid= full + SIZE*2;\
2492 int16_t tmp[SIZE*(SIZE+5)];\
2493 uint8_t halfV[SIZE*SIZE];\
2494 uint8_t halfHV[SIZE*SIZE];\
2495 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2496 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2497 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2498 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2501 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2502 uint8_t full[SIZE*(SIZE+5)];\
2503 uint8_t * const full_mid= full + SIZE*2;\
2504 int16_t tmp[SIZE*(SIZE+5)];\
2505 uint8_t halfV[SIZE*SIZE];\
2506 uint8_t halfHV[SIZE*SIZE];\
2507 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2508 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2509 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2510 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
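/* The 1-D filters scale by 32, hence the (+16)>>5 rounding in op_*; the 2-D hv pass works
   on 16-bit intermediates scaled by 32*32, hence the (+512)>>10 rounding in op2_*. */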
2513 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2514 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2515 #define op_put(a, b) a = cm[((b) + 16)>>5]
2516 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2517 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2519 H264_LOWPASS(put_ , op_put, op2_put)
2520 H264_LOWPASS(avg_ , op_avg, op2_avg)
2535 #define put_h264_qpel8_mc00_c ff_put_pixels8x8_c
2536 #define avg_h264_qpel8_mc00_c ff_avg_pixels8x8_c
2537 #define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
2538 #define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
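/* WMV2 ("mspel") half-sample interpolation: a 4-tap (-1,9,9,-1)/16 filter, horizontal here
   and vertical further below. */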
2540 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2541 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2545 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2546 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2547 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2548 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2549 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2550 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2551 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2552 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2558 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2559 put_pixels8_c(dst, src, stride, 8);
2561 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2562 avg_pixels8_c(dst, src, stride, 8);
2564 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2565 put_pixels16_c(dst, src, stride, 16);
2567 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2568 avg_pixels16_c(dst, src, stride, 16);
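/* RV40 maps its mc33 sub-pel position to the plain bilinear xy2 averaging functions. */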
2571 #if CONFIG_RV40_DECODER
2572 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2573 put_pixels16_xy2_c(dst, src, stride, 16);
2575 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2576 avg_pixels16_xy2_c(dst, src, stride, 16);
2578 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2579 put_pixels8_xy2_c(dst, src, stride, 8);
2581 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2582 avg_pixels8_xy2_c(dst, src, stride, 8);
2584 #endif /* CONFIG_RV40_DECODER */
2586 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2587 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2591 const int src_1= src[ -srcStride];
2592 const int src0 = src[0 ];
2593 const int src1 = src[ srcStride];
2594 const int src2 = src[2*srcStride];
2595 const int src3 = src[3*srcStride];
2596 const int src4 = src[4*srcStride];
2597 const int src5 = src[5*srcStride];
2598 const int src6 = src[6*srcStride];
2599 const int src7 = src[7*srcStride];
2600 const int src8 = src[8*srcStride];
2601 const int src9 = src[9*srcStride];
2602 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2603 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2604 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2605 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2606 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2607 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2608 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2609 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
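/* WMV2 sub-pel motion compensation positions composed from the 4-tap filters above. */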
2615 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2617 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2618 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2621 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2622 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2625 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2627 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2628 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2631 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2632 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2635 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2639 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2640 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2641 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2642 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2644 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2648 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2649 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2650 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2651 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2653 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2655 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2656 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
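/* H.263 in-loop deblocking filter (Annex J): across each block edge the two pixels nearest
   the edge are adjusted by d1, a strength-limited ramp of the local gradient, and the outer
   pair by the clipped d2. */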
2659 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2660 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2662 const int strength= ff_h263_loop_filter_strength[qscale];
2666 int p0= src[x-2*stride];
2667 int p1= src[x-1*stride];
2668 int p2= src[x+0*stride];
2669 int p3= src[x+1*stride];
2670 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2672 if (d<-2*strength) d1= 0;
2673 else if(d<- strength) d1=-2*strength - d;
2674 else if(d< strength) d1= d;
2675 else if(d< 2*strength) d1= 2*strength - d;
2680 if(p1&256) p1= ~(p1>>31);
2681 if(p2&256) p2= ~(p2>>31);
2683 src[x-1*stride] = p1;
2684 src[x+0*stride] = p2;
2688 d2= av_clip((p0-p3)/4, -ad1, ad1);
2690 src[x-2*stride] = p0 - d2;
2691 src[x+ stride] = p3 + d2;
2696 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2697 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2699 const int strength= ff_h263_loop_filter_strength[qscale];
2703 int p0= src[y*stride-2];
2704 int p1= src[y*stride-1];
2705 int p2= src[y*stride+0];
2706 int p3= src[y*stride+1];
2707 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2709 if (d<-2*strength) d1= 0;
2710 else if(d<- strength) d1=-2*strength - d;
2711 else if(d< strength) d1= d;
2712 else if(d< 2*strength) d1= 2*strength - d;
2717 if(p1&256) p1= ~(p1>>31);
2718 if(p2&256) p2= ~(p2>>31);
2720 src[y*stride-1] = p1;
2721 src[y*stride+0] = p2;
2725 d2= av_clip((p0-p3)/4, -ad1, ad1);
2727 src[y*stride-2] = p0 - d2;
2728 src[y*stride+1] = p3 + d2;
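/* H.261 loop filter: a separable (1,2,1)/4 smoothing filter applied inside the 8x8 block,
   with the border rows and columns left unfiltered. */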
2733 static void h261_loop_filter_c(uint8_t *src, int stride){
2738 temp[x ] = 4*src[x ];
2739 temp[x + 7*8] = 4*src[x + 7*stride];
2743 xy = y * stride + x;
2745 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2750 src[ y*stride] = (temp[ y*8] + 2)>>2;
2751 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2753 xy = y * stride + x;
2755 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
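/* Sum of absolute differences (SAD) over a 16 (or 8) pixel wide block of h lines; the _x2,
   _y2 and _xy2 variants compare against the horizontally, vertically or diagonally
   half-pel averaged reference. */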
2760 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2766 s += abs(pix1[0] - pix2[0]);
2767 s += abs(pix1[1] - pix2[1]);
2768 s += abs(pix1[2] - pix2[2]);
2769 s += abs(pix1[3] - pix2[3]);
2770 s += abs(pix1[4] - pix2[4]);
2771 s += abs(pix1[5] - pix2[5]);
2772 s += abs(pix1[6] - pix2[6]);
2773 s += abs(pix1[7] - pix2[7]);
2774 s += abs(pix1[8] - pix2[8]);
2775 s += abs(pix1[9] - pix2[9]);
2776 s += abs(pix1[10] - pix2[10]);
2777 s += abs(pix1[11] - pix2[11]);
2778 s += abs(pix1[12] - pix2[12]);
2779 s += abs(pix1[13] - pix2[13]);
2780 s += abs(pix1[14] - pix2[14]);
2781 s += abs(pix1[15] - pix2[15]);
2788 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2794 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2795 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2796 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2797 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2798 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2799 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2800 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2801 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2802 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2803 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2804 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2805 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2806 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2807 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2808 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2809 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2816 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2819 uint8_t *pix3 = pix2 + line_size;
2823 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2824 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2825 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2826 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2827 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2828 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2829 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2830 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2831 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2832 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2833 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2834 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2835 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2836 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2837 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2838 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2846 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2849 uint8_t *pix3 = pix2 + line_size;
2853 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2854 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2855 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2856 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2857 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2858 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2859 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2860 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2861 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2862 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2863 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2864 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2865 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2866 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2867 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2868 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2876 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2882 s += abs(pix1[0] - pix2[0]);
2883 s += abs(pix1[1] - pix2[1]);
2884 s += abs(pix1[2] - pix2[2]);
2885 s += abs(pix1[3] - pix2[3]);
2886 s += abs(pix1[4] - pix2[4]);
2887 s += abs(pix1[5] - pix2[5]);
2888 s += abs(pix1[6] - pix2[6]);
2889 s += abs(pix1[7] - pix2[7]);
2896 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2902 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2903 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2904 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2905 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2906 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2907 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2908 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2909 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2916 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2919 uint8_t *pix3 = pix2 + line_size;
2923 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2924 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2925 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2926 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2927 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2928 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2929 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2930 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2938 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2941 uint8_t *pix3 = pix2 + line_size;
2945 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2946 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2947 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2948 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2949 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2950 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2951 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2952 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
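/* Noise-shaping SSE: the squared error (score1) plus the difference in local 2x2 detail
   between the two blocks (score2), weighted by avctx->nsse_weight (8 if no context is set). */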
2960 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2961 MpegEncContext *c = v;
2967 for(x=0; x<16; x++){
2968 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2971 for(x=0; x<15; x++){
2972 score2+= FFABS( s1[x ] - s1[x +stride]
2973 - s1[x+1] + s1[x+1+stride])
2974 -FFABS( s2[x ] - s2[x +stride]
2975 - s2[x+1] + s2[x+1+stride]);
2982 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
2983 else return score1 + FFABS(score2)*8;
2986 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2987 MpegEncContext *c = v;
2994 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2998 score2+= FFABS( s1[x ] - s1[x +stride]
2999 - s1[x+1] + s1[x+1+stride])
3000 -FFABS( s2[x ] - s2[x +stride]
3001 - s2[x+1] + s2[x+1+stride]);
3008 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3009 else return score1 + FFABS(score2)*8;
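/* try_8x8basis/add_8x8basis evaluate and apply the addition of a scaled basis function to
   the residual; they are used by the quantizer noise shaping (QNS) code of the encoder. */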
3012 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3016 for(i=0; i<8*8; i++){
3017 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3020 assert(-512<b && b<512);
3022 sum += (w*b)*(w*b)>>4;
3027 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3030 for(i=0; i<8*8; i++){
3031 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3036 * permutes an 8x8 block.
3037 * @param block the block which will be permuted according to the given permutation vector
3038 * @param permutation the permutation vector
3039 * @param last the last non-zero coefficient in scantable order, used to speed up the permutation
3040 * @param scantable the scantable used; it is only needed to speed up the permutation, the block is not
3041 * (inversely) permuted into scantable order!
3043 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3049 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3051 for(i=0; i<=last; i++){
3052 const int j= scantable[i];
3057 for(i=0; i<=last; i++){
3058 const int j= scantable[i];
3059 const int perm_j= permutation[j];
3060 block[perm_j]= temp[j];
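/* ff_set_cmp fills cmp[] with the comparison functions (SAD, SATD, SSE, ...) selected by
   the FF_CMP_* type; unknown types hit the av_log() error below. */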
3064 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3068 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3071 memset(cmp, 0, sizeof(void*)*6);
3079 cmp[i]= c->hadamard8_diff[i];
3085 cmp[i]= c->dct_sad[i];
3088 cmp[i]= c->dct264_sad[i];
3091 cmp[i]= c->dct_max[i];
3094 cmp[i]= c->quant_psnr[i];
3123 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3128 static void clear_block_c(DCTELEM *block)
3130 memset(block, 0, sizeof(DCTELEM)*64);
3134 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3136 static void clear_blocks_c(DCTELEM *blocks)
3138 memset(blocks, 0, sizeof(DCTELEM)*6*64);
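/* Byte-wise add/subtract used by lossless codecs such as HuffYUV: whole machine words are
   processed at once, the pb_7f/pb_80 masks keeping carries from crossing byte boundaries. */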
3141 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3143 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3144 long a = *(long*)(src+i);
3145 long b = *(long*)(dst+i);
3146 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3149 dst[i+0] += src[i+0];
3152 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3154 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3155 long a = *(long*)(src1+i);
3156 long b = *(long*)(src2+i);
3157 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3160 dst[i] = src1[i]+src2[i];
3163 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3165 #if !HAVE_FAST_UNALIGNED
3166 if((long)src2 & (sizeof(long)-1)){
3167 for(i=0; i+7<w; i+=8){
3168 dst[i+0] = src1[i+0]-src2[i+0];
3169 dst[i+1] = src1[i+1]-src2[i+1];
3170 dst[i+2] = src1[i+2]-src2[i+2];
3171 dst[i+3] = src1[i+3]-src2[i+3];
3172 dst[i+4] = src1[i+4]-src2[i+4];
3173 dst[i+5] = src1[i+5]-src2[i+5];
3174 dst[i+6] = src1[i+6]-src2[i+6];
3175 dst[i+7] = src1[i+7]-src2[i+7];
3179 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3180 long a = *(long*)(src1+i);
3181 long b = *(long*)(src2+i);
3182 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3185 dst[i+0] = src1[i+0]-src2[i+0];
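/* HuffYUV median prediction: each sample is predicted as mid_pred(left, top, left+top-topleft);
   the add_ variant reconstructs samples from the residual, the sub_ variant produces it. */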
3188 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
3196 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3205 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
3213 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3223 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
3226 for(i=0; i<w-1; i++){
3253 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
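/* 8x8 Hadamard transform of the difference block (SATD): BUTTERFLY2/BUTTERFLY1 are the
   butterfly stages, BUTTERFLYA folds the last stage into the absolute-value accumulation. */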
3283 #define BUTTERFLY2(o1,o2,i1,i2) \
3287 #define BUTTERFLY1(x,y) \
3296 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3298 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3306 //FIXME try pointer walks
3307 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3308 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3309 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3310 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3312 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3313 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3314 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3315 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3317 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3318 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3319 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3320 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3324 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3325 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3326 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3327 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3329 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3330 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3331 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3332 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3335 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3336 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3337 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3338 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3344 printf("MAX:%d\n", maxi);
3350 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3358 //FIXME try pointer walks
3359 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3360 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3361 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3362 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3364 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3365 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3366 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3367 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3369 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3370 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3371 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3372 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3376 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3377 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3378 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3379 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3381 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3382 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3383 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3384 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3387 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3388 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3389 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3390 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3393 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
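/* dct_sad: forward DCT of the difference block, then sum of the absolute coefficients. */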
3398 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3399 MpegEncContext * const s= (MpegEncContext *)c;
3400 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3404 s->dsp.diff_pixels(temp, src1, src2, stride);\
3405 s->dsp.fdct(temp);\
3406 return s->dsp.sum_abs_dctelem(temp);\
3411 const int s07 = SRC(0) + SRC(7);\
3412 const int s16 = SRC(1) + SRC(6);\
3413 const int s25 = SRC(2) + SRC(5);\
3414 const int s34 = SRC(3) + SRC(4);\
3415 const int a0 = s07 + s34;\
3416 const int a1 = s16 + s25;\
3417 const int a2 = s07 - s34;\
3418 const int a3 = s16 - s25;\
3419 const int d07 = SRC(0) - SRC(7);\
3420 const int d16 = SRC(1) - SRC(6);\
3421 const int d25 = SRC(2) - SRC(5);\
3422 const int d34 = SRC(3) - SRC(4);\
3423 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3424 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3425 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3426 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3428 DST(1, a4 + (a7>>2)) ;\
3429 DST(2, a2 + (a3>>1)) ;\
3430 DST(3, a5 + (a6>>2)) ;\
3432 DST(5, a6 - (a5>>2)) ;\
3433 DST(6, (a2>>1) - a3 ) ;\
3434 DST(7, (a4>>2) - a7 ) ;\
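/* dct264_sad: same idea using the H.264-style 8x8 integer transform DCT8_1D, applied to
   rows and then columns. */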
3437 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3438 MpegEncContext * const s= (MpegEncContext *)c;
3443 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3445 #define SRC(x) dct[i][x]
3446 #define DST(x,v) dct[i][x]= v
3447 for( i = 0; i < 8; i++ )
3452 #define SRC(x) dct[x][i]
3453 #define DST(x,v) sum += FFABS(v)
3454 for( i = 0; i < 8; i++ )
3462 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3463 MpegEncContext * const s= (MpegEncContext *)c;
3464 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3469 s->dsp.diff_pixels(temp, src1, src2, stride);
3473 sum= FFMAX(sum, FFABS(temp[i]));
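/* quant_psnr: error introduced by the quantize/dequantize (and IDCT) round trip on the
   difference block. */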
3478 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3479 MpegEncContext * const s= (MpegEncContext *)c;
3480 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3481 DCTELEM * const bak = temp+64;
3487 s->dsp.diff_pixels(temp, src1, src2, stride);
3489 memcpy(bak, temp, 64*sizeof(DCTELEM));
3491 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3492 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3493 ff_simple_idct(temp); //FIXME
3496 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
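/* rd8x8: rate-distortion cost - quantize the difference block, estimate the VLC bit cost,
   reconstruct it, and return the SSE distortion plus the estimated bits scaled by qscale^2. */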
3501 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3502 MpegEncContext * const s= (MpegEncContext *)c;
3503 const uint8_t *scantable= s->intra_scantable.permutated;
3504 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3505 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3506 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3507 int i, last, run, bits, level, distortion, start_i;
3508 const int esc_length= s->ac_esc_length;
3510 uint8_t * last_length;
3514 copy_block8(lsrc1, src1, 8, stride, 8);
3515 copy_block8(lsrc2, src2, 8, stride, 8);
3517 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3519 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3525 length = s->intra_ac_vlc_length;
3526 last_length= s->intra_ac_vlc_last_length;
3527 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3530 length = s->inter_ac_vlc_length;
3531 last_length= s->inter_ac_vlc_last_length;
3536 for(i=start_i; i<last; i++){
3537 int j= scantable[i];
3542 if((level&(~127)) == 0){
3543 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3552 level= temp[i] + 64;
3556 if((level&(~127)) == 0){
3557 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3565 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3567 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3570 s->dsp.idct_add(lsrc2, 8, temp);
3572 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3574 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
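/* bit8x8: bit-count-only variant of rd8x8, without reconstruction or distortion. */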
3577 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3578 MpegEncContext * const s= (MpegEncContext *)c;
3579 const uint8_t *scantable= s->intra_scantable.permutated;
3580 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3581 int i, last, run, bits, level, start_i;
3582 const int esc_length= s->ac_esc_length;
3584 uint8_t * last_length;
3588 s->dsp.diff_pixels(temp, src1, src2, stride);
3590 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3596 length = s->intra_ac_vlc_length;
3597 last_length= s->intra_ac_vlc_last_length;
3598 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3601 length = s->inter_ac_vlc_length;
3602 last_length= s->inter_ac_vlc_last_length;
3607 for(i=start_i; i<last; i++){
3608 int j= scantable[i];
3613 if((level&(~127)) == 0){
3614 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3623 level= temp[i] + 64;
3627 if((level&(~127)) == 0){
3628 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
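/* vsad/vsse: sum of absolute (or squared) differences between vertically adjacent lines,
   taken on the source itself for the intra variants and on the residual s1-s2 for the
   inter ones. */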
3636 #define VSAD_INTRA(size) \
3637 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3641 for(y=1; y<h; y++){ \
3642 for(x=0; x<size; x+=4){ \
3643 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
3644 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
3654 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3659 for(x=0; x<16; x++){
3660 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3669 #define SQ(a) ((a)*(a))
3670 #define VSSE_INTRA(size) \
3671 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3675 for(y=1; y<h; y++){ \
3676 for(x=0; x<size; x+=4){ \
3677 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
3678 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
3688 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3693 for(x=0; x<16; x++){
3694 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3703 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3707 for(i=0; i<size; i++)
3708 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
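/* WRAPPER8_16_SQ builds the 16x16 comparison functions by summing the 8x8 results over
   the sub-blocks. */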
3712 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3713 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3714 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3716 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3718 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3719 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3720 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3721 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
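/* Simple float vector helpers (element-wise multiply, windowed overlap, clipping, dot
   products), used mostly by the audio codecs. */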
3723 static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
3725 for(i=0; i<len; i++)
3726 dst[i] = src0[i] * src1[i];
3729 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3732 for(i=0; i<len; i++)
3733 dst[i] = src0[i] * src1[-i];
3736 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
3738 for(i=0; i<len; i++)
3739 dst[i] = src0[i] * src1[i] + src2[i];
3742 static void vector_fmul_window_c(float *dst, const float *src0,
3743 const float *src1, const float *win, int len)
3749 for(i=-len, j=len-1; i<0; i++, j--) {
3754 dst[i] = s0*wj - s1*wi;
3755 dst[j] = s0*wi + s1*wj;
3759 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
3763 for (i = 0; i < len; i++)
3764 dst[i] = src[i] * mul;
3767 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
3768 const float **sv, float mul, int len)
3771 for (i = 0; i < len; i += 2, sv++) {
3772 dst[i ] = src[i ] * sv[0][0] * mul;
3773 dst[i+1] = src[i+1] * sv[0][1] * mul;
3777 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
3778 const float **sv, float mul, int len)
3781 for (i = 0; i < len; i += 4, sv++) {
3782 dst[i ] = src[i ] * sv[0][0] * mul;
3783 dst[i+1] = src[i+1] * sv[0][1] * mul;
3784 dst[i+2] = src[i+2] * sv[0][2] * mul;
3785 dst[i+3] = src[i+3] * sv[0][3] * mul;
3789 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
3793 for (i = 0; i < len; i += 2, sv++) {
3794 dst[i ] = sv[0][0] * mul;
3795 dst[i+1] = sv[0][1] * mul;
3799 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
3803 for (i = 0; i < len; i += 4, sv++) {
3804 dst[i ] = sv[0][0] * mul;
3805 dst[i+1] = sv[0][1] * mul;
3806 dst[i+2] = sv[0][2] * mul;
3807 dst[i+3] = sv[0][3] * mul;
3811 static void butterflies_float_c(float *restrict v1, float *restrict v2,
3815 for (i = 0; i < len; i++) {
3816 float t = v1[i] - v2[i];
3822 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
3827 for (i = 0; i < len; i++)
3833 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
3834 uint32_t maxi, uint32_t maxisign)
3837 if(a > mini) return mini;
3838 else if((a^(1<<31)) > maxisign) return maxi;
3842 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
3844 uint32_t mini = *(uint32_t*)min;
3845 uint32_t maxi = *(uint32_t*)max;
3846 uint32_t maxisign = maxi ^ (1<<31);
3847 uint32_t *dsti = (uint32_t*)dst;
3848 const uint32_t *srci = (const uint32_t*)src;
3849 for(i=0; i<len; i+=8) {
3850 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
3851 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
3852 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
3853 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
3854 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
3855 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
3856 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
3857 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
3860 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
3862 if(min < 0 && max > 0) {
3863 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
3865 for(i=0; i < len; i+=8) {
3866 dst[i ] = av_clipf(src[i ], min, max);
3867 dst[i + 1] = av_clipf(src[i + 1], min, max);
3868 dst[i + 2] = av_clipf(src[i + 2], min, max);
3869 dst[i + 3] = av_clipf(src[i + 3], min, max);
3870 dst[i + 4] = av_clipf(src[i + 4], min, max);
3871 dst[i + 5] = av_clipf(src[i + 5], min, max);
3872 dst[i + 6] = av_clipf(src[i + 6], min, max);
3873 dst[i + 7] = av_clipf(src[i + 7], min, max);
3878 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
3883 res += (*v1++ * *v2++) >> shift;
3888 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
3893 *v1++ += mul * *v3++;
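/* WMV2 8x8 inverse DCT: wmv2_idct_row/col below are the row and column passes; W1..W7 are
   cos(k*pi/16) scaled by 2048*sqrt(2). */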
3899 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3900 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3901 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3902 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3903 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3904 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3905 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
3907 static void wmv2_idct_row(short * b)
3910 int a0,a1,a2,a3,a4,a5,a6,a7;
3912 a1 = W1*b[1]+W7*b[7];
3913 a7 = W7*b[1]-W1*b[7];
3914 a5 = W5*b[5]+W3*b[3];
3915 a3 = W3*b[5]-W5*b[3];
3916 a2 = W2*b[2]+W6*b[6];
3917 a6 = W6*b[2]-W2*b[6];
3918 a0 = W0*b[0]+W0*b[4];
3919 a4 = W0*b[0]-W0*b[4];
3921 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3922 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3924 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3925 b[1] = (a4+a6 +s1 + (1<<7))>>8;
3926 b[2] = (a4-a6 +s2 + (1<<7))>>8;
3927 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3928 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3929 b[5] = (a4-a6 -s2 + (1<<7))>>8;
3930 b[6] = (a4+a6 -s1 + (1<<7))>>8;
3931 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
3933 static void wmv2_idct_col(short * b)
3936 int a0,a1,a2,a3,a4,a5,a6,a7;
3937 /*step 1, with extended precision*/
3938 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3939 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3940 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3941 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3942 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3943 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3944 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
3945 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
3947 s1 = (181*(a1-a5+a7-a3)+128)>>8;
3948 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3950 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3951 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
3952 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
3953 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3955 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3956 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
3957 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
3958 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
3960 void ff_wmv2_idct_c(short * block){
3964 wmv2_idct_row(block+i);
3967 wmv2_idct_col(block+i);
3970 /* XXX: those functions should be suppressed ASAP when all IDCTs are converted */
3972 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
3974 ff_wmv2_idct_c(block);
3975 ff_put_pixels_clamped_c(block, dest, line_size);
3977 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
3979 ff_wmv2_idct_c(block);
3980 ff_add_pixels_clamped_c(block, dest, line_size);
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
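/* cm[] above is the clipping table set up in dsputil_static_init() below:
 * indexing ff_cropTbl at MAX_NEG_CROP + x returns x clamped to [0, 255] for
 * any x in [-MAX_NEG_CROP, 255 + MAX_NEG_CROP]. */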
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
/* init static data */
av_cold void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
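/* i.e. inv_zigzag_direct16[] maps a coefficient's natural (raster-order)
 * position to its 1-based position in zigzag scan order. */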
int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED(16, int, aligned);

    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#if CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct    = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct    = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct    = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                 avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
            c->idct     = ff_bink_idct_c;
            c->idct_add = ff_bink_idct_add_c;
            c->idct_put = ff_bink_idct_put_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->emulated_edge_mc = ff_emulated_edge_mc;

    c->clear_block = clear_block_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->scale_block = scale_block_c;
    /* pix_abs[0][] operates on 16x16 blocks, pix_abs[1][] on 8x8 blocks */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
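/* For example, dspfunc(put, 0, 16) expands to:
 *   c->put_pixels_tab[0][0] = put_pixels16_c;
 *   c->put_pixels_tab[0][1] = put_pixels16_x2_c;
 *   c->put_pixels_tab[0][2] = put_pixels16_y2_c;
 *   c->put_pixels_tab[0][3] = put_pixels16_xy2_c;
 * so the second index selects the half-pel case: full-pel, horizontal,
 * vertical and diagonal half-pel respectively. */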
    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put_no_rnd, 1, 8);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg_no_rnd, 1, 8);

#undef dspfunc
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
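/* The tpel (third-pel) tables are indexed as x + 4*y with x, y in {0, 1, 2},
 * which is why indices 3 and 7 are never assigned above. */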
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
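/* Here mcXY names the quarter-pel position: X is the horizontal and Y the
 * vertical quarter-sample offset, and the table index is X + 4*Y
 * (e.g. mc31 goes to slot 3 + 4*1 = 7). */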
    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->draw_edges = draw_edges_c;

#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#endif
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif
    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;
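/* e.g. SET_CMP_FUNC(hadamard8_diff) expands to
 *   c->hadamard8_diff[0]= hadamard8_diff16_c;
 *   c->hadamard8_diff[1]= hadamard8_diff8x8_c;
 * i.e. slot 0 holds the 16x16 variant and slot 1 the 8x8 variant of each
 * comparison function. */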
    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
    SET_CMP_FUNC(dct264_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    SET_CMP_FUNC(quant_psnr)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
    ff_dsputil_init_dwt(c);

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;
#if CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;
#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;

    c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
    c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;

    c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
    c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;
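/* shrink[n] downscales a plane by a factor of 2^n in each dimension, so
 * shrink[0] is simply a plain plane copy. */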
    c->prefetch= just_return;

    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
    if (HAVE_MMX)    dsputil_init_mmx  (c, avctx);
    if (ARCH_ARM)    dsputil_init_arm  (c, avctx);
    if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
    if (HAVE_VIS)    dsputil_init_vis  (c, avctx);
    if (ARCH_ALPHA)  dsputil_init_alpha(c, avctx);
    if (ARCH_PPC)    dsputil_init_ppc  (c, avctx);
    if (HAVE_MMI)    dsputil_init_mmi  (c, avctx);
    if (ARCH_SH4)    dsputil_init_sh4  (c, avctx);
    if (ARCH_BFIN)   dsputil_init_bfin (c, avctx);
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];

    c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
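/* idct_permutation[] maps a coefficient's natural (raster) position inside
 * the 8x8 block to the position the selected IDCT implementation expects,
 * so scan tables can be permuted once at init time instead of reordering
 * coefficients per block.  Worked example for FF_TRANSPOSE_IDCT_PERM below,
 * which computes ((i&7)<<3) | (i>>3): for i = 10 (row 1, column 2) this
 * gives (2<<3) | 1 = 17, i.e. row 2, column 1 -- the transposed position. */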
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");