3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44 uint32_t ff_squareTbl[512] = {0, };
46 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f etc., depending on the size of the CPU's native unsigned long
47 #define pb_7f (~0UL/255 * 0x7f)
48 #define pb_80 (~0UL/255 * 0x80)
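/* Worked example of the trick above: ~0UL/255 gives a word with 0x01 in every byte
 * (0x01010101 for a 32-bit unsigned long, 0x0101010101010101 for 64 bit), so
 * multiplying it by 0x7f or 0x80 replicates that byte across the whole word:
 *     32 bit: 0x01010101         * 0x7f = 0x7f7f7f7f
 *     64 bit: 0x0101010101010101 * 0x80 = 0x8080808080808080
 */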
50 const uint8_t ff_zigzag_direct[64] = {
51 0, 1, 8, 16, 9, 2, 3, 10,
52 17, 24, 32, 25, 18, 11, 4, 5,
53 12, 19, 26, 33, 40, 48, 41, 34,
54 27, 20, 13, 6, 7, 14, 21, 28,
55 35, 42, 49, 56, 57, 50, 43, 36,
56 29, 22, 15, 23, 30, 37, 44, 51,
57 58, 59, 52, 45, 38, 31, 39, 46,
58 53, 60, 61, 54, 47, 55, 62, 63
61 /* Specific zigzag scan for 248 idct. NOTE that unlike the
62 specification, we interleave the fields */
63 const uint8_t ff_zigzag248_direct[64] = {
64 0, 8, 1, 9, 16, 24, 2, 10,
65 17, 25, 32, 40, 48, 56, 33, 41,
66 18, 26, 3, 11, 4, 12, 19, 27,
67 34, 42, 49, 57, 50, 58, 35, 43,
68 20, 28, 5, 13, 6, 14, 21, 29,
69 36, 44, 51, 59, 52, 60, 37, 45,
70 22, 30, 7, 15, 23, 31, 38, 46,
71 53, 61, 54, 62, 39, 47, 55, 63,
74 /* non-permuted inverse of zigzag_direct, plus 1, for the MMX quantizer */
75 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
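/* Presumably filled during static init (not part of this excerpt) so that
 * inv_zigzag_direct16[ff_zigzag_direct[i]] == i + 1; the +1 keeps entry 0
 * distinguishable from "no coefficient" when the MMX quantizer tracks the
 * position of the last nonzero level with a max operation. */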
77 const uint8_t ff_alternate_horizontal_scan[64] = {
78 0, 1, 2, 3, 8, 9, 16, 17,
79 10, 11, 4, 5, 6, 7, 15, 14,
80 13, 12, 19, 18, 24, 25, 32, 33,
81 26, 27, 20, 21, 22, 23, 28, 29,
82 30, 31, 34, 35, 40, 41, 48, 49,
83 42, 43, 36, 37, 38, 39, 44, 45,
84 46, 47, 50, 51, 56, 57, 58, 59,
85 52, 53, 54, 55, 60, 61, 62, 63,
88 const uint8_t ff_alternate_vertical_scan[64] = {
89 0, 8, 16, 24, 1, 9, 2, 10,
90 17, 25, 32, 40, 48, 56, 57, 49,
91 41, 33, 26, 18, 3, 11, 4, 12,
92 19, 27, 34, 42, 50, 58, 35, 43,
93 51, 59, 20, 28, 5, 13, 6, 14,
94 21, 29, 36, 44, 52, 60, 37, 45,
95 53, 61, 22, 30, 7, 15, 23, 31,
96 38, 46, 54, 62, 39, 47, 55, 63,
99 /* Input permutation for the simple_idct_mmx */
100 static const uint8_t simple_mmx_permutation[64]={
101 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
102 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
103 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
104 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
105 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
106 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
107 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
108 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
111 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
113 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
117 st->scantable= src_scantable;
121 j = src_scantable[i];
122 st->permutated[i] = permutation[j];
131 j = st->permutated[i];
133 st->raster_end[i]= end;
137 static int pix_sum_c(uint8_t * pix, int line_size)
142 for (i = 0; i < 16; i++) {
143 for (j = 0; j < 16; j += 8) {
154 pix += line_size - 16;
159 static int pix_norm1_c(uint8_t * pix, int line_size)
162 uint32_t *sq = ff_squareTbl + 256; /* sq[d] == d*d for d in -256..255 */
165 for (i = 0; i < 16; i++) {
166 for (j = 0; j < 16; j += 8) {
177 #if LONG_MAX > 2147483647
178 register uint64_t x=*(uint64_t*)pix;
180 s += sq[(x>>8)&0xff];
181 s += sq[(x>>16)&0xff];
182 s += sq[(x>>24)&0xff];
183 s += sq[(x>>32)&0xff];
184 s += sq[(x>>40)&0xff];
185 s += sq[(x>>48)&0xff];
186 s += sq[(x>>56)&0xff];
188 register uint32_t x=*(uint32_t*)pix;
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 x=*(uint32_t*)(pix+4);
195 s += sq[(x>>8)&0xff];
196 s += sq[(x>>16)&0xff];
197 s += sq[(x>>24)&0xff];
202 pix += line_size - 16;
207 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
210 for(i=0; i+8<=w; i+=8){
211 dst[i+0]= av_bswap32(src[i+0]);
212 dst[i+1]= av_bswap32(src[i+1]);
213 dst[i+2]= av_bswap32(src[i+2]);
214 dst[i+3]= av_bswap32(src[i+3]);
215 dst[i+4]= av_bswap32(src[i+4]);
216 dst[i+5]= av_bswap32(src[i+5]);
217 dst[i+6]= av_bswap32(src[i+6]);
218 dst[i+7]= av_bswap32(src[i+7]);
221 dst[i+0]= av_bswap32(src[i+0]);
225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
228 uint32_t *sq = ff_squareTbl + 256; /* centered, so negative differences index correctly */
231 for (i = 0; i < h; i++) {
232 s += sq[pix1[0] - pix2[0]];
233 s += sq[pix1[1] - pix2[1]];
234 s += sq[pix1[2] - pix2[2]];
235 s += sq[pix1[3] - pix2[3]];
242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
245 uint32_t *sq = ff_squareTbl + 256;
248 for (i = 0; i < h; i++) {
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
253 s += sq[pix1[4] - pix2[4]];
254 s += sq[pix1[5] - pix2[5]];
255 s += sq[pix1[6] - pix2[6]];
256 s += sq[pix1[7] - pix2[7]];
263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
266 uint32_t *sq = ff_squareTbl + 256;
269 for (i = 0; i < h; i++) {
270 s += sq[pix1[ 0] - pix2[ 0]];
271 s += sq[pix1[ 1] - pix2[ 1]];
272 s += sq[pix1[ 2] - pix2[ 2]];
273 s += sq[pix1[ 3] - pix2[ 3]];
274 s += sq[pix1[ 4] - pix2[ 4]];
275 s += sq[pix1[ 5] - pix2[ 5]];
276 s += sq[pix1[ 6] - pix2[ 6]];
277 s += sq[pix1[ 7] - pix2[ 7]];
278 s += sq[pix1[ 8] - pix2[ 8]];
279 s += sq[pix1[ 9] - pix2[ 9]];
280 s += sq[pix1[10] - pix2[10]];
281 s += sq[pix1[11] - pix2[11]];
282 s += sq[pix1[12] - pix2[12]];
283 s += sq[pix1[13] - pix2[13]];
284 s += sq[pix1[14] - pix2[14]];
285 s += sq[pix1[15] - pix2[15]];
293 /* draw the edges of width 'w' of an image of size width, height */
294 //FIXME check that this is ok for mpeg4 interlaced
295 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
297 uint8_t *ptr, *last_line;
300 last_line = buf + (height - 1) * wrap;
303 memcpy(buf - (i + 1) * wrap, buf, width);
304 memcpy(last_line + (i + 1) * wrap, last_line, width);
308 for(i=0;i<height;i++) {
309 memset(ptr - w, ptr[0], w);
310 memset(ptr + width, ptr[width-1], w);
315 memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
316 memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
317 memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
318 memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
323 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
324 * @param buf destination buffer
325 * @param src source buffer
326 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
327 * @param block_w width of block
328 * @param block_h height of block
329 * @param src_x x coordinate of the top left sample of the block in the source buffer
330 * @param src_y y coordinate of the top left sample of the block in the source buffer
331 * @param w width of the source buffer
332 * @param h height of the source buffer
334 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
335 int src_x, int src_y, int w, int h){
337 int start_y, start_x, end_y, end_x;
340 src+= (h-1-src_y)*linesize;
342 }else if(src_y<=-block_h){
343 src+= (1-block_h-src_y)*linesize;
349 }else if(src_x<=-block_w){
350 src+= (1-block_w-src_x);
354 start_y= FFMAX(0, -src_y);
355 start_x= FFMAX(0, -src_x);
356 end_y= FFMIN(block_h, h-src_y);
357 end_x= FFMIN(block_w, w-src_x);
358 assert(start_y < end_y && block_h);
359 assert(start_x < end_x && block_w);
362 src += start_y*linesize + start_x;
366 for(y=0; y<start_y; y++){
371 // copy existing part
380 for(; y<block_h; y++){
385 buf -= block_h * linesize + start_x;
388 for(x=0; x<start_x; x++){
389 buf[x] = buf[start_x];
393 for(x=end_x; x<block_w; x++){
394 buf[x] = buf[end_x - 1];
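/* Usage sketch (illustrative only, not code from this file): callers pass a src
 * pointer that is already offset to the requested block position, and fall back to
 * this function only when the block extends outside the w x h area, e.g.:
 *
 *     if (src_x < 0 || src_y < 0 || src_x + block_w > w || src_y + block_h > h) {
 *         ff_emulated_edge_mc(edge_buf, ref + src_y * linesize + src_x, linesize,
 *                             block_w, block_h, src_x, src_y, w, h);
 *         ref = edge_buf;   // read the padded copy instead of the real frame
 *     }
 *
 * edge_buf is a hypothetical scratch buffer name used only for this example. */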
400 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
404 /* read the pixels */
406 block[0] = pixels[0];
407 block[1] = pixels[1];
408 block[2] = pixels[2];
409 block[3] = pixels[3];
410 block[4] = pixels[4];
411 block[5] = pixels[5];
412 block[6] = pixels[6];
413 block[7] = pixels[7];
419 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
420 const uint8_t *s2, int stride){
423 /* read the pixels */
425 block[0] = s1[0] - s2[0];
426 block[1] = s1[1] - s2[1];
427 block[2] = s1[2] - s2[2];
428 block[3] = s1[3] - s2[3];
429 block[4] = s1[4] - s2[4];
430 block[5] = s1[5] - s2[5];
431 block[6] = s1[6] - s2[6];
432 block[7] = s1[7] - s2[7];
440 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
444 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
446 /* copy block to pixels, clamping to 0..255 */
448 pixels[0] = cm[block[0]];
449 pixels[1] = cm[block[1]];
450 pixels[2] = cm[block[2]];
451 pixels[3] = cm[block[3]];
452 pixels[4] = cm[block[4]];
453 pixels[5] = cm[block[5]];
454 pixels[6] = cm[block[6]];
455 pixels[7] = cm[block[7]];
462 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
466 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
468 /* copy block to pixels, clamping to 0..255 */
470 pixels[0] = cm[block[0]];
471 pixels[1] = cm[block[1]];
472 pixels[2] = cm[block[2]];
473 pixels[3] = cm[block[3]];
480 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
484 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
486 /* copy block to pixels, clamping to 0..255 */
488 pixels[0] = cm[block[0]];
489 pixels[1] = cm[block[1]];
496 static void put_signed_pixels_clamped_c(const DCTELEM *block,
497 uint8_t *restrict pixels,
502 for (i = 0; i < 8; i++) {
503 for (j = 0; j < 8; j++) {
506 else if (*block > 127)
509 *pixels = (uint8_t)(*block + 128);
513 pixels += (line_size - 8);
517 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
522 /* copy block to pixels without clamping */
524 pixels[0] = block[0];
525 pixels[1] = block[1];
526 pixels[2] = block[2];
527 pixels[3] = block[3];
528 pixels[4] = block[4];
529 pixels[5] = block[5];
530 pixels[6] = block[6];
531 pixels[7] = block[7];
538 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
542 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
544 /* add block to pixels, clamping to 0..255 */
546 pixels[0] = cm[pixels[0] + block[0]];
547 pixels[1] = cm[pixels[1] + block[1]];
548 pixels[2] = cm[pixels[2] + block[2]];
549 pixels[3] = cm[pixels[3] + block[3]];
550 pixels[4] = cm[pixels[4] + block[4]];
551 pixels[5] = cm[pixels[5] + block[5]];
552 pixels[6] = cm[pixels[6] + block[6]];
553 pixels[7] = cm[pixels[7] + block[7]];
559 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
563 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
565 /* add block to pixels, clamping to 0..255 */
567 pixels[0] = cm[pixels[0] + block[0]];
568 pixels[1] = cm[pixels[1] + block[1]];
569 pixels[2] = cm[pixels[2] + block[2]];
570 pixels[3] = cm[pixels[3] + block[3]];
576 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
580 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
582 /* add block to pixels, clamping to 0..255 */
584 pixels[0] = cm[pixels[0] + block[0]];
585 pixels[1] = cm[pixels[1] + block[1]];
591 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
595 pixels[0] += block[0];
596 pixels[1] += block[1];
597 pixels[2] += block[2];
598 pixels[3] += block[3];
599 pixels[4] += block[4];
600 pixels[5] += block[5];
601 pixels[6] += block[6];
602 pixels[7] += block[7];
608 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
612 pixels[0] += block[0];
613 pixels[1] += block[1];
614 pixels[2] += block[2];
615 pixels[3] += block[3];
621 static int sum_abs_dctelem_c(DCTELEM *block)
625 sum+= FFABS(block[i]);
629 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
633 for (i = 0; i < h; i++) {
634 memset(block, value, 16);
639 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
643 for (i = 0; i < h; i++) {
644 memset(block, value, 8);
649 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
652 uint16_t *dst1 = (uint16_t *) dst;
653 uint16_t *dst2 = (uint16_t *)(dst + linesize);
655 for (j = 0; j < 8; j++) {
656 for (i = 0; i < 8; i++) {
657 dst1[i] = dst2[i] = src[i] * 0x0101; /* replicate the byte into both halves of the 16-bit word */
667 #define PIXOP2(OPNAME, OP) \
668 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
672 OP(*((uint64_t*)block), AV_RN64(pixels));\
678 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
682 const uint64_t a= AV_RN64(pixels );\
683 const uint64_t b= AV_RN64(pixels+1);\
684 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
690 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
694 const uint64_t a= AV_RN64(pixels );\
695 const uint64_t b= AV_RN64(pixels+1);\
696 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
702 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
706 const uint64_t a= AV_RN64(pixels );\
707 const uint64_t b= AV_RN64(pixels+line_size);\
708 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
714 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
718 const uint64_t a= AV_RN64(pixels );\
719 const uint64_t b= AV_RN64(pixels+line_size);\
720 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
726 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
729 const uint64_t a= AV_RN64(pixels );\
730 const uint64_t b= AV_RN64(pixels+1);\
731 uint64_t l0= (a&0x0303030303030303ULL)\
732 + (b&0x0303030303030303ULL)\
733 + 0x0202020202020202ULL;\
734 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
735 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
739 for(i=0; i<h; i+=2){\
740 uint64_t a= AV_RN64(pixels );\
741 uint64_t b= AV_RN64(pixels+1);\
742 l1= (a&0x0303030303030303ULL)\
743 + (b&0x0303030303030303ULL);\
744 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
745 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
746 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
749 a= AV_RN64(pixels );\
750 b= AV_RN64(pixels+1);\
751 l0= (a&0x0303030303030303ULL)\
752 + (b&0x0303030303030303ULL)\
753 + 0x0202020202020202ULL;\
754 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
755 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
756 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
762 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
765 const uint64_t a= AV_RN64(pixels );\
766 const uint64_t b= AV_RN64(pixels+1);\
767 uint64_t l0= (a&0x0303030303030303ULL)\
768 + (b&0x0303030303030303ULL)\
769 + 0x0101010101010101ULL;\
770 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
771 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
775 for(i=0; i<h; i+=2){\
776 uint64_t a= AV_RN64(pixels );\
777 uint64_t b= AV_RN64(pixels+1);\
778 l1= (a&0x0303030303030303ULL)\
779 + (b&0x0303030303030303ULL);\
780 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
781 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
782 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
785 a= AV_RN64(pixels );\
786 b= AV_RN64(pixels+1);\
787 l0= (a&0x0303030303030303ULL)\
788 + (b&0x0303030303030303ULL)\
789 + 0x0101010101010101ULL;\
790 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
791 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
792 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
798 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
799 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
800 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
801 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
802 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
803 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
804 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
806 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
807 #else // end of the 64-bit (uint64_t) variant; the 32-bit variant follows
809 #define PIXOP2(OPNAME, OP) \
810 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
813 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
818 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
821 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
826 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
829 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
830 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
835 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
836 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
839 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
840 int src_stride1, int src_stride2, int h){\
844 a= AV_RN32(&src1[i*src_stride1 ]);\
845 b= AV_RN32(&src2[i*src_stride2 ]);\
846 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
847 a= AV_RN32(&src1[i*src_stride1+4]);\
848 b= AV_RN32(&src2[i*src_stride2+4]);\
849 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
853 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
854 int src_stride1, int src_stride2, int h){\
858 a= AV_RN32(&src1[i*src_stride1 ]);\
859 b= AV_RN32(&src2[i*src_stride2 ]);\
860 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
861 a= AV_RN32(&src1[i*src_stride1+4]);\
862 b= AV_RN32(&src2[i*src_stride2+4]);\
863 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
867 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
868 int src_stride1, int src_stride2, int h){\
872 a= AV_RN32(&src1[i*src_stride1 ]);\
873 b= AV_RN32(&src2[i*src_stride2 ]);\
874 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
878 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
879 int src_stride1, int src_stride2, int h){\
883 a= AV_RN16(&src1[i*src_stride1 ]);\
884 b= AV_RN16(&src2[i*src_stride2 ]);\
885 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
889 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
890 int src_stride1, int src_stride2, int h){\
891 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
892 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
895 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
896 int src_stride1, int src_stride2, int h){\
897 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
898 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
901 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
902 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
905 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
906 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
909 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
910 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
913 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
914 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
917 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
918 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
921 uint32_t a, b, c, d, l0, l1, h0, h1;\
922 a= AV_RN32(&src1[i*src_stride1]);\
923 b= AV_RN32(&src2[i*src_stride2]);\
924 c= AV_RN32(&src3[i*src_stride3]);\
925 d= AV_RN32(&src4[i*src_stride4]);\
926 l0= (a&0x03030303UL)\
929 h0= ((a&0xFCFCFCFCUL)>>2)\
930 + ((b&0xFCFCFCFCUL)>>2);\
931 l1= (c&0x03030303UL)\
933 h1= ((c&0xFCFCFCFCUL)>>2)\
934 + ((d&0xFCFCFCFCUL)>>2);\
935 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
936 a= AV_RN32(&src1[i*src_stride1+4]);\
937 b= AV_RN32(&src2[i*src_stride2+4]);\
938 c= AV_RN32(&src3[i*src_stride3+4]);\
939 d= AV_RN32(&src4[i*src_stride4+4]);\
940 l0= (a&0x03030303UL)\
943 h0= ((a&0xFCFCFCFCUL)>>2)\
944 + ((b&0xFCFCFCFCUL)>>2);\
945 l1= (c&0x03030303UL)\
947 h1= ((c&0xFCFCFCFCUL)>>2)\
948 + ((d&0xFCFCFCFCUL)>>2);\
949 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
953 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
954 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
957 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
958 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
961 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
962 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
965 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
966 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
969 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
970 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
973 uint32_t a, b, c, d, l0, l1, h0, h1;\
974 a= AV_RN32(&src1[i*src_stride1]);\
975 b= AV_RN32(&src2[i*src_stride2]);\
976 c= AV_RN32(&src3[i*src_stride3]);\
977 d= AV_RN32(&src4[i*src_stride4]);\
978 l0= (a&0x03030303UL)\
981 h0= ((a&0xFCFCFCFCUL)>>2)\
982 + ((b&0xFCFCFCFCUL)>>2);\
983 l1= (c&0x03030303UL)\
985 h1= ((c&0xFCFCFCFCUL)>>2)\
986 + ((d&0xFCFCFCFCUL)>>2);\
987 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
988 a= AV_RN32(&src1[i*src_stride1+4]);\
989 b= AV_RN32(&src2[i*src_stride2+4]);\
990 c= AV_RN32(&src3[i*src_stride3+4]);\
991 d= AV_RN32(&src4[i*src_stride4+4]);\
992 l0= (a&0x03030303UL)\
995 h0= ((a&0xFCFCFCFCUL)>>2)\
996 + ((b&0xFCFCFCFCUL)>>2);\
997 l1= (c&0x03030303UL)\
999 h1= ((c&0xFCFCFCFCUL)>>2)\
1000 + ((d&0xFCFCFCFCUL)>>2);\
1001 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1004 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1005 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1006 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1007 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1009 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1010 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1011 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1012 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1015 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1017 int i, a0, b0, a1, b1;\
1024 for(i=0; i<h; i+=2){\
1030 block[0]= (a1+a0)>>2; /* FIXME non put */\
1031 block[1]= (b1+b0)>>2;\
1041 block[0]= (a1+a0)>>2;\
1042 block[1]= (b1+b0)>>2;\
1048 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1051 const uint32_t a= AV_RN32(pixels );\
1052 const uint32_t b= AV_RN32(pixels+1);\
1053 uint32_t l0= (a&0x03030303UL)\
1056 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1057 + ((b&0xFCFCFCFCUL)>>2);\
1061 for(i=0; i<h; i+=2){\
1062 uint32_t a= AV_RN32(pixels );\
1063 uint32_t b= AV_RN32(pixels+1);\
1064 l1= (a&0x03030303UL)\
1065 + (b&0x03030303UL);\
1066 h1= ((a&0xFCFCFCFCUL)>>2)\
1067 + ((b&0xFCFCFCFCUL)>>2);\
1068 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1071 a= AV_RN32(pixels );\
1072 b= AV_RN32(pixels+1);\
1073 l0= (a&0x03030303UL)\
1076 h0= ((a&0xFCFCFCFCUL)>>2)\
1077 + ((b&0xFCFCFCFCUL)>>2);\
1078 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1084 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1087 for(j=0; j<2; j++){\
1089 const uint32_t a= AV_RN32(pixels );\
1090 const uint32_t b= AV_RN32(pixels+1);\
1091 uint32_t l0= (a&0x03030303UL)\
1094 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1095 + ((b&0xFCFCFCFCUL)>>2);\
1099 for(i=0; i<h; i+=2){\
1100 uint32_t a= AV_RN32(pixels );\
1101 uint32_t b= AV_RN32(pixels+1);\
1102 l1= (a&0x03030303UL)\
1103 + (b&0x03030303UL);\
1104 h1= ((a&0xFCFCFCFCUL)>>2)\
1105 + ((b&0xFCFCFCFCUL)>>2);\
1106 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1109 a= AV_RN32(pixels );\
1110 b= AV_RN32(pixels+1);\
1111 l0= (a&0x03030303UL)\
1114 h0= ((a&0xFCFCFCFCUL)>>2)\
1115 + ((b&0xFCFCFCFCUL)>>2);\
1116 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1120 pixels+=4-line_size*(h+1);\
1121 block +=4-line_size*h;\
1125 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1128 for(j=0; j<2; j++){\
1130 const uint32_t a= AV_RN32(pixels );\
1131 const uint32_t b= AV_RN32(pixels+1);\
1132 uint32_t l0= (a&0x03030303UL)\
1135 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1136 + ((b&0xFCFCFCFCUL)>>2);\
1140 for(i=0; i<h; i+=2){\
1141 uint32_t a= AV_RN32(pixels );\
1142 uint32_t b= AV_RN32(pixels+1);\
1143 l1= (a&0x03030303UL)\
1144 + (b&0x03030303UL);\
1145 h1= ((a&0xFCFCFCFCUL)>>2)\
1146 + ((b&0xFCFCFCFCUL)>>2);\
1147 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1150 a= AV_RN32(pixels );\
1151 b= AV_RN32(pixels+1);\
1152 l0= (a&0x03030303UL)\
1155 h0= ((a&0xFCFCFCFCUL)>>2)\
1156 + ((b&0xFCFCFCFCUL)>>2);\
1157 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1161 pixels+=4-line_size*(h+1);\
1162 block +=4-line_size*h;\
1166 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1167 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1168 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1169 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1170 av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1171 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1172 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1173 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1175 #define op_avg(a, b) a = rnd_avg32(a, b)
1177 #define op_put(a, b) a = b
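/* rnd_avg32()/no_rnd_avg32() used above (defined in a header, not shown here) rely on
 * the carry-free per-byte averaging identities
 *     (a | b) - (((a ^ b) & 0xFEFEFEFE) >> 1)  ==  (a + b + 1) >> 1  per byte
 *     (a & b) + (((a ^ b) & 0xFEFEFEFE) >> 1)  ==  (a + b) >> 1      per byte
 * where masking with 0xFE...FE before shifting keeps a bit from leaking into the
 * neighbouring byte.  The _xy2_c (diagonal half-pel) functions apply the same idea to
 * four inputs: the two low bits (0x03030303 mask) and six high bits (0xFCFCFCFC mask)
 * of every byte are summed separately and recombined after the >> 2. */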
1184 #define put_no_rnd_pixels8_c put_pixels8_c
1185 #define put_no_rnd_pixels16_c put_pixels16_c
1187 #define avg2(a,b) ((a+b+1)>>1)
1188 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
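/* avg2() rounds up: avg2(1, 2) = (1 + 2 + 1) >> 1 = 2.
   avg4() rounds to nearest: avg4(0, 1, 1, 2) = (0 + 1 + 1 + 2 + 2) >> 2 = 1. */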
1190 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1191 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1194 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1195 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1198 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1200 const int A=(16-x16)*(16-y16);
1201 const int B=( x16)*(16-y16);
1202 const int C=(16-x16)*( y16);
1203 const int D=( x16)*( y16);
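    /* x16/y16 are the fractional position in 1/16 pel; the four bilinear weights
     * always sum to (16-x16)*(16-y16) + x16*(16-y16) + (16-x16)*y16 + x16*y16 = 256,
     * so the weighted sums below are renormalized with ">> 8". */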
1208 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1209 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1210 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1211 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1212 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1213 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1214 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1215 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1221 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1222 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1225 const int s= 1<<shift;
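    /* Each destination sample (x, y) is fetched from an affine-transformed source
     * position of the form (ox + x*dxx + y*dxy, oy + x*dyx + y*dyy) in fixed point
     * and bilinearly interpolated from its four neighbours; coordinates that fall
     * outside the width x height area are clamped to the nearest edge sample. */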
1235 for(x=0; x<8; x++){ //XXX FIXME optimize
1236 int src_x, src_y, frac_x, frac_y, index;
1240 frac_x= src_x&(s-1);
1241 frac_y= src_y&(s-1);
1245 if((unsigned)src_x < width){
1246 if((unsigned)src_y < height){
1247 index= src_x + src_y*stride;
1248 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1249 + src[index +1]* frac_x )*(s-frac_y)
1250 + ( src[index+stride ]*(s-frac_x)
1251 + src[index+stride+1]* frac_x )* frac_y
1254 index= src_x + av_clip(src_y, 0, height)*stride;
1255 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1256 + src[index +1]* frac_x )*s
1260 if((unsigned)src_y < height){
1261 index= av_clip(src_x, 0, width) + src_y*stride;
1262 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1263 + src[index+stride ]* frac_y )*s
1266 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1267 dst[y*stride + x]= src[index ];
1279 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1281 case 2: put_pixels2_c (dst, src, stride, height); break;
1282 case 4: put_pixels4_c (dst, src, stride, height); break;
1283 case 8: put_pixels8_c (dst, src, stride, height); break;
1284 case 16:put_pixels16_c(dst, src, stride, height); break;
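/* The multipliers in the 1/3-pel (tpel) filters below are fixed-point reciprocals:
 * 683 ~ 2^11/3 and 2731 ~ 2^15/12, so for example
 *     (683  * (2*a + b + 1)) >> 11                ~  (2*a + b) / 3, rounded
 *     (2731 * (4*a + 3*b + 3*c + 2*d + 6)) >> 15  ~  (4*a + 3*b + 3*c + 2*d) / 12
 */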
1288 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1290 for (i=0; i < height; i++) {
1291 for (j=0; j < width; j++) {
1292 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1299 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1301 for (i=0; i < height; i++) {
1302 for (j=0; j < width; j++) {
1303 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1310 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1312 for (i=0; i < height; i++) {
1313 for (j=0; j < width; j++) {
1314 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1321 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1323 for (i=0; i < height; i++) {
1324 for (j=0; j < width; j++) {
1325 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1332 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1334 for (i=0; i < height; i++) {
1335 for (j=0; j < width; j++) {
1336 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1343 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1345 for (i=0; i < height; i++) {
1346 for (j=0; j < width; j++) {
1347 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1354 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1356 for (i=0; i < height; i++) {
1357 for (j=0; j < width; j++) {
1358 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1365 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1367 for (i=0; i < height; i++) {
1368 for (j=0; j < width; j++) {
1369 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1376 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1378 case 2: avg_pixels2_c (dst, src, stride, height); break;
1379 case 4: avg_pixels4_c (dst, src, stride, height); break;
1380 case 8: avg_pixels8_c (dst, src, stride, height); break;
1381 case 16:avg_pixels16_c(dst, src, stride, height); break;
1385 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1387 for (i=0; i < height; i++) {
1388 for (j=0; j < width; j++) {
1389 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1396 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1398 for (i=0; i < height; i++) {
1399 for (j=0; j < width; j++) {
1400 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1407 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1409 for (i=0; i < height; i++) {
1410 for (j=0; j < width; j++) {
1411 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1418 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1420 for (i=0; i < height; i++) {
1421 for (j=0; j < width; j++) {
1422 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1429 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1431 for (i=0; i < height; i++) {
1432 for (j=0; j < width; j++) {
1433 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1440 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1442 for (i=0; i < height; i++) {
1443 for (j=0; j < width; j++) {
1444 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1451 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1453 for (i=0; i < height; i++) {
1454 for (j=0; j < width; j++) {
1455 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1462 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1464 for (i=0; i < height; i++) {
1465 for (j=0; j < width; j++) {
1466 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1473 #define TPEL_WIDTH(width)\
1474 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1475 put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1476 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1477 put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1478 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1479 put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1480 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1481 put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1482 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1483 put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1484 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1485 put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1486 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1487 put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1488 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1489 put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1490 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1491 put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
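/* The H.264 chroma MC below is a bilinear filter with x, y given in 1/8 pel:
 * A + B + C + D = (8-x)*(8-y) + x*(8-y) + (8-x)*y + x*y = 64, and the op_put()
 * defined further down adds 32 before the >> 6, i.e. divides by the total weight
 * with rounding. */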
1494 #define H264_CHROMA_MC(OPNAME, OP)\
1495 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1496 const int A=(8-x)*(8-y);\
1497 const int B=( x)*(8-y);\
1498 const int C=(8-x)*( y);\
1499 const int D=( x)*( y);\
1502 assert(x<8 && y<8 && x>=0 && y>=0);\
1505 for(i=0; i<h; i++){\
1506 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1507 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1513 const int step= C ? stride : 1;\
1514 for(i=0; i<h; i++){\
1515 OP(dst[0], (A*src[0] + E*src[step+0]));\
1516 OP(dst[1], (A*src[1] + E*src[step+1]));\
1523 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1524 const int A=(8-x)*(8-y);\
1525 const int B=( x)*(8-y);\
1526 const int C=(8-x)*( y);\
1527 const int D=( x)*( y);\
1530 assert(x<8 && y<8 && x>=0 && y>=0);\
1533 for(i=0; i<h; i++){\
1534 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1535 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1536 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1537 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1543 const int step= C ? stride : 1;\
1544 for(i=0; i<h; i++){\
1545 OP(dst[0], (A*src[0] + E*src[step+0]));\
1546 OP(dst[1], (A*src[1] + E*src[step+1]));\
1547 OP(dst[2], (A*src[2] + E*src[step+2]));\
1548 OP(dst[3], (A*src[3] + E*src[step+3]));\
1555 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1556 const int A=(8-x)*(8-y);\
1557 const int B=( x)*(8-y);\
1558 const int C=(8-x)*( y);\
1559 const int D=( x)*( y);\
1562 assert(x<8 && y<8 && x>=0 && y>=0);\
1565 for(i=0; i<h; i++){\
1566 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1567 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1568 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1569 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1570 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1571 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1572 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1573 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1579 const int step= C ? stride : 1;\
1580 for(i=0; i<h; i++){\
1581 OP(dst[0], (A*src[0] + E*src[step+0]));\
1582 OP(dst[1], (A*src[1] + E*src[step+1]));\
1583 OP(dst[2], (A*src[2] + E*src[step+2]));\
1584 OP(dst[3], (A*src[3] + E*src[step+3]));\
1585 OP(dst[4], (A*src[4] + E*src[step+4]));\
1586 OP(dst[5], (A*src[5] + E*src[step+5]));\
1587 OP(dst[6], (A*src[6] + E*src[step+6]));\
1588 OP(dst[7], (A*src[7] + E*src[step+7]));\
1595 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1596 #define op_put(a, b) a = (((b) + 32)>>6)
1598 H264_CHROMA_MC(put_ , op_put)
1599 H264_CHROMA_MC(avg_ , op_avg)
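/* The VC-1 chroma variants below use the same bilinear filter but with the rounding
 * constant reduced from 32 to 32 - 4 = 28, VC-1's "no rounding" mode. */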
1603 static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1604 const int A=(8-x)*(8-y);
1605 const int B=( x)*(8-y);
1606 const int C=(8-x)*( y);
1607 const int D=( x)*( y);
1610 assert(x<8 && y<8 && x>=0 && y>=0);
1614 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1615 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1616 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1617 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1618 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1619 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1620 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1621 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
1627 static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1628 const int A=(8-x)*(8-y);
1629 const int B=( x)*(8-y);
1630 const int C=(8-x)*( y);
1631 const int D=( x)*( y);
1634 assert(x<8 && y<8 && x>=0 && y>=0);
1638 dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
1639 dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
1640 dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
1641 dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
1642 dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
1643 dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
1644 dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
1645 dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
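/* The MPEG-4 quarter-pel half-sample filter expanded below computes, for interior
 * samples,
 *     dst[i] = 20*(s[i]+s[i+1]) - 6*(s[i-1]+s[i+2]) + 3*(s[i-2]+s[i+3]) - (s[i-3]+s[i+4])
 * i.e. the 8-tap kernel (-1, 3, -6, 20, 20, -6, 3, -1), whose taps sum to 32; samples
 * beyond the block edge are mirrored, which is why the first and last outputs reuse
 * src[0], src[8] or src[16].  The OP macro supplied at expansion time (not part of
 * this excerpt) adds the rounding constant, scales by 1/32 and clamps via cm[]. */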
1651 #define QPEL_MC(r, OPNAME, RND, OP) \
1652 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1653 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1657 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1658 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1659 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1660 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1661 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1662 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1663 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1664 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1670 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1672 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1676 const int src0= src[0*srcStride];\
1677 const int src1= src[1*srcStride];\
1678 const int src2= src[2*srcStride];\
1679 const int src3= src[3*srcStride];\
1680 const int src4= src[4*srcStride];\
1681 const int src5= src[5*srcStride];\
1682 const int src6= src[6*srcStride];\
1683 const int src7= src[7*srcStride];\
1684 const int src8= src[8*srcStride];\
1685 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1686 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1687 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1688 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1689 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1690 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1691 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1692 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1698 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1699 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1704 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1705 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1706 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1707 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1708 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1709 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1710 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1711 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1712 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1713 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1714 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1715 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1716 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1717 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1718 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1719 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1725 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1726 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1731 const int src0= src[0*srcStride];\
1732 const int src1= src[1*srcStride];\
1733 const int src2= src[2*srcStride];\
1734 const int src3= src[3*srcStride];\
1735 const int src4= src[4*srcStride];\
1736 const int src5= src[5*srcStride];\
1737 const int src6= src[6*srcStride];\
1738 const int src7= src[7*srcStride];\
1739 const int src8= src[8*srcStride];\
1740 const int src9= src[9*srcStride];\
1741 const int src10= src[10*srcStride];\
1742 const int src11= src[11*srcStride];\
1743 const int src12= src[12*srcStride];\
1744 const int src13= src[13*srcStride];\
1745 const int src14= src[14*srcStride];\
1746 const int src15= src[15*srcStride];\
1747 const int src16= src[16*srcStride];\
1748 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1749 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1750 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1751 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1752 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1753 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1754 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1755 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1756 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1757 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1758 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1759 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1760 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1761 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1762 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1763 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1769 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1771 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1772 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1775 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1776 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1779 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1781 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1782 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1785 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1786 uint8_t full[16*9];\
1788 copy_block9(full, src, 16, stride, 9);\
1789 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1790 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1793 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1794 uint8_t full[16*9];\
1795 copy_block9(full, src, 16, stride, 9);\
1796 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1799 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1800 uint8_t full[16*9];\
1802 copy_block9(full, src, 16, stride, 9);\
1803 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1804 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1806 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1807 uint8_t full[16*9];\
1810 uint8_t halfHV[64];\
1811 copy_block9(full, src, 16, stride, 9);\
1812 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1813 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1814 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1815 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1817 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1818 uint8_t full[16*9];\
1820 uint8_t halfHV[64];\
1821 copy_block9(full, src, 16, stride, 9);\
1822 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1823 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1824 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1825 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1827 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1828 uint8_t full[16*9];\
1831 uint8_t halfHV[64];\
1832 copy_block9(full, src, 16, stride, 9);\
1833 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1834 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1835 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1836 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1838 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1839 uint8_t full[16*9];\
1841 uint8_t halfHV[64];\
1842 copy_block9(full, src, 16, stride, 9);\
1843 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1844 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1845 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1846 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1848 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1849 uint8_t full[16*9];\
1852 uint8_t halfHV[64];\
1853 copy_block9(full, src, 16, stride, 9);\
1854 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1855 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1856 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1857 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1859 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1860 uint8_t full[16*9];\
1862 uint8_t halfHV[64];\
1863 copy_block9(full, src, 16, stride, 9);\
1864 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1865 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1866 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1867 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1869 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1870 uint8_t full[16*9];\
1873 uint8_t halfHV[64];\
1874 copy_block9(full, src, 16, stride, 9);\
1875 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1876 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1877 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1878 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1880 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1881 uint8_t full[16*9];\
1883 uint8_t halfHV[64];\
1884 copy_block9(full, src, 16, stride, 9);\
1885 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1886 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1887 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1888 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1890 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1892 uint8_t halfHV[64];\
1893 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1894 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1895 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1897 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1899 uint8_t halfHV[64];\
1900 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1901 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1902 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1904 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1905 uint8_t full[16*9];\
1908 uint8_t halfHV[64];\
1909 copy_block9(full, src, 16, stride, 9);\
1910 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1911 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1912 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1913 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1915 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1916 uint8_t full[16*9];\
1918 copy_block9(full, src, 16, stride, 9);\
1919 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1920 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1921 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1923 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1924 uint8_t full[16*9];\
1927 uint8_t halfHV[64];\
1928 copy_block9(full, src, 16, stride, 9);\
1929 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1930 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1931 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1932 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1934 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1935 uint8_t full[16*9];\
1937 copy_block9(full, src, 16, stride, 9);\
1938 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1939 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1940 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1942 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1944 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1945 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1948 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1950 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1951 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1954 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1955 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1958 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1960 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1961 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1964 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1965 uint8_t full[24*17];\
1967 copy_block17(full, src, 24, stride, 17);\
1968 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1969 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1972 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1973 uint8_t full[24*17];\
1974 copy_block17(full, src, 24, stride, 17);\
1975 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1978 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1979 uint8_t full[24*17];\
1981 copy_block17(full, src, 24, stride, 17);\
1982 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1983 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1985 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1986 uint8_t full[24*17];\
1987 uint8_t halfH[272];\
1988 uint8_t halfV[256];\
1989 uint8_t halfHV[256];\
1990 copy_block17(full, src, 24, stride, 17);\
1991 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1992 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1993 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1994 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1996 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1997 uint8_t full[24*17];\
1998 uint8_t halfH[272];\
1999 uint8_t halfHV[256];\
2000 copy_block17(full, src, 24, stride, 17);\
2001 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2002 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2003 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2004 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2006 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2007 uint8_t full[24*17];\
2008 uint8_t halfH[272];\
2009 uint8_t halfV[256];\
2010 uint8_t halfHV[256];\
2011 copy_block17(full, src, 24, stride, 17);\
2012 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2013 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2014 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2015 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2017 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2018 uint8_t full[24*17];\
2019 uint8_t halfH[272];\
2020 uint8_t halfHV[256];\
2021 copy_block17(full, src, 24, stride, 17);\
2022 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2023 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2024 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2025 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2027 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2028 uint8_t full[24*17];\
2029 uint8_t halfH[272];\
2030 uint8_t halfV[256];\
2031 uint8_t halfHV[256];\
2032 copy_block17(full, src, 24, stride, 17);\
2033 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2034 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2035 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2036 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2038 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2039 uint8_t full[24*17];\
2040 uint8_t halfH[272];\
2041 uint8_t halfHV[256];\
2042 copy_block17(full, src, 24, stride, 17);\
2043 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2044 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2045 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2046 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2048 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2049 uint8_t full[24*17];\
2050 uint8_t halfH[272];\
2051 uint8_t halfV[256];\
2052 uint8_t halfHV[256];\
2053 copy_block17(full, src, 24, stride, 17);\
2054 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2055 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2056 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2057 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2059 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2060 uint8_t full[24*17];\
2061 uint8_t halfH[272];\
2062 uint8_t halfHV[256];\
2063 copy_block17(full, src, 24, stride, 17);\
2064 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2065 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2066 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2067 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2069 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2070 uint8_t halfH[272];\
2071 uint8_t halfHV[256];\
2072 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2073 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2074 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2076 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2077 uint8_t halfH[272];\
2078 uint8_t halfHV[256];\
2079 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2080 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2081 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2083 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2084 uint8_t full[24*17];\
2085 uint8_t halfH[272];\
2086 uint8_t halfV[256];\
2087 uint8_t halfHV[256];\
2088 copy_block17(full, src, 24, stride, 17);\
2089 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2090 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2091 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2092 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2094 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2095 uint8_t full[24*17];\
2096 uint8_t halfH[272];\
2097 copy_block17(full, src, 24, stride, 17);\
2098 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2099 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2100 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2102 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2103 uint8_t full[24*17];\
2104 uint8_t halfH[272];\
2105 uint8_t halfV[256];\
2106 uint8_t halfHV[256];\
2107 copy_block17(full, src, 24, stride, 17);\
2108 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2109 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2110 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2111 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2113 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2114 uint8_t full[24*17];\
2115 uint8_t halfH[272];\
2116 copy_block17(full, src, 24, stride, 17);\
2117 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2118 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2119 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2121 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2122 uint8_t halfH[272];\
2123 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2124 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
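/* Naming convention for the MPEG-4 quarter-pel functions above: qpelN_mcXY handles an
 * NxN block at horizontal offset X/4 and vertical offset Y/4 of a sample. Pure half-pel
 * rows/columns (X or Y == 2) apply the h/v lowpass directly; quarter-pel positions are
 * built by blending full-pel and half-pel planes with pixels*_l2/_l4, using the
 * halfH/halfV/halfHV scratch buffers. The ff_*_old_c variants keep an earlier
 * construction of the same positions. */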
2127 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2128 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2129 #define op_put(a, b) a = cm[((b) + 16)>>5]
2130 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
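/* The lowpass accumulators are scaled by 32 (the filter taps sum to 32), so op_put adds
 * 16 and shifts right by 5 -- a rounded divide by 32 -- while the *_no_rnd variants add
 * only 15 for the no-rounding mode. op_avg additionally averages with the pixel already
 * in dst. cm is ff_cropTbl offset by MAX_NEG_CROP, so out-of-range results clamp to
 * 0..255 without branches. Example: in a flat area of value p the accumulator is 32*p
 * and (32*p + 16) >> 5 == p. */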
2132 QPEL_MC(0, put_ , _ , op_put)
2133 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2134 QPEL_MC(0, avg_ , _ , op_avg)
2135 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2137 #undef op_avg_no_rnd
2139 #undef op_put_no_rnd
2141 #define put_qpel8_mc00_c ff_put_pixels8x8_c
2142 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
2143 #define put_qpel16_mc00_c ff_put_pixels16x16_c
2144 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2145 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
2146 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
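/* The integer-pel (mc00) cases need no filtering, so they alias the plain block copy and
 * average helpers; the no_rnd put is identical because rounding only matters once
 * interpolation happens.
 *
 * H264_LOWPASS below generates the H.264 6-tap (1,-5,20,20,-5,1) half-sample filters in
 * horizontal, vertical and 2-D (hv) form for 2/4/8/16-wide blocks. The hv form stores
 * the horizontally filtered samples unclipped in a 16-bit tmp buffer so the vertical
 * pass can refilter them at full precision. */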
2149 #define H264_LOWPASS(OPNAME, OP, OP2) \
2150 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2152 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2156 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2157 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2163 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2165 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2169 const int srcB= src[-2*srcStride];\
2170 const int srcA= src[-1*srcStride];\
2171 const int src0= src[0 *srcStride];\
2172 const int src1= src[1 *srcStride];\
2173 const int src2= src[2 *srcStride];\
2174 const int src3= src[3 *srcStride];\
2175 const int src4= src[4 *srcStride];\
2176 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2177 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2183 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2186 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2188 src -= 2*srcStride;\
2189 for(i=0; i<h+5; i++)\
2191 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2192 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2196 tmp -= tmpStride*(h+5-2);\
2199 const int tmpB= tmp[-2*tmpStride];\
2200 const int tmpA= tmp[-1*tmpStride];\
2201 const int tmp0= tmp[0 *tmpStride];\
2202 const int tmp1= tmp[1 *tmpStride];\
2203 const int tmp2= tmp[2 *tmpStride];\
2204 const int tmp3= tmp[3 *tmpStride];\
2205 const int tmp4= tmp[4 *tmpStride];\
2206 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2207 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2212 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2214 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2218 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2219 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2220 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2221 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2227 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2229 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2233 const int srcB= src[-2*srcStride];\
2234 const int srcA= src[-1*srcStride];\
2235 const int src0= src[0 *srcStride];\
2236 const int src1= src[1 *srcStride];\
2237 const int src2= src[2 *srcStride];\
2238 const int src3= src[3 *srcStride];\
2239 const int src4= src[4 *srcStride];\
2240 const int src5= src[5 *srcStride];\
2241 const int src6= src[6 *srcStride];\
2242 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2243 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2244 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2245 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2251 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2254 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2256 src -= 2*srcStride;\
2257 for(i=0; i<h+5; i++)\
2259 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2260 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2261 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2262 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2266 tmp -= tmpStride*(h+5-2);\
2269 const int tmpB= tmp[-2*tmpStride];\
2270 const int tmpA= tmp[-1*tmpStride];\
2271 const int tmp0= tmp[0 *tmpStride];\
2272 const int tmp1= tmp[1 *tmpStride];\
2273 const int tmp2= tmp[2 *tmpStride];\
2274 const int tmp3= tmp[3 *tmpStride];\
2275 const int tmp4= tmp[4 *tmpStride];\
2276 const int tmp5= tmp[5 *tmpStride];\
2277 const int tmp6= tmp[6 *tmpStride];\
2278 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2279 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2280 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2281 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2287 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2289 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2293 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2294 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2295 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2296 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2297 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2298 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2299 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2300 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2306 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2308 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2312 const int srcB= src[-2*srcStride];\
2313 const int srcA= src[-1*srcStride];\
2314 const int src0= src[0 *srcStride];\
2315 const int src1= src[1 *srcStride];\
2316 const int src2= src[2 *srcStride];\
2317 const int src3= src[3 *srcStride];\
2318 const int src4= src[4 *srcStride];\
2319 const int src5= src[5 *srcStride];\
2320 const int src6= src[6 *srcStride];\
2321 const int src7= src[7 *srcStride];\
2322 const int src8= src[8 *srcStride];\
2323 const int src9= src[9 *srcStride];\
2324 const int src10=src[10*srcStride];\
2325 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2326 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2327 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2328 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2329 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2330 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2331 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2332 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2338 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2341 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2343 src -= 2*srcStride;\
2344 for(i=0; i<h+5; i++)\
2346 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2347 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2348 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2349 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2350 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2351 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2352 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2353 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2357 tmp -= tmpStride*(h+5-2);\
2360 const int tmpB= tmp[-2*tmpStride];\
2361 const int tmpA= tmp[-1*tmpStride];\
2362 const int tmp0= tmp[0 *tmpStride];\
2363 const int tmp1= tmp[1 *tmpStride];\
2364 const int tmp2= tmp[2 *tmpStride];\
2365 const int tmp3= tmp[3 *tmpStride];\
2366 const int tmp4= tmp[4 *tmpStride];\
2367 const int tmp5= tmp[5 *tmpStride];\
2368 const int tmp6= tmp[6 *tmpStride];\
2369 const int tmp7= tmp[7 *tmpStride];\
2370 const int tmp8= tmp[8 *tmpStride];\
2371 const int tmp9= tmp[9 *tmpStride];\
2372 const int tmp10=tmp[10*tmpStride];\
2373 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2374 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2375 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2376 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2377 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2378 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2379 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2380 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2386 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2387 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2388 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2389 src += 8*srcStride;\
2390 dst += 8*dstStride;\
2391 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2392 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2395 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2396 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2397 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2398 src += 8*srcStride;\
2399 dst += 8*dstStride;\
2400 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2401 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2404 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2405 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2406 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2407 src += 8*srcStride;\
2408 dst += 8*dstStride;\
2409 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2410 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2413 #define H264_MC(OPNAME, SIZE) \
2414 static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2415 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2418 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2419 uint8_t half[SIZE*SIZE];\
2420 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2421 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2424 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2425 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2428 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2429 uint8_t half[SIZE*SIZE];\
2430 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2431 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2434 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2435 uint8_t full[SIZE*(SIZE+5)];\
2436 uint8_t * const full_mid= full + SIZE*2;\
2437 uint8_t half[SIZE*SIZE];\
2438 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2439 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2440 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2443 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2444 uint8_t full[SIZE*(SIZE+5)];\
2445 uint8_t * const full_mid= full + SIZE*2;\
2446 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2447 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2450 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2451 uint8_t full[SIZE*(SIZE+5)];\
2452 uint8_t * const full_mid= full + SIZE*2;\
2453 uint8_t half[SIZE*SIZE];\
2454 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2455 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2456 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2459 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2460 uint8_t full[SIZE*(SIZE+5)];\
2461 uint8_t * const full_mid= full + SIZE*2;\
2462 uint8_t halfH[SIZE*SIZE];\
2463 uint8_t halfV[SIZE*SIZE];\
2464 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2465 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2466 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2467 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2470 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2471 uint8_t full[SIZE*(SIZE+5)];\
2472 uint8_t * const full_mid= full + SIZE*2;\
2473 uint8_t halfH[SIZE*SIZE];\
2474 uint8_t halfV[SIZE*SIZE];\
2475 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2476 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2477 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2478 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2481 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2482 uint8_t full[SIZE*(SIZE+5)];\
2483 uint8_t * const full_mid= full + SIZE*2;\
2484 uint8_t halfH[SIZE*SIZE];\
2485 uint8_t halfV[SIZE*SIZE];\
2486 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2487 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2488 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2489 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2492 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2493 uint8_t full[SIZE*(SIZE+5)];\
2494 uint8_t * const full_mid= full + SIZE*2;\
2495 uint8_t halfH[SIZE*SIZE];\
2496 uint8_t halfV[SIZE*SIZE];\
2497 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2498 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2499 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2500 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2503 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2504 int16_t tmp[SIZE*(SIZE+5)];\
2505 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2508 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2509 int16_t tmp[SIZE*(SIZE+5)];\
2510 uint8_t halfH[SIZE*SIZE];\
2511 uint8_t halfHV[SIZE*SIZE];\
2512 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2513 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2514 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2517 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2518 int16_t tmp[SIZE*(SIZE+5)];\
2519 uint8_t halfH[SIZE*SIZE];\
2520 uint8_t halfHV[SIZE*SIZE];\
2521 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2522 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2523 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2526 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2527 uint8_t full[SIZE*(SIZE+5)];\
2528 uint8_t * const full_mid= full + SIZE*2;\
2529 int16_t tmp[SIZE*(SIZE+5)];\
2530 uint8_t halfV[SIZE*SIZE];\
2531 uint8_t halfHV[SIZE*SIZE];\
2532 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2533 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2534 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2535 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2538 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2539 uint8_t full[SIZE*(SIZE+5)];\
2540 uint8_t * const full_mid= full + SIZE*2;\
2541 int16_t tmp[SIZE*(SIZE+5)];\
2542 uint8_t halfV[SIZE*SIZE];\
2543 uint8_t halfHV[SIZE*SIZE];\
2544 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2545 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2546 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2547 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2550 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2551 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2552 #define op_put(a, b) a = cm[((b) + 16)>>5]
2553 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2554 #define op2_put(a, b) a = cm[((b) + 512)>>10]
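/* op_* normalize a single 6-tap pass (scale 32: +16, >>5); op2_* normalize the hv path,
 * where the 16-bit intermediates are filtered a second time, giving a scale of
 * 32*32 = 1024 (+512, >>10). Example: for a flat area of value p the first pass yields
 * 32*p in tmp, the second 1024*p, and (1024*p + 512) >> 10 == p. */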
2556 H264_LOWPASS(put_ , op_put, op2_put)
2557 H264_LOWPASS(avg_ , op_avg, op2_avg)
2572 #define put_h264_qpel8_mc00_c ff_put_pixels8x8_c
2573 #define avg_h264_qpel8_mc00_c ff_avg_pixels8x8_c
2574 #define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
2575 #define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
2577 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2578 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2582 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2583 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2584 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2585 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2586 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2587 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2588 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2589 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2595 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2596 put_pixels8_c(dst, src, stride, 8);
2598 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2599 avg_pixels8_c(dst, src, stride, 8);
2601 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2602 put_pixels16_c(dst, src, stride, 16);
2604 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2605 avg_pixels16_c(dst, src, stride, 16);
2608 #if CONFIG_RV40_DECODER
2609 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2610 put_pixels16_xy2_c(dst, src, stride, 16);
2612 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2613 avg_pixels16_xy2_c(dst, src, stride, 16);
2615 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2616 put_pixels8_xy2_c(dst, src, stride, 8);
2618 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2619 avg_pixels8_xy2_c(dst, src, stride, 8);
2621 #endif /* CONFIG_RV40_DECODER */
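/* For RV40 the (3/4, 3/4) luma position is served by a plain 2x2 pixel average, so these
 * mc33 entry points simply reuse the generic half-pel xy2 copy/average helpers. */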
2623 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2624 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2628 const int src_1= src[ -srcStride];
2629 const int src0 = src[0 ];
2630 const int src1 = src[ srcStride];
2631 const int src2 = src[2*srcStride];
2632 const int src3 = src[3*srcStride];
2633 const int src4 = src[4*srcStride];
2634 const int src5 = src[5*srcStride];
2635 const int src6 = src[6*srcStride];
2636 const int src7 = src[7*srcStride];
2637 const int src8 = src[8*srcStride];
2638 const int src9 = src[9*srcStride];
2639 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2640 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2641 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2642 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2643 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2644 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2645 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2646 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
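/* wmv2_mspel8_{h,v}_lowpass implement the WMV2 half-sample filter (-1, 9, 9, -1)/16 with
 * +8 rounding and cropTbl clamping. The put_mspel8_mc* functions below combine them:
 * mc10/mc30 average the horizontal half-pel plane with the (shifted) source, while
 * mc12/mc32/mc22 first filter 11 rows horizontally into halfH (one row above and two
 * below the block, which the vertical 4-tap needs) and then filter that vertically. */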
2652 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2654 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2655 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2658 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2659 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2662 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2664 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2665 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2668 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2669 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2672 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2676 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2677 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2678 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2679 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2681 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2685 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2686 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2687 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2688 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2690 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2692 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2693 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
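/* H.263 deblocking: for each line crossing the block edge, d measures the step between
 * the two inner pixels (p1, p2) relative to the outer ones. d1 is a tent-shaped function
 * of d that is zero for |d| >= 2*strength, so strong edges (likely real image content)
 * are left alone while small steps (likely blocking artifacts) are smoothed. The inner
 * pixels are corrected by +/-d1 and clamped with the cheap "if (p & 256) p = ~(p >> 31)"
 * trick; the outer pixels get a smaller correction d2 clipped to +/-ad1. The strength is
 * taken from ff_h263_loop_filter_strength[qscale]. */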
2696 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2697 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2699 const int strength= ff_h263_loop_filter_strength[qscale];
2703 int p0= src[x-2*stride];
2704 int p1= src[x-1*stride];
2705 int p2= src[x+0*stride];
2706 int p3= src[x+1*stride];
2707 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2709 if (d<-2*strength) d1= 0;
2710 else if(d<- strength) d1=-2*strength - d;
2711 else if(d< strength) d1= d;
2712 else if(d< 2*strength) d1= 2*strength - d;
2717 if(p1&256) p1= ~(p1>>31);
2718 if(p2&256) p2= ~(p2>>31);
2720 src[x-1*stride] = p1;
2721 src[x+0*stride] = p2;
2725 d2= av_clip((p0-p3)/4, -ad1, ad1);
2727 src[x-2*stride] = p0 - d2;
2728 src[x+ stride] = p3 + d2;
2733 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2734 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2736 const int strength= ff_h263_loop_filter_strength[qscale];
2740 int p0= src[y*stride-2];
2741 int p1= src[y*stride-1];
2742 int p2= src[y*stride+0];
2743 int p3= src[y*stride+1];
2744 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2746 if (d<-2*strength) d1= 0;
2747 else if(d<- strength) d1=-2*strength - d;
2748 else if(d< strength) d1= d;
2749 else if(d< 2*strength) d1= 2*strength - d;
2754 if(p1&256) p1= ~(p1>>31);
2755 if(p2&256) p2= ~(p2>>31);
2757 src[y*stride-1] = p1;
2758 src[y*stride+0] = p2;
2762 d2= av_clip((p0-p3)/4, -ad1, ad1);
2764 src[y*stride-2] = p0 - d2;
2765 src[y*stride+1] = p3 + d2;
2770 static void h261_loop_filter_c(uint8_t *src, int stride){
2775 temp[x ] = 4*src[x ];
2776 temp[x + 7*8] = 4*src[x + 7*stride];
2780 xy = y * stride + x;
2782 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2787 src[ y*stride] = (temp[ y*8] + 2)>>2;
2788 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2790 xy = y * stride + x;
2792 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
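/* h261_loop_filter_c applies the H.261 in-loop smoothing, a separable [1 2 1] filter:
 * the vertical pass writes into temp scaled by 4 (the first and last rows are merely
 * copied, scaled by 4), then the horizontal pass normalizes with +8 >> 4; the first and
 * last columns skip the horizontal tap and only renormalize with +2 >> 2. */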
2797 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2803 s += abs(pix1[0] - pix2[0]);
2804 s += abs(pix1[1] - pix2[1]);
2805 s += abs(pix1[2] - pix2[2]);
2806 s += abs(pix1[3] - pix2[3]);
2807 s += abs(pix1[4] - pix2[4]);
2808 s += abs(pix1[5] - pix2[5]);
2809 s += abs(pix1[6] - pix2[6]);
2810 s += abs(pix1[7] - pix2[7]);
2811 s += abs(pix1[8] - pix2[8]);
2812 s += abs(pix1[9] - pix2[9]);
2813 s += abs(pix1[10] - pix2[10]);
2814 s += abs(pix1[11] - pix2[11]);
2815 s += abs(pix1[12] - pix2[12]);
2816 s += abs(pix1[13] - pix2[13]);
2817 s += abs(pix1[14] - pix2[14]);
2818 s += abs(pix1[15] - pix2[15]);
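/* The _x2/_y2/_xy2 variants below compute the SAD against the horizontally, vertically
 * or diagonally half-pel interpolated reference using the rounding averages avg2/avg4,
 * so the motion estimator can score half-pel candidates without materializing the
 * interpolated block. */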
2825 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2831 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2832 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2833 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2834 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2835 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2836 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2837 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2838 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2839 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2840 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2841 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2842 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2843 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2844 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2845 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2846 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2853 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2856 uint8_t *pix3 = pix2 + line_size;
2860 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2861 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2862 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2863 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2864 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2865 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2866 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2867 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2868 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2869 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2870 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2871 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2872 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2873 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2874 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2875 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2883 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2886 uint8_t *pix3 = pix2 + line_size;
2890 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2891 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2892 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2893 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2894 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2895 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2896 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2897 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2898 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2899 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2900 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2901 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2902 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2903 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2904 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2905 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2913 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2919 s += abs(pix1[0] - pix2[0]);
2920 s += abs(pix1[1] - pix2[1]);
2921 s += abs(pix1[2] - pix2[2]);
2922 s += abs(pix1[3] - pix2[3]);
2923 s += abs(pix1[4] - pix2[4]);
2924 s += abs(pix1[5] - pix2[5]);
2925 s += abs(pix1[6] - pix2[6]);
2926 s += abs(pix1[7] - pix2[7]);
2933 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2939 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2940 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2941 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2942 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2943 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2944 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2945 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2946 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2953 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2956 uint8_t *pix3 = pix2 + line_size;
2960 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2961 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2962 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2963 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2964 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2965 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2966 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2967 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2975 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2978 uint8_t *pix3 = pix2 + line_size;
2982 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2983 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2984 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2985 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2986 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2987 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2988 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2989 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
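/* NSSE (noise preserving SSE): score1 is the plain sum of squared errors, score2
 * compares the 2x2 gradients of the two blocks, i.e. how much local texture each one
 * has. The weighted sum penalizes candidates that smooth away (or add) detail; the
 * weight comes from avctx->nsse_weight, with 8 as the fallback when no context is
 * passed. */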
2997 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2998 MpegEncContext *c = v;
3004 for(x=0; x<16; x++){
3005 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3008 for(x=0; x<15; x++){
3009 score2+= FFABS( s1[x ] - s1[x +stride]
3010 - s1[x+1] + s1[x+1+stride])
3011 -FFABS( s2[x ] - s2[x +stride]
3012 - s2[x+1] + s2[x+1+stride]);
3019 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3020 else return score1 + FFABS(score2)*8;
3023 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3024 MpegEncContext *c = v;
3031 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3035 score2+= FFABS( s1[x ] - s1[x +stride]
3036 - s1[x+1] + s1[x+1+stride])
3037 -FFABS( s2[x ] - s2[x +stride]
3038 - s2[x+1] + s2[x+1+stride]);
3045 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3046 else return score1 + FFABS(score2)*8;
3049 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3053 for(i=0; i<8*8; i++){
3054 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3057 assert(-512<b && b<512);
3059 sum += (w*b)*(w*b)>>4;
3064 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3067 for(i=0; i<8*8; i++){
3068 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3073 * permutes an 8x8 block.
3074 * @param block the block which will be permuted according to the given permutation vector
3075 * @param permutation the permutation vector
3076 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
3077 * @param scantable the used scantable; it is only used to speed the permutation up, the block is not
3078 * (inverse) permuted to scantable order!
3080 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3086 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3088 for(i=0; i<=last; i++){
3089 const int j= scantable[i];
3094 for(i=0; i<=last; i++){
3095 const int j= scantable[i];
3096 const int perm_j= permutation[j];
3097 block[perm_j]= temp[j];
3101 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3105 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3108 memset(cmp, 0, sizeof(void*)*6);
3116 cmp[i]= c->hadamard8_diff[i];
3122 cmp[i]= c->dct_sad[i];
3125 cmp[i]= c->dct264_sad[i];
3128 cmp[i]= c->dct_max[i];
3131 cmp[i]= c->quant_psnr[i];
3160 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3165 static void clear_block_c(DCTELEM *block)
3167 memset(block, 0, sizeof(DCTELEM)*64);
3171 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3173 static void clear_blocks_c(DCTELEM *blocks)
3175 memset(blocks, 0, sizeof(DCTELEM)*6*64);
3178 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3180 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3181 long a = *(long*)(src+i);
3182 long b = *(long*)(dst+i);
3183 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3186 dst[i+0] += src[i+0];
3189 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3191 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3192 long a = *(long*)(src1+i);
3193 long b = *(long*)(src2+i);
3194 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3197 dst[i] = src1[i]+src2[i];
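/* add_bytes*_c work a machine word at a time: the low 7 bits of every byte are added
 * without carrying into the neighbouring byte, then the top bits are patched with an
 * XOR, giving 4 or 8 byte-wise additions (mod 256) per word. Example: 0x80 + 0x90 ->
 * low bits 0x00 + 0x10 = 0x10, MSB fix (0x80 ^ 0x90) & 0x80 = 0, result 0x10 as
 * expected. diff_bytes_c below uses the matching borrow-free subtraction. */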
3200 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3202 #if !HAVE_FAST_UNALIGNED
3203 if((long)src2 & (sizeof(long)-1)){
3204 for(i=0; i+7<w; i+=8){
3205 dst[i+0] = src1[i+0]-src2[i+0];
3206 dst[i+1] = src1[i+1]-src2[i+1];
3207 dst[i+2] = src1[i+2]-src2[i+2];
3208 dst[i+3] = src1[i+3]-src2[i+3];
3209 dst[i+4] = src1[i+4]-src2[i+4];
3210 dst[i+5] = src1[i+5]-src2[i+5];
3211 dst[i+6] = src1[i+6]-src2[i+6];
3212 dst[i+7] = src1[i+7]-src2[i+7];
3216 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3217 long a = *(long*)(src1+i);
3218 long b = *(long*)(src2+i);
3219 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3222 dst[i+0] = src1[i+0]-src2[i+0];
3225 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
3233 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3242 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
3250 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3260 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
3263 for(i=0; i<w-1; i++){
3290 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
3320 #define BUTTERFLY2(o1,o2,i1,i2) \
3324 #define BUTTERFLY1(x,y) \
3333 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3335 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3343 //FIXME try pointer walks
3344 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3345 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3346 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3347 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3349 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3350 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3351 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3352 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3354 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3355 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3356 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3357 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3361 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3362 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3363 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3364 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3366 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3367 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3368 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3369 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3372 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3373 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3374 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3375 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3381 printf("MAX:%d\n", maxi);
3387 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3395 //FIXME try pointer walks
3396 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3397 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3398 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3399 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3401 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3402 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3403 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3404 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3406 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3407 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3408 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3409 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3413 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3414 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3415 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3416 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3418 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3419 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3420 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3421 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3424 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3425 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3426 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3427 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3430 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
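/* hadamard8_diff8x8_c computes a SATD: the 8x8 difference between src and dst is run
 * through an 8-point Hadamard transform along rows (three butterfly stages) and then
 * along columns, and the sum of absolute transform coefficients is returned (the last
 * column stage is folded into BUTTERFLYA). hadamard8_intra8x8_c does the same on the
 * source block itself and subtracts the absolute DC term so only AC energy is scored. */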
3435 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3436 MpegEncContext * const s= (MpegEncContext *)c;
3437 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3441 s->dsp.diff_pixels(temp, src1, src2, stride);
3443 return s->dsp.sum_abs_dctelem(temp);
3448 const int s07 = SRC(0) + SRC(7);\
3449 const int s16 = SRC(1) + SRC(6);\
3450 const int s25 = SRC(2) + SRC(5);\
3451 const int s34 = SRC(3) + SRC(4);\
3452 const int a0 = s07 + s34;\
3453 const int a1 = s16 + s25;\
3454 const int a2 = s07 - s34;\
3455 const int a3 = s16 - s25;\
3456 const int d07 = SRC(0) - SRC(7);\
3457 const int d16 = SRC(1) - SRC(6);\
3458 const int d25 = SRC(2) - SRC(5);\
3459 const int d34 = SRC(3) - SRC(4);\
3460 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3461 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3462 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3463 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3465 DST(1, a4 + (a7>>2)) ;\
3466 DST(2, a2 + (a3>>1)) ;\
3467 DST(3, a5 + (a6>>2)) ;\
3469 DST(5, a6 - (a5>>2)) ;\
3470 DST(6, (a2>>1) - a3 ) ;\
3471 DST(7, (a4>>2) - a7 ) ;\
3474 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3475 MpegEncContext * const s= (MpegEncContext *)c;
3480 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3482 #define SRC(x) dct[i][x]
3483 #define DST(x,v) dct[i][x]= v
3484 for( i = 0; i < 8; i++ )
3489 #define SRC(x) dct[x][i]
3490 #define DST(x,v) sum += FFABS(v)
3491 for( i = 0; i < 8; i++ )
3499 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3500 MpegEncContext * const s= (MpegEncContext *)c;
3501 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3506 s->dsp.diff_pixels(temp, src1, src2, stride);
3510 sum= FFMAX(sum, FFABS(temp[i]));
3515 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3516 MpegEncContext * const s= (MpegEncContext *)c;
3517 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3518 DCTELEM * const bak = temp+64;
3524 s->dsp.diff_pixels(temp, src1, src2, stride);
3526 memcpy(bak, temp, 64*sizeof(DCTELEM));
3528 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3529 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3530 ff_simple_idct(temp); //FIXME
3533 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
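/* rd8x8_c estimates a rate-distortion cost: the 8x8 difference is transformed and
 * quantized with fast_dct_quantize, the bit cost is summed from the intra/inter AC VLC
 * length tables (falling back to esc_length for out-of-range levels, plus the luma DC
 * table for intra), the block is dequantized and IDCT-added back, and the SSE against
 * the original block gives the distortion. The return value combines them as
 * distortion + (bits * qscale^2 * 109 + 64) / 128. bit8x8_c below computes only the
 * bit-cost part. */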
3538 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3539 MpegEncContext * const s= (MpegEncContext *)c;
3540 const uint8_t *scantable= s->intra_scantable.permutated;
3541 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3542 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3543 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3544 int i, last, run, bits, level, distortion, start_i;
3545 const int esc_length= s->ac_esc_length;
3547 uint8_t * last_length;
3551 copy_block8(lsrc1, src1, 8, stride, 8);
3552 copy_block8(lsrc2, src2, 8, stride, 8);
3554 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3556 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3562 length = s->intra_ac_vlc_length;
3563 last_length= s->intra_ac_vlc_last_length;
3564 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3567 length = s->inter_ac_vlc_length;
3568 last_length= s->inter_ac_vlc_last_length;
3573 for(i=start_i; i<last; i++){
3574 int j= scantable[i];
3579 if((level&(~127)) == 0){
3580 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3589 level= temp[i] + 64;
3593 if((level&(~127)) == 0){
3594 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3602 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3604 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3607 s->dsp.idct_add(lsrc2, 8, temp);
3609 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3611 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3614 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3615 MpegEncContext * const s= (MpegEncContext *)c;
3616 const uint8_t *scantable= s->intra_scantable.permutated;
3617 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3618 int i, last, run, bits, level, start_i;
3619 const int esc_length= s->ac_esc_length;
3621 uint8_t * last_length;
3625 s->dsp.diff_pixels(temp, src1, src2, stride);
3627 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3633 length = s->intra_ac_vlc_length;
3634 last_length= s->intra_ac_vlc_last_length;
3635 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3638 length = s->inter_ac_vlc_length;
3639 last_length= s->inter_ac_vlc_last_length;
3644 for(i=start_i; i<last; i++){
3645 int j= scantable[i];
3650 if((level&(~127)) == 0){
3651 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3660 level= temp[i] + 64;
3664 if((level&(~127)) == 0){
3665 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
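/* VSAD/VSSE measure vertical activity: the intra variants sum the absolute (or squared)
 * difference between vertically adjacent rows of one block, while the inter variants
 * compare that vertical gradient between the two blocks. */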
3673 #define VSAD_INTRA(size) \
3674 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3678 for(y=1; y<h; y++){ \
3679 for(x=0; x<size; x+=4){ \
3680 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
3681 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
3691 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3696 for(x=0; x<16; x++){
3697 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3706 #define SQ(a) ((a)*(a))
3707 #define VSSE_INTRA(size) \
3708 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3712 for(y=1; y<h; y++){ \
3713 for(x=0; x<size; x+=4){ \
3714 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
3715 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
3725 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3730 for(x=0; x<16; x++){
3731 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3740 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3744 for(i=0; i<size; i++)
3745 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
3749 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3750 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3751 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3753 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3755 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3756 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3757 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3758 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
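/* WRAPPER8_16_SQ builds the 16-wide comparison functions by accumulating the 8x8 metric
 * over the 8x8 sub-blocks of the 16xh area. */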
3760 static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
3762 for(i=0; i<len; i++)
3763 dst[i] = src0[i] * src1[i];
3766 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3769 for(i=0; i<len; i++)
3770 dst[i] = src0[i] * src1[-i];
3773 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
3775 for(i=0; i<len; i++)
3776 dst[i] = src0[i] * src1[i] + src2[i];
3779 static void vector_fmul_window_c(float *dst, const float *src0,
3780 const float *src1, const float *win, int len)
3786 for(i=-len, j=len-1; i<0; i++, j--) {
3791 dst[i] = s0*wj - s1*wi;
3792 dst[j] = s0*wi + s1*wj;
3796 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
3800 for (i = 0; i < len; i++)
3801 dst[i] = src[i] * mul;
3804 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
3805 const float **sv, float mul, int len)
3808 for (i = 0; i < len; i += 2, sv++) {
3809 dst[i ] = src[i ] * sv[0][0] * mul;
3810 dst[i+1] = src[i+1] * sv[0][1] * mul;
3814 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
3815 const float **sv, float mul, int len)
3818 for (i = 0; i < len; i += 4, sv++) {
3819 dst[i ] = src[i ] * sv[0][0] * mul;
3820 dst[i+1] = src[i+1] * sv[0][1] * mul;
3821 dst[i+2] = src[i+2] * sv[0][2] * mul;
3822 dst[i+3] = src[i+3] * sv[0][3] * mul;
3826 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
3830 for (i = 0; i < len; i += 2, sv++) {
3831 dst[i ] = sv[0][0] * mul;
3832 dst[i+1] = sv[0][1] * mul;
3836 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
3840 for (i = 0; i < len; i += 4, sv++) {
3841 dst[i ] = sv[0][0] * mul;
3842 dst[i+1] = sv[0][1] * mul;
3843 dst[i+2] = sv[0][2] * mul;
3844 dst[i+3] = sv[0][3] * mul;
3848 static void butterflies_float_c(float *restrict v1, float *restrict v2,
3852 for (i = 0; i < len; i++) {
3853 float t = v1[i] - v2[i];
3859 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
3864 for (i = 0; i < len; i++)
3870 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
3871 uint32_t maxi, uint32_t maxisign)
3874 if(a > mini) return mini;
3875 else if((a^(1<<31)) > maxisign) return maxi;
3879 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
3881 uint32_t mini = *(uint32_t*)min;
3882 uint32_t maxi = *(uint32_t*)max;
3883 uint32_t maxisign = maxi ^ (1<<31);
3884 uint32_t *dsti = (uint32_t*)dst;
3885 const uint32_t *srci = (const uint32_t*)src;
3886 for(i=0; i<len; i+=8) {
3887 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
3888 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
3889 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
3890 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
3891 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
3892 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
3893 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
3894 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
3897 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
3899 if(min < 0 && max > 0) {
3900 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
3902 for(i=0; i < len; i+=8) {
3903 dst[i ] = av_clipf(src[i ], min, max);
3904 dst[i + 1] = av_clipf(src[i + 1], min, max);
3905 dst[i + 2] = av_clipf(src[i + 2], min, max);
3906 dst[i + 3] = av_clipf(src[i + 3], min, max);
3907 dst[i + 4] = av_clipf(src[i + 4], min, max);
3908 dst[i + 5] = av_clipf(src[i + 5], min, max);
3909 dst[i + 6] = av_clipf(src[i + 6], min, max);
3910 dst[i + 7] = av_clipf(src[i + 7], min, max);
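/* Float clipping: when min < 0 < max, vector_clipf_c_opposite_sign treats the floats as
 * their IEEE-754 bit patterns; positive values order like unsigned integers and negative
 * values order by magnitude, so clipf_c_one can clamp with two integer compares per
 * element instead of float compares. Otherwise the generic av_clipf loop is used. */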
3915 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
3920 res += (*v1++ * *v2++) >> shift;
3925 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
3930 *v1++ += mul * *v3++;
3936 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3937 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3938 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3939 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3940 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3941 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3942 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8; //1,3,5,7
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}

static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}

void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
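
/* The 8x8 WMV2 inverse transform is separable: eight 1-D row transforms
   followed by eight 1-D column transforms, applied in place on the block. */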

/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
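
/* The idct4/idct2/idct1 wrappers back the lowres decoding paths below; the
   1x1 case keeps only the DC coefficient, rescaled as (dc + 4) >> 3 and
   clamped through the crop table. */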

static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
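
/* ff_cropTbl is indexed through cm = ff_cropTbl + MAX_NEG_CROP so that
   slightly out-of-range sums clamp to 0/255 without branching, and
   ff_squareTbl[x + 256] yields x*x for signed differences. */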
/* init static data */
av_cold void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}

int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED(16, int, aligned);

    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}

av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#if CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct    = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct    = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct    = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
            c->idct     = ff_bink_idct_c;
            c->idct_add = ff_bink_idct_add_c;
            c->idct_put = ff_bink_idct_put_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->emulated_edge_mc = ff_emulated_edge_mc;
    c->clear_block = clear_block_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->scale_block = scale_block_c;

    /* TODO [0] 16 [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);

#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
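
    /* Thirdpel table layout above (used by the SVQ3 decoder): index = x + 4*y
       with x,y in 0..2 giving the fractional offset in thirds of a pel, so
       entries 3 and 7 are intentionally left unset. */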

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
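
    /* mcXY naming: X and Y are the quarter-pel offsets (0..3) in the
       horizontal and vertical direction; the table index is X + 4*Y,
       covering the 16 subpixel positions of one qpel table. */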

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc

    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
    c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;

    c->draw_edges = draw_edges_c;

#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_VC1_DECODER
    ff_vc1dsp_init(c,avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#endif
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif

    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
    SET_CMP_FUNC(dct264_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    SET_CMP_FUNC(quant_psnr)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;

    ff_dsputil_init_dwt(c);

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
#if CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;

    c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
    c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;

    c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
    c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;

    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    if (HAVE_MMX)    dsputil_init_mmx  (c, avctx);
    if (ARCH_ARM)    dsputil_init_arm  (c, avctx);
    if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
    if (HAVE_VIS)    dsputil_init_vis  (c, avctx);
    if (ARCH_ALPHA)  dsputil_init_alpha(c, avctx);
    if (ARCH_PPC)    dsputil_init_ppc  (c, avctx);
    if (HAVE_MMI)    dsputil_init_mmi  (c, avctx);
    if (ARCH_SH4)    dsputil_init_sh4  (c, avctx);
    if (ARCH_BFIN)   dsputil_init_bfin (c, avctx);

    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];

    c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
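
    /* idct_permutation records the coefficient order the selected IDCT
       expects, so scantables can be permuted once at init time instead of
       reordering every block during decoding. */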
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");