/*
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/imgutils.h"
#include "simple_idct.h"
#include "mpegvideo.h"
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
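
/* Worked example: with a 32-bit unsigned long, ~0UL/255 = 0x01010101, so
 * pb_7f = 0x01010101 * 0x7f = 0x7f7f7f7f and pb_80 = 0x80808080; with a
 * 64-bit unsigned long the same expressions replicate the byte across all
 * eight lanes. Dividing the all-ones word by 255 is what produces the
 * 0x01...01 per-byte replication pattern. */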
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
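
/* Reading example: ff_zigzag_direct[2] == 8, i.e. the third coefficient
 * visited by the scan lives at raster position 8 (row 1, column 0), so the
 * scan walks the anti-diagonals 0 -> 1 -> 8 -> 16 -> 9 -> 2 -> ... and
 * collects low-frequency coefficients first. */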
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i, j, end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
#if ARCH_PPC
        st->inverse[j] = i;
#endif
    }

    end=-1;
    for(i=0; i<64; i++){
        j = st->permutated[i];
        if(j>end) end=j;
        st->raster_end[i]= end;
    }
}
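
/* Illustration (assuming an identity permutation): st->permutated then
 * equals the scan itself and st->raster_end[i] is the largest raster index
 * among the first i+1 scan positions; for ff_zigzag_direct that is
 * 0, 1, 8, 16, 16, ... It bounds the block area touched by the first i+1
 * coefficients of a partially coded block. */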
static int pix_sum_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0] + pix[1] + pix[2] + pix[3] +
                 pix[4] + pix[5] + pix[6] + pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
static int pix_norm1_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
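
/* Note on the biased pointer: ff_squareTbl holds (i-256)*(i-256), so with
 * sq = ff_squareTbl + 256, sq[d] == d*d for d in [-255, 255]. That is why
 * the sse*_c functions below can square a signed pixel difference with a
 * single table load, e.g. sq[3 - 10] == 49. */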
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= av_bswap32(src[i+0]);
    }
}
static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
{
    while (len--)
        *dst++ = av_bswap16(*src++);
}
static int sse4_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int sse8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }

    /* top and bottom + corners */
    buf -= w;
    last_line = buf + (height - 1) * wrap;
    if (sides & EDGE_TOP)
        for(i = 0; i < w; i++)
            memcpy(buf - (i + 1) * wrap, buf, width + w + w); // top
    if (sides & EDGE_BOTTOM)
        for (i = 0; i < w; i++)
            memcpy(last_line + (i + 1) * wrap, last_line, width + w + w); // bottom
}
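
/* Illustration: with w == 2 the left/right pass turns each row
 * |abc...xyz| into |aa|abc...xyz|zz|, and the top/bottom pass then
 * replicates the already-widened first and last rows w times outwards,
 * which also fills the four corner regions. */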
/**
 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);
    assert(start_y < end_y && block_h);
    assert(start_x < end_x && block_w);

    w    = end_x - start_x;
    src += start_y*linesize + start_x;
    buf += start_x;

    // top
    for(y=0; y<start_y; y++){
        memcpy(buf, src, w);
        buf += linesize;
    }

    // copy existing part
    for(; y<end_y; y++){
        memcpy(buf, src, w);
        src += linesize;
        buf += linesize;
    }

    // bottom
    src -= linesize;
    for(; y<block_h; y++){
        memcpy(buf, src, w);
        buf += linesize;
    }

    buf -= block_h * linesize + start_x;
    while (block_h--){
        // left
        for(x=0; x<start_x; x++){
            buf[x] = buf[start_x];
        }

        // right
        for(x=end_x; x<block_w; x++){
            buf[x] = buf[end_x - 1];
        }
        buf += linesize;
    }
}
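
/* Usage sketch (hypothetical names): for a 17x17 interpolation source
 * whose top-left corner lies 3 pixels left of the picture (src_x = -3):
 *
 *   ff_emulated_edge_mc(edge_buf, pic + src_y*linesize + src_x, linesize,
 *                       17, 17, -3, src_y, pic_width, pic_height);
 *
 * Motion compensation then reads from edge_buf; its columns 0..2 are
 * copies of picture column 0. */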
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                             int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}
void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
                                    uint8_t *restrict pixels,
                                    int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = block[0];
        pixels[1] = block[1];
        pixels[2] = block[2];
        pixels[3] = block[3];
        pixels[4] = block[4];
        pixels[5] = block[5];
        pixels[6] = block[6];
        pixels[7] = block[7];

        pixels += line_size;
        block += 8;
    }
}
void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                             int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];

        pixels += line_size;
        block += 8;
    }
}
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];

        pixels += line_size;
        block += 8;
    }
}
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];

        pixels += line_size;
        block += 8;
    }
}
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;

    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;

    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;
    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}
static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}
static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}
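
/* The multiply is a byte duplicator: src[i] * 0x0101 copies the byte into
 * both halves of a uint16_t (0xAB * 0x0101 == 0xABAB), and storing it to
 * dst1 and dst2 duplicates it vertically too, so every source pixel
 * becomes a 2x2 block: a cheap nearest-neighbour 2x upscale. */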
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        uint64_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
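
/* The byte-parallel averaging in both PIXOP2 variants rests on two
 * identities, applied per byte lane. Example with a = 3, b = 4:
 *   rounded:   (a|b) - (((a^b)&0xFE)>>1) = 7 - 3 = 4 = (3+4+1)>>1
 *   truncated: (a&b) + (((a^b)&0xFE)>>1) = 0 + 3 = 3 = (3+4)>>1
 * Masking a^b with 0xFE...FE before the shift keeps bits from leaking
 * between neighbouring bytes, so one register-wide operation averages
 * 4 (or 8) pixels at once. */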
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
#define put_no_rnd_pixels8_c  put_pixels8_c
#define put_no_rnd_pixels16_c put_pixels16_c

#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
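
/* gmc1_c below is one bilinear fetch with 1/16-pel weights; the weights
 * always satisfy A+B+C+D = 16*16 = 256, hence the >>8. Example: the
 * half-pel position x16 = y16 = 8 gives A = B = C = D = 64, a plain
 * average of the four neighbouring samples. */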
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
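
/* ff_gmc_c applies an affine global-motion warp: in 1/(1<<shift) units the
 * source position of destination pixel (x, y) is
 *   sx = ox + dxx*x + dxy*y,   sy = oy + dyx*x + dyy*y
 * (accumulated incrementally in vx/vy below), followed by bilinear
 * interpolation on the frac_x/frac_y remainders, with clipping at the
 * picture edges. */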
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
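
/* The 1/3-pel interpolators below replace a division by 3 with a multiply
 * and shift: 683 = (2^11 + 1)/3 and 2731 = (2^13 + 1)/3, so for example
 * (683*(2*a + b + 1)) >> 11 is (2*a + b)/3 rounded to nearest, exact over
 * the 8-bit input range; a = 90, b = 120: 683*301 >> 11 = 100. */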
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}

#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
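
/* Weight check: A+B+C+D = (8-x)*(8-y) + x*(8-y) + (8-x)*y + x*y = 64, and
 * op_put adds 32 before the >>6, so each output is a correctly rounded
 * bilinear sample. Example: x = y = 4 gives A = B = C = D = 16, a plain
 * four-sample average. */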
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
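\
/* Filter sanity check: on a flat signal src[i] == p each row above sums to\
 * p*(40 - 12 + 6 - 2) = 32*p, and the OP macros (as instantiated further\
 * down with cm[(x + 16) >> 5], or +15 for the no_rnd variants) renormalize\
 * by 32, so flat areas pass through unchanged. The asymmetric taps in rows\
 * 5..7 mirror the block edge instead of reading beyond src[8]. */\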
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
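\
/* The qpelN_mcXY_c wrappers below compose these filters; X/Y are the\
 * horizontal/vertical offsets in quarter pixels. mc20 is the plain\
 * half-pel horizontal filter; mc10/mc30 average it with the nearest\
 * integer column via pixels8_l2; the 2D cases filter horizontally into a\
 * temporary and run the vertical filter on that. */\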
1724 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1726 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1727 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1730 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1731 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1734 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1736 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1737 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1740 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1741 uint8_t full[16*9];\
1743 copy_block9(full, src, 16, stride, 9);\
1744 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1745 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1748 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1749 uint8_t full[16*9];\
1750 copy_block9(full, src, 16, stride, 9);\
1751 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1754 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1755 uint8_t full[16*9];\
1757 copy_block9(full, src, 16, stride, 9);\
1758 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1759 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1761 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1762 uint8_t full[16*9];\
1765 uint8_t halfHV[64];\
1766 copy_block9(full, src, 16, stride, 9);\
1767 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1768 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1769 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1770 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1772 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1773 uint8_t full[16*9];\
1775 uint8_t halfHV[64];\
1776 copy_block9(full, src, 16, stride, 9);\
1777 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1778 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1779 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1780 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1782 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1783 uint8_t full[16*9];\
1786 uint8_t halfHV[64];\
1787 copy_block9(full, src, 16, stride, 9);\
1788 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1789 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1790 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1791 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1793 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1794 uint8_t full[16*9];\
1796 uint8_t halfHV[64];\
1797 copy_block9(full, src, 16, stride, 9);\
1798 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1799 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1800 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1801 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1803 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1804 uint8_t full[16*9];\
1807 uint8_t halfHV[64];\
1808 copy_block9(full, src, 16, stride, 9);\
1809 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1810 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1811 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1812 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1814 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1815 uint8_t full[16*9];\
1817 uint8_t halfHV[64];\
1818 copy_block9(full, src, 16, stride, 9);\
1819 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1820 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1821 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1822 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1824 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1825 uint8_t full[16*9];\
1828 uint8_t halfHV[64];\
1829 copy_block9(full, src, 16, stride, 9);\
1830 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1831 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1832 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1833 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1835 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1836 uint8_t full[16*9];\
1838 uint8_t halfHV[64];\
1839 copy_block9(full, src, 16, stride, 9);\
1840 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1841 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1842 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1843 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1845 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1847 uint8_t halfHV[64];\
1848 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1849 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1850 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1852 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1854 uint8_t halfHV[64];\
1855 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1856 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1857 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1859 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1860 uint8_t full[16*9];\
1863 uint8_t halfHV[64];\
1864 copy_block9(full, src, 16, stride, 9);\
1865 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1866 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1867 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1868 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1870 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1871 uint8_t full[16*9];\
1873 copy_block9(full, src, 16, stride, 9);\
1874 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1875 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1876 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1878 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1879 uint8_t full[16*9];\
1882 uint8_t halfHV[64];\
1883 copy_block9(full, src, 16, stride, 9);\
1884 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1885 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1886 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1887 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1889 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1890 uint8_t full[16*9];\
1892 copy_block9(full, src, 16, stride, 9);\
1893 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1894 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1895 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1897 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1899 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1900 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1903 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1905 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1906 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1909 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1910 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1913 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1915 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1916 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1919 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1920 uint8_t full[24*17];\
1922 copy_block17(full, src, 24, stride, 17);\
1923 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1924 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1927 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1928 uint8_t full[24*17];\
1929 copy_block17(full, src, 24, stride, 17);\
1930 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1933 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1934 uint8_t full[24*17];\
1936 copy_block17(full, src, 24, stride, 17);\
1937 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1938 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
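/* The rounding ops below feed the QPEL_MC template above. The MPEG-4
 * quarter-pel lowpass filter has a coefficient sum of 32, so (b + 16) >> 5
 * rounds the filtered sum to nearest, while the no_rnd variants add only 15
 * and round down, as MPEG-4's rounding control requires. */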
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd

#define put_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_qpel16_mc00_c ff_avg_pixels16x16_c
#define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
#define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
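/* H264_LOWPASS generates the H.264 half-pel interpolators: the 6-tap
 * (1, -5, 20, 20, -5, 1) filter applied horizontally (OP), vertically (OP),
 * or in both directions through the int16_t tmp plane (OP2). One pass has a
 * gain of 32, normalized by (b + 16) >> 5; the separable two-pass version
 * has a gain of 1024, normalized by (b + 512) >> 10 in the op2_* macros. */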
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}
#define H264_MC(OPNAME, SIZE) \
static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
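/* H264_MC instantiates all 16 fractional positions (mcXY, where X and Y are
 * the quarter-pel offsets) for one block size: mc00 is a plain copy, the
 * half-pel positions use the lowpass filters above, and the remaining
 * quarter-pel positions average two half-pel planes with pixels_l2(). */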
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b) a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 2)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put

#define put_h264_qpel8_mc00_c  ff_put_pixels8x8_c
#define avg_h264_qpel8_mc00_c  ff_avg_pixels8x8_c
#define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
#define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}
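/* WMV2 mspel interpolation uses a 4-tap (-1, 9, 9, -1) half-pel filter; the
 * coefficient sum is 16, hence the (... + 8) >> 4 rounding above. */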
void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#if CONFIG_RV40_DECODER
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
#endif /* CONFIG_RV40_DECODER */
static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
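/* H.263 in-loop deblocking: for each line across the block edge, d measures
 * the discontinuity between the four pixels p0..p3 straddling the edge, d1
 * is a ramp function of d scaled by the qscale-dependent strength, and the
 * two pixels next to the edge (p1, p2) are corrected by d1, while the outer
 * pixels (p0, p3) receive a smaller correction d2 clipped to half of |d1|. */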
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
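/* The pix_abs* functions below are the C reference SAD comparators used by
 * motion estimation: plain, horizontal half-pel (_x2), vertical half-pel
 * (_y2) and diagonal half-pel (_xy2), with avg2/avg4 providing the rounded
 * half-pel interpolation of the reference block. */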
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
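/* NSSE ("noise preserving" SSE) mixes plain SSE with the difference in
 * horizontal gradients between the two blocks, weighted by
 * avctx->nsse_weight (8 when no context is available), so that blocks which
 * lose texture are penalized even when their plain SSE is small. */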
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
/**
 * permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permutated to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
#if CONFIG_GPL
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
#endif
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    {
        for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
            long a = *(long*)(src1+i);
            long b = *(long*)(src2+i);
            *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
        }
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}

static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}

#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r,g,b,a;
    r= *red;
    g= *green;
    b= *blue;
    a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];
        a+= src[4*i+A];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
        dst[4*i+A]= a;
    }

    *red= r;
    *green= g;
    *blue= b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
    return sum;
}
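/* hadamard8_diff applies an 8x8 Hadamard transform to the difference between
 * src and dst and sums the absolute transform coefficients: the classic SATD
 * metric. The intra variant below transforms the source block itself and
 * subtracts the DC contribution, since the mean carries no texture. */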
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
#if CONFIG_GPL
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
#endif /* CONFIG_GPL */
static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int sum=0, i;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);

    for(i=0; i<64; i++)
        sum= FFMAX(sum, FFABS(temp[i]));

    return sum;
}
static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0;

    s->dsp.diff_pixels(temp, src1, src2, stride);

    memcpy(bak, temp, 64*sizeof(DCTELEM));

    s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp); //FIXME

    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64;
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length;
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64);

        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSAD_INTRA(8)
VSAD_INTRA(16)

static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
#define SQ(a) ((a)*(a))
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
    \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
    \
    return score; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)

static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score=0;
    int i;

    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);

    return score;
}
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
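/* WRAPPER8_16_SQ builds the 16x16 variant of each 8x8 comparator by summing
 * the scores of the four 8x8 sub-blocks. The float vector primitives that
 * follow are shared with the audio codecs. */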
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i];
}

static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}

static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}

static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
    }
}

static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}

static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}

static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}

static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}

static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}

static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1<<31)) > maxisign) return maxi;
    else return a;
}

static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}

static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}

static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w       = window[i];
        output[i]       = (MUL16(input[i], w) + (1 << 14)) >> 15;
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
    }
}
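/* apply_window_int16 multiplies Q15 samples by a symmetric Q15 window from
 * both ends at once; (x*w + (1 << 14)) >> 15 is a rounded Q15 multiply. The
 * W1..W7 constants below are 2048*sqrt(2)*cos(k*pi/16) fixed-point scale
 * factors for the WMV2 IDCT. */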
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
3965 void ff_wmv2_idct_c(short * block){
3969 wmv2_idct_row(block+i);
3972 wmv2_idct_col(block+i);
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
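/* In the 1-point case only the DC coefficient survives, so the "IDCT" is
 * just the final /8 normalization, (block[0] + 4) >> 3 with rounding,
 * clamped to 0..255 through ff_cropTbl. */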
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
/* init static data */
av_cold void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
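/* inv_zigzag_direct16 maps a raster position back to its 1-based position in
 * scan order, e.g. ff_zigzag_direct[2] == 8 so inv_zigzag_direct16[8] == 3;
 * the +1 bias keeps 0 free as a "coefficient not reached" marker for the MMX
 * quantizer. */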
int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED(16, int, aligned);

    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#if CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
            c->idct = ff_bink_idct_c;
            c->idct_add = ff_bink_idct_add_c;
            c->idct_put = ff_bink_idct_put_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }
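    /* Each lowres level halves the output in both dimensions: levels 1, 2
     * and 3 use the scaled-down 4-, 2- and 1-point j_rev_dct variants, so
     * only the top-left 4x4, 2x2 or single coefficient of each 8x8 block
     * contributes. */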
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = ff_put_pixels_clamped_c;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
    c->add_pixels_clamped = ff_add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->emulated_edge_mc = ff_emulated_edge_mc;

    c->clear_block = clear_block_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->scale_block = scale_block_c;
    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
#undef dspfunc
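    /* pixels_tab layout: the first index selects the block width (0 = 16,
     * 1 = 8, 2 = 4, 3 = 2 pixels) and the second the half-pel position
     * (0 = integer, 1 = x half, 2 = y half, 3 = x+y half), so e.g.
     * c->put_pixels_tab[1][3] is the 8-pixel-wide xy-interpolating put. */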
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
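    /* tpel = third-pel motion compensation (used by the SVQ3 decoder); the
     * table index is x + 4*y with x,y in 0..2 thirds, which is why entries
     * 3, 7 and 11 are left unset. */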
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);
#undef dspfunc
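    /* In the mcXY names, X and Y are the quarter-pel phases (0..3) in the
     * horizontal and vertical direction: mc00 is the integer-position copy,
     * mc20 the horizontal half-pel and mc22 the centre position. */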
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->draw_edges = draw_edges_c;

#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#endif
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif
    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
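    /* Comparison-function slots: index 0 compares 16x16 blocks and index 1
     * 8x8 blocks; slots 4 and 5, where set, hold the intra (reference-free)
     * 16x16 and 8x8 variants. */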
#if CONFIG_DWT
    ff_dsputil_init_dwt(c);
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
    c->bswap16_buf = bswap16_buf;
#if CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif
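    /* The HuffYUV median predictor takes median(left, top, left + top -
     * topleft) per sample; PNG's Paeth predictor instead picks whichever of
     * left, top and topleft lies closest to left + top - topleft. */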
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;
    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = vector_fmul_window_c;
    c->vector_clipf = vector_clipf_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->apply_window_int16 = apply_window_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;

    c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
    c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;

    c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
    c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;
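    /* shrink[n] downscales a plane by 2^n in each dimension: level 0 is a
     * plain copy, while ff_shrink22/44/88 average 2x2, 4x4 and 8x8 source
     * blocks per output pixel. */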
    c->prefetch= just_return;

    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }
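    /* Any 2-tap qpel entry the arch-specific inits above left NULL falls
     * back to the H.264 qpel function for the same position; running i up
     * to 63 through [0][i] walks all four sub-tables via the array's
     * contiguous layout. */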
    c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];

    c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
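        /* e.g. i = 10 (row 1, column 2) maps to ((10&7)<<3)|(10>>3) = 17
         * (row 2, column 1): the scantable is stored transposed for IDCTs
         * that operate on transposed coefficients. */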
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}