3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Clipping lookup table: used via ff_cropTbl + MAX_NEG_CROP so that indices in
 * [-MAX_NEG_CROP, 255 + MAX_NEG_CROP) clamp to [0, 255].
 * NOTE(review): zero-initialized here; presumably filled by an init routine
 * outside this listing -- confirm. */
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square lookup table; used below as ff_squareTbl + 256 so negative pixel
 * differences index correctly (see sse*_c / pix_norm1_c). Also filled at init. */
44 uint32_t ff_squareTbl[512] = {0, };
46 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 == 0x0101...01, so multiplying by a byte value broadcasts that byte
 * into every byte of an unsigned long. */
47 #define pb_7f (~0UL/255 * 0x7f)
48 #define pb_80 (~0UL/255 * 0x80)
/* Standard JPEG/MPEG zigzag scan: maps scan position to raster index within
 * an 8x8 coefficient block. */
50 const uint8_t ff_zigzag_direct[64] = {
51 0, 1, 8, 16, 9, 2, 3, 10,
52 17, 24, 32, 25, 18, 11, 4, 5,
53 12, 19, 26, 33, 40, 48, 41, 34,
54 27, 20, 13, 6, 7, 14, 21, 28,
55 35, 42, 49, 56, 57, 50, 43, 36,
56 29, 22, 15, 23, 30, 37, 44, 51,
57 58, 59, 52, 45, 38, 31, 39, 46,
58 53, 60, 61, 54, 47, 55, 62, 63
61 /* Specific zigzag scan for 248 idct. NOTE that unlike the
62 specification, we interleave the fields */
/* Scan order for the 2-4-8 (field) IDCT; entries again index the 8x8 block
 * in raster order, with the two fields interleaved row by row. */
63 const uint8_t ff_zigzag248_direct[64] = {
64 0, 8, 1, 9, 16, 24, 2, 10,
65 17, 25, 32, 40, 48, 56, 33, 41,
66 18, 26, 3, 11, 4, 12, 19, 27,
67 34, 42, 49, 57, 50, 58, 35, 43,
68 20, 28, 5, 13, 6, 14, 21, 29,
69 36, 44, 51, 59, 52, 60, 37, 45,
70 22, 30, 7, 15, 23, 31, 38, 46,
71 53, 61, 54, 62, 39, 47, 55, 63,
74 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): declared zero here and presumably filled at init time by code
 * outside this listing -- confirm. 16-byte aligned for SIMD loads. */
75 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate (horizontal-biased) scan order, e.g. for interlaced material. */
77 const uint8_t ff_alternate_horizontal_scan[64] = {
78 0, 1, 2, 3, 8, 9, 16, 17,
79 10, 11, 4, 5, 6, 7, 15, 14,
80 13, 12, 19, 18, 24, 25, 32, 33,
81 26, 27, 20, 21, 22, 23, 28, 29,
82 30, 31, 34, 35, 40, 41, 48, 49,
83 42, 43, 36, 37, 38, 39, 44, 45,
84 46, 47, 50, 51, 56, 57, 58, 59,
85 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate (vertical-biased) scan order, e.g. for interlaced material. */
88 const uint8_t ff_alternate_vertical_scan[64] = {
89 0, 8, 16, 24, 1, 9, 2, 10,
90 17, 25, 32, 40, 48, 56, 57, 49,
91 41, 33, 26, 18, 3, 11, 4, 12,
92 19, 27, 34, 42, 50, 58, 35, 43,
93 51, 59, 20, 28, 5, 13, 6, 14,
94 21, 29, 36, 44, 52, 60, 37, 45,
95 53, 61, 22, 30, 7, 15, 23, 31,
96 38, 46, 54, 62, 39, 47, 55, 63,
99 /* Input permutation for the simple_idct_mmx */
/* Each entry packs a (row, column) pair: high nibble = row * 8 region,
 * low nibble mixes the column reordering the MMX IDCT expects.
 * NOTE(review): exact packing convention is defined by simple_idct_mmx,
 * which is outside this listing -- confirm there before relying on it. */
100 static const uint8_t simple_mmx_permutation[64]={
101 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
102 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
103 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
104 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
105 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
106 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
107 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
108 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Row permutation applied for the SSE2 IDCT: interleaves rows 0-3 with 4-7. */
111 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Initialize a ScanTable: store the raw scan order, build the CPU-permuted
 * scan (applying the IDCT input permutation to each entry), and fill the
 * raster_end helper table.
 * NOTE(review): several statements (loop headers, the raster-end computation)
 * are elided in this listing -- the comments below describe only what is
 * visible. */
113 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
117 st->scantable= src_scantable;
/* permuted[i] = permutation[src_scantable[i]] -- scan order remapped for the
 * active IDCT's input layout */
121 j = src_scantable[i];
122 st->permutated[i] = permutation[j];
/* raster_end[] is derived from the permuted entries (details elided here). */
131 j = st->permutated[i];
133 st->raster_end[i]= end;
/* Sum of all pixel values of a 16x16 block.
 * NOTE(review): the accumulator declaration, the unrolled adds inside the
 * inner loop, and the return statement are elided in this listing. */
137 static int pix_sum_c(uint8_t * pix, int line_size)
142 for (i = 0; i < 16; i++) {
143 for (j = 0; j < 16; j += 8) {
/* 16 pixels of the row were consumed above; step to the next row */
154 pix += line_size - 16;
/* Sum of squared pixel values of a 16x16 block, via the ff_squareTbl lookup
 * (offset by 256 so signed indices would also resolve).
 * NOTE(review): accumulator declaration and return are elided; the #if below
 * selects a word-at-a-time variant by native long width. */
159 static int pix_norm1_c(uint8_t * pix, int line_size)
162 uint32_t *sq = ff_squareTbl + 256;
165 for (i = 0; i < 16; i++) {
166 for (j = 0; j < 16; j += 8) {
/* 64-bit longs: load 8 pixels at once and square each byte via the table */
177 #if LONG_MAX > 2147483647
178 register uint64_t x=*(uint64_t*)pix;
180 s += sq[(x>>8)&0xff];
181 s += sq[(x>>16)&0xff];
182 s += sq[(x>>24)&0xff];
183 s += sq[(x>>32)&0xff];
184 s += sq[(x>>40)&0xff];
185 s += sq[(x>>48)&0xff];
186 s += sq[(x>>56)&0xff];
/* 32-bit longs: two 4-byte loads cover the same 8 pixels */
188 register uint32_t x=*(uint32_t*)pix;
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 x=*(uint32_t*)(pix+4);
195 s += sq[(x>>8)&0xff];
196 s += sq[(x>>16)&0xff];
197 s += sq[(x>>24)&0xff];
/* advance to the next row after consuming 16 pixels */
202 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst (in-place safe since each word
 * is read before written). Main loop is unrolled by 8; the tail loop handling
 * the remaining w%8 words is partially elided in this listing. */
207 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
210 for(i=0; i+8<=w; i+=8){
211 dst[i+0]= av_bswap32(src[i+0]);
212 dst[i+1]= av_bswap32(src[i+1]);
213 dst[i+2]= av_bswap32(src[i+2]);
214 dst[i+3]= av_bswap32(src[i+3]);
215 dst[i+4]= av_bswap32(src[i+4]);
216 dst[i+5]= av_bswap32(src[i+5]);
217 dst[i+6]= av_bswap32(src[i+6]);
218 dst[i+7]= av_bswap32(src[i+7]);
/* tail: remaining words one at a time (enclosing loop elided here) */
221 dst[i+0]= av_bswap32(src[i+0]);
/* Byte-swap len 16-bit values from src into dst (loop header elided here). */
225 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
228 *dst++ = av_bswap16(*src++);
/* Sum of squared errors, 4 pixels wide, h rows. The +256 bias on sq lets the
 * possibly negative byte difference index the table directly.
 * NOTE(review): accumulator/stride-advance/return lines are elided here. */
231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
234 uint32_t *sq = ff_squareTbl + 256;
237 for (i = 0; i < h; i++) {
238 s += sq[pix1[0] - pix2[0]];
239 s += sq[pix1[1] - pix2[1]];
240 s += sq[pix1[2] - pix2[2]];
241 s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors, 8 pixels wide, h rows (same scheme as sse4_c).
 * NOTE(review): accumulator/stride-advance/return lines are elided here. */
248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
251 uint32_t *sq = ff_squareTbl + 256;
254 for (i = 0; i < h; i++) {
255 s += sq[pix1[0] - pix2[0]];
256 s += sq[pix1[1] - pix2[1]];
257 s += sq[pix1[2] - pix2[2]];
258 s += sq[pix1[3] - pix2[3]];
259 s += sq[pix1[4] - pix2[4]];
260 s += sq[pix1[5] - pix2[5]];
261 s += sq[pix1[6] - pix2[6]];
262 s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors, 16 pixels wide, h rows (same scheme as sse4_c).
 * NOTE(review): accumulator/stride-advance/return lines are elided here. */
269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
272 uint32_t *sq = ff_squareTbl + 256;
275 for (i = 0; i < h; i++) {
276 s += sq[pix1[ 0] - pix2[ 0]];
277 s += sq[pix1[ 1] - pix2[ 1]];
278 s += sq[pix1[ 2] - pix2[ 2]];
279 s += sq[pix1[ 3] - pix2[ 3]];
280 s += sq[pix1[ 4] - pix2[ 4]];
281 s += sq[pix1[ 5] - pix2[ 5]];
282 s += sq[pix1[ 6] - pix2[ 6]];
283 s += sq[pix1[ 7] - pix2[ 7]];
284 s += sq[pix1[ 8] - pix2[ 8]];
285 s += sq[pix1[ 9] - pix2[ 9]];
286 s += sq[pix1[10] - pix2[10]];
287 s += sq[pix1[11] - pix2[11]];
288 s += sq[pix1[12] - pix2[12]];
289 s += sq[pix1[13] - pix2[13]];
290 s += sq[pix1[14] - pix2[14]];
291 s += sq[pix1[15] - pix2[15]];
299 /* draw the edges of width 'w' of an image of size width, height */
300 //FIXME check that this is ok for mpeg4 interlaced
301 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
303 uint8_t *ptr, *last_line;
306 last_line = buf + (height - 1) * wrap;
/* replicate the first and last rows upward/downward (loop header elided) */
309 memcpy(buf - (i + 1) * wrap, buf, width);
310 memcpy(last_line + (i + 1) * wrap, last_line, width);
/* extend every row leftward and rightward with its edge pixel */
314 for(i=0;i<height;i++) {
315 memset(ptr - w, ptr[0], w);
316 memset(ptr + width, ptr[width-1], w);
/* fill the four corner regions with the corner pixels */
321 memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
322 memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
323 memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
324 memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
329 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
330 * @param buf destination buffer
331 * @param src source buffer
332 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
333 * @param block_w width of block
334 * @param block_h height of block
335 * @param src_x x coordinate of the top left sample of the block in the source buffer
336 * @param src_y y coordinate of the top left sample of the block in the source buffer
337 * @param w width of the source buffer
338 * @param h height of the source buffer
340 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
341 int src_x, int src_y, int w, int h){
343 int start_y, start_x, end_y, end_x;
/* Clamp a block that lies entirely outside the picture onto the nearest
 * valid row/column (the matching if-conditions are partially elided here). */
346 src+= (h-1-src_y)*linesize;
348 }else if(src_y<=-block_h){
349 src+= (1-block_h-src_y)*linesize;
355 }else if(src_x<=-block_w){
356 src+= (1-block_w-src_x);
/* Portion of the block that overlaps the valid picture area: */
360 start_y= FFMAX(0, -src_y);
361 start_x= FFMAX(0, -src_x);
362 end_y= FFMIN(block_h, h-src_y);
363 end_x= FFMIN(block_w, w-src_x);
/* after the clamping above there must be at least one valid row and column */
364 assert(start_y < end_y && block_h);
365 assert(start_x < end_x && block_w);
368 src += start_y*linesize + start_x;
/* top replication: rows before start_y copy the first valid row (body elided) */
372 for(y=0; y<start_y; y++){
377 // copy existing part
/* bottom replication: rows from end_y copy the last valid row (body elided) */
386 for(; y<block_h; y++){
391 buf -= block_h * linesize + start_x;
/* left edge: replicate the first valid column */
394 for(x=0; x<start_x; x++){
395 buf[x] = buf[start_x];
/* right edge: replicate the last valid column */
399 for(x=end_x; x<block_w; x++){
400 buf[x] = buf[end_x - 1];
/* Widen one 8-pixel row from uint8_t to DCTELEM (per-row pointer advance and
 * the enclosing 8-row loop are elided in this listing). */
406 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
410 /* read the pixels */
412 block[0] = pixels[0];
413 block[1] = pixels[1];
414 block[2] = pixels[2];
415 block[3] = pixels[3];
416 block[4] = pixels[4];
417 block[5] = pixels[5];
418 block[6] = pixels[6];
419 block[7] = pixels[7];
/* Per-pixel difference s1 - s2 of one 8-pixel row into a DCTELEM block
 * (enclosing row loop and pointer advances elided in this listing). */
425 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
426 const uint8_t *s2, int stride){
429 /* read the pixels */
431 block[0] = s1[0] - s2[0];
432 block[1] = s1[1] - s2[1];
433 block[2] = s1[2] - s2[2];
434 block[3] = s1[3] - s2[3];
435 block[4] = s1[4] - s2[4];
436 block[5] = s1[5] - s2[5];
437 block[6] = s1[6] - s2[6];
438 block[7] = s1[7] - s2[7];
/* Store one 8-wide row of DCTELEMs as pixels, clamped to [0,255] via the
 * biased crop table (row loop and pointer advances elided in this listing). */
446 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
450 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
452 /* read the pixels */
454 pixels[0] = cm[block[0]];
455 pixels[1] = cm[block[1]];
456 pixels[2] = cm[block[2]];
457 pixels[3] = cm[block[3]];
458 pixels[4] = cm[block[4]];
459 pixels[5] = cm[block[5]];
460 pixels[6] = cm[block[6]];
461 pixels[7] = cm[block[7]];
/* 4-wide variant of ff_put_pixels_clamped_c (loop/advance lines elided). */
468 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
472 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
474 /* read the pixels */
476 pixels[0] = cm[block[0]];
477 pixels[1] = cm[block[1]];
478 pixels[2] = cm[block[2]];
479 pixels[3] = cm[block[3]];
/* 2-wide variant of ff_put_pixels_clamped_c (loop/advance lines elided). */
486 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
490 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
492 /* read the pixels */
494 pixels[0] = cm[block[0]];
495 pixels[1] = cm[block[1]];
/* Store signed DCTELEMs as unsigned pixels: clamp to [-128,127], then add the
 * 128 bias (the clamp assignments between the range tests are elided here). */
502 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
503 uint8_t *restrict pixels,
508 for (i = 0; i < 8; i++) {
509 for (j = 0; j < 8; j++) {
512 else if (*block > 127)
515 *pixels = (uint8_t)(*block + 128);
/* skip destination to the start of the next row */
519 pixels += (line_size - 8);
/* Store one 8-wide row of DCTELEMs as pixels WITHOUT clamping: values are
 * simply truncated to uint8_t (row loop and advances elided here). */
523 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
528 /* read the pixels */
530 pixels[0] = block[0];
531 pixels[1] = block[1];
532 pixels[2] = block[2];
533 pixels[3] = block[3];
534 pixels[4] = block[4];
535 pixels[5] = block[5];
536 pixels[6] = block[6];
537 pixels[7] = block[7];
/* Add one 8-wide row of DCTELEM residuals onto existing pixels, clamping the
 * sums to [0,255] via the crop table (row loop/advances elided here). */
544 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
548 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
550 /* read the pixels */
552 pixels[0] = cm[pixels[0] + block[0]];
553 pixels[1] = cm[pixels[1] + block[1]];
554 pixels[2] = cm[pixels[2] + block[2]];
555 pixels[3] = cm[pixels[3] + block[3]];
556 pixels[4] = cm[pixels[4] + block[4]];
557 pixels[5] = cm[pixels[5] + block[5]];
558 pixels[6] = cm[pixels[6] + block[6]];
559 pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of ff_add_pixels_clamped_c (loop/advance lines elided). */
565 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
569 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
571 /* read the pixels */
573 pixels[0] = cm[pixels[0] + block[0]];
574 pixels[1] = cm[pixels[1] + block[1]];
575 pixels[2] = cm[pixels[2] + block[2]];
576 pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of ff_add_pixels_clamped_c (loop/advance lines elided). */
582 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
586 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
588 /* read the pixels */
590 pixels[0] = cm[pixels[0] + block[0]];
591 pixels[1] = cm[pixels[1] + block[1]];
/* Add residuals to pixels with NO clamping, 8 wide (row loop/advances elided;
 * sums wrap modulo 256 on overflow since pixels are uint8_t). */
597 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
601 pixels[0] += block[0];
602 pixels[1] += block[1];
603 pixels[2] += block[2];
604 pixels[3] += block[3];
605 pixels[4] += block[4];
606 pixels[5] += block[5];
607 pixels[6] += block[6];
608 pixels[7] += block[7];
/* 4-wide variant of add_pixels8_c, no clamping (loop/advance lines elided). */
614 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
618 pixels[0] += block[0];
619 pixels[1] += block[1];
620 pixels[2] += block[2];
621 pixels[3] += block[3];
/* Sum of absolute values of the DCT coefficients (loop header, accumulator
 * declaration and return are elided in this listing). */
627 static int sum_abs_dctelem_c(DCTELEM *block)
631 sum+= FFABS(block[i]);
/* Fill a 16-wide, h-tall block with a constant byte (per-row pointer advance
 * elided in this listing). */
635 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
639 for (i = 0; i < h; i++) {
640 memset(block, value, 16);
/* 8-wide variant of fill_block16_c (per-row pointer advance elided). */
645 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
649 for (i = 0; i < h; i++) {
650 memset(block, value, 8);
/* Upscale an 8x8 block 2x in both directions: each source byte is duplicated
 * horizontally (the * 0x0101 trick writes the byte into both halves of a
 * uint16_t) and vertically (dst1/dst2 are consecutive output rows).
 * Per-iteration pointer advances are elided in this listing. */
655 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
658 uint16_t *dst1 = (uint16_t *) dst;
659 uint16_t *dst2 = (uint16_t *)(dst + linesize);
661 for (j = 0; j < 8; j++) {
662 for (i = 0; i < 8; i++) {
663 dst1[i] = dst2[i] = src[i] * 0x0101;
/* PIXOP2, 64-bit-word variant: generates the half-pel motion-compensation
 * copy/average primitives for an OPNAME (put/avg) and an OP macro that either
 * stores or averages a machine word.
 *   _x2 / _y2  : horizontal / vertical half-pel interpolation of two words;
 *                "rnd" uses (a|b) - carry (round up), "no_rnd" uses
 *                (a&b) + carry (round down), with the 0xFE.. mask protecting
 *                per-byte LSBs from inter-byte carries.
 *   _xy2       : 2x2 half-pel average using split low-2-bit / high-6-bit
 *                accumulators (l0/l1, h0/h1) so four bytes-per-lane sums fit
 *                without cross-byte carry; the +0x0202.. (rnd) or +0x0101..
 *                (no_rnd) term is the rounding bias.
 * NOTE(review): many loop headers, pointer advances and closing braces of the
 * macro are elided in this listing; the lines below are a fragment. */
673 #define PIXOP2(OPNAME, OP) \
674 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
678 OP(*((uint64_t*)block), AV_RN64(pixels));\
684 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
688 const uint64_t a= AV_RN64(pixels );\
689 const uint64_t b= AV_RN64(pixels+1);\
690 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
696 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
700 const uint64_t a= AV_RN64(pixels );\
701 const uint64_t b= AV_RN64(pixels+1);\
702 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
708 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
712 const uint64_t a= AV_RN64(pixels );\
713 const uint64_t b= AV_RN64(pixels+line_size);\
714 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
720 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
724 const uint64_t a= AV_RN64(pixels );\
725 const uint64_t b= AV_RN64(pixels+line_size);\
726 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
732 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
735 const uint64_t a= AV_RN64(pixels );\
736 const uint64_t b= AV_RN64(pixels+1);\
737 uint64_t l0= (a&0x0303030303030303ULL)\
738 + (b&0x0303030303030303ULL)\
739 + 0x0202020202020202ULL;\
740 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
741 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
745 for(i=0; i<h; i+=2){\
746 uint64_t a= AV_RN64(pixels );\
747 uint64_t b= AV_RN64(pixels+1);\
748 l1= (a&0x0303030303030303ULL)\
749 + (b&0x0303030303030303ULL);\
750 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
751 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
752 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
755 a= AV_RN64(pixels );\
756 b= AV_RN64(pixels+1);\
757 l0= (a&0x0303030303030303ULL)\
758 + (b&0x0303030303030303ULL)\
759 + 0x0202020202020202ULL;\
760 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
761 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
762 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
768 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
771 const uint64_t a= AV_RN64(pixels );\
772 const uint64_t b= AV_RN64(pixels+1);\
773 uint64_t l0= (a&0x0303030303030303ULL)\
774 + (b&0x0303030303030303ULL)\
775 + 0x0101010101010101ULL;\
776 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
777 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
781 for(i=0; i<h; i+=2){\
782 uint64_t a= AV_RN64(pixels );\
783 uint64_t b= AV_RN64(pixels+1);\
784 l1= (a&0x0303030303030303ULL)\
785 + (b&0x0303030303030303ULL);\
786 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
787 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
788 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
791 a= AV_RN64(pixels );\
792 b= AV_RN64(pixels+1);\
793 l0= (a&0x0303030303030303ULL)\
794 + (b&0x0303030303030303ULL)\
795 + 0x0101010101010101ULL;\
796 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
797 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
798 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
804 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
805 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
806 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
807 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
808 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
809 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
810 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* 64-bit rounded average: (a|b) minus half of the per-byte difference */
812 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
813 #else // 64 bit variant
/* PIXOP2, 32-bit-word variant: same family of half-pel copy/average
 * primitives as the 64-bit branch above, built from 32-bit loads. Adds the
 * _l2/_l4 helpers (average of 2 resp. 4 source rows) used to compose the
 * _x2/_y2/_xy2 functions, plus 2/4/8/16-pixel-wide entry points.
 * The 0x03/0xFC split-accumulator scheme and the rounding biases
 * (0x02.. rnd, 0x01.. no_rnd) work per byte exactly as in the 64-bit branch.
 * NOTE(review): many loop headers, pointer advances and closing braces are
 * elided in this listing; the lines below are a fragment. */
815 #define PIXOP2(OPNAME, OP) \
816 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
819 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
824 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
827 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
832 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
835 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
836 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
841 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
842 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
845 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
846 int src_stride1, int src_stride2, int h){\
850 a= AV_RN32(&src1[i*src_stride1 ]);\
851 b= AV_RN32(&src2[i*src_stride2 ]);\
852 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
853 a= AV_RN32(&src1[i*src_stride1+4]);\
854 b= AV_RN32(&src2[i*src_stride2+4]);\
855 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
859 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
860 int src_stride1, int src_stride2, int h){\
864 a= AV_RN32(&src1[i*src_stride1 ]);\
865 b= AV_RN32(&src2[i*src_stride2 ]);\
866 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
867 a= AV_RN32(&src1[i*src_stride1+4]);\
868 b= AV_RN32(&src2[i*src_stride2+4]);\
869 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
873 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
874 int src_stride1, int src_stride2, int h){\
878 a= AV_RN32(&src1[i*src_stride1 ]);\
879 b= AV_RN32(&src2[i*src_stride2 ]);\
880 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
884 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
885 int src_stride1, int src_stride2, int h){\
889 a= AV_RN16(&src1[i*src_stride1 ]);\
890 b= AV_RN16(&src2[i*src_stride2 ]);\
891 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
895 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
896 int src_stride1, int src_stride2, int h){\
897 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
898 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
901 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
902 int src_stride1, int src_stride2, int h){\
903 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
904 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
907 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
908 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
911 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
912 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
915 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
916 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
919 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
920 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
923 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
924 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
927 uint32_t a, b, c, d, l0, l1, h0, h1;\
928 a= AV_RN32(&src1[i*src_stride1]);\
929 b= AV_RN32(&src2[i*src_stride2]);\
930 c= AV_RN32(&src3[i*src_stride3]);\
931 d= AV_RN32(&src4[i*src_stride4]);\
932 l0= (a&0x03030303UL)\
935 h0= ((a&0xFCFCFCFCUL)>>2)\
936 + ((b&0xFCFCFCFCUL)>>2);\
937 l1= (c&0x03030303UL)\
939 h1= ((c&0xFCFCFCFCUL)>>2)\
940 + ((d&0xFCFCFCFCUL)>>2);\
941 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
942 a= AV_RN32(&src1[i*src_stride1+4]);\
943 b= AV_RN32(&src2[i*src_stride2+4]);\
944 c= AV_RN32(&src3[i*src_stride3+4]);\
945 d= AV_RN32(&src4[i*src_stride4+4]);\
946 l0= (a&0x03030303UL)\
949 h0= ((a&0xFCFCFCFCUL)>>2)\
950 + ((b&0xFCFCFCFCUL)>>2);\
951 l1= (c&0x03030303UL)\
953 h1= ((c&0xFCFCFCFCUL)>>2)\
954 + ((d&0xFCFCFCFCUL)>>2);\
955 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
959 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
960 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
963 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
964 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
967 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
968 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
971 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
972 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
975 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
976 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
979 uint32_t a, b, c, d, l0, l1, h0, h1;\
980 a= AV_RN32(&src1[i*src_stride1]);\
981 b= AV_RN32(&src2[i*src_stride2]);\
982 c= AV_RN32(&src3[i*src_stride3]);\
983 d= AV_RN32(&src4[i*src_stride4]);\
984 l0= (a&0x03030303UL)\
987 h0= ((a&0xFCFCFCFCUL)>>2)\
988 + ((b&0xFCFCFCFCUL)>>2);\
989 l1= (c&0x03030303UL)\
991 h1= ((c&0xFCFCFCFCUL)>>2)\
992 + ((d&0xFCFCFCFCUL)>>2);\
993 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
994 a= AV_RN32(&src1[i*src_stride1+4]);\
995 b= AV_RN32(&src2[i*src_stride2+4]);\
996 c= AV_RN32(&src3[i*src_stride3+4]);\
997 d= AV_RN32(&src4[i*src_stride4+4]);\
998 l0= (a&0x03030303UL)\
1001 h0= ((a&0xFCFCFCFCUL)>>2)\
1002 + ((b&0xFCFCFCFCUL)>>2);\
1003 l1= (c&0x03030303UL)\
1004 + (d&0x03030303UL);\
1005 h1= ((c&0xFCFCFCFCUL)>>2)\
1006 + ((d&0xFCFCFCFCUL)>>2);\
1007 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1010 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1011 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1012 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1013 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1015 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1016 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1017 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1018 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1021 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1023 int i, a0, b0, a1, b1;\
1030 for(i=0; i<h; i+=2){\
1036 block[0]= (a1+a0)>>2; /* FIXME non put */\
1037 block[1]= (b1+b0)>>2;\
1047 block[0]= (a1+a0)>>2;\
1048 block[1]= (b1+b0)>>2;\
1054 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1057 const uint32_t a= AV_RN32(pixels );\
1058 const uint32_t b= AV_RN32(pixels+1);\
1059 uint32_t l0= (a&0x03030303UL)\
1062 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1063 + ((b&0xFCFCFCFCUL)>>2);\
1067 for(i=0; i<h; i+=2){\
1068 uint32_t a= AV_RN32(pixels );\
1069 uint32_t b= AV_RN32(pixels+1);\
1070 l1= (a&0x03030303UL)\
1071 + (b&0x03030303UL);\
1072 h1= ((a&0xFCFCFCFCUL)>>2)\
1073 + ((b&0xFCFCFCFCUL)>>2);\
1074 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1077 a= AV_RN32(pixels );\
1078 b= AV_RN32(pixels+1);\
1079 l0= (a&0x03030303UL)\
1082 h0= ((a&0xFCFCFCFCUL)>>2)\
1083 + ((b&0xFCFCFCFCUL)>>2);\
1084 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1090 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1093 for(j=0; j<2; j++){\
1095 const uint32_t a= AV_RN32(pixels );\
1096 const uint32_t b= AV_RN32(pixels+1);\
1097 uint32_t l0= (a&0x03030303UL)\
1100 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1101 + ((b&0xFCFCFCFCUL)>>2);\
1105 for(i=0; i<h; i+=2){\
1106 uint32_t a= AV_RN32(pixels );\
1107 uint32_t b= AV_RN32(pixels+1);\
1108 l1= (a&0x03030303UL)\
1109 + (b&0x03030303UL);\
1110 h1= ((a&0xFCFCFCFCUL)>>2)\
1111 + ((b&0xFCFCFCFCUL)>>2);\
1112 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1115 a= AV_RN32(pixels );\
1116 b= AV_RN32(pixels+1);\
1117 l0= (a&0x03030303UL)\
1120 h0= ((a&0xFCFCFCFCUL)>>2)\
1121 + ((b&0xFCFCFCFCUL)>>2);\
1122 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1126 pixels+=4-line_size*(h+1);\
1127 block +=4-line_size*h;\
1131 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1134 for(j=0; j<2; j++){\
1136 const uint32_t a= AV_RN32(pixels );\
1137 const uint32_t b= AV_RN32(pixels+1);\
1138 uint32_t l0= (a&0x03030303UL)\
1141 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1142 + ((b&0xFCFCFCFCUL)>>2);\
1146 for(i=0; i<h; i+=2){\
1147 uint32_t a= AV_RN32(pixels );\
1148 uint32_t b= AV_RN32(pixels+1);\
1149 l1= (a&0x03030303UL)\
1150 + (b&0x03030303UL);\
1151 h1= ((a&0xFCFCFCFCUL)>>2)\
1152 + ((b&0xFCFCFCFCUL)>>2);\
1153 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1156 a= AV_RN32(pixels );\
1157 b= AV_RN32(pixels+1);\
1158 l0= (a&0x03030303UL)\
1161 h0= ((a&0xFCFCFCFCUL)>>2)\
1162 + ((b&0xFCFCFCFCUL)>>2);\
1163 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1167 pixels+=4-line_size*(h+1);\
1168 block +=4-line_size*h;\
1172 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1173 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1174 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1175 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1176 av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1177 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1178 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1179 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* Word ops plugged into PIXOP2: avg = rounded per-byte average into the
 * existing destination, put = plain store. */
1181 #define op_avg(a, b) a = rnd_avg32(a, b)
1183 #define op_put(a, b) a = b
/* full-pel copies need no rounding, so the no_rnd names alias the put ones */
1190 #define put_no_rnd_pixels8_c put_pixels8_c
1191 #define put_no_rnd_pixels16_c put_pixels16_c
/* scalar rounded averages of 2 and 4 values */
1193 #define avg2(a,b) ((a+b+1)>>1)
1194 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Thin wrappers giving the generic _l2 helpers a uniform single-stride
 * signature. */
1196 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1197 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1200 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1201 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/* Single-motion-vector GMC with 1/16-pel accuracy: bilinear interpolation of
 * a 2x2 neighbourhood with weights A..D (A+B+C+D == 256, hence the >>8).
 * The row loop and pointer advances are elided in this listing. */
1204 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1206 const int A=(16-x16)*(16-y16);
1207 const int B=( x16)*(16-y16);
1208 const int C=(16-x16)*( y16);
1209 const int D=( x16)*( y16);
1214 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1215 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1216 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1217 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1218 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1219 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1220 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1221 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General (affine) global motion compensation: for each destination pixel the
 * source position is derived from the dxx/dxy/dyx/dyy transform (accumulation
 * lines elided here), split into integer part and fractional part (frac_x/y),
 * then bilinearly interpolated with clamping to the picture borders. */
1227 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1228 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1231 const int s= 1<<shift;
1241 for(x=0; x<8; x++){ //XXX FIXME optimize
1242 int src_x, src_y, frac_x, frac_y, index;
/* fractional part of the source coordinate, in 1/s units */
1246 frac_x= src_x&(s-1);
1247 frac_y= src_y&(s-1);
/* unsigned compare doubles as a 0 <= v < limit range check */
1251 if((unsigned)src_x < width){
1252 if((unsigned)src_y < height){
/* fully inside: bilinear interpolation of the 2x2 neighbourhood */
1253 index= src_x + src_y*stride;
1254 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1255 + src[index +1]* frac_x )*(s-frac_y)
1256 + ( src[index+stride ]*(s-frac_x)
1257 + src[index+stride+1]* frac_x )* frac_y
/* y outside: clamp the row, interpolate horizontally only */
1260 index= src_x + av_clip(src_y, 0, height)*stride;
1261 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1262 + src[index +1]* frac_x )*s
1266 if((unsigned)src_y < height){
/* x outside: clamp the column, interpolate vertically only */
1267 index= av_clip(src_x, 0, width) + src_y*stride;
1268 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1269 + src[index+stride ]* frac_y )*s
/* both outside: nearest clamped sample, no interpolation */
1272 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1273 dst[y*stride + x]= src[index ];
/* Third-pel MC, fullpel case (0,0): dispatch a plain copy by block width
 * (the switch header is elided in this listing). */
1285 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1287 case 2: put_pixels2_c (dst, src, stride, height); break;
1288 case 4: put_pixels4_c (dst, src, stride, height); break;
1289 case 8: put_pixels8_c (dst, src, stride, height); break;
1290 case 16:put_pixels16_c(dst, src, stride, height); break;
/* Third-pel MC (1/3, 0): horizontal interpolation with weights 2:1.
 * 683/2048 approximates 1/3 in fixed point (683*3 = 2049). */
1294 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1296 for (i=0; i < height; i++) {
1297 for (j=0; j < width; j++) {
1298 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Third-pel MC (2/3, 0): horizontal interpolation with weights 1:2. */
1305 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1307 for (i=0; i < height; i++) {
1308 for (j=0; j < width; j++) {
1309 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Third-pel MC (0, 1/3): vertical interpolation with weights 2:1. */
1316 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1318 for (i=0; i < height; i++) {
1319 for (j=0; j < width; j++) {
1320 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1327 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1329 for (i=0; i < height; i++) {
1330 for (j=0; j < width; j++) {
1331 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1338 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1340 for (i=0; i < height; i++) {
1341 for (j=0; j < width; j++) {
1342 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1349 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1351 for (i=0; i < height; i++) {
1352 for (j=0; j < width; j++) {
1353 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1360 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1362 for (i=0; i < height; i++) {
1363 for (j=0; j < width; j++) {
1364 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1371 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1373 for (i=0; i < height; i++) {
1374 for (j=0; j < width; j++) {
1375 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* ---- third-pel motion compensation, avg variants ----
 * Same third-pel interpolation as the put_ versions, but the result is
 * rounded-averaged with the existing destination:
 *     dst = (dst + interp + 1) >> 1
 * (used for bidirectional / averaged prediction).
 * NOTE(review): this listing's embedded numbering skips lines -- switch
 * header, loop epilogues (dst/src advanced by stride) and closing braces
 * are not visible in this excerpt. */
/* mc00: integer-pel -- average-copy dispatched on block width. */
1382 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1384 case 2: avg_pixels2_c (dst, src, stride, height); break;
1385 case 4: avg_pixels4_c (dst, src, stride, height); break;
1386 case 8: avg_pixels8_c (dst, src, stride, height); break;
1387 case 16:avg_pixels16_c(dst, src, stride, height); break;
/* mc10: horizontal 1/3 offset, averaged into dst. */
1391 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1393 for (i=0; i < height; i++) {
1394 for (j=0; j < width; j++) {
1395 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* mc20: horizontal 2/3 offset. */
1402 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1404 for (i=0; i < height; i++) {
1405 for (j=0; j < width; j++) {
1406 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* mc01: vertical 1/3 offset. */
1413 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1415 for (i=0; i < height; i++) {
1416 for (j=0; j < width; j++) {
1417 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* mc11: (1/3,1/3) 2-D case, weights 4,3,3,2. */
1424 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1426 for (i=0; i < height; i++) {
1427 for (j=0; j < width; j++) {
1428 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* mc12: (1/3,2/3), weights 3,2,4,3. */
1435 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1437 for (i=0; i < height; i++) {
1438 for (j=0; j < width; j++) {
1439 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* mc02: vertical 2/3 offset. */
1446 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1448 for (i=0; i < height; i++) {
1449 for (j=0; j < width; j++) {
1450 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* mc21: (2/3,1/3), weights 3,4,2,3. */
1457 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1459 for (i=0; i < height; i++) {
1460 for (j=0; j < width; j++) {
1461 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* mc22: (2/3,2/3), weights 2,3,3,4. */
1468 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1470 for (i=0; i < height; i++) {
1471 for (j=0; j < width; j++) {
1472 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* TPEL_WIDTH(width): meant to stamp out fixed-width wrappers
 * (put_tpel_pixelsN_mcXY_c) around the generic width-parameterized
 * tpel functions above, matching the (dst, src, stride, height)
 * signature used by the function-pointer tables.
 * NOTE(review): each wrapper body reads
 *     void put_tpel_pixels_mcXY_c(dst, src, stride, width, height);
 * -- the leading `void` makes this a (K&R identifier-list) function
 * *declaration*, not a call, so as written the wrappers do nothing.
 * No invocation of TPEL_WIDTH is visible in this excerpt; this looks
 * like a long-dead macro -- confirm it is unused before relying on it,
 * and drop the `void` if it is ever resurrected. */
1479 #define TPEL_WIDTH(width)\
1480 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1481 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1482 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1483 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1484 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1485 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1486 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1487 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1488 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1489 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1490 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1491 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1492 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1493 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1494 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1495 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1496 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1497 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H264_CHROMA_MC(OPNAME, OP): generates OPNAMEh264_chroma_mc{2,4,8}_c,
 * the H.264 chroma bilinear interpolators for 2-, 4- and 8-wide blocks.
 * (x,y) is the 1/8-pel fractional offset (asserted to be in [0,8));
 * the four taps are the standard bilinear weights
 *     A=(8-x)(8-y)  B=x(8-y)  C=(8-x)y  D=xy      (A+B+C+D == 64),
 * and OP folds in the /64 rounding (see op_put/op_avg below).
 * Each function has a second, cheaper path using a collapsed weight E
 * and `step` (stride when the offset is purely vertical, 1 when purely
 * horizontal); presumably taken when D==0, i.e. x==0 or y==0 -- the
 * branch itself is not visible in this excerpt.
 * NOTE(review): this listing skips lines (the E definition, the if/else
 * around the two paths, dst/src += stride advances and closing braces
 * are missing from view); only the visible lines are annotated. */
1500 #define H264_CHROMA_MC(OPNAME, OP)\
1501 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1502 const int A=(8-x)*(8-y);\
1503 const int B=( x)*(8-y);\
1504 const int C=(8-x)*( y);\
1505 const int D=( x)*( y);\
1508 assert(x<8 && y<8 && x>=0 && y>=0);\
1511 for(i=0; i<h; i++){\
1512 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1513 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
/* 1-D fast path: two taps A and E along `step` (stride or 1). */\
1519 const int step= C ? stride : 1;\
1520 for(i=0; i<h; i++){\
1521 OP(dst[0], (A*src[0] + E*src[step+0]));\
1522 OP(dst[1], (A*src[1] + E*src[step+1]));\
1529 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1530 const int A=(8-x)*(8-y);\
1531 const int B=( x)*(8-y);\
1532 const int C=(8-x)*( y);\
1533 const int D=( x)*( y);\
1536 assert(x<8 && y<8 && x>=0 && y>=0);\
1539 for(i=0; i<h; i++){\
1540 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1541 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1542 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1543 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1549 const int step= C ? stride : 1;\
1550 for(i=0; i<h; i++){\
1551 OP(dst[0], (A*src[0] + E*src[step+0]));\
1552 OP(dst[1], (A*src[1] + E*src[step+1]));\
1553 OP(dst[2], (A*src[2] + E*src[step+2]));\
1554 OP(dst[3], (A*src[3] + E*src[step+3]));\
1561 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1562 const int A=(8-x)*(8-y);\
1563 const int B=( x)*(8-y);\
1564 const int C=(8-x)*( y);\
1565 const int D=( x)*( y);\
1568 assert(x<8 && y<8 && x>=0 && y>=0);\
1571 for(i=0; i<h; i++){\
1572 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1573 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1574 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1575 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1576 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1577 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1578 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1579 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1585 const int step= C ? stride : 1;\
1586 for(i=0; i<h; i++){\
1587 OP(dst[0], (A*src[0] + E*src[step+0]));\
1588 OP(dst[1], (A*src[1] + E*src[step+1]));\
1589 OP(dst[2], (A*src[2] + E*src[step+2]));\
1590 OP(dst[3], (A*src[3] + E*src[step+3]));\
1591 OP(dst[4], (A*src[4] + E*src[step+4]));\
1592 OP(dst[5], (A*src[5] + E*src[step+5]));\
1593 OP(dst[6], (A*src[6] + E*src[step+6]));\
1594 OP(dst[7], (A*src[7] + E*src[step+7]));\
/* OP plug-ins for H264_CHROMA_MC: the weighted sum b is scaled by 64
 * (A+B+C+D == 64), so op_put rounds with (+32)>>6; op_avg additionally
 * rounding-averages with the existing destination pixel. */
1601 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1602 #define op_put(a, b) a = (((b) + 32)>>6)
/* Instantiate put_h264_chroma_mc{2,4,8}_c and avg_h264_chroma_mc{2,4,8}_c. */
1604 H264_CHROMA_MC(put_ , op_put)
1605 H264_CHROMA_MC(avg_ , op_avg)
/* QPEL_MC(r, OPNAME, RND, OP): generates the MPEG-4 quarter-pel MC
 * function family.  This first part is the half-pel lowpass filters:
 * OPNAMEmpeg4_qpel{8,16}_{h,v}_lowpass, an 8-tap symmetric kernel
 * (20, -6, 3, -1) applied horizontally or vertically.  Near the block
 * edges the missing taps are mirrored inside the block (note e.g.
 * src[8] appearing twice in the last rows/columns) rather than read
 * out of bounds.  Results go through the cm clip table (ff_cropTbl)
 * inside OP.
 * NOTE(review): this listing's embedded numbering skips lines (loop
 * headers, dst/src advances, closing braces are not visible). */
1609 #define QPEL_MC(r, OPNAME, RND, OP) \
1610 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1611 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1615 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1616 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1617 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1618 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1619 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1620 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1621 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1622 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* Vertical 8-wide variant: one column of 9 source rows per iteration. */\
1628 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1630 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1634 const int src0= src[0*srcStride];\
1635 const int src1= src[1*srcStride];\
1636 const int src2= src[2*srcStride];\
1637 const int src3= src[3*srcStride];\
1638 const int src4= src[4*srcStride];\
1639 const int src5= src[5*srcStride];\
1640 const int src6= src[6*srcStride];\
1641 const int src7= src[7*srcStride];\
1642 const int src8= src[8*srcStride];\
1643 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1644 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1645 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1646 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1647 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1648 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1649 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1650 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* 16-wide horizontal variant: same kernel, 17 source taps per row. */\
1656 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1657 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1662 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1663 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1664 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1665 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1666 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1667 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1668 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1669 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1670 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1671 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1672 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1673 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1674 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1675 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1676 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1677 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* 16-wide vertical variant: one column of 17 source rows per iteration. */\
1683 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1684 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1689 const int src0= src[0*srcStride];\
1690 const int src1= src[1*srcStride];\
1691 const int src2= src[2*srcStride];\
1692 const int src3= src[3*srcStride];\
1693 const int src4= src[4*srcStride];\
1694 const int src5= src[5*srcStride];\
1695 const int src6= src[6*srcStride];\
1696 const int src7= src[7*srcStride];\
1697 const int src8= src[8*srcStride];\
1698 const int src9= src[9*srcStride];\
1699 const int src10= src[10*srcStride];\
1700 const int src11= src[11*srcStride];\
1701 const int src12= src[12*srcStride];\
1702 const int src13= src[13*srcStride];\
1703 const int src14= src[14*srcStride];\
1704 const int src15= src[15*srcStride];\
1705 const int src16= src[16*srcStride];\
1706 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1707 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1708 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1709 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1710 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1711 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1712 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1713 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1714 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1715 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1716 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1717 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1718 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1719 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1720 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1721 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* QPEL_MC part 2: 8x8 quarter-pel position functions qpel8_mcXY_c,\
 * X,Y in {0..3} = quarter-pel offset.  Strategy per position:\
 *  - mcX0/mc0Y: halfpel lowpass, then (for odd X/Y) average with the\
 *    nearer integer-pel pixels via pixels8_l2.\
 *  - diagonal positions: build halfH (h-filtered) and halfHV (h then v\
 *    filtered) planes and average; the ff_..._old_c variants instead\
 *    average four planes (full/halfH/halfV/halfHV) with pixels8_l4 --\
 *    kept non-static, presumably for reference/compatibility (confirm\
 *    against callers, not visible here).\
 *  - copy_block9 stages a 9x9 source patch into full[16*9] so the\
 *    filters can read one extra row/column.\
 * NOTE(review): the listing skips lines (several local buffer\
 * declarations such as halfH[72]/half[64] and closing braces are not\
 * visible in this excerpt).  */\
1727 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1729 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1730 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1733 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1734 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1737 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1739 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1740 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1743 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1744 uint8_t full[16*9];\
1746 copy_block9(full, src, 16, stride, 9);\
1747 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1748 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1751 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1752 uint8_t full[16*9];\
1753 copy_block9(full, src, 16, stride, 9);\
1754 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1757 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1758 uint8_t full[16*9];\
1760 copy_block9(full, src, 16, stride, 9);\
1761 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1762 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1764 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1765 uint8_t full[16*9];\
1768 uint8_t halfHV[64];\
1769 copy_block9(full, src, 16, stride, 9);\
1770 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1771 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1772 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1773 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1775 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1776 uint8_t full[16*9];\
1778 uint8_t halfHV[64];\
1779 copy_block9(full, src, 16, stride, 9);\
1780 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1781 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1782 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1783 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1785 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1786 uint8_t full[16*9];\
1789 uint8_t halfHV[64];\
1790 copy_block9(full, src, 16, stride, 9);\
1791 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1792 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1793 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1794 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1796 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1797 uint8_t full[16*9];\
1799 uint8_t halfHV[64];\
1800 copy_block9(full, src, 16, stride, 9);\
1801 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1802 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1803 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1804 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1806 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1807 uint8_t full[16*9];\
1810 uint8_t halfHV[64];\
1811 copy_block9(full, src, 16, stride, 9);\
1812 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1813 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1814 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1815 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1817 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1818 uint8_t full[16*9];\
1820 uint8_t halfHV[64];\
1821 copy_block9(full, src, 16, stride, 9);\
1822 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1823 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1824 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1825 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1827 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1828 uint8_t full[16*9];\
1831 uint8_t halfHV[64];\
1832 copy_block9(full, src, 16, stride, 9);\
1833 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1834 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1835 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1836 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1838 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1839 uint8_t full[16*9];\
1841 uint8_t halfHV[64];\
1842 copy_block9(full, src, 16, stride, 9);\
1843 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1844 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1845 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1846 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1848 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1850 uint8_t halfHV[64];\
1851 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1852 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1853 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1855 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1857 uint8_t halfHV[64];\
1858 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1859 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1860 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1862 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1863 uint8_t full[16*9];\
1866 uint8_t halfHV[64];\
1867 copy_block9(full, src, 16, stride, 9);\
1868 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1869 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1870 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1871 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1873 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1874 uint8_t full[16*9];\
1876 copy_block9(full, src, 16, stride, 9);\
1877 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1878 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1879 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1881 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1882 uint8_t full[16*9];\
1885 uint8_t halfHV[64];\
1886 copy_block9(full, src, 16, stride, 9);\
1887 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1888 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1889 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1890 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1892 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1893 uint8_t full[16*9];\
1895 copy_block9(full, src, 16, stride, 9);\
1896 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1897 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1898 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1900 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1902 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1903 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* QPEL_MC part 3: 16x16 quarter-pel position functions qpel16_mcXY_c.\
 * Same per-position strategy as the 8x8 set above, scaled up:\
 * copy_block17 stages a 17x17 patch into full[24*17] (stride 24),\
 * intermediate planes are halfH[272] (16x17) and halfV/halfHV[256].\
 * The ff_..._old_c variants average four planes via pixels16_l4; the\
 * static variants use the two-stage halfH+halfHV averaging.\
 * NOTE(review): this listing skips lines (some local declarations and\
 * all closing braces are outside the visible excerpt).  */\
1906 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1908 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1909 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1912 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1913 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1916 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1918 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1919 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1922 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1923 uint8_t full[24*17];\
1925 copy_block17(full, src, 24, stride, 17);\
1926 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1927 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1930 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1931 uint8_t full[24*17];\
1932 copy_block17(full, src, 24, stride, 17);\
1933 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1936 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1937 uint8_t full[24*17];\
1939 copy_block17(full, src, 24, stride, 17);\
1940 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1941 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1943 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1944 uint8_t full[24*17];\
1945 uint8_t halfH[272];\
1946 uint8_t halfV[256];\
1947 uint8_t halfHV[256];\
1948 copy_block17(full, src, 24, stride, 17);\
1949 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1950 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1951 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1952 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1954 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1955 uint8_t full[24*17];\
1956 uint8_t halfH[272];\
1957 uint8_t halfHV[256];\
1958 copy_block17(full, src, 24, stride, 17);\
1959 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1960 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1961 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1962 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1964 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1965 uint8_t full[24*17];\
1966 uint8_t halfH[272];\
1967 uint8_t halfV[256];\
1968 uint8_t halfHV[256];\
1969 copy_block17(full, src, 24, stride, 17);\
1970 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1971 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1972 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1973 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1975 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1976 uint8_t full[24*17];\
1977 uint8_t halfH[272];\
1978 uint8_t halfHV[256];\
1979 copy_block17(full, src, 24, stride, 17);\
1980 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1981 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1982 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1983 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1985 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1986 uint8_t full[24*17];\
1987 uint8_t halfH[272];\
1988 uint8_t halfV[256];\
1989 uint8_t halfHV[256];\
1990 copy_block17(full, src, 24, stride, 17);\
1991 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1992 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1993 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1994 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1996 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1997 uint8_t full[24*17];\
1998 uint8_t halfH[272];\
1999 uint8_t halfHV[256];\
2000 copy_block17(full, src, 24, stride, 17);\
2001 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2002 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2003 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2004 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2006 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2007 uint8_t full[24*17];\
2008 uint8_t halfH[272];\
2009 uint8_t halfV[256];\
2010 uint8_t halfHV[256];\
2011 copy_block17(full, src, 24, stride, 17);\
2012 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2013 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2014 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2015 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2017 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2018 uint8_t full[24*17];\
2019 uint8_t halfH[272];\
2020 uint8_t halfHV[256];\
2021 copy_block17(full, src, 24, stride, 17);\
2022 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2023 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2024 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2025 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2027 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2028 uint8_t halfH[272];\
2029 uint8_t halfHV[256];\
2030 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2031 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2032 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2034 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2035 uint8_t halfH[272];\
2036 uint8_t halfHV[256];\
2037 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2038 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2039 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2041 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2042 uint8_t full[24*17];\
2043 uint8_t halfH[272];\
2044 uint8_t halfV[256];\
2045 uint8_t halfHV[256];\
2046 copy_block17(full, src, 24, stride, 17);\
2047 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2048 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2049 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2050 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2052 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2053 uint8_t full[24*17];\
2054 uint8_t halfH[272];\
2055 copy_block17(full, src, 24, stride, 17);\
2056 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2057 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2058 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2060 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2061 uint8_t full[24*17];\
2062 uint8_t halfH[272];\
2063 uint8_t halfV[256];\
2064 uint8_t halfHV[256];\
2065 copy_block17(full, src, 24, stride, 17);\
2066 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2067 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2068 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2069 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2071 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2072 uint8_t full[24*17];\
2073 uint8_t halfH[272];\
2074 copy_block17(full, src, 24, stride, 17);\
2075 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2076 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2077 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2079 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2080 uint8_t halfH[272];\
2081 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2082 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* OP plug-ins for QPEL_MC: the lowpass sum b is scaled by 32 (kernel
 * 20-6+3-1 = 16 per side), so (+16)>>5 rounds and (+15)>>5 is the
 * no-rounding variant; cm clips to [0,255] via ff_cropTbl. */
2085 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2086 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2087 #define op_put(a, b) a = cm[((b) + 16)>>5]
2088 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the put_, put_no_rnd_ and avg_ qpel function families. */
2090 QPEL_MC(0, put_       , _       , op_put)
2091 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2092 QPEL_MC(0, avg_       , _       , op_avg)
2093 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2095 #undef op_avg_no_rnd
2097 #undef op_put_no_rnd
/* mc00 (integer-pel) positions need no filtering -- alias them to the
 * plain NxN copy/average primitives. */
2099 #define put_qpel8_mc00_c  ff_put_pixels8x8_c
2100 #define avg_qpel8_mc00_c  ff_avg_pixels8x8_c
2101 #define put_qpel16_mc00_c ff_put_pixels16x16_c
2102 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2103 #define put_no_rnd_qpel8_mc00_c  ff_put_pixels8x8_c
2104 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
2107 #define H264_LOWPASS(OPNAME, OP, OP2) \
2108 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2110 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2114 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2115 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2121 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2123 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2127 const int srcB= src[-2*srcStride];\
2128 const int srcA= src[-1*srcStride];\
2129 const int src0= src[0 *srcStride];\
2130 const int src1= src[1 *srcStride];\
2131 const int src2= src[2 *srcStride];\
2132 const int src3= src[3 *srcStride];\
2133 const int src4= src[4 *srcStride];\
2134 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2135 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2141 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2144 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2146 src -= 2*srcStride;\
2147 for(i=0; i<h+5; i++)\
2149 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2150 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2154 tmp -= tmpStride*(h+5-2);\
2157 const int tmpB= tmp[-2*tmpStride];\
2158 const int tmpA= tmp[-1*tmpStride];\
2159 const int tmp0= tmp[0 *tmpStride];\
2160 const int tmp1= tmp[1 *tmpStride];\
2161 const int tmp2= tmp[2 *tmpStride];\
2162 const int tmp3= tmp[3 *tmpStride];\
2163 const int tmp4= tmp[4 *tmpStride];\
2164 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2165 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2170 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2172 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2176 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2177 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2178 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2179 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2185 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2187 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2191 const int srcB= src[-2*srcStride];\
2192 const int srcA= src[-1*srcStride];\
2193 const int src0= src[0 *srcStride];\
2194 const int src1= src[1 *srcStride];\
2195 const int src2= src[2 *srcStride];\
2196 const int src3= src[3 *srcStride];\
2197 const int src4= src[4 *srcStride];\
2198 const int src5= src[5 *srcStride];\
2199 const int src6= src[6 *srcStride];\
2200 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2201 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2202 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2203 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2209 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2212 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2214 src -= 2*srcStride;\
2215 for(i=0; i<h+5; i++)\
2217 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2218 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2219 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2220 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2224 tmp -= tmpStride*(h+5-2);\
2227 const int tmpB= tmp[-2*tmpStride];\
2228 const int tmpA= tmp[-1*tmpStride];\
2229 const int tmp0= tmp[0 *tmpStride];\
2230 const int tmp1= tmp[1 *tmpStride];\
2231 const int tmp2= tmp[2 *tmpStride];\
2232 const int tmp3= tmp[3 *tmpStride];\
2233 const int tmp4= tmp[4 *tmpStride];\
2234 const int tmp5= tmp[5 *tmpStride];\
2235 const int tmp6= tmp[6 *tmpStride];\
2236 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2237 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2238 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2239 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2245 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2247 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2251 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2252 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2253 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2254 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2255 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2256 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2257 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2258 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2264 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2266 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2270 const int srcB= src[-2*srcStride];\
2271 const int srcA= src[-1*srcStride];\
2272 const int src0= src[0 *srcStride];\
2273 const int src1= src[1 *srcStride];\
2274 const int src2= src[2 *srcStride];\
2275 const int src3= src[3 *srcStride];\
2276 const int src4= src[4 *srcStride];\
2277 const int src5= src[5 *srcStride];\
2278 const int src6= src[6 *srcStride];\
2279 const int src7= src[7 *srcStride];\
2280 const int src8= src[8 *srcStride];\
2281 const int src9= src[9 *srcStride];\
2282 const int src10=src[10*srcStride];\
2283 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2284 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2285 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2286 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2287 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2288 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2289 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2290 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2296 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2299 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2301 src -= 2*srcStride;\
2302 for(i=0; i<h+5; i++)\
2304 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2305 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2306 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2307 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2308 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2309 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2310 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2311 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2315 tmp -= tmpStride*(h+5-2);\
2318 const int tmpB= tmp[-2*tmpStride];\
2319 const int tmpA= tmp[-1*tmpStride];\
2320 const int tmp0= tmp[0 *tmpStride];\
2321 const int tmp1= tmp[1 *tmpStride];\
2322 const int tmp2= tmp[2 *tmpStride];\
2323 const int tmp3= tmp[3 *tmpStride];\
2324 const int tmp4= tmp[4 *tmpStride];\
2325 const int tmp5= tmp[5 *tmpStride];\
2326 const int tmp6= tmp[6 *tmpStride];\
2327 const int tmp7= tmp[7 *tmpStride];\
2328 const int tmp8= tmp[8 *tmpStride];\
2329 const int tmp9= tmp[9 *tmpStride];\
2330 const int tmp10=tmp[10*tmpStride];\
2331 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2332 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2333 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2334 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2335 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2336 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2337 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2338 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2344 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2345 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2346 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2347 src += 8*srcStride;\
2348 dst += 8*dstStride;\
2349 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2350 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2353 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2354 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2355 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2356 src += 8*srcStride;\
2357 dst += 8*dstStride;\
2358 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2359 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2362 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2363 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2364 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2365 src += 8*srcStride;\
2366 dst += 8*dstStride;\
2367 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2368 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2371 #define H264_MC(OPNAME, SIZE) \
2372 static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2373 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2376 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2377 uint8_t half[SIZE*SIZE];\
2378 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2379 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2382 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2383 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2386 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2387 uint8_t half[SIZE*SIZE];\
2388 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2389 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2392 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2393 uint8_t full[SIZE*(SIZE+5)];\
2394 uint8_t * const full_mid= full + SIZE*2;\
2395 uint8_t half[SIZE*SIZE];\
2396 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2397 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2398 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2401 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2402 uint8_t full[SIZE*(SIZE+5)];\
2403 uint8_t * const full_mid= full + SIZE*2;\
2404 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2405 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2408 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2409 uint8_t full[SIZE*(SIZE+5)];\
2410 uint8_t * const full_mid= full + SIZE*2;\
2411 uint8_t half[SIZE*SIZE];\
2412 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2413 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2414 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2417 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2418 uint8_t full[SIZE*(SIZE+5)];\
2419 uint8_t * const full_mid= full + SIZE*2;\
2420 uint8_t halfH[SIZE*SIZE];\
2421 uint8_t halfV[SIZE*SIZE];\
2422 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2423 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2424 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2425 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2428 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2429 uint8_t full[SIZE*(SIZE+5)];\
2430 uint8_t * const full_mid= full + SIZE*2;\
2431 uint8_t halfH[SIZE*SIZE];\
2432 uint8_t halfV[SIZE*SIZE];\
2433 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2434 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2435 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2436 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2439 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2440 uint8_t full[SIZE*(SIZE+5)];\
2441 uint8_t * const full_mid= full + SIZE*2;\
2442 uint8_t halfH[SIZE*SIZE];\
2443 uint8_t halfV[SIZE*SIZE];\
2444 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2445 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2446 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2447 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2450 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2451 uint8_t full[SIZE*(SIZE+5)];\
2452 uint8_t * const full_mid= full + SIZE*2;\
2453 uint8_t halfH[SIZE*SIZE];\
2454 uint8_t halfV[SIZE*SIZE];\
2455 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2456 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2457 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2458 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2461 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2462 int16_t tmp[SIZE*(SIZE+5)];\
2463 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2466 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2467 int16_t tmp[SIZE*(SIZE+5)];\
2468 uint8_t halfH[SIZE*SIZE];\
2469 uint8_t halfHV[SIZE*SIZE];\
2470 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2471 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2472 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2475 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2476 int16_t tmp[SIZE*(SIZE+5)];\
2477 uint8_t halfH[SIZE*SIZE];\
2478 uint8_t halfHV[SIZE*SIZE];\
2479 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2480 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2481 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2484 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2485 uint8_t full[SIZE*(SIZE+5)];\
2486 uint8_t * const full_mid= full + SIZE*2;\
2487 int16_t tmp[SIZE*(SIZE+5)];\
2488 uint8_t halfV[SIZE*SIZE];\
2489 uint8_t halfHV[SIZE*SIZE];\
2490 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2491 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2492 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2493 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2496 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2497 uint8_t full[SIZE*(SIZE+5)];\
2498 uint8_t * const full_mid= full + SIZE*2;\
2499 int16_t tmp[SIZE*(SIZE+5)];\
2500 uint8_t halfV[SIZE*SIZE];\
2501 uint8_t halfHV[SIZE*SIZE];\
2502 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2503 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2504 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2505 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store operators for the H.264 six-tap lowpass expansions.
 * OP  (+16, >>5)  : one filter pass  (values scaled by 32).
 * OP2 (+512, >>10): two cascaded passes (values scaled by 1024),
 * used by the hv (2D) lowpass paths. */
2508 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2509 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2510 #define op_put(a, b) a = cm[((b) + 16)>>5]
2511 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2512 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put_ and avg_ h264 lowpass/MC function families. */
2514 H264_LOWPASS(put_ , op_put, op2_put)
2515 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* (0,0) qpel position is a plain copy/average; alias to the generic helpers. */
2530 #define put_h264_qpel8_mc00_c ff_put_pixels8x8_c
2531 #define avg_h264_qpel8_mc00_c ff_avg_pixels8x8_c
2532 #define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
2533 #define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
/* WMV2 mspel horizontal half-pel filter: 4-tap (-1, 9, 9, -1)/16 with
 * +8 rounding, clipped to 0..255 through cm.  Writes 8 pixels per row;
 * h rows.  (NOTE(review): the per-row loop header and pointer advances
 * fall outside this excerpt.) */
2535 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2536 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2540 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2541 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2542 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2543 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2544 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2545 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2546 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2547 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* Fixed-size wrappers around the generic put/avg pixel-copy routines,
 * exported so other files (and the mc00 aliases above) can use them
 * without knowing the variable-height internals. */
2553 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2554 put_pixels8_c(dst, src, stride, 8);
2556 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2557 avg_pixels8_c(dst, src, stride, 8);
2559 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2560 put_pixels16_c(dst, src, stride, 16);
2562 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2563 avg_pixels16_c(dst, src, stride, 16);
2566 #if CONFIG_RV40_DECODER
/* RV40 treats the (3,3) luma position as a plain center (xy2) average
 * rather than running its qpel filter, so map mc33 onto the generic
 * half-pel xy2 copy/average helpers. */
2567 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2568 put_pixels16_xy2_c(dst, src, stride, 16);
2570 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2571 avg_pixels16_xy2_c(dst, src, stride, 16);
2573 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2574 put_pixels8_xy2_c(dst, src, stride, 8);
2576 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2577 avg_pixels8_xy2_c(dst, src, stride, 8);
2579 #endif /* CONFIG_RV40_DECODER */
/* WMV2 mspel vertical half-pel filter: same 4-tap (-1, 9, 9, -1)/16
 * kernel as the horizontal version, applied down a column; w columns.
 * All eight output rows of one column are produced per iteration.
 * (NOTE(review): the per-column loop header/advance are outside this
 * excerpt.) */
2581 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2582 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2586 const int src_1= src[ -srcStride];
2587 const int src0 = src[0 ];
2588 const int src1 = src[ srcStride];
2589 const int src2 = src[2*srcStride];
2590 const int src3 = src[3*srcStride];
2591 const int src4 = src[4*srcStride];
2592 const int src5 = src[5*srcStride];
2593 const int src6 = src[6*srcStride];
2594 const int src7 = src[7*srcStride];
2595 const int src8 = src[8*srcStride];
2596 const int src9 = src[9*srcStride];
2597 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2598 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2599 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2600 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2601 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2602 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2603 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2604 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation entry points.  Naming follows the
 * qpel convention: mcXY = X horizontal / Y vertical half-unit offset.
 * Half-positions average the filtered plane with the source (l2);
 * the 2D cases run the h filter into a temporary (halfH), then the v
 * filter over it.  (NOTE(review): the local halfH/halfV/halfHV buffer
 * declarations are outside this excerpt.) */
2610 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2612 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2613 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2616 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2617 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2620 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2622 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2623 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2626 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2627 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2630 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
/* h-filter 11 rows (8 + filter margin) starting one row above src */
2634 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2635 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2636 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2637 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2639 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2643 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2644 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2645 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2646 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2648 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2650 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2651 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 vertical-edge deblocking filter (filters a horizontal block
 * boundary).  p0..p3 are the two pixels on each side of the edge;
 * d is the edge gradient, mapped through the piecewise-linear d1
 * ramp controlled by the qscale-dependent strength table.
 * (NOTE(review): the column loop header and the d1 application to
 * p1/p2 are outside this excerpt.) */
2654 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2655 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2657 const int strength= ff_h263_loop_filter_strength[qscale];
2661 int p0= src[x-2*stride];
2662 int p1= src[x-1*stride];
2663 int p2= src[x+0*stride];
2664 int p3= src[x+1*stride];
2665 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* piecewise-linear correction: full for small |d|, ramping to 0 for |d| >= 2*strength */
2667 if (d<-2*strength) d1= 0;
2668 else if(d<- strength) d1=-2*strength - d;
2669 else if(d< strength) d1= d;
2670 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clip to 0..255: negative -> 0, >255 -> 255 */
2675 if(p1&256) p1= ~(p1>>31);
2676 if(p2&256) p2= ~(p2>>31);
2678 src[x-1*stride] = p1;
2679 src[x+0*stride] = p2;
/* secondary, weaker correction applied to the outer pixels */
2683 d2= av_clip((p0-p3)/4, -ad1, ad1);
2685 src[x-2*stride] = p0 - d2;
2686 src[x+ stride] = p3 + d2;
/* H.263 horizontal-edge deblocking filter (filters a vertical block
 * boundary).  Mirror of h263_v_loop_filter_c with row/column roles
 * swapped: pixels are addressed along each row around column -2..+1.
 * (NOTE(review): the row loop header and the d1 application to p1/p2
 * are outside this excerpt.) */
2691 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2692 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2694 const int strength= ff_h263_loop_filter_strength[qscale];
2698 int p0= src[y*stride-2];
2699 int p1= src[y*stride-1];
2700 int p2= src[y*stride+0];
2701 int p3= src[y*stride+1];
2702 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* piecewise-linear correction ramp, identical to the vertical filter */
2704 if (d<-2*strength) d1= 0;
2705 else if(d<- strength) d1=-2*strength - d;
2706 else if(d< strength) d1= d;
2707 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clip to 0..255: negative -> 0, >255 -> 255 */
2712 if(p1&256) p1= ~(p1>>31);
2713 if(p2&256) p2= ~(p2>>31);
2715 src[y*stride-1] = p1;
2716 src[y*stride+0] = p2;
/* secondary, weaker correction for the outer pixels */
2720 d2= av_clip((p0-p3)/4, -ad1, ad1);
2722 src[y*stride-2] = p0 - d2;
2723 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing over an 8x8
 * block, computed in a temp[] buffer scaled by 4 so the final store
 * can round with a single shift.  Border rows/columns are copied
 * through unfiltered (the 4*src entries).  (NOTE(review): loop
 * headers and temp[] declaration are outside this excerpt.) */
2728 static void h261_loop_filter_c(uint8_t *src, int stride){
/* top/bottom rows: pass through, pre-scaled by 4 */
2733 temp[x ] = 4*src[x ];
2734 temp[x + 7*8] = 4*src[x + 7*stride];
2738 xy = y * stride + x;
/* vertical (1,2,1) pass into temp */
2740 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
/* left/right columns: store vertical-only result, rounded */
2745 src[ y*stride] = (temp[ y*8] + 2)>>2;
2746 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2748 xy = y * stride + x;
/* horizontal (1,2,1) pass + rounding; total scale 16 -> >>4 */
2750 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* 16-wide SAD (sum of absolute differences) between pix1 and pix2,
 * one unrolled row shown; accumulated over h rows into s.
 * (NOTE(review): loop header, pointer advances and return are outside
 * this excerpt.)  The unused void* is the MpegEncContext slot of the
 * me_cmp_func signature. */
2755 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2761 s += abs(pix1[0] - pix2[0]);
2762 s += abs(pix1[1] - pix2[1]);
2763 s += abs(pix1[2] - pix2[2]);
2764 s += abs(pix1[3] - pix2[3]);
2765 s += abs(pix1[4] - pix2[4]);
2766 s += abs(pix1[5] - pix2[5]);
2767 s += abs(pix1[6] - pix2[6]);
2768 s += abs(pix1[7] - pix2[7]);
2769 s += abs(pix1[8] - pix2[8]);
2770 s += abs(pix1[9] - pix2[9]);
2771 s += abs(pix1[10] - pix2[10]);
2772 s += abs(pix1[11] - pix2[11]);
2773 s += abs(pix1[12] - pix2[12]);
2774 s += abs(pix1[13] - pix2[13]);
2775 s += abs(pix1[14] - pix2[14]);
2776 s += abs(pix1[15] - pix2[15]);
/* 16-wide SAD against the horizontally half-pel interpolated reference:
 * each pix2 sample is avg2() of two horizontal neighbours (reads up to
 * pix2[16]).  One unrolled row shown, accumulated over h rows. */
2783 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2789 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2790 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2791 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2792 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2793 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2794 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2795 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2796 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2797 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2798 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2799 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2800 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2801 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2802 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2803 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2804 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* 16-wide SAD against the vertically half-pel interpolated reference:
 * pix3 points one line below pix2 and each sample is avg2() of the
 * vertical pair.  One unrolled row shown, accumulated over h rows. */
2811 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2814 uint8_t *pix3 = pix2 + line_size;
2818 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2819 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2820 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2821 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2822 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2823 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2824 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2825 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2826 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2827 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2828 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2829 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2830 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2831 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2832 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2833 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* 16-wide SAD against the 2D (center) half-pel interpolated reference:
 * each sample is avg4() of the 2x2 neighbourhood spanning pix2/pix3
 * (reads up to column 16).  One unrolled row shown, over h rows. */
2841 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2844 uint8_t *pix3 = pix2 + line_size;
2848 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2849 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2850 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2851 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2852 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2853 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2854 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2855 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2856 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2857 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2858 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2859 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2860 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2861 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2862 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2863 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD; same structure as pix_abs16_c but one 8-pixel row. */
2871 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2877 s += abs(pix1[0] - pix2[0]);
2878 s += abs(pix1[1] - pix2[1]);
2879 s += abs(pix1[2] - pix2[2]);
2880 s += abs(pix1[3] - pix2[3]);
2881 s += abs(pix1[4] - pix2[4]);
2882 s += abs(pix1[5] - pix2[5]);
2883 s += abs(pix1[6] - pix2[6]);
2884 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD vs. horizontally half-pel interpolated reference (avg2 of
 * horizontal neighbours; reads up to pix2[8]). */
2891 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2897 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2898 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2899 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2900 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2901 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2902 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2903 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2904 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD vs. vertically half-pel interpolated reference (avg2 of
 * each sample with the one a line below). */
2911 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2914 uint8_t *pix3 = pix2 + line_size;
2918 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2919 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2920 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2921 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2922 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2923 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2924 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2925 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD vs. the 2D (center) half-pel interpolated reference
 * (avg4 of the 2x2 neighbourhood; reads up to column 8). */
2933 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2936 uint8_t *pix3 = pix2 + line_size;
2940 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2941 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2942 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2943 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2944 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2945 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2946 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2947 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16-wide: score1 is the plain sum of squared
 * errors; score2 compares the 2x2 gradients of the two blocks so that
 * matching noise/texture is penalised less.  The gradient term is
 * weighted by avctx->nsse_weight (default 8 when no context given). */
2955 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2956 MpegEncContext *c = v;
2962 for(x=0; x<16; x++){
2963 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
/* 15 horizontal gradient pairs per row */
2966 for(x=0; x<15; x++){
2967 score2+= FFABS( s1[x ] - s1[x +stride]
2968 - s1[x+1] + s1[x+1+stride])
2969 -FFABS( s2[x ] - s2[x +stride]
2970 - s2[x+1] + s2[x+1+stride]);
/* c may legitimately be NULL when called outside an encode context */
2977 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
2978 else return score1 + FFABS(score2)*8;
/* Noise-preserving SSE, 8-wide variant of nsse16_c (loop headers are
 * outside this excerpt). */
2981 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2982 MpegEncContext *c = v;
2989 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2993 score2+= FFABS( s1[x ] - s1[x +stride]
2994 - s1[x+1] + s1[x+1+stride])
2995 -FFABS( s2[x ] - s2[x +stride]
2996 - s2[x+1] + s2[x+1+stride]);
3003 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3004 else return score1 + FFABS(score2)*8;
/* Evaluate the weighted squared error of adding 'scale' times a basis
 * function to the 8x8 residual 'rem', without modifying rem.  The
 * basis values carry BASIS_SHIFT fractional bits and are rounded back
 * to RECON_SHIFT precision before summation. */
3007 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3011 for(i=0; i<8*8; i++){
3012 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
/* candidate coefficients must stay within the 10-bit-ish range */
3015 assert(-512<b && b<512);
3017 sum += (w*b)*(w*b)>>4;
/* Add 'scale' times a basis function into the 8x8 residual in place,
 * with the same BASIS_SHIFT -> RECON_SHIFT rounding as try_8x8basis_c. */
3022 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3025 for(i=0; i<8*8; i++){
3026 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3031 * permutes an 8x8 block.
3032 * @param block the block which will be permuted according to the given permutation vector
3033 * @param permutation the permutation vector
3034 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3035 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3036 * (inverse) permuted to scantable order!
3038 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3044     //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* first pass: stash the coefficients reachable through scantable[0..last]
 * (NOTE(review): the temp[] copy/clear statements are outside this excerpt) */
3046     for(i=0; i<=last; i++){
3047         const int j= scantable[i];
/* second pass: scatter each stashed coefficient to its permuted slot */
3052     for(i=0; i<=last; i++){
3053         const int j= scantable[i];
3054         const int perm_j= permutation[j];
3055         block[perm_j]= temp[j];
3059 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3063 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3066 memset(cmp, 0, sizeof(void*)*6);
3074 cmp[i]= c->hadamard8_diff[i];
3080 cmp[i]= c->dct_sad[i];
3083 cmp[i]= c->dct264_sad[i];
3086 cmp[i]= c->dct_max[i];
3089 cmp[i]= c->quant_psnr[i];
3118 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Zero one 64-coefficient DCT block. */
3123 static void clear_block_c(DCTELEM *block)
3125     memset(block, 0, sizeof(DCTELEM)*64);
/* Zero all six DCT blocks of a macroblock in one call: */
3129  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3131 static void clear_blocks_c(DCTELEM *blocks)
3133     memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for w bytes, SWAR style: a machine word at a time,
 * adding the low 7 bits of every byte lane and fixing the top bits with
 * XOR so no carry leaks between lanes (see pb_7f/pb_80).  The scalar
 * tail loop (outside this excerpt) handles the remaining < sizeof(long)
 * bytes. */
3136 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3138     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3139         long a = *(long*)(src+i);
3140         long b = *(long*)(dst+i);
3141         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3144         dst[i+0] += src[i+0];
/* dst[i] = src1[i] + src2[i] for w bytes, using the same carry-less
 * SWAR per-byte addition as add_bytes_c, with a scalar tail loop. */
3147 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3149     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3150         long a = *(long*)(src1+i);
3151         long b = *(long*)(src2+i);
3152         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3155         dst[i] = src1[i]+src2[i];
/* dst[i] = src1[i] - src2[i] for w bytes.  On targets without fast
 * unaligned loads, a misaligned src2 falls back to an unrolled byte
 * loop; otherwise subtraction is done SWAR-style per machine word,
 * borrow-isolated per byte lane via the pb_7f/pb_80 masks.  A scalar
 * tail loop handles the remainder. */
3158 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3160 #if !HAVE_FAST_UNALIGNED
3161     if((long)src2 & (sizeof(long)-1)){
3162         for(i=0; i+7<w; i+=8){
3163             dst[i+0] = src1[i+0]-src2[i+0];
3164             dst[i+1] = src1[i+1]-src2[i+1];
3165             dst[i+2] = src1[i+2]-src2[i+2];
3166             dst[i+3] = src1[i+3]-src2[i+3];
3167             dst[i+4] = src1[i+4]-src2[i+4];
3168             dst[i+5] = src1[i+5]-src2[i+5];
3169             dst[i+6] = src1[i+6]-src2[i+6];
3170             dst[i+7] = src1[i+7]-src2[i+7];
3174     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3175         long a = *(long*)(src1+i);
3176         long b = *(long*)(src2+i);
3177         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3180         dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV decode helper: reconstruct w pixels from a median predictor
 * (left, top, left+top-topleft) plus the decoded residual in diff[].
 * NOTE(review): the loop header, the l/lt initialization from *left /
 * *left_top and the write-back of the running state are elided here. */
3183 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
3191 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* HuffYUV encode helper: emit the residual of w pixels against the median
 * predictor (inverse of add_hfyu_median_prediction_c).
 * NOTE(review): the loop body around this prediction line (dst[i]=src2[i]-pred
 * and the state updates) is elided in this listing. */
3200 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
3208 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* HuffYUV left-prediction decode: dst[i] = acc += src[i] (mod 256),
 * returning the final accumulator so the caller can chain rows.
 * NOTE(review): loop body and return are elided in this listing. */
3218 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
3221 for(i=0; i<w-1; i++){
3248 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Butterfly helpers for the Hadamard transforms below:
 * BUTTERFLY2 writes sum/difference of two inputs to two outputs,
 * BUTTERFLY1 does the same in place on x/y,
 * BUTTERFLYA yields |x+y| + |x-y| without storing.
 * NOTE(review): the continuation bodies of BUTTERFLY2/BUTTERFLY1 are elided
 * in this listing (only the backslash-continued headers remain). */
3278 #define BUTTERFLY2(o1,o2,i1,i2) \
3282 #define BUTTERFLY1(x,y) \
3291 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of (src - dst), summing absolute transform
 * coefficients. First the horizontal passes per row, then the vertical
 * passes per column, accumulating |.| via BUTTERFLYA.
 * NOTE(review): declarations (temp[64], sum), loop headers, the final
 * return and an #if 0 debug block around the printf are elided here. */
3293 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
/* horizontal transform of each of the 8 difference rows */
3301 //FIXME try pointer walks
3302 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3303 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3304 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3305 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3307 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3308 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3309 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3310 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3312 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3313 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3314 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3315 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical transform of each of the 8 columns, accumulating |coeff| */
3319 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3320 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3321 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3322 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3324 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3325 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3326 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3327 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3330 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3331 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3332 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3333 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* debug-only maximum tracker (normally inside #if 0) */
3339 printf("MAX:%d\n", maxi);
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but applied to the
 * raw source pixels (no reference), with the DC term (|mean|) removed at
 * the end so it measures only AC energy.
 * NOTE(review): declarations, loop headers and the final return are elided
 * in this listing. */
3345 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
/* horizontal transform of each source row */
3353 //FIXME try pointer walks
3354 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3355 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3356 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3357 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3359 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3360 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3361 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3362 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3364 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3365 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3366 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3367 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical transform per column, accumulating absolute coefficients */
3371 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3372 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3373 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3374 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3376 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3377 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3378 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3379 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3382 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3383 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3384 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3385 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* subtract the DC contribution so the score is mean-independent */
3388 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-DCT the 8x8 pixel difference and sum the
 * absolute coefficients.
 * NOTE(review): the forward DCT call (s->dsp.fdct(temp)) and the assert on
 * h==8 appear to be elided between these lines -- confirm before use. */
3393 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3394 MpegEncContext * const s= (MpegEncContext *)c;
3395 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3399 s->dsp.diff_pixels(temp, src1, src2, stride);
3401 return s->dsp.sum_abs_dctelem(temp);
/* Body of the shared 8-point H.264-style integer DCT kernel (DCT8_1D):
 * even part from pairwise sums s07..s34, odd part from differences
 * d07..d34 with the characteristic x + (x>>1) scalings.
 * NOTE(review): these are backslash-continued lines of a macro whose
 * #define header (and the DST(0,...)/DST(4,...) lines) are elided in this
 * listing; the fragment is not compilable as shown. */
3406 const int s07 = SRC(0) + SRC(7);\
3407 const int s16 = SRC(1) + SRC(6);\
3408 const int s25 = SRC(2) + SRC(5);\
3409 const int s34 = SRC(3) + SRC(4);\
3410 const int a0 = s07 + s34;\
3411 const int a1 = s16 + s25;\
3412 const int a2 = s07 - s34;\
3413 const int a3 = s16 - s25;\
3414 const int d07 = SRC(0) - SRC(7);\
3415 const int d16 = SRC(1) - SRC(6);\
3416 const int d25 = SRC(2) - SRC(5);\
3417 const int d34 = SRC(3) - SRC(4);\
3418 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3419 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3420 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3421 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3423 DST(1, a4 + (a7>>2)) ;\
3424 DST(2, a2 + (a3>>1)) ;\
3425 DST(3, a5 + (a6>>2)) ;\
3427 DST(5, a6 - (a5>>2)) ;\
3428 DST(6, (a2>>1) - a3 ) ;\
3429 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: run the DCT8_1D kernel over rows then columns of
 * the 8x8 pixel difference, summing |coeff| via the column-pass DST macro.
 * NOTE(review): the dct[8][8] declaration, DCT8_1D invocations and the
 * final return are elided between these lines. */
3432 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3433 MpegEncContext * const s= (MpegEncContext *)c;
3438 s->dsp.diff_pixels(dct[0], src1, src2, stride);
/* row pass: transform in place */
3440 #define SRC(x) dct[i][x]
3441 #define DST(x,v) dct[i][x]= v
3442 for( i = 0; i < 8; i++ )
/* column pass: accumulate absolute values instead of storing */
3447 #define SRC(x) dct[x][i]
3448 #define DST(x,v) sum += FFABS(v)
3449 for( i = 0; i < 8; i++ )
/* DCT "max" metric: forward-DCT the 8x8 difference and return the largest
 * absolute coefficient.
 * NOTE(review): the fdct call, the loop header over the 64 coefficients
 * and the return are elided in this listing. */
3457 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3458 MpegEncContext * const s= (MpegEncContext *)c;
3459 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3464 s->dsp.diff_pixels(temp, src1, src2, stride);
3468 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-PSNR metric: DCT + quantize + dequantize + IDCT the 8x8
 * difference and return the squared error against the unquantized copy,
 * i.e. the distortion introduced by quantization alone.
 * NOTE(review): the fdct call, loop headers and the return are elided. */
3473 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3474 MpegEncContext * const s= (MpegEncContext *)c;
3475 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3476 DCTELEM * const bak = temp+64;
3482 s->dsp.diff_pixels(temp, src1, src2, stride);
/* keep an unquantized reference copy in the second half of temp */
3484 memcpy(bak, temp, 64*sizeof(DCTELEM));
3486 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3487 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3488 ff_simple_idct(temp); //FIXME
/* accumulate squared quantization error per coefficient */
3491 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for one 8x8 block: estimate the VLC bit cost of
 * the quantized coefficients, reconstruct the block, measure the SSE
 * distortion, and combine them with a lambda derived from qscale.
 * NOTE(review): several lines (run/level extraction, escape handling,
 * start_i selection, the intra/inter branch headers) are elided here. */
3496 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3497 MpegEncContext * const s= (MpegEncContext *)c;
3498 const uint8_t *scantable= s->intra_scantable.permutated;
3499 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3500 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3501 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3502 int i, last, run, bits, level, distortion, start_i;
3503 const int esc_length= s->ac_esc_length;
3505 uint8_t * last_length;
/* work on local stride-8 copies so the reconstruction can be done in place */
3509 copy_block8(lsrc1, src1, 8, stride, 8);
3510 copy_block8(lsrc2, src2, 8, stride, 8);
3512 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3514 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra: separate DC VLC table */
3520 length = s->intra_ac_vlc_length;
3521 last_length= s->intra_ac_vlc_last_length;
3522 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter: AC tables only */
3525 length = s->inter_ac_vlc_length;
3526 last_length= s->inter_ac_vlc_last_length;
/* count bits for all non-last coefficients in scan order */
3531 for(i=start_i; i<last; i++){
3532 int j= scantable[i];
3537 if((level&(~127)) == 0){
3538 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the last coefficient uses the "last" VLC table */
3547 level= temp[i] + 64;
3551 if((level&(~127)) == 0){
3552 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* dequantize + IDCT-add to reconstruct, then measure SSE distortion */
3560 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3562 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3565 s->dsp.idct_add(lsrc2, 8, temp);
3567 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
/* D + lambda*R with lambda ~ qscale^2 * 109/128 */
3569 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-cost metric for one 8x8 block: quantize the pixel difference and
 * estimate only the VLC bit count (no distortion term, unlike rd8x8_c).
 * NOTE(review): run/level extraction, escape handling and the final
 * return of `bits` are elided in this listing. */
3572 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3573 MpegEncContext * const s= (MpegEncContext *)c;
3574 const uint8_t *scantable= s->intra_scantable.permutated;
3575 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3576 int i, last, run, bits, level, start_i;
3577 const int esc_length= s->ac_esc_length;
3579 uint8_t * last_length;
3583 s->dsp.diff_pixels(temp, src1, src2, stride);
3585 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra: DC coded with its own VLC table */
3591 length = s->intra_ac_vlc_length;
3592 last_length= s->intra_ac_vlc_last_length;
3593 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter: AC tables only */
3596 length = s->inter_ac_vlc_length;
3597 last_length= s->inter_ac_vlc_last_length;
/* bits for all non-last coefficients */
3602 for(i=start_i; i<last; i++){
3603 int j= scantable[i];
3608 if((level&(~127)) == 0){
3609 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* last coefficient via the "last" table */
3618 level= temp[i] + 64;
3622 if((level&(~127)) == 0){
3623 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Template for vsad_intra8_c / vsad_intra16_c: vertical SAD of a block
 * against itself one row down (sum of |s[x] - s[x+stride]|), a cheap
 * measure of vertical activity.
 * NOTE(review): the s+=stride advance, score return and closing of the
 * macro are elided among these backslash-continued lines. */
3631 #define VSAD_INTRA(size) \
3632 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3636     for(y=1; y<h; y++){ \
3637         for(x=0; x<size; x+=4){ \
3638             score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
3639                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* Vertical SAD of the 16-wide difference signal s1-s2: penalizes vertical
 * changes of the residual between adjacent rows.
 * NOTE(review): loop headers, pointer advances and the return are elided. */
3649 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3654 for(x=0; x<16; x++){
3655 score+= FFABS(s1[x  ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ: square helper. VSSE_INTRA: template for vsse_intra8_c /
 * vsse_intra16_c -- like VSAD_INTRA but accumulating squared vertical
 * differences instead of absolute ones.
 * NOTE(review): the macro's trailing lines (advance, return, closing) are
 * elided among these backslash-continued lines. */
3664 #define SQ(a) ((a)*(a))
3665 #define VSSE_INTRA(size) \
3666 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3670     for(y=1; y<h; y++){ \
3671         for(x=0; x<size; x+=4){ \
3672             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
3673                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* Vertical SSE of the 16-wide difference signal s1-s2 (squared version of
 * vsad16_c). NOTE(review): loop headers and the return are elided. */
3683 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3688 for(x=0; x<16; x++){
3689 score+= SQ(s1[x  ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/**
 * Sum of squared differences between an int8 vector and an int16 vector.
 * @param pix1 first operand, 8-bit signed samples
 * @param pix2 second operand, 16-bit signed samples
 * @param size number of elements to compare
 * @return sum over i of (pix1[i]-pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int score = 0;
    int i;

    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];  /* evaluate the difference once */
        score += d * d;
    }
    return score;
}
/* Instantiate 16x16 comparison functions from the 8x8 kernels above: each
 * WRAPPER8_16_SQ call builds a *16_c function that applies the 8x8 metric
 * to the four quadrants and sums the results.
 * NOTE(review): the #if guards around dct264_sad (CONFIG_GPL) that the
 * original carries are elided in this listing. */
3707 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3708 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3709 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3711 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3713 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3714 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3715 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3716 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Element-wise product: dst[i] = src0[i] * src1[i] for len floats. */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i];
}
/* dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read backwards.
 * src1 is advanced to its last element so src1[-i] walks it in reverse. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}
/* Fused multiply-add: dst[i] = src0[i] * src1[i] + src2[i] for len floats. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}
/* Overlap-add windowing (MDCT-style): for a window of 2*len coefficients,
 * dst[len+i] = src0[len+i]*win[len+j] - src1[j]*win[len+i] and
 * dst[len+j] = src0[len+i]*win[len+i] + src1[j]*win[len+j],
 * with i running over the first half and j = -i-1 mirroring it.
 * dst/win/src0 are advanced by len so negative indices address the first
 * half and non-negative ones the second half. */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i, j;
    dst  += len;
    win  += len;
    src0 += len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
    }
}
/* Scale a float vector: dst[i] = src[i] * mul for len elements. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}
/* Multiply src by a sequence of 2-element scale vectors and a scalar:
 * for each pair (i, i+1), dst[i+k] = src[i+k] * sv[i/2][k] * mul.
 * len must be a multiple of 2; sv advances one vector per pair. */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}
/* 4-element variant of vector_fmul_sv_scalar_2_c:
 * dst[i+k] = src[i+k] * sv[i/4][k] * mul for k in 0..3.
 * len must be a multiple of 4; sv advances one vector per group. */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}
/* Expand 2-element scale vectors into dst: dst[i+k] = sv[i/2][k] * mul.
 * len must be a multiple of 2; sv advances one vector per pair. */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}
/* 4-element variant of sv_fmul_scalar_2_c: dst[i+k] = sv[i/4][k] * mul.
 * len must be a multiple of 4; sv advances one vector per group. */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}
/* In-place butterfly on two float vectors:
 * (v1[i], v2[i]) <- (v1[i]+v2[i], v1[i]-v2[i]) for len elements.
 * The restrict qualifiers promise the vectors do not alias. */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];  /* difference saved before v1 is updated */
        v1[i] += v2[i];
        v2[i] = t;
    }
}
/* Dot product of two float vectors: returns sum over i of v1[i]*v2[i]. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;
    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];
    return p;
}
/* Clip one float, given as its IEEE-754 bit pattern, against min/max whose
 * signs differ (min < 0 < max). `mini` is the bit pattern of min, `maxi`
 * of max, and `maxisign` is maxi with the sign bit flipped; the unsigned
 * comparisons then order negative floats correctly without an FPU compare. */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    /* was (1<<31): left-shifting 1 into the sign bit of a signed int is
     * undefined behaviour in C; use an unsigned constant instead */
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
/* Clip len floats into [*min, *max] when min < 0 < max, working on the
 * IEEE-754 bit patterns via clipf_c_one. len must be a multiple of 8; the
 * loop is unrolled 8x.
 * NOTE(review): the pointer casts type-pun float as uint32_t, which
 * breaks strict aliasing -- kept as in the original for bit-exactness. */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);  /* was 1<<31: signed-shift UB */
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/* Clip len floats into [min, max]; len must be a multiple of 8.
 * When min and max straddle zero, dispatch to the bit-pattern fast path;
 * otherwise clip with av_clipf, unrolled 8x. */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
/* Dot product of two int16 vectors, with each partial product arithmetic-
 * shifted right by `shift` before accumulation. */
static int32_t scalarproduct_int16_c(const int16_t *v1, const int16_t *v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}
/* Fused dot-product and multiply-accumulate:
 * returns sum of v1[i]*v2[i] (using v1's values BEFORE the update), while
 * updating v1[i] += mul * v3[i] in the same pass. */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;

    while (order--) {
        res   += *v1 * *v2++;   /* read v1 before it is modified below */
        *v1++ += mul * *v3++;
    }
    return res;
}
/* Fixed-point coefficients for the WMV2 IDCT below:
 * Wk = round(2048 * sqrt(2) * cos(k*pi/16)).
 * NOTE(review): wmv2_idct_row/col also reference W0, whose #define (2048)
 * is not visible in this listing -- confirm it exists before building. */
3894 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3895 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3896 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3897 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3898 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3899 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3900 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
/* One row pass of the WMV2 8-point fixed-point IDCT: odd-part (a1..a7)
 * and even-part (a0,a2,a4,a6) butterflies followed by an 8-bit descale.
 * NOTE(review): the declarations of s1/s2 and the opening brace are not
 * visible in this listing. */
3902 static void wmv2_idct_row(short * b)
3905 int a0,a1,a2,a3,a4,a5,a6,a7;
/* odd coefficients */
3907 a1 = W1*b[1]+W7*b[7];
3908 a7 = W7*b[1]-W1*b[7];
3909 a5 = W5*b[5]+W3*b[3];
3910 a3 = W3*b[5]-W5*b[3];
/* even coefficients */
3911 a2 = W2*b[2]+W6*b[6];
3912 a6 = W6*b[2]-W2*b[6];
3913 a0 = W0*b[0]+W0*b[4];
3914 a4 = W0*b[0]-W0*b[4];
/* 181/256 approximates 1/sqrt(2) for the cross terms */
3916 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3917 s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* recombine and descale by 8 bits with rounding */
3919 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3920 b[1] = (a4+a6 +s1   + (1<<7))>>8;
3921 b[2] = (a4-a6 +s2   + (1<<7))>>8;
3922 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3923 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3924 b[5] = (a4-a6 -s2   + (1<<7))>>8;
3925 b[6] = (a4+a6 -s1   + (1<<7))>>8;
3926 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One column pass of the WMV2 IDCT (stride 8): same structure as the row
 * pass but with pre-rounded >>3 intermediates for extended precision and a
 * final 14-bit descale.
 * NOTE(review): the s1/s2 declarations and the opening brace are not
 * visible in this listing. */
3928 static void wmv2_idct_col(short * b)
3931 int a0,a1,a2,a3,a4,a5,a6,a7;
3932 /*step 1, with extended precision*/
3933 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3934 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3935 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3936 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3937 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3938 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3939 a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
3940 a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
/* 181/256 ~= 1/sqrt(2) cross terms, as in the row pass */
3942 s1 = (181*(a1-a5+a7-a3)+128)>>8;
3943 s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* recombine and descale by 14 bits with rounding */
3945 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3946 b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
3947 b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
3948 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3950 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3951 b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
3952 b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
3953 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 2-D WMV2 IDCT on a 64-coefficient block: row pass over each of the
 * 8 rows, then column pass over each of the 8 columns. */
void ff_wmv2_idct_c(short * block){
    int i;

    for (i = 0; i < 64; i += 8)
        wmv2_idct_row(block + i);
    for (i = 0; i < 8; i++)
        wmv2_idct_col(block + i);
}
3965 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3967 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
3969 ff_wmv2_idct_c(block);
3970 ff_put_pixels_clamped_c(block, dest, line_size);
3972 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
3974 ff_wmv2_idct_c(block);
3975 ff_add_pixels_clamped_c(block, dest, line_size);
3977 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3980 ff_put_pixels_clamped_c(block, dest, line_size);
3982 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3985 ff_add_pixels_clamped_c(block, dest, line_size);
3988 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3991 put_pixels_clamped4_c(block, dest, line_size);
3993 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3996 add_pixels_clamped4_c(block, dest, line_size);
3999 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4002 put_pixels_clamped2_c(block, dest, line_size);
4004 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4007 add_pixels_clamped2_c(block, dest, line_size);
4010 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4012 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4014 dest[0] = cm[(block[0] + 4)>>3];
4016 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4018 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4020 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4023 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4025 /* init static data */
4026 av_cold void dsputil_static_init(void)
4030 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4031 for(i=0;i<MAX_NEG_CROP;i++) {
4033 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4036 for(i=0;i<512;i++) {
4037 ff_squareTbl[i] = (i - 256) * (i - 256);
4040 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4043 int ff_check_alignment(void){
4044 static int did_fail=0;
4045 DECLARE_ALIGNED(16, int, aligned);
4047 if((intptr_t)&aligned & 15){
4049 #if HAVE_MMX || HAVE_ALTIVEC
4050 av_log(NULL, AV_LOG_ERROR,
4051 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4052 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4053 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4054 "Do not report crashes to FFmpeg developers.\n");
/* Populate a DSPContext with the C reference implementations, select the
 * (I)DCT according to the codec context, install codec-specific helpers,
 * let the per-architecture init functions override entries with SIMD
 * versions, and finally build the IDCT coefficient permutation table.
 * NOTE(review): this listing is heavily elided -- many branch headers,
 * closing braces and assignments are missing; treat line groups below as
 * representative, not complete. */
4063 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4067 ff_check_alignment();
/* forward DCT selection (encoders only) */
4070 if(avctx->dct_algo==FF_DCT_FASTINT) {
4071 c->fdct = fdct_ifast;
4072 c->fdct248 = fdct_ifast248;
4074 else if(avctx->dct_algo==FF_DCT_FAAN) {
4075 c->fdct = ff_faandct;
4076 c->fdct248 = ff_faandct248;
4079 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4080 c->fdct248 = ff_fdct248_islow;
4082 #endif //CONFIG_ENCODERS
/* inverse DCT selection: reduced-size IDCTs for lowres, otherwise by algo */
4084 if(avctx->lowres==1){
4085 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4086 c->idct_put= ff_jref_idct4_put;
4087 c->idct_add= ff_jref_idct4_add;
4089 c->idct_put= ff_h264_lowres_idct_put_c;
4090 c->idct_add= ff_h264_lowres_idct_add_c;
4092 c->idct    = j_rev_dct4;
4093 c->idct_permutation_type= FF_NO_IDCT_PERM;
4094 }else if(avctx->lowres==2){
4095 c->idct_put= ff_jref_idct2_put;
4096 c->idct_add= ff_jref_idct2_add;
4097 c->idct    = j_rev_dct2;
4098 c->idct_permutation_type= FF_NO_IDCT_PERM;
4099 }else if(avctx->lowres==3){
4100 c->idct_put= ff_jref_idct1_put;
4101 c->idct_add= ff_jref_idct1_add;
4102 c->idct    = j_rev_dct1;
4103 c->idct_permutation_type= FF_NO_IDCT_PERM;
4105 if(avctx->idct_algo==FF_IDCT_INT){
4106 c->idct_put= ff_jref_idct_put;
4107 c->idct_add= ff_jref_idct_add;
4108 c->idct    = j_rev_dct;
4109 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4110 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4111 avctx->idct_algo==FF_IDCT_VP3){
4112 c->idct_put= ff_vp3_idct_put_c;
4113 c->idct_add= ff_vp3_idct_add_c;
4114 c->idct    = ff_vp3_idct_c;
4115 c->idct_permutation_type= FF_NO_IDCT_PERM;
4116 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4117 c->idct_put= ff_wmv2_idct_put_c;
4118 c->idct_add= ff_wmv2_idct_add_c;
4119 c->idct    = ff_wmv2_idct_c;
4120 c->idct_permutation_type= FF_NO_IDCT_PERM;
4121 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4122 c->idct_put= ff_faanidct_put;
4123 c->idct_add= ff_faanidct_add;
4124 c->idct    = ff_faanidct;
4125 c->idct_permutation_type= FF_NO_IDCT_PERM;
4126 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4127 c->idct_put= ff_ea_idct_put_c;
4128 c->idct_permutation_type= FF_NO_IDCT_PERM;
4129 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4130 c->idct     = ff_bink_idct_c;
4131 c->idct_add = ff_bink_idct_add_c;
4132 c->idct_put = ff_bink_idct_put_c;
4133 c->idct_permutation_type = FF_NO_IDCT_PERM;
4134 }else{ //accurate/default
4135 c->idct_put= ff_simple_idct_put;
4136 c->idct_add= ff_simple_idct_add;
4137 c->idct    = ff_simple_idct;
4138 c->idct_permutation_type= FF_NO_IDCT_PERM;
/* basic pixel-block helpers */
4142 c->get_pixels = get_pixels_c;
4143 c->diff_pixels = diff_pixels_c;
4144 c->put_pixels_clamped = ff_put_pixels_clamped_c;
4145 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
4146 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4147 c->add_pixels_clamped = ff_add_pixels_clamped_c;
4148 c->add_pixels8 = add_pixels8_c;
4149 c->add_pixels4 = add_pixels4_c;
4150 c->sum_abs_dctelem = sum_abs_dctelem_c;
4151 c->emulated_edge_mc = ff_emulated_edge_mc;
4154 c->clear_block = clear_block_c;
4155 c->clear_blocks = clear_blocks_c;
4156 c->pix_sum = pix_sum_c;
4157 c->pix_norm1 = pix_norm1_c;
4159 c->fill_block_tab[0] = fill_block16_c;
4160 c->fill_block_tab[1] = fill_block8_c;
4161 c->scale_block = scale_block_c;
/* motion-estimation SAD variants: [0] 16x16, [1] 8x8; x2/y2/xy2 are
 * half-pel interpolated versions */
4163 /* TODO [0] 16  [1] 8 */
4164 c->pix_abs[0][0] = pix_abs16_c;
4165 c->pix_abs[0][1] = pix_abs16_x2_c;
4166 c->pix_abs[0][2] = pix_abs16_y2_c;
4167 c->pix_abs[0][3] = pix_abs16_xy2_c;
4168 c->pix_abs[1][0] = pix_abs8_c;
4169 c->pix_abs[1][1] = pix_abs8_x2_c;
4170 c->pix_abs[1][2] = pix_abs8_y2_c;
4171 c->pix_abs[1][3] = pix_abs8_xy2_c;
4173 #define dspfunc(PFX, IDX, NUM) \
4174     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
4175     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
4176     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
4177     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4179 dspfunc(put, 0, 16);
4180 dspfunc(put_no_rnd, 0, 16);
4182 dspfunc(put_no_rnd, 1, 8);
4186 dspfunc(avg, 0, 16);
4187 dspfunc(avg_no_rnd, 0, 16);
4189 dspfunc(avg_no_rnd, 1, 8);
4194 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4195 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
/* third-pel motion compensation (SVQ3); index = y*4 + x in third-pels */
4197 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4198 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4199 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4200 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4201 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4202 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4203 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4204 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4205 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4207 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4208 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4209 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4210 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4211 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4212 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4213 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4214 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4215 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4217 #define dspfunc(PFX, IDX, NUM) \
4218     c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4219     c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4220     c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4221     c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4222     c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4223     c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4224     c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4225     c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4226     c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4227     c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4228     c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4229     c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4230     c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4231     c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4232     c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4233     c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
/* quarter-pel motion compensation tables */
4235 dspfunc(put_qpel, 0, 16);
4236 dspfunc(put_no_rnd_qpel, 0, 16);
4238 dspfunc(avg_qpel, 0, 16);
4239 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4241 dspfunc(put_qpel, 1, 8);
4242 dspfunc(put_no_rnd_qpel, 1, 8);
4244 dspfunc(avg_qpel, 1, 8);
4245 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4247 dspfunc(put_h264_qpel, 0, 16);
4248 dspfunc(put_h264_qpel, 1, 8);
4249 dspfunc(put_h264_qpel, 2, 4);
4250 dspfunc(put_h264_qpel, 3, 2);
4251 dspfunc(avg_h264_qpel, 0, 16);
4252 dspfunc(avg_h264_qpel, 1, 8);
4253 dspfunc(avg_h264_qpel, 2, 4);
4256 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4257 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4258 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4259 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4260 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4261 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4263 c->draw_edges = draw_edges_c;
/* codec-specific DSP extensions, compiled in on demand */
4265 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4266 ff_mlp_init(c, avctx);
4268 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4269 ff_intrax8dsp_init(c,avctx);
4271 #if CONFIG_RV30_DECODER
4272 ff_rv30dsp_init(c,avctx);
4274 #if CONFIG_RV40_DECODER
4275 ff_rv40dsp_init(c,avctx);
4276 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4277 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4278 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4279 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4282 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
4283 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4284 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4285 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4286 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4287 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4288 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4289 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* comparison-function tables: slot 0 = 16x16, slot 1 = 8x8 */
4291 #define SET_CMP_FUNC(name) \
4292     c->name[0]= name ## 16_c;\
4293     c->name[1]= name ## 8x8_c;
4295 SET_CMP_FUNC(hadamard8_diff)
4296 c->hadamard8_diff[4]= hadamard8_intra16_c;
4297 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4298 SET_CMP_FUNC(dct_sad)
4299 SET_CMP_FUNC(dct_max)
4301 SET_CMP_FUNC(dct264_sad)
4303 c->sad[0]= pix_abs16_c;
4304 c->sad[1]= pix_abs8_c;
4308 SET_CMP_FUNC(quant_psnr)
4311 c->vsad[0]= vsad16_c;
4312 c->vsad[4]= vsad_intra16_c;
4313 c->vsad[5]= vsad_intra8_c;
4314 c->vsse[0]= vsse16_c;
4315 c->vsse[4]= vsse_intra16_c;
4316 c->vsse[5]= vsse_intra8_c;
4317 c->nsse[0]= nsse16_c;
4318 c->nsse[1]= nsse8_c;
4320 ff_dsputil_init_dwt(c);
4323 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* lossless / HuffYUV byte-wise helpers */
4325 c->add_bytes= add_bytes_c;
4326 c->add_bytes_l2= add_bytes_l2_c;
4327 c->diff_bytes= diff_bytes_c;
4328 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4329 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4330 c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
4331 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4332 c->bswap_buf= bswap_buf;
4333 c->bswap16_buf = bswap16_buf;
4334 #if CONFIG_PNG_DECODER
4335 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4338 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4339 c->h263_h_loop_filter= h263_h_loop_filter_c;
4340 c->h263_v_loop_filter= h263_v_loop_filter_c;
4343 if (CONFIG_VP3_DECODER) {
4344 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4345 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4346 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4349 c->h261_loop_filter= h261_loop_filter_c;
4351 c->try_8x8basis= try_8x8basis_c;
4352 c->add_8x8basis= add_8x8basis_c;
4354 #if CONFIG_VORBIS_DECODER
4355 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4357 #if CONFIG_AC3_DECODER
4358 c->ac3_downmix = ff_ac3_downmix_c;
/* float vector primitives (audio codecs) */
4360 c->vector_fmul = vector_fmul_c;
4361 c->vector_fmul_reverse = vector_fmul_reverse_c;
4362 c->vector_fmul_add = vector_fmul_add_c;
4363 c->vector_fmul_window = vector_fmul_window_c;
4364 c->vector_clipf = vector_clipf_c;
4365 c->scalarproduct_int16 = scalarproduct_int16_c;
4366 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4367 c->scalarproduct_float = scalarproduct_float_c;
4368 c->butterflies_float = butterflies_float_c;
4369 c->vector_fmul_scalar = vector_fmul_scalar_c;
4371 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4372 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4374 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4375 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4377 c->shrink[0]= av_image_copy_plane;
4378 c->shrink[1]= ff_shrink22;
4379 c->shrink[2]= ff_shrink44;
4380 c->shrink[3]= ff_shrink88;
4382 c->prefetch= just_return;
4384 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4385 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
/* per-architecture overrides replace C entries with SIMD versions */
4387 if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
4388 if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
4389 if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
4390 if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
4391 if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
4392 if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
4393 if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
4394 if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
4395 if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
/* fall back to h264 qpel for any 2tap slot no arch init filled */
4397 for(i=0; i<64; i++){
4398 if(!c->put_2tap_qpel_pixels_tab[0][i])
4399 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4400 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4401 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
/* RV30/RV40 full-pel slots reuse the h264 mc00 functions */
4404 c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4405 c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4406 c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4407 c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4409 c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4410 c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4411 c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4412 c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
/* build the coefficient permutation matching the chosen IDCT's layout */
4414 switch(c->idct_permutation_type){
4415 case FF_NO_IDCT_PERM:
4417 c->idct_permutation[i]= i;
4419 case FF_LIBMPEG2_IDCT_PERM:
4421 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4423 case FF_SIMPLE_IDCT_PERM:
4425 c->idct_permutation[i]= simple_mmx_permutation[i];
4427 case FF_TRANSPOSE_IDCT_PERM:
4429 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4431 case FF_PARTTRANS_IDCT_PERM:
4433 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4435 case FF_SSE2_IDCT_PERM:
4437 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4440 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");