3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavcore/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Clipping lookup table (indexed with an offset of MAX_NEG_CROP when used)
 * and a table of squares; both are zero-filled here and populated by the
 * init code (not visible in this excerpt). */
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44 uint32_t ff_squareTbl[512] = {0, };
46 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 evaluates to 0x0101...01, so these replicate the byte value
 * 0x7f / 0x80 into every byte of an unsigned long. */
47 #define pb_7f (~0UL/255 * 0x7f)
48 #define pb_80 (~0UL/255 * 0x80)
/* Classic 8x8 zig-zag scan order (JPEG/MPEG style): entry i is the raster
 * index of the coefficient read at scan position i. */
50 const uint8_t ff_zigzag_direct[64] = {
51 0, 1, 8, 16, 9, 2, 3, 10,
52 17, 24, 32, 25, 18, 11, 4, 5,
53 12, 19, 26, 33, 40, 48, 41, 34,
54 27, 20, 13, 6, 7, 14, 21, 28,
55 35, 42, 49, 56, 57, 50, 43, 36,
56 29, 22, 15, 23, 30, 37, 44, 51,
57 58, 59, 52, 45, 38, 31, 39, 46,
58 53, 60, 61, 54, 47, 55, 62, 63
61 /* Specific zigzag scan for 248 idct. NOTE that unlike the
62 specification, we interleave the fields */
/* (Each pair of adjacent entries alternates between the two fields, e.g.
 * 0/8, 1/9, ... — field-interleaved rather than the spec's field-separate
 * ordering.) */
63 const uint8_t ff_zigzag248_direct[64] = {
64 0, 8, 1, 9, 16, 24, 2, 10,
65 17, 25, 32, 40, 48, 56, 33, 41,
66 18, 26, 3, 11, 4, 12, 19, 27,
67 34, 42, 49, 57, 50, 58, 35, 43,
68 20, 28, 5, 13, 6, 14, 21, 29,
69 36, 44, 51, 59, 52, 60, 37, 45,
70 22, 30, 7, 15, 23, 31, 38, 46,
71 53, 61, 54, 62, 39, 47, 55, 63,
74 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Filled in at runtime (initialization not visible in this excerpt). */
75 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate horizontal scan order (used for interlaced material). */
77 const uint8_t ff_alternate_horizontal_scan[64] = {
78 0, 1, 2, 3, 8, 9, 16, 17,
79 10, 11, 4, 5, 6, 7, 15, 14,
80 13, 12, 19, 18, 24, 25, 32, 33,
81 26, 27, 20, 21, 22, 23, 28, 29,
82 30, 31, 34, 35, 40, 41, 48, 49,
83 42, 43, 36, 37, 38, 39, 44, 45,
84 46, 47, 50, 51, 56, 57, 58, 59,
85 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order (used for interlaced material). */
88 const uint8_t ff_alternate_vertical_scan[64] = {
89 0, 8, 16, 24, 1, 9, 2, 10,
90 17, 25, 32, 40, 48, 56, 57, 49,
91 41, 33, 26, 18, 3, 11, 4, 12,
92 19, 27, 34, 42, 50, 58, 35, 43,
93 51, 59, 20, 28, 5, 13, 6, 14,
94 21, 29, 36, 44, 52, 60, 37, 45,
95 53, 61, 22, 30, 7, 15, 23, 31,
96 38, 46, 54, 62, 39, 47, 55, 63,
99 /* Input permutation for the simple_idct_mmx */
100 static const uint8_t simple_mmx_permutation[64]={
101 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
102 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
103 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
104 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
105 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
106 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
107 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
108 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Row permutation applied for the SSE2 IDCT input layout. */
111 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/**
 * Initialize a ScanTable: keep a pointer to the raw scan order and build
 * the CPU/IDCT-specific permuted variant of it.
 * NOTE(review): several lines of this function (loop headers, the
 * raster_end accumulation) are missing from this excerpt; comments below
 * describe only what is visible.
 */
113 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
117 st->scantable= src_scantable;
/* permuted scan: map each scan position through the IDCT permutation */
121 j = src_scantable[i];
122 st->permutated[i] = permutation[j];
/* raster_end: presumably the highest raster index reached within each scan
 * prefix — the computation of 'end' is not visible here; confirm upstream */
131 j = st->permutated[i];
133 st->raster_end[i]= end;
/* Sum of all pixel values in a 16x16 block. The inner accumulation lines
 * are not visible in this excerpt; the visible loop structure walks 16 rows
 * in two 8-pixel halves, stepping to the next row via line_size - 16. */
137 static int pix_sum_c(uint8_t * pix, int line_size)
142 for (i = 0; i < 16; i++) {
143 for (j = 0; j < 16; j += 8) {
154 pix += line_size - 16;
/* Sum of squared pixel values over a 16x16 block, using the shared squares
 * table (sq points at the center of ff_squareTbl so negative differences
 * elsewhere also work; here the indices are plain byte values). */
159 static int pix_norm1_c(uint8_t * pix, int line_size)
162 uint32_t *sq = ff_squareTbl + 256;
165 for (i = 0; i < 16; i++) {
166 for (j = 0; j < 16; j += 8) {
/* When long is wider than 32 bits, read 8 pixels with a single 64-bit
 * load; otherwise do two 32-bit loads. (The s += sq[x&0xff] line for the
 * lowest byte is not visible in this excerpt.) */
177 #if LONG_MAX > 2147483647
178 register uint64_t x=*(uint64_t*)pix;
180 s += sq[(x>>8)&0xff];
181 s += sq[(x>>16)&0xff];
182 s += sq[(x>>24)&0xff];
183 s += sq[(x>>32)&0xff];
184 s += sq[(x>>40)&0xff];
185 s += sq[(x>>48)&0xff];
186 s += sq[(x>>56)&0xff];
188 register uint32_t x=*(uint32_t*)pix;
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 x=*(uint32_t*)(pix+4);
195 s += sq[(x>>8)&0xff];
196 s += sq[(x>>16)&0xff];
197 s += sq[(x>>24)&0xff];
202 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst (may be the same buffer).
 * Main loop is unrolled by 8; the final line is the body of the remainder
 * loop for the trailing (w % 8) words — its loop header is not visible in
 * this excerpt. */
207 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
210 for(i=0; i+8<=w; i+=8){
211 dst[i+0]= av_bswap32(src[i+0]);
212 dst[i+1]= av_bswap32(src[i+1]);
213 dst[i+2]= av_bswap32(src[i+2]);
214 dst[i+3]= av_bswap32(src[i+3]);
215 dst[i+4]= av_bswap32(src[i+4]);
216 dst[i+5]= av_bswap32(src[i+5]);
217 dst[i+6]= av_bswap32(src[i+6]);
218 dst[i+7]= av_bswap32(src[i+7]);
221 dst[i+0]= av_bswap32(src[i+0]);
/* Sum of squared errors between two 4-pixel-wide blocks over h rows.
 * sq points at the center of ff_squareTbl so that negative differences
 * index correctly. Row-advance lines are not visible in this excerpt. */
225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
228 uint32_t *sq = ff_squareTbl + 256;
231 for (i = 0; i < h; i++) {
232 s += sq[pix1[0] - pix2[0]];
233 s += sq[pix1[1] - pix2[1]];
234 s += sq[pix1[2] - pix2[2]];
235 s += sq[pix1[3] - pix2[3]];
/* Same as sse4_c but for 8-pixel-wide blocks. */
242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
245 uint32_t *sq = ff_squareTbl + 256;
248 for (i = 0; i < h; i++) {
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
253 s += sq[pix1[4] - pix2[4]];
254 s += sq[pix1[5] - pix2[5]];
255 s += sq[pix1[6] - pix2[6]];
256 s += sq[pix1[7] - pix2[7]];
/* Same as sse4_c but for 16-pixel-wide blocks. */
263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
266 uint32_t *sq = ff_squareTbl + 256;
269 for (i = 0; i < h; i++) {
270 s += sq[pix1[ 0] - pix2[ 0]];
271 s += sq[pix1[ 1] - pix2[ 1]];
272 s += sq[pix1[ 2] - pix2[ 2]];
273 s += sq[pix1[ 3] - pix2[ 3]];
274 s += sq[pix1[ 4] - pix2[ 4]];
275 s += sq[pix1[ 5] - pix2[ 5]];
276 s += sq[pix1[ 6] - pix2[ 6]];
277 s += sq[pix1[ 7] - pix2[ 7]];
278 s += sq[pix1[ 8] - pix2[ 8]];
279 s += sq[pix1[ 9] - pix2[ 9]];
280 s += sq[pix1[10] - pix2[10]];
281 s += sq[pix1[11] - pix2[11]];
282 s += sq[pix1[12] - pix2[12]];
283 s += sq[pix1[13] - pix2[13]];
284 s += sq[pix1[14] - pix2[14]];
285 s += sq[pix1[15] - pix2[15]];
293 /* draw the edges of width 'w' of an image of size width, height */
294 //FIXME check that this is ok for mpeg4 interlaced
295 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
297 uint8_t *ptr, *last_line;
300 last_line = buf + (height - 1) * wrap;
/* replicate top and bottom rows into the vertical border */
303 memcpy(buf - (i + 1) * wrap, buf, width);
304 memcpy(last_line + (i + 1) * wrap, last_line, width);
/* replicate leftmost/rightmost pixel of each row into the side borders */
308 for(i=0;i<height;i++) {
309 memset(ptr - w, ptr[0], w);
310 memset(ptr + width, ptr[width-1], w);
/* fill the four corner regions */
315 memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
316 memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
317 memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
318 memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
323 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
324 * @param buf destination buffer
325 * @param src source buffer
326 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
327 * @param block_w width of block
328 * @param block_h height of block
329 * @param src_x x coordinate of the top left sample of the block in the source buffer
330 * @param src_y y coordinate of the top left sample of the block in the source buffer
331 * @param w width of the source buffer
332 * @param h height of the source buffer
334 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
335 int src_x, int src_y, int w, int h){
337 int start_y, start_x, end_y, end_x;
/* If the block lies entirely outside the picture, shift src so that at
 * least one sample row/column of the picture overlaps the block; the
 * replication below then fills the rest. */
340 src+= (h-1-src_y)*linesize;
342 }else if(src_y<=-block_h){
343 src+= (1-block_h-src_y)*linesize;
349 }else if(src_x<=-block_w){
350 src+= (1-block_w-src_x);
/* clip the region of the block that actually overlaps the source */
354 start_y= FFMAX(0, -src_y);
355 start_x= FFMAX(0, -src_x);
356 end_y= FFMIN(block_h, h-src_y);
357 end_x= FFMIN(block_w, w-src_x);
359 // copy existing part
360 for(y=start_y; y<end_y; y++){
361 for(x=start_x; x<end_x; x++){
362 buf[x + y*linesize]= src[x + y*linesize];
/* replicate downward from the first valid row into the top border */
367 for(y=0; y<start_y; y++){
368 for(x=start_x; x<end_x; x++){
369 buf[x + y*linesize]= buf[x + start_y*linesize];
/* replicate upward from the last valid row into the bottom border */
374 for(y=end_y; y<block_h; y++){
375 for(x=start_x; x<end_x; x++){
376 buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
/* replicate sideways into the left and right borders */
380 for(y=0; y<block_h; y++){
382 for(x=0; x<start_x; x++){
383 buf[x + y*linesize]= buf[start_x + y*linesize];
387 for(x=end_x; x<block_w; x++){
388 buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
/* Copy an 8x8 block of pixels into a DCTELEM block (widening to the DCT
 * element type). Row-advance lines are not visible in this excerpt. */
393 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
397 /* read the pixels */
399 block[0] = pixels[0];
400 block[1] = pixels[1];
401 block[2] = pixels[2];
402 block[3] = pixels[3];
403 block[4] = pixels[4];
404 block[5] = pixels[5];
405 block[6] = pixels[6];
406 block[7] = pixels[7];
/* Store the per-pixel difference s1 - s2 of two 8-wide rows into a
 * DCTELEM block (used to build residual blocks for encoding). */
412 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
413 const uint8_t *s2, int stride){
416 /* read the pixels */
418 block[0] = s1[0] - s2[0];
419 block[1] = s1[1] - s2[1];
420 block[2] = s1[2] - s2[2];
421 block[3] = s1[3] - s2[3];
422 block[4] = s1[4] - s2[4];
423 block[5] = s1[5] - s2[5];
424 block[6] = s1[6] - s2[6];
425 block[7] = s1[7] - s2[7];
/* Write an 8-wide block of DCTELEMs to pixels, clamping each value to
 * 0..255 via the crop table (cm is offset so negative indices clip to 0). */
433 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
437 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
439 /* read the pixels */
441 pixels[0] = cm[block[0]];
442 pixels[1] = cm[block[1]];
443 pixels[2] = cm[block[2]];
444 pixels[3] = cm[block[3]];
445 pixels[4] = cm[block[4]];
446 pixels[5] = cm[block[5]];
447 pixels[6] = cm[block[6]];
448 pixels[7] = cm[block[7]];
/* 4-wide variant of put_pixels_clamped_c. */
455 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
459 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
461 /* read the pixels */
463 pixels[0] = cm[block[0]];
464 pixels[1] = cm[block[1]];
465 pixels[2] = cm[block[2]];
466 pixels[3] = cm[block[3]];
/* 2-wide variant of put_pixels_clamped_c. */
473 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
477 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
479 /* read the pixels */
481 pixels[0] = cm[block[0]];
482 pixels[1] = cm[block[1]];
/* Write signed DCTELEMs re-centered by +128 and clamped to 0..255
 * (i.e. -128..127 maps to 0..255). */
489 static void put_signed_pixels_clamped_c(const DCTELEM *block,
490 uint8_t *restrict pixels,
495 for (i = 0; i < 8; i++) {
496 for (j = 0; j < 8; j++) {
499 else if (*block > 127)
502 *pixels = (uint8_t)(*block + 128);
506 pixels += (line_size - 8);
/* Write an 8-wide block of DCTELEMs to pixels with NO clamping — caller
 * must guarantee values already fit in a byte. */
510 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
515 /* read the pixels */
517 pixels[0] = block[0];
518 pixels[1] = block[1];
519 pixels[2] = block[2];
520 pixels[3] = block[3];
521 pixels[4] = block[4];
522 pixels[5] = block[5];
523 pixels[6] = block[6];
524 pixels[7] = block[7];
/* Add an 8-wide block of DCTELEM residuals onto existing pixels, clamping
 * the result to 0..255 via the crop table (used after IDCT in inter
 * reconstruction). */
531 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
535 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
537 /* read the pixels */
539 pixels[0] = cm[pixels[0] + block[0]];
540 pixels[1] = cm[pixels[1] + block[1]];
541 pixels[2] = cm[pixels[2] + block[2]];
542 pixels[3] = cm[pixels[3] + block[3]];
543 pixels[4] = cm[pixels[4] + block[4]];
544 pixels[5] = cm[pixels[5] + block[5]];
545 pixels[6] = cm[pixels[6] + block[6]];
546 pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of add_pixels_clamped_c. */
552 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
556 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
558 /* read the pixels */
560 pixels[0] = cm[pixels[0] + block[0]];
561 pixels[1] = cm[pixels[1] + block[1]];
562 pixels[2] = cm[pixels[2] + block[2]];
563 pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of add_pixels_clamped_c. */
569 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
573 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
575 /* read the pixels */
577 pixels[0] = cm[pixels[0] + block[0]];
578 pixels[1] = cm[pixels[1] + block[1]];
/* Add an 8-wide block of DCTELEMs onto pixels WITHOUT clamping — caller
 * must guarantee the sums fit in a byte. */
584 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
588 pixels[0] += block[0];
589 pixels[1] += block[1];
590 pixels[2] += block[2];
591 pixels[3] += block[3];
592 pixels[4] += block[4];
593 pixels[5] += block[5];
594 pixels[6] += block[6];
595 pixels[7] += block[7];
/* 4-wide variant of add_pixels8_c. */
601 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
605 pixels[0] += block[0];
606 pixels[1] += block[1];
607 pixels[2] += block[2];
608 pixels[3] += block[3];
/* Sum of absolute values of the DCT coefficients (the loop header is not
 * visible in this excerpt). */
614 static int sum_abs_dctelem_c(DCTELEM *block)
618 sum+= FFABS(block[i]);
/* Fill an h-row, 16-pixel-wide block with a constant byte value. */
622 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
626 for (i = 0; i < h; i++) {
627 memset(block, value, 16);
/* 8-pixel-wide variant of fill_block16_c. */
632 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
636 for (i = 0; i < h; i++) {
637 memset(block, value, 8);
/* Upscale an 8x8 block 2x in both directions: each source pixel is
 * duplicated horizontally (via * 0x0101 into a uint16_t) and vertically
 * (written to two adjacent destination rows). */
642 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
645 uint16_t *dst1 = (uint16_t *) dst;
646 uint16_t *dst2 = (uint16_t *)(dst + linesize);
648 for (j = 0; j < 8; j++) {
649 for (i = 0; i < 8; i++) {
650 dst1[i] = dst2[i] = src[i] * 0x0101;
/* 64-bit-register variant of the half-pel pixel-op generator. PIXOP2
 * expands to put/avg copies, x2/y2 (horizontal/vertical half-pel averages)
 * and xy2 (diagonal quarter-weight averages), in rounding and no-rounding
 * flavours, operating on 8 pixels per 64-bit load. The bit tricks:
 * (a|b) - (((a^b)&0xFE..)>>1) is a per-byte rounded average and
 * (a&b) + (((a^b)&0xFE..)>>1) the truncated one; xy2 splits each byte into
 * low 2 bits (l0/l1) and high 6 bits (h0/h1) so four samples can be
 * averaged per byte without overflow. */
660 #define PIXOP2(OPNAME, OP) \
661 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
665 OP(*((uint64_t*)block), AV_RN64(pixels));\
671 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
675 const uint64_t a= AV_RN64(pixels );\
676 const uint64_t b= AV_RN64(pixels+1);\
677 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
683 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
687 const uint64_t a= AV_RN64(pixels );\
688 const uint64_t b= AV_RN64(pixels+1);\
689 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
695 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
699 const uint64_t a= AV_RN64(pixels );\
700 const uint64_t b= AV_RN64(pixels+line_size);\
701 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
707 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
711 const uint64_t a= AV_RN64(pixels );\
712 const uint64_t b= AV_RN64(pixels+line_size);\
713 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
719 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
722 const uint64_t a= AV_RN64(pixels );\
723 const uint64_t b= AV_RN64(pixels+1);\
724 uint64_t l0= (a&0x0303030303030303ULL)\
725 + (b&0x0303030303030303ULL)\
726 + 0x0202020202020202ULL;\
727 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
728 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
732 for(i=0; i<h; i+=2){\
733 uint64_t a= AV_RN64(pixels );\
734 uint64_t b= AV_RN64(pixels+1);\
735 l1= (a&0x0303030303030303ULL)\
736 + (b&0x0303030303030303ULL);\
737 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
738 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
739 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
742 a= AV_RN64(pixels );\
743 b= AV_RN64(pixels+1);\
744 l0= (a&0x0303030303030303ULL)\
745 + (b&0x0303030303030303ULL)\
746 + 0x0202020202020202ULL;\
747 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
748 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
749 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
755 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
758 const uint64_t a= AV_RN64(pixels );\
759 const uint64_t b= AV_RN64(pixels+1);\
760 uint64_t l0= (a&0x0303030303030303ULL)\
761 + (b&0x0303030303030303ULL)\
762 + 0x0101010101010101ULL;\
763 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
764 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
768 for(i=0; i<h; i+=2){\
769 uint64_t a= AV_RN64(pixels );\
770 uint64_t b= AV_RN64(pixels+1);\
771 l1= (a&0x0303030303030303ULL)\
772 + (b&0x0303030303030303ULL);\
773 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
774 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
775 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
778 a= AV_RN64(pixels );\
779 b= AV_RN64(pixels+1);\
780 l0= (a&0x0303030303030303ULL)\
781 + (b&0x0303030303030303ULL)\
782 + 0x0101010101010101ULL;\
783 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
784 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
785 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
791 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
792 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
793 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
794 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
795 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
796 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
797 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* 64-bit per-byte rounded average used by the avg ops above. */
799 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
800 #else // 64 bit variant
/* 32-bit variant of the half-pel pixel-op generator: same operations as
 * the 64-bit branch above but built from 32-bit loads (two per 8-pixel
 * row), plus _l2 (average of two sources) and _l4 (average of four
 * sources) helpers used by the half-pel x2/y2/xy2 wrappers. The xy2
 * functions use the same low-2-bit / high-6-bit split trick, with a +0x02
 * per byte for rounding and +0x01 for the no_rnd flavour. */
802 #define PIXOP2(OPNAME, OP) \
803 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
806 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
811 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
814 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
819 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
822 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
823 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
828 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
829 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
832 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
833 int src_stride1, int src_stride2, int h){\
837 a= AV_RN32(&src1[i*src_stride1 ]);\
838 b= AV_RN32(&src2[i*src_stride2 ]);\
839 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
840 a= AV_RN32(&src1[i*src_stride1+4]);\
841 b= AV_RN32(&src2[i*src_stride2+4]);\
842 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
846 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
847 int src_stride1, int src_stride2, int h){\
851 a= AV_RN32(&src1[i*src_stride1 ]);\
852 b= AV_RN32(&src2[i*src_stride2 ]);\
853 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
854 a= AV_RN32(&src1[i*src_stride1+4]);\
855 b= AV_RN32(&src2[i*src_stride2+4]);\
856 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
860 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
861 int src_stride1, int src_stride2, int h){\
865 a= AV_RN32(&src1[i*src_stride1 ]);\
866 b= AV_RN32(&src2[i*src_stride2 ]);\
867 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
871 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
872 int src_stride1, int src_stride2, int h){\
876 a= AV_RN16(&src1[i*src_stride1 ]);\
877 b= AV_RN16(&src2[i*src_stride2 ]);\
878 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
882 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
883 int src_stride1, int src_stride2, int h){\
884 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
885 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
888 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
889 int src_stride1, int src_stride2, int h){\
890 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
891 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
894 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
895 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
898 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
899 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
902 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
903 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
906 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
907 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
910 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
911 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
914 uint32_t a, b, c, d, l0, l1, h0, h1;\
915 a= AV_RN32(&src1[i*src_stride1]);\
916 b= AV_RN32(&src2[i*src_stride2]);\
917 c= AV_RN32(&src3[i*src_stride3]);\
918 d= AV_RN32(&src4[i*src_stride4]);\
919 l0= (a&0x03030303UL)\
922 h0= ((a&0xFCFCFCFCUL)>>2)\
923 + ((b&0xFCFCFCFCUL)>>2);\
924 l1= (c&0x03030303UL)\
926 h1= ((c&0xFCFCFCFCUL)>>2)\
927 + ((d&0xFCFCFCFCUL)>>2);\
928 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
929 a= AV_RN32(&src1[i*src_stride1+4]);\
930 b= AV_RN32(&src2[i*src_stride2+4]);\
931 c= AV_RN32(&src3[i*src_stride3+4]);\
932 d= AV_RN32(&src4[i*src_stride4+4]);\
933 l0= (a&0x03030303UL)\
936 h0= ((a&0xFCFCFCFCUL)>>2)\
937 + ((b&0xFCFCFCFCUL)>>2);\
938 l1= (c&0x03030303UL)\
940 h1= ((c&0xFCFCFCFCUL)>>2)\
941 + ((d&0xFCFCFCFCUL)>>2);\
942 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
946 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
947 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
950 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
951 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
954 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
955 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
958 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
959 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
962 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
963 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
966 uint32_t a, b, c, d, l0, l1, h0, h1;\
967 a= AV_RN32(&src1[i*src_stride1]);\
968 b= AV_RN32(&src2[i*src_stride2]);\
969 c= AV_RN32(&src3[i*src_stride3]);\
970 d= AV_RN32(&src4[i*src_stride4]);\
971 l0= (a&0x03030303UL)\
974 h0= ((a&0xFCFCFCFCUL)>>2)\
975 + ((b&0xFCFCFCFCUL)>>2);\
976 l1= (c&0x03030303UL)\
978 h1= ((c&0xFCFCFCFCUL)>>2)\
979 + ((d&0xFCFCFCFCUL)>>2);\
980 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
981 a= AV_RN32(&src1[i*src_stride1+4]);\
982 b= AV_RN32(&src2[i*src_stride2+4]);\
983 c= AV_RN32(&src3[i*src_stride3+4]);\
984 d= AV_RN32(&src4[i*src_stride4+4]);\
985 l0= (a&0x03030303UL)\
988 h0= ((a&0xFCFCFCFCUL)>>2)\
989 + ((b&0xFCFCFCFCUL)>>2);\
990 l1= (c&0x03030303UL)\
992 h1= ((c&0xFCFCFCFCUL)>>2)\
993 + ((d&0xFCFCFCFCUL)>>2);\
994 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
997 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
998 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
999 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1000 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1002 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1003 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1004 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1005 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1008 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1010 int i, a0, b0, a1, b1;\
1017 for(i=0; i<h; i+=2){\
1023 block[0]= (a1+a0)>>2; /* FIXME non put */\
1024 block[1]= (b1+b0)>>2;\
1034 block[0]= (a1+a0)>>2;\
1035 block[1]= (b1+b0)>>2;\
1041 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1044 const uint32_t a= AV_RN32(pixels );\
1045 const uint32_t b= AV_RN32(pixels+1);\
1046 uint32_t l0= (a&0x03030303UL)\
1049 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1050 + ((b&0xFCFCFCFCUL)>>2);\
1054 for(i=0; i<h; i+=2){\
1055 uint32_t a= AV_RN32(pixels );\
1056 uint32_t b= AV_RN32(pixels+1);\
1057 l1= (a&0x03030303UL)\
1058 + (b&0x03030303UL);\
1059 h1= ((a&0xFCFCFCFCUL)>>2)\
1060 + ((b&0xFCFCFCFCUL)>>2);\
1061 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1064 a= AV_RN32(pixels );\
1065 b= AV_RN32(pixels+1);\
1066 l0= (a&0x03030303UL)\
1069 h0= ((a&0xFCFCFCFCUL)>>2)\
1070 + ((b&0xFCFCFCFCUL)>>2);\
1071 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1077 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1080 for(j=0; j<2; j++){\
1082 const uint32_t a= AV_RN32(pixels );\
1083 const uint32_t b= AV_RN32(pixels+1);\
1084 uint32_t l0= (a&0x03030303UL)\
1087 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1088 + ((b&0xFCFCFCFCUL)>>2);\
1092 for(i=0; i<h; i+=2){\
1093 uint32_t a= AV_RN32(pixels );\
1094 uint32_t b= AV_RN32(pixels+1);\
1095 l1= (a&0x03030303UL)\
1096 + (b&0x03030303UL);\
1097 h1= ((a&0xFCFCFCFCUL)>>2)\
1098 + ((b&0xFCFCFCFCUL)>>2);\
1099 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1102 a= AV_RN32(pixels );\
1103 b= AV_RN32(pixels+1);\
1104 l0= (a&0x03030303UL)\
1107 h0= ((a&0xFCFCFCFCUL)>>2)\
1108 + ((b&0xFCFCFCFCUL)>>2);\
1109 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1113 pixels+=4-line_size*(h+1);\
1114 block +=4-line_size*h;\
1118 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1121 for(j=0; j<2; j++){\
1123 const uint32_t a= AV_RN32(pixels );\
1124 const uint32_t b= AV_RN32(pixels+1);\
1125 uint32_t l0= (a&0x03030303UL)\
1128 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1129 + ((b&0xFCFCFCFCUL)>>2);\
1133 for(i=0; i<h; i+=2){\
1134 uint32_t a= AV_RN32(pixels );\
1135 uint32_t b= AV_RN32(pixels+1);\
1136 l1= (a&0x03030303UL)\
1137 + (b&0x03030303UL);\
1138 h1= ((a&0xFCFCFCFCUL)>>2)\
1139 + ((b&0xFCFCFCFCUL)>>2);\
1140 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1143 a= AV_RN32(pixels );\
1144 b= AV_RN32(pixels+1);\
1145 l0= (a&0x03030303UL)\
1148 h0= ((a&0xFCFCFCFCUL)>>2)\
1149 + ((b&0xFCFCFCFCUL)>>2);\
1150 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1154 pixels+=4-line_size*(h+1);\
1155 block +=4-line_size*h;\
1159 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1160 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1161 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1162 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1163 av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1164 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1165 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1166 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* 32-bit OP plug-ins for PIXOP2: op_avg averages the new value into the
 * destination (rounded, per byte), op_put overwrites it. With rounding,
 * put and no_rnd put are identical, hence the aliases below. */
1168 #define op_avg(a, b) a = rnd_avg32(a, b)
1170 #define op_put(a, b) a = b
1177 #define put_no_rnd_pixels8_c put_pixels8_c
1178 #define put_no_rnd_pixels16_c put_pixels16_c
/* scalar rounded averages of 2 and 4 samples */
1180 #define avg2(a,b) ((a+b+1)>>1)
1181 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Thin wrappers exposing the _l2 averaging helpers with a single stride. */
1183 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1184 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1187 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1188 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/* One-warp-point GMC: bilinear interpolation at 1/16-pel precision over an
 * 8-wide block. The four weights A..D sum to 256, hence the >>8 after
 * adding the caller-supplied rounder. Row-advance lines are not visible in
 * this excerpt. */
1191 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1193 const int A=(16-x16)*(16-y16);
1194 const int B=( x16)*(16-y16);
1195 const int C=(16-x16)*( y16);
1196 const int D=( x16)*( y16);
1201 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1202 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1203 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1204 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1205 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1206 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1207 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1208 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General global motion compensation: for each destination pixel compute
 * an affine-transformed source position (dxx/dxy/dyx/dyy at 1/(1<<shift)
 * precision from origin ox/oy) and bilinearly interpolate. The four
 * branches handle positions fully inside the picture, clipped in x only,
 * clipped in y only, and fully outside (nearest edge pixel). */
1214 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1215 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1218 const int s= 1<<shift;
1228 for(x=0; x<8; x++){ //XXX FIXME optimize
1229 int src_x, src_y, frac_x, frac_y, index;
/* split the fixed-point position into integer and fractional parts */
1233 frac_x= src_x&(s-1);
1234 frac_y= src_y&(s-1);
1238 if((unsigned)src_x < width){
1239 if((unsigned)src_y < height){
1240 index= src_x + src_y*stride;
1241 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1242 + src[index +1]* frac_x )*(s-frac_y)
1243 + ( src[index+stride ]*(s-frac_x)
1244 + src[index+stride+1]* frac_x )* frac_y
1247 index= src_x + av_clip(src_y, 0, height)*stride;
1248 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1249 + src[index +1]* frac_x )*s
1253 if((unsigned)src_y < height){
1254 index= av_clip(src_x, 0, width) + src_y*stride;
1255 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1256 + src[index+stride ]* frac_y )*s
1259 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1260 dst[y*stride + x]= src[index ];
1272 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1274 case 2: put_pixels2_c (dst, src, stride, height); break;
1275 case 4: put_pixels4_c (dst, src, stride, height); break;
1276 case 8: put_pixels8_c (dst, src, stride, height); break;
1277 case 16:put_pixels16_c(dst, src, stride, height); break;
1281 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1283 for (i=0; i < height; i++) {
1284 for (j=0; j < width; j++) {
1285 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1292 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1294 for (i=0; i < height; i++) {
1295 for (j=0; j < width; j++) {
1296 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/**
 * Third-pel MC, vertical phase 1/3:
 * dst[j] = round((2*src[j] + src[j+stride]) / 3), using 683/2048 ~= 1/3.
 * Restored the index declarations, per-row pointer advance and closing
 * braces missing from this copy of the file.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel MC, diagonal phase (1/3, 1/3): bilinear blend with weights
 * 4/3/3/2 over 12, using 2731/32768 ~= 1/12.
 * Restored the index declarations, per-row pointer advance and closing
 * braces missing from this copy of the file.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel MC, phase (1/3, 2/3): bilinear blend with weights 3/2/4/3
 * over 12, using 2731/32768 ~= 1/12.
 * Restored the index declarations, per-row pointer advance and closing
 * braces missing from this copy of the file.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel MC, vertical phase 2/3:
 * dst[j] = round((src[j] + 2*src[j+stride]) / 3), using 683/2048 ~= 1/3.
 * Restored the index declarations, per-row pointer advance and closing
 * braces missing from this copy of the file.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel MC, phase (2/3, 1/3): bilinear blend with weights 3/4/2/3
 * over 12, using 2731/32768 ~= 1/12.
 * Restored the index declarations, per-row pointer advance and closing
 * braces missing from this copy of the file.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel MC, diagonal phase (2/3, 2/3): bilinear blend with weights
 * 2/3/3/4 over 12, using 2731/32768 ~= 1/12.
 * Restored the index declarations, per-row pointer advance and closing
 * braces missing from this copy of the file.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel MC, full-pel position (0,0), averaging variant: averages the
 * copied block with the existing dst contents, dispatched on width to the
 * fixed-size averaging helpers defined elsewhere in this file.
 * Restored the switch scaffolding and closing braces missing from this
 * copy of the file.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/**
 * Third-pel MC, horizontal phase 1/3, averaging variant: the interpolated
 * value (same 683/2048 ~= 1/3 kernel as the put version) is averaged with
 * the existing dst pixel, rounding up.
 * Restored the index declarations, per-row pointer advance and closing
 * braces missing from this copy of the file.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
1389 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1391 for (i=0; i < height; i++) {
1392 for (j=0; j < width; j++) {
1393 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1400 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1402 for (i=0; i < height; i++) {
1403 for (j=0; j < width; j++) {
1404 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/**
 * Third-pel MC, diagonal phase (1/3, 1/3), averaging variant: the 4/3/3/2
 * weighted bilinear value is averaged with the existing dst pixel.
 * Restored the index declarations, per-row pointer advance and closing
 * braces missing from this copy of the file.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel MC, phase (1/3, 2/3), averaging variant: the 3/2/4/3 weighted
 * bilinear value is averaged with the existing dst pixel.
 * Restored the index declarations, per-row pointer advance and closing
 * braces missing from this copy of the file.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel MC, vertical phase 2/3, averaging variant: interpolated value
 * averaged with the existing dst pixel, rounding up.
 * Restored the index declarations, per-row pointer advance and closing
 * braces missing from this copy of the file.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel MC, phase (2/3, 1/3), averaging variant: the 3/4/2/3 weighted
 * bilinear value is averaged with the existing dst pixel.
 * Restored the index declarations, per-row pointer advance and closing
 * braces missing from this copy of the file.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel MC, diagonal phase (2/3, 2/3), averaging variant: the 2/3/3/4
 * weighted bilinear value is averaged with the existing dst pixel.
 * Restored the index declarations, per-row pointer advance and closing
 * braces missing from this copy of the file.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Generator for fixed-width third-pel wrappers: TPEL_WIDTH(N) emits nine
 * put_tpel_pixelsN_mcXY_c() functions that forward to the generic
 * put_tpel_pixels_mcXY_c() with the width baked in.
 *
 * Fix: removed the stray 'void' keyword in front of every forwarded call.
 * With the 'void', each body parsed as a useless K&R block-scope function
 * declaration instead of a call, so the wrappers would have done nothing
 * (the bug was latent only because this macro is never instantiated). */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H.264-style 1/8-pel bilinear chroma MC generator for 2/4/8-wide blocks.
 * A/B/C/D are the four bilinear corner weights (they sum to 64); when one
 * of the fractional offsets is zero the code falls back to a cheaper
 * two-tap blend along a single axis ('step' selects vertical vs
 * horizontal). OP is the store macro (put or avg with rounding).
 *
 * NOTE(review): this copy of the file is missing several continuation
 * lines of this macro (the 'int i;' declarations, the if(D){ ... }else{
 * branch structure, 'const int E= B+C;', the per-row pointer advances and
 * the closing braces). Restore them from upstream before compiling. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    assert(x<8 && y<8 && x>=0 && y>=0);\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    assert(x<8 && y<8 && x>=0 && y>=0);\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    assert(x<8 && y<8 && x>=0 && y>=0);\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
/* Store macros for the chroma MC generator above: the weighted sum b is in
 * 6-bit fixed point, so put rounds with +32 and shifts down by 6; avg
 * additionally averages with the existing dst value, rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)
/* Instantiate the put_ and avg_ families of h264 chroma MC functions. */
H264_CHROMA_MC(put_ , op_put)
H264_CHROMA_MC(avg_ , op_avg)
/**
 * VC-1 "no rounding" 1/8-pel bilinear chroma MC, 8 pixels wide.
 * A/B/C/D are the bilinear corner weights (sum 64); the bias is 32-4=28,
 * i.e. the normal rounding bias reduced by 4 for the no-rnd variant.
 *
 * @param dst    destination (align 8), h rows of 8 pixels, written
 * @param src    source (align 1), needs one extra row and column
 * @param stride line size of both dst and src
 * @param h      number of rows
 * @param x,y    1/8-pel fractional offsets, each in [0,8)
 *
 * Restored the 'int i;' declaration, the row loop, per-row pointer
 * advances and closing braces missing from this copy of the file.
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=( x)*(8-y);
    const int C=(8-x)*( y);
    const int D=( x)*( y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst += stride;
        src += stride;
    }
}
/**
 * VC-1 "no rounding" 1/8-pel bilinear chroma MC, 8 pixels wide, averaging
 * variant: each interpolated value (same 28-bias kernel as the put
 * version) is averaged with the existing dst pixel via the file's avg2()
 * helper.
 *
 * Restored the 'int i;' declaration, the row loop, per-row pointer
 * advances and closing braces missing from this copy of the file.
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=( x)*(8-y);
    const int C=(8-x)*( y);
    const int D=( x)*( y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst += stride;
        src += stride;
    }
}
/* MPEG-4 quarter-pel MC generator. One instantiation emits, for a given
 * store macro OP and name prefix OPNAME:
 *  - 8- and 16-tap-wide horizontal/vertical lowpass filters using the
 *    MPEG-4 8-tap kernel (20, -6, 3, -1)/32, with the taps mirrored at the
 *    block edges instead of reading outside the copied block;
 *  - the full set of qpelN_mcXY_c() quarter-pel position functions, built
 *    from the lowpass filters plus the pixelsN_l2/l4 averaging helpers
 *    defined elsewhere in this file. The _old_c variants keep the previous
 *    (4-point average) interpolation for compatibility.
 * 'cm' is the clip table (ff_cropTbl offset by MAX_NEG_CROP) used by the
 * OP store macros to saturate to 0..255.
 *
 * NOTE(review): this copy of the file is missing many continuation lines
 * of this macro (loop headers, 'int i;' declarations, several temporary
 * buffer declarations such as 'uint8_t half[64];', pointer advances and
 * closing braces). The backslash chain of the visible lines is intact, so
 * the macro still parses as one definition, but it must be restored from
 * upstream before it can be instantiated. */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store/average primitives plugged into the QPEL_MC() expansions above.
 * 'b' is a 6-tap filter sum scaled by 32; cm[] is the clipping table, so
 * ((b)+16)>>5 rounds to nearest while the _no_rnd_ variants ((b)+15)>>5
 * round down, as required by the MPEG-4 "no rounding" mode. */
2120 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2121 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2122 #define op_put(a, b) a = cm[((b) + 16)>>5]
2123 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the MPEG-4 quarter-pel motion-compensation families. */
2125 QPEL_MC(0, put_ , _ , op_put)
2126 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2127 QPEL_MC(0, avg_ , _ , op_avg)
2128 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2130 #undef op_avg_no_rnd
2132 #undef op_put_no_rnd
/* The (0,0) quarter-pel position needs no filtering: alias it to the
 * plain 8x8/16x16 copy and average helpers. */
2134 #define put_qpel8_mc00_c ff_put_pixels8x8_c
2135 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
2136 #define put_qpel16_mc00_c ff_put_pixels16x16_c
2137 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2138 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
2139 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
/* H264_LOWPASS() generates the H.264 six-tap (1,-5,20,20,-5,1) half-sample
 * interpolation filters for block widths 2, 4, 8 and 16, in horizontal (_h_),
 * vertical (_v_) and combined (_hv_) forms.  OP writes/averages an 8-bit
 * result (sum scaled by 32), OP2 does the same for the two-dimensional case
 * (sum scaled by 1024).  The _hv_ variants filter horizontally into the
 * 16-bit tmp[] buffer first, then filter tmp[] vertically. */
2142 #define H264_LOWPASS(OPNAME, OP, OP2) \
2143 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2145 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2149 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2150 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2156 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2158 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2162 const int srcB= src[-2*srcStride];\
2163 const int srcA= src[-1*srcStride];\
2164 const int src0= src[0 *srcStride];\
2165 const int src1= src[1 *srcStride];\
2166 const int src2= src[2 *srcStride];\
2167 const int src3= src[3 *srcStride];\
2168 const int src4= src[4 *srcStride];\
2169 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2170 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2176 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2179 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2181 src -= 2*srcStride;\
2182 for(i=0; i<h+5; i++)\
2184 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2185 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2189 tmp -= tmpStride*(h+5-2);\
2192 const int tmpB= tmp[-2*tmpStride];\
2193 const int tmpA= tmp[-1*tmpStride];\
2194 const int tmp0= tmp[0 *tmpStride];\
2195 const int tmp1= tmp[1 *tmpStride];\
2196 const int tmp2= tmp[2 *tmpStride];\
2197 const int tmp3= tmp[3 *tmpStride];\
2198 const int tmp4= tmp[4 *tmpStride];\
2199 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2200 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2205 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2207 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2211 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2212 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2213 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2214 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2220 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2222 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2226 const int srcB= src[-2*srcStride];\
2227 const int srcA= src[-1*srcStride];\
2228 const int src0= src[0 *srcStride];\
2229 const int src1= src[1 *srcStride];\
2230 const int src2= src[2 *srcStride];\
2231 const int src3= src[3 *srcStride];\
2232 const int src4= src[4 *srcStride];\
2233 const int src5= src[5 *srcStride];\
2234 const int src6= src[6 *srcStride];\
2235 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2236 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2237 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2238 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2244 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2247 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2249 src -= 2*srcStride;\
2250 for(i=0; i<h+5; i++)\
2252 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2253 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2254 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2255 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2259 tmp -= tmpStride*(h+5-2);\
2262 const int tmpB= tmp[-2*tmpStride];\
2263 const int tmpA= tmp[-1*tmpStride];\
2264 const int tmp0= tmp[0 *tmpStride];\
2265 const int tmp1= tmp[1 *tmpStride];\
2266 const int tmp2= tmp[2 *tmpStride];\
2267 const int tmp3= tmp[3 *tmpStride];\
2268 const int tmp4= tmp[4 *tmpStride];\
2269 const int tmp5= tmp[5 *tmpStride];\
2270 const int tmp6= tmp[6 *tmpStride];\
2271 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2272 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2273 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2274 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2280 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2282 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2286 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2287 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2288 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2289 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2290 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2291 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2292 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2293 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2299 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2301 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2305 const int srcB= src[-2*srcStride];\
2306 const int srcA= src[-1*srcStride];\
2307 const int src0= src[0 *srcStride];\
2308 const int src1= src[1 *srcStride];\
2309 const int src2= src[2 *srcStride];\
2310 const int src3= src[3 *srcStride];\
2311 const int src4= src[4 *srcStride];\
2312 const int src5= src[5 *srcStride];\
2313 const int src6= src[6 *srcStride];\
2314 const int src7= src[7 *srcStride];\
2315 const int src8= src[8 *srcStride];\
2316 const int src9= src[9 *srcStride];\
2317 const int src10=src[10*srcStride];\
2318 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2319 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2320 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2321 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2322 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2323 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2324 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2325 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2331 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2334 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2336 src -= 2*srcStride;\
2337 for(i=0; i<h+5; i++)\
2339 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2340 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2341 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2342 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2343 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2344 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2345 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2346 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2350 tmp -= tmpStride*(h+5-2);\
2353 const int tmpB= tmp[-2*tmpStride];\
2354 const int tmpA= tmp[-1*tmpStride];\
2355 const int tmp0= tmp[0 *tmpStride];\
2356 const int tmp1= tmp[1 *tmpStride];\
2357 const int tmp2= tmp[2 *tmpStride];\
2358 const int tmp3= tmp[3 *tmpStride];\
2359 const int tmp4= tmp[4 *tmpStride];\
2360 const int tmp5= tmp[5 *tmpStride];\
2361 const int tmp6= tmp[6 *tmpStride];\
2362 const int tmp7= tmp[7 *tmpStride];\
2363 const int tmp8= tmp[8 *tmpStride];\
2364 const int tmp9= tmp[9 *tmpStride];\
2365 const int tmp10=tmp[10*tmpStride];\
2366 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2367 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2368 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2369 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2370 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2371 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2372 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2373 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2379 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2380 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2381 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2382 src += 8*srcStride;\
2383 dst += 8*dstStride;\
2384 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2385 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2388 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2389 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2390 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2391 src += 8*srcStride;\
2392 dst += 8*dstStride;\
2393 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2394 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2397 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2398 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2399 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2400 src += 8*srcStride;\
2401 dst += 8*dstStride;\
2402 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2403 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC() generates the 16 quarter-pel motion-compensation entry points
 * (mc00..mc33) for one block size.  mcXY means X quarter-pels horizontally
 * and Y vertically; positions on a half-pel axis call a _lowpass filter
 * directly, while true quarter-pel positions average (_l2) two neighbouring
 * half-pel results per H.264 interpolation rules.  copy_blockN() pads the
 * source into full[] so the 6-tap filter can read 2 rows above and 3 below. */
2406 #define H264_MC(OPNAME, SIZE) \
2407 static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2408 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2411 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2412 uint8_t half[SIZE*SIZE];\
2413 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2414 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2417 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2418 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2421 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2422 uint8_t half[SIZE*SIZE];\
2423 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2424 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2427 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2428 uint8_t full[SIZE*(SIZE+5)];\
2429 uint8_t * const full_mid= full + SIZE*2;\
2430 uint8_t half[SIZE*SIZE];\
2431 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2432 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2433 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2436 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2437 uint8_t full[SIZE*(SIZE+5)];\
2438 uint8_t * const full_mid= full + SIZE*2;\
2439 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2440 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2443 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2444 uint8_t full[SIZE*(SIZE+5)];\
2445 uint8_t * const full_mid= full + SIZE*2;\
2446 uint8_t half[SIZE*SIZE];\
2447 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2448 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2449 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2452 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2453 uint8_t full[SIZE*(SIZE+5)];\
2454 uint8_t * const full_mid= full + SIZE*2;\
2455 uint8_t halfH[SIZE*SIZE];\
2456 uint8_t halfV[SIZE*SIZE];\
2457 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2458 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2459 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2460 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2463 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2464 uint8_t full[SIZE*(SIZE+5)];\
2465 uint8_t * const full_mid= full + SIZE*2;\
2466 uint8_t halfH[SIZE*SIZE];\
2467 uint8_t halfV[SIZE*SIZE];\
2468 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2469 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2470 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2471 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2474 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2475 uint8_t full[SIZE*(SIZE+5)];\
2476 uint8_t * const full_mid= full + SIZE*2;\
2477 uint8_t halfH[SIZE*SIZE];\
2478 uint8_t halfV[SIZE*SIZE];\
2479 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2480 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2481 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2482 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2485 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2486 uint8_t full[SIZE*(SIZE+5)];\
2487 uint8_t * const full_mid= full + SIZE*2;\
2488 uint8_t halfH[SIZE*SIZE];\
2489 uint8_t halfV[SIZE*SIZE];\
2490 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2491 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2492 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2493 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2496 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2497 int16_t tmp[SIZE*(SIZE+5)];\
2498 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2501 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2502 int16_t tmp[SIZE*(SIZE+5)];\
2503 uint8_t halfH[SIZE*SIZE];\
2504 uint8_t halfHV[SIZE*SIZE];\
2505 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2506 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2507 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2510 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2511 int16_t tmp[SIZE*(SIZE+5)];\
2512 uint8_t halfH[SIZE*SIZE];\
2513 uint8_t halfHV[SIZE*SIZE];\
2514 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2515 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2516 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2519 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2520 uint8_t full[SIZE*(SIZE+5)];\
2521 uint8_t * const full_mid= full + SIZE*2;\
2522 int16_t tmp[SIZE*(SIZE+5)];\
2523 uint8_t halfV[SIZE*SIZE];\
2524 uint8_t halfHV[SIZE*SIZE];\
2525 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2526 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2527 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2528 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2531 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2532 uint8_t full[SIZE*(SIZE+5)];\
2533 uint8_t * const full_mid= full + SIZE*2;\
2534 int16_t tmp[SIZE*(SIZE+5)];\
2535 uint8_t halfV[SIZE*SIZE];\
2536 uint8_t halfHV[SIZE*SIZE];\
2537 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2538 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2539 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2540 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* H.264 store/average primitives: op_/op2_ clip a 1-D (scaled by 32) or
 * 2-D (scaled by 1024) six-tap sum via the cm[] table, with round-to-nearest. */
2543 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2544 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2545 #define op_put(a, b) a = cm[((b) + 16)>>5]
2546 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2547 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate put/avg H.264 filter families, then alias the filter-free
 * (0,0) position to the plain copy/average helpers. */
2549 H264_LOWPASS(put_ , op_put, op2_put)
2550 H264_LOWPASS(avg_ , op_avg, op2_avg)
2565 #define put_h264_qpel8_mc00_c ff_put_pixels8x8_c
2566 #define avg_h264_qpel8_mc00_c ff_avg_pixels8x8_c
2567 #define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
2568 #define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
/* WMV2/MSPEL horizontal half-pel filter: per output pixel a 4-tap
 * (-1,9,9,-1)/16 kernel with rounding, clipped through cm[].  Processes
 * one 8-pixel row per iteration for h rows. */
2570 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2571 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2575 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2576 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2577 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2578 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2579 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2580 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2581 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2582 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* Fixed-size wrappers around the generic put/avg pixel copy helpers;
 * exported so other modules (and the mc00 aliases above) can use them. */
2588 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2589 put_pixels8_c(dst, src, stride, 8);
2591 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2592 avg_pixels8_c(dst, src, stride, 8);
2594 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2595 put_pixels16_c(dst, src, stride, 16);
2597 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2598 avg_pixels16_c(dst, src, stride, 16);
2601 #if CONFIG_RV40_DECODER
/* RV40 (3,3) sub-pel position: RV40 defines it as the plain 2x2 average
 * (xy2) rather than a filtered position, so reuse the hpel helpers. */
2602 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2603 put_pixels16_xy2_c(dst, src, stride, 16);
2605 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2606 avg_pixels16_xy2_c(dst, src, stride, 16);
2608 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2609 put_pixels8_xy2_c(dst, src, stride, 8);
2611 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2612 avg_pixels8_xy2_c(dst, src, stride, 8);
2614 #endif /* CONFIG_RV40_DECODER */
/* WMV2/MSPEL vertical half-pel filter: same (-1,9,9,-1)/16 kernel as the
 * horizontal variant, applied down each of w columns for 8 output rows. */
2616 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2617 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2621 const int src_1= src[ -srcStride];
2622 const int src0 = src[0 ];
2623 const int src1 = src[ srcStride];
2624 const int src2 = src[2*srcStride];
2625 const int src3 = src[3*srcStride];
2626 const int src4 = src[4*srcStride];
2627 const int src5 = src[5*srcStride];
2628 const int src6 = src[6*srcStride];
2629 const int src7 = src[7*srcStride];
2630 const int src8 = src[8*srcStride];
2631 const int src9 = src[9*srcStride];
2632 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2633 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2634 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2635 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2636 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2637 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2638 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2639 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 8x8 half-pel MC positions (mcXY = X half-pels right, Y down).
 * Pure-half positions filter directly; mixed positions average (_l2)
 * two intermediate results.  The mcX2 variants filter an 11-row halfH
 * band (one row above, two below) before the vertical pass. */
2645 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2647 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2648 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2651 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2652 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2655 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2657 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2658 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2661 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2662 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2665 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2669 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2670 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2671 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2672 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2674 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2678 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2679 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2680 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2681 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2683 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2685 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2686 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 Annex J deblocking filter across a horizontal block edge
 * (rows p0..p3 straddle the edge).  d is the edge gradient; d1 is the
 * ramped correction derived from the qscale-dependent strength, applied
 * to the two pixels adjacent to the edge, and d2 is a smaller clipped
 * correction for the outer pair. */
2689 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2690 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2692 const int strength= ff_h263_loop_filter_strength[qscale];
2696 int p0= src[x-2*stride];
2697 int p1= src[x-1*stride];
2698 int p2= src[x+0*stride];
2699 int p3= src[x+1*stride];
2700 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2702 if (d<-2*strength) d1= 0;
2703 else if(d<- strength) d1=-2*strength - d;
2704 else if(d< strength) d1= d;
2705 else if(d< 2*strength) d1= 2*strength - d;
/* Branchless clip of p1/p2 back into 0..255 after applying d1. */
2710 if(p1&256) p1= ~(p1>>31);
2711 if(p2&256) p2= ~(p2>>31);
2713 src[x-1*stride] = p1;
2714 src[x+0*stride] = p2;
2718 d2= av_clip((p0-p3)/4, -ad1, ad1);
2720 src[x-2*stride] = p0 - d2;
2721 src[x+ stride] = p3 + d2;
/* H.263 Annex J deblocking filter across a vertical block edge; identical
 * math to h263_v_loop_filter_c() but addressing columns p0..p3 within a row. */
2726 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2727 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2729 const int strength= ff_h263_loop_filter_strength[qscale];
2733 int p0= src[y*stride-2];
2734 int p1= src[y*stride-1];
2735 int p2= src[y*stride+0];
2736 int p3= src[y*stride+1];
2737 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2739 if (d<-2*strength) d1= 0;
2740 else if(d<- strength) d1=-2*strength - d;
2741 else if(d< strength) d1= d;
2742 else if(d< 2*strength) d1= 2*strength - d;
/* Branchless clip of p1/p2 back into 0..255 after applying d1. */
2747 if(p1&256) p1= ~(p1>>31);
2748 if(p2&256) p2= ~(p2>>31);
2750 src[y*stride-1] = p1;
2751 src[y*stride+0] = p2;
2755 d2= av_clip((p0-p3)/4, -ad1, ad1);
2757 src[y*stride-2] = p0 - d2;
2758 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter on an 8x8 block: separable (1,2,1)/4 smoothing,
 * vertical pass into temp[] (border rows passed through scaled by 4),
 * then horizontal pass back into src with combined /16 rounding. */
2763 static void h261_loop_filter_c(uint8_t *src, int stride){
2768 temp[x ] = 4*src[x ];
2769 temp[x + 7*8] = 4*src[x + 7*stride];
2773 xy = y * stride + x;
2775 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2780 src[ y*stride] = (temp[ y*8] + 2)>>2;
2781 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2783 xy = y * stride + x;
2785 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/**
 * Sum of absolute differences (SAD) between two 16-pixel-wide blocks.
 * Defect fixed: the function body was truncated (accumulator init, per-row
 * loop, stride advance and return were missing); restored here.
 *
 * @param v         unused context pointer (me_cmp_func signature)
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte stride of both blocks
 * @param h         number of rows to compare
 * @return sum over h rows of |pix1[i] - pix2[i]| for i in 0..15
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0]  - pix2[0]);
        s += abs(pix1[1]  - pix2[1]);
        s += abs(pix1[2]  - pix2[2]);
        s += abs(pix1[3]  - pix2[3]);
        s += abs(pix1[4]  - pix2[4]);
        s += abs(pix1[5]  - pix2[5]);
        s += abs(pix1[6]  - pix2[6]);
        s += abs(pix1[7]  - pix2[7]);
        s += abs(pix1[8]  - pix2[8]);
        s += abs(pix1[9]  - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* SAD of a 16-wide block against the half-pel-right (horizontal average
 * of neighbouring pixels, via avg2) interpolation of pix2. */
2818 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2824 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2825 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2826 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2827 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2828 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2829 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2830 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2831 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2832 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2833 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2834 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2835 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2836 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2837 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2838 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2839 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of a 16-wide block against the half-pel-down (vertical average of
 * pix2 and the next row pix3) interpolation of pix2. */
2846 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2849 uint8_t *pix3 = pix2 + line_size;
2853 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2854 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2855 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2856 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2857 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2858 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2859 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2860 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2861 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2862 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2863 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2864 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2865 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2866 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2867 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2868 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of a 16-wide block against the 2x2 (half-pel right+down, via avg4)
 * interpolation of pix2. */
2876 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2879 uint8_t *pix3 = pix2 + line_size;
2883 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2884 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2885 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2886 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2887 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2888 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2889 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2890 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2891 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2892 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2893 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2894 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2895 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2896 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2897 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2898 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/**
 * Sum of absolute differences (SAD) between two 8-pixel-wide blocks.
 * Defect fixed: the function body was truncated (accumulator init, per-row
 * loop, stride advance and return were missing); restored here.
 *
 * @param v         unused context pointer (me_cmp_func signature)
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte stride of both blocks
 * @param h         number of rows to compare
 * @return sum over h rows of |pix1[i] - pix2[i]| for i in 0..7
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* 8-wide SAD variants against half-pel interpolations of pix2:
 * _x2 = horizontal average (avg2 of neighbours), _y2 = vertical average
 * (avg2 with next row pix3), _xy2 = 2x2 average (avg4). */
2926 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2932 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2933 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2934 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2935 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2936 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2937 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2938 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2939 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2946 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2949 uint8_t *pix3 = pix2 + line_size;
2953 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2954 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2955 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2956 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2957 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2958 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2959 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2960 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2968 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2971 uint8_t *pix3 = pix2 + line_size;
2975 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2976 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2977 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2978 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2979 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2980 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2981 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2982 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16-wide: score1 is the plain SSE, score2 compares
 * the local 2x2 gradient energy of s1 vs s2 so blocks that lose texture
 * are penalized.  Weighted by avctx->nsse_weight when a context is given
 * (8 otherwise). */
2990 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2991 MpegEncContext *c = v;
2997 for(x=0; x<16; x++){
2998 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3001 for(x=0; x<15; x++){
3002 score2+= FFABS( s1[x ] - s1[x +stride]
3003 - s1[x+1] + s1[x+1+stride])
3004 -FFABS( s2[x ] - s2[x +stride]
3005 - s2[x+1] + s2[x+1+stride]);
3012 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3013 else return score1 + FFABS(score2)*8;
/* 8-wide counterpart of nsse16_c(); same SSE + gradient-energy metric. */
3016 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3017 MpegEncContext *c = v;
3024 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3028 score2+= FFABS( s1[x ] - s1[x +stride]
3029 - s1[x+1] + s1[x+1+stride])
3030 -FFABS( s2[x ] - s2[x +stride]
3031 - s2[x+1] + s2[x+1+stride]);
3038 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3039 else return score1 + FFABS(score2)*8;
/* Estimate the weighted squared error of adding 'basis' scaled by 'scale'
 * to the residual 'rem' (used by the trellis/RD quantizer); b is the
 * candidate per-coefficient residual after rounding from BASIS_SHIFT down
 * to RECON_SHIFT precision. */
3042 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3046 for(i=0; i<8*8; i++){
3047 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3050 assert(-512<b && b<512);
3052 sum += (w*b)*(w*b)>>4;
/* Commit the scaled basis vector into the residual, using the same
 * rounding as try_8x8basis_c() so estimate and update agree. */
3057 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3060 for(i=0; i<8*8; i++){
3061 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3066 * permutes an 8x8 block.
3067 * @param block the block which will be permuted according to the given permutation vector
3068 * @param permutation the permutation vector
3069 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3070 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3071 * (inverse) permutated to scantable order!
/* Fragment of ff_block_permute: first loop stages the nonzero
 * coefficients (in scantable order, up to 'last') into temp[], second
 * loop writes them back through the permutation vector.  The temp[]
 * declaration and the staging assignment are elided in this listing. */
3073 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3079 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3081 for(i=0; i<=last; i++){
3082 const int j= scantable[i];
3087 for(i=0; i<=last; i++){
3088 const int j= scantable[i];
3089 const int perm_j= permutation[j];
3090 block[perm_j]= temp[j];
/* Dummy comparison function that always scores zero (body elided);
 * used where a me_cmp_func slot must be filled but no cost is wanted. */
3094 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fragment of ff_set_cmp: fills cmp[0..5] with the comparison functions
 * matching 'type'.  Most of the selector branches (and the switch/loop
 * scaffolding) are elided in this listing; unknown types hit the
 * error log below. */
3098 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3101 memset(cmp, 0, sizeof(void*)*6);
3109 cmp[i]= c->hadamard8_diff[i];
3115 cmp[i]= c->dct_sad[i];
3118 cmp[i]= c->dct264_sad[i];
3121 cmp[i]= c->dct_max[i];
3124 cmp[i]= c->quant_psnr[i];
3153 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Zero a single 8x8 coefficient block. */
3158 static void clear_block_c(DCTELEM *block)
3160 memset(block, 0, sizeof(DCTELEM)*64);
3164 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* Zero all six 8x8 blocks of one macroblock (4 luma + 2 chroma). */
3166 static void clear_blocks_c(DCTELEM *blocks)
3168 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i], bytewise, w bytes.  Works one machine word at a time
 * using the pb_7f/pb_80 SWAR trick: add the low 7 bits of every byte,
 * then fix each byte's top bit via XOR so carries never cross bytes. */
3171 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3173 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3174 long a = *(long*)(src+i);
3175 long b = *(long*)(dst+i);
3176 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3179 dst[i+0] += src[i+0]; /* scalar tail for the remaining < sizeof(long) bytes */
/* dst[i] = src1[i] + src2[i], bytewise, using the same word-at-a-time
 * SWAR addition as add_bytes_c above. */
3182 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3184 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3185 long a = *(long*)(src1+i);
3186 long b = *(long*)(src2+i);
3187 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3190 dst[i] = src1[i]+src2[i]; /* scalar tail */
/* dst[i] = src1[i] - src2[i], bytewise, w bytes.  On targets without fast
 * unaligned loads a plain unrolled byte loop handles a misaligned src2;
 * otherwise a word-at-a-time SWAR subtraction with per-byte borrow
 * correction is used, followed by a scalar tail. */
3193 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3195 #if !HAVE_FAST_UNALIGNED
3196 if((long)src2 & (sizeof(long)-1)){
3197 for(i=0; i+7<w; i+=8){
3198 dst[i+0] = src1[i+0]-src2[i+0];
3199 dst[i+1] = src1[i+1]-src2[i+1];
3200 dst[i+2] = src1[i+2]-src2[i+2];
3201 dst[i+3] = src1[i+3]-src2[i+3];
3202 dst[i+4] = src1[i+4]-src2[i+4];
3203 dst[i+5] = src1[i+5]-src2[i+5];
3204 dst[i+6] = src1[i+6]-src2[i+6];
3205 dst[i+7] = src1[i+7]-src2[i+7];
3209 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3210 long a = *(long*)(src1+i);
3211 long b = *(long*)(src2+i);
3212 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3215 dst[i+0] = src1[i+0]-src2[i+0]; /* scalar tail */
/* HuffYUV prediction helpers (fragments).  The median predictor picks the
 * median of left (l), top (src1[i]) and left+top-topleft (lt), masked to
 * a byte; add_* reconstructs from residuals, sub_* produces residuals,
 * add_hfyu_left_prediction_c accumulates a running left predictor.
 * Loop scaffolding and left/left_top bookkeeping are elided here. */
3218 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
3226 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3235 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
3243 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3253 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
3256 for(i=0; i<w-1; i++){
/* BGR32 variant of the left predictor, tracking each channel separately
 * (body elided in this listing). */
3283 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers.  BUTTERFLYA(x,y) = |x+y| + |x-y|;
 * NOTE(review): the BUTTERFLY2/BUTTERFLY1 macro bodies are elided in
 * this listing — presumably sum/difference butterflies; confirm in the
 * full file. */
3313 #define BUTTERFLY2(o1,o2,i1,i2) \
3317 #define BUTTERFLY1(x,y) \
3326 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of an 8x8 block: 8x8 Hadamard transform of (src - dst), summing
 * absolute transform coefficients.  Rows are butterflied horizontally,
 * then columns vertically; BUTTERFLYA folds the last butterfly stage
 * into the absolute-value accumulation.  Loop headers, the temp[]
 * declaration, the sum accumulation and return are elided here; the
 * printf belongs to debug code in the full file. */
3328 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3336 //FIXME try pointer walks
3337 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3338 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3339 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3340 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3342 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3343 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3344 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3345 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3347 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3348 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3349 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3350 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3354 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3355 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3356 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3357 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3359 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3360 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3361 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3362 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3365 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3366 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3367 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3368 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3374 printf("MAX:%d\n", maxi);
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but on raw source
 * pixels (no reference subtraction; 'dummy' is unused).  The final line
 * subtracts the |DC| term so the score ignores the block mean. */
3380 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3388 //FIXME try pointer walks
3389 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3390 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3391 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3392 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3394 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3395 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3396 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3397 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3399 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3400 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3401 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3402 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3406 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3407 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3408 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3409 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3411 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3412 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3413 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3414 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3417 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3418 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3419 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3420 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3423 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-transform the pixel difference (the fdct call
 * between these lines is elided in this listing) and return the sum of
 * absolute transform coefficients. */
3428 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3429 MpegEncContext * const s= (MpegEncContext *)c;
3430 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3434 s->dsp.diff_pixels(temp, src1, src2, stride);
3436 return s->dsp.sum_abs_dctelem(temp);
/* One 1-D pass of the H.264-style 8x8 integer transform used by
 * dct264_sad8x8_c below (macro head and DST(0)/DST(4) lines are elided
 * in this listing).  s* are symmetric sums, d* antisymmetric
 * differences; the >>1 / >>2 shifts are the spec's scaled butterflies. */
3441 const int s07 = SRC(0) + SRC(7);\
3442 const int s16 = SRC(1) + SRC(6);\
3443 const int s25 = SRC(2) + SRC(5);\
3444 const int s34 = SRC(3) + SRC(4);\
3445 const int a0 = s07 + s34;\
3446 const int a1 = s16 + s25;\
3447 const int a2 = s07 - s34;\
3448 const int a3 = s16 - s25;\
3449 const int d07 = SRC(0) - SRC(7);\
3450 const int d16 = SRC(1) - SRC(6);\
3451 const int d25 = SRC(2) - SRC(5);\
3452 const int d34 = SRC(3) - SRC(4);\
3453 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3454 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3455 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3456 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3458 DST(1, a4 + (a7>>2)) ;\
3459 DST(2, a2 + (a3>>1)) ;\
3460 DST(3, a5 + (a6>>2)) ;\
3462 DST(5, a6 - (a5>>2)) ;\
3463 DST(6, (a2>>1) - a3 ) ;\
3464 DST(7, (a4>>2) - a7 ) ;\
/* SAD in the H.264 8x8 transform domain: row pass writes coefficients in
 * place, column pass redefines DST to accumulate |v| directly into sum.
 * The #undef lines and return are elided in this listing. */
3467 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3468 MpegEncContext * const s= (MpegEncContext *)c;
3473 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3475 #define SRC(x) dct[i][x]
3476 #define DST(x,v) dct[i][x]= v
3477 for( i = 0; i < 8; i++ )
3482 #define SRC(x) dct[x][i]
3483 #define DST(x,v) sum += FFABS(v)
3484 for( i = 0; i < 8; i++ )
/* Maximum absolute DCT coefficient of the pixel difference (the fdct
 * call and the coefficient loop header are elided in this listing). */
3492 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3493 MpegEncContext * const s= (MpegEncContext *)c;
3494 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3499 s->dsp.diff_pixels(temp, src1, src2, stride);
3503 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-PSNR metric: DCT the difference, keep an unquantized copy
 * in bak[], quantize + dequantize + IDCT, then return the squared error
 * introduced by quantization (fdct call and return elided here). */
3508 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3509 MpegEncContext * const s= (MpegEncContext *)c;
3510 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3511 DCTELEM * const bak = temp+64; /* second half of temp holds the reference copy */
3517 s->dsp.diff_pixels(temp, src1, src2, stride);
3519 memcpy(bak, temp, 64*sizeof(DCTELEM));
3521 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3522 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3523 ff_simple_idct(temp); //FIXME
3526 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion cost of coding one 8x8 block: quantize the DCT'd
 * difference, count the VLC bits of the (run, level) pairs in scantable
 * order (escape-coded levels cost esc_length), reconstruct, and combine
 * SSE distortion with a lambda-like qscale^2 bit weight.  Several loop
 * and branch lines are elided in this listing. */
3531 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3532 MpegEncContext * const s= (MpegEncContext *)c;
3533 const uint8_t *scantable= s->intra_scantable.permutated;
3534 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3535 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3536 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3537 int i, last, run, bits, level, distortion, start_i;
3538 const int esc_length= s->ac_esc_length;
3540 uint8_t * last_length;
3544 copy_block8(lsrc1, src1, 8, stride, 8); /* local copies so reconstruction can overwrite lsrc2 */
3545 copy_block8(lsrc2, src2, 8, stride, 8);
3547 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3549 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3555 length = s->intra_ac_vlc_length;
3556 last_length= s->intra_ac_vlc_last_length;
3557 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3560 length = s->inter_ac_vlc_length;
3561 last_length= s->inter_ac_vlc_last_length;
3566 for(i=start_i; i<last; i++){
3567 int j= scantable[i];
3572 if((level&(~127)) == 0){ /* level fits the VLC table (|level| < 128 after bias) */
3573 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3582 level= temp[i] + 64;
3586 if((level&(~127)) == 0){
3587 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3595 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3597 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3600 s->dsp.idct_add(lsrc2, 8, temp);
3602 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3604 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-cost-only variant of rd8x8_c: quantize the DCT'd difference and
 * count the VLC bits of the (run, level) pairs; no reconstruction or
 * distortion term.  Return and several branch lines are elided here. */
3607 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3608 MpegEncContext * const s= (MpegEncContext *)c;
3609 const uint8_t *scantable= s->intra_scantable.permutated;
3610 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3611 int i, last, run, bits, level, start_i;
3612 const int esc_length= s->ac_esc_length;
3614 uint8_t * last_length;
3618 s->dsp.diff_pixels(temp, src1, src2, stride);
3620 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3626 length = s->intra_ac_vlc_length;
3627 last_length= s->intra_ac_vlc_last_length;
3628 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3631 length = s->inter_ac_vlc_length;
3632 last_length= s->inter_ac_vlc_last_length;
3637 for(i=start_i; i<last; i++){
3638 int j= scantable[i];
3643 if((level&(~127)) == 0){
3644 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3653 level= temp[i] + 64;
3657 if((level&(~127)) == 0){
3658 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical SAD metrics.  VSAD_INTRA(size): sum of |row(y) - row(y-1)|
 * over one block, 4 pixels per unrolled step (intra: no reference).
 * vsad16_c: inter variant comparing the vertical gradients of s1 and s2
 * (outer loops and returns elided in this listing). */
3666 #define VSAD_INTRA(size) \
3667 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3671 for(y=1; y<h; y++){ \
3672 for(x=0; x<size; x+=4){ \
3673 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
3674 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
3684 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3689 for(x=0; x<16; x++){
3690 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Vertical SSE metrics: squared-difference analogues of VSAD_INTRA /
 * vsad16_c above, using the SQ(a) = a*a helper. */
3699 #define SQ(a) ((a)*(a))
3700 #define VSSE_INTRA(size) \
3701 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3705 for(y=1; y<h; y++){ \
3706 for(x=0; x<size; x+=4){ \
3707 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
3708 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
3718 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3723 for(x=0; x<16; x++){
3724 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 and an int16 array of
 * 'size' elements (return elided in this listing). */
3733 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3737 for(i=0; i<size; i++)
3738 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Instantiate 16x16 comparison functions from the 8x8 kernels above;
 * WRAPPER8_16_SQ applies the 8x8 function to the four quadrants. */
3742 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3743 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3744 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3746 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3748 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3749 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3750 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3751 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Elementwise float-vector kernels (fragments).  fmul: dst = src0*src1;
 * fmul_reverse: src1 is walked backwards (caller points src1 at its
 * last element); fmul_add adds src2; fmul_window applies a symmetric
 * window for overlap-add (inner loads/stores elided in this listing);
 * fmul_scalar multiplies by one scalar. */
3753 static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
3755 for(i=0; i<len; i++)
3756 dst[i] = src0[i] * src1[i];
3759 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3762 for(i=0; i<len; i++)
3763 dst[i] = src0[i] * src1[-i];
3766 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
3768 for(i=0; i<len; i++)
3769 dst[i] = src0[i] * src1[i] + src2[i];
3772 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
3777 for(i=-len, j=len-1; i<0; i++, j--) {
3782 dst[i] = s0*wj - s1*wi + add_bias;
3783 dst[j] = s0*wi + s1*wj + add_bias;
3787 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
3791 for (i = 0; i < len; i++)
3792 dst[i] = src[i] * mul;
/* Sub-vector scalar multiplies: sv is an array of pointers to 2- or
 * 4-element sub-vectors, advanced once per group.  vector_* also
 * multiplies by src; sv_* uses the sub-vectors alone. */
3795 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
3796 const float **sv, float mul, int len)
3799 for (i = 0; i < len; i += 2, sv++) {
3800 dst[i ] = src[i ] * sv[0][0] * mul;
3801 dst[i+1] = src[i+1] * sv[0][1] * mul;
3805 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
3806 const float **sv, float mul, int len)
3809 for (i = 0; i < len; i += 4, sv++) {
3810 dst[i ] = src[i ] * sv[0][0] * mul;
3811 dst[i+1] = src[i+1] * sv[0][1] * mul;
3812 dst[i+2] = src[i+2] * sv[0][2] * mul;
3813 dst[i+3] = src[i+3] * sv[0][3] * mul;
3817 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
3821 for (i = 0; i < len; i += 2, sv++) {
3822 dst[i ] = sv[0][0] * mul;
3823 dst[i+1] = sv[0][1] * mul;
3827 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
3831 for (i = 0; i < len; i += 4, sv++) {
3832 dst[i ] = sv[0][0] * mul;
3833 dst[i+1] = sv[0][1] * mul;
3834 dst[i+2] = sv[0][2] * mul;
3835 dst[i+3] = sv[0][3] * mul;
/* butterflies_float: in-place sum/difference of two non-aliasing vectors
 * (the store of v1[i]+v2[i] and of t are elided in this listing).
 * scalarproduct_float: dot product (accumulation line elided).
 * int32_to_float_fmul_scalar: convert ints to float with one scale. */
3839 static void butterflies_float_c(float *restrict v1, float *restrict v2,
3843 for (i = 0; i < len; i++) {
3844 float t = v1[i] - v2[i];
3850 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
3855 for (i = 0; i < len; i++)
3861 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
3863 for(i=0; i<len; i++)
3864 dst[i] = src[i] * mul;
/* Fast float clipping when min < 0 < max, done on the raw IEEE-754 bit
 * patterns as unsigned ints: for such a range, negative floats compare
 * above 'mini' and positives (sign bit flipped) above 'maxisign'.
 * NOTE(review): relies on bit-pattern ordering of IEEE-754 singles —
 * confirm against the full file's assumptions. */
3867 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
3868 uint32_t maxi, uint32_t maxisign)
3871 if(a > mini) return mini; /* too negative -> clamp to min */
3872 else if((a^(1<<31)) > maxisign) return maxi; /* too positive -> clamp to max */
3876 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
3878 uint32_t mini = *(uint32_t*)min;
3879 uint32_t maxi = *(uint32_t*)max;
3880 uint32_t maxisign = maxi ^ (1<<31);
3881 uint32_t *dsti = (uint32_t*)dst;
3882 const uint32_t *srci = (const uint32_t*)src;
3883 for(i=0; i<len; i+=8) { /* 8x unrolled; len is expected to be a multiple of 8 */
3884 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
3885 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
3886 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
3887 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
3888 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
3889 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
3890 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
3891 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Clip each float to [min, max].  Dispatches to the bit-pattern fast
 * path when the range straddles zero, otherwise an unrolled av_clipf
 * loop (8 elements per iteration). */
3894 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
3896 if(min < 0 && max > 0) {
3897 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
3899 for(i=0; i < len; i+=8) {
3900 dst[i ] = av_clipf(src[i ], min, max);
3901 dst[i + 1] = av_clipf(src[i + 1], min, max);
3902 dst[i + 2] = av_clipf(src[i + 2], min, max);
3903 dst[i + 3] = av_clipf(src[i + 3], min, max);
3904 dst[i + 4] = av_clipf(src[i + 4], min, max);
3905 dst[i + 5] = av_clipf(src[i + 5], min, max);
3906 dst[i + 6] = av_clipf(src[i + 6], min, max);
3907 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Bit-level float->int16 conversion.  float_to_int16_one reads the float
 * bits as an integer and clamps against the magic constant 0x43c0ffff.
 * NOTE(review): this assumes the input floats were pre-biased into a
 * specific exponent range by the caller — confirm against the DSP API
 * docs before reuse.  The interleave variant handles stereo specially
 * and falls back to a generic strided loop for other channel counts. */
3912 static av_always_inline int float_to_int16_one(const float *src){
3913 int_fast32_t tmp = *(const int32_t*)src;
3915 tmp = (0x43c0ffff - tmp)>>31; /* branchless clamp mask */
3916 // is this faster on some gcc/cpu combinations?
3917 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3920 return tmp - 0x8000; /* remove the bias to get a signed 16-bit value */
3923 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
3925 for(i=0; i<len; i++)
3926 dst[i] = float_to_int16_one(src+i);
3929 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
3932 for(i=0; i<len; i++){
3933 dst[2*i] = float_to_int16_one(src[0]+i); /* stereo fast path */
3934 dst[2*i+1] = float_to_int16_one(src[1]+i);
3937 for(c=0; c<channels; c++)
3938 for(i=0, j=c; i<len; i++, j+=channels)
3939 dst[j] = float_to_int16_one(src[c]+i);
/* int16 dot product with per-term right shift, and the fused
 * dot-product + multiply-accumulate variant (v1 += mul*v3) used by the
 * binaural/ape decoders; loop headers and returns are elided here. */
3943 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
3948 res += (*v1++ * *v2++) >> shift;
3953 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
3958 *v1++ += mul * *v3++;
/* Fixed-point IDCT constants: Wi = round(2048*sqrt(2)*cos(i*pi/16)).
 * NOTE(review): the wmv2 IDCT below also uses W0, which is defined
 * outside this fragment. */
3964 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3965 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3966 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3967 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3968 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3969 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3970 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/* One row of the WMV2 8-point IDCT: even/odd butterfly decomposition
 * with the W* cosine constants, 181/256 ~= 1/sqrt(2) for the s1/s2
 * cross terms, and +(1<<7))>>8 rounding on output.  The declarations of
 * s1/s2 and the braces are elided in this listing. */
3972 static void wmv2_idct_row(short * b)
3975 int a0,a1,a2,a3,a4,a5,a6,a7;
3977 a1 = W1*b[1]+W7*b[7];
3978 a7 = W7*b[1]-W1*b[7];
3979 a5 = W5*b[5]+W3*b[3];
3980 a3 = W3*b[5]-W5*b[3];
3981 a2 = W2*b[2]+W6*b[6];
3982 a6 = W6*b[2]-W2*b[6];
3983 a0 = W0*b[0]+W0*b[4];
3984 a4 = W0*b[0]-W0*b[4];
3986 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3987 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3989 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3990 b[1] = (a4+a6 +s1 + (1<<7))>>8;
3991 b[2] = (a4-a6 +s2 + (1<<7))>>8;
3992 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3993 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3994 b[5] = (a4-a6 -s2 + (1<<7))>>8;
3995 b[6] = (a4+a6 -s1 + (1<<7))>>8;
3996 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One column (stride 8) of the WMV2 IDCT: same butterfly structure as
 * the row pass but with extra intermediate precision (+4)>>3 on the
 * products and a final (1<<13))>>14 normalization. */
3998 static void wmv2_idct_col(short * b)
4001 int a0,a1,a2,a3,a4,a5,a6,a7;
4002 /*step 1, with extended precision*/
4003 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4004 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4005 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4006 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4007 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4008 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4009 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
4010 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
4012 s1 = (181*(a1-a5+a7-a3)+128)>>8;
4013 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4015 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4016 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
4017 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
4018 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4020 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4021 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
4022 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
4023 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 2-D WMV2 IDCT: row pass over all 8 rows, then column pass over
 * all 8 columns (loop headers elided in this listing). */
4025 void ff_wmv2_idct_c(short * block){
4029 wmv2_idct_row(block+i);
4032 wmv2_idct_col(block+i);
4035 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* IDCT + output wrappers: each runs an IDCT in place then stores
 * (put = overwrite clamped, add = accumulate clamped) into the
 * destination picture.  The jref variants' j_rev_dct* calls are elided
 * in this listing; idct4/2/1 are the reduced-size lowres versions,
 * with idct1 handling only the DC term directly via the crop table. */
4037 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4039 ff_wmv2_idct_c(block);
4040 put_pixels_clamped_c(block, dest, line_size);
4042 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4044 ff_wmv2_idct_c(block);
4045 add_pixels_clamped_c(block, dest, line_size);
4047 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4050 put_pixels_clamped_c(block, dest, line_size);
4052 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4055 add_pixels_clamped_c(block, dest, line_size);
4058 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4061 put_pixels_clamped4_c(block, dest, line_size);
4063 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4066 add_pixels_clamped4_c(block, dest, line_size);
4069 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4072 put_pixels_clamped2_c(block, dest, line_size);
4074 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4077 add_pixels_clamped2_c(block, dest, line_size);
4080 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4082 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4084 dest[0] = cm[(block[0] + 4)>>3];
4086 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4088 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4090 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
/* No-op stub installed as the default prefetch hook. */
4093 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4095 /* init static data */
/* One-time table setup: ff_cropTbl clamps [-MAX_NEG_CROP, 255+MAX_NEG_CROP]
 * to [0,255] (identity in the middle, saturating at both ends — the
 * low-end fill line is elided in this listing); ff_squareTbl gives
 * (i-256)^2; inv_zigzag_direct16 is the 1-based inverse zigzag. */
4096 av_cold void dsputil_static_init(void)
4100 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4101 for(i=0;i<MAX_NEG_CROP;i++) {
4103 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4106 for(i=0;i<512;i++) {
4107 ff_squareTbl[i] = (i - 256) * (i - 256);
4110 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Verifies the compiler honours 16-byte stack alignment by checking the
 * address of a DECLARE_ALIGNED(16) local; logs a one-time warning on
 * MMX/AltiVec builds when it does not (did_fail latch and return are
 * partly elided in this listing). */
4113 int ff_check_alignment(void){
4114 static int did_fail=0;
4115 DECLARE_ALIGNED(16, int, aligned);
4117 if((intptr_t)&aligned & 15){
4119 #if HAVE_MMX || HAVE_ALTIVEC
4120 av_log(NULL, AV_LOG_ERROR,
4121 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4122 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4123 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4124 "Do not report crashes to FFmpeg developers.\n");
4133 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4137 ff_check_alignment();
4140 if(avctx->dct_algo==FF_DCT_FASTINT) {
4141 c->fdct = fdct_ifast;
4142 c->fdct248 = fdct_ifast248;
4144 else if(avctx->dct_algo==FF_DCT_FAAN) {
4145 c->fdct = ff_faandct;
4146 c->fdct248 = ff_faandct248;
4149 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4150 c->fdct248 = ff_fdct248_islow;
4152 #endif //CONFIG_ENCODERS
4154 if(avctx->lowres==1){
4155 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4156 c->idct_put= ff_jref_idct4_put;
4157 c->idct_add= ff_jref_idct4_add;
4159 c->idct_put= ff_h264_lowres_idct_put_c;
4160 c->idct_add= ff_h264_lowres_idct_add_c;
4162 c->idct = j_rev_dct4;
4163 c->idct_permutation_type= FF_NO_IDCT_PERM;
4164 }else if(avctx->lowres==2){
4165 c->idct_put= ff_jref_idct2_put;
4166 c->idct_add= ff_jref_idct2_add;
4167 c->idct = j_rev_dct2;
4168 c->idct_permutation_type= FF_NO_IDCT_PERM;
4169 }else if(avctx->lowres==3){
4170 c->idct_put= ff_jref_idct1_put;
4171 c->idct_add= ff_jref_idct1_add;
4172 c->idct = j_rev_dct1;
4173 c->idct_permutation_type= FF_NO_IDCT_PERM;
4175 if(avctx->idct_algo==FF_IDCT_INT){
4176 c->idct_put= ff_jref_idct_put;
4177 c->idct_add= ff_jref_idct_add;
4178 c->idct = j_rev_dct;
4179 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4180 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4181 avctx->idct_algo==FF_IDCT_VP3){
4182 c->idct_put= ff_vp3_idct_put_c;
4183 c->idct_add= ff_vp3_idct_add_c;
4184 c->idct = ff_vp3_idct_c;
4185 c->idct_permutation_type= FF_NO_IDCT_PERM;
4186 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4187 c->idct_put= ff_wmv2_idct_put_c;
4188 c->idct_add= ff_wmv2_idct_add_c;
4189 c->idct = ff_wmv2_idct_c;
4190 c->idct_permutation_type= FF_NO_IDCT_PERM;
4191 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4192 c->idct_put= ff_faanidct_put;
4193 c->idct_add= ff_faanidct_add;
4194 c->idct = ff_faanidct;
4195 c->idct_permutation_type= FF_NO_IDCT_PERM;
4196 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4197 c->idct_put= ff_ea_idct_put_c;
4198 c->idct_permutation_type= FF_NO_IDCT_PERM;
4199 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4200 c->idct = ff_bink_idct_c;
4201 c->idct_add = ff_bink_idct_add_c;
4202 c->idct_put = ff_bink_idct_put_c;
4203 c->idct_permutation_type = FF_NO_IDCT_PERM;
4204 }else{ //accurate/default
4205 c->idct_put= ff_simple_idct_put;
4206 c->idct_add= ff_simple_idct_add;
4207 c->idct = ff_simple_idct;
4208 c->idct_permutation_type= FF_NO_IDCT_PERM;
4212 c->get_pixels = get_pixels_c;
4213 c->diff_pixels = diff_pixels_c;
4214 c->put_pixels_clamped = put_pixels_clamped_c;
4215 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4216 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4217 c->add_pixels_clamped = add_pixels_clamped_c;
4218 c->add_pixels8 = add_pixels8_c;
4219 c->add_pixels4 = add_pixels4_c;
4220 c->sum_abs_dctelem = sum_abs_dctelem_c;
4223 c->clear_block = clear_block_c;
4224 c->clear_blocks = clear_blocks_c;
4225 c->pix_sum = pix_sum_c;
4226 c->pix_norm1 = pix_norm1_c;
4228 c->fill_block_tab[0] = fill_block16_c;
4229 c->fill_block_tab[1] = fill_block8_c;
4230 c->scale_block = scale_block_c;
4232 /* TODO [0] 16 [1] 8 */
4233 c->pix_abs[0][0] = pix_abs16_c;
4234 c->pix_abs[0][1] = pix_abs16_x2_c;
4235 c->pix_abs[0][2] = pix_abs16_y2_c;
4236 c->pix_abs[0][3] = pix_abs16_xy2_c;
4237 c->pix_abs[1][0] = pix_abs8_c;
4238 c->pix_abs[1][1] = pix_abs8_x2_c;
4239 c->pix_abs[1][2] = pix_abs8_y2_c;
4240 c->pix_abs[1][3] = pix_abs8_xy2_c;
4242 #define dspfunc(PFX, IDX, NUM) \
4243 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4244 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4245 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4246 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4248 dspfunc(put, 0, 16);
4249 dspfunc(put_no_rnd, 0, 16);
4251 dspfunc(put_no_rnd, 1, 8);
4255 dspfunc(avg, 0, 16);
4256 dspfunc(avg_no_rnd, 0, 16);
4258 dspfunc(avg_no_rnd, 1, 8);
4263 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4264 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4266 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4267 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4268 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4269 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4270 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4271 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4272 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4273 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4274 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4276 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4277 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4278 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4279 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4280 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4281 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4282 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4283 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4284 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4286 #define dspfunc(PFX, IDX, NUM) \
4287 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4288 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4289 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4290 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4291 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4292 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4293 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4294 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4295 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4296 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4297 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4298 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4299 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4300 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4301 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4302 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4304 dspfunc(put_qpel, 0, 16);
4305 dspfunc(put_no_rnd_qpel, 0, 16);
4307 dspfunc(avg_qpel, 0, 16);
4308 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4310 dspfunc(put_qpel, 1, 8);
4311 dspfunc(put_no_rnd_qpel, 1, 8);
4313 dspfunc(avg_qpel, 1, 8);
4314 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4316 dspfunc(put_h264_qpel, 0, 16);
4317 dspfunc(put_h264_qpel, 1, 8);
4318 dspfunc(put_h264_qpel, 2, 4);
4319 dspfunc(put_h264_qpel, 3, 2);
4320 dspfunc(avg_h264_qpel, 0, 16);
4321 dspfunc(avg_h264_qpel, 1, 8);
4322 dspfunc(avg_h264_qpel, 2, 4);
4325 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4326 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4327 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4328 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4329 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4330 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4331 c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4332 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4334 c->draw_edges = draw_edges_c;
4336 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4337 ff_mlp_init(c, avctx);
4339 #if CONFIG_VC1_DECODER
4340 ff_vc1dsp_init(c,avctx);
4342 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4343 ff_intrax8dsp_init(c,avctx);
4345 #if CONFIG_RV30_DECODER
4346 ff_rv30dsp_init(c,avctx);
4348 #if CONFIG_RV40_DECODER
4349 ff_rv40dsp_init(c,avctx);
4350 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4351 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4352 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4353 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4356 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
4357 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4358 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4359 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4360 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4361 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4362 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4363 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4365 #define SET_CMP_FUNC(name) \
4366 c->name[0]= name ## 16_c;\
4367 c->name[1]= name ## 8x8_c;
4369 SET_CMP_FUNC(hadamard8_diff)
4370 c->hadamard8_diff[4]= hadamard8_intra16_c;
4371 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4372 SET_CMP_FUNC(dct_sad)
4373 SET_CMP_FUNC(dct_max)
4375 SET_CMP_FUNC(dct264_sad)
4377 c->sad[0]= pix_abs16_c;
4378 c->sad[1]= pix_abs8_c;
4382 SET_CMP_FUNC(quant_psnr)
4385 c->vsad[0]= vsad16_c;
4386 c->vsad[4]= vsad_intra16_c;
4387 c->vsad[5]= vsad_intra8_c;
4388 c->vsse[0]= vsse16_c;
4389 c->vsse[4]= vsse_intra16_c;
4390 c->vsse[5]= vsse_intra8_c;
4391 c->nsse[0]= nsse16_c;
4392 c->nsse[1]= nsse8_c;
4394 ff_dsputil_init_dwt(c);
4397 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4399 c->add_bytes= add_bytes_c;
4400 c->add_bytes_l2= add_bytes_l2_c;
4401 c->diff_bytes= diff_bytes_c;
4402 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4403 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4404 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
4405 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4406 c->bswap_buf= bswap_buf;
4407 #if CONFIG_PNG_DECODER
4408 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4411 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4412 c->h263_h_loop_filter= h263_h_loop_filter_c;
4413 c->h263_v_loop_filter= h263_v_loop_filter_c;
4416 if (CONFIG_VP3_DECODER) {
4417 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4418 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4419 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4422 c->h261_loop_filter= h261_loop_filter_c;
4424 c->try_8x8basis= try_8x8basis_c;
4425 c->add_8x8basis= add_8x8basis_c;
4427 #if CONFIG_VORBIS_DECODER
4428 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4430 #if CONFIG_AC3_DECODER
4431 c->ac3_downmix = ff_ac3_downmix_c;
4433 c->vector_fmul = vector_fmul_c;
4434 c->vector_fmul_reverse = vector_fmul_reverse_c;
4435 c->vector_fmul_add = vector_fmul_add_c;
4436 c->vector_fmul_window = ff_vector_fmul_window_c;
4437 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4438 c->vector_clipf = vector_clipf_c;
4439 c->float_to_int16 = ff_float_to_int16_c;
4440 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4441 c->scalarproduct_int16 = scalarproduct_int16_c;
4442 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4443 c->scalarproduct_float = scalarproduct_float_c;
4444 c->butterflies_float = butterflies_float_c;
4445 c->vector_fmul_scalar = vector_fmul_scalar_c;
4447 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4448 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4450 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4451 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4453 c->shrink[0]= av_image_copy_plane;
4454 c->shrink[1]= ff_shrink22;
4455 c->shrink[2]= ff_shrink44;
4456 c->shrink[3]= ff_shrink88;
4458 c->prefetch= just_return;
4460 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4461 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4463 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
4464 if (ARCH_ARM) dsputil_init_arm (c, avctx);
4465 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
4466 if (HAVE_VIS) dsputil_init_vis (c, avctx);
4467 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
4468 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
4469 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
4470 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
4471 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
4473 for(i=0; i<64; i++){
4474 if(!c->put_2tap_qpel_pixels_tab[0][i])
4475 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4476 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4477 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4480 c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4481 c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4482 c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4483 c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4485 c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4486 c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4487 c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4488 c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4490 switch(c->idct_permutation_type){
4491 case FF_NO_IDCT_PERM:
4493 c->idct_permutation[i]= i;
4495 case FF_LIBMPEG2_IDCT_PERM:
4497 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4499 case FF_SIMPLE_IDCT_PERM:
4501 c->idct_permutation[i]= simple_mmx_permutation[i];
4503 case FF_TRANSPOSE_IDCT_PERM:
4505 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4507 case FF_PARTTRANS_IDCT_PERM:
4509 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4511 case FF_SSE2_IDCT_PERM:
4513 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4516 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");