/*
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavcore/imgutils.h"
#include "simple_idct.h"
#include "mpegvideo.h"
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f, depending on whether unsigned long is 32 or 64 bits
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
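/* Illustrative arithmetic behind the ~0UL/255 trick (assuming a 32-bit
 * unsigned long):
 *   ~0UL/255          = 0xFFFFFFFF/255 = 0x01010101  (one 0x01 per byte)
 *   0x01010101 * 0x7f = 0x7f7f7f7f                   (pb_7f)
 *   0x01010101 * 0x80 = 0x80808080                   (pb_80)
 * With a 64-bit unsigned long the same expressions give
 * 0x7f7f7f7f7f7f7f7f and 0x8080808080808080, so the SWAR helpers in this
 * file automatically work at the native word size. */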
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i, j, end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
    }

    end= -1;
    for(i=0; i<64; i++){
        j = st->permutated[i];
        if(j > end) end= j;
        st->raster_end[i]= end;
    }
}
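/* Usage sketch (illustrative, not part of this file): a codec typically owns
 * a ScanTable and initializes it once with the IDCT's input permutation, e.g.
 *
 *     ff_init_scantable(s->dsp.idct_permutation, &s->intra_scantable, ff_zigzag_direct);
 *
 * (field names follow MpegEncContext and are only an assumption here).
 * Afterwards st->permutated[i] maps scan position i to the coefficient index
 * the possibly permuted IDCT expects, and st->raster_end[i] holds the highest
 * permuted index seen up to i, which bounds partial-IDCT optimizations. */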
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0] + pix[1] + pix[2] + pix[3] +
                 pix[4] + pix[5] + pix[6] + pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= av_bswap32(src[i+0]);
        dst[i+1]= av_bswap32(src[i+1]);
        dst[i+2]= av_bswap32(src[i+2]);
        dst[i+3]= av_bswap32(src[i+3]);
        dst[i+4]= av_bswap32(src[i+4]);
        dst[i+5]= av_bswap32(src[i+5]);
        dst[i+6]= av_bswap32(src[i+6]);
        dst[i+7]= av_bswap32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= av_bswap32(src[i+0]);
    }
}
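/* The sse*_c functions below index ff_squareTbl at an offset of 256, so
 * sq[d] is valid for any difference d in [-256, 255] and (once the table is
 * initialized to sq[d] = d*d) squares pixel differences without an abs() or
 * multiply.  Illustrative: pix1[0]=10, pix2[0]=250 gives
 * sq[10-250] = sq[-240] = 57600 = (-240)*(-240). */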
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
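/* Illustrative result for a 2x2 picture [a b; c d] with w=1 (every border
 * pixel is replicated outward, corners included):
 *
 *     a a b b
 *     a a b b      <- original picture is the inner 2x2
 *     c c d d
 *     c c d d
 */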
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    /* top and bottom */
    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }

    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }

    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
/**
 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
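/* Usage sketch (illustrative): when a motion vector points (partly) outside
 * the reference picture, a decoder materializes the needed block into a
 * scratch buffer first and reads from that instead, e.g.
 *
 *     if (src_x < 0 || src_y < 0 || src_x + 17 > s->h_edge_pos
 *                                || src_y + 17 > s->v_edge_pos) {
 *         ff_emulated_edge_mc(s->edge_emu_buffer, src, s->linesize,
 *                             17, 17, src_x, src_y, s->h_edge_pos, s->v_edge_pos);
 *         src = s->edge_emu_buffer;
 *     }
 *
 * The field names above follow MpegEncContext but are only an assumption here. */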
void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}

static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}

static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}

static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}
static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                    int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = block[0];
        pixels[1] = block[1];
        pixels[2] = block[2];
        pixels[3] = block[3];
        pixels[4] = block[4];
        pixels[5] = block[5];
        pixels[6] = block[6];
        pixels[7] = block[7];

        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;

    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}

static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;

    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;

    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}

static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 16);
        block += line_size;
    }
}

static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memset(block, value, 8);
        block += line_size;
    }
}
static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
{
    int i, j;
    uint16_t *dst1 = (uint16_t *) dst;
    uint16_t *dst2 = (uint16_t *)(dst + linesize);

    for (j = 0; j < 8; j++) {
        for (i = 0; i < 8; i++) {
            dst1[i] = dst2[i] = src[i] * 0x0101;
        }
        src  += 8;
        dst1 += linesize;
        dst2 += linesize;
    }
}
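/* The multiply by 0x0101 duplicates the source byte into both bytes of a
 * uint16_t, so each source pixel becomes a 2x2 block in dst:
 * e.g. src[i] = 0xAB -> dst1[i] = dst2[i] = 0xABAB, i.e. the byte 0xAB
 * written to two adjacent columns on two adjacent lines (2x upscale). */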
#if 0 // 64 bit variant
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    for(i=0; i<h; i++){\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= AV_RN64(pixels  );\
    const uint64_t b= AV_RN64(pixels+1);\
    uint64_t l0= (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= AV_RN64(pixels  );\
        uint64_t b= AV_RN64(pixels+1);\
        l1= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN64(pixels  );\
        b= AV_RN64(pixels+1);\
        l0= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL)\
          + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint64_t a= AV_RN64(pixels  );\
    const uint64_t b= AV_RN64(pixels+1);\
    uint64_t l0= (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    uint64_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint64_t a= AV_RN64(pixels  );\
        uint64_t b= AV_RN64(pixels+1);\
        l1= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN64(pixels  );\
        b= AV_RN64(pixels+1);\
        l0= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL)\
          + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
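/* Why (a|b) - (((a^b)&0xFE..FE)>>1) is a per-byte rounded average:
 * for each byte, a+b == 2*(a|b) - (a^b), hence (a+b+1)>>1 == (a|b) - ((a^b)>>1).
 * Masking (a^b) with 0xFE before the shift stops bits from one byte lane
 * leaking into the next, so the whole word is averaged with byte granularity
 * and no overflow.  Example (one byte): a=3, b=6 ->
 * (3|6) - (((3^6)&0xFE)>>1) = 7 - 2 = 5 = (3+6+1)>>1.
 * The no_rnd variants use the dual identity (a+b)>>1 == (a&b) + ((a^b)>>1),
 * which rounds down instead of up. */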
#else // 64 bit variant

#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0= (a&0x03030303UL)\
          + (b&0x03030303UL)\
          + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0= (a&0x03030303UL)\
          + (b&0x03030303UL)\
          + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0= (a&0x03030303UL)\
          + (b&0x03030303UL)\
          + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0= (a&0x03030303UL)\
          + (b&0x03030303UL)\
          + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, a0, b0, a1, b1;\
    a0= pixels[0];\
    b0= pixels[1] + 2;\
    a0 += b0;\
    b0 += pixels[2];\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        a1= pixels[0];\
        b1= pixels[1];\
        a1 += b1;\
        b1 += pixels[2];\
\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
\
        pixels+=line_size;\
        block +=line_size;\
\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i;\
    const uint32_t a= AV_RN32(pixels  );\
    const uint32_t b= AV_RN32(pixels+1);\
    uint32_t l0= (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    uint32_t l1,h1;\
\
    pixels+=line_size;\
    for(i=0; i<h; i+=2){\
        uint32_t a= AV_RN32(pixels  );\
        uint32_t b= AV_RN32(pixels+1);\
        l1= (a&0x03030303UL)\
          + (b&0x03030303UL);\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
        a= AV_RN32(pixels  );\
        b= AV_RN32(pixels+1);\
        l0= (a&0x03030303UL)\
          + (b&0x03030303UL)\
          + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, j;\
    for(j=0; j<2; j++){\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
                   + (b&0x03030303UL)\
                   + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1= (a&0x03030303UL)\
              + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0= (a&0x03030303UL)\
              + (b&0x03030303UL)\
              + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int i, j;\
    for(j=0; j<2; j++){\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
                   + (b&0x03030303UL)\
                   + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1= (a&0x03030303UL)\
              + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0= (a&0x03030303UL)\
              + (b&0x03030303UL)\
              + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c, OPNAME ## _pixels8_c         , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)
#endif

#define op_avg(a, b) a = rnd_avg32(a, b)
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

#define put_no_rnd_pixels8_c  put_pixels8_c
#define put_no_rnd_pixels16_c put_pixels16_c

#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
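/* avg2/avg4 are plain scalar round-half-up averages: avg2(a,b) = (a+b+1)>>1
 * and avg4 adds 2 before the >>2, e.g. avg4(1,2,3,4) = (10+2)>>2 = 3. */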
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
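/* gmc1_c: one-point global motion compensation, a bilinear interpolation with
 * 1/16-pel weights.  A+B+C+D = (16-x16)*(16-y16) + x16*(16-y16) + (16-x16)*y16
 * + x16*y16 = 256, so the >>8 renormalizes.  E.g. x16 = y16 = 8 gives
 * A = B = C = D = 64: a plain average of the 2x2 neighbourhood. */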
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
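/* ff_gmc_c: general (affine) global motion compensation.  The source position
 * for each destination pixel is accumulated from (ox,oy) with the per-pixel
 * steps (dxx,dyx) and per-line steps (dxy,dyy); the low `shift` bits are the
 * subpel fraction (frac_x/frac_y below), and the sample is bilinearly
 * interpolated with rounder r, falling back to edge-clipped taps when the
 * position leaves the picture (the branches below). */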
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
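/* The tpel (thirdpel, used by SVQ3) filters below approximate division by 3
 * and by 12 with fixed-point multiplies: 683/2048 ~= 1/3 (683*3 = 2049) and
 * 2731/32768 ~= 1/12 (2731*12 = 32772).  E.g. mc10 computes a rounded
 * (2*src[j] + src[j+1])/3 via (683*(2*src[j] + src[j+1] + 1)) >> 11. */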
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
\
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)
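/* The chroma weights always satisfy A+B+C+D = 64 (= 8*8), so op_put's
 * ((b)+32)>>6 is a rounded division by 64; op_avg then averages that result
 * with the existing pixel, again round-half-up. */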
H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
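/* VC-1 uses the same bilinear chroma filter but with a different rounding
 * constant: the "+ 32 - 4" below is a rounder of 28, which biases the
 * division by 64 slightly downward compared to the H.264 variant above
 * (hence the "no_rnd" naming). */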
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst += stride;
        src += stride;
    }
}

static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst += stride;
        src += stride;
    }
}
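/* QPEL_MC builds the MPEG-4 quarter-pel primitives from a symmetric 8-tap
 * half-pel filter with coefficients (-1, 3, -6, 20, 20, -6, 3, -1)/32; e.g.
 * the dst[0] row below is 20*(src[0]+src[1]) - 6*(src[0]+src[2])
 * + 3*(src[1]+src[3]) - (src[2]+src[4]), with out-of-block taps mirrored
 * back into the 0..8 range (hence the repeated src[8] near the right edge).
 * The quarter-pel positions are then built by averaging these half-pel
 * results with the l2/l4 helpers defined above. */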
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=16;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
1764 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1766 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1767 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1770 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1771 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1774 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1776 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1777 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1780 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1781 uint8_t full[16*9];\
1783 copy_block9(full, src, 16, stride, 9);\
1784 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1785 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1788 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1789 uint8_t full[16*9];\
1790 copy_block9(full, src, 16, stride, 9);\
1791 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1794 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1795 uint8_t full[16*9];\
1797 copy_block9(full, src, 16, stride, 9);\
1798 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1799 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1801 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1802 uint8_t full[16*9];\
1805 uint8_t halfHV[64];\
1806 copy_block9(full, src, 16, stride, 9);\
1807 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1809 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1810 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1812 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1813 uint8_t full[16*9];\
1815 uint8_t halfHV[64];\
1816 copy_block9(full, src, 16, stride, 9);\
1817 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1818 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1819 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1820 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1822 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1823 uint8_t full[16*9];\
1826 uint8_t halfHV[64];\
1827 copy_block9(full, src, 16, stride, 9);\
1828 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1829 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1830 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1831 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1833 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1834 uint8_t full[16*9];\
1836 uint8_t halfHV[64];\
1837 copy_block9(full, src, 16, stride, 9);\
1838 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1839 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1840 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1841 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1843 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1844 uint8_t full[16*9];\
1847 uint8_t halfHV[64];\
1848 copy_block9(full, src, 16, stride, 9);\
1849 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1850 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1851 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1852 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1854 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1855 uint8_t full[16*9];\
1857 uint8_t halfHV[64];\
1858 copy_block9(full, src, 16, stride, 9);\
1859 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1860 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1861 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1862 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1864 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1865 uint8_t full[16*9];\
1868 uint8_t halfHV[64];\
1869 copy_block9(full, src, 16, stride, 9);\
1870 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1871 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1872 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1873 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1875 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1876 uint8_t full[16*9];\
1878 uint8_t halfHV[64];\
1879 copy_block9(full, src, 16, stride, 9);\
1880 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1881 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1882 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1883 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1885 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1887 uint8_t halfHV[64];\
1888 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1889 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1890 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1892 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1894 uint8_t halfHV[64];\
1895 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1896 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1897 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1899 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1900 uint8_t full[16*9];\
1903 uint8_t halfHV[64];\
1904 copy_block9(full, src, 16, stride, 9);\
1905 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1906 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1907 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1908 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1910 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1911 uint8_t full[16*9];\
1913 copy_block9(full, src, 16, stride, 9);\
1914 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1915 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1916 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1918 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1919 uint8_t full[16*9];\
1922 uint8_t halfHV[64];\
1923 copy_block9(full, src, 16, stride, 9);\
1924 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1925 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1926 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1927 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1929 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1930 uint8_t full[16*9];\
1932 copy_block9(full, src, 16, stride, 9);\
1933 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1934 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1935 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1937 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1939 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1940 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1943 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1945 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1946 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1949 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1950 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1953 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1954 uint8_t half[256];\
1955 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1956 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1959 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1960 uint8_t full[24*17];\
1961 uint8_t half[256];\
1962 copy_block17(full, src, 24, stride, 17);\
1963 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1964 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1967 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1968 uint8_t full[24*17];\
1969 copy_block17(full, src, 24, stride, 17);\
1970 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1973 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1974 uint8_t full[24*17];\
1975 uint8_t half[256];\
1976 copy_block17(full, src, 24, stride, 17);\
1977 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1978 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1980 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1981 uint8_t full[24*17];\
1982 uint8_t halfH[272];\
1983 uint8_t halfV[256];\
1984 uint8_t halfHV[256];\
1985 copy_block17(full, src, 24, stride, 17);\
1986 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1987 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1988 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1989 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1991 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1992 uint8_t full[24*17];\
1993 uint8_t halfH[272];\
1994 uint8_t halfHV[256];\
1995 copy_block17(full, src, 24, stride, 17);\
1996 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1997 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1998 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1999 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2001 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2002 uint8_t full[24*17];\
2003 uint8_t halfH[272];\
2004 uint8_t halfV[256];\
2005 uint8_t halfHV[256];\
2006 copy_block17(full, src, 24, stride, 17);\
2007 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2008 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2009 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2010 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2012 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2013 uint8_t full[24*17];\
2014 uint8_t halfH[272];\
2015 uint8_t halfHV[256];\
2016 copy_block17(full, src, 24, stride, 17);\
2017 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2018 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2019 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2020 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2022 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2023 uint8_t full[24*17];\
2024 uint8_t halfH[272];\
2025 uint8_t halfV[256];\
2026 uint8_t halfHV[256];\
2027 copy_block17(full, src, 24, stride, 17);\
2028 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2029 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2030 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2031 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2033 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2034 uint8_t full[24*17];\
2035 uint8_t halfH[272];\
2036 uint8_t halfHV[256];\
2037 copy_block17(full, src, 24, stride, 17);\
2038 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2039 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2040 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2041 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2043 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2044 uint8_t full[24*17];\
2045 uint8_t halfH[272];\
2046 uint8_t halfV[256];\
2047 uint8_t halfHV[256];\
2048 copy_block17(full, src, 24, stride, 17);\
2049 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2050 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2051 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2052 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2054 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2055 uint8_t full[24*17];\
2056 uint8_t halfH[272];\
2057 uint8_t halfHV[256];\
2058 copy_block17(full, src, 24, stride, 17);\
2059 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2060 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2061 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2062 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2064 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2065 uint8_t halfH[272];\
2066 uint8_t halfHV[256];\
2067 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2068 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2069 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2071 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2072 uint8_t halfH[272];\
2073 uint8_t halfHV[256];\
2074 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2075 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2076 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2078 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2079 uint8_t full[24*17];\
2080 uint8_t halfH[272];\
2081 uint8_t halfV[256];\
2082 uint8_t halfHV[256];\
2083 copy_block17(full, src, 24, stride, 17);\
2084 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2085 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2086 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2087 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2089 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2090 uint8_t full[24*17];\
2091 uint8_t halfH[272];\
2092 copy_block17(full, src, 24, stride, 17);\
2093 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2094 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2095 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2097 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2098 uint8_t full[24*17];\
2099 uint8_t halfH[272];\
2100 uint8_t halfV[256];\
2101 uint8_t halfHV[256];\
2102 copy_block17(full, src, 24, stride, 17);\
2103 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2104 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2105 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2106 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2108 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2109 uint8_t full[24*17];\
2110 uint8_t halfH[272];\
2111 copy_block17(full, src, 24, stride, 17);\
2112 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2113 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2114 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2116 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2117 uint8_t halfH[272];\
2118 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2119 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2122 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2123 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2124 #define op_put(a, b) a = cm[((b) + 16)>>5]
2125 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
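/* The _no_rnd variants bias the normalizing shift with +15 instead of +16,
 * i.e. they round down; MPEG-4 needs both rounding modes for bit-exact
 * motion compensation. Each QPEL_MC() instantiation below expands one
 * (OPNAME, RND, OP) triple into all 16 quarter-pel positions for 8x8 and
 * 16x16 blocks. */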
2127 QPEL_MC(0, put_ , _ , op_put)
2128 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2129 QPEL_MC(0, avg_ , _ , op_avg)
2130 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2131 #undef op_avg
2132 #undef op_avg_no_rnd
2133 #undef op_put
2134 #undef op_put_no_rnd
2136 #define put_qpel8_mc00_c ff_put_pixels8x8_c
2137 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
2138 #define put_qpel16_mc00_c ff_put_pixels16x16_c
2139 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2140 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
2141 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
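/* H.264 sub-pel interpolation: half-pel samples come from the 6-tap filter
 * (1,-5,20,20,-5,1)/32, applied horizontally, vertically, or both; the
 * *_hv_lowpass versions keep unclipped intermediate sums in 'tmp' between
 * the two passes. */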
2144 #define H264_LOWPASS(OPNAME, OP, OP2) \
2145 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2147 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2151 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2152 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2158 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2160 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2164 const int srcB= src[-2*srcStride];\
2165 const int srcA= src[-1*srcStride];\
2166 const int src0= src[0 *srcStride];\
2167 const int src1= src[1 *srcStride];\
2168 const int src2= src[2 *srcStride];\
2169 const int src3= src[3 *srcStride];\
2170 const int src4= src[4 *srcStride];\
2171 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2172 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2178 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2181 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2183 src -= 2*srcStride;\
2184 for(i=0; i<h+5; i++)\
2186 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2187 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2191 tmp -= tmpStride*(h+5-2);\
2194 const int tmpB= tmp[-2*tmpStride];\
2195 const int tmpA= tmp[-1*tmpStride];\
2196 const int tmp0= tmp[0 *tmpStride];\
2197 const int tmp1= tmp[1 *tmpStride];\
2198 const int tmp2= tmp[2 *tmpStride];\
2199 const int tmp3= tmp[3 *tmpStride];\
2200 const int tmp4= tmp[4 *tmpStride];\
2201 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2202 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2207 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2209 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2213 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2214 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2215 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2216 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2222 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2224 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2228 const int srcB= src[-2*srcStride];\
2229 const int srcA= src[-1*srcStride];\
2230 const int src0= src[0 *srcStride];\
2231 const int src1= src[1 *srcStride];\
2232 const int src2= src[2 *srcStride];\
2233 const int src3= src[3 *srcStride];\
2234 const int src4= src[4 *srcStride];\
2235 const int src5= src[5 *srcStride];\
2236 const int src6= src[6 *srcStride];\
2237 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2238 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2239 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2240 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2246 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2249 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2251 src -= 2*srcStride;\
2252 for(i=0; i<h+5; i++)\
2254 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2255 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2256 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2257 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2261 tmp -= tmpStride*(h+5-2);\
2264 const int tmpB= tmp[-2*tmpStride];\
2265 const int tmpA= tmp[-1*tmpStride];\
2266 const int tmp0= tmp[0 *tmpStride];\
2267 const int tmp1= tmp[1 *tmpStride];\
2268 const int tmp2= tmp[2 *tmpStride];\
2269 const int tmp3= tmp[3 *tmpStride];\
2270 const int tmp4= tmp[4 *tmpStride];\
2271 const int tmp5= tmp[5 *tmpStride];\
2272 const int tmp6= tmp[6 *tmpStride];\
2273 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2274 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2275 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2276 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2282 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2284 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2288 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2289 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2290 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2291 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2292 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2293 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2294 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2295 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2301 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2303 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2307 const int srcB= src[-2*srcStride];\
2308 const int srcA= src[-1*srcStride];\
2309 const int src0= src[0 *srcStride];\
2310 const int src1= src[1 *srcStride];\
2311 const int src2= src[2 *srcStride];\
2312 const int src3= src[3 *srcStride];\
2313 const int src4= src[4 *srcStride];\
2314 const int src5= src[5 *srcStride];\
2315 const int src6= src[6 *srcStride];\
2316 const int src7= src[7 *srcStride];\
2317 const int src8= src[8 *srcStride];\
2318 const int src9= src[9 *srcStride];\
2319 const int src10=src[10*srcStride];\
2320 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2321 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2322 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2323 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2324 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2325 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2326 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2327 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2333 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2336 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2338 src -= 2*srcStride;\
2339 for(i=0; i<h+5; i++)\
2341 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2342 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2343 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2344 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2345 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2346 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2347 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2348 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2352 tmp -= tmpStride*(h+5-2);\
2355 const int tmpB= tmp[-2*tmpStride];\
2356 const int tmpA= tmp[-1*tmpStride];\
2357 const int tmp0= tmp[0 *tmpStride];\
2358 const int tmp1= tmp[1 *tmpStride];\
2359 const int tmp2= tmp[2 *tmpStride];\
2360 const int tmp3= tmp[3 *tmpStride];\
2361 const int tmp4= tmp[4 *tmpStride];\
2362 const int tmp5= tmp[5 *tmpStride];\
2363 const int tmp6= tmp[6 *tmpStride];\
2364 const int tmp7= tmp[7 *tmpStride];\
2365 const int tmp8= tmp[8 *tmpStride];\
2366 const int tmp9= tmp[9 *tmpStride];\
2367 const int tmp10=tmp[10*tmpStride];\
2368 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2369 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2370 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2371 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2372 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2373 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2374 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2375 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2381 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2382 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2383 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2384 src += 8*srcStride;\
2385 dst += 8*dstStride;\
2386 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2387 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2390 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2391 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2392 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2393 src += 8*srcStride;\
2394 dst += 8*dstStride;\
2395 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2396 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2399 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2400 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2401 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2402 src += 8*srcStride;\
2403 dst += 8*dstStride;\
2404 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2405 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
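/* H264_MC expands, for one block size, the 16 quarter-pel positions mcXY
 * (X = horizontal, Y = vertical quarter-sample offset). Half-pel planes are
 * produced by the lowpass filters above; quarter positions are the average
 * of two such planes via OPNAME ## pixels ## SIZE ## _l2(). */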
2408 #define H264_MC(OPNAME, SIZE) \
2409 static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2410 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2413 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2414 uint8_t half[SIZE*SIZE];\
2415 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2416 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2419 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2420 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2423 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2424 uint8_t half[SIZE*SIZE];\
2425 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2426 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2429 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2430 uint8_t full[SIZE*(SIZE+5)];\
2431 uint8_t * const full_mid= full + SIZE*2;\
2432 uint8_t half[SIZE*SIZE];\
2433 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2434 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2435 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2438 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2439 uint8_t full[SIZE*(SIZE+5)];\
2440 uint8_t * const full_mid= full + SIZE*2;\
2441 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2442 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2445 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2446 uint8_t full[SIZE*(SIZE+5)];\
2447 uint8_t * const full_mid= full + SIZE*2;\
2448 uint8_t half[SIZE*SIZE];\
2449 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2450 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2451 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2454 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2455 uint8_t full[SIZE*(SIZE+5)];\
2456 uint8_t * const full_mid= full + SIZE*2;\
2457 uint8_t halfH[SIZE*SIZE];\
2458 uint8_t halfV[SIZE*SIZE];\
2459 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2460 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2461 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2462 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2465 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2466 uint8_t full[SIZE*(SIZE+5)];\
2467 uint8_t * const full_mid= full + SIZE*2;\
2468 uint8_t halfH[SIZE*SIZE];\
2469 uint8_t halfV[SIZE*SIZE];\
2470 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2471 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2472 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2473 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2476 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2477 uint8_t full[SIZE*(SIZE+5)];\
2478 uint8_t * const full_mid= full + SIZE*2;\
2479 uint8_t halfH[SIZE*SIZE];\
2480 uint8_t halfV[SIZE*SIZE];\
2481 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2482 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2483 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2484 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2487 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2488 uint8_t full[SIZE*(SIZE+5)];\
2489 uint8_t * const full_mid= full + SIZE*2;\
2490 uint8_t halfH[SIZE*SIZE];\
2491 uint8_t halfV[SIZE*SIZE];\
2492 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2493 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2494 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2495 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2498 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2499 int16_t tmp[SIZE*(SIZE+5)];\
2500 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2503 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2504 int16_t tmp[SIZE*(SIZE+5)];\
2505 uint8_t halfH[SIZE*SIZE];\
2506 uint8_t halfHV[SIZE*SIZE];\
2507 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2508 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2509 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2512 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2513 int16_t tmp[SIZE*(SIZE+5)];\
2514 uint8_t halfH[SIZE*SIZE];\
2515 uint8_t halfHV[SIZE*SIZE];\
2516 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2517 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2518 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2521 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2522 uint8_t full[SIZE*(SIZE+5)];\
2523 uint8_t * const full_mid= full + SIZE*2;\
2524 int16_t tmp[SIZE*(SIZE+5)];\
2525 uint8_t halfV[SIZE*SIZE];\
2526 uint8_t halfHV[SIZE*SIZE];\
2527 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2528 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2529 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2530 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2533 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2534 uint8_t full[SIZE*(SIZE+5)];\
2535 uint8_t * const full_mid= full + SIZE*2;\
2536 int16_t tmp[SIZE*(SIZE+5)];\
2537 uint8_t halfV[SIZE*SIZE];\
2538 uint8_t halfHV[SIZE*SIZE];\
2539 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2540 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2541 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2542 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
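/* OP/OP2 below normalize the filter output: a single 6-tap pass scales by
 * 32 (round with +16, shift by 5), two cascaded passes scale by 1024
 * (round with +512, shift by 10); cm[] clips the result to 0..255. */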
2545 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2546 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2547 #define op_put(a, b) a = cm[((b) + 16)>>5]
2548 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2549 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2551 H264_LOWPASS(put_ , op_put, op2_put)
2552 H264_LOWPASS(avg_ , op_avg, op2_avg)
2567 #define put_h264_qpel8_mc00_c ff_put_pixels8x8_c
2568 #define avg_h264_qpel8_mc00_c ff_avg_pixels8x8_c
2569 #define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
2570 #define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
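/* WMV2 "mspel" half-pel interpolation: a 4-tap (-1,9,9,-1)/16 filter with
 * +8 rounding, in horizontal and vertical variants. */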
2572 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2573 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2577 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2578 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2579 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2580 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2581 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2582 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2583 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2584 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2590 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2591 put_pixels8_c(dst, src, stride, 8);
2593 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2594 avg_pixels8_c(dst, src, stride, 8);
2596 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2597 put_pixels16_c(dst, src, stride, 16);
2599 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2600 avg_pixels16_c(dst, src, stride, 16);
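/* RV40 defines its (3,3) luma sub-pel position as a plain 2x2 average, so
 * the generic *_pixels*_xy2_c helpers can be reused as-is. */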
2603 #if CONFIG_RV40_DECODER
2604 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2605 put_pixels16_xy2_c(dst, src, stride, 16);
2607 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2608 avg_pixels16_xy2_c(dst, src, stride, 16);
2610 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2611 put_pixels8_xy2_c(dst, src, stride, 8);
2613 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2614 avg_pixels8_xy2_c(dst, src, stride, 8);
2616 #endif /* CONFIG_RV40_DECODER */
2618 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2619 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2623 const int src_1= src[ -srcStride];
2624 const int src0 = src[0 ];
2625 const int src1 = src[ srcStride];
2626 const int src2 = src[2*srcStride];
2627 const int src3 = src[3*srcStride];
2628 const int src4 = src[4*srcStride];
2629 const int src5 = src[5*srcStride];
2630 const int src6 = src[6*srcStride];
2631 const int src7 = src[7*srcStride];
2632 const int src8 = src[8*srcStride];
2633 const int src9 = src[9*srcStride];
2634 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2635 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2636 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2637 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2638 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2639 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2640 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2641 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2647 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2648 uint8_t half[64];
2649 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2650 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2653 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2654 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2657 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2658 uint8_t half[64];
2659 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2660 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2663 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2664 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2667 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2668 uint8_t halfH[88];
2669 uint8_t halfV[64];
2670 uint8_t halfHV[64];
2671 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2672 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2673 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2674 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2676 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2677 uint8_t halfH[88];
2678 uint8_t halfV[64];
2679 uint8_t halfHV[64];
2680 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2681 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2682 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2683 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2685 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2686 uint8_t halfH[88];
2687 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2688 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
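/* H.263 in-loop deblocking: per pixel pair across a block edge,
 * d = (p0 - p3 + 4*(p2 - p1))/8 is mapped through a qscale-dependent ramp
 * to d1, which is applied to the inner pixels p1/p2; the outer pixels
 * p0/p3 then receive a correction d2 clipped to |d1|/2. */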
2691 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2692 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2694 const int strength= ff_h263_loop_filter_strength[qscale];
2698 int p0= src[x-2*stride];
2699 int p1= src[x-1*stride];
2700 int p2= src[x+0*stride];
2701 int p3= src[x+1*stride];
2702 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2704 if (d<-2*strength) d1= 0;
2705 else if(d<- strength) d1=-2*strength - d;
2706 else if(d< strength) d1= d;
2707 else if(d< 2*strength) d1= 2*strength - d;
2708 else d1= 0;
2710 p1 += d1;
2711 p2 -= d1;
2712 if(p1&256) p1= ~(p1>>31);
2713 if(p2&256) p2= ~(p2>>31);
2715 src[x-1*stride] = p1;
2716 src[x+0*stride] = p2;
2718 ad1= FFABS(d1)>>1;
2720 d2= av_clip((p0-p3)/4, -ad1, ad1);
2722 src[x-2*stride] = p0 - d2;
2723 src[x+ stride] = p3 + d2;
2728 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2729 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2731 const int strength= ff_h263_loop_filter_strength[qscale];
2735 int p0= src[y*stride-2];
2736 int p1= src[y*stride-1];
2737 int p2= src[y*stride+0];
2738 int p3= src[y*stride+1];
2739 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2741 if (d<-2*strength) d1= 0;
2742 else if(d<- strength) d1=-2*strength - d;
2743 else if(d< strength) d1= d;
2744 else if(d< 2*strength) d1= 2*strength - d;
2745 else d1= 0;
2747 p1 += d1;
2748 p2 -= d1;
2749 if(p1&256) p1= ~(p1>>31);
2750 if(p2&256) p2= ~(p2>>31);
2752 src[y*stride-1] = p1;
2753 src[y*stride+0] = p2;
2755 ad1= FFABS(d1)>>1;
2757 d2= av_clip((p0-p3)/4, -ad1, ad1);
2759 src[y*stride-2] = p0 - d2;
2760 src[y*stride+1] = p3 + d2;
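/* H.261 loop filter: separable [1 2 1]/4 smoothing of one 8x8 block; the
 * border rows and columns are passed through unfiltered (they are scaled
 * by 4 into temp[] and scaled straight back). */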
2765 static void h261_loop_filter_c(uint8_t *src, int stride){
2770 temp[x ] = 4*src[x ];
2771 temp[x + 7*8] = 4*src[x + 7*stride];
2775 xy = y * stride + x;
2776 yz = y * 8 + x;
2777 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2782 src[ y*stride] = (temp[ y*8] + 2)>>2;
2783 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2785 xy = y * stride + x;
2786 yz = y * 8 + x;
2787 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
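/* SAD comparison functions for motion estimation: pix_abs{16,8}_c are
 * plain sums of absolute differences; the _x2/_y2/_xy2 variants compare
 * against a horizontally/vertically/diagonally half-pel averaged
 * reference. */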
2792 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2798 s += abs(pix1[0] - pix2[0]);
2799 s += abs(pix1[1] - pix2[1]);
2800 s += abs(pix1[2] - pix2[2]);
2801 s += abs(pix1[3] - pix2[3]);
2802 s += abs(pix1[4] - pix2[4]);
2803 s += abs(pix1[5] - pix2[5]);
2804 s += abs(pix1[6] - pix2[6]);
2805 s += abs(pix1[7] - pix2[7]);
2806 s += abs(pix1[8] - pix2[8]);
2807 s += abs(pix1[9] - pix2[9]);
2808 s += abs(pix1[10] - pix2[10]);
2809 s += abs(pix1[11] - pix2[11]);
2810 s += abs(pix1[12] - pix2[12]);
2811 s += abs(pix1[13] - pix2[13]);
2812 s += abs(pix1[14] - pix2[14]);
2813 s += abs(pix1[15] - pix2[15]);
2820 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2826 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2827 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2828 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2829 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2830 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2831 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2832 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2833 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2834 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2835 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2836 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2837 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2838 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2839 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2840 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2841 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2848 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2851 uint8_t *pix3 = pix2 + line_size;
2855 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2856 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2857 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2858 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2859 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2860 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2861 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2862 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2863 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2864 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2865 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2866 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2867 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2868 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2869 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2870 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2878 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2881 uint8_t *pix3 = pix2 + line_size;
2885 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2886 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2887 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2888 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2889 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2890 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2891 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2892 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2893 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2894 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2895 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2896 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2897 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2898 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2899 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2900 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2908 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2914 s += abs(pix1[0] - pix2[0]);
2915 s += abs(pix1[1] - pix2[1]);
2916 s += abs(pix1[2] - pix2[2]);
2917 s += abs(pix1[3] - pix2[3]);
2918 s += abs(pix1[4] - pix2[4]);
2919 s += abs(pix1[5] - pix2[5]);
2920 s += abs(pix1[6] - pix2[6]);
2921 s += abs(pix1[7] - pix2[7]);
2928 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2934 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2935 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2936 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2937 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2938 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2939 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2940 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2941 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2948 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2951 uint8_t *pix3 = pix2 + line_size;
2955 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2956 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2957 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2958 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2959 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2960 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2961 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2962 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2970 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2973 uint8_t *pix3 = pix2 + line_size;
2977 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2978 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2979 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2980 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2981 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2982 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2983 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2984 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
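/* nsse: a noise-preserving SSE variant. score1 is the plain SSE, score2
 * the difference in local 2x2 gradient energy (texture/noise) between the
 * two blocks; avctx->nsse_weight trades one against the other (8 is used
 * when no context is available). */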
2992 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2993 MpegEncContext *c = v;
2999 for(x=0; x<16; x++){
3000 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3003 for(x=0; x<15; x++){
3004 score2+= FFABS( s1[x ] - s1[x +stride]
3005 - s1[x+1] + s1[x+1+stride])
3006 -FFABS( s2[x ] - s2[x +stride]
3007 - s2[x+1] + s2[x+1+stride]);
3014 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3015 else return score1 + FFABS(score2)*8;
3018 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3019 MpegEncContext *c = v;
3026 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3030 score2+= FFABS( s1[x ] - s1[x +stride]
3031 - s1[x+1] + s1[x+1+stride])
3032 -FFABS( s2[x ] - s2[x +stride]
3033 - s2[x+1] + s2[x+1+stride]);
3040 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3041 else return score1 + FFABS(score2)*8;
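/* try_8x8basis/add_8x8basis support the encoder's quantization noise
 * shaping: they estimate, respectively apply, the effect of adding a
 * scaled DCT basis function to the residual. */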
3044 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3048 for(i=0; i<8*8; i++){
3049 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3050 int w= weight[i];
3051 b>>= RECON_SHIFT;
3052 assert(-512<b && b<512);
3054 sum += (w*b)*(w*b)>>4;
3059 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3062 for(i=0; i<8*8; i++){
3063 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3067 /**
3068 * Permutes an 8x8 block according to the given permutation vector.
3069 * @param block the block which will be permuted
3070 * @param permutation the permutation vector
3071 * @param last the index of the last nonzero coefficient in scantable order; only used to speed the permutation up
3072 * @param scantable the scantable in use; only used to speed the permutation up, the block is not
3073 * (inverse-)permuted into scantable order!
3074 */
3075 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3081 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3083 for(i=0; i<=last; i++){
3084 const int j= scantable[i];
3085 temp[j]= block[j];
3086 block[j]=0;
3087 }
3089 for(i=0; i<=last; i++){
3090 const int j= scantable[i];
3091 const int perm_j= permutation[j];
3092 block[perm_j]= temp[j];
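/* Illustrative use (hypothetical helper, modeled on the encoder's
 * quantizer; every name other than ff_block_permute() is an assumption,
 * not part of this file): */
#if 0
static void example_permute(DCTELEM block[64], DSPContext *dsp,
                            ScanTable *scan, int last_index)
{
    /* re-order a quantized block from the IDCT's input permutation back
     * to natural order before entropy coding */
    ff_block_permute(block, dsp->idct_permutation, scan->scantable,
                     last_index);
}
#endif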
3096 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3097 return 0;
3098 }
3100 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3103 memset(cmp, 0, sizeof(void*)*6);
3111 cmp[i]= c->hadamard8_diff[i];
3117 cmp[i]= c->dct_sad[i];
3120 cmp[i]= c->dct264_sad[i];
3123 cmp[i]= c->dct_max[i];
3126 cmp[i]= c->quant_psnr[i];
3155 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3160 static void clear_block_c(DCTELEM *block)
3162 memset(block, 0, sizeof(DCTELEM)*64);
3166 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3168 static void clear_blocks_c(DCTELEM *blocks)
3170 memset(blocks, 0, sizeof(DCTELEM)*6*64);
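/* add_bytes/diff_bytes handle sizeof(long) pixels per iteration with a
 * carry-less SWAR trick: the low 7 bits of every byte are added via
 * (a&pb_7f)+(b&pb_7f) and the top bit is patched in with ((a^b)&pb_80),
 * so no carry can leak into the neighbouring byte. */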
3173 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3175 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3176 long a = *(long*)(src+i);
3177 long b = *(long*)(dst+i);
3178 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3181 dst[i+0] += src[i+0];
3184 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3186 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3187 long a = *(long*)(src1+i);
3188 long b = *(long*)(src2+i);
3189 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3192 dst[i] = src1[i]+src2[i];
3195 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3197 #if !HAVE_FAST_UNALIGNED
3198 if((long)src2 & (sizeof(long)-1)){
3199 for(i=0; i+7<w; i+=8){
3200 dst[i+0] = src1[i+0]-src2[i+0];
3201 dst[i+1] = src1[i+1]-src2[i+1];
3202 dst[i+2] = src1[i+2]-src2[i+2];
3203 dst[i+3] = src1[i+3]-src2[i+3];
3204 dst[i+4] = src1[i+4]-src2[i+4];
3205 dst[i+5] = src1[i+5]-src2[i+5];
3206 dst[i+6] = src1[i+6]-src2[i+6];
3207 dst[i+7] = src1[i+7]-src2[i+7];
3211 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3212 long a = *(long*)(src1+i);
3213 long b = *(long*)(src2+i);
3214 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3217 dst[i+0] = src1[i+0]-src2[i+0];
3220 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
3228 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3237 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
3245 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3255 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
3258 for(i=0; i<w-1; i++){
3285 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
3315 #define BUTTERFLY2(o1,o2,i1,i2) \
3316 o1= (i1)+(i2);\
3317 o2= (i1)-(i2);
3319 #define BUTTERFLY1(x,y) \
3320 {\
3321 int a,b;\
3322 a= x;\
3323 b= y;\
3324 x= a+b;\
3325 y= a-b;\
3326 }
3328 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
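/* hadamard8_diff: 8x8 SATD. A horizontal and a vertical 8-point Hadamard
 * butterfly network is applied to the src-dst difference and the absolute
 * transform coefficients are summed; the intra variant transforms the
 * source itself and subtracts the DC term so the block mean does not
 * contribute. */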
3330 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3338 //FIXME try pointer walks
3339 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3340 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3341 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3342 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3344 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3345 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3346 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3347 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3349 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3350 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3351 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3352 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3356 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3357 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3358 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3359 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3361 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3362 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3363 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3364 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3367 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3368 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3369 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3370 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3376 printf("MAX:%d\n", maxi);
3382 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3390 //FIXME try pointer walks
3391 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3392 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3393 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3394 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3396 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3397 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3398 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3399 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3401 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3402 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3403 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3404 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3408 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3409 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3410 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3411 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3413 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3414 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3415 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3416 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3419 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3420 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3421 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3422 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3425 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3430 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3431 MpegEncContext * const s= (MpegEncContext *)c;
3432 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3436 s->dsp.diff_pixels(temp, src1, src2, stride);
3438 return s->dsp.sum_abs_dctelem(temp);
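/* DCT8_1D is one 1-D pass of the H.264 high-profile 8x8 integer transform;
 * dct264_sad below applies it in both directions and sums the absolute
 * coefficients. */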
3442 #define DCT8_1D {\
3443 const int s07 = SRC(0) + SRC(7);\
3444 const int s16 = SRC(1) + SRC(6);\
3445 const int s25 = SRC(2) + SRC(5);\
3446 const int s34 = SRC(3) + SRC(4);\
3447 const int a0 = s07 + s34;\
3448 const int a1 = s16 + s25;\
3449 const int a2 = s07 - s34;\
3450 const int a3 = s16 - s25;\
3451 const int d07 = SRC(0) - SRC(7);\
3452 const int d16 = SRC(1) - SRC(6);\
3453 const int d25 = SRC(2) - SRC(5);\
3454 const int d34 = SRC(3) - SRC(4);\
3455 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3456 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3457 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3458 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3459 DST(0, a0 + a1 ) ;\
3460 DST(1, a4 + (a7>>2)) ;\
3461 DST(2, a2 + (a3>>1)) ;\
3462 DST(3, a5 + (a6>>2)) ;\
3463 DST(4, a0 - a1 ) ;\
3464 DST(5, a6 - (a5>>2)) ;\
3465 DST(6, (a2>>1) - a3 ) ;\
3466 DST(7, (a4>>2) - a7 ) ;\
3469 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3470 MpegEncContext * const s= (MpegEncContext *)c;
3475 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3477 #define SRC(x) dct[i][x]
3478 #define DST(x,v) dct[i][x]= v
3479 for( i = 0; i < 8; i++ )
3480 DCT8_1D
3481 #undef SRC
3482 #undef DST
3484 #define SRC(x) dct[x][i]
3485 #define DST(x,v) sum += FFABS(v)
3486 for( i = 0; i < 8; i++ )
3487 DCT8_1D
3488 #undef SRC
3489 #undef DST
3491 return sum;
3494 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3495 MpegEncContext * const s= (MpegEncContext *)c;
3496 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3501 s->dsp.diff_pixels(temp, src1, src2, stride);
3505 sum= FFMAX(sum, FFABS(temp[i]));
3510 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3511 MpegEncContext * const s= (MpegEncContext *)c;
3512 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3513 DCTELEM * const bak = temp+64;
3519 s->dsp.diff_pixels(temp, src1, src2, stride);
3521 memcpy(bak, temp, 64*sizeof(DCTELEM));
3523 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3524 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3525 ff_simple_idct(temp); //FIXME
3528 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
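/* rd8x8 measures an actual rate-distortion cost: the residual is
 * quantized, its VLC bits are counted, the block is dequantized and
 * inverse transformed, and the resulting SSE is combined with the bit
 * count weighted by roughly 0.85*qscale^2 (the *109 >>7 factor). */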
3533 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3534 MpegEncContext * const s= (MpegEncContext *)c;
3535 const uint8_t *scantable= s->intra_scantable.permutated;
3536 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3537 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3538 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3539 int i, last, run, bits, level, distortion, start_i;
3540 const int esc_length= s->ac_esc_length;
3542 uint8_t * last_length;
3546 copy_block8(lsrc1, src1, 8, stride, 8);
3547 copy_block8(lsrc2, src2, 8, stride, 8);
3549 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3551 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3557 length = s->intra_ac_vlc_length;
3558 last_length= s->intra_ac_vlc_last_length;
3559 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3562 length = s->inter_ac_vlc_length;
3563 last_length= s->inter_ac_vlc_last_length;
3568 for(i=start_i; i<last; i++){
3569 int j= scantable[i];
3574 if((level&(~127)) == 0){
3575 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3584 level= temp[i] + 64;
3588 if((level&(~127)) == 0){
3589 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3597 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3599 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3602 s->dsp.idct_add(lsrc2, 8, temp);
3604 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3606 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3609 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3610 MpegEncContext * const s= (MpegEncContext *)c;
3611 const uint8_t *scantable= s->intra_scantable.permutated;
3612 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3613 int i, last, run, bits, level, start_i;
3614 const int esc_length= s->ac_esc_length;
3616 uint8_t * last_length;
3620 s->dsp.diff_pixels(temp, src1, src2, stride);
3622 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3628 length = s->intra_ac_vlc_length;
3629 last_length= s->intra_ac_vlc_last_length;
3630 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3633 length = s->inter_ac_vlc_length;
3634 last_length= s->inter_ac_vlc_last_length;
3639 for(i=start_i; i<last; i++){
3640 int j= scantable[i];
3645 if((level&(~127)) == 0){
3646 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3655 level= temp[i] + 64;
3659 if((level&(~127)) == 0){
3660 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
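/* vsad/vsse score vertical activity: the sum of absolute resp. squared
 * differences between vertically adjacent lines, of the prediction error
 * for the inter versions and of the source itself for the intra ones. */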
3668 #define VSAD_INTRA(size) \
3669 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3673 for(y=1; y<h; y++){ \
3674 for(x=0; x<size; x+=4){ \
3675 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
3676 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
3686 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3691 for(x=0; x<16; x++){
3692 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3701 #define SQ(a) ((a)*(a))
3702 #define VSSE_INTRA(size) \
3703 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3707 for(y=1; y<h; y++){ \
3708 for(x=0; x<size; x+=4){ \
3709 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
3710 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
3720 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3725 for(x=0; x<16; x++){
3726 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3735 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3739 for(i=0; i<size; i++)
3740 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
3744 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3745 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3746 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3748 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3750 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3751 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3752 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3753 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
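/* Float vector primitives, used mainly by the audio codecs: element-wise
 * multiplies, windowed overlap-add (ff_vector_fmul_window combines the two
 * MDCT halves as s0*wj - s1*wi and s0*wi + s1*wj), scalar products and
 * clipping. */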
3755 static void vector_fmul_c(float *dst, const float *src, int len){
3756 int i;
3757 for(i=0; i<len; i++)
3758 dst[i] *= src[i];
3759 }
3761 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3762 int i;
3763 src1 += len-1;
3764 for(i=0; i<len; i++)
3765 dst[i] = src0[i] * src1[-i];
3768 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
3770 for(i=0; i<len; i++)
3771 dst[i] = src0[i] * src1[i] + src2[i];
3774 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
3775 int i,j;
3776 dst += len;
3777 win += len;
3778 src0+= len;
3779 for(i=-len, j=len-1; i<0; i++, j--) {
3780 float s0 = src0[i];
3781 float s1 = src1[j];
3782 float wi = win[i];
3783 float wj = win[j];
3784 dst[i] = s0*wj - s1*wi + add_bias;
3785 dst[j] = s0*wi + s1*wj + add_bias;
3789 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
3790 int len)
3791 {
3792 int i;
3793 for (i = 0; i < len; i++)
3794 dst[i] = src[i] * mul;
3795 }
3797 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
3798 const float **sv, float mul, int len)
3799 {
3800 int i;
3801 for (i = 0; i < len; i += 2, sv++) {
3802 dst[i ] = src[i ] * sv[0][0] * mul;
3803 dst[i+1] = src[i+1] * sv[0][1] * mul;
3807 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
3808 const float **sv, float mul, int len)
3809 {
3810 int i;
3811 for (i = 0; i < len; i += 4, sv++) {
3812 dst[i ] = src[i ] * sv[0][0] * mul;
3813 dst[i+1] = src[i+1] * sv[0][1] * mul;
3814 dst[i+2] = src[i+2] * sv[0][2] * mul;
3815 dst[i+3] = src[i+3] * sv[0][3] * mul;
3819 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
3820 int len)
3821 {
3822 int i;
3823 for (i = 0; i < len; i += 2, sv++) {
3824 dst[i ] = sv[0][0] * mul;
3825 dst[i+1] = sv[0][1] * mul;
3829 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
3830 int len)
3831 {
3832 int i;
3833 for (i = 0; i < len; i += 4, sv++) {
3834 dst[i ] = sv[0][0] * mul;
3835 dst[i+1] = sv[0][1] * mul;
3836 dst[i+2] = sv[0][2] * mul;
3837 dst[i+3] = sv[0][3] * mul;
3841 static void butterflies_float_c(float *restrict v1, float *restrict v2,
3842 int len)
3843 {
3844 int i;
3845 for (i = 0; i < len; i++) {
3846 float t = v1[i] - v2[i];
3847 v1[i] += v2[i];
3848 v2[i] = t;
3849 }
3850 }
3852 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
3853 {
3854 float p = 0.0;
3855 int i;
3857 for (i = 0; i < len; i++)
3858 p += v1[i] * v2[i];
3860 return p;
3861 }
3863 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
3865 for(i=0; i<len; i++)
3866 dst[i] = src[i] * mul;
3869 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
3870 uint32_t maxi, uint32_t maxisign)
3873 if(a > mini) return mini;
3874 else if((a^(1<<31)) > maxisign) return maxi;
3875 else return a;
3876 }
3878 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
3880 uint32_t mini = *(uint32_t*)min;
3881 uint32_t maxi = *(uint32_t*)max;
3882 uint32_t maxisign = maxi ^ (1<<31);
3883 uint32_t *dsti = (uint32_t*)dst;
3884 const uint32_t *srci = (const uint32_t*)src;
3885 for(i=0; i<len; i+=8) {
3886 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
3887 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
3888 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
3889 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
3890 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
3891 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
3892 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
3893 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
3896 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
3897 int i;
3898 if(min < 0 && max > 0) {
3899 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
3900 } else {
3901 for(i=0; i < len; i+=8) {
3902 dst[i ] = av_clipf(src[i ], min, max);
3903 dst[i + 1] = av_clipf(src[i + 1], min, max);
3904 dst[i + 2] = av_clipf(src[i + 2], min, max);
3905 dst[i + 3] = av_clipf(src[i + 3], min, max);
3906 dst[i + 4] = av_clipf(src[i + 4], min, max);
3907 dst[i + 5] = av_clipf(src[i + 5], min, max);
3908 dst[i + 6] = av_clipf(src[i + 6], min, max);
3909 dst[i + 7] = av_clipf(src[i + 7], min, max);
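/* float_to_int16: as far as the constants tell, callers feed pre-scaled
 * and biased floats (cf. the add_bias argument of ff_vector_fmul_window)
 * so that the low 16 bits of the IEEE-754 bit pattern already hold the
 * sample; the 0x43c0ffff test appears to implement saturation for
 * out-of-range values. */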
3914 static av_always_inline int float_to_int16_one(const float *src){
3915 int_fast32_t tmp = *(const int32_t*)src;
3916 if(tmp & 0xf0000){
3917 tmp = (0x43c0ffff - tmp)>>31;
3918 // is this faster on some gcc/cpu combinations?
3919 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3920 // else tmp = 0;
3921 }
3922 return tmp - 0x8000;
3925 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
3927 for(i=0; i<len; i++)
3928 dst[i] = float_to_int16_one(src+i);
3931 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
3932 int i,j,c;
3933 if(channels==2){
3934 for(i=0; i<len; i++){
3935 dst[2*i] = float_to_int16_one(src[0]+i);
3936 dst[2*i+1] = float_to_int16_one(src[1]+i);
3937 }
3938 }else{
3939 for(c=0; c<channels; c++)
3940 for(i=0, j=c; i<len; i++, j+=channels)
3941 dst[j] = float_to_int16_one(src[c]+i);
3945 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
3946 {
3947 int res = 0;
3949 while (order--)
3950 res += (*v1++ * *v2++) >> shift;
3952 return res;
3953 }
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;

    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }

    return res;
}
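/* Returns the dot product of v1 and v2 while updating v1 in place with
 * v1 += mul*v3, so the correlation and the filter-adaptation step share
 * a single pass over the data. */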
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
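/* The weights are round(2048*sqrt(2)*cos(k*pi/16)), i.e. the 8-point IDCT
 * basis scaled by 2^11*sqrt(2); W0 equals W4 because cos(4*pi/16) is
 * 1/sqrt(2). The constant 181 used below is round(256/sqrt(2)). */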
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
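/* The 2-D IDCT is separable: ff_wmv2_idct_c below runs wmv2_idct_row on
 * each of the 8 rows, then wmv2_idct_col on each of the 8 columns. The
 * offsets (1<<7) and (1<<13) implement round-to-nearest for the final
 * 8- and 14-bit right shifts. */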
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
/* init static data */
av_cold void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}
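/* ff_cropTbl is laid out as MAX_NEG_CROP zeros, the identity 0..255, then
 * MAX_NEG_CROP entries of 255, so that cm[x] with
 * cm = ff_cropTbl + MAX_NEG_CROP clamps x to 0..255 with a single table
 * lookup instead of two compares. */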
int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED(16, int, aligned);

    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#if CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct    = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct    = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct    = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
            c->idct     = ff_bink_idct_c;
            c->idct_add = ff_bink_idct_add_c;
            c->idct_put = ff_bink_idct_put_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_block = clear_block_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->scale_block = scale_block_c;
    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
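    /* e.g. dspfunc(put, 0, 16) expands to
     *   c->put_pixels_tab[0][0] = put_pixels16_c;
     *   c->put_pixels_tab[0][1] = put_pixels16_x2_c;
     * etc.; index [0..3] encodes the half-pel position: none, x, y, xy. */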
    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);

#undef dspfunc
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
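    /* The tpel (third-pel, SVQ3-style) tables are indexed by x + 4*y with
     * x,y in {0,1,2}, so entries 3, 7 and 11..15 stay unused; _mcXY names
     * the horizontal (X) and vertical (Y) fractional offset. */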
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
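    /* Same token-pasting scheme for quarter-pel MC: all 16 subpel
     * positions x + 4*y are valid here, so the tables are fully populated
     * with _mc00 .. _mc33. */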
    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
    c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;

    c->draw_edges = draw_edges_c;
#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_VC1_DECODER
    ff_vc1dsp_init(c,avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#endif
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif
    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;
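    /* SET_CMP_FUNC(name) fills slot [0] with the 16x16 and slot [1] with
     * the 8x8 variant of a comparison function, matching the sad/pix_abs
     * convention used above. */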
    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
    ff_dsputil_init_dwt(c);

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
#if CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;
    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif

    c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = ff_vector_fmul_window_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->vector_clipf = vector_clipf_c;
    c->float_to_int16 = ff_float_to_int16_c;
    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;

    c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
    c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;

    c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
    c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);
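    /* The C versions installed above act as the baseline; each enabled
     * architecture init overrides individual pointers with optimized
     * versions, so the fixups below only fill slots that are still NULL. */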
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }
    c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];

    c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
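    /* idct_permutation maps natural coefficient order to whatever input
     * order the selected IDCT prefers, so the decoder can reorder
     * coefficients once at parse time instead of per IDCT call; e.g.
     * FF_TRANSPOSE_IDCT_PERM swaps the row and column index of each
     * coefficient. */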
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}