3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "simple_idct.h"
36 #include "mpegvideo.h"
44 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
45 uint32_t ff_squareTbl[512] = {0, };
47 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
48 #define pb_7f (~0UL/255 * 0x7f)
49 #define pb_80 (~0UL/255 * 0x80)
51 const uint8_t ff_zigzag_direct[64] = {
52 0, 1, 8, 16, 9, 2, 3, 10,
53 17, 24, 32, 25, 18, 11, 4, 5,
54 12, 19, 26, 33, 40, 48, 41, 34,
55 27, 20, 13, 6, 7, 14, 21, 28,
56 35, 42, 49, 56, 57, 50, 43, 36,
57 29, 22, 15, 23, 30, 37, 44, 51,
58 58, 59, 52, 45, 38, 31, 39, 46,
59 53, 60, 61, 54, 47, 55, 62, 63
62 /* Specific zigzag scan for 248 idct. NOTE that unlike the
63 specification, we interleave the fields */
64 const uint8_t ff_zigzag248_direct[64] = {
65 0, 8, 1, 9, 16, 24, 2, 10,
66 17, 25, 32, 40, 48, 56, 33, 41,
67 18, 26, 3, 11, 4, 12, 19, 27,
68 34, 42, 49, 57, 50, 58, 35, 43,
69 20, 28, 5, 13, 6, 14, 21, 29,
70 36, 44, 51, 59, 52, 60, 37, 45,
71 22, 30, 7, 15, 23, 31, 38, 46,
72 53, 61, 54, 62, 39, 47, 55, 63,
75 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
76 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
78 const uint8_t ff_alternate_horizontal_scan[64] = {
79 0, 1, 2, 3, 8, 9, 16, 17,
80 10, 11, 4, 5, 6, 7, 15, 14,
81 13, 12, 19, 18, 24, 25, 32, 33,
82 26, 27, 20, 21, 22, 23, 28, 29,
83 30, 31, 34, 35, 40, 41, 48, 49,
84 42, 43, 36, 37, 38, 39, 44, 45,
85 46, 47, 50, 51, 56, 57, 58, 59,
86 52, 53, 54, 55, 60, 61, 62, 63,
89 const uint8_t ff_alternate_vertical_scan[64] = {
90 0, 8, 16, 24, 1, 9, 2, 10,
91 17, 25, 32, 40, 48, 56, 57, 49,
92 41, 33, 26, 18, 3, 11, 4, 12,
93 19, 27, 34, 42, 50, 58, 35, 43,
94 51, 59, 20, 28, 5, 13, 6, 14,
95 21, 29, 36, 44, 52, 60, 37, 45,
96 53, 61, 22, 30, 7, 15, 23, 31,
97 38, 46, 54, 62, 39, 47, 55, 63,
100 /* Input permutation for the simple_idct_mmx */
101 static const uint8_t simple_mmx_permutation[64]={
102 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
103 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
104 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
105 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
106 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
107 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
108 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
109 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
112 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
114 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
118 st->scantable= src_scantable;
122 j = src_scantable[i];
123 st->permutated[i] = permutation[j];
132 j = st->permutated[i];
134 st->raster_end[i]= end;
138 static int pix_sum_c(uint8_t * pix, int line_size)
143 for (i = 0; i < 16; i++) {
144 for (j = 0; j < 16; j += 8) {
155 pix += line_size - 16;
160 static int pix_norm1_c(uint8_t * pix, int line_size)
163 uint32_t *sq = ff_squareTbl + 256;
166 for (i = 0; i < 16; i++) {
167 for (j = 0; j < 16; j += 8) {
178 #if LONG_MAX > 2147483647
179 register uint64_t x=*(uint64_t*)pix;
181 s += sq[(x>>8)&0xff];
182 s += sq[(x>>16)&0xff];
183 s += sq[(x>>24)&0xff];
184 s += sq[(x>>32)&0xff];
185 s += sq[(x>>40)&0xff];
186 s += sq[(x>>48)&0xff];
187 s += sq[(x>>56)&0xff];
189 register uint32_t x=*(uint32_t*)pix;
191 s += sq[(x>>8)&0xff];
192 s += sq[(x>>16)&0xff];
193 s += sq[(x>>24)&0xff];
194 x=*(uint32_t*)(pix+4);
196 s += sq[(x>>8)&0xff];
197 s += sq[(x>>16)&0xff];
198 s += sq[(x>>24)&0xff];
203 pix += line_size - 16;
208 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
211 for(i=0; i+8<=w; i+=8){
212 dst[i+0]= av_bswap32(src[i+0]);
213 dst[i+1]= av_bswap32(src[i+1]);
214 dst[i+2]= av_bswap32(src[i+2]);
215 dst[i+3]= av_bswap32(src[i+3]);
216 dst[i+4]= av_bswap32(src[i+4]);
217 dst[i+5]= av_bswap32(src[i+5]);
218 dst[i+6]= av_bswap32(src[i+6]);
219 dst[i+7]= av_bswap32(src[i+7]);
222 dst[i+0]= av_bswap32(src[i+0]);
226 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
229 uint32_t *sq = ff_squareTbl + 256;
232 for (i = 0; i < h; i++) {
233 s += sq[pix1[0] - pix2[0]];
234 s += sq[pix1[1] - pix2[1]];
235 s += sq[pix1[2] - pix2[2]];
236 s += sq[pix1[3] - pix2[3]];
243 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
246 uint32_t *sq = ff_squareTbl + 256;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[0] - pix2[0]];
251 s += sq[pix1[1] - pix2[1]];
252 s += sq[pix1[2] - pix2[2]];
253 s += sq[pix1[3] - pix2[3]];
254 s += sq[pix1[4] - pix2[4]];
255 s += sq[pix1[5] - pix2[5]];
256 s += sq[pix1[6] - pix2[6]];
257 s += sq[pix1[7] - pix2[7]];
264 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
267 uint32_t *sq = ff_squareTbl + 256;
270 for (i = 0; i < h; i++) {
271 s += sq[pix1[ 0] - pix2[ 0]];
272 s += sq[pix1[ 1] - pix2[ 1]];
273 s += sq[pix1[ 2] - pix2[ 2]];
274 s += sq[pix1[ 3] - pix2[ 3]];
275 s += sq[pix1[ 4] - pix2[ 4]];
276 s += sq[pix1[ 5] - pix2[ 5]];
277 s += sq[pix1[ 6] - pix2[ 6]];
278 s += sq[pix1[ 7] - pix2[ 7]];
279 s += sq[pix1[ 8] - pix2[ 8]];
280 s += sq[pix1[ 9] - pix2[ 9]];
281 s += sq[pix1[10] - pix2[10]];
282 s += sq[pix1[11] - pix2[11]];
283 s += sq[pix1[12] - pix2[12]];
284 s += sq[pix1[13] - pix2[13]];
285 s += sq[pix1[14] - pix2[14]];
286 s += sq[pix1[15] - pix2[15]];
294 /* draw the edges of width 'w' of an image of size width, height */
295 //FIXME check that this is ok for mpeg4 interlaced
296 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
298 uint8_t *ptr, *last_line;
301 last_line = buf + (height - 1) * wrap;
304 memcpy(buf - (i + 1) * wrap, buf, width);
305 memcpy(last_line + (i + 1) * wrap, last_line, width);
309 for(i=0;i<height;i++) {
310 memset(ptr - w, ptr[0], w);
311 memset(ptr + width, ptr[width-1], w);
316 memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
317 memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
318 memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
319 memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
324 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
325 * @param buf destination buffer
326 * @param src source buffer
327 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
328 * @param block_w width of block
329 * @param block_h height of block
330 * @param src_x x coordinate of the top left sample of the block in the source buffer
331 * @param src_y y coordinate of the top left sample of the block in the source buffer
332 * @param w width of the source buffer
333 * @param h height of the source buffer
335 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
336 int src_x, int src_y, int w, int h){
338 int start_y, start_x, end_y, end_x;
341 src+= (h-1-src_y)*linesize;
343 }else if(src_y<=-block_h){
344 src+= (1-block_h-src_y)*linesize;
350 }else if(src_x<=-block_w){
351 src+= (1-block_w-src_x);
355 start_y= FFMAX(0, -src_y);
356 start_x= FFMAX(0, -src_x);
357 end_y= FFMIN(block_h, h-src_y);
358 end_x= FFMIN(block_w, w-src_x);
360 // copy existing part
361 for(y=start_y; y<end_y; y++){
362 for(x=start_x; x<end_x; x++){
363 buf[x + y*linesize]= src[x + y*linesize];
368 for(y=0; y<start_y; y++){
369 for(x=start_x; x<end_x; x++){
370 buf[x + y*linesize]= buf[x + start_y*linesize];
375 for(y=end_y; y<block_h; y++){
376 for(x=start_x; x<end_x; x++){
377 buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
381 for(y=0; y<block_h; y++){
383 for(x=0; x<start_x; x++){
384 buf[x + y*linesize]= buf[start_x + y*linesize];
388 for(x=end_x; x<block_w; x++){
389 buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
394 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
398 /* read the pixels */
400 block[0] = pixels[0];
401 block[1] = pixels[1];
402 block[2] = pixels[2];
403 block[3] = pixels[3];
404 block[4] = pixels[4];
405 block[5] = pixels[5];
406 block[6] = pixels[6];
407 block[7] = pixels[7];
413 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
414 const uint8_t *s2, int stride){
417 /* read the pixels */
419 block[0] = s1[0] - s2[0];
420 block[1] = s1[1] - s2[1];
421 block[2] = s1[2] - s2[2];
422 block[3] = s1[3] - s2[3];
423 block[4] = s1[4] - s2[4];
424 block[5] = s1[5] - s2[5];
425 block[6] = s1[6] - s2[6];
426 block[7] = s1[7] - s2[7];
434 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
438 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
440 /* read the pixels */
442 pixels[0] = cm[block[0]];
443 pixels[1] = cm[block[1]];
444 pixels[2] = cm[block[2]];
445 pixels[3] = cm[block[3]];
446 pixels[4] = cm[block[4]];
447 pixels[5] = cm[block[5]];
448 pixels[6] = cm[block[6]];
449 pixels[7] = cm[block[7]];
456 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
460 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
462 /* read the pixels */
464 pixels[0] = cm[block[0]];
465 pixels[1] = cm[block[1]];
466 pixels[2] = cm[block[2]];
467 pixels[3] = cm[block[3]];
474 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
478 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
480 /* read the pixels */
482 pixels[0] = cm[block[0]];
483 pixels[1] = cm[block[1]];
490 static void put_signed_pixels_clamped_c(const DCTELEM *block,
491 uint8_t *restrict pixels,
496 for (i = 0; i < 8; i++) {
497 for (j = 0; j < 8; j++) {
500 else if (*block > 127)
503 *pixels = (uint8_t)(*block + 128);
507 pixels += (line_size - 8);
511 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
516 /* read the pixels */
518 pixels[0] = block[0];
519 pixels[1] = block[1];
520 pixels[2] = block[2];
521 pixels[3] = block[3];
522 pixels[4] = block[4];
523 pixels[5] = block[5];
524 pixels[6] = block[6];
525 pixels[7] = block[7];
532 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
536 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
538 /* read the pixels */
540 pixels[0] = cm[pixels[0] + block[0]];
541 pixels[1] = cm[pixels[1] + block[1]];
542 pixels[2] = cm[pixels[2] + block[2]];
543 pixels[3] = cm[pixels[3] + block[3]];
544 pixels[4] = cm[pixels[4] + block[4]];
545 pixels[5] = cm[pixels[5] + block[5]];
546 pixels[6] = cm[pixels[6] + block[6]];
547 pixels[7] = cm[pixels[7] + block[7]];
553 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
557 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
559 /* read the pixels */
561 pixels[0] = cm[pixels[0] + block[0]];
562 pixels[1] = cm[pixels[1] + block[1]];
563 pixels[2] = cm[pixels[2] + block[2]];
564 pixels[3] = cm[pixels[3] + block[3]];
570 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
574 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
576 /* read the pixels */
578 pixels[0] = cm[pixels[0] + block[0]];
579 pixels[1] = cm[pixels[1] + block[1]];
585 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
589 pixels[0] += block[0];
590 pixels[1] += block[1];
591 pixels[2] += block[2];
592 pixels[3] += block[3];
593 pixels[4] += block[4];
594 pixels[5] += block[5];
595 pixels[6] += block[6];
596 pixels[7] += block[7];
602 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
606 pixels[0] += block[0];
607 pixels[1] += block[1];
608 pixels[2] += block[2];
609 pixels[3] += block[3];
615 static int sum_abs_dctelem_c(DCTELEM *block)
619 sum+= FFABS(block[i]);
623 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
627 for (i = 0; i < h; i++) {
628 memset(block, value, 16);
633 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
637 for (i = 0; i < h; i++) {
638 memset(block, value, 8);
643 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
646 uint16_t *dst1 = (uint16_t *) dst;
647 uint16_t *dst2 = (uint16_t *)(dst + linesize);
649 for (j = 0; j < 8; j++) {
650 for (i = 0; i < 8; i++) {
651 dst1[i] = dst2[i] = src[i] * 0x0101;
661 #define PIXOP2(OPNAME, OP) \
662 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
666 OP(*((uint64_t*)block), AV_RN64(pixels));\
672 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
676 const uint64_t a= AV_RN64(pixels );\
677 const uint64_t b= AV_RN64(pixels+1);\
678 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
684 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
688 const uint64_t a= AV_RN64(pixels );\
689 const uint64_t b= AV_RN64(pixels+1);\
690 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
696 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
700 const uint64_t a= AV_RN64(pixels );\
701 const uint64_t b= AV_RN64(pixels+line_size);\
702 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
708 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
712 const uint64_t a= AV_RN64(pixels );\
713 const uint64_t b= AV_RN64(pixels+line_size);\
714 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
720 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
723 const uint64_t a= AV_RN64(pixels );\
724 const uint64_t b= AV_RN64(pixels+1);\
725 uint64_t l0= (a&0x0303030303030303ULL)\
726 + (b&0x0303030303030303ULL)\
727 + 0x0202020202020202ULL;\
728 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
729 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
733 for(i=0; i<h; i+=2){\
734 uint64_t a= AV_RN64(pixels );\
735 uint64_t b= AV_RN64(pixels+1);\
736 l1= (a&0x0303030303030303ULL)\
737 + (b&0x0303030303030303ULL);\
738 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
739 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
740 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
743 a= AV_RN64(pixels );\
744 b= AV_RN64(pixels+1);\
745 l0= (a&0x0303030303030303ULL)\
746 + (b&0x0303030303030303ULL)\
747 + 0x0202020202020202ULL;\
748 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
749 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
750 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
756 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
759 const uint64_t a= AV_RN64(pixels );\
760 const uint64_t b= AV_RN64(pixels+1);\
761 uint64_t l0= (a&0x0303030303030303ULL)\
762 + (b&0x0303030303030303ULL)\
763 + 0x0101010101010101ULL;\
764 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
765 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
769 for(i=0; i<h; i+=2){\
770 uint64_t a= AV_RN64(pixels );\
771 uint64_t b= AV_RN64(pixels+1);\
772 l1= (a&0x0303030303030303ULL)\
773 + (b&0x0303030303030303ULL);\
774 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
775 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
776 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
779 a= AV_RN64(pixels );\
780 b= AV_RN64(pixels+1);\
781 l0= (a&0x0303030303030303ULL)\
782 + (b&0x0303030303030303ULL)\
783 + 0x0101010101010101ULL;\
784 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
785 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
786 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
792 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
793 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
794 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
795 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
796 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
797 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
798 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
800 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
801 #else // 64 bit variant
803 #define PIXOP2(OPNAME, OP) \
804 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
807 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
812 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
815 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
820 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
823 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
824 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
829 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
830 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
833 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
834 int src_stride1, int src_stride2, int h){\
838 a= AV_RN32(&src1[i*src_stride1 ]);\
839 b= AV_RN32(&src2[i*src_stride2 ]);\
840 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
841 a= AV_RN32(&src1[i*src_stride1+4]);\
842 b= AV_RN32(&src2[i*src_stride2+4]);\
843 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
847 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
848 int src_stride1, int src_stride2, int h){\
852 a= AV_RN32(&src1[i*src_stride1 ]);\
853 b= AV_RN32(&src2[i*src_stride2 ]);\
854 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
855 a= AV_RN32(&src1[i*src_stride1+4]);\
856 b= AV_RN32(&src2[i*src_stride2+4]);\
857 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
861 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
862 int src_stride1, int src_stride2, int h){\
866 a= AV_RN32(&src1[i*src_stride1 ]);\
867 b= AV_RN32(&src2[i*src_stride2 ]);\
868 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
872 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
873 int src_stride1, int src_stride2, int h){\
877 a= AV_RN16(&src1[i*src_stride1 ]);\
878 b= AV_RN16(&src2[i*src_stride2 ]);\
879 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
883 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
884 int src_stride1, int src_stride2, int h){\
885 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
886 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
889 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
890 int src_stride1, int src_stride2, int h){\
891 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
892 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
895 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
899 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
900 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
903 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
904 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
907 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
908 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
911 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
912 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
915 uint32_t a, b, c, d, l0, l1, h0, h1;\
916 a= AV_RN32(&src1[i*src_stride1]);\
917 b= AV_RN32(&src2[i*src_stride2]);\
918 c= AV_RN32(&src3[i*src_stride3]);\
919 d= AV_RN32(&src4[i*src_stride4]);\
920 l0= (a&0x03030303UL)\
923 h0= ((a&0xFCFCFCFCUL)>>2)\
924 + ((b&0xFCFCFCFCUL)>>2);\
925 l1= (c&0x03030303UL)\
927 h1= ((c&0xFCFCFCFCUL)>>2)\
928 + ((d&0xFCFCFCFCUL)>>2);\
929 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
930 a= AV_RN32(&src1[i*src_stride1+4]);\
931 b= AV_RN32(&src2[i*src_stride2+4]);\
932 c= AV_RN32(&src3[i*src_stride3+4]);\
933 d= AV_RN32(&src4[i*src_stride4+4]);\
934 l0= (a&0x03030303UL)\
937 h0= ((a&0xFCFCFCFCUL)>>2)\
938 + ((b&0xFCFCFCFCUL)>>2);\
939 l1= (c&0x03030303UL)\
941 h1= ((c&0xFCFCFCFCUL)>>2)\
942 + ((d&0xFCFCFCFCUL)>>2);\
943 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
947 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
948 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
951 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
952 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
955 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
956 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
959 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
960 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
963 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
964 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
967 uint32_t a, b, c, d, l0, l1, h0, h1;\
968 a= AV_RN32(&src1[i*src_stride1]);\
969 b= AV_RN32(&src2[i*src_stride2]);\
970 c= AV_RN32(&src3[i*src_stride3]);\
971 d= AV_RN32(&src4[i*src_stride4]);\
972 l0= (a&0x03030303UL)\
975 h0= ((a&0xFCFCFCFCUL)>>2)\
976 + ((b&0xFCFCFCFCUL)>>2);\
977 l1= (c&0x03030303UL)\
979 h1= ((c&0xFCFCFCFCUL)>>2)\
980 + ((d&0xFCFCFCFCUL)>>2);\
981 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
982 a= AV_RN32(&src1[i*src_stride1+4]);\
983 b= AV_RN32(&src2[i*src_stride2+4]);\
984 c= AV_RN32(&src3[i*src_stride3+4]);\
985 d= AV_RN32(&src4[i*src_stride4+4]);\
986 l0= (a&0x03030303UL)\
989 h0= ((a&0xFCFCFCFCUL)>>2)\
990 + ((b&0xFCFCFCFCUL)>>2);\
991 l1= (c&0x03030303UL)\
993 h1= ((c&0xFCFCFCFCUL)>>2)\
994 + ((d&0xFCFCFCFCUL)>>2);\
995 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
998 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
999 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1000 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1001 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1003 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1004 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1005 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1006 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1009 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1011 int i, a0, b0, a1, b1;\
1018 for(i=0; i<h; i+=2){\
1024 block[0]= (a1+a0)>>2; /* FIXME non put */\
1025 block[1]= (b1+b0)>>2;\
1035 block[0]= (a1+a0)>>2;\
1036 block[1]= (b1+b0)>>2;\
1042 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1045 const uint32_t a= AV_RN32(pixels );\
1046 const uint32_t b= AV_RN32(pixels+1);\
1047 uint32_t l0= (a&0x03030303UL)\
1050 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1051 + ((b&0xFCFCFCFCUL)>>2);\
1055 for(i=0; i<h; i+=2){\
1056 uint32_t a= AV_RN32(pixels );\
1057 uint32_t b= AV_RN32(pixels+1);\
1058 l1= (a&0x03030303UL)\
1059 + (b&0x03030303UL);\
1060 h1= ((a&0xFCFCFCFCUL)>>2)\
1061 + ((b&0xFCFCFCFCUL)>>2);\
1062 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1065 a= AV_RN32(pixels );\
1066 b= AV_RN32(pixels+1);\
1067 l0= (a&0x03030303UL)\
1070 h0= ((a&0xFCFCFCFCUL)>>2)\
1071 + ((b&0xFCFCFCFCUL)>>2);\
1072 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1078 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1081 for(j=0; j<2; j++){\
1083 const uint32_t a= AV_RN32(pixels );\
1084 const uint32_t b= AV_RN32(pixels+1);\
1085 uint32_t l0= (a&0x03030303UL)\
1088 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1089 + ((b&0xFCFCFCFCUL)>>2);\
1093 for(i=0; i<h; i+=2){\
1094 uint32_t a= AV_RN32(pixels );\
1095 uint32_t b= AV_RN32(pixels+1);\
1096 l1= (a&0x03030303UL)\
1097 + (b&0x03030303UL);\
1098 h1= ((a&0xFCFCFCFCUL)>>2)\
1099 + ((b&0xFCFCFCFCUL)>>2);\
1100 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1103 a= AV_RN32(pixels );\
1104 b= AV_RN32(pixels+1);\
1105 l0= (a&0x03030303UL)\
1108 h0= ((a&0xFCFCFCFCUL)>>2)\
1109 + ((b&0xFCFCFCFCUL)>>2);\
1110 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1114 pixels+=4-line_size*(h+1);\
1115 block +=4-line_size*h;\
1119 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1122 for(j=0; j<2; j++){\
1124 const uint32_t a= AV_RN32(pixels );\
1125 const uint32_t b= AV_RN32(pixels+1);\
1126 uint32_t l0= (a&0x03030303UL)\
1129 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1130 + ((b&0xFCFCFCFCUL)>>2);\
1134 for(i=0; i<h; i+=2){\
1135 uint32_t a= AV_RN32(pixels );\
1136 uint32_t b= AV_RN32(pixels+1);\
1137 l1= (a&0x03030303UL)\
1138 + (b&0x03030303UL);\
1139 h1= ((a&0xFCFCFCFCUL)>>2)\
1140 + ((b&0xFCFCFCFCUL)>>2);\
1141 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1144 a= AV_RN32(pixels );\
1145 b= AV_RN32(pixels+1);\
1146 l0= (a&0x03030303UL)\
1149 h0= ((a&0xFCFCFCFCUL)>>2)\
1150 + ((b&0xFCFCFCFCUL)>>2);\
1151 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1155 pixels+=4-line_size*(h+1);\
1156 block +=4-line_size*h;\
1160 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1161 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1162 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1163 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1164 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1165 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1166 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1167 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1169 #define op_avg(a, b) a = rnd_avg32(a, b)
1171 #define op_put(a, b) a = b
1178 #define avg2(a,b) ((a+b+1)>>1)
1179 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1181 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1182 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1185 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1186 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1189 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1191 const int A=(16-x16)*(16-y16);
1192 const int B=( x16)*(16-y16);
1193 const int C=(16-x16)*( y16);
1194 const int D=( x16)*( y16);
1199 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1200 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1201 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1202 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1203 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1204 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1205 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1206 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1212 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1213 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1216 const int s= 1<<shift;
1226 for(x=0; x<8; x++){ //XXX FIXME optimize
1227 int src_x, src_y, frac_x, frac_y, index;
1231 frac_x= src_x&(s-1);
1232 frac_y= src_y&(s-1);
1236 if((unsigned)src_x < width){
1237 if((unsigned)src_y < height){
1238 index= src_x + src_y*stride;
1239 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1240 + src[index +1]* frac_x )*(s-frac_y)
1241 + ( src[index+stride ]*(s-frac_x)
1242 + src[index+stride+1]* frac_x )* frac_y
1245 index= src_x + av_clip(src_y, 0, height)*stride;
1246 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1247 + src[index +1]* frac_x )*s
1251 if((unsigned)src_y < height){
1252 index= av_clip(src_x, 0, width) + src_y*stride;
1253 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1254 + src[index+stride ]* frac_y )*s
1257 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1258 dst[y*stride + x]= src[index ];
1270 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1272 case 2: put_pixels2_c (dst, src, stride, height); break;
1273 case 4: put_pixels4_c (dst, src, stride, height); break;
1274 case 8: put_pixels8_c (dst, src, stride, height); break;
1275 case 16:put_pixels16_c(dst, src, stride, height); break;
1279 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1281 for (i=0; i < height; i++) {
1282 for (j=0; j < width; j++) {
1283 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1290 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1292 for (i=0; i < height; i++) {
1293 for (j=0; j < width; j++) {
1294 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1301 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1303 for (i=0; i < height; i++) {
1304 for (j=0; j < width; j++) {
1305 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1312 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1314 for (i=0; i < height; i++) {
1315 for (j=0; j < width; j++) {
1316 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1323 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1325 for (i=0; i < height; i++) {
1326 for (j=0; j < width; j++) {
1327 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1334 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1336 for (i=0; i < height; i++) {
1337 for (j=0; j < width; j++) {
1338 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1345 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1347 for (i=0; i < height; i++) {
1348 for (j=0; j < width; j++) {
1349 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1356 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1358 for (i=0; i < height; i++) {
1359 for (j=0; j < width; j++) {
1360 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1367 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1369 case 2: avg_pixels2_c (dst, src, stride, height); break;
1370 case 4: avg_pixels4_c (dst, src, stride, height); break;
1371 case 8: avg_pixels8_c (dst, src, stride, height); break;
1372 case 16:avg_pixels16_c(dst, src, stride, height); break;
1376 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1378 for (i=0; i < height; i++) {
1379 for (j=0; j < width; j++) {
1380 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1387 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1389 for (i=0; i < height; i++) {
1390 for (j=0; j < width; j++) {
1391 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1398 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1400 for (i=0; i < height; i++) {
1401 for (j=0; j < width; j++) {
1402 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1409 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1411 for (i=0; i < height; i++) {
1412 for (j=0; j < width; j++) {
1413 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1420 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1422 for (i=0; i < height; i++) {
1423 for (j=0; j < width; j++) {
1424 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1431 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1433 for (i=0; i < height; i++) {
1434 for (j=0; j < width; j++) {
1435 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1442 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1444 for (i=0; i < height; i++) {
1445 for (j=0; j < width; j++) {
1446 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1453 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1455 for (i=0; i < height; i++) {
1456 for (j=0; j < width; j++) {
1457 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1464 #define TPEL_WIDTH(width)\
1465 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1466 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1467 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1468 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1469 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1470 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1471 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1472 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1473 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1474 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1475 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1476 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1477 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1478 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1479 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1480 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1481 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1482 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1485 #define H264_CHROMA_MC(OPNAME, OP)\
1486 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1487 const int A=(8-x)*(8-y);\
1488 const int B=( x)*(8-y);\
1489 const int C=(8-x)*( y);\
1490 const int D=( x)*( y);\
1493 assert(x<8 && y<8 && x>=0 && y>=0);\
1496 for(i=0; i<h; i++){\
1497 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1498 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1504 const int step= C ? stride : 1;\
1505 for(i=0; i<h; i++){\
1506 OP(dst[0], (A*src[0] + E*src[step+0]));\
1507 OP(dst[1], (A*src[1] + E*src[step+1]));\
1514 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1515 const int A=(8-x)*(8-y);\
1516 const int B=( x)*(8-y);\
1517 const int C=(8-x)*( y);\
1518 const int D=( x)*( y);\
1521 assert(x<8 && y<8 && x>=0 && y>=0);\
1524 for(i=0; i<h; i++){\
1525 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1526 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1527 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1528 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1534 const int step= C ? stride : 1;\
1535 for(i=0; i<h; i++){\
1536 OP(dst[0], (A*src[0] + E*src[step+0]));\
1537 OP(dst[1], (A*src[1] + E*src[step+1]));\
1538 OP(dst[2], (A*src[2] + E*src[step+2]));\
1539 OP(dst[3], (A*src[3] + E*src[step+3]));\
1546 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1547 const int A=(8-x)*(8-y);\
1548 const int B=( x)*(8-y);\
1549 const int C=(8-x)*( y);\
1550 const int D=( x)*( y);\
1553 assert(x<8 && y<8 && x>=0 && y>=0);\
1556 for(i=0; i<h; i++){\
1557 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1558 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1559 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1560 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1561 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1562 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1563 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1564 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1570 const int step= C ? stride : 1;\
1571 for(i=0; i<h; i++){\
1572 OP(dst[0], (A*src[0] + E*src[step+0]));\
1573 OP(dst[1], (A*src[1] + E*src[step+1]));\
1574 OP(dst[2], (A*src[2] + E*src[step+2]));\
1575 OP(dst[3], (A*src[3] + E*src[step+3]));\
1576 OP(dst[4], (A*src[4] + E*src[step+4]));\
1577 OP(dst[5], (A*src[5] + E*src[step+5]));\
1578 OP(dst[6], (A*src[6] + E*src[step+6]));\
1579 OP(dst[7], (A*src[7] + E*src[step+7]));\
1586 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1587 #define op_put(a, b) a = (((b) + 32)>>6)
1589 H264_CHROMA_MC(put_ , op_put)
1590 H264_CHROMA_MC(avg_ , op_avg)
1594 static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1595 const int A=(8-x)*(8-y);
1596 const int B=( x)*(8-y);
1597 const int C=(8-x)*( y);
1598 const int D=( x)*( y);
1601 assert(x<8 && y<8 && x>=0 && y>=0);
1605 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1606 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1607 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1608 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1609 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1610 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1611 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1612 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
1618 static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1619 const int A=(8-x)*(8-y);
1620 const int B=( x)*(8-y);
1621 const int C=(8-x)*( y);
1622 const int D=( x)*( y);
1625 assert(x<8 && y<8 && x>=0 && y>=0);
1629 dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
1630 dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
1631 dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
1632 dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
1633 dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
1634 dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
1635 dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
1636 dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
1642 #define QPEL_MC(r, OPNAME, RND, OP) \
1643 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1644 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1648 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1649 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1650 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1651 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1652 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1653 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1654 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1655 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1661 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1663 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1667 const int src0= src[0*srcStride];\
1668 const int src1= src[1*srcStride];\
1669 const int src2= src[2*srcStride];\
1670 const int src3= src[3*srcStride];\
1671 const int src4= src[4*srcStride];\
1672 const int src5= src[5*srcStride];\
1673 const int src6= src[6*srcStride];\
1674 const int src7= src[7*srcStride];\
1675 const int src8= src[8*srcStride];\
1676 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1677 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1678 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1679 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1680 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1681 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1682 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1683 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1689 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1690 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1695 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1696 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1697 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1698 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1699 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1700 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1701 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1702 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1703 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1704 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1705 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1706 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1707 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1708 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1709 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1710 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1716 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1717 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1722 const int src0= src[0*srcStride];\
1723 const int src1= src[1*srcStride];\
1724 const int src2= src[2*srcStride];\
1725 const int src3= src[3*srcStride];\
1726 const int src4= src[4*srcStride];\
1727 const int src5= src[5*srcStride];\
1728 const int src6= src[6*srcStride];\
1729 const int src7= src[7*srcStride];\
1730 const int src8= src[8*srcStride];\
1731 const int src9= src[9*srcStride];\
1732 const int src10= src[10*srcStride];\
1733 const int src11= src[11*srcStride];\
1734 const int src12= src[12*srcStride];\
1735 const int src13= src[13*srcStride];\
1736 const int src14= src[14*srcStride];\
1737 const int src15= src[15*srcStride];\
1738 const int src16= src[16*srcStride];\
1739 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1740 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1741 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1742 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1743 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1744 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1745 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1746 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1747 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1748 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1749 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1750 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1751 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1752 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1753 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1754 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1760 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1761 OPNAME ## pixels8_c(dst, src, stride, 8);\
1764 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1766 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1767 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1770 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1771 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1774 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1776 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1777 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1780 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1781 uint8_t full[16*9];\
1783 copy_block9(full, src, 16, stride, 9);\
1784 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1785 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1788 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1789 uint8_t full[16*9];\
1790 copy_block9(full, src, 16, stride, 9);\
1791 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1794 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1795 uint8_t full[16*9];\
1797 copy_block9(full, src, 16, stride, 9);\
1798 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1799 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1801 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1802 uint8_t full[16*9];\
1805 uint8_t halfHV[64];\
1806 copy_block9(full, src, 16, stride, 9);\
1807 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1809 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1810 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1812 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1813 uint8_t full[16*9];\
1815 uint8_t halfHV[64];\
1816 copy_block9(full, src, 16, stride, 9);\
1817 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1818 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1819 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1820 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1822 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1823 uint8_t full[16*9];\
1826 uint8_t halfHV[64];\
1827 copy_block9(full, src, 16, stride, 9);\
1828 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1829 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1830 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1831 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1833 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1834 uint8_t full[16*9];\
1836 uint8_t halfHV[64];\
1837 copy_block9(full, src, 16, stride, 9);\
1838 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1839 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1840 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1841 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1843 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1844 uint8_t full[16*9];\
1847 uint8_t halfHV[64];\
1848 copy_block9(full, src, 16, stride, 9);\
1849 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1850 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1851 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1852 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1854 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1855 uint8_t full[16*9];\
1857 uint8_t halfHV[64];\
1858 copy_block9(full, src, 16, stride, 9);\
1859 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1860 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1861 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1862 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1864 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1865 uint8_t full[16*9];\
1868 uint8_t halfHV[64];\
1869 copy_block9(full, src, 16, stride, 9);\
1870 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1871 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1872 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1873 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1875 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1876 uint8_t full[16*9];\
1878 uint8_t halfHV[64];\
1879 copy_block9(full, src, 16, stride, 9);\
1880 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1881 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1882 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1883 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1885 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1887 uint8_t halfHV[64];\
1888 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1889 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1890 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1892 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1894 uint8_t halfHV[64];\
1895 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1896 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1897 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1899 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1900 uint8_t full[16*9];\
1903 uint8_t halfHV[64];\
1904 copy_block9(full, src, 16, stride, 9);\
1905 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1906 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1907 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1908 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1910 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1911 uint8_t full[16*9];\
1913 copy_block9(full, src, 16, stride, 9);\
1914 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1915 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1916 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1918 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1919 uint8_t full[16*9];\
1922 uint8_t halfHV[64];\
1923 copy_block9(full, src, 16, stride, 9);\
1924 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1925 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1926 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1927 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1929 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1930 uint8_t full[16*9];\
1932 copy_block9(full, src, 16, stride, 9);\
1933 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1934 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1935 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1937 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1939 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1940 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1942 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1943 OPNAME ## pixels16_c(dst, src, stride, 16);\
1946 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1948 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1949 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1952 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1953 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1956 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1958 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1959 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1962 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1963 uint8_t full[24*17];\
1965 copy_block17(full, src, 24, stride, 17);\
1966 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1967 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1970 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1971 uint8_t full[24*17];\
1972 copy_block17(full, src, 24, stride, 17);\
1973 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1976 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1977 uint8_t full[24*17];\
1979 copy_block17(full, src, 24, stride, 17);\
1980 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1981 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1983 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1984 uint8_t full[24*17];\
1985 uint8_t halfH[272];\
1986 uint8_t halfV[256];\
1987 uint8_t halfHV[256];\
1988 copy_block17(full, src, 24, stride, 17);\
1989 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1991 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1992 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1994 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1995 uint8_t full[24*17];\
1996 uint8_t halfH[272];\
1997 uint8_t halfHV[256];\
1998 copy_block17(full, src, 24, stride, 17);\
1999 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2000 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2001 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2002 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2004 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2005 uint8_t full[24*17];\
2006 uint8_t halfH[272];\
2007 uint8_t halfV[256];\
2008 uint8_t halfHV[256];\
2009 copy_block17(full, src, 24, stride, 17);\
2010 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2011 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2012 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2013 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2015 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2016 uint8_t full[24*17];\
2017 uint8_t halfH[272];\
2018 uint8_t halfHV[256];\
2019 copy_block17(full, src, 24, stride, 17);\
2020 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2021 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2022 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2023 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2025 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2026 uint8_t full[24*17];\
2027 uint8_t halfH[272];\
2028 uint8_t halfV[256];\
2029 uint8_t halfHV[256];\
2030 copy_block17(full, src, 24, stride, 17);\
2031 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2032 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2033 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2034 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2036 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2037 uint8_t full[24*17];\
2038 uint8_t halfH[272];\
2039 uint8_t halfHV[256];\
2040 copy_block17(full, src, 24, stride, 17);\
2041 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2042 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2043 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2044 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2046 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2047 uint8_t full[24*17];\
2048 uint8_t halfH[272];\
2049 uint8_t halfV[256];\
2050 uint8_t halfHV[256];\
2051 copy_block17(full, src, 24, stride, 17);\
2052 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2053 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2054 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2055 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2057 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2058 uint8_t full[24*17];\
2059 uint8_t halfH[272];\
2060 uint8_t halfHV[256];\
2061 copy_block17(full, src, 24, stride, 17);\
2062 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2063 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2064 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2065 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2067 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2068 uint8_t halfH[272];\
2069 uint8_t halfHV[256];\
2070 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2071 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2072 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2074 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2075 uint8_t halfH[272];\
2076 uint8_t halfHV[256];\
2077 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2078 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2079 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2081 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2082 uint8_t full[24*17];\
2083 uint8_t halfH[272];\
2084 uint8_t halfV[256];\
2085 uint8_t halfHV[256];\
2086 copy_block17(full, src, 24, stride, 17);\
2087 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2088 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2089 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2090 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2092 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2093 uint8_t full[24*17];\
2094 uint8_t halfH[272];\
2095 copy_block17(full, src, 24, stride, 17);\
2096 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2097 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2098 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2100 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2101 uint8_t full[24*17];\
2102 uint8_t halfH[272];\
2103 uint8_t halfV[256];\
2104 uint8_t halfHV[256];\
2105 copy_block17(full, src, 24, stride, 17);\
2106 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2107 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2108 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2109 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2111 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2112 uint8_t full[24*17];\
2113 uint8_t halfH[272];\
2114 copy_block17(full, src, 24, stride, 17);\
2115 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2116 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2117 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2119 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2120 uint8_t halfH[272];\
2121 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2122 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2125 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2126 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2127 #define op_put(a, b) a = cm[((b) + 16)>>5]
2128 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
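/* The qpel*_mcXY_c functions above follow a common naming scheme: X and Y are
 * the horizontal and vertical quarter-pel offsets (mc00 = integer-pel copy,
 * mc20 = horizontal half-pel, mc22 = half-pel in both directions, ...).
 * The op_* macros scale the filter output back to pixel range: the low-pass
 * filters return values scaled by 32, so cm[((b) + 16) >> 5] rounds and clamps
 * to 0..255 via the crop table, while the _no_rnd variants add only 15 to
 * implement MPEG-4 "no rounding" mode. */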
2130 QPEL_MC(0, put_ , _ , op_put)
2131 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2132 QPEL_MC(0, avg_ , _ , op_avg)
2133 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2135 #undef op_avg_no_rnd
2137 #undef op_put_no_rnd
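/* H264_LOWPASS generates the H.264 six-tap half-pel filter
 * (1, -5, 20, 20, -5, 1) in horizontal, vertical and combined (hv) form.
 * OP is applied after a single filter pass: a flat input of value v yields
 * 32*v, so (x + 16) >> 5 restores the original range.  The hv version keeps
 * the unscaled first pass in the int16_t tmp[] buffer and uses OP2, which
 * divides by 32*32 via (x + 512) >> 10.  The H264_MC macro further below
 * builds all 16 quarter-pel positions from these filters, averaging two
 * half-pel planes with the pixels*_l2() helpers where needed. */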
2140 #define H264_LOWPASS(OPNAME, OP, OP2) \
2141 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2143 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2147 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2148 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2154 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2156 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2160 const int srcB= src[-2*srcStride];\
2161 const int srcA= src[-1*srcStride];\
2162 const int src0= src[0 *srcStride];\
2163 const int src1= src[1 *srcStride];\
2164 const int src2= src[2 *srcStride];\
2165 const int src3= src[3 *srcStride];\
2166 const int src4= src[4 *srcStride];\
2167 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2168 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2174 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2177 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2179 src -= 2*srcStride;\
2180 for(i=0; i<h+5; i++)\
2182 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2183 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2187 tmp -= tmpStride*(h+5-2);\
2190 const int tmpB= tmp[-2*tmpStride];\
2191 const int tmpA= tmp[-1*tmpStride];\
2192 const int tmp0= tmp[0 *tmpStride];\
2193 const int tmp1= tmp[1 *tmpStride];\
2194 const int tmp2= tmp[2 *tmpStride];\
2195 const int tmp3= tmp[3 *tmpStride];\
2196 const int tmp4= tmp[4 *tmpStride];\
2197 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2198 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2203 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2205 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2209 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2210 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2211 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2212 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2218 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2220 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2224 const int srcB= src[-2*srcStride];\
2225 const int srcA= src[-1*srcStride];\
2226 const int src0= src[0 *srcStride];\
2227 const int src1= src[1 *srcStride];\
2228 const int src2= src[2 *srcStride];\
2229 const int src3= src[3 *srcStride];\
2230 const int src4= src[4 *srcStride];\
2231 const int src5= src[5 *srcStride];\
2232 const int src6= src[6 *srcStride];\
2233 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2234 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2235 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2236 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2242 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2245 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2247 src -= 2*srcStride;\
2248 for(i=0; i<h+5; i++)\
2250 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2251 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2252 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2253 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2257 tmp -= tmpStride*(h+5-2);\
2260 const int tmpB= tmp[-2*tmpStride];\
2261 const int tmpA= tmp[-1*tmpStride];\
2262 const int tmp0= tmp[0 *tmpStride];\
2263 const int tmp1= tmp[1 *tmpStride];\
2264 const int tmp2= tmp[2 *tmpStride];\
2265 const int tmp3= tmp[3 *tmpStride];\
2266 const int tmp4= tmp[4 *tmpStride];\
2267 const int tmp5= tmp[5 *tmpStride];\
2268 const int tmp6= tmp[6 *tmpStride];\
2269 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2270 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2271 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2272 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2278 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2280 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2284 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2285 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2286 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2287 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2288 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2289 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2290 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2291 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2297 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2299 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2303 const int srcB= src[-2*srcStride];\
2304 const int srcA= src[-1*srcStride];\
2305 const int src0= src[0 *srcStride];\
2306 const int src1= src[1 *srcStride];\
2307 const int src2= src[2 *srcStride];\
2308 const int src3= src[3 *srcStride];\
2309 const int src4= src[4 *srcStride];\
2310 const int src5= src[5 *srcStride];\
2311 const int src6= src[6 *srcStride];\
2312 const int src7= src[7 *srcStride];\
2313 const int src8= src[8 *srcStride];\
2314 const int src9= src[9 *srcStride];\
2315 const int src10=src[10*srcStride];\
2316 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2317 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2318 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2319 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2320 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2321 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2322 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2323 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2329 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2332 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2334 src -= 2*srcStride;\
2335 for(i=0; i<h+5; i++)\
2337 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2338 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2339 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2340 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2341 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2342 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2343 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2344 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2348 tmp -= tmpStride*(h+5-2);\
2351 const int tmpB= tmp[-2*tmpStride];\
2352 const int tmpA= tmp[-1*tmpStride];\
2353 const int tmp0= tmp[0 *tmpStride];\
2354 const int tmp1= tmp[1 *tmpStride];\
2355 const int tmp2= tmp[2 *tmpStride];\
2356 const int tmp3= tmp[3 *tmpStride];\
2357 const int tmp4= tmp[4 *tmpStride];\
2358 const int tmp5= tmp[5 *tmpStride];\
2359 const int tmp6= tmp[6 *tmpStride];\
2360 const int tmp7= tmp[7 *tmpStride];\
2361 const int tmp8= tmp[8 *tmpStride];\
2362 const int tmp9= tmp[9 *tmpStride];\
2363 const int tmp10=tmp[10*tmpStride];\
2364 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2365 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2366 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2367 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2368 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2369 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2370 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2371 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2377 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2378 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2379 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2380 src += 8*srcStride;\
2381 dst += 8*dstStride;\
2382 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2383 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2386 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2387 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2388 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2389 src += 8*srcStride;\
2390 dst += 8*dstStride;\
2391 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2392 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2395 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2396 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2397 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2398 src += 8*srcStride;\
2399 dst += 8*dstStride;\
2400 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2401 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2404 #define H264_MC(OPNAME, SIZE) \
2405 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2406 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2409 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2410 uint8_t half[SIZE*SIZE];\
2411 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2412 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2415 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2416 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2419 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2420 uint8_t half[SIZE*SIZE];\
2421 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2422 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2425 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2426 uint8_t full[SIZE*(SIZE+5)];\
2427 uint8_t * const full_mid= full + SIZE*2;\
2428 uint8_t half[SIZE*SIZE];\
2429 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2430 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2431 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2434 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2435 uint8_t full[SIZE*(SIZE+5)];\
2436 uint8_t * const full_mid= full + SIZE*2;\
2437 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2438 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2441 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2442 uint8_t full[SIZE*(SIZE+5)];\
2443 uint8_t * const full_mid= full + SIZE*2;\
2444 uint8_t half[SIZE*SIZE];\
2445 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2446 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2447 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2450 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2451 uint8_t full[SIZE*(SIZE+5)];\
2452 uint8_t * const full_mid= full + SIZE*2;\
2453 uint8_t halfH[SIZE*SIZE];\
2454 uint8_t halfV[SIZE*SIZE];\
2455 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2456 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2457 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2458 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2461 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2462 uint8_t full[SIZE*(SIZE+5)];\
2463 uint8_t * const full_mid= full + SIZE*2;\
2464 uint8_t halfH[SIZE*SIZE];\
2465 uint8_t halfV[SIZE*SIZE];\
2466 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2467 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2468 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2469 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2472 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2473 uint8_t full[SIZE*(SIZE+5)];\
2474 uint8_t * const full_mid= full + SIZE*2;\
2475 uint8_t halfH[SIZE*SIZE];\
2476 uint8_t halfV[SIZE*SIZE];\
2477 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2478 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2479 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2480 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2483 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2484 uint8_t full[SIZE*(SIZE+5)];\
2485 uint8_t * const full_mid= full + SIZE*2;\
2486 uint8_t halfH[SIZE*SIZE];\
2487 uint8_t halfV[SIZE*SIZE];\
2488 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2489 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2490 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2491 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2494 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2495 int16_t tmp[SIZE*(SIZE+5)];\
2496 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2499 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2500 int16_t tmp[SIZE*(SIZE+5)];\
2501 uint8_t halfH[SIZE*SIZE];\
2502 uint8_t halfHV[SIZE*SIZE];\
2503 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2504 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2505 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2508 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2509 int16_t tmp[SIZE*(SIZE+5)];\
2510 uint8_t halfH[SIZE*SIZE];\
2511 uint8_t halfHV[SIZE*SIZE];\
2512 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2513 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2514 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2517 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2518 uint8_t full[SIZE*(SIZE+5)];\
2519 uint8_t * const full_mid= full + SIZE*2;\
2520 int16_t tmp[SIZE*(SIZE+5)];\
2521 uint8_t halfV[SIZE*SIZE];\
2522 uint8_t halfHV[SIZE*SIZE];\
2523 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2524 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2525 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2526 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2529 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2530 uint8_t full[SIZE*(SIZE+5)];\
2531 uint8_t * const full_mid= full + SIZE*2;\
2532 int16_t tmp[SIZE*(SIZE+5)];\
2533 uint8_t halfV[SIZE*SIZE];\
2534 uint8_t halfHV[SIZE*SIZE];\
2535 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2536 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2537 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2538 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2541 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2542 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2543 #define op_put(a, b) a = cm[((b) + 16)>>5]
2544 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2545 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2547 H264_LOWPASS(put_ , op_put, op2_put)
2548 H264_LOWPASS(avg_ , op_avg, op2_avg)
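/* WMV2 ("mspel") half-pel interpolation uses a 4-tap filter,
 * (-src[-1] + 9*src[0] + 9*src[1] - src[2] + 8) >> 4, i.e. (-1, 9, 9, -1)/16
 * with rounding, clamped to 0..255 through the crop table. */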
2563 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2564 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2568 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2569 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2570 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2571 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2572 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2573 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2574 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2575 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
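/* The integer-position (mc00) cases for the CAVS and VC-1 decoders, and the
 * mc33 cases for RV40, reduce to plain copies/averages, so they simply reuse
 * the generic put/avg pixel and 2x2-averaging (pixels*_xy2_c) helpers. */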
2581 #if CONFIG_CAVS_DECODER
2583 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2584 put_pixels8_c(dst, src, stride, 8);
2586 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2587 avg_pixels8_c(dst, src, stride, 8);
2589 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2590 put_pixels16_c(dst, src, stride, 16);
2592 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2593 avg_pixels16_c(dst, src, stride, 16);
2595 #endif /* CONFIG_CAVS_DECODER */
2597 #if CONFIG_VC1_DECODER
2599 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2600 put_pixels8_c(dst, src, stride, 8);
2602 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2603 avg_pixels8_c(dst, src, stride, 8);
2605 #endif /* CONFIG_VC1_DECODER */
2607 #if CONFIG_RV40_DECODER
2608 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2609 put_pixels16_xy2_c(dst, src, stride, 16);
2611 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2612 avg_pixels16_xy2_c(dst, src, stride, 16);
2614 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2615 put_pixels8_xy2_c(dst, src, stride, 8);
2617 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2618 avg_pixels8_xy2_c(dst, src, stride, 8);
2620 #endif /* CONFIG_RV40_DECODER */
2622 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2623 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2627 const int src_1= src[ -srcStride];
2628 const int src0 = src[0 ];
2629 const int src1 = src[ srcStride];
2630 const int src2 = src[2*srcStride];
2631 const int src3 = src[3*srcStride];
2632 const int src4 = src[4*srcStride];
2633 const int src5 = src[5*srcStride];
2634 const int src6 = src[6*srcStride];
2635 const int src7 = src[7*srcStride];
2636 const int src8 = src[8*srcStride];
2637 const int src9 = src[9*srcStride];
2638 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2639 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2640 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2641 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2642 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2643 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2644 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2645 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2651 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2652 put_pixels8_c(dst, src, stride, 8);
2655 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2657 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2658 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2661 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2662 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2665 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2667 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2668 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2671 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2672 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2675 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2679 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2680 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2681 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2682 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2684 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2688 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2689 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2690 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2691 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2693 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2695 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2696 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
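/* H.263 deblocking (Annex J-style loop filter): for each line of pixels
 * crossing a block edge (p0 p1 | p2 p3), d = (p0 - p3 + 4*(p2 - p1)) / 8 is
 * mapped through a strength-dependent ramp to d1; p1/p2 are moved towards
 * each other by d1 and clamped to 0..255, while p0/p3 get a smaller
 * correction d2 clipped to a range derived from d1.  The _v_ version walks
 * columns across a horizontal edge, the _h_ version rows across a vertical
 * edge. */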
2699 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2700 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2702 const int strength= ff_h263_loop_filter_strength[qscale];
2706 int p0= src[x-2*stride];
2707 int p1= src[x-1*stride];
2708 int p2= src[x+0*stride];
2709 int p3= src[x+1*stride];
2710 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2712 if (d<-2*strength) d1= 0;
2713 else if(d<- strength) d1=-2*strength - d;
2714 else if(d< strength) d1= d;
2715 else if(d< 2*strength) d1= 2*strength - d;
2720 if(p1&256) p1= ~(p1>>31);
2721 if(p2&256) p2= ~(p2>>31);
2723 src[x-1*stride] = p1;
2724 src[x+0*stride] = p2;
2728 d2= av_clip((p0-p3)/4, -ad1, ad1);
2730 src[x-2*stride] = p0 - d2;
2731 src[x+ stride] = p3 + d2;
2736 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2737 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2739 const int strength= ff_h263_loop_filter_strength[qscale];
2743 int p0= src[y*stride-2];
2744 int p1= src[y*stride-1];
2745 int p2= src[y*stride+0];
2746 int p3= src[y*stride+1];
2747 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2749 if (d<-2*strength) d1= 0;
2750 else if(d<- strength) d1=-2*strength - d;
2751 else if(d< strength) d1= d;
2752 else if(d< 2*strength) d1= 2*strength - d;
2757 if(p1&256) p1= ~(p1>>31);
2758 if(p2&256) p2= ~(p2>>31);
2760 src[y*stride-1] = p1;
2761 src[y*stride+0] = p2;
2765 d2= av_clip((p0-p3)/4, -ad1, ad1);
2767 src[y*stride-2] = p0 - d2;
2768 src[y*stride+1] = p3 + d2;
2773 static void h261_loop_filter_c(uint8_t *src, int stride){
2778 temp[x ] = 4*src[x ];
2779 temp[x + 7*8] = 4*src[x + 7*stride];
2783 xy = y * stride + x;
2785 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2790 src[ y*stride] = (temp[ y*8] + 2)>>2;
2791 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2793 xy = y * stride + x;
2795 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
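/* Motion-estimation SAD primitives: pix_abs16/pix_abs8 compute the sum of
 * absolute differences over a 16- or 8-pixel-wide block, h rows high.  The
 * _x2, _y2 and _xy2 variants compare against a reference interpolated to the
 * half-pel position on the fly with avg2()/avg4(). */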
2800 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2806 s += abs(pix1[0] - pix2[0]);
2807 s += abs(pix1[1] - pix2[1]);
2808 s += abs(pix1[2] - pix2[2]);
2809 s += abs(pix1[3] - pix2[3]);
2810 s += abs(pix1[4] - pix2[4]);
2811 s += abs(pix1[5] - pix2[5]);
2812 s += abs(pix1[6] - pix2[6]);
2813 s += abs(pix1[7] - pix2[7]);
2814 s += abs(pix1[8] - pix2[8]);
2815 s += abs(pix1[9] - pix2[9]);
2816 s += abs(pix1[10] - pix2[10]);
2817 s += abs(pix1[11] - pix2[11]);
2818 s += abs(pix1[12] - pix2[12]);
2819 s += abs(pix1[13] - pix2[13]);
2820 s += abs(pix1[14] - pix2[14]);
2821 s += abs(pix1[15] - pix2[15]);
2828 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2834 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2835 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2836 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2837 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2838 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2839 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2840 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2841 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2842 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2843 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2844 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2845 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2846 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2847 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2848 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2849 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2856 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2859 uint8_t *pix3 = pix2 + line_size;
2863 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2864 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2865 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2866 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2867 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2868 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2869 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2870 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2871 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2872 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2873 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2874 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2875 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2876 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2877 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2878 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2886 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2889 uint8_t *pix3 = pix2 + line_size;
2893 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2894 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2895 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2896 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2897 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2898 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2899 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2900 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2901 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2902 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2903 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2904 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2905 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2906 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2907 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2908 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2916 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2922 s += abs(pix1[0] - pix2[0]);
2923 s += abs(pix1[1] - pix2[1]);
2924 s += abs(pix1[2] - pix2[2]);
2925 s += abs(pix1[3] - pix2[3]);
2926 s += abs(pix1[4] - pix2[4]);
2927 s += abs(pix1[5] - pix2[5]);
2928 s += abs(pix1[6] - pix2[6]);
2929 s += abs(pix1[7] - pix2[7]);
2936 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2942 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2943 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2944 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2945 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2946 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2947 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2948 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2949 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2956 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2959 uint8_t *pix3 = pix2 + line_size;
2963 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2964 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2965 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2966 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2967 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2968 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2969 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2970 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2978 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2981 uint8_t *pix3 = pix2 + line_size;
2985 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2986 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2987 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2988 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2989 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2990 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2991 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2992 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
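/* NSSE ("noise preserving" SSE): score1 is the plain sum of squared errors,
 * score2 the difference in local 2x2 second-difference (texture) energy
 * between source and reference; the result is
 * score1 + |score2| * nsse_weight (8 when no encoder context is available),
 * so predictions that add or remove local detail are penalised even when
 * their SSE is low. */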
3000 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3001 MpegEncContext *c = v;
3007 for(x=0; x<16; x++){
3008 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3011 for(x=0; x<15; x++){
3012 score2+= FFABS( s1[x ] - s1[x +stride]
3013 - s1[x+1] + s1[x+1+stride])
3014 -FFABS( s2[x ] - s2[x +stride]
3015 - s2[x+1] + s2[x+1+stride]);
3022 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3023 else return score1 + FFABS(score2)*8;
3026 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3027 MpegEncContext *c = v;
3034 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3038 score2+= FFABS( s1[x ] - s1[x +stride]
3039 - s1[x+1] + s1[x+1+stride])
3040 -FFABS( s2[x ] - s2[x +stride]
3041 - s2[x+1] + s2[x+1+stride]);
3048 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3049 else return score1 + FFABS(score2)*8;
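/* try_8x8basis_c / add_8x8basis_c appear to support the encoder's
 * rate-distortion quantisation refinement: basis[] holds a DCT basis function
 * scaled by 2^BASIS_SHIFT and rem[] the residual scaled by 2^RECON_SHIFT.
 * try_* returns the weight[]-scaled squared error after adding scale*basis to
 * the residual, add_* applies that change in place. */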
3052 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3056 for(i=0; i<8*8; i++){
3057 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3060 assert(-512<b && b<512);
3062 sum += (w*b)*(w*b)>>4;
3067 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3070 for(i=0; i<8*8; i++){
3071 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3076 * Permutes an 8x8 block according to a permutation vector.
3077 * @param block the block which will be permuted according to the given permutation vector
3078 * @param permutation the permutation vector
3079 * @param last the index of the last non-zero coefficient in scantable order, used to speed the permutation up
3080 * @param scantable the scantable that was used; it only serves to speed the permutation up, the block is not
3081 * (inverse) permuted to scantable order!
3083 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3089 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3091 for(i=0; i<=last; i++){
3092 const int j= scantable[i];
3097 for(i=0; i<=last; i++){
3098 const int j= scantable[i];
3099 const int perm_j= permutation[j];
3100 block[perm_j]= temp[j];
3104 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3108 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3111 memset(cmp, 0, sizeof(void*)*6);
3119 cmp[i]= c->hadamard8_diff[i];
3125 cmp[i]= c->dct_sad[i];
3128 cmp[i]= c->dct264_sad[i];
3131 cmp[i]= c->dct_max[i];
3134 cmp[i]= c->quant_psnr[i];
3163 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3168 static void clear_block_c(DCTELEM *block)
3170 memset(block, 0, sizeof(DCTELEM)*64);
3174 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3176 static void clear_blocks_c(DCTELEM *blocks)
3178 memset(blocks, 0, sizeof(DCTELEM)*6*64);
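/* add_bytes / add_bytes_l2 / diff_bytes add or subtract whole machine words
 * of bytes at a time (SWAR): the pb_7f/pb_80 masks keep the low 7 bits of
 * each byte so carries cannot cross byte boundaries, and the top bit of each
 * byte is recomputed separately with XOR.  For a single byte,
 * (a&0x7f)+(b&0x7f) already contains the carry into bit 7, and XOR-ing in
 * (a^b)&0x80 completes bit 7 of the sum while discarding the carry out of
 * the byte. */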
3181 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3183 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3184 long a = *(long*)(src+i);
3185 long b = *(long*)(dst+i);
3186 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3189 dst[i+0] += src[i+0];
3192 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3194 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3195 long a = *(long*)(src1+i);
3196 long b = *(long*)(src2+i);
3197 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3200 dst[i] = src1[i]+src2[i];
3203 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3205 #if !HAVE_FAST_UNALIGNED
3206 if((long)src2 & (sizeof(long)-1)){
3207 for(i=0; i+7<w; i+=8){
3208 dst[i+0] = src1[i+0]-src2[i+0];
3209 dst[i+1] = src1[i+1]-src2[i+1];
3210 dst[i+2] = src1[i+2]-src2[i+2];
3211 dst[i+3] = src1[i+3]-src2[i+3];
3212 dst[i+4] = src1[i+4]-src2[i+4];
3213 dst[i+5] = src1[i+5]-src2[i+5];
3214 dst[i+6] = src1[i+6]-src2[i+6];
3215 dst[i+7] = src1[i+7]-src2[i+7];
3219 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3220 long a = *(long*)(src1+i);
3221 long b = *(long*)(src2+i);
3222 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3225 dst[i+0] = src1[i+0]-src2[i+0];
3228 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
3236 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3245 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
3253 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3263 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
3266 for(i=0; i<w-1; i++){
3293 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
3323 #define BUTTERFLY2(o1,o2,i1,i2) \
3327 #define BUTTERFLY1(x,y) \
3336 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
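/* 8x8 Hadamard transform SATD: BUTTERFLY2/BUTTERFLY1 perform the add/subtract
 * stages along rows and then columns, and BUTTERFLYA folds the last stage
 * into |x+y| + |x-y|.  hadamard8_diff8x8_c scores a prediction against a
 * source block; hadamard8_intra8x8_c transforms the source alone and
 * subtracts the DC term (see the "-mean" line) so flat blocks score low. */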
3338 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3346 //FIXME try pointer walks
3347 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3348 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3349 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3350 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3352 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3353 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3354 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3355 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3357 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3358 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3359 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3360 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3364 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3365 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3366 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3367 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3369 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3370 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3371 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3372 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3375 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3376 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3377 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3378 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3384 printf("MAX:%d\n", maxi);
3390 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3398 //FIXME try pointer walks
3399 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3400 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3401 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3402 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3404 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3405 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3406 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3407 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3409 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3410 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3411 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3412 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3416 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3417 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3418 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3419 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3421 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3422 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3423 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3424 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3427 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3428 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3429 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3430 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3433 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3438 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3439 MpegEncContext * const s= (MpegEncContext *)c;
3440 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3444 s->dsp.diff_pixels(temp, src1, src2, stride);
3446 return s->dsp.sum_abs_dctelem(temp);
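/* dct_sad8x8_c scores a block as the sum of absolute transform coefficients
 * of the difference after a forward DCT.  The butterfly below is one 8-point
 * pass of the H.264-style 8x8 integer transform; dct264_sad8x8_c applies it
 * to rows and then columns of the difference and sums the absolute values,
 * while dct_max8x8_c returns the largest absolute coefficient instead. */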
3451 const int s07 = SRC(0) + SRC(7);\
3452 const int s16 = SRC(1) + SRC(6);\
3453 const int s25 = SRC(2) + SRC(5);\
3454 const int s34 = SRC(3) + SRC(4);\
3455 const int a0 = s07 + s34;\
3456 const int a1 = s16 + s25;\
3457 const int a2 = s07 - s34;\
3458 const int a3 = s16 - s25;\
3459 const int d07 = SRC(0) - SRC(7);\
3460 const int d16 = SRC(1) - SRC(6);\
3461 const int d25 = SRC(2) - SRC(5);\
3462 const int d34 = SRC(3) - SRC(4);\
3463 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3464 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3465 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3466 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3468 DST(1, a4 + (a7>>2)) ;\
3469 DST(2, a2 + (a3>>1)) ;\
3470 DST(3, a5 + (a6>>2)) ;\
3472 DST(5, a6 - (a5>>2)) ;\
3473 DST(6, (a2>>1) - a3 ) ;\
3474 DST(7, (a4>>2) - a7 ) ;\
3477 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3478 MpegEncContext * const s= (MpegEncContext *)c;
3483 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3485 #define SRC(x) dct[i][x]
3486 #define DST(x,v) dct[i][x]= v
3487 for( i = 0; i < 8; i++ )
3492 #define SRC(x) dct[x][i]
3493 #define DST(x,v) sum += FFABS(v)
3494 for( i = 0; i < 8; i++ )
3502 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3503 MpegEncContext * const s= (MpegEncContext *)c;
3504 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3509 s->dsp.diff_pixels(temp, src1, src2, stride);
3513 sum= FFMAX(sum, FFABS(temp[i]));
3518 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3519 MpegEncContext * const s= (MpegEncContext *)c;
3520 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3521 DCTELEM * const bak = temp+64;
3527 s->dsp.diff_pixels(temp, src1, src2, stride);
3529 memcpy(bak, temp, 64*sizeof(DCTELEM));
3531 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3532 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3533 ff_simple_idct(temp); //FIXME
3536 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
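/* The quant_psnr, rd and bit comparisons run the encoder's real quantiser on
 * the difference block: bit8x8_c estimates the number of VLC bits from the
 * run/level length tables, rd8x8_c additionally dequantises, applies the IDCT
 * and returns distortion plus a bits*qscale^2 rate term, and quant_psnr8x8_c
 * estimates the error introduced by quantisation. */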
3541 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3542 MpegEncContext * const s= (MpegEncContext *)c;
3543 const uint8_t *scantable= s->intra_scantable.permutated;
3544 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3545 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3546 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3547 int i, last, run, bits, level, distortion, start_i;
3548 const int esc_length= s->ac_esc_length;
3550 uint8_t * last_length;
3554 copy_block8(lsrc1, src1, 8, stride, 8);
3555 copy_block8(lsrc2, src2, 8, stride, 8);
3557 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3559 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3565 length = s->intra_ac_vlc_length;
3566 last_length= s->intra_ac_vlc_last_length;
3567 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3570 length = s->inter_ac_vlc_length;
3571 last_length= s->inter_ac_vlc_last_length;
3576 for(i=start_i; i<last; i++){
3577 int j= scantable[i];
3582 if((level&(~127)) == 0){
3583 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3592 level= temp[i] + 64;
3596 if((level&(~127)) == 0){
3597 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3605 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3607 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3610 s->dsp.idct_add(lsrc2, 8, temp);
3612 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3614 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3617 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3618 MpegEncContext * const s= (MpegEncContext *)c;
3619 const uint8_t *scantable= s->intra_scantable.permutated;
3620 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3621 int i, last, run, bits, level, start_i;
3622 const int esc_length= s->ac_esc_length;
3624 uint8_t * last_length;
3628 s->dsp.diff_pixels(temp, src1, src2, stride);
3630 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3636 length = s->intra_ac_vlc_length;
3637 last_length= s->intra_ac_vlc_last_length;
3638 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3641 length = s->inter_ac_vlc_length;
3642 last_length= s->inter_ac_vlc_last_length;
3647 for(i=start_i; i<last; i++){
3648 int j= scantable[i];
3653 if((level&(~127)) == 0){
3654 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3663 level= temp[i] + 64;
3667 if((level&(~127)) == 0){
3668 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
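/* VSAD/VSSE measure vertical activity: the sum of absolute (or squared)
 * differences between vertically adjacent pixels.  The _intra variants look
 * at a single block; the plain versions operate on the source/reference
 * difference signal. */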
3676 #define VSAD_INTRA(size) \
3677 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3681 for(y=1; y<h; y++){ \
3682 for(x=0; x<size; x+=4){ \
3683 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
3684 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
3694 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3699 for(x=0; x<16; x++){
3700 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3709 #define SQ(a) ((a)*(a))
3710 #define VSSE_INTRA(size) \
3711 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3715 for(y=1; y<h; y++){ \
3716 for(x=0; x<size; x+=4){ \
3717 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
3718 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
3728 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3733 for(x=0; x<16; x++){
3734 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3743 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3747 for(i=0; i<size; i++)
3748 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
3752 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3753 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3754 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3756 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3758 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3759 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3760 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3761 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
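/* Plain C float vector primitives: element-wise multiply, reversed multiply,
 * multiply-add, windowed overlap-add (as used by MDCT-based audio decoders),
 * scalar scaling and clipping.  These are the fallbacks behind the DSPContext
 * function pointers when no SIMD implementation is available. */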
3763 static void vector_fmul_c(float *dst, const float *src, int len){
3765 for(i=0; i<len; i++)
3769 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3772 for(i=0; i<len; i++)
3773 dst[i] = src0[i] * src1[-i];
3776 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
3778 for(i=0; i<len; i++)
3779 dst[i] = src0[i] * src1[i] + src2[i];
3782 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
3787 for(i=-len, j=len-1; i<0; i++, j--) {
3792 dst[i] = s0*wj - s1*wi + add_bias;
3793 dst[j] = s0*wi + s1*wj + add_bias;
3797 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
3801 for (i = 0; i < len; i++)
3802 dst[i] = src[i] * mul;
3805 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
3806 const float **sv, float mul, int len)
3809 for (i = 0; i < len; i += 2, sv++) {
3810 dst[i ] = src[i ] * sv[0][0] * mul;
3811 dst[i+1] = src[i+1] * sv[0][1] * mul;
3815 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
3816 const float **sv, float mul, int len)
3819 for (i = 0; i < len; i += 4, sv++) {
3820 dst[i ] = src[i ] * sv[0][0] * mul;
3821 dst[i+1] = src[i+1] * sv[0][1] * mul;
3822 dst[i+2] = src[i+2] * sv[0][2] * mul;
3823 dst[i+3] = src[i+3] * sv[0][3] * mul;
3827 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
3831 for (i = 0; i < len; i += 2, sv++) {
3832 dst[i ] = sv[0][0] * mul;
3833 dst[i+1] = sv[0][1] * mul;
3837 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
3841 for (i = 0; i < len; i += 4, sv++) {
3842 dst[i ] = sv[0][0] * mul;
3843 dst[i+1] = sv[0][1] * mul;
3844 dst[i+2] = sv[0][2] * mul;
3845 dst[i+3] = sv[0][3] * mul;
3849 static void butterflies_float_c(float *restrict v1, float *restrict v2,
3853 for (i = 0; i < len; i++) {
3854 float t = v1[i] - v2[i];
3860 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
3865 for (i = 0; i < len; i++)
3871 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
3873 for(i=0; i<len; i++)
3874 dst[i] = src[i] * mul;
3877 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
3878 uint32_t maxi, uint32_t maxisign)
3881 if(a > mini) return mini;
3882 else if((a^(1<<31)) > maxisign) return maxi;
3886 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
3888 uint32_t mini = *(uint32_t*)min;
3889 uint32_t maxi = *(uint32_t*)max;
3890 uint32_t maxisign = maxi ^ (1<<31);
3891 uint32_t *dsti = (uint32_t*)dst;
3892 const uint32_t *srci = (const uint32_t*)src;
3893 for(i=0; i<len; i+=8) {
3894 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
3895 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
3896 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
3897 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
3898 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
3899 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
3900 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
3901 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
3904 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
3906 if(min < 0 && max > 0) {
3907 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
3909 for(i=0; i < len; i+=8) {
3910 dst[i ] = av_clipf(src[i ], min, max);
3911 dst[i + 1] = av_clipf(src[i + 1], min, max);
3912 dst[i + 2] = av_clipf(src[i + 2], min, max);
3913 dst[i + 3] = av_clipf(src[i + 3], min, max);
3914 dst[i + 4] = av_clipf(src[i + 4], min, max);
3915 dst[i + 5] = av_clipf(src[i + 5], min, max);
3916 dst[i + 6] = av_clipf(src[i + 6], min, max);
3917 dst[i + 7] = av_clipf(src[i + 7], min, max);
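/* float_to_int16_one relies on the input floats being pre-scaled and biased
 * so that, reinterpreted as an int32, in-range samples fall between
 * 0x43c00000 and 0x43c0ffff: the low 16 bits then hold sample + 0x8000, and
 * the truncating store to int16_t discards the rest.  Out-of-range values are
 * detected from the higher bits and collapsed to 0 or -1, which become
 * -32768 / 32767 after the final subtraction and 16-bit store. */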
3922 static av_always_inline int float_to_int16_one(const float *src){
3923 int_fast32_t tmp = *(const int32_t*)src;
3925 tmp = (0x43c0ffff - tmp)>>31;
3926 // is this faster on some gcc/cpu combinations?
3927 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3930 return tmp - 0x8000;
3933 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
3935 for(i=0; i<len; i++)
3936 dst[i] = float_to_int16_one(src+i);
3939 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
3942 for(i=0; i<len; i++){
3943 dst[2*i] = float_to_int16_one(src[0]+i);
3944 dst[2*i+1] = float_to_int16_one(src[1]+i);
3947 for(c=0; c<channels; c++)
3948 for(i=0, j=c; i<len; i++, j+=channels)
3949 dst[j] = float_to_int16_one(src[c]+i);
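/* Integer dot-product helpers: scalarproduct_int16_c accumulates
   (v1[i]*v2[i]) >> shift; scalarproduct_and_madd_int16_c computes the same
   v1.v2 product while also updating v1 in place with v1[i] += mul * v3[i]. */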
3953 static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
3958 res += (*v1++ * *v2++) >> shift;
3963 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
3968 *v1++ += mul * *v3++;
#define W0 2048 /* even-part scale factor for b[0] and b[4], used by wmv2_idct_row/col */
3974 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3975 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3976 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3977 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3978 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3979 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3980 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
3982 static void wmv2_idct_row(short * b)
3985 int a0,a1,a2,a3,a4,a5,a6,a7;
3987 a1 = W1*b[1]+W7*b[7];
3988 a7 = W7*b[1]-W1*b[7];
3989 a5 = W5*b[5]+W3*b[3];
3990 a3 = W3*b[5]-W5*b[3];
3991 a2 = W2*b[2]+W6*b[6];
3992 a6 = W6*b[2]-W2*b[6];
3993 a0 = W0*b[0]+W0*b[4];
3994 a4 = W0*b[0]-W0*b[4];
3996 s1 = (181*(a1-a5+a7-a3)+128)>>8; // combine the odd coefficients (1,3,5,7)
3997 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3999 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4000 b[1] = (a4+a6 +s1 + (1<<7))>>8;
4001 b[2] = (a4-a6 +s2 + (1<<7))>>8;
4002 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4003 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4004 b[5] = (a4-a6 -s2 + (1<<7))>>8;
4005 b[6] = (a4+a6 -s1 + (1<<7))>>8;
4006 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4008 static void wmv2_idct_col(short * b)
4011 int a0,a1,a2,a3,a4,a5,a6,a7;
4012 /*step 1, with extended precision*/
4013 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4014 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4015 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4016 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4017 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4018 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4019 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
4020 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
4022 s1 = (181*(a1-a5+a7-a3)+128)>>8;
4023 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4025 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4026 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
4027 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
4028 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4030 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4031 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
4032 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
4033 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
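/* 2-D IDCT as two separable passes: wmv2_idct_row() over each of the 8 rows,
   then wmv2_idct_col() over each of the 8 columns of the 8x8 block. */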
4035 void ff_wmv2_idct_c(short * block){
4039 wmv2_idct_row(block+i);
4042 wmv2_idct_col(block+i);
4045 /* XXX: these wrapper functions should be removed as soon as all IDCTs are converted */
4047 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4049 ff_wmv2_idct_c(block);
4050 put_pixels_clamped_c(block, dest, line_size);
4052 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4054 ff_wmv2_idct_c(block);
4055 add_pixels_clamped_c(block, dest, line_size);
4057 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4060 put_pixels_clamped_c(block, dest, line_size);
4062 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4065 add_pixels_clamped_c(block, dest, line_size);
4068 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4071 put_pixels_clamped4_c(block, dest, line_size);
4073 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4076 add_pixels_clamped4_c(block, dest, line_size);
4079 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4082 put_pixels_clamped2_c(block, dest, line_size);
4084 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4087 add_pixels_clamped2_c(block, dest, line_size);
4090 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4092 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4094 dest[0] = cm[(block[0] + 4)>>3];
4096 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4098 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4100 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4103 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4105 /* init static data */
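/* ff_cropTbl is the identity on [0,255] with MAX_NEG_CROP guard entries on
   each side (0 below, 255 above) so out-of-range indices saturate;
   ff_squareTbl[i] holds (i-256)^2 for table-driven squaring of pixel
   differences; inv_zigzag_direct16 records each coefficient position's
   1-based index in the zigzag scan. */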
4106 av_cold void dsputil_static_init(void)
4110 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4111 for(i=0;i<MAX_NEG_CROP;i++) {
4113 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4116 for(i=0;i<512;i++) {
4117 ff_squareTbl[i] = (i - 256) * (i - 256);
4120 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4123 int ff_check_alignment(void){
4124 static int did_fail=0;
4125 DECLARE_ALIGNED(16, int, aligned);
4127 if((intptr_t)&aligned & 15){
4129 #if HAVE_MMX || HAVE_ALTIVEC
4130 av_log(NULL, AV_LOG_ERROR,
4131 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4132 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4133 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4134 "Do not report crashes to FFmpeg developers.\n");
4143 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4147 ff_check_alignment();
4150 if(avctx->dct_algo==FF_DCT_FASTINT) {
4151 c->fdct = fdct_ifast;
4152 c->fdct248 = fdct_ifast248;
4154 else if(avctx->dct_algo==FF_DCT_FAAN) {
4155 c->fdct = ff_faandct;
4156 c->fdct248 = ff_faandct248;
4159 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4160 c->fdct248 = ff_fdct248_islow;
4162 #endif //CONFIG_ENCODERS
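/* Reduced-resolution ("lowres") decoding picks correspondingly smaller
   IDCTs: lowres 1 -> 4x4, lowres 2 -> 2x2, lowres 3 -> DC only. */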
4164 if(avctx->lowres==1){
4165 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4166 c->idct_put= ff_jref_idct4_put;
4167 c->idct_add= ff_jref_idct4_add;
4169 c->idct_put= ff_h264_lowres_idct_put_c;
4170 c->idct_add= ff_h264_lowres_idct_add_c;
4172 c->idct = j_rev_dct4;
4173 c->idct_permutation_type= FF_NO_IDCT_PERM;
4174 }else if(avctx->lowres==2){
4175 c->idct_put= ff_jref_idct2_put;
4176 c->idct_add= ff_jref_idct2_add;
4177 c->idct = j_rev_dct2;
4178 c->idct_permutation_type= FF_NO_IDCT_PERM;
4179 }else if(avctx->lowres==3){
4180 c->idct_put= ff_jref_idct1_put;
4181 c->idct_add= ff_jref_idct1_add;
4182 c->idct = j_rev_dct1;
4183 c->idct_permutation_type= FF_NO_IDCT_PERM;
4185 if(avctx->idct_algo==FF_IDCT_INT){
4186 c->idct_put= ff_jref_idct_put;
4187 c->idct_add= ff_jref_idct_add;
4188 c->idct = j_rev_dct;
4189 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4190 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4191 avctx->idct_algo==FF_IDCT_VP3){
4192 c->idct_put= ff_vp3_idct_put_c;
4193 c->idct_add= ff_vp3_idct_add_c;
4194 c->idct = ff_vp3_idct_c;
4195 c->idct_permutation_type= FF_NO_IDCT_PERM;
4196 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4197 c->idct_put= ff_wmv2_idct_put_c;
4198 c->idct_add= ff_wmv2_idct_add_c;
4199 c->idct = ff_wmv2_idct_c;
4200 c->idct_permutation_type= FF_NO_IDCT_PERM;
4201 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4202 c->idct_put= ff_faanidct_put;
4203 c->idct_add= ff_faanidct_add;
4204 c->idct = ff_faanidct;
4205 c->idct_permutation_type= FF_NO_IDCT_PERM;
4206 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4207 c->idct_put= ff_ea_idct_put_c;
4208 c->idct_permutation_type= FF_NO_IDCT_PERM;
4209 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4210 c->idct = ff_bink_idct_c;
4211 c->idct_add = ff_bink_idct_add_c;
4212 c->idct_put = ff_bink_idct_put_c;
4213 c->idct_permutation_type = FF_NO_IDCT_PERM;
4214 }else{ //accurate/default
4215 c->idct_put= ff_simple_idct_put;
4216 c->idct_add= ff_simple_idct_add;
4217 c->idct = ff_simple_idct;
4218 c->idct_permutation_type= FF_NO_IDCT_PERM;
4222 c->get_pixels = get_pixels_c;
4223 c->diff_pixels = diff_pixels_c;
4224 c->put_pixels_clamped = put_pixels_clamped_c;
4225 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4226 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4227 c->add_pixels_clamped = add_pixels_clamped_c;
4228 c->add_pixels8 = add_pixels8_c;
4229 c->add_pixels4 = add_pixels4_c;
4230 c->sum_abs_dctelem = sum_abs_dctelem_c;
4233 c->clear_block = clear_block_c;
4234 c->clear_blocks = clear_blocks_c;
4235 c->pix_sum = pix_sum_c;
4236 c->pix_norm1 = pix_norm1_c;
4238 c->fill_block_tab[0] = fill_block16_c;
4239 c->fill_block_tab[1] = fill_block8_c;
4240 c->scale_block = scale_block_c;
4242 /* pix_abs[0][*] compare 16-pixel-wide blocks, pix_abs[1][*] 8-pixel-wide blocks */
4243 c->pix_abs[0][0] = pix_abs16_c;
4244 c->pix_abs[0][1] = pix_abs16_x2_c;
4245 c->pix_abs[0][2] = pix_abs16_y2_c;
4246 c->pix_abs[0][3] = pix_abs16_xy2_c;
4247 c->pix_abs[1][0] = pix_abs8_c;
4248 c->pix_abs[1][1] = pix_abs8_x2_c;
4249 c->pix_abs[1][2] = pix_abs8_y2_c;
4250 c->pix_abs[1][3] = pix_abs8_xy2_c;
4252 #define dspfunc(PFX, IDX, NUM) \
4253 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4254 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4255 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4256 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4258 dspfunc(put, 0, 16);
4259 dspfunc(put_no_rnd, 0, 16);
4261 dspfunc(put_no_rnd, 1, 8);
4265 dspfunc(avg, 0, 16);
4266 dspfunc(avg_no_rnd, 0, 16);
4268 dspfunc(avg_no_rnd, 1, 8);
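/* The pixels_tab tables filled above are indexed first by block width
   (0 = 16 pixels, 1 = 8) and then by half-pel position: [0] plain copy,
   [1] horizontal half-pel, [2] vertical half-pel, [3] both. A caller would
   typically select an entry along the lines of
       dxy = ((motion_y & 1) << 1) | (motion_x & 1);
       c->put_pixels_tab[0][dxy](dst, src, stride, 16);
   (illustrative sketch only, not a specific call site). */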
4273 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4274 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4276 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4277 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4278 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4279 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4280 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4281 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4282 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4283 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4284 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4286 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4287 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4288 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4289 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4290 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4291 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4292 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4293 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4294 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
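/* Thirdpel MC tables (used e.g. by SVQ3): entries are indexed as y*4 + x
   with x, y in 0..2, which is why slots 3 and 7 stay unset. */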
4296 #define dspfunc(PFX, IDX, NUM) \
4297 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4298 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4299 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4300 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4301 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4302 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4303 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4304 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4305 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4306 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4307 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4308 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4309 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4310 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4311 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4312 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
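/* In the quarter-pel tables filled below, _mcXY_c means X quarter-pel steps
   to the right and Y quarter-pel steps down; the table index is Y*4 + X
   (e.g. [6] = _mc21_c: half-pel horizontally, quarter-pel vertically). */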
4314 dspfunc(put_qpel, 0, 16);
4315 dspfunc(put_no_rnd_qpel, 0, 16);
4317 dspfunc(avg_qpel, 0, 16);
4318 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4320 dspfunc(put_qpel, 1, 8);
4321 dspfunc(put_no_rnd_qpel, 1, 8);
4323 dspfunc(avg_qpel, 1, 8);
4324 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4326 dspfunc(put_h264_qpel, 0, 16);
4327 dspfunc(put_h264_qpel, 1, 8);
4328 dspfunc(put_h264_qpel, 2, 4);
4329 dspfunc(put_h264_qpel, 3, 2);
4330 dspfunc(avg_h264_qpel, 0, 16);
4331 dspfunc(avg_h264_qpel, 1, 8);
4332 dspfunc(avg_h264_qpel, 2, 4);
4335 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4336 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4337 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4338 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4339 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4340 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4341 c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4342 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4344 c->draw_edges = draw_edges_c;
4346 #if CONFIG_CAVS_DECODER
4347 ff_cavsdsp_init(c,avctx);
4350 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4351 ff_mlp_init(c, avctx);
4353 #if CONFIG_VC1_DECODER
4354 ff_vc1dsp_init(c,avctx);
4356 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4357 ff_intrax8dsp_init(c,avctx);
4359 #if CONFIG_RV30_DECODER
4360 ff_rv30dsp_init(c,avctx);
4362 #if CONFIG_RV40_DECODER
4363 ff_rv40dsp_init(c,avctx);
4364 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4365 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4366 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4367 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4370 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4371 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4372 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4373 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4374 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4375 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4376 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4377 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4379 #define SET_CMP_FUNC(name) \
4380 c->name[0]= name ## 16_c;\
4381 c->name[1]= name ## 8x8_c;
4383 SET_CMP_FUNC(hadamard8_diff)
4384 c->hadamard8_diff[4]= hadamard8_intra16_c;
4385 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4386 SET_CMP_FUNC(dct_sad)
4387 SET_CMP_FUNC(dct_max)
4389 SET_CMP_FUNC(dct264_sad)
4391 c->sad[0]= pix_abs16_c;
4392 c->sad[1]= pix_abs8_c;
4396 SET_CMP_FUNC(quant_psnr)
4399 c->vsad[0]= vsad16_c;
4400 c->vsad[4]= vsad_intra16_c;
4401 c->vsad[5]= vsad_intra8_c;
4402 c->vsse[0]= vsse16_c;
4403 c->vsse[4]= vsse_intra16_c;
4404 c->vsse[5]= vsse_intra8_c;
4405 c->nsse[0]= nsse16_c;
4406 c->nsse[1]= nsse8_c;
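/* The comparison functions above (SAD, Hadamard/SATD, DCT- and
   variance-based metrics) are used mainly by motion estimation and mode
   decision; index [0] works on 16x16 blocks, [1] on 8x8, and the [4]/[5]
   slots hold the intra (single-block) variants. */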
4408 ff_dsputil_init_dwt(c);
4411 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4413 c->add_bytes= add_bytes_c;
4414 c->add_bytes_l2= add_bytes_l2_c;
4415 c->diff_bytes= diff_bytes_c;
4416 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4417 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4418 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
4419 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4420 c->bswap_buf= bswap_buf;
4421 #if CONFIG_PNG_DECODER
4422 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4425 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4426 c->h263_h_loop_filter= h263_h_loop_filter_c;
4427 c->h263_v_loop_filter= h263_v_loop_filter_c;
4430 if (CONFIG_VP3_DECODER) {
4431 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4432 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4433 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
4435 if (CONFIG_VP6_DECODER) {
4436 c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4439 c->h261_loop_filter= h261_loop_filter_c;
4441 c->try_8x8basis= try_8x8basis_c;
4442 c->add_8x8basis= add_8x8basis_c;
4444 #if CONFIG_VORBIS_DECODER
4445 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4447 #if CONFIG_AC3_DECODER
4448 c->ac3_downmix = ff_ac3_downmix_c;
4451 c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4453 c->vector_fmul = vector_fmul_c;
4454 c->vector_fmul_reverse = vector_fmul_reverse_c;
4455 c->vector_fmul_add = vector_fmul_add_c;
4456 c->vector_fmul_window = ff_vector_fmul_window_c;
4457 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4458 c->vector_clipf = vector_clipf_c;
4459 c->float_to_int16 = ff_float_to_int16_c;
4460 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4461 c->scalarproduct_int16 = scalarproduct_int16_c;
4462 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4463 c->scalarproduct_float = scalarproduct_float_c;
4464 c->butterflies_float = butterflies_float_c;
4465 c->vector_fmul_scalar = vector_fmul_scalar_c;
4467 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4468 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4470 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4471 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4473 c->shrink[0]= ff_img_copy_plane;
4474 c->shrink[1]= ff_shrink22;
4475 c->shrink[2]= ff_shrink44;
4476 c->shrink[3]= ff_shrink88;
4478 c->prefetch= just_return;
4480 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4481 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4483 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
4484 if (ARCH_ARM) dsputil_init_arm (c, avctx);
4485 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
4486 if (HAVE_VIS) dsputil_init_vis (c, avctx);
4487 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
4488 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
4489 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
4490 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
4491 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
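/* The architecture-specific initializers above may override any of the C
   function pointers set earlier with optimized implementations. */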
4493 for(i=0; i<64; i++){
4494 if(!c->put_2tap_qpel_pixels_tab[0][i])
4495 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4496 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4497 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4500 c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4501 c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4502 c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4503 c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
4505 c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
4506 c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
4507 c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
4508 c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
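/* Each IDCT implementation expects its input coefficients in a particular
   memory order; idct_permutation maps a coefficient's natural position to
   the slot the selected IDCT reads it from, so coefficients can be stored
   pre-permuted at parse time instead of being reordered for every block. */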
4510 switch(c->idct_permutation_type){
4511 case FF_NO_IDCT_PERM:
4513 c->idct_permutation[i]= i;
4515 case FF_LIBMPEG2_IDCT_PERM:
4517 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4519 case FF_SIMPLE_IDCT_PERM:
4521 c->idct_permutation[i]= simple_mmx_permutation[i];
4523 case FF_TRANSPOSE_IDCT_PERM:
4525 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4527 case FF_PARTTRANS_IDCT_PERM:
4529 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4531 case FF_SSE2_IDCT_PERM:
4533 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4536 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");