3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavcore/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
44 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
45 uint32_t ff_squareTbl[512] = {0, };
47 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
48 #define pb_7f (~0UL/255 * 0x7f)
49 #define pb_80 (~0UL/255 * 0x80)
51 const uint8_t ff_zigzag_direct[64] = {
52 0, 1, 8, 16, 9, 2, 3, 10,
53 17, 24, 32, 25, 18, 11, 4, 5,
54 12, 19, 26, 33, 40, 48, 41, 34,
55 27, 20, 13, 6, 7, 14, 21, 28,
56 35, 42, 49, 56, 57, 50, 43, 36,
57 29, 22, 15, 23, 30, 37, 44, 51,
58 58, 59, 52, 45, 38, 31, 39, 46,
59 53, 60, 61, 54, 47, 55, 62, 63
62 /* Specific zigzag scan for 248 idct. NOTE that unlike the
63 specification, we interleave the fields */
64 const uint8_t ff_zigzag248_direct[64] = {
65 0, 8, 1, 9, 16, 24, 2, 10,
66 17, 25, 32, 40, 48, 56, 33, 41,
67 18, 26, 3, 11, 4, 12, 19, 27,
68 34, 42, 49, 57, 50, 58, 35, 43,
69 20, 28, 5, 13, 6, 14, 21, 29,
70 36, 44, 51, 59, 52, 60, 37, 45,
71 22, 30, 7, 15, 23, 31, 38, 46,
72 53, 61, 54, 62, 39, 47, 55, 63,
75 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
76 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
78 const uint8_t ff_alternate_horizontal_scan[64] = {
79 0, 1, 2, 3, 8, 9, 16, 17,
80 10, 11, 4, 5, 6, 7, 15, 14,
81 13, 12, 19, 18, 24, 25, 32, 33,
82 26, 27, 20, 21, 22, 23, 28, 29,
83 30, 31, 34, 35, 40, 41, 48, 49,
84 42, 43, 36, 37, 38, 39, 44, 45,
85 46, 47, 50, 51, 56, 57, 58, 59,
86 52, 53, 54, 55, 60, 61, 62, 63,
89 const uint8_t ff_alternate_vertical_scan[64] = {
90 0, 8, 16, 24, 1, 9, 2, 10,
91 17, 25, 32, 40, 48, 56, 57, 49,
92 41, 33, 26, 18, 3, 11, 4, 12,
93 19, 27, 34, 42, 50, 58, 35, 43,
94 51, 59, 20, 28, 5, 13, 6, 14,
95 21, 29, 36, 44, 52, 60, 37, 45,
96 53, 61, 22, 30, 7, 15, 23, 31,
97 38, 46, 54, 62, 39, 47, 55, 63,
100 /* Input permutation for the simple_idct_mmx */
101 static const uint8_t simple_mmx_permutation[64]={
102 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
103 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
104 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
105 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
106 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
107 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
108 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
109 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
112 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
114 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
118 st->scantable= src_scantable;
122 j = src_scantable[i];
123 st->permutated[i] = permutation[j];
132 j = st->permutated[i];
134 st->raster_end[i]= end;
138 static int pix_sum_c(uint8_t * pix, int line_size)
143 for (i = 0; i < 16; i++) {
144 for (j = 0; j < 16; j += 8) {
155 pix += line_size - 16;
160 static int pix_norm1_c(uint8_t * pix, int line_size)
163 uint32_t *sq = ff_squareTbl + 256;
166 for (i = 0; i < 16; i++) {
167 for (j = 0; j < 16; j += 8) {
178 #if LONG_MAX > 2147483647
179 register uint64_t x=*(uint64_t*)pix;
181 s += sq[(x>>8)&0xff];
182 s += sq[(x>>16)&0xff];
183 s += sq[(x>>24)&0xff];
184 s += sq[(x>>32)&0xff];
185 s += sq[(x>>40)&0xff];
186 s += sq[(x>>48)&0xff];
187 s += sq[(x>>56)&0xff];
189 register uint32_t x=*(uint32_t*)pix;
191 s += sq[(x>>8)&0xff];
192 s += sq[(x>>16)&0xff];
193 s += sq[(x>>24)&0xff];
194 x=*(uint32_t*)(pix+4);
196 s += sq[(x>>8)&0xff];
197 s += sq[(x>>16)&0xff];
198 s += sq[(x>>24)&0xff];
203 pix += line_size - 16;
208 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
211 for(i=0; i+8<=w; i+=8){
212 dst[i+0]= av_bswap32(src[i+0]);
213 dst[i+1]= av_bswap32(src[i+1]);
214 dst[i+2]= av_bswap32(src[i+2]);
215 dst[i+3]= av_bswap32(src[i+3]);
216 dst[i+4]= av_bswap32(src[i+4]);
217 dst[i+5]= av_bswap32(src[i+5]);
218 dst[i+6]= av_bswap32(src[i+6]);
219 dst[i+7]= av_bswap32(src[i+7]);
222 dst[i+0]= av_bswap32(src[i+0]);
226 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
229 uint32_t *sq = ff_squareTbl + 256;
232 for (i = 0; i < h; i++) {
233 s += sq[pix1[0] - pix2[0]];
234 s += sq[pix1[1] - pix2[1]];
235 s += sq[pix1[2] - pix2[2]];
236 s += sq[pix1[3] - pix2[3]];
243 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
246 uint32_t *sq = ff_squareTbl + 256;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[0] - pix2[0]];
251 s += sq[pix1[1] - pix2[1]];
252 s += sq[pix1[2] - pix2[2]];
253 s += sq[pix1[3] - pix2[3]];
254 s += sq[pix1[4] - pix2[4]];
255 s += sq[pix1[5] - pix2[5]];
256 s += sq[pix1[6] - pix2[6]];
257 s += sq[pix1[7] - pix2[7]];
264 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
267 uint32_t *sq = ff_squareTbl + 256;
270 for (i = 0; i < h; i++) {
271 s += sq[pix1[ 0] - pix2[ 0]];
272 s += sq[pix1[ 1] - pix2[ 1]];
273 s += sq[pix1[ 2] - pix2[ 2]];
274 s += sq[pix1[ 3] - pix2[ 3]];
275 s += sq[pix1[ 4] - pix2[ 4]];
276 s += sq[pix1[ 5] - pix2[ 5]];
277 s += sq[pix1[ 6] - pix2[ 6]];
278 s += sq[pix1[ 7] - pix2[ 7]];
279 s += sq[pix1[ 8] - pix2[ 8]];
280 s += sq[pix1[ 9] - pix2[ 9]];
281 s += sq[pix1[10] - pix2[10]];
282 s += sq[pix1[11] - pix2[11]];
283 s += sq[pix1[12] - pix2[12]];
284 s += sq[pix1[13] - pix2[13]];
285 s += sq[pix1[14] - pix2[14]];
286 s += sq[pix1[15] - pix2[15]];
294 /* draw the edges of width 'w' of an image of size width, height */
295 //FIXME check that this is ok for mpeg4 interlaced
296 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
298 uint8_t *ptr, *last_line;
301 last_line = buf + (height - 1) * wrap;
304 memcpy(buf - (i + 1) * wrap, buf, width);
305 memcpy(last_line + (i + 1) * wrap, last_line, width);
309 for(i=0;i<height;i++) {
310 memset(ptr - w, ptr[0], w);
311 memset(ptr + width, ptr[width-1], w);
316 memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
317 memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
318 memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
319 memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
324 * Copy a rectangular area of samples to a temporary buffer and replicate the border samples.
325 * @param buf destination buffer
326 * @param src source buffer
327 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
328 * @param block_w width of block
329 * @param block_h height of block
330 * @param src_x x coordinate of the top left sample of the block in the source buffer
331 * @param src_y y coordinate of the top left sample of the block in the source buffer
332 * @param w width of the source buffer
333 * @param h height of the source buffer
335 void ff_emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize, int block_w, int block_h,
336 int src_x, int src_y, int w, int h){
338 int start_y, start_x, end_y, end_x;
341 src+= (h-1-src_y)*linesize;
343 }else if(src_y<=-block_h){
344 src+= (1-block_h-src_y)*linesize;
350 }else if(src_x<=-block_w){
351 src+= (1-block_w-src_x);
355 start_y= FFMAX(0, -src_y);
356 start_x= FFMAX(0, -src_x);
357 end_y= FFMIN(block_h, h-src_y);
358 end_x= FFMIN(block_w, w-src_x);
360 // copy existing part
361 for(y=start_y; y<end_y; y++){
362 for(x=start_x; x<end_x; x++){
363 buf[x + y*linesize]= src[x + y*linesize];
368 for(y=0; y<start_y; y++){
369 for(x=start_x; x<end_x; x++){
370 buf[x + y*linesize]= buf[x + start_y*linesize];
375 for(y=end_y; y<block_h; y++){
376 for(x=start_x; x<end_x; x++){
377 buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
381 for(y=0; y<block_h; y++){
383 for(x=0; x<start_x; x++){
384 buf[x + y*linesize]= buf[start_x + y*linesize];
388 for(x=end_x; x<block_w; x++){
389 buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
394 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
398 /* read the pixels */
400 block[0] = pixels[0];
401 block[1] = pixels[1];
402 block[2] = pixels[2];
403 block[3] = pixels[3];
404 block[4] = pixels[4];
405 block[5] = pixels[5];
406 block[6] = pixels[6];
407 block[7] = pixels[7];
413 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
414 const uint8_t *s2, int stride){
417 /* read the pixels */
419 block[0] = s1[0] - s2[0];
420 block[1] = s1[1] - s2[1];
421 block[2] = s1[2] - s2[2];
422 block[3] = s1[3] - s2[3];
423 block[4] = s1[4] - s2[4];
424 block[5] = s1[5] - s2[5];
425 block[6] = s1[6] - s2[6];
426 block[7] = s1[7] - s2[7];
434 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
438 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
440 /* read the pixels */
442 pixels[0] = cm[block[0]];
443 pixels[1] = cm[block[1]];
444 pixels[2] = cm[block[2]];
445 pixels[3] = cm[block[3]];
446 pixels[4] = cm[block[4]];
447 pixels[5] = cm[block[5]];
448 pixels[6] = cm[block[6]];
449 pixels[7] = cm[block[7]];
456 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
460 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
462 /* read the pixels */
464 pixels[0] = cm[block[0]];
465 pixels[1] = cm[block[1]];
466 pixels[2] = cm[block[2]];
467 pixels[3] = cm[block[3]];
474 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
478 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
480 /* read the pixels */
482 pixels[0] = cm[block[0]];
483 pixels[1] = cm[block[1]];
490 static void put_signed_pixels_clamped_c(const DCTELEM *block,
491 uint8_t *restrict pixels,
496 for (i = 0; i < 8; i++) {
497 for (j = 0; j < 8; j++) {
500 else if (*block > 127)
503 *pixels = (uint8_t)(*block + 128);
507 pixels += (line_size - 8);
511 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
516 /* read the pixels */
518 pixels[0] = block[0];
519 pixels[1] = block[1];
520 pixels[2] = block[2];
521 pixels[3] = block[3];
522 pixels[4] = block[4];
523 pixels[5] = block[5];
524 pixels[6] = block[6];
525 pixels[7] = block[7];
532 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
536 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
538 /* read the pixels */
540 pixels[0] = cm[pixels[0] + block[0]];
541 pixels[1] = cm[pixels[1] + block[1]];
542 pixels[2] = cm[pixels[2] + block[2]];
543 pixels[3] = cm[pixels[3] + block[3]];
544 pixels[4] = cm[pixels[4] + block[4]];
545 pixels[5] = cm[pixels[5] + block[5]];
546 pixels[6] = cm[pixels[6] + block[6]];
547 pixels[7] = cm[pixels[7] + block[7]];
553 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
557 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
559 /* read the pixels */
561 pixels[0] = cm[pixels[0] + block[0]];
562 pixels[1] = cm[pixels[1] + block[1]];
563 pixels[2] = cm[pixels[2] + block[2]];
564 pixels[3] = cm[pixels[3] + block[3]];
570 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
574 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
576 /* read the pixels */
578 pixels[0] = cm[pixels[0] + block[0]];
579 pixels[1] = cm[pixels[1] + block[1]];
585 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
589 pixels[0] += block[0];
590 pixels[1] += block[1];
591 pixels[2] += block[2];
592 pixels[3] += block[3];
593 pixels[4] += block[4];
594 pixels[5] += block[5];
595 pixels[6] += block[6];
596 pixels[7] += block[7];
602 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
606 pixels[0] += block[0];
607 pixels[1] += block[1];
608 pixels[2] += block[2];
609 pixels[3] += block[3];
615 static int sum_abs_dctelem_c(DCTELEM *block)
619 sum+= FFABS(block[i]);
623 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
627 for (i = 0; i < h; i++) {
628 memset(block, value, 16);
633 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
637 for (i = 0; i < h; i++) {
638 memset(block, value, 8);
643 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
646 uint16_t *dst1 = (uint16_t *) dst;
647 uint16_t *dst2 = (uint16_t *)(dst + linesize);
649 for (j = 0; j < 8; j++) {
650 for (i = 0; i < 8; i++) {
651 dst1[i] = dst2[i] = src[i] * 0x0101;
661 #define PIXOP2(OPNAME, OP) \
662 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
666 OP(*((uint64_t*)block), AV_RN64(pixels));\
672 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
676 const uint64_t a= AV_RN64(pixels );\
677 const uint64_t b= AV_RN64(pixels+1);\
678 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
684 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
688 const uint64_t a= AV_RN64(pixels );\
689 const uint64_t b= AV_RN64(pixels+1);\
690 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
696 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
700 const uint64_t a= AV_RN64(pixels );\
701 const uint64_t b= AV_RN64(pixels+line_size);\
702 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
708 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
712 const uint64_t a= AV_RN64(pixels );\
713 const uint64_t b= AV_RN64(pixels+line_size);\
714 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
720 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
723 const uint64_t a= AV_RN64(pixels );\
724 const uint64_t b= AV_RN64(pixels+1);\
725 uint64_t l0= (a&0x0303030303030303ULL)\
726 + (b&0x0303030303030303ULL)\
727 + 0x0202020202020202ULL;\
728 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
729 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
733 for(i=0; i<h; i+=2){\
734 uint64_t a= AV_RN64(pixels );\
735 uint64_t b= AV_RN64(pixels+1);\
736 l1= (a&0x0303030303030303ULL)\
737 + (b&0x0303030303030303ULL);\
738 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
739 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
740 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
743 a= AV_RN64(pixels );\
744 b= AV_RN64(pixels+1);\
745 l0= (a&0x0303030303030303ULL)\
746 + (b&0x0303030303030303ULL)\
747 + 0x0202020202020202ULL;\
748 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
749 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
750 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
756 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
759 const uint64_t a= AV_RN64(pixels );\
760 const uint64_t b= AV_RN64(pixels+1);\
761 uint64_t l0= (a&0x0303030303030303ULL)\
762 + (b&0x0303030303030303ULL)\
763 + 0x0101010101010101ULL;\
764 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
765 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
769 for(i=0; i<h; i+=2){\
770 uint64_t a= AV_RN64(pixels );\
771 uint64_t b= AV_RN64(pixels+1);\
772 l1= (a&0x0303030303030303ULL)\
773 + (b&0x0303030303030303ULL);\
774 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
775 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
776 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
779 a= AV_RN64(pixels );\
780 b= AV_RN64(pixels+1);\
781 l0= (a&0x0303030303030303ULL)\
782 + (b&0x0303030303030303ULL)\
783 + 0x0101010101010101ULL;\
784 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
785 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
786 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
792 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
793 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
794 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
795 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
796 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
797 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
798 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
800 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
801 #else // 64 bit variant
803 #define PIXOP2(OPNAME, OP) \
804 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
807 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
812 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
815 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
820 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
823 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
824 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
829 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
830 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
833 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
834 int src_stride1, int src_stride2, int h){\
838 a= AV_RN32(&src1[i*src_stride1 ]);\
839 b= AV_RN32(&src2[i*src_stride2 ]);\
840 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
841 a= AV_RN32(&src1[i*src_stride1+4]);\
842 b= AV_RN32(&src2[i*src_stride2+4]);\
843 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
847 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
848 int src_stride1, int src_stride2, int h){\
852 a= AV_RN32(&src1[i*src_stride1 ]);\
853 b= AV_RN32(&src2[i*src_stride2 ]);\
854 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
855 a= AV_RN32(&src1[i*src_stride1+4]);\
856 b= AV_RN32(&src2[i*src_stride2+4]);\
857 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
861 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
862 int src_stride1, int src_stride2, int h){\
866 a= AV_RN32(&src1[i*src_stride1 ]);\
867 b= AV_RN32(&src2[i*src_stride2 ]);\
868 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
872 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
873 int src_stride1, int src_stride2, int h){\
877 a= AV_RN16(&src1[i*src_stride1 ]);\
878 b= AV_RN16(&src2[i*src_stride2 ]);\
879 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
883 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
884 int src_stride1, int src_stride2, int h){\
885 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
886 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
889 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
890 int src_stride1, int src_stride2, int h){\
891 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
892 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
895 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
899 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
900 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
903 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
904 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
907 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
908 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
911 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
912 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
915 uint32_t a, b, c, d, l0, l1, h0, h1;\
916 a= AV_RN32(&src1[i*src_stride1]);\
917 b= AV_RN32(&src2[i*src_stride2]);\
918 c= AV_RN32(&src3[i*src_stride3]);\
919 d= AV_RN32(&src4[i*src_stride4]);\
920 l0= (a&0x03030303UL)\
923 h0= ((a&0xFCFCFCFCUL)>>2)\
924 + ((b&0xFCFCFCFCUL)>>2);\
925 l1= (c&0x03030303UL)\
927 h1= ((c&0xFCFCFCFCUL)>>2)\
928 + ((d&0xFCFCFCFCUL)>>2);\
929 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
930 a= AV_RN32(&src1[i*src_stride1+4]);\
931 b= AV_RN32(&src2[i*src_stride2+4]);\
932 c= AV_RN32(&src3[i*src_stride3+4]);\
933 d= AV_RN32(&src4[i*src_stride4+4]);\
934 l0= (a&0x03030303UL)\
937 h0= ((a&0xFCFCFCFCUL)>>2)\
938 + ((b&0xFCFCFCFCUL)>>2);\
939 l1= (c&0x03030303UL)\
941 h1= ((c&0xFCFCFCFCUL)>>2)\
942 + ((d&0xFCFCFCFCUL)>>2);\
943 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
947 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
948 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
951 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
952 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
955 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
956 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
959 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
960 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
963 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
964 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
967 uint32_t a, b, c, d, l0, l1, h0, h1;\
968 a= AV_RN32(&src1[i*src_stride1]);\
969 b= AV_RN32(&src2[i*src_stride2]);\
970 c= AV_RN32(&src3[i*src_stride3]);\
971 d= AV_RN32(&src4[i*src_stride4]);\
972 l0= (a&0x03030303UL)\
975 h0= ((a&0xFCFCFCFCUL)>>2)\
976 + ((b&0xFCFCFCFCUL)>>2);\
977 l1= (c&0x03030303UL)\
979 h1= ((c&0xFCFCFCFCUL)>>2)\
980 + ((d&0xFCFCFCFCUL)>>2);\
981 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
982 a= AV_RN32(&src1[i*src_stride1+4]);\
983 b= AV_RN32(&src2[i*src_stride2+4]);\
984 c= AV_RN32(&src3[i*src_stride3+4]);\
985 d= AV_RN32(&src4[i*src_stride4+4]);\
986 l0= (a&0x03030303UL)\
989 h0= ((a&0xFCFCFCFCUL)>>2)\
990 + ((b&0xFCFCFCFCUL)>>2);\
991 l1= (c&0x03030303UL)\
993 h1= ((c&0xFCFCFCFCUL)>>2)\
994 + ((d&0xFCFCFCFCUL)>>2);\
995 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
998 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
999 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1000 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1001 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1003 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, const uint8_t *src3, const uint8_t *src4,\
1004 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1005 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1006 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1009 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1011 int i, a0, b0, a1, b1;\
1018 for(i=0; i<h; i+=2){\
1024 block[0]= (a1+a0)>>2; /* FIXME non put */\
1025 block[1]= (b1+b0)>>2;\
1035 block[0]= (a1+a0)>>2;\
1036 block[1]= (b1+b0)>>2;\
1042 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1045 const uint32_t a= AV_RN32(pixels );\
1046 const uint32_t b= AV_RN32(pixels+1);\
1047 uint32_t l0= (a&0x03030303UL)\
1050 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1051 + ((b&0xFCFCFCFCUL)>>2);\
1055 for(i=0; i<h; i+=2){\
1056 uint32_t a= AV_RN32(pixels );\
1057 uint32_t b= AV_RN32(pixels+1);\
1058 l1= (a&0x03030303UL)\
1059 + (b&0x03030303UL);\
1060 h1= ((a&0xFCFCFCFCUL)>>2)\
1061 + ((b&0xFCFCFCFCUL)>>2);\
1062 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1065 a= AV_RN32(pixels );\
1066 b= AV_RN32(pixels+1);\
1067 l0= (a&0x03030303UL)\
1070 h0= ((a&0xFCFCFCFCUL)>>2)\
1071 + ((b&0xFCFCFCFCUL)>>2);\
1072 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1078 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1081 for(j=0; j<2; j++){\
1083 const uint32_t a= AV_RN32(pixels );\
1084 const uint32_t b= AV_RN32(pixels+1);\
1085 uint32_t l0= (a&0x03030303UL)\
1088 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1089 + ((b&0xFCFCFCFCUL)>>2);\
1093 for(i=0; i<h; i+=2){\
1094 uint32_t a= AV_RN32(pixels );\
1095 uint32_t b= AV_RN32(pixels+1);\
1096 l1= (a&0x03030303UL)\
1097 + (b&0x03030303UL);\
1098 h1= ((a&0xFCFCFCFCUL)>>2)\
1099 + ((b&0xFCFCFCFCUL)>>2);\
1100 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1103 a= AV_RN32(pixels );\
1104 b= AV_RN32(pixels+1);\
1105 l0= (a&0x03030303UL)\
1108 h0= ((a&0xFCFCFCFCUL)>>2)\
1109 + ((b&0xFCFCFCFCUL)>>2);\
1110 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1114 pixels+=4-line_size*(h+1);\
1115 block +=4-line_size*h;\
1119 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1122 for(j=0; j<2; j++){\
1124 const uint32_t a= AV_RN32(pixels );\
1125 const uint32_t b= AV_RN32(pixels+1);\
1126 uint32_t l0= (a&0x03030303UL)\
1129 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1130 + ((b&0xFCFCFCFCUL)>>2);\
1134 for(i=0; i<h; i+=2){\
1135 uint32_t a= AV_RN32(pixels );\
1136 uint32_t b= AV_RN32(pixels+1);\
1137 l1= (a&0x03030303UL)\
1138 + (b&0x03030303UL);\
1139 h1= ((a&0xFCFCFCFCUL)>>2)\
1140 + ((b&0xFCFCFCFCUL)>>2);\
1141 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1144 a= AV_RN32(pixels );\
1145 b= AV_RN32(pixels+1);\
1146 l0= (a&0x03030303UL)\
1149 h0= ((a&0xFCFCFCFCUL)>>2)\
1150 + ((b&0xFCFCFCFCUL)>>2);\
1151 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1155 pixels+=4-line_size*(h+1);\
1156 block +=4-line_size*h;\
1160 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1161 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1162 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1163 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1164 av_unused CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1165 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1166 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1167 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1169 #define op_avg(a, b) a = rnd_avg32(a, b)
1171 #define op_put(a, b) a = b
1178 #define put_no_rnd_pixels8_c put_pixels8_c
1179 #define put_no_rnd_pixels16_c put_pixels16_c
1181 #define avg2(a,b) ((a+b+1)>>1)
1182 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
1184 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1185 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1188 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1189 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
1192 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1194 const int A=(16-x16)*(16-y16);
1195 const int B=( x16)*(16-y16);
1196 const int C=(16-x16)*( y16);
1197 const int D=( x16)*( y16);
1202 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1203 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1204 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1205 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1206 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1207 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1208 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1209 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
1215 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1216 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1219 const int s= 1<<shift;
1229 for(x=0; x<8; x++){ //XXX FIXME optimize
1230 int src_x, src_y, frac_x, frac_y, index;
1234 frac_x= src_x&(s-1);
1235 frac_y= src_y&(s-1);
1239 if((unsigned)src_x < width){
1240 if((unsigned)src_y < height){
1241 index= src_x + src_y*stride;
1242 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1243 + src[index +1]* frac_x )*(s-frac_y)
1244 + ( src[index+stride ]*(s-frac_x)
1245 + src[index+stride+1]* frac_x )* frac_y
1248 index= src_x + av_clip(src_y, 0, height)*stride;
1249 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1250 + src[index +1]* frac_x )*s
1254 if((unsigned)src_y < height){
1255 index= av_clip(src_x, 0, width) + src_y*stride;
1256 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1257 + src[index+stride ]* frac_y )*s
1260 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1261 dst[y*stride + x]= src[index ];
1273 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1275 case 2: put_pixels2_c (dst, src, stride, height); break;
1276 case 4: put_pixels4_c (dst, src, stride, height); break;
1277 case 8: put_pixels8_c (dst, src, stride, height); break;
1278 case 16:put_pixels16_c(dst, src, stride, height); break;
1282 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1284 for (i=0; i < height; i++) {
1285 for (j=0; j < width; j++) {
1286 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1293 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1295 for (i=0; i < height; i++) {
1296 for (j=0; j < width; j++) {
1297 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1304 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1306 for (i=0; i < height; i++) {
1307 for (j=0; j < width; j++) {
1308 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1315 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1317 for (i=0; i < height; i++) {
1318 for (j=0; j < width; j++) {
1319 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1326 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1328 for (i=0; i < height; i++) {
1329 for (j=0; j < width; j++) {
1330 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1337 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1339 for (i=0; i < height; i++) {
1340 for (j=0; j < width; j++) {
1341 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1348 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1350 for (i=0; i < height; i++) {
1351 for (j=0; j < width; j++) {
1352 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1359 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1361 for (i=0; i < height; i++) {
1362 for (j=0; j < width; j++) {
1363 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1370 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1372 case 2: avg_pixels2_c (dst, src, stride, height); break;
1373 case 4: avg_pixels4_c (dst, src, stride, height); break;
1374 case 8: avg_pixels8_c (dst, src, stride, height); break;
1375 case 16:avg_pixels16_c(dst, src, stride, height); break;
1379 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1381 for (i=0; i < height; i++) {
1382 for (j=0; j < width; j++) {
1383 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1390 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1392 for (i=0; i < height; i++) {
1393 for (j=0; j < width; j++) {
1394 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1401 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1403 for (i=0; i < height; i++) {
1404 for (j=0; j < width; j++) {
1405 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1412 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1414 for (i=0; i < height; i++) {
1415 for (j=0; j < width; j++) {
1416 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1423 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1425 for (i=0; i < height; i++) {
1426 for (j=0; j < width; j++) {
1427 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1434 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1436 for (i=0; i < height; i++) {
1437 for (j=0; j < width; j++) {
1438 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1445 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1447 for (i=0; i < height; i++) {
1448 for (j=0; j < width; j++) {
1449 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1456 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1458 for (i=0; i < height; i++) {
1459 for (j=0; j < width; j++) {
1460 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1467 #define TPEL_WIDTH(width)\
1468 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1469 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1470 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1471 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1472 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1473 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1474 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1475 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1476 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1477 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1478 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1479 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1480 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1481 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1482 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1483 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1484 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1485 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1488 #define H264_CHROMA_MC(OPNAME, OP)\
1489 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1490 const int A=(8-x)*(8-y);\
1491 const int B=( x)*(8-y);\
1492 const int C=(8-x)*( y);\
1493 const int D=( x)*( y);\
1496 assert(x<8 && y<8 && x>=0 && y>=0);\
1499 for(i=0; i<h; i++){\
1500 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1501 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1507 const int step= C ? stride : 1;\
1508 for(i=0; i<h; i++){\
1509 OP(dst[0], (A*src[0] + E*src[step+0]));\
1510 OP(dst[1], (A*src[1] + E*src[step+1]));\
1517 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1518 const int A=(8-x)*(8-y);\
1519 const int B=( x)*(8-y);\
1520 const int C=(8-x)*( y);\
1521 const int D=( x)*( y);\
1524 assert(x<8 && y<8 && x>=0 && y>=0);\
1527 for(i=0; i<h; i++){\
1528 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1529 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1530 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1531 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1537 const int step= C ? stride : 1;\
1538 for(i=0; i<h; i++){\
1539 OP(dst[0], (A*src[0] + E*src[step+0]));\
1540 OP(dst[1], (A*src[1] + E*src[step+1]));\
1541 OP(dst[2], (A*src[2] + E*src[step+2]));\
1542 OP(dst[3], (A*src[3] + E*src[step+3]));\
1549 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1550 const int A=(8-x)*(8-y);\
1551 const int B=( x)*(8-y);\
1552 const int C=(8-x)*( y);\
1553 const int D=( x)*( y);\
1556 assert(x<8 && y<8 && x>=0 && y>=0);\
1559 for(i=0; i<h; i++){\
1560 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1561 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1562 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1563 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1564 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1565 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1566 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1567 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1573 const int step= C ? stride : 1;\
1574 for(i=0; i<h; i++){\
1575 OP(dst[0], (A*src[0] + E*src[step+0]));\
1576 OP(dst[1], (A*src[1] + E*src[step+1]));\
1577 OP(dst[2], (A*src[2] + E*src[step+2]));\
1578 OP(dst[3], (A*src[3] + E*src[step+3]));\
1579 OP(dst[4], (A*src[4] + E*src[step+4]));\
1580 OP(dst[5], (A*src[5] + E*src[step+5]));\
1581 OP(dst[6], (A*src[6] + E*src[step+6]));\
1582 OP(dst[7], (A*src[7] + E*src[step+7]));\
1589 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1590 #define op_put(a, b) a = (((b) + 32)>>6)
1592 H264_CHROMA_MC(put_ , op_put)
1593 H264_CHROMA_MC(avg_ , op_avg)
1597 static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1598 const int A=(8-x)*(8-y);
1599 const int B=( x)*(8-y);
1600 const int C=(8-x)*( y);
1601 const int D=( x)*( y);
1604 assert(x<8 && y<8 && x>=0 && y>=0);
1608 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1609 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1610 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1611 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1612 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1613 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1614 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1615 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
1621 static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1622 const int A=(8-x)*(8-y);
1623 const int B=( x)*(8-y);
1624 const int C=(8-x)*( y);
1625 const int D=( x)*( y);
1628 assert(x<8 && y<8 && x>=0 && y>=0);
1632 dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
1633 dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
1634 dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
1635 dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
1636 dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
1637 dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
1638 dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
1639 dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
1645 #define QPEL_MC(r, OPNAME, RND, OP) \
1646 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1647 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1651 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1652 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1653 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1654 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1655 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1656 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1657 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1658 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1664 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1666 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1670 const int src0= src[0*srcStride];\
1671 const int src1= src[1*srcStride];\
1672 const int src2= src[2*srcStride];\
1673 const int src3= src[3*srcStride];\
1674 const int src4= src[4*srcStride];\
1675 const int src5= src[5*srcStride];\
1676 const int src6= src[6*srcStride];\
1677 const int src7= src[7*srcStride];\
1678 const int src8= src[8*srcStride];\
1679 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1680 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1681 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1682 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1683 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1684 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1685 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1686 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1692 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1693 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1698 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1699 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1700 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1701 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1702 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1703 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1704 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1705 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1706 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1707 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1708 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1709 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1710 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1711 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1712 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1713 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1719 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1720 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1725 const int src0= src[0*srcStride];\
1726 const int src1= src[1*srcStride];\
1727 const int src2= src[2*srcStride];\
1728 const int src3= src[3*srcStride];\
1729 const int src4= src[4*srcStride];\
1730 const int src5= src[5*srcStride];\
1731 const int src6= src[6*srcStride];\
1732 const int src7= src[7*srcStride];\
1733 const int src8= src[8*srcStride];\
1734 const int src9= src[9*srcStride];\
1735 const int src10= src[10*srcStride];\
1736 const int src11= src[11*srcStride];\
1737 const int src12= src[12*srcStride];\
1738 const int src13= src[13*srcStride];\
1739 const int src14= src[14*srcStride];\
1740 const int src15= src[15*srcStride];\
1741 const int src16= src[16*srcStride];\
1742 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1743 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1744 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1745 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1746 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1747 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1748 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1749 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1750 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1751 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1752 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1753 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1754 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1755 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1756 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1757 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1763 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1765 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1766 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1769 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1770 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1773 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1775 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1776 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1779 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1780 uint8_t full[16*9];\
1782 copy_block9(full, src, 16, stride, 9);\
1783 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1784 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1787 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1788 uint8_t full[16*9];\
1789 copy_block9(full, src, 16, stride, 9);\
1790 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1793 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1794 uint8_t full[16*9];\
1796 copy_block9(full, src, 16, stride, 9);\
1797 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1798 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1800 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1801 uint8_t full[16*9];\
1804 uint8_t halfHV[64];\
1805 copy_block9(full, src, 16, stride, 9);\
1806 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1807 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1808 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1809 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1811 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1812 uint8_t full[16*9];\
1814 uint8_t halfHV[64];\
1815 copy_block9(full, src, 16, stride, 9);\
1816 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1817 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1818 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1819 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1821 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1822 uint8_t full[16*9];\
1825 uint8_t halfHV[64];\
1826 copy_block9(full, src, 16, stride, 9);\
1827 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1828 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1829 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1830 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1832 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1833 uint8_t full[16*9];\
1835 uint8_t halfHV[64];\
1836 copy_block9(full, src, 16, stride, 9);\
1837 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1838 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1839 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1840 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1842 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1843 uint8_t full[16*9];\
1846 uint8_t halfHV[64];\
1847 copy_block9(full, src, 16, stride, 9);\
1848 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1849 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1850 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1851 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1853 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1854 uint8_t full[16*9];\
1856 uint8_t halfHV[64];\
1857 copy_block9(full, src, 16, stride, 9);\
1858 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1859 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1860 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1861 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1863 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1864 uint8_t full[16*9];\
1867 uint8_t halfHV[64];\
1868 copy_block9(full, src, 16, stride, 9);\
1869 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1870 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1871 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1872 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1874 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1875 uint8_t full[16*9];\
1877 uint8_t halfHV[64];\
1878 copy_block9(full, src, 16, stride, 9);\
1879 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1880 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1881 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1882 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1884 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1886 uint8_t halfHV[64];\
1887 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1888 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1889 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1891 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1893 uint8_t halfHV[64];\
1894 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1895 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1896 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1898 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1899 uint8_t full[16*9];\
1902 uint8_t halfHV[64];\
1903 copy_block9(full, src, 16, stride, 9);\
1904 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1905 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1906 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1907 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1909 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1910 uint8_t full[16*9];\
1912 copy_block9(full, src, 16, stride, 9);\
1913 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1914 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1915 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1917 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1918 uint8_t full[16*9];\
1921 uint8_t halfHV[64];\
1922 copy_block9(full, src, 16, stride, 9);\
1923 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1924 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1925 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1926 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1928 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1929 uint8_t full[16*9];\
1931 copy_block9(full, src, 16, stride, 9);\
1932 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1933 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1934 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1936 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1938 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1939 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1942 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1944 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1945 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1948 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1949 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1952 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1954 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1955 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1958 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1959 uint8_t full[24*17];\
1961 copy_block17(full, src, 24, stride, 17);\
1962 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1963 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1966 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1967 uint8_t full[24*17];\
1968 copy_block17(full, src, 24, stride, 17);\
1969 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1972 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1973 uint8_t full[24*17];\
1975 copy_block17(full, src, 24, stride, 17);\
1976 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1977 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1979 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1980 uint8_t full[24*17];\
1981 uint8_t halfH[272];\
1982 uint8_t halfV[256];\
1983 uint8_t halfHV[256];\
1984 copy_block17(full, src, 24, stride, 17);\
1985 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1986 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1987 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1988 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1990 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1991 uint8_t full[24*17];\
1992 uint8_t halfH[272];\
1993 uint8_t halfHV[256];\
1994 copy_block17(full, src, 24, stride, 17);\
1995 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1996 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1997 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1998 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2000 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2001 uint8_t full[24*17];\
2002 uint8_t halfH[272];\
2003 uint8_t halfV[256];\
2004 uint8_t halfHV[256];\
2005 copy_block17(full, src, 24, stride, 17);\
2006 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2007 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2008 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2009 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2011 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2012 uint8_t full[24*17];\
2013 uint8_t halfH[272];\
2014 uint8_t halfHV[256];\
2015 copy_block17(full, src, 24, stride, 17);\
2016 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2017 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2018 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2019 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2021 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2022 uint8_t full[24*17];\
2023 uint8_t halfH[272];\
2024 uint8_t halfV[256];\
2025 uint8_t halfHV[256];\
2026 copy_block17(full, src, 24, stride, 17);\
2027 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2028 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2029 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2030 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2032 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2033 uint8_t full[24*17];\
2034 uint8_t halfH[272];\
2035 uint8_t halfHV[256];\
2036 copy_block17(full, src, 24, stride, 17);\
2037 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2038 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2039 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2040 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2042 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2043 uint8_t full[24*17];\
2044 uint8_t halfH[272];\
2045 uint8_t halfV[256];\
2046 uint8_t halfHV[256];\
2047 copy_block17(full, src, 24, stride, 17);\
2048 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2049 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2050 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2051 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2053 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2054 uint8_t full[24*17];\
2055 uint8_t halfH[272];\
2056 uint8_t halfHV[256];\
2057 copy_block17(full, src, 24, stride, 17);\
2058 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2059 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2060 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2061 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2063 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2064 uint8_t halfH[272];\
2065 uint8_t halfHV[256];\
2066 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2067 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2068 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2070 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2071 uint8_t halfH[272];\
2072 uint8_t halfHV[256];\
2073 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2074 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2075 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2077 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2078 uint8_t full[24*17];\
2079 uint8_t halfH[272];\
2080 uint8_t halfV[256];\
2081 uint8_t halfHV[256];\
2082 copy_block17(full, src, 24, stride, 17);\
2083 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2084 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2085 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2086 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2088 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2089 uint8_t full[24*17];\
2090 uint8_t halfH[272];\
2091 copy_block17(full, src, 24, stride, 17);\
2092 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2093 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2094 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2096 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2097 uint8_t full[24*17];\
2098 uint8_t halfH[272];\
2099 uint8_t halfV[256];\
2100 uint8_t halfHV[256];\
2101 copy_block17(full, src, 24, stride, 17);\
2102 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2103 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2104 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2105 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2107 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2108 uint8_t full[24*17];\
2109 uint8_t halfH[272];\
2110 copy_block17(full, src, 24, stride, 17);\
2111 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2112 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2113 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2115 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2116 uint8_t halfH[272];\
2117 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2118 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
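/* Rounding/store primitives plugged into QPEL_MC above: the MPEG-4
 * 8-tap lowpass filters have a gain of 32, so (b + 16) >> 5 rescales
 * with rounding and the cm[] crop table clamps to 0..255. The no_rnd
 * variants bias by 15 instead of 16, rounding half values down as the
 * no-rounding prediction mode requires. */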
2121 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2122 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2123 #define op_put(a, b) a = cm[((b) + 16)>>5]
2124 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2126 QPEL_MC(0, put_ , _ , op_put)
2127 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2128 QPEL_MC(0, avg_ , _ , op_avg)
2129 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2131 #undef op_avg_no_rnd
2133 #undef op_put_no_rnd
2135 #define put_qpel8_mc00_c ff_put_pixels8x8_c
2136 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
2137 #define put_qpel16_mc00_c ff_put_pixels16x16_c
2138 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
2139 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
2140 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_c
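/* H.264 half-pel interpolation: every lowpass below is the standard
 * 6-tap (1, -5, 20, 20, -5, 1) filter, e.g. horizontally
 *   dst[x] = OP((src[x]+src[x+1])*20 - (src[x-1]+src[x+2])*5 + (src[x-2]+src[x+3]))
 * where OP rescales by (v + 16) >> 5 and clips. The hv variants filter
 * horizontally into a 16-bit tmp[] first and then vertically, so the
 * combined gain is 32*32 = 1024 and OP2 uses (v + 512) >> 10. */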
2143 #define H264_LOWPASS(OPNAME, OP, OP2) \
2144 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2146 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2150 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2151 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2157 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2159 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2163 const int srcB= src[-2*srcStride];\
2164 const int srcA= src[-1*srcStride];\
2165 const int src0= src[0 *srcStride];\
2166 const int src1= src[1 *srcStride];\
2167 const int src2= src[2 *srcStride];\
2168 const int src3= src[3 *srcStride];\
2169 const int src4= src[4 *srcStride];\
2170 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2171 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2177 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2180 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2182 src -= 2*srcStride;\
2183 for(i=0; i<h+5; i++)\
2185 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2186 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2190 tmp -= tmpStride*(h+5-2);\
2193 const int tmpB= tmp[-2*tmpStride];\
2194 const int tmpA= tmp[-1*tmpStride];\
2195 const int tmp0= tmp[0 *tmpStride];\
2196 const int tmp1= tmp[1 *tmpStride];\
2197 const int tmp2= tmp[2 *tmpStride];\
2198 const int tmp3= tmp[3 *tmpStride];\
2199 const int tmp4= tmp[4 *tmpStride];\
2200 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2201 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2206 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2208 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2212 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2213 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2214 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2215 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2221 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2223 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2227 const int srcB= src[-2*srcStride];\
2228 const int srcA= src[-1*srcStride];\
2229 const int src0= src[0 *srcStride];\
2230 const int src1= src[1 *srcStride];\
2231 const int src2= src[2 *srcStride];\
2232 const int src3= src[3 *srcStride];\
2233 const int src4= src[4 *srcStride];\
2234 const int src5= src[5 *srcStride];\
2235 const int src6= src[6 *srcStride];\
2236 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2237 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2238 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2239 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2245 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2248 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2250 src -= 2*srcStride;\
2251 for(i=0; i<h+5; i++)\
2253 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2254 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2255 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2256 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2260 tmp -= tmpStride*(h+5-2);\
2263 const int tmpB= tmp[-2*tmpStride];\
2264 const int tmpA= tmp[-1*tmpStride];\
2265 const int tmp0= tmp[0 *tmpStride];\
2266 const int tmp1= tmp[1 *tmpStride];\
2267 const int tmp2= tmp[2 *tmpStride];\
2268 const int tmp3= tmp[3 *tmpStride];\
2269 const int tmp4= tmp[4 *tmpStride];\
2270 const int tmp5= tmp[5 *tmpStride];\
2271 const int tmp6= tmp[6 *tmpStride];\
2272 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2273 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2274 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2275 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2281 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2283 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2287 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2288 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2289 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2290 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2291 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2292 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2293 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2294 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2300 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2302 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2306 const int srcB= src[-2*srcStride];\
2307 const int srcA= src[-1*srcStride];\
2308 const int src0= src[0 *srcStride];\
2309 const int src1= src[1 *srcStride];\
2310 const int src2= src[2 *srcStride];\
2311 const int src3= src[3 *srcStride];\
2312 const int src4= src[4 *srcStride];\
2313 const int src5= src[5 *srcStride];\
2314 const int src6= src[6 *srcStride];\
2315 const int src7= src[7 *srcStride];\
2316 const int src8= src[8 *srcStride];\
2317 const int src9= src[9 *srcStride];\
2318 const int src10=src[10*srcStride];\
2319 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2320 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2321 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2322 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2323 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2324 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2325 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2326 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2332 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2335 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2337 src -= 2*srcStride;\
2338 for(i=0; i<h+5; i++)\
2340 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2341 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2342 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2343 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2344 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2345 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2346 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2347 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2351 tmp -= tmpStride*(h+5-2);\
2354 const int tmpB= tmp[-2*tmpStride];\
2355 const int tmpA= tmp[-1*tmpStride];\
2356 const int tmp0= tmp[0 *tmpStride];\
2357 const int tmp1= tmp[1 *tmpStride];\
2358 const int tmp2= tmp[2 *tmpStride];\
2359 const int tmp3= tmp[3 *tmpStride];\
2360 const int tmp4= tmp[4 *tmpStride];\
2361 const int tmp5= tmp[5 *tmpStride];\
2362 const int tmp6= tmp[6 *tmpStride];\
2363 const int tmp7= tmp[7 *tmpStride];\
2364 const int tmp8= tmp[8 *tmpStride];\
2365 const int tmp9= tmp[9 *tmpStride];\
2366 const int tmp10=tmp[10*tmpStride];\
2367 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2368 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2369 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2370 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2371 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2372 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2373 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2374 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2380 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2381 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2382 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2383 src += 8*srcStride;\
2384 dst += 8*dstStride;\
2385 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2386 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2389 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2390 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2391 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2392 src += 8*srcStride;\
2393 dst += 8*dstStride;\
2394 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2395 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2398 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2399 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2400 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2401 src += 8*srcStride;\
2402 dst += 8*dstStride;\
2403 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2404 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
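/* H.264 quarter-pel motion compensation: in mcXY, X and Y are the
 * quarter-pel fractions (0..3) in x and y. Half-pel planes come straight
 * from the lowpass filters; quarter-pel positions are the rounded
 * average of two neighbouring full/half-pel planes (pixels##SIZE##_l2),
 * with the diagonal cases averaging a horizontal against a vertical
 * half-pel plane. */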
2407 #define H264_MC(OPNAME, SIZE) \
2408 static av_unused void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2409 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2412 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2413 uint8_t half[SIZE*SIZE];\
2414 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2415 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2418 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2419 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2422 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2423 uint8_t half[SIZE*SIZE];\
2424 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2425 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2428 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2429 uint8_t full[SIZE*(SIZE+5)];\
2430 uint8_t * const full_mid= full + SIZE*2;\
2431 uint8_t half[SIZE*SIZE];\
2432 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2433 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2434 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2437 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2438 uint8_t full[SIZE*(SIZE+5)];\
2439 uint8_t * const full_mid= full + SIZE*2;\
2440 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2441 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2444 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2445 uint8_t full[SIZE*(SIZE+5)];\
2446 uint8_t * const full_mid= full + SIZE*2;\
2447 uint8_t half[SIZE*SIZE];\
2448 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2449 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2450 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2453 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2454 uint8_t full[SIZE*(SIZE+5)];\
2455 uint8_t * const full_mid= full + SIZE*2;\
2456 uint8_t halfH[SIZE*SIZE];\
2457 uint8_t halfV[SIZE*SIZE];\
2458 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2459 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2460 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2461 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2464 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2465 uint8_t full[SIZE*(SIZE+5)];\
2466 uint8_t * const full_mid= full + SIZE*2;\
2467 uint8_t halfH[SIZE*SIZE];\
2468 uint8_t halfV[SIZE*SIZE];\
2469 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2470 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2471 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2472 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2475 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2476 uint8_t full[SIZE*(SIZE+5)];\
2477 uint8_t * const full_mid= full + SIZE*2;\
2478 uint8_t halfH[SIZE*SIZE];\
2479 uint8_t halfV[SIZE*SIZE];\
2480 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2481 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2482 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2483 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2486 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2487 uint8_t full[SIZE*(SIZE+5)];\
2488 uint8_t * const full_mid= full + SIZE*2;\
2489 uint8_t halfH[SIZE*SIZE];\
2490 uint8_t halfV[SIZE*SIZE];\
2491 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2492 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2493 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2494 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2497 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2498 int16_t tmp[SIZE*(SIZE+5)];\
2499 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2502 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2503 int16_t tmp[SIZE*(SIZE+5)];\
2504 uint8_t halfH[SIZE*SIZE];\
2505 uint8_t halfHV[SIZE*SIZE];\
2506 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2507 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2508 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2511 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2512 int16_t tmp[SIZE*(SIZE+5)];\
2513 uint8_t halfH[SIZE*SIZE];\
2514 uint8_t halfHV[SIZE*SIZE];\
2515 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2516 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2517 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2520 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2521 uint8_t full[SIZE*(SIZE+5)];\
2522 uint8_t * const full_mid= full + SIZE*2;\
2523 int16_t tmp[SIZE*(SIZE+5)];\
2524 uint8_t halfV[SIZE*SIZE];\
2525 uint8_t halfHV[SIZE*SIZE];\
2526 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2527 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2528 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2529 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2532 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2533 uint8_t full[SIZE*(SIZE+5)];\
2534 uint8_t * const full_mid= full + SIZE*2;\
2535 int16_t tmp[SIZE*(SIZE+5)];\
2536 uint8_t halfV[SIZE*SIZE];\
2537 uint8_t halfHV[SIZE*SIZE];\
2538 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2539 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2540 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2541 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
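/* Store/average primitives for the H.264 code above: op_avg/op_put
 * rescale a single 6-tap pass (gain 32), op2_avg/op2_put a chained
 * horizontal+vertical pass (gain 1024). */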
2544 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2545 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2546 #define op_put(a, b) a = cm[((b) + 16)>>5]
2547 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2548 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2550 H264_LOWPASS(put_ , op_put, op2_put)
2551 H264_LOWPASS(avg_ , op_avg, op2_avg)
2566 #define put_h264_qpel8_mc00_c ff_put_pixels8x8_c
2567 #define avg_h264_qpel8_mc00_c ff_avg_pixels8x8_c
2568 #define put_h264_qpel16_mc00_c ff_put_pixels16x16_c
2569 #define avg_h264_qpel16_mc00_c ff_avg_pixels16x16_c
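/* WMV2 half-pel interpolation: a 4-tap (-1, 9, 9, -1)/16 kernel over
 * four neighbours, i.e. dst = clip((9*(b+c) - (a+d) + 8) >> 4). */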
2571 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2572 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2576 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2577 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2578 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2579 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2580 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2581 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2582 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2583 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2589 void ff_put_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2590 put_pixels8_c(dst, src, stride, 8);
2592 void ff_avg_pixels8x8_c(uint8_t *dst, uint8_t *src, int stride) {
2593 avg_pixels8_c(dst, src, stride, 8);
2595 void ff_put_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2596 put_pixels16_c(dst, src, stride, 16);
2598 void ff_avg_pixels16x16_c(uint8_t *dst, uint8_t *src, int stride) {
2599 avg_pixels16_c(dst, src, stride, 16);
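/* RV40 defines the (3,3) sub-pel position as a plain 2x2 pixel average
 * rather than an output of its interpolation filter, so these wrappers
 * simply reuse the xy2 averaging helpers. */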
2602 #if CONFIG_RV40_DECODER
2603 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2604 put_pixels16_xy2_c(dst, src, stride, 16);
2606 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2607 avg_pixels16_xy2_c(dst, src, stride, 16);
2609 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2610 put_pixels8_xy2_c(dst, src, stride, 8);
2612 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2613 avg_pixels8_xy2_c(dst, src, stride, 8);
2615 #endif /* CONFIG_RV40_DECODER */
2617 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2618 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2622 const int src_1= src[ -srcStride];
2623 const int src0 = src[0 ];
2624 const int src1 = src[ srcStride];
2625 const int src2 = src[2*srcStride];
2626 const int src3 = src[3*srcStride];
2627 const int src4 = src[4*srcStride];
2628 const int src5 = src[5*srcStride];
2629 const int src6 = src[6*srcStride];
2630 const int src7 = src[7*srcStride];
2631 const int src8 = src[8*srcStride];
2632 const int src9 = src[9*srcStride];
2633 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2634 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2635 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2636 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2637 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2638 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2639 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2640 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2646 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2648 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2649 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2652 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2653 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2656 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2658 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2659 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2662 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2663 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2666 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2670 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2671 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2672 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2673 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2675 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2679 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2680 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2681 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2682 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2684 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2686 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2687 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
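/* H.263 (Annex J) deblocking: for the four pixels p0..p3 across a block
 * edge, d = (p0 - p3 + 4*(p2 - p1)) / 8 is mapped through a
 * qscale-dependent ramp to d1 (small steps are filtered, large ones are
 * kept as presumed real edges); p1/p2 are moved by d1 and p0/p3 by
 * d2 = clip((p0-p3)/4, -|d1|/2, |d1|/2). The _v variant filters
 * horizontal edges, the _h variant vertical ones. */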
2690 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2691 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2693 const int strength= ff_h263_loop_filter_strength[qscale];
2697 int p0= src[x-2*stride];
2698 int p1= src[x-1*stride];
2699 int p2= src[x+0*stride];
2700 int p3= src[x+1*stride];
2701 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2703 if (d<-2*strength) d1= 0;
2704 else if(d<- strength) d1=-2*strength - d;
2705 else if(d< strength) d1= d;
2706 else if(d< 2*strength) d1= 2*strength - d;
else d1= 0; /* final branch of the ramp; without it d1 is left uninitialized for large d */
2711 if(p1&256) p1= ~(p1>>31);
2712 if(p2&256) p2= ~(p2>>31);
2714 src[x-1*stride] = p1;
2715 src[x+0*stride] = p2;
2719 d2= av_clip((p0-p3)/4, -ad1, ad1);
2721 src[x-2*stride] = p0 - d2;
2722 src[x+ stride] = p3 + d2;
2727 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2728 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2730 const int strength= ff_h263_loop_filter_strength[qscale];
2734 int p0= src[y*stride-2];
2735 int p1= src[y*stride-1];
2736 int p2= src[y*stride+0];
2737 int p3= src[y*stride+1];
2738 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2740 if (d<-2*strength) d1= 0;
2741 else if(d<- strength) d1=-2*strength - d;
2742 else if(d< strength) d1= d;
2743 else if(d< 2*strength) d1= 2*strength - d;
else d1= 0; /* final branch of the ramp; without it d1 is left uninitialized for large d */
2748 if(p1&256) p1= ~(p1>>31);
2749 if(p2&256) p2= ~(p2>>31);
2751 src[y*stride-1] = p1;
2752 src[y*stride+0] = p2;
2756 d2= av_clip((p0-p3)/4, -ad1, ad1);
2758 src[y*stride-2] = p0 - d2;
2759 src[y*stride+1] = p3 + d2;
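/* H.261 in-loop filter: a separable [1 2 1]/4 smoothing of the 8x8
 * block; the border rows/columns are passed through unsmoothed (the
 * 4*src and (x + 2) >> 2 terms only keep their scale), so just the
 * interior is filtered. */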
2764 static void h261_loop_filter_c(uint8_t *src, int stride){
2769 temp[x ] = 4*src[x ];
2770 temp[x + 7*8] = 4*src[x + 7*stride];
2774 xy = y * stride + x;
2776 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2781 src[ y*stride] = (temp[ y*8] + 2)>>2;
2782 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2784 xy = y * stride + x;
2786 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
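/* SAD metrics for motion estimation. pix_abs{16,8}_c compare two blocks
 * directly; the _x2, _y2 and _xy2 variants compare pix1 against pix2
 * shifted by half a pel (bilinear average of 2 or 4 neighbours), letting
 * the motion search score half-pel candidates without a separate
 * interpolation buffer. */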
2791 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2797 s += abs(pix1[0] - pix2[0]);
2798 s += abs(pix1[1] - pix2[1]);
2799 s += abs(pix1[2] - pix2[2]);
2800 s += abs(pix1[3] - pix2[3]);
2801 s += abs(pix1[4] - pix2[4]);
2802 s += abs(pix1[5] - pix2[5]);
2803 s += abs(pix1[6] - pix2[6]);
2804 s += abs(pix1[7] - pix2[7]);
2805 s += abs(pix1[8] - pix2[8]);
2806 s += abs(pix1[9] - pix2[9]);
2807 s += abs(pix1[10] - pix2[10]);
2808 s += abs(pix1[11] - pix2[11]);
2809 s += abs(pix1[12] - pix2[12]);
2810 s += abs(pix1[13] - pix2[13]);
2811 s += abs(pix1[14] - pix2[14]);
2812 s += abs(pix1[15] - pix2[15]);
2819 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2825 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2826 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2827 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2828 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2829 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2830 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2831 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2832 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2833 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2834 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2835 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2836 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2837 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2838 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2839 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2840 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2847 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2850 uint8_t *pix3 = pix2 + line_size;
2854 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2855 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2856 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2857 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2858 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2859 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2860 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2861 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2862 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2863 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2864 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2865 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2866 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2867 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2868 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2869 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2877 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2880 uint8_t *pix3 = pix2 + line_size;
2884 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2885 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2886 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2887 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2888 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2889 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2890 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2891 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2892 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2893 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2894 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2895 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2896 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2897 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2898 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2899 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2907 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2913 s += abs(pix1[0] - pix2[0]);
2914 s += abs(pix1[1] - pix2[1]);
2915 s += abs(pix1[2] - pix2[2]);
2916 s += abs(pix1[3] - pix2[3]);
2917 s += abs(pix1[4] - pix2[4]);
2918 s += abs(pix1[5] - pix2[5]);
2919 s += abs(pix1[6] - pix2[6]);
2920 s += abs(pix1[7] - pix2[7]);
2927 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2933 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2934 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2935 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2936 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2937 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2938 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2939 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2940 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2947 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2950 uint8_t *pix3 = pix2 + line_size;
2954 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2955 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2956 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2957 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2958 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2959 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2960 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2961 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2969 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2972 uint8_t *pix3 = pix2 + line_size;
2976 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2977 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2978 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2979 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2980 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2981 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2982 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2983 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
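/* Noise-preserving SSE: score1 is the plain squared error, score2
 * compares the 2x2 gradient structure of the two blocks, so the encoder
 * can trade a little raw error for keeping texture/noise energy. The
 * weight comes from avctx->nsse_weight (8 when no context is given). */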
2991 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
2992 MpegEncContext *c = v;
2998 for(x=0; x<16; x++){
2999 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3002 for(x=0; x<15; x++){
3003 score2+= FFABS( s1[x ] - s1[x +stride]
3004 - s1[x+1] + s1[x+1+stride])
3005 -FFABS( s2[x ] - s2[x +stride]
3006 - s2[x+1] + s2[x+1+stride]);
3013 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3014 else return score1 + FFABS(score2)*8;
3017 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3018 MpegEncContext *c = v;
3025 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3029 score2+= FFABS( s1[x ] - s1[x +stride]
3030 - s1[x+1] + s1[x+1+stride])
3031 -FFABS( s2[x ] - s2[x +stride]
3032 - s2[x+1] + s2[x+1+stride]);
3039 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3040 else return score1 + FFABS(score2)*8;
3043 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3047 for(i=0; i<8*8; i++){
3048 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
int w= weight[i]; /* per-coefficient weight from the caller's weight[64] */
b >>= RECON_SHIFT; /* bring b back to pixel-domain precision before the range check */
3051 assert(-512<b && b<512);
3053 sum += (w*b)*(w*b)>>4;
3058 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3061 for(i=0; i<8*8; i++){
3062 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3067 * Permutes an 8x8 block.
3068 * @param block the block which will be permuted according to the given permutation vector
3069 * @param permutation the permutation vector
3070 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
3071 * @param scantable the used scantable; it is only used to speed the permutation up, the block is not
3072 * (inverse) permuted into scantable order!
3074 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3080 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3082 for(i=0; i<=last; i++){
3083 const int j= scantable[i];
3088 for(i=0; i<=last; i++){
3089 const int j= scantable[i];
3090 const int perm_j= permutation[j];
3091 block[perm_j]= temp[j];
3095 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
return 0; /* dummy comparison that ranks every candidate equally */
}
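/* Translate an FF_CMP_* comparison type into the matching me_cmp_func
 * entries of this DSPContext, one per block size. */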
3099 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3102 memset(cmp, 0, sizeof(void*)*6);
3110 cmp[i]= c->hadamard8_diff[i];
3116 cmp[i]= c->dct_sad[i];
3119 cmp[i]= c->dct264_sad[i];
3122 cmp[i]= c->dct_max[i];
3125 cmp[i]= c->quant_psnr[i];
3154 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3159 static void clear_block_c(DCTELEM *block)
3161 memset(block, 0, sizeof(DCTELEM)*64);
3165 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3167 static void clear_blocks_c(DCTELEM *blocks)
3169 memset(blocks, 0, sizeof(DCTELEM)*6*64);
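/* SWAR byte arithmetic: pb_7f/pb_80 mask all-but-MSB/MSB of every byte
 * in a native long, so sizeof(long) bytes are added (or subtracted) per
 * iteration. The low 7 bits are combined directly and the top bit is
 * fixed up with XOR, so no carry can leak into the neighbouring byte. */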
3172 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3174 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3175 long a = *(long*)(src+i);
3176 long b = *(long*)(dst+i);
3177 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3180 dst[i+0] += src[i+0];
3183 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3185 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3186 long a = *(long*)(src1+i);
3187 long b = *(long*)(src2+i);
3188 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3191 dst[i] = src1[i]+src2[i];
3194 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3196 #if !HAVE_FAST_UNALIGNED
3197 if((long)src2 & (sizeof(long)-1)){
3198 for(i=0; i+7<w; i+=8){
3199 dst[i+0] = src1[i+0]-src2[i+0];
3200 dst[i+1] = src1[i+1]-src2[i+1];
3201 dst[i+2] = src1[i+2]-src2[i+2];
3202 dst[i+3] = src1[i+3]-src2[i+3];
3203 dst[i+4] = src1[i+4]-src2[i+4];
3204 dst[i+5] = src1[i+5]-src2[i+5];
3205 dst[i+6] = src1[i+6]-src2[i+6];
3206 dst[i+7] = src1[i+7]-src2[i+7];
3210 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3211 long a = *(long*)(src1+i);
3212 long b = *(long*)(src2+i);
3213 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3216 dst[i+0] = src1[i+0]-src2[i+0];
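/* HuffYUV median prediction: each pixel is predicted as
 * mid_pred(left, top, left + top - topleft) and only the residual is
 * coded. add_* reconstructs from the residual, sub_* produces it, and
 * *left / *left_top carry the predictor state between calls. */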
3219 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
3227 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3236 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
3244 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3254 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
3257 for(i=0; i<w-1; i++){
3284 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
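/* 8x8 Hadamard SATD: BUTTERFLY2/BUTTERFLY1 are the (a+b, a-b) stages of
 * a fast 8-point Hadamard transform. Applying it to rows, then columns,
 * and summing the absolute coefficients gives a cheap frequency-domain
 * error measure; hadamard8_diff scores the src-dst residual while the
 * _intra variant scores the block itself minus its DC term. */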
3314 #define BUTTERFLY2(o1,o2,i1,i2) \
3318 #define BUTTERFLY1(x,y) \
3327 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
3329 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3337 //FIXME try pointer walks
3338 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3339 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3340 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3341 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3343 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3344 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3345 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3346 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3348 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3349 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3350 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3351 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3355 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3356 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3357 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3358 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3360 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3361 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3362 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3363 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3366 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3367 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3368 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3369 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3375 printf("MAX:%d\n", maxi);
3381 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3389 //FIXME try pointer walks
3390 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3391 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3392 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3393 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3395 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3396 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3397 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3398 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3400 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3401 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3402 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3403 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3407 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3408 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3409 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3410 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3412 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3413 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3414 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3415 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3418 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3419 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3420 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3421 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3424 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
3429 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3430 MpegEncContext * const s= (MpegEncContext *)c;
3431 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3435 s->dsp.diff_pixels(temp, src1, src2, stride);
s->dsp.fdct(temp); /* forward DCT of the residual; the metric sums |DCT coefficients|, not pixel SAD */
3437 return s->dsp.sum_abs_dctelem(temp);
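/* The butterfly network below (used through the SRC/DST macros) is one
 * 1-D pass of the H.264-style 8x8 integer transform: the sNM/dNM locals
 * are the even/odd sums and differences, and the >>1 / >>2 terms give
 * the integer approximation of the DCT basis. dct264_sad8x8_c runs it
 * over rows and then columns of the pixel difference and sums the
 * absolute coefficients. */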
3442 const int s07 = SRC(0) + SRC(7);\
3443 const int s16 = SRC(1) + SRC(6);\
3444 const int s25 = SRC(2) + SRC(5);\
3445 const int s34 = SRC(3) + SRC(4);\
3446 const int a0 = s07 + s34;\
3447 const int a1 = s16 + s25;\
3448 const int a2 = s07 - s34;\
3449 const int a3 = s16 - s25;\
3450 const int d07 = SRC(0) - SRC(7);\
3451 const int d16 = SRC(1) - SRC(6);\
3452 const int d25 = SRC(2) - SRC(5);\
3453 const int d34 = SRC(3) - SRC(4);\
3454 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3455 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3456 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3457 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3459 DST(1, a4 + (a7>>2)) ;\
3460 DST(2, a2 + (a3>>1)) ;\
3461 DST(3, a5 + (a6>>2)) ;\
3463 DST(5, a6 - (a5>>2)) ;\
3464 DST(6, (a2>>1) - a3 ) ;\
3465 DST(7, (a4>>2) - a7 ) ;\
3468 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3469 MpegEncContext * const s= (MpegEncContext *)c;
3474 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3476 #define SRC(x) dct[i][x]
3477 #define DST(x,v) dct[i][x]= v
3478 for( i = 0; i < 8; i++ )
3483 #define SRC(x) dct[x][i]
3484 #define DST(x,v) sum += FFABS(v)
3485 for( i = 0; i < 8; i++ )
3493 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3494 MpegEncContext * const s= (MpegEncContext *)c;
3495 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3500 s->dsp.diff_pixels(temp, src1, src2, stride);
3504 sum= FFMAX(sum, FFABS(temp[i]));
3509 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3510 MpegEncContext * const s= (MpegEncContext *)c;
3511 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3512 DCTELEM * const bak = temp+64;
3518 s->dsp.diff_pixels(temp, src1, src2, stride);
3520 memcpy(bak, temp, 64*sizeof(DCTELEM));
3522 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3523 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3524 ff_simple_idct(temp); //FIXME
3527 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
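/* Rate-distortion metrics: rd8x8_c transforms and quantizes the
 * residual, estimates the bit cost from the AC VLC length tables (with
 * esc_length for |level| > 127), reconstructs via dequant + IDCT, and
 * returns distortion plus a qscale^2-weighted bit term; bit8x8_c is the
 * same bit estimate without the distortion part. */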
3532 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3533 MpegEncContext * const s= (MpegEncContext *)c;
3534 const uint8_t *scantable= s->intra_scantable.permutated;
3535 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3536 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3537 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3538 int i, last, run, bits, level, distortion, start_i;
3539 const int esc_length= s->ac_esc_length;
uint8_t * length; /* AC VLC length table, selected below by intra/inter */
3541 uint8_t * last_length;
3545 copy_block8(lsrc1, src1, 8, stride, 8);
3546 copy_block8(lsrc2, src2, 8, stride, 8);
3548 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3550 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3556 length = s->intra_ac_vlc_length;
3557 last_length= s->intra_ac_vlc_last_length;
3558 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3561 length = s->inter_ac_vlc_length;
3562 last_length= s->inter_ac_vlc_last_length;
3567 for(i=start_i; i<last; i++){
3568 int j= scantable[i];
3573 if((level&(~127)) == 0){
3574 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3583 level= temp[i] + 64;
3587 if((level&(~127)) == 0){
3588 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3596 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3598 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3601 s->dsp.idct_add(lsrc2, 8, temp);
3603 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3605 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3608 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3609 MpegEncContext * const s= (MpegEncContext *)c;
3610 const uint8_t *scantable= s->intra_scantable.permutated;
3611 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3612 int i, last, run, bits, level, start_i;
3613 const int esc_length= s->ac_esc_length;
uint8_t * length; /* AC VLC length table, selected below by intra/inter */
3615 uint8_t * last_length;
3619 s->dsp.diff_pixels(temp, src1, src2, stride);
3621 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3627 length = s->intra_ac_vlc_length;
3628 last_length= s->intra_ac_vlc_last_length;
3629 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3632 length = s->inter_ac_vlc_length;
3633 last_length= s->inter_ac_vlc_last_length;
3638 for(i=start_i; i<last; i++){
3639 int j= scantable[i];
3644 if((level&(~127)) == 0){
3645 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3654 level= temp[i] + 64;
3658 if((level&(~127)) == 0){
3659 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
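/* Vertical activity metrics: the vsad and vsse variants accumulate the
 * absolute or squared difference between vertically adjacent lines,
 * which makes them cheap helpers for interlace (field vs. frame)
 * decisions. */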
3667 #define VSAD_INTRA(size) \
3668 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3672 for(y=1; y<h; y++){ \
3673 for(x=0; x<size; x+=4){ \
3674 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
3675 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
3685 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3690 for(x=0; x<16; x++){
3691 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3700 #define SQ(a) ((a)*(a))
3701 #define VSSE_INTRA(size) \
3702 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
3706 for(y=1; y<h; y++){ \
3707 for(x=0; x<size; x+=4){ \
3708 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
3709 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
3719 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3724 for(x=0; x<16; x++){
3725 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3734 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3738 for(i=0; i<size; i++)
3739 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
3743 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3744 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3745 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3747 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3749 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3750 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3751 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3752 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
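/* Float vector primitives, used mainly by the audio codecs: elementwise
 * multiply, reversed multiply, multiply-add, MDCT windowing, scalar
 * scaling and clipping. */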
3754 static void vector_fmul_c(float *dst, const float *src, int len){
3756 for(i=0; i<len; i++)
3760 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
int i;
src1 += len-1; /* point at the last element so src1[-i] walks src1 backwards */
3763 for(i=0; i<len; i++)
3764 dst[i] = src0[i] * src1[-i];
3767 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
3769 for(i=0; i<len; i++)
3770 dst[i] = src0[i] * src1[i] + src2[i];
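/* MDCT overlap-add windowing: with dst/win/src0 advanced to the centre,
 * each iteration combines a leading and a trailing sample pair through
 * the symmetric window and writes both halves of the output in one pass
 * (add_bias supports the float-to-int16 magic-bias path further down). */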
3773 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
int i,j;
dst += len; win += len; src0 += len; /* centre the pointers: i runs over [-len,0), j over [0,len) */
3778 for(i=-len, j=len-1; i<0; i++, j--) {
float s0 = src0[i];
float s1 = src1[j];
float wi = win[i];
float wj = win[j];
3783 dst[i] = s0*wj - s1*wi + add_bias;
3784 dst[j] = s0*wi + s1*wj + add_bias;
3788 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
3792 for (i = 0; i < len; i++)
3793 dst[i] = src[i] * mul;
3796 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
3797 const float **sv, float mul, int len)
3800 for (i = 0; i < len; i += 2, sv++) {
3801 dst[i ] = src[i ] * sv[0][0] * mul;
3802 dst[i+1] = src[i+1] * sv[0][1] * mul;
3806 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
3807 const float **sv, float mul, int len)
3810 for (i = 0; i < len; i += 4, sv++) {
3811 dst[i ] = src[i ] * sv[0][0] * mul;
3812 dst[i+1] = src[i+1] * sv[0][1] * mul;
3813 dst[i+2] = src[i+2] * sv[0][2] * mul;
3814 dst[i+3] = src[i+3] * sv[0][3] * mul;
3818 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
3822 for (i = 0; i < len; i += 2, sv++) {
3823 dst[i ] = sv[0][0] * mul;
3824 dst[i+1] = sv[0][1] * mul;
3828 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
3832 for (i = 0; i < len; i += 4, sv++) {
3833 dst[i ] = sv[0][0] * mul;
3834 dst[i+1] = sv[0][1] * mul;
3835 dst[i+2] = sv[0][2] * mul;
3836 dst[i+3] = sv[0][3] * mul;
3840 static void butterflies_float_c(float *restrict v1, float *restrict v2,
3844 for (i = 0; i < len; i++) {
3845 float t = v1[i] - v2[i];
3851 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
3856 for (i = 0; i < len; i++)
3862 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
3864 for(i=0; i<len; i++)
3865 dst[i] = src[i] * mul;
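/* Branchless float clipping: for min < 0 < max, the IEEE-754 bit
 * patterns of non-negative floats compare like unsigned integers (and
 * negative ones in reverse), so clipf_c_one can clamp with two integer
 * comparisons; maxisign is max's pattern with the sign bit flipped. */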
3868 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
3869 uint32_t maxi, uint32_t maxisign)
3872 if(a > mini) return mini;
3873 else if((a^(1<<31)) > maxisign) return maxi;
3877 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
3879 uint32_t mini = *(uint32_t*)min;
3880 uint32_t maxi = *(uint32_t*)max;
3881 uint32_t maxisign = maxi ^ (1<<31);
3882 uint32_t *dsti = (uint32_t*)dst;
3883 const uint32_t *srci = (const uint32_t*)src;
3884 for(i=0; i<len; i+=8) {
3885 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
3886 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
3887 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
3888 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
3889 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
3890 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
3891 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
3892 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
3895 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
3897 if(min < 0 && max > 0) {
3898 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
3900 for(i=0; i < len; i+=8) {
3901 dst[i ] = av_clipf(src[i ], min, max);
3902 dst[i + 1] = av_clipf(src[i + 1], min, max);
3903 dst[i + 2] = av_clipf(src[i + 2], min, max);
3904 dst[i + 3] = av_clipf(src[i + 3], min, max);
3905 dst[i + 4] = av_clipf(src[i + 4], min, max);
3906 dst[i + 5] = av_clipf(src[i + 5], min, max);
3907 dst[i + 6] = av_clipf(src[i + 6], min, max);
3908 dst[i + 7] = av_clipf(src[i + 7], min, max);
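/* Bit-trick float -> int16: callers are expected to pre-bias samples
 * (roughly, signal scaled to +/-1.0 plus a magic offset near 385.0) so
 * that the 16-bit value, offset by 0x8000, sits in the low mantissa
 * bits; values outside that range are clamped, and the final store to
 * int16_t keeps only the low 16 bits of tmp - 0x8000. */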
3913 static av_always_inline int float_to_int16_one(const float *src){
3914 int_fast32_t tmp = *(const int32_t*)src;
if(tmp & 0xf0000){ /* bits outside the biased 0x43c0xxxx range: clamp to -32768/32767 */
3916 tmp = (0x43c0ffff - tmp)>>31;
3917 // is this faster on some gcc/cpu combinations?
3918 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
}
3921 return tmp - 0x8000;
3924 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
3926 for(i=0; i<len; i++)
3927 dst[i] = float_to_int16_one(src+i);
3930 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
3933 for(i=0; i<len; i++){
3934 dst[2*i] = float_to_int16_one(src[0]+i);
3935 dst[2*i+1] = float_to_int16_one(src[1]+i);
3938 for(c=0; c<channels; c++)
3939 for(i=0, j=c; i<len; i++, j+=channels)
3940 dst[j] = float_to_int16_one(src[c]+i);
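/* Fixed-point dot products: scalarproduct_int16 accumulates
 * (v1[i]*v2[i]) >> shift; the _and_madd variant also updates
 * v1[i] += mul*v3[i] in the same pass. */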

static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}

static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;
    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }
    return res;
}
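
/* 8x8 inverse DCT used by WMV2.  Wn = 2048*sqrt(2)*cos(n*pi/16) rounded
 * (11 fractional bits); the factor 181/256 below approximates 1/sqrt(2). */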
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1*/
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;
    /*step 1, with extended precision*/
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
    /*step 2*/
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;
    /*step 3*/
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
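/* Full 2-D transform: an 8-point pass over each row, then over each column. */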
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}

/* XXX: these functions should be removed as soon as all IDCTs are
   converted */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
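
/* Intentional no-op; installed as the default c->prefetch below so callers
 * may invoke it unconditionally. */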
static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }

/* init static data */
av_cold void dsputil_static_init(void)
{
    int i;

    for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
    for(i=0;i<MAX_NEG_CROP;i++) {
        ff_cropTbl[i] = 0;
        ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
    }

    for(i=0;i<512;i++) {
        ff_squareTbl[i] = (i - 256) * (i - 256);
    }

    for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
}

int ff_check_alignment(void){
    static int did_fail=0;
    DECLARE_ALIGNED(16, int, aligned);

    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#if CONFIG_ENCODERS
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS
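
    /* IDCT selection: lowres decoding installs reduced-size IDCTs, otherwise
     * the choice follows avctx->idct_algo with ff_simple_idct as the accurate
     * default.  idct_permutation_type records the coefficient order the
     * chosen IDCT expects; the permutation itself is built in the switch at
     * the end of this function. */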
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
            c->idct     = ff_bink_idct_c;
            c->idct_add = ff_bink_idct_add_c;
            c->idct_put = ff_bink_idct_put_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->clear_block = clear_block_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->scale_block = scale_block_c;

    /* pix_abs[0] covers 16x16 blocks, pix_abs[1] 8x8 */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;    \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
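
    /* Half-pel MC tables: IDX is the block size (0=16, 1=8, 2=4, 3=2); the
     * last index is the interpolation: 0 = copy, 1 = horizontal half-pel,
     * 2 = vertical, 3 = both. */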
    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg_no_rnd, 2, 4);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
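
    /* Quarter-pel MC tables: in mcXY, X is the horizontal quarter-pel offset
     * (0..3) and Y the vertical one, so table index = 4*Y + X. */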
    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
    c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;

    c->draw_edges = draw_edges_c;

#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_VC1_DECODER
    ff_vc1dsp_init(c,avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#endif
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif

    c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_DWT
    ff_dsputil_init_dwt(c);
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
#if CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
#if CONFIG_LPC
    c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = ff_vector_fmul_window_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->vector_clipf = vector_clipf_c;
    c->float_to_int16 = ff_float_to_int16_c;
    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;

    c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
    c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;

    c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
    c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;

    c->shrink[0]= av_image_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);

    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];

    c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
    c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
    c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
    c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
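
    /* Build the coefficient permutation matching the selected IDCT; the scan
     * tables are run through this so the dequantizer can emit coefficients
     * in whatever order the IDCT wants (e.g. FF_TRANSPOSE_IDCT_PERM swaps
     * row and column indices). */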
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");