3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "mpegvideo.h"
33 #include "simple_idct.h"
40 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
43 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
46 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
49 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
/* Clipping LUT: once the index is biased by MAX_NEG_CROP it maps any value in
 * [-MAX_NEG_CROP, 255+MAX_NEG_CROP] to 0..255.  Zero-initialized here;
 * presumably filled by the dsputil init code — confirm. */
51 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square LUT biased by 256: sq[256+x] == x*x for x in [-256,255].
 * Zero-initialized here; presumably filled at init — confirm. */
52 uint32_t ff_squareTbl[512] = {0, };
54 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 evaluates to 0x0101...01 for the native unsigned long, so these
 * replicate the byte 0x7F / 0x80 into every byte of a machine word. */
55 #define pb_7f (~0UL/255 * 0x7f)
56 #define pb_80 (~0UL/255 * 0x80)
/* Standard JPEG/MPEG zigzag scan: entry i is the row-major position (0..63)
 * in the 8x8 block of the i-th transmitted coefficient. */
58 const uint8_t ff_zigzag_direct[64] = {
59 0, 1, 8, 16, 9, 2, 3, 10,
60 17, 24, 32, 25, 18, 11, 4, 5,
61 12, 19, 26, 33, 40, 48, 41, 34,
62 27, 20, 13, 6, 7, 14, 21, 28,
63 35, 42, 49, 56, 57, 50, 43, 36,
64 29, 22, 15, 23, 30, 37, 44, 51,
65 58, 59, 52, 45, 38, 31, 39, 46,
66 53, 60, 61, 54, 47, 55, 62, 63
69 /* Specific zigzag scan for 248 idct. NOTE that unlike the
70 specification, we interleave the fields */
71 const uint8_t ff_zigzag248_direct[64] = {
72 0, 8, 1, 9, 16, 24, 2, 10,
73 17, 25, 32, 40, 48, 56, 33, 41,
74 18, 26, 3, 11, 4, 12, 19, 27,
75 34, 42, 49, 57, 50, 58, 35, 43,
76 20, 28, 5, 13, 6, 14, 21, 29,
77 36, 44, 51, 59, 52, 60, 37, 45,
78 22, 30, 7, 15, 23, 31, 38, 46,
79 53, 61, 54, 62, 39, 47, 55, 63,
82 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Zero-initialized here; presumably filled at init.  The 8-byte alignment is
 * required by the MMX quantizer that indexes this table. */
83 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate horizontal scan order (coefficient index -> 8x8 position). */
85 const uint8_t ff_alternate_horizontal_scan[64] = {
86 0, 1, 2, 3, 8, 9, 16, 17,
87 10, 11, 4, 5, 6, 7, 15, 14,
88 13, 12, 19, 18, 24, 25, 32, 33,
89 26, 27, 20, 21, 22, 23, 28, 29,
90 30, 31, 34, 35, 40, 41, 48, 49,
91 42, 43, 36, 37, 38, 39, 44, 45,
92 46, 47, 50, 51, 56, 57, 58, 59,
93 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order (coefficient index -> 8x8 position). */
96 const uint8_t ff_alternate_vertical_scan[64] = {
97 0, 8, 16, 24, 1, 9, 2, 10,
98 17, 25, 32, 40, 48, 56, 57, 49,
99 41, 33, 26, 18, 3, 11, 4, 12,
100 19, 27, 34, 42, 50, 58, 35, 43,
101 51, 59, 20, 28, 5, 13, 6, 14,
102 21, 29, 36, 44, 52, 60, 37, 45,
103 53, 61, 22, 30, 7, 15, 23, 31,
104 38, 46, 54, 62, 39, 47, 55, 63,
107 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Reciprocal table for division-by-multiplication: entry b is
 * ceil(2^32 / b) (entries 0 and 1 are placeholders). */
108 const uint32_t ff_inverse[256]={
109 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
110 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
111 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
112 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
113 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
114 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
115 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
116 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
117 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
118 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
119 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
120 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
121 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
122 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
123 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
124 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
125 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
126 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
127 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
128 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
129 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
130 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
131 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
132 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
133 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
134 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
135 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
136 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
137 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
138 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
139 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
140 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
143 /* Input permutation for the simple_idct_mmx */
/* Coefficient reordering applied to quantizer/scan tables so that
 * simple_idct_mmx can read its input in its preferred layout. */
144 static const uint8_t simple_mmx_permutation[64]={
145 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
146 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
147 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
148 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
149 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
150 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
151 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
152 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/**
 * Sum of all 256 pixel values of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size stride between lines in bytes
 * @return the sum of the pixels
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        /* step back to the start of the next line */
        pix += line_size - 16;
    }
    return s;
}
177 static int pix_norm1_c(uint8_t * pix, int line_size)
180 uint32_t *sq = ff_squareTbl + 256;
183 for (i = 0; i < 16; i++) {
184 for (j = 0; j < 16; j += 8) {
195 #if LONG_MAX > 2147483647
196 register uint64_t x=*(uint64_t*)pix;
198 s += sq[(x>>8)&0xff];
199 s += sq[(x>>16)&0xff];
200 s += sq[(x>>24)&0xff];
201 s += sq[(x>>32)&0xff];
202 s += sq[(x>>40)&0xff];
203 s += sq[(x>>48)&0xff];
204 s += sq[(x>>56)&0xff];
206 register uint32_t x=*(uint32_t*)pix;
208 s += sq[(x>>8)&0xff];
209 s += sq[(x>>16)&0xff];
210 s += sq[(x>>24)&0xff];
211 x=*(uint32_t*)(pix+4);
213 s += sq[(x>>8)&0xff];
214 s += sq[(x>>16)&0xff];
215 s += sq[(x>>24)&0xff];
220 pix += line_size - 16;
/**
 * Byte-swap w 32-bit words from src into dst (in-place use is fine:
 * each word is read before its slot is written).
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    /* One word per iteration; the compiler unrolls this as effectively as
       the old hand-unrolled 8-at-a-time loop plus scalar tail did. */
    for (i = 0; i < w; i++)
        dst[i] = bswap_32(src[i]);
}
243 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
246 uint32_t *sq = ff_squareTbl + 256;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[0] - pix2[0]];
251 s += sq[pix1[1] - pix2[1]];
252 s += sq[pix1[2] - pix2[2]];
253 s += sq[pix1[3] - pix2[3]];
260 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
263 uint32_t *sq = ff_squareTbl + 256;
266 for (i = 0; i < h; i++) {
267 s += sq[pix1[0] - pix2[0]];
268 s += sq[pix1[1] - pix2[1]];
269 s += sq[pix1[2] - pix2[2]];
270 s += sq[pix1[3] - pix2[3]];
271 s += sq[pix1[4] - pix2[4]];
272 s += sq[pix1[5] - pix2[5]];
273 s += sq[pix1[6] - pix2[6]];
274 s += sq[pix1[7] - pix2[7]];
281 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
284 uint32_t *sq = ff_squareTbl + 256;
287 for (i = 0; i < h; i++) {
288 s += sq[pix1[ 0] - pix2[ 0]];
289 s += sq[pix1[ 1] - pix2[ 1]];
290 s += sq[pix1[ 2] - pix2[ 2]];
291 s += sq[pix1[ 3] - pix2[ 3]];
292 s += sq[pix1[ 4] - pix2[ 4]];
293 s += sq[pix1[ 5] - pix2[ 5]];
294 s += sq[pix1[ 6] - pix2[ 6]];
295 s += sq[pix1[ 7] - pix2[ 7]];
296 s += sq[pix1[ 8] - pix2[ 8]];
297 s += sq[pix1[ 9] - pix2[ 9]];
298 s += sq[pix1[10] - pix2[10]];
299 s += sq[pix1[11] - pix2[11]];
300 s += sq[pix1[12] - pix2[12]];
301 s += sq[pix1[13] - pix2[13]];
302 s += sq[pix1[14] - pix2[14]];
303 s += sq[pix1[15] - pix2[15]];
312 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/* Wavelet-domain distortion metric: forward-DWT the (pix1 - pix2) difference
 * and accumulate per-subband-scaled absolute coefficients.
 * NOTE(review): several lines (locals, tmp declaration, the accumulation and
 * the return) are missing from this extract; comments describe only what is
 * visible here. */
313 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
/* 8-wide blocks use 3 decomposition levels, larger ones use 4 */
315 const int dec_count= w==8 ? 3 : 4;
/* scale[type][dec_count-3][level][orientation]: psychovisual weights for the
 * 9/7 (type 0) and 5/3 (type 1) wavelets */
318 static const int scale[2][2][4][4]={
322 {268, 239, 239, 213},
326 // 9/7 16x16 or 32x32 dec=4
327 {344, 310, 310, 280},
335 {275, 245, 245, 218},
339 // 5/3 16x16 or 32x32 dec=4
340 {352, 317, 317, 286},
/* build the difference signal, left-shifted by 4 for fixed-point headroom */
348 for (i = 0; i < h; i++) {
349 for (j = 0; j < w; j+=4) {
350 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
351 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
352 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
353 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
/* forward spatial DWT in place on the 32-wide tmp buffer (from snow.c) */
359 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
/* walk every subband: level 0 includes the LL band (ori 0), others skip it */
363 for(level=0; level<dec_count; level++){
364 for(ori= level ? 1 : 0; ori<4; ori++){
365 int size= w>>(dec_count-level);
366 int sx= (ori&1) ? size : 0;
367 int stride= 32<<(dec_count-level);
368 int sy= (ori&2) ? stride>>1 : 0;
370 for(i=0; i<size; i++){
371 for(j=0; j<size; j++){
372 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* Fixed-parameter wrappers binding w_c to the standard
 * (v, pix1, pix2, line_size, h) comparison signature:
 * w53_* use the 5/3 wavelet (type 1), w97_* the 9/7 wavelet (type 0),
 * for 8-, 16- and 32-pixel-wide blocks respectively. */
382 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
383 return w_c(v, pix1, pix2, line_size, 8, h, 1);
386 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
387 return w_c(v, pix1, pix2, line_size, 8, h, 0);
390 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
391 return w_c(v, pix1, pix2, line_size, 16, h, 1);
394 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
395 return w_c(v, pix1, pix2, line_size, 16, h, 0);
/* the 32-wide variants are non-static: referenced from snow.c */
398 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
399 return w_c(v, pix1, pix2, line_size, 32, h, 1);
402 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
403 return w_c(v, pix1, pix2, line_size, 32, h, 0);
/**
 * Replicate the border pixels of a width x height image into a margin of
 * 'w' extra pixels on every side, so motion compensation may read slightly
 * outside the picture.  buf points at the top-left image pixel and wrap is
 * the line stride; the surrounding margin must already be allocated.
 *
 * Fix: the last two corner memsets were commented "top left"/"top right"
 * although they fill the BOTTOM corners (they start from last_line).
 */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    /* top and bottom: copy the first/last image line into each border row */
    last_line = buf + (height - 1) * wrap;
    for (i = 0; i < w; i++) {
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }

    /* left and right: replicate the first/last pixel of each line */
    ptr = buf;
    for (i = 0; i < height; i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width - 1], w);
        ptr += wrap;
    }

    /* corners: replicate the four corner pixels */
    for (i = 0; i < w; i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w);                         /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width - 1], w);             /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w);             /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width - 1], w); /* bottom right */
    }
}
436 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
440 /* read the pixels */
442 block[0] = pixels[0];
443 block[1] = pixels[1];
444 block[2] = pixels[2];
445 block[3] = pixels[3];
446 block[4] = pixels[4];
447 block[5] = pixels[5];
448 block[6] = pixels[6];
449 block[7] = pixels[7];
455 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
456 const uint8_t *s2, int stride){
459 /* read the pixels */
461 block[0] = s1[0] - s2[0];
462 block[1] = s1[1] - s2[1];
463 block[2] = s1[2] - s2[2];
464 block[3] = s1[3] - s2[3];
465 block[4] = s1[4] - s2[4];
466 block[5] = s1[5] - s2[5];
467 block[6] = s1[6] - s2[6];
468 block[7] = s1[7] - s2[7];
476 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
480 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
482 /* read the pixels */
484 pixels[0] = cm[block[0]];
485 pixels[1] = cm[block[1]];
486 pixels[2] = cm[block[2]];
487 pixels[3] = cm[block[3]];
488 pixels[4] = cm[block[4]];
489 pixels[5] = cm[block[5]];
490 pixels[6] = cm[block[6]];
491 pixels[7] = cm[block[7]];
498 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
502 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
504 /* read the pixels */
506 pixels[0] = cm[block[0]];
507 pixels[1] = cm[block[1]];
508 pixels[2] = cm[block[2]];
509 pixels[3] = cm[block[3]];
516 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
520 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
522 /* read the pixels */
524 pixels[0] = cm[block[0]];
525 pixels[1] = cm[block[1]];
532 static void put_signed_pixels_clamped_c(const DCTELEM *block,
533 uint8_t *restrict pixels,
538 for (i = 0; i < 8; i++) {
539 for (j = 0; j < 8; j++) {
542 else if (*block > 127)
545 *pixels = (uint8_t)(*block + 128);
549 pixels += (line_size - 8);
553 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
557 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
559 /* read the pixels */
561 pixels[0] = cm[pixels[0] + block[0]];
562 pixels[1] = cm[pixels[1] + block[1]];
563 pixels[2] = cm[pixels[2] + block[2]];
564 pixels[3] = cm[pixels[3] + block[3]];
565 pixels[4] = cm[pixels[4] + block[4]];
566 pixels[5] = cm[pixels[5] + block[5]];
567 pixels[6] = cm[pixels[6] + block[6]];
568 pixels[7] = cm[pixels[7] + block[7]];
574 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
578 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
580 /* read the pixels */
582 pixels[0] = cm[pixels[0] + block[0]];
583 pixels[1] = cm[pixels[1] + block[1]];
584 pixels[2] = cm[pixels[2] + block[2]];
585 pixels[3] = cm[pixels[3] + block[3]];
591 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
595 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
597 /* read the pixels */
599 pixels[0] = cm[pixels[0] + block[0]];
600 pixels[1] = cm[pixels[1] + block[1]];
606 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
610 pixels[0] += block[0];
611 pixels[1] += block[1];
612 pixels[2] += block[2];
613 pixels[3] += block[3];
614 pixels[4] += block[4];
615 pixels[5] += block[5];
616 pixels[6] += block[6];
617 pixels[7] += block[7];
623 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
627 pixels[0] += block[0];
628 pixels[1] += block[1];
629 pixels[2] += block[2];
630 pixels[3] += block[3];
636 static int sum_abs_dctelem_c(DCTELEM *block)
640 sum+= FFABS(block[i]);
/* 64-bit-word pixel-op template.  For a given OPNAME/OP pair (put/avg) this
 * generates the 8-pixel-wide copy/average kernels (plain, x2, y2, xy2 and
 * their no_rnd variants) operating on whole uint64_t words, plus the 16-wide
 * versions via CALL_2X_PIXELS.  The 0xFEFE.../0x0303.../0xFCFC... masks
 * implement carry-free per-byte averaging: (a&b)+(((a^b)&0xFE..)>>1) rounds
 * down, (a|b)-(((a^b)&0xFE..)>>1) rounds up, and the xy2 kernels keep the
 * low 2 bits (l0/l1) and high 6 bits (h0/h1) separate so four samples can be
 * averaged per byte without overflow.
 * NOTE(review): many continuation lines are missing from this extract;
 * the macro body is left byte-identical. */
646 #define PIXOP2(OPNAME, OP) \
647 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
651 OP(*((uint64_t*)block), AV_RN64(pixels));\
657 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
661 const uint64_t a= AV_RN64(pixels );\
662 const uint64_t b= AV_RN64(pixels+1);\
663 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
669 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
673 const uint64_t a= AV_RN64(pixels );\
674 const uint64_t b= AV_RN64(pixels+1);\
675 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
681 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
685 const uint64_t a= AV_RN64(pixels );\
686 const uint64_t b= AV_RN64(pixels+line_size);\
687 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
693 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
697 const uint64_t a= AV_RN64(pixels );\
698 const uint64_t b= AV_RN64(pixels+line_size);\
699 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
705 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
708 const uint64_t a= AV_RN64(pixels );\
709 const uint64_t b= AV_RN64(pixels+1);\
710 uint64_t l0= (a&0x0303030303030303ULL)\
711 + (b&0x0303030303030303ULL)\
712 + 0x0202020202020202ULL;\
713 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
714 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
718 for(i=0; i<h; i+=2){\
719 uint64_t a= AV_RN64(pixels );\
720 uint64_t b= AV_RN64(pixels+1);\
721 l1= (a&0x0303030303030303ULL)\
722 + (b&0x0303030303030303ULL);\
723 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
724 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
725 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
728 a= AV_RN64(pixels );\
729 b= AV_RN64(pixels+1);\
730 l0= (a&0x0303030303030303ULL)\
731 + (b&0x0303030303030303ULL)\
732 + 0x0202020202020202ULL;\
733 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
734 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
735 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
741 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
744 const uint64_t a= AV_RN64(pixels );\
745 const uint64_t b= AV_RN64(pixels+1);\
746 uint64_t l0= (a&0x0303030303030303ULL)\
747 + (b&0x0303030303030303ULL)\
748 + 0x0101010101010101ULL;\
749 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
750 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
754 for(i=0; i<h; i+=2){\
755 uint64_t a= AV_RN64(pixels );\
756 uint64_t b= AV_RN64(pixels+1);\
757 l1= (a&0x0303030303030303ULL)\
758 + (b&0x0303030303030303ULL);\
759 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
760 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
761 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
764 a= AV_RN64(pixels );\
765 b= AV_RN64(pixels+1);\
766 l0= (a&0x0303030303030303ULL)\
767 + (b&0x0303030303030303ULL)\
768 + 0x0101010101010101ULL;\
769 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
770 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
771 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
777 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
778 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
779 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
780 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
781 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
782 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
783 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* 64-bit word average with round-up, used as the OP for the avg family */
785 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
786 #else // 64 bit variant
/* 32-bit-word pixel-op template (fallback when 64-bit words are not used).
 * For a given OPNAME/OP pair this generates the 2/4/8-pixel-wide kernels,
 * two-source (_l2) and four-source (_l4) averagers, the x2/y2/xy2 half-pel
 * variants built on them, and the 16-wide versions via CALL_2X_PIXELS.
 * rnd_avg32/no_rnd_avg32 perform carry-free per-byte averaging; the _l4 and
 * xy2 kernels split each byte into low 2 bits (l0/l1) and high 6 bits
 * (h0/h1) so four samples can be averaged per byte without overflow
 * (bias 0x02 per byte = round-to-nearest, 0x01 = no-rnd truncating).
 * NOTE(review): many continuation lines are missing from this extract;
 * the macro body is left byte-identical. */
788 #define PIXOP2(OPNAME, OP) \
789 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
792 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
797 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
800 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
805 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
808 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
809 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
814 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
815 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
818 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
819 int src_stride1, int src_stride2, int h){\
823 a= AV_RN32(&src1[i*src_stride1 ]);\
824 b= AV_RN32(&src2[i*src_stride2 ]);\
825 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
826 a= AV_RN32(&src1[i*src_stride1+4]);\
827 b= AV_RN32(&src2[i*src_stride2+4]);\
828 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
832 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
833 int src_stride1, int src_stride2, int h){\
837 a= AV_RN32(&src1[i*src_stride1 ]);\
838 b= AV_RN32(&src2[i*src_stride2 ]);\
839 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
840 a= AV_RN32(&src1[i*src_stride1+4]);\
841 b= AV_RN32(&src2[i*src_stride2+4]);\
842 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
846 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
847 int src_stride1, int src_stride2, int h){\
851 a= AV_RN32(&src1[i*src_stride1 ]);\
852 b= AV_RN32(&src2[i*src_stride2 ]);\
853 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
857 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
858 int src_stride1, int src_stride2, int h){\
862 a= AV_RN16(&src1[i*src_stride1 ]);\
863 b= AV_RN16(&src2[i*src_stride2 ]);\
864 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
868 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
869 int src_stride1, int src_stride2, int h){\
870 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
871 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
874 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
875 int src_stride1, int src_stride2, int h){\
876 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
877 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
880 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
881 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
884 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
885 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
888 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
889 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
892 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
893 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
896 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
897 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
900 uint32_t a, b, c, d, l0, l1, h0, h1;\
901 a= AV_RN32(&src1[i*src_stride1]);\
902 b= AV_RN32(&src2[i*src_stride2]);\
903 c= AV_RN32(&src3[i*src_stride3]);\
904 d= AV_RN32(&src4[i*src_stride4]);\
905 l0= (a&0x03030303UL)\
908 h0= ((a&0xFCFCFCFCUL)>>2)\
909 + ((b&0xFCFCFCFCUL)>>2);\
910 l1= (c&0x03030303UL)\
912 h1= ((c&0xFCFCFCFCUL)>>2)\
913 + ((d&0xFCFCFCFCUL)>>2);\
914 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
915 a= AV_RN32(&src1[i*src_stride1+4]);\
916 b= AV_RN32(&src2[i*src_stride2+4]);\
917 c= AV_RN32(&src3[i*src_stride3+4]);\
918 d= AV_RN32(&src4[i*src_stride4+4]);\
919 l0= (a&0x03030303UL)\
922 h0= ((a&0xFCFCFCFCUL)>>2)\
923 + ((b&0xFCFCFCFCUL)>>2);\
924 l1= (c&0x03030303UL)\
926 h1= ((c&0xFCFCFCFCUL)>>2)\
927 + ((d&0xFCFCFCFCUL)>>2);\
928 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
932 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
933 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
936 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
937 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
940 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
941 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
944 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
945 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
948 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
949 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
952 uint32_t a, b, c, d, l0, l1, h0, h1;\
953 a= AV_RN32(&src1[i*src_stride1]);\
954 b= AV_RN32(&src2[i*src_stride2]);\
955 c= AV_RN32(&src3[i*src_stride3]);\
956 d= AV_RN32(&src4[i*src_stride4]);\
957 l0= (a&0x03030303UL)\
960 h0= ((a&0xFCFCFCFCUL)>>2)\
961 + ((b&0xFCFCFCFCUL)>>2);\
962 l1= (c&0x03030303UL)\
964 h1= ((c&0xFCFCFCFCUL)>>2)\
965 + ((d&0xFCFCFCFCUL)>>2);\
966 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
967 a= AV_RN32(&src1[i*src_stride1+4]);\
968 b= AV_RN32(&src2[i*src_stride2+4]);\
969 c= AV_RN32(&src3[i*src_stride3+4]);\
970 d= AV_RN32(&src4[i*src_stride4+4]);\
971 l0= (a&0x03030303UL)\
974 h0= ((a&0xFCFCFCFCUL)>>2)\
975 + ((b&0xFCFCFCFCUL)>>2);\
976 l1= (c&0x03030303UL)\
978 h1= ((c&0xFCFCFCFCUL)>>2)\
979 + ((d&0xFCFCFCFCUL)>>2);\
980 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
983 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
984 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
985 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
986 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
988 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
989 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
990 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
991 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
994 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
996 int i, a0, b0, a1, b1;\
1003 for(i=0; i<h; i+=2){\
1009 block[0]= (a1+a0)>>2; /* FIXME non put */\
1010 block[1]= (b1+b0)>>2;\
1020 block[0]= (a1+a0)>>2;\
1021 block[1]= (b1+b0)>>2;\
1027 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1030 const uint32_t a= AV_RN32(pixels );\
1031 const uint32_t b= AV_RN32(pixels+1);\
1032 uint32_t l0= (a&0x03030303UL)\
1035 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1036 + ((b&0xFCFCFCFCUL)>>2);\
1040 for(i=0; i<h; i+=2){\
1041 uint32_t a= AV_RN32(pixels );\
1042 uint32_t b= AV_RN32(pixels+1);\
1043 l1= (a&0x03030303UL)\
1044 + (b&0x03030303UL);\
1045 h1= ((a&0xFCFCFCFCUL)>>2)\
1046 + ((b&0xFCFCFCFCUL)>>2);\
1047 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1050 a= AV_RN32(pixels );\
1051 b= AV_RN32(pixels+1);\
1052 l0= (a&0x03030303UL)\
1055 h0= ((a&0xFCFCFCFCUL)>>2)\
1056 + ((b&0xFCFCFCFCUL)>>2);\
1057 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1063 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1066 for(j=0; j<2; j++){\
1068 const uint32_t a= AV_RN32(pixels );\
1069 const uint32_t b= AV_RN32(pixels+1);\
1070 uint32_t l0= (a&0x03030303UL)\
1073 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1074 + ((b&0xFCFCFCFCUL)>>2);\
1078 for(i=0; i<h; i+=2){\
1079 uint32_t a= AV_RN32(pixels );\
1080 uint32_t b= AV_RN32(pixels+1);\
1081 l1= (a&0x03030303UL)\
1082 + (b&0x03030303UL);\
1083 h1= ((a&0xFCFCFCFCUL)>>2)\
1084 + ((b&0xFCFCFCFCUL)>>2);\
1085 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1088 a= AV_RN32(pixels );\
1089 b= AV_RN32(pixels+1);\
1090 l0= (a&0x03030303UL)\
1093 h0= ((a&0xFCFCFCFCUL)>>2)\
1094 + ((b&0xFCFCFCFCUL)>>2);\
1095 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1099 pixels+=4-line_size*(h+1);\
1100 block +=4-line_size*h;\
1104 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1107 for(j=0; j<2; j++){\
1109 const uint32_t a= AV_RN32(pixels );\
1110 const uint32_t b= AV_RN32(pixels+1);\
1111 uint32_t l0= (a&0x03030303UL)\
1114 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1115 + ((b&0xFCFCFCFCUL)>>2);\
1119 for(i=0; i<h; i+=2){\
1120 uint32_t a= AV_RN32(pixels );\
1121 uint32_t b= AV_RN32(pixels+1);\
1122 l1= (a&0x03030303UL)\
1123 + (b&0x03030303UL);\
1124 h1= ((a&0xFCFCFCFCUL)>>2)\
1125 + ((b&0xFCFCFCFCUL)>>2);\
1126 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1129 a= AV_RN32(pixels );\
1130 b= AV_RN32(pixels+1);\
1131 l0= (a&0x03030303UL)\
1134 h0= ((a&0xFCFCFCFCUL)>>2)\
1135 + ((b&0xFCFCFCFCUL)>>2);\
1136 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1140 pixels+=4-line_size*(h+1);\
1141 block +=4-line_size*h;\
1145 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1146 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1147 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1148 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1149 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1150 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1151 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1152 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* Elementary per-word ops plugged into PIXOP2: 'avg' blends with the
 * existing destination via rnd_avg32(), 'put' is a plain store. */
1154 #define op_avg(a, b) a = rnd_avg32(a, b)
1156 #define op_put(a, b) a = b
/* Scalar pixel averages with round-to-nearest (2- and 4-tap). */
1163 #define avg2(a,b) ((a+b+1)>>1)
1164 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Thin wrappers giving the macro-generated two-source averagers the
 * standard (dst, a, b, stride, h) signature with a single shared stride. */
1166 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1167 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1170 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1171 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/**
 * 1/16-pel bilinear motion compensation for one 8-pixel-wide block
 * (MPEG-4 GMC one-warp-point fast path).
 * @param x16,y16 fractional position in 1/16ths (0..15)
 * @param rounder bias added before the >>8 normalization (weights sum to 256)
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[stride + j] + D * src[stride + j + 1] +
                      rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
/* Global motion compensation for one 8xh block: each destination pixel's
 * source position is an affine function of (x,y) in units of 1/(1<<shift),
 * sampled with bilinear interpolation; positions outside the picture clamp
 * to the nearest edge/corner sample.
 * NOTE(review): the lines computing vx/vy and src_x/src_y from
 * ox/oy/dxx/dxy/dyx/dyy, the y loop header and the rounding terms are
 * missing from this extract; comments describe only what is visible. */
1197 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1198 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1201 const int s= 1<<shift;
1211 for(x=0; x<8; x++){ //XXX FIXME optimize
1212 int src_x, src_y, frac_x, frac_y, index;
/* fractional parts of the source position, in 1/s units */
1216 frac_x= src_x&(s-1);
1217 frac_y= src_y&(s-1);
/* unsigned compare doubles as a 0<=v<limit range check */
1221 if((unsigned)src_x < width){
1222 if((unsigned)src_y < height){
/* fully inside: bilinear blend of the 2x2 neighborhood */
1223 index= src_x + src_y*stride;
1224 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1225 + src[index +1]* frac_x )*(s-frac_y)
1226 + ( src[index+stride ]*(s-frac_x)
1227 + src[index+stride+1]* frac_x )* frac_y
/* vertically outside: clamp y, interpolate horizontally only */
1230 index= src_x + av_clip(src_y, 0, height)*stride;
1231 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1232 + src[index +1]* frac_x )*s
1236 if((unsigned)src_y < height){
/* horizontally outside: clamp x, interpolate vertically only */
1237 index= av_clip(src_x, 0, width) + src_y*stride;
1238 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1239 + src[index+stride ]* frac_y )*s
/* outside in both directions: nearest corner sample */
1242 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1243 dst[y*stride + x]= src[index ];
/** Third-pel MC, integer position: plain copy, dispatched on block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/** Third-pel MC, horizontal 1/3 phase: dst = (2*a + b)/3, a=src[j], b=src[j+1].
 *  683/2048 approximates 1/3 with rounding. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/** Third-pel MC, horizontal 2/3 phase: dst = (a + 2*b)/3. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/** Third-pel MC, vertical 1/3 phase: dst = (2*top + bottom)/3. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/** Third-pel MC, (1/3,1/3) phase: bilinear with weights 4:3:3:2 /12
 *  (2731/32768 approximates 1/12 with rounding). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/** Third-pel MC, (1/3,2/3) phase: weights 3:2:4:3 /12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/** Third-pel MC, vertical 2/3 phase: dst = (top + 2*bottom)/3. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/** Third-pel MC, (2/3,1/3) phase: weights 3:4:2:3 /12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/** Third-pel MC, (2/3,2/3) phase: weights 2:3:3:4 /12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/** Third-pel MC, integer position, averaging into dst; width-dispatched. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/** As put_tpel_pixels_mc10_c, but rounds-and-averages into existing dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** As put_tpel_pixels_mc20_c, but rounds-and-averages into existing dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** As put_tpel_pixels_mc01_c, but rounds-and-averages into existing dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** As put_tpel_pixels_mc11_c, but rounds-and-averages into existing dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** As put_tpel_pixels_mc12_c, but rounds-and-averages into existing dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** As put_tpel_pixels_mc02_c, but rounds-and-averages into existing dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** As put_tpel_pixels_mc21_c, but rounds-and-averages into existing dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** As put_tpel_pixels_mc22_c, but rounds-and-averages into existing dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Expands to thin per-width wrappers around the generic third-pel helpers
 * above, binding a compile-time width. Fixed: each wrapper body had a stray
 * "void" keyword in front of the call statement, which would not compile if
 * the macro were ever instantiated.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/**
 * H.264 chroma bilinear MC for 2/4/8-pixel-wide blocks.
 * Weights are (8-x,x)x(8-y,y) in 1/8-pel units; OP applies the final
 * rounding/averaging. When D==0 (x==0 or y==0) the 2D filter degenerates
 * to a 1D filter with weight E=B+C along either the row (step=1) or the
 * column (step=stride), which the fast path exploits.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1571 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1572 #define op_put(a, b) a = (((b) + 32)>>6)
1574 H264_CHROMA_MC(put_ , op_put)
1575 H264_CHROMA_MC(avg_ , op_avg)
/**
 * 8-wide chroma bilinear MC with "no rounding" bias: identical weights to
 * the H.264 chroma filter but uses rounding constant 32-4=28 before >>6
 * (used by VC-1/WMV3 style no-rounding modes).
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
1603 #define QPEL_MC(r, OPNAME, RND, OP) \
1604 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1605 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1609 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1610 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1611 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1612 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1613 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1614 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1615 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1616 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1622 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1624 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1628 const int src0= src[0*srcStride];\
1629 const int src1= src[1*srcStride];\
1630 const int src2= src[2*srcStride];\
1631 const int src3= src[3*srcStride];\
1632 const int src4= src[4*srcStride];\
1633 const int src5= src[5*srcStride];\
1634 const int src6= src[6*srcStride];\
1635 const int src7= src[7*srcStride];\
1636 const int src8= src[8*srcStride];\
1637 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1638 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1639 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1640 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1641 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1642 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1643 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1644 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1650 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1651 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1656 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1657 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1658 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1659 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1660 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1661 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1662 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1663 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1664 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1665 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1666 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1667 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1668 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1669 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1670 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1671 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1677 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1678 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1683 const int src0= src[0*srcStride];\
1684 const int src1= src[1*srcStride];\
1685 const int src2= src[2*srcStride];\
1686 const int src3= src[3*srcStride];\
1687 const int src4= src[4*srcStride];\
1688 const int src5= src[5*srcStride];\
1689 const int src6= src[6*srcStride];\
1690 const int src7= src[7*srcStride];\
1691 const int src8= src[8*srcStride];\
1692 const int src9= src[9*srcStride];\
1693 const int src10= src[10*srcStride];\
1694 const int src11= src[11*srcStride];\
1695 const int src12= src[12*srcStride];\
1696 const int src13= src[13*srcStride];\
1697 const int src14= src[14*srcStride];\
1698 const int src15= src[15*srcStride];\
1699 const int src16= src[16*srcStride];\
1700 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1701 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1702 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1703 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1704 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1705 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1706 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1707 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1708 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1709 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1710 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1711 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1712 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1713 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1714 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1715 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1721 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1722 OPNAME ## pixels8_c(dst, src, stride, 8);\
1725 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1727 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1728 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1731 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1732 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1735 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1737 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1738 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1741 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1742 uint8_t full[16*9];\
1744 copy_block9(full, src, 16, stride, 9);\
1745 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1746 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1749 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1750 uint8_t full[16*9];\
1751 copy_block9(full, src, 16, stride, 9);\
1752 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1755 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1756 uint8_t full[16*9];\
1758 copy_block9(full, src, 16, stride, 9);\
1759 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1760 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1762 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1763 uint8_t full[16*9];\
1766 uint8_t halfHV[64];\
1767 copy_block9(full, src, 16, stride, 9);\
1768 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1769 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1770 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1771 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1773 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1774 uint8_t full[16*9];\
1776 uint8_t halfHV[64];\
1777 copy_block9(full, src, 16, stride, 9);\
1778 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1779 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1780 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1781 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1783 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1784 uint8_t full[16*9];\
1787 uint8_t halfHV[64];\
1788 copy_block9(full, src, 16, stride, 9);\
1789 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1790 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1791 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1792 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1794 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1795 uint8_t full[16*9];\
1797 uint8_t halfHV[64];\
1798 copy_block9(full, src, 16, stride, 9);\
1799 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1800 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1801 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1802 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1804 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1805 uint8_t full[16*9];\
1808 uint8_t halfHV[64];\
1809 copy_block9(full, src, 16, stride, 9);\
1810 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1811 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1812 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1813 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1815 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1816 uint8_t full[16*9];\
1818 uint8_t halfHV[64];\
1819 copy_block9(full, src, 16, stride, 9);\
1820 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1821 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1822 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1823 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1825 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1826 uint8_t full[16*9];\
1829 uint8_t halfHV[64];\
1830 copy_block9(full, src, 16, stride, 9);\
1831 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1832 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1833 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1834 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1836 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1837 uint8_t full[16*9];\
1839 uint8_t halfHV[64];\
1840 copy_block9(full, src, 16, stride, 9);\
1841 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1842 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1843 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1844 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1846 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1848 uint8_t halfHV[64];\
1849 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1850 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1851 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1853 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1855 uint8_t halfHV[64];\
1856 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1857 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1858 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1860 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1861 uint8_t full[16*9];\
1864 uint8_t halfHV[64];\
1865 copy_block9(full, src, 16, stride, 9);\
1866 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1867 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1868 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1869 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1871 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1872 uint8_t full[16*9];\
1874 copy_block9(full, src, 16, stride, 9);\
1875 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1876 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1877 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1879 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1880 uint8_t full[16*9];\
1883 uint8_t halfHV[64];\
1884 copy_block9(full, src, 16, stride, 9);\
1885 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1886 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1887 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1888 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1890 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1891 uint8_t full[16*9];\
1893 copy_block9(full, src, 16, stride, 9);\
1894 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1895 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1896 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1898 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1900 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1901 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1903 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1904 OPNAME ## pixels16_c(dst, src, stride, 16);\
1907 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1909 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1910 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1913 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1914 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1917 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1919 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1920 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1923 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1924 uint8_t full[24*17];\
1926 copy_block17(full, src, 24, stride, 17);\
1927 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1928 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1931 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1932 uint8_t full[24*17];\
1933 copy_block17(full, src, 24, stride, 17);\
1934 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1937 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1938 uint8_t full[24*17];\
1940 copy_block17(full, src, 24, stride, 17);\
1941 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1942 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1944 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1945 uint8_t full[24*17];\
1946 uint8_t halfH[272];\
1947 uint8_t halfV[256];\
1948 uint8_t halfHV[256];\
1949 copy_block17(full, src, 24, stride, 17);\
1950 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1951 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1952 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1953 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1955 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1956 uint8_t full[24*17];\
1957 uint8_t halfH[272];\
1958 uint8_t halfHV[256];\
1959 copy_block17(full, src, 24, stride, 17);\
1960 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1961 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1962 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1963 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1965 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1966 uint8_t full[24*17];\
1967 uint8_t halfH[272];\
1968 uint8_t halfV[256];\
1969 uint8_t halfHV[256];\
1970 copy_block17(full, src, 24, stride, 17);\
1971 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1972 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1973 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1974 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1976 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1977 uint8_t full[24*17];\
1978 uint8_t halfH[272];\
1979 uint8_t halfHV[256];\
1980 copy_block17(full, src, 24, stride, 17);\
1981 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1982 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1983 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1984 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1986 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1987 uint8_t full[24*17];\
1988 uint8_t halfH[272];\
1989 uint8_t halfV[256];\
1990 uint8_t halfHV[256];\
1991 copy_block17(full, src, 24, stride, 17);\
1992 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1993 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1994 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1995 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1997 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1998 uint8_t full[24*17];\
1999 uint8_t halfH[272];\
2000 uint8_t halfHV[256];\
2001 copy_block17(full, src, 24, stride, 17);\
2002 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2003 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2004 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2005 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2007 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2008 uint8_t full[24*17];\
2009 uint8_t halfH[272];\
2010 uint8_t halfV[256];\
2011 uint8_t halfHV[256];\
2012 copy_block17(full, src, 24, stride, 17);\
2013 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2014 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2015 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2016 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2018 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2019 uint8_t full[24*17];\
2020 uint8_t halfH[272];\
2021 uint8_t halfHV[256];\
2022 copy_block17(full, src, 24, stride, 17);\
2023 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2024 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2025 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2026 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2028 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2029 uint8_t halfH[272];\
2030 uint8_t halfHV[256];\
2031 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2032 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2033 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2035 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2036 uint8_t halfH[272];\
2037 uint8_t halfHV[256];\
2038 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2039 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2040 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2042 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2043 uint8_t full[24*17];\
2044 uint8_t halfH[272];\
2045 uint8_t halfV[256];\
2046 uint8_t halfHV[256];\
2047 copy_block17(full, src, 24, stride, 17);\
2048 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2049 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2050 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2051 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2053 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2054 uint8_t full[24*17];\
2055 uint8_t halfH[272];\
2056 copy_block17(full, src, 24, stride, 17);\
2057 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2058 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2059 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2061 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2062 uint8_t full[24*17];\
2063 uint8_t halfH[272];\
2064 uint8_t halfV[256];\
2065 uint8_t halfHV[256];\
2066 copy_block17(full, src, 24, stride, 17);\
2067 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2068 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2069 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2070 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2072 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2073 uint8_t full[24*17];\
2074 uint8_t halfH[272];\
2075 copy_block17(full, src, 24, stride, 17);\
2076 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2077 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2078 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2080 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2081 uint8_t halfH[272];\
2082 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2083 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Output operators plugged into QPEL_MC above: 'b' is the raw 6-tap filter
 * sum, scaled by 32 (hence the >>5); cm[] (ff_cropTbl + MAX_NEG_CROP) clamps
 * the rounded result to 0..255.  The *_no_rnd variants add 15 instead of 16
 * so ties round down instead of up. */
2086 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2087 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2088 #define op_put(a, b) a = cm[((b) + 16)>>5]
2089 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the MPEG-4 quarter-pel MC function families (put, put with
 * no rounding, avg).  The avg_no_rnd variant is intentionally disabled. */
2091 QPEL_MC(0, put_ , _ , op_put)
2092 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2093 QPEL_MC(0, avg_ , _ , op_avg)
2094 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* The helper operators are local to the instantiations above. */
2096 #undef op_avg_no_rnd
2098 #undef op_put_no_rnd
/*
 * H264_LOWPASS(OPNAME, OP, OP2): generates the H.264 six-tap
 * ([1 -5 20 20 -5 1]) half-pel interpolation primitives for block widths
 * 2/4/8/16, in horizontal (_h_lowpass), vertical (_v_lowpass) and
 * two-dimensional (_hv_lowpass) variants.  OP stores a one-pass result
 * (filter sum scaled by 32); OP2 stores a two-pass result (scaled by 1024)
 * taken from the intermediate int16_t tmp buffer.  cm[] clips to 0..255.
 */
2101 #define H264_LOWPASS(OPNAME, OP, OP2) \
2102 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2104 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2108 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2109 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2115 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2117 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2121 const int srcB= src[-2*srcStride];\
2122 const int srcA= src[-1*srcStride];\
2123 const int src0= src[0 *srcStride];\
2124 const int src1= src[1 *srcStride];\
2125 const int src2= src[2 *srcStride];\
2126 const int src3= src[3 *srcStride];\
2127 const int src4= src[4 *srcStride];\
2128 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2129 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2135 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2138 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2140 src -= 2*srcStride;\
2141 for(i=0; i<h+5; i++)\
2143 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2144 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2148 tmp -= tmpStride*(h+5-2);\
2151 const int tmpB= tmp[-2*tmpStride];\
2152 const int tmpA= tmp[-1*tmpStride];\
2153 const int tmp0= tmp[0 *tmpStride];\
2154 const int tmp1= tmp[1 *tmpStride];\
2155 const int tmp2= tmp[2 *tmpStride];\
2156 const int tmp3= tmp[3 *tmpStride];\
2157 const int tmp4= tmp[4 *tmpStride];\
2158 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2159 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2164 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2166 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2170 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2171 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2172 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2173 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2179 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2181 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2185 const int srcB= src[-2*srcStride];\
2186 const int srcA= src[-1*srcStride];\
2187 const int src0= src[0 *srcStride];\
2188 const int src1= src[1 *srcStride];\
2189 const int src2= src[2 *srcStride];\
2190 const int src3= src[3 *srcStride];\
2191 const int src4= src[4 *srcStride];\
2192 const int src5= src[5 *srcStride];\
2193 const int src6= src[6 *srcStride];\
2194 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2195 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2196 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2197 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2203 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2206 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2208 src -= 2*srcStride;\
2209 for(i=0; i<h+5; i++)\
2211 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2212 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2213 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2214 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2218 tmp -= tmpStride*(h+5-2);\
2221 const int tmpB= tmp[-2*tmpStride];\
2222 const int tmpA= tmp[-1*tmpStride];\
2223 const int tmp0= tmp[0 *tmpStride];\
2224 const int tmp1= tmp[1 *tmpStride];\
2225 const int tmp2= tmp[2 *tmpStride];\
2226 const int tmp3= tmp[3 *tmpStride];\
2227 const int tmp4= tmp[4 *tmpStride];\
2228 const int tmp5= tmp[5 *tmpStride];\
2229 const int tmp6= tmp[6 *tmpStride];\
2230 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2231 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2232 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2233 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2239 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2241 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2245 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2246 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2247 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2248 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2249 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2250 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2251 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2252 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2258 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2260 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2264 const int srcB= src[-2*srcStride];\
2265 const int srcA= src[-1*srcStride];\
2266 const int src0= src[0 *srcStride];\
2267 const int src1= src[1 *srcStride];\
2268 const int src2= src[2 *srcStride];\
2269 const int src3= src[3 *srcStride];\
2270 const int src4= src[4 *srcStride];\
2271 const int src5= src[5 *srcStride];\
2272 const int src6= src[6 *srcStride];\
2273 const int src7= src[7 *srcStride];\
2274 const int src8= src[8 *srcStride];\
2275 const int src9= src[9 *srcStride];\
2276 const int src10=src[10*srcStride];\
2277 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2278 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2279 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2280 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2281 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2282 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2283 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2284 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2290 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2293 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2295 src -= 2*srcStride;\
2296 for(i=0; i<h+5; i++)\
2298 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2299 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2300 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2301 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2302 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2303 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2304 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2305 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2309 tmp -= tmpStride*(h+5-2);\
2312 const int tmpB= tmp[-2*tmpStride];\
2313 const int tmpA= tmp[-1*tmpStride];\
2314 const int tmp0= tmp[0 *tmpStride];\
2315 const int tmp1= tmp[1 *tmpStride];\
2316 const int tmp2= tmp[2 *tmpStride];\
2317 const int tmp3= tmp[3 *tmpStride];\
2318 const int tmp4= tmp[4 *tmpStride];\
2319 const int tmp5= tmp[5 *tmpStride];\
2320 const int tmp6= tmp[6 *tmpStride];\
2321 const int tmp7= tmp[7 *tmpStride];\
2322 const int tmp8= tmp[8 *tmpStride];\
2323 const int tmp9= tmp[9 *tmpStride];\
2324 const int tmp10=tmp[10*tmpStride];\
2325 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2326 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2327 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2328 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2329 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2330 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2331 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2332 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2338 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2339 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2340 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2341 src += 8*srcStride;\
2342 dst += 8*dstStride;\
2343 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2344 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2347 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2348 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2349 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2350 src += 8*srcStride;\
2351 dst += 8*dstStride;\
2352 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2353 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2356 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2357 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2358 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2359 src += 8*srcStride;\
2360 dst += 8*dstStride;\
2361 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2362 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/*
 * H264_MC(OPNAME, SIZE): generates the 16 quarter-pel motion-compensation
 * entry points (_mc00 .. _mc33, X/Y giving the quarter-pel phase) for
 * SIZE x SIZE blocks.  Each one combines the H264_LOWPASS primitives,
 * averaging the relevant half-pel planes with pixels ## SIZE ## _l2 where
 * the quarter position requires it; copy_block ## SIZE stages a padded
 * (SIZE+5)-row source for the vertical filters.
 */
2365 #define H264_MC(OPNAME, SIZE) \
2366 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2367 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2370 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2371 uint8_t half[SIZE*SIZE];\
2372 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2373 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2376 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2377 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2380 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2381 uint8_t half[SIZE*SIZE];\
2382 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2383 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2386 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2387 uint8_t full[SIZE*(SIZE+5)];\
2388 uint8_t * const full_mid= full + SIZE*2;\
2389 uint8_t half[SIZE*SIZE];\
2390 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2391 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2392 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2395 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2396 uint8_t full[SIZE*(SIZE+5)];\
2397 uint8_t * const full_mid= full + SIZE*2;\
2398 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2399 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2402 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2403 uint8_t full[SIZE*(SIZE+5)];\
2404 uint8_t * const full_mid= full + SIZE*2;\
2405 uint8_t half[SIZE*SIZE];\
2406 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2407 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2408 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2411 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2412 uint8_t full[SIZE*(SIZE+5)];\
2413 uint8_t * const full_mid= full + SIZE*2;\
2414 uint8_t halfH[SIZE*SIZE];\
2415 uint8_t halfV[SIZE*SIZE];\
2416 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2417 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2418 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2419 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2422 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2423 uint8_t full[SIZE*(SIZE+5)];\
2424 uint8_t * const full_mid= full + SIZE*2;\
2425 uint8_t halfH[SIZE*SIZE];\
2426 uint8_t halfV[SIZE*SIZE];\
2427 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2428 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2429 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2430 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2433 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2434 uint8_t full[SIZE*(SIZE+5)];\
2435 uint8_t * const full_mid= full + SIZE*2;\
2436 uint8_t halfH[SIZE*SIZE];\
2437 uint8_t halfV[SIZE*SIZE];\
2438 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2439 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2440 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2441 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2444 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2445 uint8_t full[SIZE*(SIZE+5)];\
2446 uint8_t * const full_mid= full + SIZE*2;\
2447 uint8_t halfH[SIZE*SIZE];\
2448 uint8_t halfV[SIZE*SIZE];\
2449 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2450 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2451 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2452 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2455 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2456 int16_t tmp[SIZE*(SIZE+5)];\
2457 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2460 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2461 int16_t tmp[SIZE*(SIZE+5)];\
2462 uint8_t halfH[SIZE*SIZE];\
2463 uint8_t halfHV[SIZE*SIZE];\
2464 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2465 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2466 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2469 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2470 int16_t tmp[SIZE*(SIZE+5)];\
2471 uint8_t halfH[SIZE*SIZE];\
2472 uint8_t halfHV[SIZE*SIZE];\
2473 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2474 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2475 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2478 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2479 uint8_t full[SIZE*(SIZE+5)];\
2480 uint8_t * const full_mid= full + SIZE*2;\
2481 int16_t tmp[SIZE*(SIZE+5)];\
2482 uint8_t halfV[SIZE*SIZE];\
2483 uint8_t halfHV[SIZE*SIZE];\
2484 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2485 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2486 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2487 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2490 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2491 uint8_t full[SIZE*(SIZE+5)];\
2492 uint8_t * const full_mid= full + SIZE*2;\
2493 int16_t tmp[SIZE*(SIZE+5)];\
2494 uint8_t halfV[SIZE*SIZE];\
2495 uint8_t halfHV[SIZE*SIZE];\
2496 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2497 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2498 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2499 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Clip/round operators for H264_LOWPASS: one-pass sums are scaled by 32
 * (+16 rounds to nearest), two-pass sums by 1024 (+512 rounds); cm[]
 * clamps the result to 0..255. */
2502 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2503 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2504 #define op_put(a, b) a = cm[((b) + 16)>>5]
2505 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2506 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put and avg variants of the H.264 lowpass filter set. */
2508 H264_LOWPASS(put_ , op_put, op2_put)
2509 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* H.264 weighted prediction helpers: op_scale1 applies a single weight and
 * offset to one sample of 'block'; op_scale2 bi-weights 'src' into 'dst'
 * with separate weights and a (log2_denom+1) shift.  Both clamp to 0..255
 * via av_clip_uint8(). */
2524 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2525 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/* H264_WEIGHT(W,H): generates weight/biweight functions for W x H blocks;
 * the W==2/4/8 'continue' guards make one body serve all widths. */
2526 #define H264_WEIGHT(W,H) \
2527 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2529 offset <<= log2_denom; \
2530 if(log2_denom) offset += 1<<(log2_denom-1); \
2531 for(y=0; y<H; y++, block += stride){ \
2534 if(W==2) continue; \
2537 if(W==4) continue; \
2542 if(W==8) continue; \
2553 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2555 offset = ((offset + 1) | 1) << log2_denom; \
2556 for(y=0; y<H; y++, dst += stride, src += stride){ \
2559 if(W==2) continue; \
2562 if(W==4) continue; \
2567 if(W==8) continue; \
/* WMV2 horizontal half-pel filter: 4-tap [-1 9 9 -1]/16 with +8 rounding,
 * producing h rows of 8 output pixels; cm[] clamps each result to 0..255. */
2594 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2595 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2599 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2600 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2601 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2602 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2603 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2604 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2605 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2606 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2612 #ifdef CONFIG_CAVS_DECODER
/* Forward declaration; the CAVS DSP initializer lives in the CAVS code. */
2614 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
/* Full-pel (mc00) CAVS motion-compensation wrappers: plain 8x8 / 16x16
 * copy or average via the generic pixel helpers. */
2616 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2617 put_pixels8_c(dst, src, stride, 8);
2619 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2620 avg_pixels8_c(dst, src, stride, 8);
2622 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2623 put_pixels16_c(dst, src, stride, 16);
2625 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2626 avg_pixels16_c(dst, src, stride, 16);
2628 #endif /* CONFIG_CAVS_DECODER */
2630 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* Forward declaration; the VC-1 DSP initializer lives in the VC-1 code. */
2632 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
/* Full-pel VC-1 MC wrapper: straight 8x8 copy.  'rnd' is unused here
 * (no sub-pel filtering, so rounding mode does not apply). */
2634 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2635 put_pixels8_c(dst, src, stride, 8);
2637 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
/* DSP initializers implemented in other translation units. */
2639 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2642 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
/* WMV2 vertical half-pel filter: 4-tap [-1 9 9 -1]/16 with +8 rounding.
 * Writes 8 output rows per column; 'w' is the number of columns processed.
 * cm[] clamps each result to 0..255. */
2644 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2645 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2649 const int src_1= src[ -srcStride];
2650 const int src0 = src[0 ];
2651 const int src1 = src[ srcStride];
2652 const int src2 = src[2*srcStride];
2653 const int src3 = src[3*srcStride];
2654 const int src4 = src[4*srcStride];
2655 const int src5 = src[5*srcStride];
2656 const int src6 = src[6*srcStride];
2657 const int src7 = src[7*srcStride];
2658 const int src8 = src[8*srcStride];
2659 const int src9 = src[9*srcStride];
2660 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2661 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2662 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2663 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2664 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2665 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2666 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2667 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 (mspel) 8x8 motion-compensation entry points.  The X/Y digits in
 * mcXY select the sub-pel position; the variants below are built from the
 * wmv2_mspel8_{h,v}_lowpass filters plus the generic pixel copy/average
 * helpers.  mc00 is the trivial full-pel copy. */
2673 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2674 put_pixels8_c(dst, src, stride, 8);
2677 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2679 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2680 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2683 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2684 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2687 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2689 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2690 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2693 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2694 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2697 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2701 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2702 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2703 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2704 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2706 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2710 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2711 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2712 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2713 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2715 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2717 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2718 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 in-loop deblocking across a horizontal block edge: for each column
 * x the two rows above (p0,p1) and below (p2,p3) the boundary are filtered.
 * The correction d1 is a ramp function of the gradient d, with strength
 * looked up from qscale in ff_h263_loop_filter_strength[]. */
2721 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2722 if(ENABLE_ANY_H263) {
2724 const int strength= ff_h263_loop_filter_strength[qscale];
2728 int p0= src[x-2*stride];
2729 int p1= src[x-1*stride];
2730 int p2= src[x+0*stride];
2731 int p3= src[x+1*stride];
2732 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2734 if (d<-2*strength) d1= 0;
2735 else if(d<- strength) d1=-2*strength - d;
2736 else if(d< strength) d1= d;
2737 else if(d< 2*strength) d1= 2*strength - d;
2742 if(p1&256) p1= ~(p1>>31);
2743 if(p2&256) p2= ~(p2>>31);
2745 src[x-1*stride] = p1;
2746 src[x+0*stride] = p2;
2750 d2= av_clip((p0-p3)/4, -ad1, ad1);
2752 src[x-2*stride] = p0 - d2;
2753 src[x+ stride] = p3 + d2;
/* H.263 in-loop deblocking across a vertical block edge: for each row y the
 * two columns left (p0,p1) and right (p2,p3) of the boundary are filtered.
 * Same ramp filter as the vertical-edge variant, transposed. */
2758 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2759 if(ENABLE_ANY_H263) {
2761 const int strength= ff_h263_loop_filter_strength[qscale];
2765 int p0= src[y*stride-2];
2766 int p1= src[y*stride-1];
2767 int p2= src[y*stride+0];
2768 int p3= src[y*stride+1];
2769 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2771 if (d<-2*strength) d1= 0;
2772 else if(d<- strength) d1=-2*strength - d;
2773 else if(d< strength) d1= d;
2774 else if(d< 2*strength) d1= 2*strength - d;
2779 if(p1&256) p1= ~(p1>>31);
2780 if(p2&256) p2= ~(p2>>31);
2782 src[y*stride-1] = p1;
2783 src[y*stride+0] = p2;
2787 d2= av_clip((p0-p3)/4, -ad1, ad1);
2789 src[y*stride-2] = p0 - d2;
2790 src[y*stride+1] = p3 + d2;
/* H.261 loop filter: separable [1 2 1] smoothing over an 8x8 block, done in
 * a scaled temp buffer (vertical pass first, then horizontal with a final
 * +8 >> 4).  Top/bottom rows and left/right columns receive only the 1-D
 * pass (the 4*src scaling keeps them at the same magnitude). */
2795 static void h261_loop_filter_c(uint8_t *src, int stride){
2800 temp[x ] = 4*src[x ];
2801 temp[x + 7*8] = 4*src[x + 7*stride];
2805 xy = y * stride + x;
2807 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2812 src[ y*stride] = (temp[ y*8] + 2)>>2;
2813 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2815 xy = y * stride + x;
2817 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 normal-strength luma deblocking for one 16-sample edge, split into
 * 4 groups of 4 samples each governed by tc0[i].  When the alpha (edge) and
 * beta (activity) gradient tests pass, p0/q0 are corrected by a delta
 * clipped to +/-tc, and p1/q1 are conditionally filtered when the secondary
 * beta test on p2/q2 passes.  xstride steps across the edge, ystride along
 * it (callers pass (stride,1) or (1,stride) to pick the orientation). */
2822 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2825 for( i = 0; i < 4; i++ ) {
2830 for( d = 0; d < 4; d++ ) {
2831 const int p0 = pix[-1*xstride];
2832 const int p1 = pix[-2*xstride];
2833 const int p2 = pix[-3*xstride];
2834 const int q0 = pix[0];
2835 const int q1 = pix[1*xstride];
2836 const int q2 = pix[2*xstride];
2838 if( FFABS( p0 - q0 ) < alpha &&
2839 FFABS( p1 - p0 ) < beta &&
2840 FFABS( q1 - q0 ) < beta ) {
2845 if( FFABS( p2 - p0 ) < beta ) {
2846 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2849 if( FFABS( q2 - q0 ) < beta ) {
2850 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2854 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2855 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2856 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
/* Orientation wrappers: vertical filtering steps across the edge with
 * 'stride', horizontal with 1. */
2862 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2864 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2866 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2868 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 normal-strength chroma deblocking: 4 groups of 2 samples, one tc0
 * entry per group.  Only p0/q0 are modified, with the delta clipped to
 * +/-tc after the alpha/beta gradient tests pass. */
2871 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2874 for( i = 0; i < 4; i++ ) {
2875 const int tc = tc0[i];
2880 for( d = 0; d < 2; d++ ) {
2881 const int p0 = pix[-1*xstride];
2882 const int p1 = pix[-2*xstride];
2883 const int q0 = pix[0];
2884 const int q1 = pix[1*xstride];
2886 if( FFABS( p0 - q0 ) < alpha &&
2887 FFABS( p1 - p0 ) < beta &&
2888 FFABS( q1 - q0 ) < beta ) {
2890 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2892 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2893 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
/* Orientation wrappers for the chroma deblocking filter. */
2899 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2901 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2903 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2905 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 strong (intra-edge) chroma deblocking: for each of the 8 samples
 * along the edge, when the alpha/beta gradient tests pass, p0 and q0 are
 * replaced by a fixed (2*p1 + p0 + q1 + 2)>>2 smoothing — no tc clipping. */
2908 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2911 for( d = 0; d < 8; d++ ) {
2912 const int p0 = pix[-1*xstride];
2913 const int p1 = pix[-2*xstride];
2914 const int q0 = pix[0];
2915 const int q1 = pix[1*xstride];
2917 if( FFABS( p0 - q0 ) < alpha &&
2918 FFABS( p1 - p0 ) < beta &&
2919 FFABS( q1 - q0 ) < beta ) {
2921 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2922 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
/* Orientation wrappers for the strong chroma deblocking filter. */
2927 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2929 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2931 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2933 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* Sum of absolute differences (SAD) over a 16-wide block, accumulated over
 * h rows of pix1/pix2 advancing by line_size.  'v' is an opaque context
 * pointer required by the me_cmp function signature; it is not read here. */
2936 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2942 s += abs(pix1[0] - pix2[0]);
2943 s += abs(pix1[1] - pix2[1]);
2944 s += abs(pix1[2] - pix2[2]);
2945 s += abs(pix1[3] - pix2[3]);
2946 s += abs(pix1[4] - pix2[4]);
2947 s += abs(pix1[5] - pix2[5]);
2948 s += abs(pix1[6] - pix2[6]);
2949 s += abs(pix1[7] - pix2[7]);
2950 s += abs(pix1[8] - pix2[8]);
2951 s += abs(pix1[9] - pix2[9]);
2952 s += abs(pix1[10] - pix2[10]);
2953 s += abs(pix1[11] - pix2[11]);
2954 s += abs(pix1[12] - pix2[12]);
2955 s += abs(pix1[13] - pix2[13]);
2956 s += abs(pix1[14] - pix2[14]);
2957 s += abs(pix1[15] - pix2[15]);
/* 16-wide SAD against the horizontally half-pel interpolated reference:
 * each pix2 sample is avg2() of two horizontally adjacent pixels (reads one
 * pixel past column 15). */
2964 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2970 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2971 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2972 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2973 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2974 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2975 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2976 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2977 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2978 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2979 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2980 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2981 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2982 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2983 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2984 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2985 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* 16-wide SAD against the vertically half-pel interpolated reference:
 * each sample is avg2() of the same column in two consecutive rows
 * (pix2 and pix3 = pix2 + line_size). */
2992 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2995 uint8_t *pix3 = pix2 + line_size;
2999 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3000 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3001 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3002 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3003 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3004 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3005 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3006 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3007 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3008 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3009 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3010 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3011 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3012 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3013 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3014 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of a 16-pixel-wide block of pix1 against the diagonal half-pel
 * interpolation of pix2 (avg4 of the 2x2 neighbourhood spanning the
 * current and next row). Reads one extra column (index 16). */
3022 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3025     uint8_t *pix3 = pix2 + line_size;
3029     s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3030     s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3031     s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3032     s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3033     s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3034     s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3035     s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3036     s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3037     s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3038     s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3039     s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3040     s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3041     s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3042     s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3043     s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3044     s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* Plain SAD of an 8-pixel-wide block: sum of |pix1[i] - pix2[i]|.
 * 8-wide counterpart of pix_abs16_c. */
3052 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3058     s += abs(pix1[0] - pix2[0]);
3059     s += abs(pix1[1] - pix2[1]);
3060     s += abs(pix1[2] - pix2[2]);
3061     s += abs(pix1[3] - pix2[3]);
3062     s += abs(pix1[4] - pix2[4]);
3063     s += abs(pix1[5] - pix2[5]);
3064     s += abs(pix1[6] - pix2[6]);
3065     s += abs(pix1[7] - pix2[7]);
/* SAD of an 8-pixel-wide block against the horizontal half-pel
 * interpolation of pix2; reads pix2[8] (one past the block). */
3072 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3078     s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3079     s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3080     s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3081     s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3082     s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3083     s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3084     s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3085     s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* SAD of an 8-pixel-wide block against the vertical half-pel
 * interpolation of pix2 (pix3 points at the next row). */
3092 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3095     uint8_t *pix3 = pix2 + line_size;
3099     s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3100     s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3101     s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3102     s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3103     s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3104     s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3105     s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3106     s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* SAD of an 8-pixel-wide block against the diagonal half-pel
 * interpolation of pix2 (avg4 over the 2x2 neighbourhood). */
3114 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3117     uint8_t *pix3 = pix2 + line_size;
3121     s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3122     s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3123     s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3124     s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3125     s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3126     s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3127     s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3128     s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE for a 16-wide block:
 * score1 = plain SSE; score2 = difference of second-order gradients
 * (how much the local texture differs between s1 and s2).
 * The final value weights score2 by avctx->nsse_weight, defaulting to 8
 * when no context is supplied. */
3136 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3137     MpegEncContext *c = v;
3143     for(x=0; x<16; x++){
3144         score1+= (s1[x  ] - s2[x ])*(s1[x ] - s2[x ]);
3147     for(x=0; x<15; x++){
3148         score2+= FFABS(  s1[x  ] - s1[x  +stride]
3149                         - s1[x+1] + s1[x+1+stride])
3150                 -FFABS(  s2[x  ] - s2[x  +stride]
3151                         - s2[x+1] + s2[x+1+stride]);
3158     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3159     else  return score1 + FFABS(score2)*8;
/* 8-wide counterpart of nsse16_c: SSE plus a weighted penalty for
 * mismatching second-order gradients (noise/texture preservation). */
3162 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3163     MpegEncContext *c = v;
3170         score1+= (s1[x  ] - s2[x ])*(s1[x ] - s2[x ]);
3174         score2+= FFABS(  s1[x  ] - s1[x  +stride]
3175                         - s1[x+1] + s1[x+1+stride])
3176                 -FFABS(  s2[x  ] - s2[x  +stride]
3177                         - s2[x+1] + s2[x+1+stride]);
3184     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3185     else  return score1 + FFABS(score2)*8;
/* Returns the weighted squared error that would remain if 'basis'
 * scaled by 'scale' were added to the residual 'rem' (trellis-style
 * rate-distortion search helper). BASIS_SHIFT/RECON_SHIFT rescale
 * the basis to residual precision with rounding. */
3188 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3192     for(i=0; i<8*8; i++){
3193         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3196         assert(-512<b && b<512);
3198         sum += (w*b)*(w*b)>>4;
/* Commits the change evaluated by try_8x8basis_c: adds the scaled,
 * rounded basis vector into the residual block in place. */
3203 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3206     for(i=0; i<8*8; i++){
3207         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3212 * permutes an 8x8 block.
3213 * @param block the block which will be permuted according to the given permutation vector
3214 * @param permutation the permutation vector
3215 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3216  * @param scantable the scantable in use; this is only used to speed the permutation up — the block is not
3217  * (inverse) permuted to scantable order!
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
/* Copies the nonzero coefficients (indices 0..last in scantable order)
 * aside, then writes each back at its permuted position. Two passes are
 * needed because permutation[] may map positions onto each other. */
3225 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3227     for(i=0; i<=last; i++){
3228         const int j= scantable[i];
3233     for(i=0; i<=last; i++){
3234         const int j= scantable[i];
3235         const int perm_j= permutation[j];
3236         block[perm_j]= temp[j];
/* Trivial me_cmp_func that always scores 0; used when comparison is disabled. */
3240 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fills cmp[0..4] with the comparison functions selected by 'type'
 * (FF_CMP_* value) from the tables in the DSPContext. Logs an error
 * if the type does not match any known comparison. */
3244 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3247     memset(cmp, 0, sizeof(void*)*5);
3255             cmp[i]= c->hadamard8_diff[i];
3261             cmp[i]= c->dct_sad[i];
3264             cmp[i]= c->dct264_sad[i];
3267             cmp[i]= c->dct_max[i];
3270             cmp[i]= c->quant_psnr[i];
3290 #ifdef CONFIG_SNOW_ENCODER
3299             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3305 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* Zeroes all six 64-coefficient DCT blocks of a macroblock. */
3307 static void clear_blocks_c(DCTELEM *blocks)
3309     memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for w bytes, processed sizeof(long) bytes at a time.
 * The pb_7f/pb_80 bit trick performs per-byte addition inside a long
 * without carry propagation between bytes; the trailing loop handles
 * the remainder one byte at a time. */
3312 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3314     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3315         long a = *(long*)(src+i);
3316         long b = *(long*)(dst+i);
3317         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3320         dst[i+0] += src[i+0];
/* dst[i] = src1[i] + src2[i] for w bytes; same carryless per-byte
 * long-word addition trick as add_bytes_c, with a scalar tail loop. */
3323 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3325     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3326         long a = *(long*)(src1+i);
3327         long b = *(long*)(src2+i);
3328         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3331         dst[i] = src1[i]+src2[i];
/* dst[i] = src1[i] - src2[i] for w bytes. On targets without fast
 * unaligned access, a misaligned src2 falls back to a byte-wise
 * unrolled loop; otherwise the main loop does carryless per-byte
 * subtraction packed into longs (pb_7f/pb_80 trick), with a scalar
 * tail for the remainder. */
3334 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3336 #ifndef HAVE_FAST_UNALIGNED
3337     if((long)src2 & (sizeof(long)-1)){
3338         for(i=0; i+7<w; i+=8){
3339             dst[i+0] = src1[i+0]-src2[i+0];
3340             dst[i+1] = src1[i+1]-src2[i+1];
3341             dst[i+2] = src1[i+2]-src2[i+2];
3342             dst[i+3] = src1[i+3]-src2[i+3];
3343             dst[i+4] = src1[i+4]-src2[i+4];
3344             dst[i+5] = src1[i+5]-src2[i+5];
3345             dst[i+6] = src1[i+6]-src2[i+6];
3346             dst[i+7] = src1[i+7]-src2[i+7];
3350     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3351         long a = *(long*)(src1+i);
3352         long b = *(long*)(src2+i);
3353         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3356         dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction: subtracts mid_pred(left, above,
 * left+above-above_left) from each sample; *left / *left_top carry the
 * running predictor state across calls. */
3359 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3367         const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Hadamard building blocks:
 * BUTTERFLY2 writes sum/difference of two inputs to two outputs,
 * BUTTERFLY1 does the same in place on x and y,
 * BUTTERFLYA returns |x+y| + |x-y| (final accumulation stage). */
3377 #define BUTTERFLY2(o1,o2,i1,i2) \
3381 #define BUTTERFLY1(x,y) \
3390 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of the 8x8 difference src-dst: an 8x8 Hadamard transform is
 * applied to the residual (rows first, then columns) and the sum of
 * absolute transformed coefficients is returned. */
3392 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3400         //FIXME try pointer walks
        /* Row pass: butterflies over the residual of one row. */
3401         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3402         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3403         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3404         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3406         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3407         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3408         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3409         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3411         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3412         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3413         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3414         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
        /* Column pass: butterflies down each of the 8 columns. */
3418         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3419         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3420         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3421         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3423         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3424         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3425         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3426         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
        /* Final stage folds |a+b|+|a-b| directly into the sum. */
3429             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3430             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3431             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3432             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3438     printf("MAX:%d\n", maxi);
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but applied to
 * the source pixels directly (no reference), with the DC term
 * (temp[0]+temp[32], i.e. 8x the block mean) subtracted at the end. */
3444 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3452         //FIXME try pointer walks
        /* Row pass on raw source pixels. */
3453         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3454         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3455         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3456         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3458         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3459         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3460         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3461         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3463         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3464         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3465         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3466         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
        /* Column pass. */
3470         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3471         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3472         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3473         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3475         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3476         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3477         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3478         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3481             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3482             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3483             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3484             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3487     sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-transforms the 8x8 residual src1-src2 and
 * returns the sum of absolute DCT coefficients. */
3492 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3493     MpegEncContext * const s= (MpegEncContext *)c;
3494     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3495     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3499     s->dsp.diff_pixels(temp, src1, src2, stride);
3501     return s->dsp.sum_abs_dctelem(temp);
    /* One 8-point H.264-style integer DCT pass (body of the DCT8_1D
     * macro). s* = symmetric sums, d* = antisymmetric differences;
     * a0..a7 are the intermediate butterfly values, and the >>1 / >>2
     * terms implement the spec's fixed-point scaling. Reads via SRC()
     * and writes via DST(), so the same code serves rows and columns. */
3506     const int s07 = SRC(0) + SRC(7);\
3507     const int s16 = SRC(1) + SRC(6);\
3508     const int s25 = SRC(2) + SRC(5);\
3509     const int s34 = SRC(3) + SRC(4);\
3510     const int a0 = s07 + s34;\
3511     const int a1 = s16 + s25;\
3512     const int a2 = s07 - s34;\
3513     const int a3 = s16 - s25;\
3514     const int d07 = SRC(0) - SRC(7);\
3515     const int d16 = SRC(1) - SRC(6);\
3516     const int d25 = SRC(2) - SRC(5);\
3517     const int d34 = SRC(3) - SRC(4);\
3518     const int a4 = d16 + d25 + (d07 + (d07>>1));\
3519     const int a5 = d07 - d34 - (d25 + (d25>>1));\
3520     const int a6 = d07 + d34 - (d16 + (d16>>1));\
3521     const int a7 = d16 - d25 + (d34 + (d34>>1));\
3523     DST(1,  a4 + (a7>>2)) ;\
3524     DST(2,  a2 + (a3>>1)) ;\
3525     DST(3,  a5 + (a6>>2)) ;\
3527     DST(5,  a6 - (a5>>2)) ;\
3528     DST(6, (a2>>1) - a3 ) ;\
3529     DST(7, (a4>>2) - a7 ) ;\
/* SATD using the H.264 8x8 integer DCT: transforms the residual rows
 * then columns with DCT8_1D (the column pass accumulates FFABS of each
 * output directly via the DST macro) and returns the sum. */
3532 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3533     MpegEncContext * const s= (MpegEncContext *)c;
3538     s->dsp.diff_pixels(dct[0], src1, src2, stride);
3540 #define SRC(x) dct[i][x]
3541 #define DST(x,v) dct[i][x]= v
3542     for( i = 0; i < 8; i++ )
3547 #define SRC(x) dct[x][i]
3548 #define DST(x,v) sum += FFABS(v)
3549     for( i = 0; i < 8; i++ )
/* DCT-domain maximum: forward-transforms the 8x8 residual and returns
 * the largest absolute DCT coefficient (not the sum). */
3557 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3558     MpegEncContext * const s= (MpegEncContext *)c;
3559     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3560     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3565     s->dsp.diff_pixels(temp, src1, src2, stride);
3569         sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-PSNR metric: DCTs the residual, keeps a copy in 'bak',
 * quantizes + dequantizes + inverse-transforms it, and returns the
 * squared error between the round-tripped and original coefficients —
 * i.e. the distortion introduced by quantization at s->qscale. */
3574 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3575     MpegEncContext * const s= (MpegEncContext *)c;
3576     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3577     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3578     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3584     s->dsp.diff_pixels(temp, src1, src2, stride);
3586     memcpy(bak, temp, 64*sizeof(DCTELEM));
3588     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3589     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3590     ff_simple_idct(temp); //FIXME
3593         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion cost of coding the 8x8 residual at s->qscale:
 * quantizes the DCT'd residual, counts the VLC bits needed (with
 * escape costs for out-of-table levels), reconstructs into a backup
 * copy of src2, measures SSE against src1, and combines both as
 * distortion + lambda*bits. NOTE(review): 'distoration' is a long-
 * standing typo for 'distortion' (local variable only). */
3598 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3599     MpegEncContext * const s= (MpegEncContext *)c;
3600     const uint8_t *scantable= s->intra_scantable.permutated;
3601     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3602     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3603     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3604     uint8_t * const bak= (uint8_t*)aligned_bak;
3605     int i, last, run, bits, level, distoration, start_i;
3606     const int esc_length= s->ac_esc_length;
3608     uint8_t * last_length;
    /* Save the prediction so it can be reconstructed into non-destructively. */
3613         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3614         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3617     s->dsp.diff_pixels(temp, src1, src2, stride);
3619     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    /* Pick intra or inter VLC tables; intra also pays for the luma DC coefficient. */
3625         length     = s->intra_ac_vlc_length;
3626         last_length= s->intra_ac_vlc_last_length;
3627         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3630         length     = s->inter_ac_vlc_length;
3631         last_length= s->inter_ac_vlc_last_length;
    /* Count bits for all but the last coefficient; |level| > 127 costs an escape. */
3636         for(i=start_i; i<last; i++){
3637             int j= scantable[i];
3642                 if((level&(~127)) == 0){
3643                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
    /* The last coefficient uses the 'last' VLC table. */
3652         level= temp[i] + 64;
3656         if((level&(~127)) == 0){
3657             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
    /* Reconstruct and measure the actual pixel-domain error. */
3665             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3667             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3670     s->dsp.idct_add(bak, stride, temp);
3672     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3674     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Rate-only cost: quantizes the DCT'd 8x8 residual and returns just
 * the VLC bit count (same bit-counting logic as rd8x8_c but without
 * reconstruction or distortion measurement). */
3677 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3678     MpegEncContext * const s= (MpegEncContext *)c;
3679     const uint8_t *scantable= s->intra_scantable.permutated;
3680     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3681     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3682     int i, last, run, bits, level, start_i;
3683     const int esc_length= s->ac_esc_length;
3685     uint8_t * last_length;
3689     s->dsp.diff_pixels(temp, src1, src2, stride);
3691     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3697         length     = s->intra_ac_vlc_length;
3698         last_length= s->intra_ac_vlc_last_length;
3699         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3702         length     = s->inter_ac_vlc_length;
3703         last_length= s->inter_ac_vlc_last_length;
    /* Count bits for all but the last coefficient; escapes for |level| > 127. */
3708         for(i=start_i; i<last; i++){
3709             int j= scantable[i];
3714                 if((level&(~127)) == 0){
3715                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3724         level= temp[i] + 64;
3728         if((level&(~127)) == 0){
3729             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Intra vertical SAD: sums |s[x] - s[x+stride]| over a 16-wide block,
 * measuring vertical activity of the source itself (no reference). */
3737 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3742         for(x=0; x<16; x+=4){
3743             score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])
3744                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
/* Vertical SAD of the residual: sums the absolute vertical gradient
 * difference between s1 and s2 over a 16-wide block. */
3752 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3757         for(x=0; x<16; x++){
3758             score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
/* SQ: square helper for the vsse metrics below. */
3767 #define SQ(a) ((a)*(a))
/* Intra vertical SSE: like vsad_intra16_c but with squared vertical
 * differences instead of absolute values. */
3768 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3773         for(x=0; x<16; x+=4){
3774             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
3775                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* Vertical SSE of the residual: squared version of vsad16_c. */
3783 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3788         for(x=0; x<16; x++){
3789             score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 array and an int16 array
 * of 'size' elements (used by the FLAC/LPC code). */
3798 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3802     for(i=0; i<size; i++)
3803         score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Generate 16x16 comparison functions from each 8x8 kernel: the
 * WRAPPER8_16_SQ macro calls the 8x8 function on the four quadrants
 * and sums the results. */
3807 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3808 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3809 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3811 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3813 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3814 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3815 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3816 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Element-wise in-place float multiply: dst[i] *= src[i] for len elements. */
3818 static void vector_fmul_c(float *dst, const float *src, int len){
3820     for(i=0; i<len; i++)
/* dst[i] = src0[i] * src1 read backwards (src1 is advanced to its last
 * element before the loop, then indexed with a negative offset). */
3824 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3827     for(i=0; i<len; i++)
3828         dst[i] = src0[i] * src1[-i];
/* Multiply-accumulate with a constant bias: dst[i*step] =
 * src0[i]*src1[i] + src2[i] + src3 (src3 is a scalar despite the name). */
3831 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3833     for(i=0; i<len; i++)
3834         dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
/* Converts floats (expected pre-scaled into roughly [0x8000-biased]
 * range) to int16 by reinterpreting the IEEE-754 bits as an int32 and
 * clipping via the 0x43c0ffff threshold trick; the final subtraction
 * of 0x8000 removes the exponent bias. Relies on 32-bit IEEE floats. */
3837 void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
3839     for(i=0; i<len; i++) {
3840         int_fast32_t tmp = ((const int32_t*)src)[i];
3842             tmp = (0x43c0ffff - tmp)>>31;
3843             // is this faster on some gcc/cpu combinations?
3844             // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3847         dst[i] = tmp - 0x8000;
/* Fixed-point IDCT cosine constants, 2048*sqrt(2)*cos(k*pi/16),
 * used by the WMV2 inverse transform below. */
3852 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3853 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3854 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3855 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3856 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3857 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3858 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
/* One row of the WMV2 8-point inverse DCT, in place on b[0..7].
 * a0..a7 are the even/odd butterfly terms; s1/s2 fold the odd part
 * through the 181/256 ≈ 1/sqrt(2) rotation; results are rounded
 * (+1<<7) and scaled down by 8 bits. */
3860 static void wmv2_idct_row(short * b)
3863     int a0,a1,a2,a3,a4,a5,a6,a7;
3865     a1 = W1*b[1]+W7*b[7];
3866     a7 = W7*b[1]-W1*b[7];
3867     a5 = W5*b[5]+W3*b[3];
3868     a3 = W3*b[5]-W5*b[3];
3869     a2 = W2*b[2]+W6*b[6];
3870     a6 = W6*b[2]-W2*b[6];
3871     a0 = W0*b[0]+W0*b[4];
3872     a4 = W0*b[0]-W0*b[4];
3874     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3875     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3877     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3878     b[1] = (a4+a6 +s1   + (1<<7))>>8;
3879     b[2] = (a4-a6 +s2   + (1<<7))>>8;
3880     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3881     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3882     b[5] = (a4-a6 -s2   + (1<<7))>>8;
3883     b[6] = (a4+a6 -s1   + (1<<7))>>8;
3884     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One column of the WMV2 inverse DCT, in place on b[0], b[8], ...,
 * b[56]. Same butterfly structure as the row pass, but the inputs are
 * pre-rounded and pre-shifted (>>3) for extended precision, and the
 * outputs carry the full final descale (+1<<13, >>14). */
3886 static void wmv2_idct_col(short * b)
3889     int a0,a1,a2,a3,a4,a5,a6,a7;
3890     /*step 1, with extended precision*/
3891     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3892     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3893     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3894     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3895     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3896     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3897     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
3898     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
3900     s1 = (181*(a1-a5+a7-a3)+128)>>8;
3901     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3903     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3904     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
3905     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
3906     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3908     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3909     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
3910     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
3911     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 8x8 WMV2 inverse DCT: row pass over all 8 rows, then column
 * pass over all 8 columns, in place on 'block'. */
3913 void ff_wmv2_idct_c(short * block){
3917         wmv2_idct_row(block+i);
3920         wmv2_idct_col(block+i);
3923 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* WMV2 IDCT then store: transforms 'block' and writes the clamped
 * result to dest (overwrite). */
3925 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
3927     ff_wmv2_idct_c(block);
3928     put_pixels_clamped_c(block, dest, line_size);
/* WMV2 IDCT then add: transforms 'block' and adds the clamped result
 * onto the existing contents of dest. */
3930 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
3932     ff_wmv2_idct_c(block);
3933     add_pixels_clamped_c(block, dest, line_size);
/* JPEG-reference IDCT + clamped store (the j_rev_dct call precedes the
 * put; not shown in this excerpt — confirm against full source). */
3935 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3938     put_pixels_clamped_c(block, dest, line_size);
/* JPEG-reference IDCT + clamped add onto dest. */
3940 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3943     add_pixels_clamped_c(block, dest, line_size);
/* lowres=1 (half-size) jref IDCT wrapper: 4x4 clamped store. */
3946 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3949     put_pixels_clamped4_c(block, dest, line_size);
/* lowres=1 jref IDCT wrapper: 4x4 clamped add. */
3951 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3954     add_pixels_clamped4_c(block, dest, line_size);
/* lowres=2 (quarter-size) jref IDCT wrapper: 2x2 clamped store. */
3957 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3960     put_pixels_clamped2_c(block, dest, line_size);
/* lowres=2 jref IDCT wrapper: 2x2 clamped add. */
3962 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3965     add_pixels_clamped2_c(block, dest, line_size);
/* lowres=3: the "IDCT" degenerates to the DC term only; store the
 * rounded, clamped DC value as the single output pixel. */
3968 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3970     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3972     dest[0] = cm[(block[0] + 4)>>3];
/* lowres=3 add variant: adds the rounded DC value onto the existing
 * pixel, clamped via the crop table. */
3974 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3976     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3978     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
/* No-op placeholder with the standard (mem, stride, h) signature. */
3981 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
3983 /* init static data */
/* One-time initialization of shared lookup tables:
 * ff_cropTbl clamps to [0,255] with MAX_NEG_CROP headroom on both
 * sides, ff_squareTbl holds (i-256)^2 for signed-difference squaring,
 * and inv_zigzag_direct16 is the 1-based inverse of ff_zigzag_direct. */
3984 void dsputil_static_init(void)
3988     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3989     for(i=0;i<MAX_NEG_CROP;i++) {
3991         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3994     for(i=0;i<512;i++) {
3995         ff_squareTbl[i] = (i - 256) * (i - 256);
3998     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Verifies the compiler actually honors 16-byte stack alignment by
 * checking the address of a DECLARE_ALIGNED_16 local. On MMX/AltiVec
 * builds a failure is reported once (did_fail guards repeat logging),
 * since misaligned stacks crash the SIMD code paths. */
4001 int ff_check_alignment(void){
4002     static int did_fail=0;
4003     DECLARE_ALIGNED_16(int, aligned);
4005     if((long)&aligned & 15){
4007 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
4008             av_log(NULL, AV_LOG_ERROR,
4009                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4010                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4011                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4012                 "Do not report crashes to FFmpeg developers.\n");
4021 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4025 ff_check_alignment();
4027 #ifdef CONFIG_ENCODERS
4028 if(avctx->dct_algo==FF_DCT_FASTINT) {
4029 c->fdct = fdct_ifast;
4030 c->fdct248 = fdct_ifast248;
4032 else if(avctx->dct_algo==FF_DCT_FAAN) {
4033 c->fdct = ff_faandct;
4034 c->fdct248 = ff_faandct248;
4037 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4038 c->fdct248 = ff_fdct248_islow;
4040 #endif //CONFIG_ENCODERS
4042 if(avctx->lowres==1){
4043 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
4044 c->idct_put= ff_jref_idct4_put;
4045 c->idct_add= ff_jref_idct4_add;
4047 c->idct_put= ff_h264_lowres_idct_put_c;
4048 c->idct_add= ff_h264_lowres_idct_add_c;
4050 c->idct = j_rev_dct4;
4051 c->idct_permutation_type= FF_NO_IDCT_PERM;
4052 }else if(avctx->lowres==2){
4053 c->idct_put= ff_jref_idct2_put;
4054 c->idct_add= ff_jref_idct2_add;
4055 c->idct = j_rev_dct2;
4056 c->idct_permutation_type= FF_NO_IDCT_PERM;
4057 }else if(avctx->lowres==3){
4058 c->idct_put= ff_jref_idct1_put;
4059 c->idct_add= ff_jref_idct1_add;
4060 c->idct = j_rev_dct1;
4061 c->idct_permutation_type= FF_NO_IDCT_PERM;
4063 if(avctx->idct_algo==FF_IDCT_INT){
4064 c->idct_put= ff_jref_idct_put;
4065 c->idct_add= ff_jref_idct_add;
4066 c->idct = j_rev_dct;
4067 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4068 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4069 avctx->idct_algo==FF_IDCT_VP3){
4070 c->idct_put= ff_vp3_idct_put_c;
4071 c->idct_add= ff_vp3_idct_add_c;
4072 c->idct = ff_vp3_idct_c;
4073 c->idct_permutation_type= FF_NO_IDCT_PERM;
4074 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4075 c->idct_put= ff_wmv2_idct_put_c;
4076 c->idct_add= ff_wmv2_idct_add_c;
4077 c->idct = ff_wmv2_idct_c;
4078 c->idct_permutation_type= FF_NO_IDCT_PERM;
4079 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4080 c->idct_put= ff_faanidct_put;
4081 c->idct_add= ff_faanidct_add;
4082 c->idct = ff_faanidct;
4083 c->idct_permutation_type= FF_NO_IDCT_PERM;
4084 }else{ //accurate/default
4085 c->idct_put= ff_simple_idct_put;
4086 c->idct_add= ff_simple_idct_add;
4087 c->idct = ff_simple_idct;
4088 c->idct_permutation_type= FF_NO_IDCT_PERM;
4092 if (ENABLE_H264_DECODER) {
4093 c->h264_idct_add= ff_h264_idct_add_c;
4094 c->h264_idct8_add= ff_h264_idct8_add_c;
4095 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4096 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4099 c->get_pixels = get_pixels_c;
4100 c->diff_pixels = diff_pixels_c;
4101 c->put_pixels_clamped = put_pixels_clamped_c;
4102 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4103 c->add_pixels_clamped = add_pixels_clamped_c;
4104 c->add_pixels8 = add_pixels8_c;
4105 c->add_pixels4 = add_pixels4_c;
4106 c->sum_abs_dctelem = sum_abs_dctelem_c;
4109 c->clear_blocks = clear_blocks_c;
4110 c->pix_sum = pix_sum_c;
4111 c->pix_norm1 = pix_norm1_c;
4113 /* TODO [0] 16 [1] 8 */
4114 c->pix_abs[0][0] = pix_abs16_c;
4115 c->pix_abs[0][1] = pix_abs16_x2_c;
4116 c->pix_abs[0][2] = pix_abs16_y2_c;
4117 c->pix_abs[0][3] = pix_abs16_xy2_c;
4118 c->pix_abs[1][0] = pix_abs8_c;
4119 c->pix_abs[1][1] = pix_abs8_x2_c;
4120 c->pix_abs[1][2] = pix_abs8_y2_c;
4121 c->pix_abs[1][3] = pix_abs8_xy2_c;
4123 #define dspfunc(PFX, IDX, NUM) \
4124 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4125 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4126 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4127 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4129 dspfunc(put, 0, 16);
4130 dspfunc(put_no_rnd, 0, 16);
4132 dspfunc(put_no_rnd, 1, 8);
4136 dspfunc(avg, 0, 16);
4137 dspfunc(avg_no_rnd, 0, 16);
4139 dspfunc(avg_no_rnd, 1, 8);
4144 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4145 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4147 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4148 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4149 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4150 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4151 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4152 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4153 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4154 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4155 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4157 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4158 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4159 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4160 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4161 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4162 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4163 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4164 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4165 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4167 #define dspfunc(PFX, IDX, NUM) \
4168 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4169 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4170 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4171 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4172 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4173 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4174 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4175 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4176 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4177 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4178 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4179 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4180 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4181 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4182 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4183 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4185 dspfunc(put_qpel, 0, 16);
4186 dspfunc(put_no_rnd_qpel, 0, 16);
4188 dspfunc(avg_qpel, 0, 16);
4189 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4191 dspfunc(put_qpel, 1, 8);
4192 dspfunc(put_no_rnd_qpel, 1, 8);
4194 dspfunc(avg_qpel, 1, 8);
4195 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4197 dspfunc(put_h264_qpel, 0, 16);
4198 dspfunc(put_h264_qpel, 1, 8);
4199 dspfunc(put_h264_qpel, 2, 4);
4200 dspfunc(put_h264_qpel, 3, 2);
4201 dspfunc(avg_h264_qpel, 0, 16);
4202 dspfunc(avg_h264_qpel, 1, 8);
4203 dspfunc(avg_h264_qpel, 2, 4);
4206 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4207 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4208 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4209 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4210 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4211 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4212 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4214 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4215 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4216 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4217 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4218 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4219 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4220 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4221 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4222 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4223 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4224 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4225 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4226 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4227 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4228 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4229 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4230 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4231 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4232 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4233 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4235 c->draw_edges = draw_edges_c;
4237 #ifdef CONFIG_CAVS_DECODER
4238 ff_cavsdsp_init(c,avctx);
4240 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4241 ff_vc1dsp_init(c,avctx);
4243 #if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4244 ff_intrax8dsp_init(c,avctx);
4246 #if defined(CONFIG_H264_ENCODER)
4247 ff_h264dspenc_init(c,avctx);
4250 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4251 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4252 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4253 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4254 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4255 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4256 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4257 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4259 #define SET_CMP_FUNC(name) \
4260 c->name[0]= name ## 16_c;\
4261 c->name[1]= name ## 8x8_c;
4263 SET_CMP_FUNC(hadamard8_diff)
4264 c->hadamard8_diff[4]= hadamard8_intra16_c;
4265 SET_CMP_FUNC(dct_sad)
4266 SET_CMP_FUNC(dct_max)
4268 SET_CMP_FUNC(dct264_sad)
4270 c->sad[0]= pix_abs16_c;
4271 c->sad[1]= pix_abs8_c;
4275 SET_CMP_FUNC(quant_psnr)
4278 c->vsad[0]= vsad16_c;
4279 c->vsad[4]= vsad_intra16_c;
4280 c->vsse[0]= vsse16_c;
4281 c->vsse[4]= vsse_intra16_c;
4282 c->nsse[0]= nsse16_c;
4283 c->nsse[1]= nsse8_c;
4284 #ifdef CONFIG_SNOW_ENCODER
4285 c->w53[0]= w53_16_c;
4287 c->w97[0]= w97_16_c;
4291 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4293 c->add_bytes= add_bytes_c;
4294 c->add_bytes_l2= add_bytes_l2_c;
4295 c->diff_bytes= diff_bytes_c;
4296 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4297 c->bswap_buf= bswap_buf;
4298 #ifdef CONFIG_PNG_DECODER
4299 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4302 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4303 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4304 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4305 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4306 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4307 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4308 c->h264_loop_filter_strength= NULL;
4310 if (ENABLE_ANY_H263) {
4311 c->h263_h_loop_filter= h263_h_loop_filter_c;
4312 c->h263_v_loop_filter= h263_v_loop_filter_c;
4315 c->h261_loop_filter= h261_loop_filter_c;
4317 c->try_8x8basis= try_8x8basis_c;
4318 c->add_8x8basis= add_8x8basis_c;
4320 #ifdef CONFIG_SNOW_DECODER
4321 c->vertical_compose97i = ff_snow_vertical_compose97i;
4322 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4323 c->inner_add_yblock = ff_snow_inner_add_yblock;
4326 #ifdef CONFIG_VORBIS_DECODER
4327 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4329 #ifdef CONFIG_FLAC_ENCODER
4330 c->flac_compute_autocorr = ff_flac_compute_autocorr;
4332 c->vector_fmul = vector_fmul_c;
4333 c->vector_fmul_reverse = vector_fmul_reverse_c;
4334 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4335 c->float_to_int16 = ff_float_to_int16_c;
4337 c->shrink[0]= ff_img_copy_plane;
4338 c->shrink[1]= ff_shrink22;
4339 c->shrink[2]= ff_shrink44;
4340 c->shrink[3]= ff_shrink88;
4342 c->prefetch= just_return;
4344 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4345 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4347 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4348 if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
4349 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4350 if (ENABLE_VIS) dsputil_init_vis (c, avctx);
4351 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4352 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4353 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4354 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4355 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
4357 for(i=0; i<64; i++){
4358 if(!c->put_2tap_qpel_pixels_tab[0][i])
4359 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4360 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4361 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4364 switch(c->idct_permutation_type){
4365 case FF_NO_IDCT_PERM:
4367 c->idct_permutation[i]= i;
4369 case FF_LIBMPEG2_IDCT_PERM:
4371 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4373 case FF_SIMPLE_IDCT_PERM:
4375 c->idct_permutation[i]= simple_mmx_permutation[i];
4377 case FF_TRANSPOSE_IDCT_PERM:
4379 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4381 case FF_PARTTRANS_IDCT_PERM:
4383 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4386 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");