3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "mpegvideo.h"
33 #include "simple_idct.h"
/* Snow wavelet transform — defined in snow.c; declared here rather than in a header. */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
/* Vorbis channel decoupling — defined in vorbis.c; declared here rather than in a header. */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
/* Clamp lookup table with MAX_NEG_CROP headroom on each side; zero here,
 * presumably filled during dsputil initialization (not visible in this chunk).
 * Used as cm = ff_cropTbl + MAX_NEG_CROP so cm[x] clamps x into 0..255. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Squares lookup; used below as sq = ff_squareTbl + 256, so sq[d] is read for
 * d in roughly -256..255. Zero here — presumably filled at init time. */
uint32_t ff_squareTbl[512] = {0, };
/* Standard JPEG/MPEG zigzag scan order: entry i gives the raster index of
 * the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
0, 1, 8, 16, 9, 2, 3, 10,
17, 24, 32, 25, 18, 11, 4, 5,
12, 19, 26, 33, 40, 48, 41, 34,
27, 20, 13, 6, 7, 14, 21, 28,
35, 42, 49, 56, 57, 50, 43, 36,
29, 22, 15, 23, 30, 37, 44, 51,
58, 59, 52, 45, 38, 31, 39, 46,
53, 60, 61, 54, 47, 55, 62, 63
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
0, 8, 1, 9, 16, 24, 2, 10,
17, 25, 32, 40, 48, 56, 33, 41,
18, 26, 3, 11, 4, 12, 19, 27,
34, 42, 49, 57, 50, 58, 35, 43,
20, 28, 5, 13, 6, 14, 21, 29,
36, 44, 51, 59, 52, 60, 37, 45,
22, 30, 7, 15, 23, 31, 38, 46,
53, 61, 54, 62, 39, 47, 55, 63,
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Zero-initialized here; presumably filled during dsputil init — not visible
 * in this chunk. 8-byte aligned for MMX access. */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate (horizontal-first) coefficient scan order. */
const uint8_t ff_alternate_horizontal_scan[64] = {
0, 1, 2, 3, 8, 9, 16, 17,
10, 11, 4, 5, 6, 7, 15, 14,
13, 12, 19, 18, 24, 25, 32, 33,
26, 27, 20, 21, 22, 23, 28, 29,
30, 31, 34, 35, 40, 41, 48, 49,
42, 43, 36, 37, 38, 39, 44, 45,
46, 47, 50, 51, 56, 57, 58, 59,
52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate (vertical-first) coefficient scan order. */
const uint8_t ff_alternate_vertical_scan[64] = {
0, 8, 16, 24, 1, 9, 2, 10,
17, 25, 32, 40, 48, 56, 57, 49,
41, 33, 26, 18, 3, 11, 4, 12,
19, 27, 34, 42, 50, 58, 35, 43,
51, 59, 20, 28, 5, 13, 6, 14,
21, 29, 36, 44, 52, 60, 37, 45,
53, 61, 22, 30, 7, 15, 23, 31,
38, 46, 54, 62, 39, 47, 55, 63,
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Reciprocal table for division-by-multiplication: ff_inverse[b] is
 * approximately 2^32/b (rounded so the identity above holds). */
const uint32_t ff_inverse[256]={
0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
/* Input permutation for the simple_idct_mmx */
/* Each entry encodes the destination index for the corresponding input
 * coefficient (values are raster indices written in hex). */
static const uint8_t simple_mmx_permutation[64]={
0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/**
 * Sums the pixels of a 16x16 block (inner loop steps 8 pixels at a time).
 * NOTE(review): most of the body is elided in this extract.
 * @param pix       top-left of the block
 * @param line_size stride in bytes between rows
 */
static int pix_sum_c(uint8_t * pix, int line_size)
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
pix += line_size - 16;
/**
 * Sum of squared pixel values over a 16x16 block, via the ff_squareTbl
 * lookup (biased by +256 so signed indices are valid).
 * Reads 8 pixels per inner iteration: one 64-bit load where long is 64 bits,
 * otherwise two 32-bit loads. NOTE(review): parts of the body are elided.
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < 16; i++) {
for (j = 0; j < 16; j += 8) {
#if LONG_MAX > 2147483647
register uint64_t x=*(uint64_t*)pix;
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
s += sq[(x>>32)&0xff];
s += sq[(x>>40)&0xff];
s += sq[(x>>48)&0xff];
s += sq[(x>>56)&0xff];
register uint32_t x=*(uint32_t*)pix;
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
x=*(uint32_t*)(pix+4);
s += sq[(x>>8)&0xff];
s += sq[(x>>16)&0xff];
s += sq[(x>>24)&0xff];
pix += line_size - 16;
/**
 * Byte-swaps w 32-bit words from src into dst.
 * Main loop is unrolled 8x; the trailing line handles the remainder
 * (its loop header is elided in this extract).
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
for(i=0; i+8<=w; i+=8){
dst[i+0]= bswap_32(src[i+0]);
dst[i+1]= bswap_32(src[i+1]);
dst[i+2]= bswap_32(src[i+2]);
dst[i+3]= bswap_32(src[i+3]);
dst[i+4]= bswap_32(src[i+4]);
dst[i+5]= bswap_32(src[i+5]);
dst[i+6]= bswap_32(src[i+6]);
dst[i+7]= bswap_32(src[i+7]);
dst[i+0]= bswap_32(src[i+0]);
/**
 * Sum of squared differences over a 4-pixel-wide block, h rows.
 * The signed difference indexes ff_squareTbl biased by +256.
 * The unused void* matches the common comparison-function signature.
 */
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < h; i++) {
s += sq[pix1[0] - pix2[0]];
s += sq[pix1[1] - pix2[1]];
s += sq[pix1[2] - pix2[2]];
s += sq[pix1[3] - pix2[3]];
/** Sum of squared differences over an 8-pixel-wide block, h rows (see sse4_c). */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < h; i++) {
s += sq[pix1[0] - pix2[0]];
s += sq[pix1[1] - pix2[1]];
s += sq[pix1[2] - pix2[2]];
s += sq[pix1[3] - pix2[3]];
s += sq[pix1[4] - pix2[4]];
s += sq[pix1[5] - pix2[5]];
s += sq[pix1[6] - pix2[6]];
s += sq[pix1[7] - pix2[7]];
/** Sum of squared differences over a 16-pixel-wide block, h rows (see sse4_c). */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
uint32_t *sq = ff_squareTbl + 256;
for (i = 0; i < h; i++) {
s += sq[pix1[ 0] - pix2[ 0]];
s += sq[pix1[ 1] - pix2[ 1]];
s += sq[pix1[ 2] - pix2[ 2]];
s += sq[pix1[ 3] - pix2[ 3]];
s += sq[pix1[ 4] - pix2[ 4]];
s += sq[pix1[ 5] - pix2[ 5]];
s += sq[pix1[ 6] - pix2[ 6]];
s += sq[pix1[ 7] - pix2[ 7]];
s += sq[pix1[ 8] - pix2[ 8]];
s += sq[pix1[ 9] - pix2[ 9]];
s += sq[pix1[10] - pix2[10]];
s += sq[pix1[11] - pix2[11]];
s += sq[pix1[12] - pix2[12]];
s += sq[pix1[13] - pix2[13]];
s += sq[pix1[14] - pix2[14]];
s += sq[pix1[15] - pix2[15]];
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain distortion metric for the snow encoder: computes the
 * pixel difference (scaled <<4) into tmp, runs ff_spatial_dwt on it, then
 * walks each decomposition level/orientation weighting coefficients with
 * the scale[] tables (indexed [type][dec_count-3][level][ori]).
 * @param w    block width: 8 (3 decompositions) or 16/32 (4 decompositions)
 * @param type wavelet: 0 = 9/7, 1 = 5/3 (per the table comments)
 * NOTE(review): several body lines are elided in this extract.
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
const int dec_count= w==8 ? 3 : 4;
static const int scale[2][2][4][4]={
{268, 239, 239, 213},
// 9/7 16x16 or 32x32 dec=4
{344, 310, 310, 280},
{275, 245, 245, 218},
// 5/3 16x16 or 32x32 dec=4
{352, 317, 317, 286},
for (i = 0; i < h; i++) {
for (j = 0; j < w; j+=4) {
tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
for(level=0; level<dec_count; level++){
for(ori= level ? 1 : 0; ori<4; ori++){
int size= w>>(dec_count-level);
int sx= (ori&1) ? size : 0;
int stride= 32<<(dec_count-level);
int sy= (ori&2) ? stride>>1 : 0;
for(i=0; i<size; i++){
for(j=0; j<size; j++){
int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* Thin wrappers binding w_c to fixed widths; type 1 = 5/3 wavelet,
 * type 0 = 9/7 (see the scale[] comments in w_c). The 32-wide variants are
 * non-static — presumably referenced from another file; verify at link time. */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 8, h, 1);
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 8, h, 0);
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 16, h, 1);
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 16, h, 0);
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 32, h, 1);
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
return w_c(v, pix1, pix2, line_size, 32, h, 0);
/**
 * Copies an 8-pixel-wide block of unsigned pixels into a DCTELEM block
 * (widening each byte). Loop header/advance lines are elided in this extract.
 */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
/* read the pixels */
block[0] = pixels[0];
block[1] = pixels[1];
block[2] = pixels[2];
block[3] = pixels[3];
block[4] = pixels[4];
block[5] = pixels[5];
block[6] = pixels[6];
block[7] = pixels[7];
/**
 * Writes the per-pixel difference s1 - s2 of an 8-wide block into the
 * DCTELEM block. Loop header/advance lines are elided in this extract.
 */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
const uint8_t *s2, int stride){
/* read the pixels */
block[0] = s1[0] - s2[0];
block[1] = s1[1] - s2[1];
block[2] = s1[2] - s2[2];
block[3] = s1[3] - s2[3];
block[4] = s1[4] - s2[4];
block[5] = s1[5] - s2[5];
block[6] = s1[6] - s2[6];
block[7] = s1[7] - s2[7];
/**
 * Stores an 8-wide block of DCT coefficients as pixels, clamping each value
 * to 0..255 through the ff_cropTbl lookup (cm).
 */
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
pixels[2] = cm[block[2]];
pixels[3] = cm[block[3]];
pixels[4] = cm[block[4]];
pixels[5] = cm[block[5]];
pixels[6] = cm[block[6]];
pixels[7] = cm[block[7]];
/** 4-wide variant of put_pixels_clamped_c. */
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
pixels[2] = cm[block[2]];
pixels[3] = cm[block[3]];
/** 2-wide variant of put_pixels_clamped_c. */
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[block[0]];
pixels[1] = cm[block[1]];
/**
 * Stores an 8x8 block of signed DCT values as pixels, adding the +128 bias
 * and saturating (values above 127 are clamped; the low-side clamp line is
 * elided in this extract).
 */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
uint8_t *restrict pixels,
for (i = 0; i < 8; i++) {
for (j = 0; j < 8; j++) {
else if (*block > 127)
*pixels = (uint8_t)(*block + 128);
pixels += (line_size - 8);
/**
 * Adds an 8-wide block of DCT coefficients onto existing pixels, clamping
 * each result to 0..255 through ff_cropTbl.
 */
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
pixels[2] = cm[pixels[2] + block[2]];
pixels[3] = cm[pixels[3] + block[3]];
pixels[4] = cm[pixels[4] + block[4]];
pixels[5] = cm[pixels[5] + block[5]];
pixels[6] = cm[pixels[6] + block[6]];
pixels[7] = cm[pixels[7] + block[7]];
/** 4-wide variant of add_pixels_clamped_c. */
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
pixels[2] = cm[pixels[2] + block[2]];
pixels[3] = cm[pixels[3] + block[3]];
/** 2-wide variant of add_pixels_clamped_c. */
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* read the pixels */
pixels[0] = cm[pixels[0] + block[0]];
pixels[1] = cm[pixels[1] + block[1]];
/**
 * Adds DCT coefficients onto pixels, 8 wide, WITHOUT clamping — callers must
 * guarantee the results stay in range.
 */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
pixels[0] += block[0];
pixels[1] += block[1];
pixels[2] += block[2];
pixels[3] += block[3];
pixels[4] += block[4];
pixels[5] += block[5];
pixels[6] += block[6];
pixels[7] += block[7];
/** 4-wide variant of add_pixels8_c (no clamping). */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
pixels[0] += block[0];
pixels[1] += block[1];
pixels[2] += block[2];
pixels[3] += block[3];
/** Sum of absolute DCT coefficient values (loop header elided in this extract). */
static int sum_abs_dctelem_c(DCTELEM *block)
sum+= FFABS(block[i]);
/*
 * PIXOP2(OPNAME, OP) — 64-bit variant: generates copy and halfpel
 * (x2 / y2 / xy2, rounding and no-rounding) pixel operations working on
 * 8 bytes at once via 64-bit loads (LD64) and SWAR byte arithmetic with
 * 0xFEFE… / 0x0303… / 0xFCFC… masks. 16-wide versions are built from the
 * 8-wide ones via CALL_2X_PIXELS. OP is op_put or op_avg.
 * NOTE(review): several continuation lines of this macro are missing from
 * this extract — do not edit without the complete definition.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
OP(*((uint64_t*)block), LD64(pixels));\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+line_size);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+line_size);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0202020202020202ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
for(i=0; i<h; i+=2){\
uint64_t a= LD64(pixels );\
uint64_t b= LD64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0202020202020202ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0101010101010101ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
for(i=0; i<h; i+=2){\
uint64_t a= LD64(pixels );\
uint64_t b= LD64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0101010101010101ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
/*
 * PIXOP2(OPNAME, OP) — 32-bit variant: generates the 2/4/8/16-wide copy,
 * two-source average (_l2) and four-source average (_l4) primitives plus the
 * halfpel x2/y2/xy2 helpers, using 16/32-bit loads (LD16/LD32) and SWAR byte
 * arithmetic with 0x0303… / 0xFCFC… / 0x0F0F… masks. The xy2 helpers bias
 * with 0x0202… (rounding) or 0x0101… (no rounding). OP is op_put or op_avg.
 * NOTE(review): several continuation lines of this macro are missing from
 * this extract — do not edit without the complete definition.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint16_t*)(block )), LD16(pixels ));\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint32_t*)(block )), LD32(pixels ));\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint32_t*)(block )), LD32(pixels ));\
OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_c(block, pixels, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD16(&src1[i*src_stride1 ]);\
b= LD16(&src2[i*src_stride2 ]);\
OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
uint32_t a, b, c, d, l0, l1, h0, h1;\
a= LD32(&src1[i*src_stride1]);\
b= LD32(&src2[i*src_stride2]);\
c= LD32(&src3[i*src_stride3]);\
d= LD32(&src4[i*src_stride4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
c= LD32(&src3[i*src_stride3+4]);\
d= LD32(&src4[i*src_stride4+4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
uint32_t a, b, c, d, l0, l1, h0, h1;\
a= LD32(&src1[i*src_stride1]);\
b= LD32(&src2[i*src_stride2]);\
c= LD32(&src3[i*src_stride3]);\
d= LD32(&src4[i*src_stride4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
c= LD32(&src3[i*src_stride3+4]);\
d= LD32(&src4[i*src_stride4+4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
int i, a0, b0, a1, b1;\
for(i=0; i<h; i+=2){\
block[0]= (a1+a0)>>2; /* FIXME non put */\
block[1]= (b1+b0)>>2;\
block[0]= (a1+a0)>>2;\
block[1]= (b1+b0)>>2;\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
+ (b&0x03030303UL);\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
for(j=0; j<2; j++){\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
+ (b&0x03030303UL);\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
pixels+=4-line_size*(h+1);\
block +=4-line_size*h;\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
for(j=0; j<2; j++){\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
+ (b&0x03030303UL);\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
pixels+=4-line_size*(h+1);\
block +=4-line_size*h;\
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
#define op_avg(a, b) a = rnd_avg32(a, b)
#define op_put(a, b) a = b
/**
 * Rounded averages used by the interpolation helpers below.
 * avg2:  (a+b+1)/2          avg4: (a+b+c+d+2)/4
 * Arguments are fully parenthesized so that expansions with low-precedence
 * operands (e.g. avg2(x ? p : q, r)) evaluate correctly — the previous
 * unparenthesized form was a precedence hazard.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* Equal-stride wrappers around the macro-generated two-source averagers. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/**
 * 1/16-pel bilinear motion compensation (single warp point): each output
 * pixel is the 2x2 bilinear blend of src with weights A..D derived from the
 * fractional offsets (x16, y16), plus rounder, scaled down by >>8
 * (A+B+C+D == 256). Row loop/advance lines are elided in this extract.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
const int A=(16-x16)*(16-y16);
const int B=( x16)*(16-y16);
const int C=(16-x16)*( y16);
const int D=( x16)*( y16);
dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/**
 * Global motion compensation: for each destination pixel, the source
 * position is derived from (ox,oy) and the dxx/dxy/dyx/dyy increments at
 * 1<<shift subpel precision, then bilinearly interpolated. Coordinates that
 * fall outside width/height are clamped with av_clip, and the interpolation
 * degenerates along the clamped axis. NOTE(review): the outer y loop and
 * per-pixel coordinate updates are elided in this extract.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
const int s= 1<<shift;
for(x=0; x<8; x++){ //XXX FIXME optimize
int src_x, src_y, frac_x, frac_y, index;
frac_x= src_x&(s-1);
frac_y= src_y&(s-1);
if((unsigned)src_x < width){
if((unsigned)src_y < height){
index= src_x + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*(s-frac_y)
+ ( src[index+stride ]*(s-frac_x)
+ src[index+stride+1]* frac_x )* frac_y
index= src_x + av_clip(src_y, 0, height)*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*s
if((unsigned)src_y < height){
index= av_clip(src_x, 0, width) + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
+ src[index+stride ]* frac_y )*s
index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
dst[y*stride + x]= src[index ];
/** Full-pel thirdpel "copy" case: dispatches on block width to the plain put routines. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
case 2: put_pixels2_c (dst, src, stride, height); break;
case 4: put_pixels4_c (dst, src, stride, height); break;
case 8: put_pixels8_c (dst, src, stride, height); break;
case 16:put_pixels16_c(dst, src, stride, height); break;
1224 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1226 for (i=0; i < height; i++) {
1227 for (j=0; j < width; j++) {
1228 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Thirdpel MC, x = 2/3: per pixel round((cur + 2*right)/3); 683 = 2^11/3.
 * (Elided: i/j declarations, per-row src/dst advances.) */
1235 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1237 for (i=0; i < height; i++) {
1238 for (j=0; j < width; j++) {
1239 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Thirdpel MC, y = 1/3: per pixel round((2*cur + below)/3); 683 = 2^11/3.
 * (Elided: i/j declarations, per-row src/dst advances.) */
1246 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1248 for (i=0; i < height; i++) {
1249 for (j=0; j < width; j++) {
1250 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Thirdpel MC, (x,y) = (1/3,1/3): 2x2 bilinear blend with weights
 * (4,3,3,2)/12 over {cur, right, below, below-right}; 2731 = round(2^15/12).
 * (Elided: i/j declarations, per-row src/dst advances.) */
1257 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1259 for (i=0; i < height; i++) {
1260 for (j=0; j < width; j++) {
1261 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, (x,y) = (1/3,2/3): weights (3,2,4,3)/12; 2731 = 2^15/12.
 * (Elided: i/j declarations, per-row src/dst advances.) */
1268 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1270 for (i=0; i < height; i++) {
1271 for (j=0; j < width; j++) {
1272 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, y = 2/3: per pixel round((cur + 2*below)/3); 683 = 2^11/3.
 * (Elided: i/j declarations, per-row src/dst advances.) */
1279 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1281 for (i=0; i < height; i++) {
1282 for (j=0; j < width; j++) {
1283 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Thirdpel MC, (x,y) = (2/3,1/3): weights (3,4,2,3)/12; 2731 = 2^15/12.
 * (Elided: i/j declarations, per-row src/dst advances.) */
1290 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1292 for (i=0; i < height; i++) {
1293 for (j=0; j < width; j++) {
1294 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, (x,y) = (2/3,2/3): weights (2,3,3,4)/12; 2731 = 2^15/12.
 * (Elided: i/j declarations, per-row src/dst advances.) */
1301 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1303 for (i=0; i < height; i++) {
1304 for (j=0; j < width; j++) {
1305 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC (averaging), no sub-pel offset: dispatch on width to the
 * fixed-size avg_pixels helpers, which round-average into dst.
 * (The switch header and closing braces are elided in this excerpt.) */
1312 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1314 case 2: avg_pixels2_c (dst, src, stride, height); break;
1315 case 4: avg_pixels4_c (dst, src, stride, height); break;
1316 case 8: avg_pixels8_c (dst, src, stride, height); break;
1317 case 16:avg_pixels16_c(dst, src, stride, height); break;
/* Averaging variant of put_tpel_pixels_mc10_c: same x = 1/3 filter, then a
 * round-up average with the existing dst pixel. */
1321 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1323 for (i=0; i < height; i++) {
1324 for (j=0; j < width; j++) {
1325 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc20_c (x = 2/3 filter + dst average). */
1332 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1334 for (i=0; i < height; i++) {
1335 for (j=0; j < width; j++) {
1336 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc01_c (y = 1/3 filter + dst average). */
1343 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1345 for (i=0; i < height; i++) {
1346 for (j=0; j < width; j++) {
1347 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc11_c (weights (4,3,3,2)/12 + dst average). */
1354 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1356 for (i=0; i < height; i++) {
1357 for (j=0; j < width; j++) {
1358 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc12_c (weights (3,2,4,3)/12 + dst average). */
1365 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1367 for (i=0; i < height; i++) {
1368 for (j=0; j < width; j++) {
1369 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc02_c (y = 2/3 filter + dst average). */
1376 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1378 for (i=0; i < height; i++) {
1379 for (j=0; j < width; j++) {
1380 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc21_c (weights (3,4,2,3)/12 + dst average). */
1387 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1389 for (i=0; i < height; i++) {
1390 for (j=0; j < width; j++) {
1391 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc22_c (weights (2,3,3,4)/12 + dst average). */
1398 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1400 for (i=0; i < height; i++) {
1401 for (j=0; j < width; j++) {
1402 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* TPEL_WIDTH(width): declares the fixed-width thirdpel "put" wrappers (one
 * per MC sub-position) that forward to the generic width-parameterized
 * put_tpel_pixels_mc*_c implementations above.
 * Fix: each forwarding statement previously began with a stray `void`,
 * which turned the intended call into a malformed local function
 * declaration — the wrappers never invoked the generic functions.  The
 * `void` keyword is removed so each wrapper actually delegates. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H264_CHROMA_MC(OPNAME, OP): expands to 2-, 4- and 8-wide chroma MC
 * functions using 1/8-pel bilinear weights A..D (A+B+C+D = (8-x+x)*(8-y+y)
 * = 64); OP stores or averages the 64-scaled sum into dst.
 * NOTE(review): loop headers, fast paths and closing braces are elided in
 * this excerpt. */
1430 #define H264_CHROMA_MC(OPNAME, OP)\
1431 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1432 const int A=(8-x)*(8-y);\
1433 const int B=( x)*(8-y);\
1434 const int C=(8-x)*( y);\
1435 const int D=( x)*( y);\
1438 assert(x<8 && y<8 && x>=0 && y>=0);\
1442 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1443 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
/* 4-wide variant: identical weighting, two more output columns */\
1449 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1450 const int A=(8-x)*(8-y);\
1451 const int B=( x)*(8-y);\
1452 const int C=(8-x)*( y);\
1453 const int D=( x)*( y);\
1456 assert(x<8 && y<8 && x>=0 && y>=0);\
1460 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1461 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1462 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1463 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
/* 8-wide variant */\
1469 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1470 const int A=(8-x)*(8-y);\
1471 const int B=( x)*(8-y);\
1472 const int C=(8-x)*( y);\
1473 const int D=( x)*( y);\
1476 assert(x<8 && y<8 && x>=0 && y>=0);\
1480 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1481 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1482 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1483 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1484 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1485 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1486 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1487 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* Chroma store ops: the weight sum is 64, hence the +32 >> 6 rounding
 * renormalisation; op_avg additionally round-averages with existing dst. */
1493 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1494 #define op_put(a, b) a = (((b) + 32)>>6)
/* Instantiate put_/avg_ h264_chroma_mc{2,4,8}_c. */
1496 H264_CHROMA_MC(put_ , op_put)
1497 H264_CHROMA_MC(avg_ , op_avg)
/* Like put_h264_chroma_mc8_c but "no rounding": bias 32-4 = 28 instead of
 * 32 before the >>6, biasing results downwards (presumably for a
 * no-rounding codec mode — verify against callers).  NOTE(review): the
 * per-row loop and src/dst advances are elided in this excerpt. */
1501 static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1502 const int A=(8-x)*(8-y);
1503 const int B=( x)*(8-y);
1504 const int C=(8-x)*( y);
1505 const int D=( x)*( y);
1508 assert(x<8 && y<8 && x>=0 && y>=0);
1512 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1513 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1514 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1515 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1516 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1517 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1518 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1519 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
/* QPEL_MC(r, OPNAME, RND, OP): expands to the complete family of MPEG-4
 * quarter-pel motion-compensation routines for 8x8 and 16x16 luma blocks:
 * horizontal/vertical lowpass filters (tap set 20,-6,3,-1 per sample pair,
 * weight sum 32, with right/bottom edge taps mirrored back into the block)
 * plus the mcXY sub-position functions built from them (X/Y = horizontal/
 * vertical quarter-pel phase).  OP clips-and-stores or clips-and-averages;
 * RND selects the rounding flavour of the intermediate half-pel planes.
 * The ff_*_old_c variants are the older, more exact 4-plane (_l4)
 * averaging implementations.  NOTE(review): loop headers, several local
 * declarations and closing braces are elided in this excerpt. */
1525 #define QPEL_MC(r, OPNAME, RND, OP) \
1526 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1527 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1531 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1532 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1533 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1534 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1535 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
/* right-edge taps are mirrored: src[8] is the last available sample */\
1536 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1537 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1538 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* 8-wide vertical lowpass: the same taps applied down each column */\
1544 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1546 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1550 const int src0= src[0*srcStride];\
1551 const int src1= src[1*srcStride];\
1552 const int src2= src[2*srcStride];\
1553 const int src3= src[3*srcStride];\
1554 const int src4= src[4*srcStride];\
1555 const int src5= src[5*srcStride];\
1556 const int src6= src[6*srcStride];\
1557 const int src7= src[7*srcStride];\
1558 const int src8= src[8*srcStride];\
1559 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1560 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1561 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1562 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1563 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1564 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1565 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1566 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* 16-wide horizontal lowpass */\
1572 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1573 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1578 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1579 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1580 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1581 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1582 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1583 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1584 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1585 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1586 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1587 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1588 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1589 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1590 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1591 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1592 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1593 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* 16-wide vertical lowpass */\
1599 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1600 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1605 const int src0= src[0*srcStride];\
1606 const int src1= src[1*srcStride];\
1607 const int src2= src[2*srcStride];\
1608 const int src3= src[3*srcStride];\
1609 const int src4= src[4*srcStride];\
1610 const int src5= src[5*srcStride];\
1611 const int src6= src[6*srcStride];\
1612 const int src7= src[7*srcStride];\
1613 const int src8= src[8*srcStride];\
1614 const int src9= src[9*srcStride];\
1615 const int src10= src[10*srcStride];\
1616 const int src11= src[11*srcStride];\
1617 const int src12= src[12*srcStride];\
1618 const int src13= src[13*srcStride];\
1619 const int src14= src[14*srcStride];\
1620 const int src15= src[15*srcStride];\
1621 const int src16= src[16*srcStride];\
1622 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1623 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1624 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1625 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1626 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1627 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1628 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1629 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1630 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1631 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1632 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1633 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1634 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1635 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1636 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1637 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* 8x8 mcXY sub-position functions: half/full-pel planes are combined with */\
/* OPNAME ## pixels8_l2/_l4 averaging helpers */\
1643 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1644 OPNAME ## pixels8_c(dst, src, stride, 8);\
1647 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1649 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1650 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1653 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1654 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1657 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1659 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1660 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1663 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1664 uint8_t full[16*9];\
1666 copy_block9(full, src, 16, stride, 9);\
1667 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1668 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1671 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1672 uint8_t full[16*9];\
1673 copy_block9(full, src, 16, stride, 9);\
1674 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1677 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1678 uint8_t full[16*9];\
1680 copy_block9(full, src, 16, stride, 9);\
1681 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1682 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
/* _old_c: exact 4-plane average of full, halfH, halfV and halfHV */\
1684 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1685 uint8_t full[16*9];\
1688 uint8_t halfHV[64];\
1689 copy_block9(full, src, 16, stride, 9);\
1690 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1691 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1692 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1693 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1695 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1696 uint8_t full[16*9];\
1698 uint8_t halfHV[64];\
1699 copy_block9(full, src, 16, stride, 9);\
1700 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1701 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1702 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1703 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1705 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1706 uint8_t full[16*9];\
1709 uint8_t halfHV[64];\
1710 copy_block9(full, src, 16, stride, 9);\
1711 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1712 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1713 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1714 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1716 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1717 uint8_t full[16*9];\
1719 uint8_t halfHV[64];\
1720 copy_block9(full, src, 16, stride, 9);\
1721 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1722 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1723 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1724 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1726 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1727 uint8_t full[16*9];\
1730 uint8_t halfHV[64];\
1731 copy_block9(full, src, 16, stride, 9);\
1732 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1733 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1734 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1735 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1737 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1738 uint8_t full[16*9];\
1740 uint8_t halfHV[64];\
1741 copy_block9(full, src, 16, stride, 9);\
1742 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1744 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1747 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1748 uint8_t full[16*9];\
1751 uint8_t halfHV[64];\
1752 copy_block9(full, src, 16, stride, 9);\
1753 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1754 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1755 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1756 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1758 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1759 uint8_t full[16*9];\
1761 uint8_t halfHV[64];\
1762 copy_block9(full, src, 16, stride, 9);\
1763 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1765 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1768 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1770 uint8_t halfHV[64];\
1771 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1772 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1773 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1775 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1777 uint8_t halfHV[64];\
1778 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1779 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1780 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1782 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1783 uint8_t full[16*9];\
1786 uint8_t halfHV[64];\
1787 copy_block9(full, src, 16, stride, 9);\
1788 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1789 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1790 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1791 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1793 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1794 uint8_t full[16*9];\
1796 copy_block9(full, src, 16, stride, 9);\
1797 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1798 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1799 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1801 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1802 uint8_t full[16*9];\
1805 uint8_t halfHV[64];\
1806 copy_block9(full, src, 16, stride, 9);\
1807 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1809 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1810 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1812 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1813 uint8_t full[16*9];\
1815 copy_block9(full, src, 16, stride, 9);\
1816 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1817 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1818 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1820 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1822 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1823 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* 16x16 mcXY sub-position functions (24-wide scratch, 17 source rows) */\
1825 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1826 OPNAME ## pixels16_c(dst, src, stride, 16);\
1829 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1831 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1832 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1835 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1836 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1839 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1841 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1842 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1845 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1846 uint8_t full[24*17];\
1848 copy_block17(full, src, 24, stride, 17);\
1849 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1850 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1853 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1854 uint8_t full[24*17];\
1855 copy_block17(full, src, 24, stride, 17);\
1856 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1859 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1860 uint8_t full[24*17];\
1862 copy_block17(full, src, 24, stride, 17);\
1863 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1864 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1866 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1867 uint8_t full[24*17];\
1868 uint8_t halfH[272];\
1869 uint8_t halfV[256];\
1870 uint8_t halfHV[256];\
1871 copy_block17(full, src, 24, stride, 17);\
1872 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1873 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1874 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1875 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1877 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1878 uint8_t full[24*17];\
1879 uint8_t halfH[272];\
1880 uint8_t halfHV[256];\
1881 copy_block17(full, src, 24, stride, 17);\
1882 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1883 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1884 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1885 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1887 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1888 uint8_t full[24*17];\
1889 uint8_t halfH[272];\
1890 uint8_t halfV[256];\
1891 uint8_t halfHV[256];\
1892 copy_block17(full, src, 24, stride, 17);\
1893 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1894 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1895 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1896 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1898 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1899 uint8_t full[24*17];\
1900 uint8_t halfH[272];\
1901 uint8_t halfHV[256];\
1902 copy_block17(full, src, 24, stride, 17);\
1903 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1904 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1905 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1906 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1908 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1909 uint8_t full[24*17];\
1910 uint8_t halfH[272];\
1911 uint8_t halfV[256];\
1912 uint8_t halfHV[256];\
1913 copy_block17(full, src, 24, stride, 17);\
1914 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1915 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1916 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1917 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1919 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1920 uint8_t full[24*17];\
1921 uint8_t halfH[272];\
1922 uint8_t halfHV[256];\
1923 copy_block17(full, src, 24, stride, 17);\
1924 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1926 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1929 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1930 uint8_t full[24*17];\
1931 uint8_t halfH[272];\
1932 uint8_t halfV[256];\
1933 uint8_t halfHV[256];\
1934 copy_block17(full, src, 24, stride, 17);\
1935 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1936 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1937 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1938 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1940 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1941 uint8_t full[24*17];\
1942 uint8_t halfH[272];\
1943 uint8_t halfHV[256];\
1944 copy_block17(full, src, 24, stride, 17);\
1945 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1947 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1950 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1951 uint8_t halfH[272];\
1952 uint8_t halfHV[256];\
1953 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1954 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1955 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1957 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1958 uint8_t halfH[272];\
1959 uint8_t halfHV[256];\
1960 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1961 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1962 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1964 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1965 uint8_t full[24*17];\
1966 uint8_t halfH[272];\
1967 uint8_t halfV[256];\
1968 uint8_t halfHV[256];\
1969 copy_block17(full, src, 24, stride, 17);\
1970 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1971 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1972 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1973 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1975 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1976 uint8_t full[24*17];\
1977 uint8_t halfH[272];\
1978 copy_block17(full, src, 24, stride, 17);\
1979 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1980 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1981 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1983 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1984 uint8_t full[24*17];\
1985 uint8_t halfH[272];\
1986 uint8_t halfV[256];\
1987 uint8_t halfHV[256];\
1988 copy_block17(full, src, 24, stride, 17);\
1989 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1991 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1992 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1994 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1995 uint8_t full[24*17];\
1996 uint8_t halfH[272];\
1997 copy_block17(full, src, 24, stride, 17);\
1998 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1999 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2000 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2002 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2003 uint8_t halfH[272];\
2004 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2005 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Quarter-pel store ops: the lowpass weight sum is 2*(20-6+3-1) = 32, hence
 * the +16 >> 5 rounding; the *_no_rnd variants use +15 (round down).  cm is
 * the 0..255 clip table set up inside each QPEL_MC-generated function. */
2008 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2009 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2010 #define op_put(a, b) a = cm[((b) + 16)>>5]
2011 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the put_, put_no_rnd_ and avg_ quarter-pel families. */
2013 QPEL_MC(0, put_       , _       , op_put)
2014 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2015 QPEL_MC(0, avg_       , _       , op_avg)
2016 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2018 #undef op_avg_no_rnd
2020 #undef op_put_no_rnd
/* H264_LOWPASS(OPNAME, OP, OP2): expands to the H.264 6-tap
 * (1,-5,20,20,-5,1) half-pel lowpass filters at widths 2, 4, 8 and 16.
 * OP stores/averages a 1-D filtered sum; OP2 handles the 2-D (hv) sum
 * which carries an extra factor of 32 (see op2_* below: (x+512)>>10).
 * NOTE(review): loop headers, local declarations and closing braces of
 * these functions appear elided in this dump — confirm against the
 * original file before editing. */
2023 #define H264_LOWPASS(OPNAME, OP, OP2) \
/* width-2 horizontal pass */\
2024 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2026 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2030 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2031 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
/* width-2 vertical pass */\
2037 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2039 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2043 const int srcB= src[-2*srcStride];\
2044 const int srcA= src[-1*srcStride];\
2045 const int src0= src[0 *srcStride];\
2046 const int src1= src[1 *srcStride];\
2047 const int src2= src[2 *srcStride];\
2048 const int src3= src[3 *srcStride];\
2049 const int src4= src[4 *srcStride];\
2050 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2051 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
/* width-2 combined pass: horizontal filter into int16 tmp[], then
 * vertical filter of tmp into dst via OP2 */\
2057 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2060 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
/* back up two rows so the 6-tap window has its top context */\
2062 src -= 2*srcStride;\
2063 for(i=0; i<h+5; i++)\
2065 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2066 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
/* rewind tmp to its third row (first output row of the vertical pass) */\
2070 tmp -= tmpStride*(h+5-2);\
2073 const int tmpB= tmp[-2*tmpStride];\
2074 const int tmpA= tmp[-1*tmpStride];\
2075 const int tmp0= tmp[0 *tmpStride];\
2076 const int tmp1= tmp[1 *tmpStride];\
2077 const int tmp2= tmp[2 *tmpStride];\
2078 const int tmp3= tmp[3 *tmpStride];\
2079 const int tmp4= tmp[4 *tmpStride];\
2080 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2081 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
/* width-4 horizontal 6-tap pass (same (1,-5,20,20,-5,1) kernel) */\
2086 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2088 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2092 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2093 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2094 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2095 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
/* width-4 vertical pass */\
2101 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2103 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2107 const int srcB= src[-2*srcStride];\
2108 const int srcA= src[-1*srcStride];\
2109 const int src0= src[0 *srcStride];\
2110 const int src1= src[1 *srcStride];\
2111 const int src2= src[2 *srcStride];\
2112 const int src3= src[3 *srcStride];\
2113 const int src4= src[4 *srcStride];\
2114 const int src5= src[5 *srcStride];\
2115 const int src6= src[6 *srcStride];\
2116 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2117 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2118 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2119 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
/* width-4 combined hv pass: horizontal into int16 tmp[], vertical via OP2 */\
2125 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2128 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2130 src -= 2*srcStride;\
2131 for(i=0; i<h+5; i++)\
2133 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2134 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2135 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2136 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2140 tmp -= tmpStride*(h+5-2);\
2143 const int tmpB= tmp[-2*tmpStride];\
2144 const int tmpA= tmp[-1*tmpStride];\
2145 const int tmp0= tmp[0 *tmpStride];\
2146 const int tmp1= tmp[1 *tmpStride];\
2147 const int tmp2= tmp[2 *tmpStride];\
2148 const int tmp3= tmp[3 *tmpStride];\
2149 const int tmp4= tmp[4 *tmpStride];\
2150 const int tmp5= tmp[5 *tmpStride];\
2151 const int tmp6= tmp[6 *tmpStride];\
2152 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2153 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2154 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2155 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
/* width-8 horizontal 6-tap pass, fully unrolled across the row */\
2161 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2163 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2167 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2168 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2169 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2170 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2171 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2172 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2173 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2174 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
/* width-8 vertical pass; loads the full 13-row column then filters */\
2180 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2182 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2186 const int srcB= src[-2*srcStride];\
2187 const int srcA= src[-1*srcStride];\
2188 const int src0= src[0 *srcStride];\
2189 const int src1= src[1 *srcStride];\
2190 const int src2= src[2 *srcStride];\
2191 const int src3= src[3 *srcStride];\
2192 const int src4= src[4 *srcStride];\
2193 const int src5= src[5 *srcStride];\
2194 const int src6= src[6 *srcStride];\
2195 const int src7= src[7 *srcStride];\
2196 const int src8= src[8 *srcStride];\
2197 const int src9= src[9 *srcStride];\
2198 const int src10=src[10*srcStride];\
2199 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2200 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2201 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2202 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2203 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2204 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2205 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2206 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
/* width-8 combined hv pass: horizontal into int16 tmp[], vertical via OP2 */\
2212 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2215 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2217 src -= 2*srcStride;\
2218 for(i=0; i<h+5; i++)\
2220 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2221 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2222 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2223 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2224 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2225 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2226 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2227 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2231 tmp -= tmpStride*(h+5-2);\
2234 const int tmpB= tmp[-2*tmpStride];\
2235 const int tmpA= tmp[-1*tmpStride];\
2236 const int tmp0= tmp[0 *tmpStride];\
2237 const int tmp1= tmp[1 *tmpStride];\
2238 const int tmp2= tmp[2 *tmpStride];\
2239 const int tmp3= tmp[3 *tmpStride];\
2240 const int tmp4= tmp[4 *tmpStride];\
2241 const int tmp5= tmp[5 *tmpStride];\
2242 const int tmp6= tmp[6 *tmpStride];\
2243 const int tmp7= tmp[7 *tmpStride];\
2244 const int tmp8= tmp[8 *tmpStride];\
2245 const int tmp9= tmp[9 *tmpStride];\
2246 const int tmp10=tmp[10*tmpStride];\
2247 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2248 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2249 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2250 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2251 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2252 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2253 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2254 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16-wide filters: each is four 8-wide calls in a 2x2 arrangement
 * (left/right halves, then advance 8 rows and repeat). */\
2260 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2261 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2262 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2263 src += 8*srcStride;\
2264 dst += 8*dstStride;\
2265 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2266 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2269 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2270 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2271 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2272 src += 8*srcStride;\
2273 dst += 8*dstStride;\
2274 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2275 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2278 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2279 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2280 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2281 src += 8*srcStride;\
2282 dst += 8*dstStride;\
2283 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2284 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC(OPNAME, SIZE): generates the 16 quarter-pel motion-compensation
 * entry points _mcXY_c, where X/Y are the quarter-pel horizontal/vertical
 * phases (0..3).  Full-pel copies use pixelsN_c; quarter-pel positions are
 * built by averaging (pixelsN_l2) the appropriate half-pel intermediates.
 * "full" buffers hold SIZE x (SIZE+5) source copies including the two
 * context rows above and three below that the 6-tap vertical filter needs;
 * full_mid points at the first real row.
 * NOTE(review): closing braces of the generated functions appear elided in
 * this dump. */\
2287 #define H264_MC(OPNAME, SIZE) \
/* (0,0): plain full-pel copy/average */\
2288 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2289 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
/* (1,0): average of src and horizontal half-pel */\
2292 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2293 uint8_t half[SIZE*SIZE];\
2294 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2295 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
/* (2,0): horizontal half-pel directly */\
2298 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2299 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
/* (3,0): average of src+1 and horizontal half-pel */\
2302 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2303 uint8_t half[SIZE*SIZE];\
2304 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2305 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
/* (0,1): average of src and vertical half-pel */\
2308 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2309 uint8_t full[SIZE*(SIZE+5)];\
2310 uint8_t * const full_mid= full + SIZE*2;\
2311 uint8_t half[SIZE*SIZE];\
2312 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2313 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2314 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
/* (0,2): vertical half-pel directly */\
2317 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2318 uint8_t full[SIZE*(SIZE+5)];\
2319 uint8_t * const full_mid= full + SIZE*2;\
2320 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2321 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
/* (0,3): average of the row below and the vertical half-pel */\
2324 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2325 uint8_t full[SIZE*(SIZE+5)];\
2326 uint8_t * const full_mid= full + SIZE*2;\
2327 uint8_t half[SIZE*SIZE];\
2328 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2329 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2330 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
/* Diagonal quarter-pel positions: average of a horizontal half-pel
 * (halfH, taken at the nearer row) and a vertical half-pel (halfV,
 * taken at the nearer column via the +1 offset in copy_block). */\
/* (1,1) */\
2333 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2334 uint8_t full[SIZE*(SIZE+5)];\
2335 uint8_t * const full_mid= full + SIZE*2;\
2336 uint8_t halfH[SIZE*SIZE];\
2337 uint8_t halfV[SIZE*SIZE];\
2338 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2339 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2340 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2341 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* (3,1): vertical half-pel taken one column to the right */\
2344 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2345 uint8_t full[SIZE*(SIZE+5)];\
2346 uint8_t * const full_mid= full + SIZE*2;\
2347 uint8_t halfH[SIZE*SIZE];\
2348 uint8_t halfV[SIZE*SIZE];\
2349 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2350 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2351 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2352 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* (1,3): horizontal half-pel taken one row below */\
2355 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2356 uint8_t full[SIZE*(SIZE+5)];\
2357 uint8_t * const full_mid= full + SIZE*2;\
2358 uint8_t halfH[SIZE*SIZE];\
2359 uint8_t halfV[SIZE*SIZE];\
2360 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2361 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2362 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2363 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* (3,3): offsets in both directions */\
2366 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2367 uint8_t full[SIZE*(SIZE+5)];\
2368 uint8_t * const full_mid= full + SIZE*2;\
2369 uint8_t halfH[SIZE*SIZE];\
2370 uint8_t halfV[SIZE*SIZE];\
2371 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2372 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2373 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2374 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* Center and half-adjacent positions built around the 2-D (hv) filter. */\
/* (2,2): 2-D half-pel directly */\
2377 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2378 int16_t tmp[SIZE*(SIZE+5)];\
2379 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
/* (2,1): average of horizontal half-pel and 2-D half-pel */\
2382 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2383 int16_t tmp[SIZE*(SIZE+5)];\
2384 uint8_t halfH[SIZE*SIZE];\
2385 uint8_t halfHV[SIZE*SIZE];\
2386 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2387 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2388 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
/* (2,3): same but horizontal half-pel taken one row below */\
2391 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2392 int16_t tmp[SIZE*(SIZE+5)];\
2393 uint8_t halfH[SIZE*SIZE];\
2394 uint8_t halfHV[SIZE*SIZE];\
2395 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2396 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2397 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
/* (1,2): average of vertical half-pel and 2-D half-pel */\
2400 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2401 uint8_t full[SIZE*(SIZE+5)];\
2402 uint8_t * const full_mid= full + SIZE*2;\
2403 int16_t tmp[SIZE*(SIZE+5)];\
2404 uint8_t halfV[SIZE*SIZE];\
2405 uint8_t halfHV[SIZE*SIZE];\
2406 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2407 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2408 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2409 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* (3,2): same but vertical half-pel taken one column to the right */\
2412 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2413 uint8_t full[SIZE*(SIZE+5)];\
2414 uint8_t * const full_mid= full + SIZE*2;\
2415 int16_t tmp[SIZE*(SIZE+5)];\
2416 uint8_t halfV[SIZE*SIZE];\
2417 uint8_t halfHV[SIZE*SIZE];\
2418 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2419 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2420 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2421 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store/average ops for the H.264 filters: op_* normalize the 1-D 6-tap
 * sum ((x+16)>>5), op2_* the 2-D sum ((x+512)>>10); cm[] clips to 0..255. */
2424 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2425 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2426 #define op_put(a, b) a = cm[((b) + 16)>>5]
2427 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2428 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put and avg variants of the H.264 lowpass filters. */
2430 H264_LOWPASS(put_ , op_put, op2_put)
2431 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* H.264 weighted prediction.  op_scale1 applies explicit weighting in
 * place ((pix*weight + offset) >> log2_denom); op_scale2 bidirectionally
 * weights src into dst with a (log2_denom+1) shift.
 * H264_WEIGHT(W,H) generates weight/biweight functions for a WxH block;
 * the W==2/4/8 "continue" ladders stop each row after the block width.
 * NOTE(review): the per-pixel op_scale invocations between the continue
 * statements, and the function closers, appear elided in this dump. */
2446 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2447 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2448 #define H264_WEIGHT(W,H) \
/* explicit (unidirectional) weighting, in place on block[] */\
2449 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2451 offset <<= log2_denom; \
/* add rounding term when a shift will be applied */\
2452 if(log2_denom) offset += 1<<(log2_denom-1); \
2453 for(y=0; y<H; y++, block += stride){ \
2456 if(W==2) continue; \
2459 if(W==4) continue; \
2464 if(W==8) continue; \
/* bidirectional weighting of src into dst */\
2475 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
/* force the rounding offset odd before scaling */\
2477 offset = ((offset + 1) | 1) << log2_denom; \
2478 for(y=0; y<H; y++, dst += stride, src += stride){ \
2481 if(W==2) continue; \
2484 if(W==4) continue; \
2489 if(W==8) continue; \
/* WMV2 8-wide horizontal lowpass: 4-tap (-1,9,9,-1)/16 filter with
 * rounding (+8), clipped through cm[].
 * NOTE(review): the per-row loop and advance of dst/src by their strides
 * appear elided in this dump. */
2516 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2517 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2521 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2522 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2523 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2524 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2525 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2526 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2527 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2528 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* Conditionally compiled full-pel copy/average entry points exported to
 * the CAVS and VC-1/WMV3 DSP modules; each just forwards to the generic
 * pixelsN put/avg primitives defined earlier in this file. */
2534 #ifdef CONFIG_CAVS_DECODER
/* defined in cavsdsp.c */
2536 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2538 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2539 put_pixels8_c(dst, src, stride, 8);
2541 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2542 avg_pixels8_c(dst, src, stride, 8);
2544 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2545 put_pixels16_c(dst, src, stride, 16);
2547 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2548 avg_pixels16_c(dst, src, stride, 16);
2550 #endif /* CONFIG_CAVS_DECODER */
2552 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* defined in vc1dsp.c; rnd parameter is unused for the full-pel case */
2554 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2556 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2557 put_pixels8_c(dst, src, stride, 8);
2559 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2561 #if defined(CONFIG_H264_ENCODER)
/* defined in h264dsp.c */
2563 void ff_h264dsp_init(DSPContext* c, AVCodecContext *avctx);
2564 #endif /* CONFIG_H264_ENCODER */
/* WMV2 8-tall vertical lowpass: same (-1,9,9,-1)/16 kernel applied down
 * each column; loads the 11-row column then writes 8 filtered rows.
 * NOTE(review): the per-column loop and closing brace appear elided in
 * this dump. */
2566 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2567 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2571 const int src_1= src[ -srcStride];
2572 const int src0 = src[0 ];
2573 const int src1 = src[ srcStride];
2574 const int src2 = src[2*srcStride];
2575 const int src3 = src[3*srcStride];
2576 const int src4 = src[4*srcStride];
2577 const int src5 = src[5*srcStride];
2578 const int src6 = src[6*srcStride];
2579 const int src7 = src[7*srcStride];
2580 const int src8 = src[8*srcStride];
2581 const int src9 = src[9*srcStride];
2582 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2583 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2584 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2585 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2586 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2587 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2588 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2589 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel MC entry points, built from the h/v lowpass filters above
 * and the pixels8 put/average primitives.  NOTE(review): the local
 * half/halfH/halfV/halfHV buffer declarations and closing braces appear
 * elided in this dump. */
/* (0,0): full-pel copy */
2595 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2596 put_pixels8_c(dst, src, stride, 8);
/* (1,0): average of src and horizontal half-pel */
2599 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2601 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2602 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
/* (2,0): horizontal half-pel directly */
2605 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2606 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* (3,0): average of src+1 and horizontal half-pel */
2609 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2611 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2612 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
/* (0,2): vertical half-pel directly */
2615 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2616 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* (1,2): average of vertical half-pel and 2-D half-pel; halfH is filtered
 * starting one row above so halfH+8 skips that context row */
2619 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2623 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2624 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2625 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2626 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* (3,2): same with the vertical half-pel one column to the right */
2628 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2632 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2633 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2634 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2635 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* (2,2): 2-D half-pel: horizontal pass then vertical pass */
2637 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2639 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2640 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking filter across a horizontal block edge (filters the
 * column of pixels p0..p3 spanning the edge, for each x).  d1 is the
 * piecewise-linear correction of the gradient d, clamped to 0 outside
 * +/-2*strength; p1/p2 are corrected then clipped to 0..255 by the
 * branchless "if(p&256) p = ~(p>>31)" trick (negative -> 0, >255 -> 255).
 * d2 softens the outer pixels p0/p3, bounded by |d1|.
 * NOTE(review): the x loop header, the d1/ad1 applications to p1/p2 and
 * the function closer appear elided in this dump. */
2643 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2645 const int strength= ff_h263_loop_filter_strength[qscale];
2649 int p0= src[x-2*stride];
2650 int p1= src[x-1*stride];
2651 int p2= src[x+0*stride];
2652 int p3= src[x+1*stride];
2653 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2655 if (d<-2*strength) d1= 0;
2656 else if(d<- strength) d1=-2*strength - d;
2657 else if(d< strength) d1= d;
2658 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clip of the corrected inner pixels to 0..255 */
2663 if(p1&256) p1= ~(p1>>31);
2664 if(p2&256) p2= ~(p2>>31);
2666 src[x-1*stride] = p1;
2667 src[x+0*stride] = p2;
/* soften the outer pixels, bounded by the inner correction magnitude */
2671 d2= av_clip((p0-p3)/4, -ad1, ad1);
2673 src[x-2*stride] = p0 - d2;
2674 src[x+ stride] = p3 + d2;
/* H.263 deblocking filter across a vertical block edge — same algorithm
 * as h263_v_loop_filter_c but operating on the row of pixels p0..p3 at
 * src[y*stride-2 .. y*stride+1].  NOTE(review): the y loop header and
 * the d1/ad1 applications appear elided in this dump. */
2678 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2680 const int strength= ff_h263_loop_filter_strength[qscale];
2684 int p0= src[y*stride-2];
2685 int p1= src[y*stride-1];
2686 int p2= src[y*stride+0];
2687 int p3= src[y*stride+1];
2688 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2690 if (d<-2*strength) d1= 0;
2691 else if(d<- strength) d1=-2*strength - d;
2692 else if(d< strength) d1= d;
2693 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clip of the corrected inner pixels to 0..255 */
2698 if(p1&256) p1= ~(p1>>31);
2699 if(p2&256) p2= ~(p2>>31);
2701 src[y*stride-1] = p1;
2702 src[y*stride+0] = p2;
2706 d2= av_clip((p0-p3)/4, -ad1, ad1);
2708 src[y*stride-2] = p0 - d2;
2709 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter over an 8x8 block: separable (1,2,1)/4 smoothing.
 * A vertical (1,2,1) pass fills temp[] (edge rows copied scaled by 4),
 * then a horizontal (1,2,1) pass writes back with >>4 (edge columns with
 * >>2, having only the vertical scaling to undo).  NOTE(review): the
 * x/y loop headers and several temp[] rows appear elided in this dump. */
2713 static void h261_loop_filter_c(uint8_t *src, int stride){
/* top/bottom rows: no vertical neighbors, pre-scale by 4 */
2718 temp[x ] = 4*src[x ];
2719 temp[x + 7*8] = 4*src[x + 7*stride];
2723 xy = y * stride + x;
2725 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
/* left/right columns: only undo the vertical scaling */
2730 src[ y*stride] = (temp[ y*8] + 2)>>2;
2731 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2733 xy = y * stride + x;
2735 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 normal (bS<4) luma deblocking filter.  xstride steps across the
 * edge, ystride along it; tc0[i] gives the clipping bound per group of 4
 * samples (edge group skipped when tc0[i] < 0 — elided here).  Samples
 * are filtered only when the |p0-q0|/|p1-p0|/|q1-q0| thresholds against
 * alpha/beta pass; p1/q1 get an extra correction when their second
 * neighbor is smooth (|p2-p0| resp. |q2-q0| < beta), which also widens
 * tc (elided in this dump — see the tc0[i]+1 logic in the original).
 * NOTE(review): the tc initialization, tc0[i]<0 skip and pix advance
 * appear elided in this dump. */
2740 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2743 for( i = 0; i < 4; i++ ) {
2748 for( d = 0; d < 4; d++ ) {
2749 const int p0 = pix[-1*xstride];
2750 const int p1 = pix[-2*xstride];
2751 const int p2 = pix[-3*xstride];
2752 const int q0 = pix[0];
2753 const int q1 = pix[1*xstride];
2754 const int q2 = pix[2*xstride];
2756 if( FFABS( p0 - q0 ) < alpha &&
2757 FFABS( p1 - p0 ) < beta &&
2758 FFABS( q1 - q0 ) < beta ) {
/* p1' : only when the p-side is smooth enough */
2763 if( FFABS( p2 - p0 ) < beta ) {
2764 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
/* q1' : only when the q-side is smooth enough */
2767 if( FFABS( q2 - q0 ) < beta ) {
2768 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
/* central correction, clipped to +/-tc */
2772 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2773 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2774 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
/* vertical edge: step across rows */
2780 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2782 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
/* horizontal edge: step across columns */
2784 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2786 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 normal chroma deblocking filter: only p0/q0 are modified, with
 * the delta clipped to +/-tc (tc derived from tc0[i]; groups of 2
 * samples per tc0 entry).  NOTE(review): the tc0[i]<0 skip and pix
 * advance appear elided in this dump. */
2789 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2792 for( i = 0; i < 4; i++ ) {
2793 const int tc = tc0[i];
2798 for( d = 0; d < 2; d++ ) {
2799 const int p0 = pix[-1*xstride];
2800 const int p1 = pix[-2*xstride];
2801 const int q0 = pix[0];
2802 const int q1 = pix[1*xstride];
2804 if( FFABS( p0 - q0 ) < alpha &&
2805 FFABS( p1 - p0 ) < beta &&
2806 FFABS( q1 - q0 ) < beta ) {
2808 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2810 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2811 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
/* vertical / horizontal edge wrappers */
2817 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2819 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2821 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2823 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 strong (intra, bS==4) chroma deblocking: p0/q0 replaced by a
 * fixed (2,1,1)/4 average of their neighbors when the alpha/beta
 * thresholds pass; no tc clipping in this mode.
 * NOTE(review): the pix advance along the edge appears elided. */
2826 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2829 for( d = 0; d < 8; d++ ) {
2830 const int p0 = pix[-1*xstride];
2831 const int p1 = pix[-2*xstride];
2832 const int q0 = pix[0];
2833 const int q1 = pix[1*xstride];
2835 if( FFABS( p0 - q0 ) < alpha &&
2836 FFABS( p1 - p0 ) < beta &&
2837 FFABS( q1 - q0 ) < beta ) {
2839 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2840 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
/* vertical / horizontal edge wrappers */
2845 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2847 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2849 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2851 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* 16-wide SAD (sum of absolute differences) between pix1 and pix2 over
 * h rows; first argument is an unused context pointer required by the
 * DSPContext function-pointer signature.  NOTE(review): the row loop,
 * pointer advances and return appear elided in this dump. */
2854 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2860 s += abs(pix1[0] - pix2[0]);
2861 s += abs(pix1[1] - pix2[1]);
2862 s += abs(pix1[2] - pix2[2]);
2863 s += abs(pix1[3] - pix2[3]);
2864 s += abs(pix1[4] - pix2[4]);
2865 s += abs(pix1[5] - pix2[5]);
2866 s += abs(pix1[6] - pix2[6]);
2867 s += abs(pix1[7] - pix2[7]);
2868 s += abs(pix1[8] - pix2[8]);
2869 s += abs(pix1[9] - pix2[9]);
2870 s += abs(pix1[10] - pix2[10]);
2871 s += abs(pix1[11] - pix2[11]);
2872 s += abs(pix1[12] - pix2[12]);
2873 s += abs(pix1[13] - pix2[13]);
2874 s += abs(pix1[14] - pix2[14]);
2875 s += abs(pix1[15] - pix2[15]);
/* 16-wide SAD against pix2 interpolated at half-pel horizontally
 * (avg2 of each pixel and its right neighbor). */
2882 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2888 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2889 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2890 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2891 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2892 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2893 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2894 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2895 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2896 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2897 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2898 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2899 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2900 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2901 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2902 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2903 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* 16-wide SAD against pix2 interpolated at half-pel vertically
 * (avg2 of each pixel and the one directly below, via pix3). */
2910 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2913 uint8_t *pix3 = pix2 + line_size;
2917 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2918 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2919 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2920 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2921 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2922 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2923 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2924 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2925 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2926 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2927 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2928 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2929 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2930 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2931 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2932 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* 16-wide SAD against pix2 interpolated at half-pel both horizontally
 * and vertically (avg4 of the 2x2 neighborhood). */
2940 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2943 uint8_t *pix3 = pix2 + line_size;
2947 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2948 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2949 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2950 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2951 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2952 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2953 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2954 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2955 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2956 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2957 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2958 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2959 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2960 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2961 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2962 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD between pix1 and pix2 over h rows (8-pixel analogue of
 * pix_abs16_c). */
2970 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2976 s += abs(pix1[0] - pix2[0]);
2977 s += abs(pix1[1] - pix2[1]);
2978 s += abs(pix1[2] - pix2[2]);
2979 s += abs(pix1[3] - pix2[3]);
2980 s += abs(pix1[4] - pix2[4]);
2981 s += abs(pix1[5] - pix2[5]);
2982 s += abs(pix1[6] - pix2[6]);
2983 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD against pix2 interpolated at half-pel horizontally. */
2990 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2996 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2997 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2998 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2999 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3000 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3001 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3002 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3003 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD against pix2 interpolated at half-pel vertically. */
3010 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3013 uint8_t *pix3 = pix2 + line_size;
3017 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3018 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3019 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3020 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3021 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3022 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3023 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3024 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* SAD of an 8-wide block against a diagonally (x+y) half-pel interpolated
 * reference: avg4() averages the 2x2 neighbourhood spanning two rows.
 * NOTE(review): fragment — loop skeleton and return are elided in this view. */
3032 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3035 uint8_t *pix3 = pix2 + line_size;
3039 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3040 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3041 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3042 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3043 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3044 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3045 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3046 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE for a 16-wide block: score1 is the plain squared
 * error, score2 compares the 2x2 gradient (local texture) of the two
 * blocks. The final cost weights |score2| by avctx->nsse_weight (8 when
 * no context is given), so differences in noise/texture are penalized
 * less than differences in structure.
 * NOTE(review): fragment — outer row loop and closers are elided here. */
3054 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3055 MpegEncContext *c = v;
3061 for(x=0; x<16; x++){
3062 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3065 for(x=0; x<15; x++){
3066 score2+= FFABS( s1[x ] - s1[x +stride]
3067 - s1[x+1] + s1[x+1+stride])
3068 -FFABS( s2[x ] - s2[x +stride]
3069 - s2[x+1] + s2[x+1+stride]);
3076 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3077 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c; same squared-error + gradient-difference
 * weighting (see nsse16_c above for the rationale).
 * NOTE(review): fragment — the x-loop headers and row loop are elided. */
3080 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3081 MpegEncContext *c = v;
3088 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3092 score2+= FFABS( s1[x ] - s1[x +stride]
3093 - s1[x+1] + s1[x+1+stride])
3094 -FFABS( s2[x ] - s2[x +stride]
3095 - s2[x+1] + s2[x+1+stride]);
3102 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3103 else return score1 + FFABS(score2)*8;
/* Evaluate the weighted squared error that would result from adding
 * scale*basis (rescaled from BASIS_SHIFT to RECON_SHIFT precision, with
 * rounding) to the residual rem. Used by trellis/basis refinement.
 * The assert documents that b must fit the 10-bit signed range. */
3106 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3110 for(i=0; i<8*8; i++){
3111 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3114 assert(-512<b && b<512);
3116 sum += (w*b)*(w*b)>>4;
/* Commit the update tried by try_8x8basis_c: add the rounded, rescaled
 * scale*basis contribution into the residual in place. */
3121 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3124 for(i=0; i<8*8; i++){
3125 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3130 * permutes an 8x8 block.
3131 * @param block the block which will be permuted according to the given permutation vector
3132 * @param permutation the permutation vector
3133 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3134 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3135 * (inverse) permuted to scantable order!
/* Permute the coefficients of an 8x8 block in place according to
 * permutation[]. Only positions reachable through scantable[0..last] are
 * visited, which is safe because later positions are known to be zero.
 * The first loop (copy into temp) is partially elided in this view;
 * the second loop writes each saved coefficient to its permuted slot. */
3137 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3143 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3145 for(i=0; i<=last; i++){
3146 const int j= scantable[i];
3151 for(i=0; i<=last; i++){
3152 const int j= scantable[i];
3153 const int perm_j= permutation[j];
3154 block[perm_j]= temp[j];
3158 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fill the 5-entry cmp[] function-pointer array with the comparison
 * functions matching the requested metric type (Hadamard SATD, DCT-SAD,
 * H.264 DCT-SAD, DCT-max, quantization PSNR, ...). The switch arms
 * selecting each case are elided in this view; an unknown type falls
 * through to the error log below. */
3162 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3165 memset(cmp, 0, sizeof(void*)*5);
3173 cmp[i]= c->hadamard8_diff[i];
3179 cmp[i]= c->dct_sad[i];
3182 cmp[i]= c->dct264_sad[i];
3185 cmp[i]= c->dct_max[i];
3188 cmp[i]= c->quant_psnr[i];
3208 #ifdef CONFIG_SNOW_ENCODER
3217 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3223 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* Zero six 8x8 DCTELEM blocks (one macroblock's worth) in one memset. */
3225 static void clear_blocks_c(DCTELEM *blocks)
3227 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for w bytes; manually unrolled by 8 with a scalar
 * tail loop (tail loop header elided in this view). Byte arithmetic
 * wraps modulo 256. */
3230 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3232 for(i=0; i+7<w; i+=8){
3233 dst[i+0] += src[i+0];
3234 dst[i+1] += src[i+1];
3235 dst[i+2] += src[i+2];
3236 dst[i+3] += src[i+3];
3237 dst[i+4] += src[i+4];
3238 dst[i+5] += src[i+5];
3239 dst[i+6] += src[i+6];
3240 dst[i+7] += src[i+7];
3243 dst[i+0] += src[i+0];
/* dst[i] = src1[i] - src2[i] for w bytes; unrolled by 8 with a scalar
 * tail (tail loop header elided in this view). Used e.g. for HuffYUV
 * prediction residuals. */
3246 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3248 for(i=0; i+7<w; i+=8){
3249 dst[i+0] = src1[i+0]-src2[i+0];
3250 dst[i+1] = src1[i+1]-src2[i+1];
3251 dst[i+2] = src1[i+2]-src2[i+2];
3252 dst[i+3] = src1[i+3]-src2[i+3];
3253 dst[i+4] = src1[i+4]-src2[i+4];
3254 dst[i+5] = src1[i+5]-src2[i+5];
3255 dst[i+6] = src1[i+6]-src2[i+6];
3256 dst[i+7] = src1[i+7]-src2[i+7];
3259 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction subtraction: the predictor is the median of
 * left (l), top (src1[i]) and left+top-topleft (masked to a byte).
 * NOTE(review): fragment — the loop, the dst store and the write-back of
 * *left / *left_top are elided in this view. */
3262 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3270 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Butterfly helpers for the 8x8 Hadamard transforms below:
 * BUTTERFLY2 writes sum/difference of two inputs to two outputs,
 * BUTTERFLY1 does the same in place on (x,y), and BUTTERFLYA returns
 * |x+y| + |x-y| for the final accumulation. The macro bodies of the
 * first two are elided in this view. */
3280 #define BUTTERFLY2(o1,o2,i1,i2) \
3284 #define BUTTERFLY1(x,y) \
3293 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of an 8x8 block: 2-D Hadamard transform of the src-dst difference,
 * then sum of absolute transform coefficients. First pass: horizontal
 * butterflies per row on the differences; second pass: vertical
 * butterflies per column, accumulating with BUTTERFLYA.
 * NOTE(review): fragment — loop headers, temp declaration and the final
 * return are elided in this view; the printf("MAX...") line is debug code. */
3295 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3303 //FIXME try pointer walks
3304 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3305 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3306 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3307 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3309 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3310 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3311 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3312 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3314 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3315 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3316 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3317 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3321 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3322 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3323 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3324 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3326 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3327 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3328 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3329 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3332 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3333 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3334 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3335 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3341 printf("MAX:%d\n", maxi);
/* Intra SATD: same 2-D Hadamard as hadamard8_diff8x8_c but applied to
 * the source pixels directly (dummy is unused), and with the DC term
 * (|temp[0]+temp[8*4]| here, i.e. the mean contribution) subtracted at
 * the end so only AC energy is scored.
 * NOTE(review): fragment — loop headers, temp declaration and the final
 * return are elided in this view. */
3347 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3355 //FIXME try pointer walks
3356 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3357 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3358 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3359 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3361 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3362 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3363 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3364 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3366 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3367 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3368 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3369 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3373 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3374 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3375 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3376 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3378 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3379 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3380 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3381 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3384 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3385 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3386 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3387 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3390 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: take the pixel difference of the two 8x8 blocks,
 * forward-DCT it (the s->dsp.fdct call is elided in this view) and sum
 * the absolute transform coefficients via sum_abs_dctelem. temp is a
 * 16-byte-aligned DCTELEM[64] viewed through a uint64_t array. */
3395 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3396 MpegEncContext * const s= (MpegEncContext *)c;
3397 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3398 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3402 s->dsp.diff_pixels(temp, src1, src2, stride);
3404 return s->dsp.sum_abs_dctelem(temp);
/* One-dimensional 8-point transform used by dct264_sad8x8_c, expressed
 * via the SRC()/DST() macros defined at each call site. Even half (a0..a3)
 * comes from the symmetric sums s07/s16/s25/s34; odd half (a4..a7) from
 * the antisymmetric differences with the characteristic x + (x>>1)
 * scalings of the H.264-style integer 8x8 transform.
 * NOTE(review): fragment — the enclosing #define line and DST(0)/DST(4)
 * outputs are elided in this view; details should be checked against the
 * full macro before relying on them. */
3409 const int s07 = SRC(0) + SRC(7);\
3410 const int s16 = SRC(1) + SRC(6);\
3411 const int s25 = SRC(2) + SRC(5);\
3412 const int s34 = SRC(3) + SRC(4);\
3413 const int a0 = s07 + s34;\
3414 const int a1 = s16 + s25;\
3415 const int a2 = s07 - s34;\
3416 const int a3 = s16 - s25;\
3417 const int d07 = SRC(0) - SRC(7);\
3418 const int d16 = SRC(1) - SRC(6);\
3419 const int d25 = SRC(2) - SRC(5);\
3420 const int d34 = SRC(3) - SRC(4);\
3421 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3422 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3423 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3424 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3426 DST(1, a4 + (a7>>2)) ;\
3427 DST(2, a2 + (a3>>1)) ;\
3428 DST(3, a5 + (a6>>2)) ;\
3430 DST(5, a6 - (a5>>2)) ;\
3431 DST(6, (a2>>1) - a3 ) ;\
3432 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: difference the blocks, run the 8-point transform
 * (DCT8_1D, invoked via the elided lines) first along rows (SRC/DST map
 * to dct[i][x]) then along columns, where DST accumulates |coefficient|
 * into sum instead of storing. Return of sum is elided in this view. */
3435 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3436 MpegEncContext * const s= (MpegEncContext *)c;
3441 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3443 #define SRC(x) dct[i][x]
3444 #define DST(x,v) dct[i][x]= v
3445 for( i = 0; i < 8; i++ )
3450 #define SRC(x) dct[x][i]
3451 #define DST(x,v) sum += FFABS(v)
3452 for( i = 0; i < 8; i++ )
/* DCT-max metric: forward-DCT the block difference (fdct call elided in
 * this view) and return the largest absolute coefficient — a proxy for
 * worst-case quantization visibility rather than total energy. */
3460 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3461 MpegEncContext * const s= (MpegEncContext *)c;
3462 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3463 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3468 s->dsp.diff_pixels(temp, src1, src2, stride);
3472 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-PSNR metric: DCT the block difference, keep a copy (bak),
 * quantize + dequantize + IDCT it, and return (via the elided return)
 * the squared error between the round-tripped coefficients and the
 * originals — i.e. the distortion the codec's own quantizer would add. */
3477 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3478 MpegEncContext * const s= (MpegEncContext *)c;
3479 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3480 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3481 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3487 s->dsp.diff_pixels(temp, src1, src2, stride);
3489 memcpy(bak, temp, 64*sizeof(DCTELEM));
3491 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3492 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3493 simple_idct(temp); //FIXME
3496 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for an 8x8 block: quantize the DCT of the
 * difference, count the VLC bits needed to code the run/level pairs
 * (escape-coding levels outside the |level|<128 table range via
 * esc_length), dequantize + IDCT back onto a saved copy of src2, and
 * combine the SSE distortion with a lambda-scaled bit cost.
 * NOTE(review): fragment — several loop headers, the run/level update
 * logic and intra/inter branch structure are elided in this view.
 * NOTE(review): 'distoration' is a long-standing typo for 'distortion'
 * (local variable only, so renaming is purely cosmetic). */
3501 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3502 MpegEncContext * const s= (MpegEncContext *)c;
3503 const uint8_t *scantable= s->intra_scantable.permutated;
3504 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3505 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3506 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3507 uint8_t * const bak= (uint8_t*)aligned_bak;
3508 int i, last, run, bits, level, distoration, start_i;
3509 const int esc_length= s->ac_esc_length;
3511 uint8_t * last_length;
3516 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3517 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3520 s->dsp.diff_pixels(temp, src1, src2, stride);
3522 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3528 length = s->intra_ac_vlc_length;
3529 last_length= s->intra_ac_vlc_last_length;
3530 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3533 length = s->inter_ac_vlc_length;
3534 last_length= s->inter_ac_vlc_last_length;
3539 for(i=start_i; i<last; i++){
3540 int j= scantable[i];
3545 if((level&(~127)) == 0){
3546 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3555 level= temp[i] + 64;
3559 if((level&(~127)) == 0){
3560 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3568 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3570 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3573 s->dsp.idct_add(bak, stride, temp);
3575 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3577 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Rate-only metric: identical VLC bit counting to rd8x8_c (quantized DCT
 * of the block difference, run/level table lookups with escape fallback)
 * but without reconstructing or measuring distortion — returns just the
 * estimated bit cost (return elided in this view).
 * NOTE(review): fragment — loop bodies and the intra/inter branch
 * structure are elided here; see rd8x8_c above for the full pattern. */
3580 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3581 MpegEncContext * const s= (MpegEncContext *)c;
3582 const uint8_t *scantable= s->intra_scantable.permutated;
3583 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3584 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3585 int i, last, run, bits, level, start_i;
3586 const int esc_length= s->ac_esc_length;
3588 uint8_t * last_length;
3592 s->dsp.diff_pixels(temp, src1, src2, stride);
3594 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3600 length = s->intra_ac_vlc_length;
3601 last_length= s->intra_ac_vlc_last_length;
3602 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3605 length = s->inter_ac_vlc_length;
3606 last_length= s->inter_ac_vlc_last_length;
3611 for(i=start_i; i<last; i++){
3612 int j= scantable[i];
3617 if((level&(~127)) == 0){
3618 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3627 level= temp[i] + 64;
3631 if((level&(~127)) == 0){
3632 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical SAD of a single 16-wide block against its own next row —
 * measures vertical activity/texture (dummy is unused). Row loop and
 * return are elided in this view. */
3640 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3645 for(x=0; x<16; x+=4){
3646 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride])
3647 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
/* Vertical SAD of the residual (s1-s2): sums |Δrow of the difference|,
 * i.e. how much the prediction error changes between rows. Row loop and
 * return are elided in this view. */
3655 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3660 for(x=0; x<16; x++){
3661 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Squared-error variants of the vsad functions above; SQ(a) = a*a.
 * vsse_intra16_c: vertical SSE of a block against its own next row.
 * Row loop and return are elided in this view. */
3670 #define SQ(a) ((a)*(a))
3671 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3676 for(x=0; x<16; x+=4){
3677 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3678 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* Vertical SSE of the residual (s1-s2) — squared counterpart of vsad16_c.
 * Row loop and return are elided in this view. */
3686 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3691 for(x=0; x<16; x++){
3692 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 array and an int16 array of
 * the same length (signature split across the elided second line; return
 * of score also elided in this view). */
3701 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3705 for(i=0; i<size; i++)
3706 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Generate 16x16 comparison functions from the 8x8 kernels above: the
 * WARPER8_16_SQ macro (sic — historical misspelling of "wrapper") calls
 * the 8x8 function on each of the four quadrants and sums the results.
 * The #ifdef guarding dct264_sad is elided in this view. */
3710 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3711 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3712 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3714 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3716 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3717 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3718 WARPER8_16_SQ(rd8x8_c, rd16_c)
3719 WARPER8_16_SQ(bit8x8_c, bit16_c)
/* Elementwise float multiply-accumulate into dst (the loop body,
 * presumably dst[i] *= src[i], is elided in this view). */
3721 static void vector_fmul_c(float *dst, const float *src, int len){
3723 for(i=0; i<len; i++)
/* dst[i] = src0[i] * src1 read backwards; the negative index works
 * because an elided line first advances src1 to its last element
 * (src1 += len-1 in the original — TODO confirm, that line is not
 * visible in this view). */
3727 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3730 for(i=0; i<len; i++)
3731 dst[i] = src0[i] * src1[-i];
/* Strided fused multiply-add: dst[i*step] = src0[i]*src1[i] + src2[i] +
 * src3, with src3 a scalar int bias added to every element. */
3734 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3736 for(i=0; i<len; i++)
3737 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
/* Convert floats to saturated signed 16-bit samples by manipulating the
 * IEEE-754 bit pattern directly (0x43c0ffff is the bit image of the
 * float just above the representable clipping threshold).
 * NOTE(review): the ((int32_t*)src)[i] cast type-puns a float through an
 * int pointer — a strict-aliasing violation under C99 §6.5; a memcpy or
 * union would be the conforming form. Clamping lines between the
 * comparison and the final store are elided in this view. */
3740 void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
3742 for(i=0; i<len; i++) {
3743 int_fast32_t tmp = ((int32_t*)src)[i];
3745 tmp = (0x43c0ffff - tmp)>>31;
3746 // is this faster on some gcc/cpu combinations?
3747 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3750 dst[i] = tmp - 0x8000;
3754 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* Adapters wrapping the reference (jpeg) IDCT family — full 8x8 and the
 * 4/2/1-point lowres variants — behind the common idct_put / idct_add
 * interface: run the inverse transform (the j_rev_dct* calls are elided
 * in this view) then store or accumulate the clamped pixels. The 1-point
 * versions need no transform: they just scale/round DC ((x+4)>>3) and
 * clamp through ff_cropTbl. */
3756 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3759 put_pixels_clamped_c(block, dest, line_size);
3761 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3764 add_pixels_clamped_c(block, dest, line_size);
3767 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3770 put_pixels_clamped4_c(block, dest, line_size);
3772 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3775 add_pixels_clamped4_c(block, dest, line_size);
3778 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3781 put_pixels_clamped2_c(block, dest, line_size);
3783 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3786 add_pixels_clamped2_c(block, dest, line_size);
3789 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3791 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3793 dest[0] = cm[(block[0] + 4)>>3];
3795 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3797 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3799 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3802 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
3804 /* init static data */
/* One-time initialization of shared lookup tables: ff_cropTbl clamps
 * values to 0..255 (identity in the middle, saturated at both ends —
 * the low-side 0 fill line is elided in this view), ff_squareTbl maps
 * i to (i-256)^2, and inv_zigzag_direct16 is the inverse of
 * ff_zigzag_direct offset by 1 so 0 can mean "unused". */
3805 void dsputil_static_init(void)
3809 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3810 for(i=0;i<MAX_NEG_CROP;i++) {
3812 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3815 for(i=0;i<512;i++) {
3816 ff_squareTbl[i] = (i - 256) * (i - 256);
3819 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Runtime sanity check that the compiler honors 16-byte stack alignment
 * (required by SSE/AltiVec code): take the address of a DECLARE_ALIGNED_16
 * local and test its low 4 bits. On failure it warns once (did_fail
 * guards repetition); the failure return path and the success return are
 * elided in this view. */
3822 int ff_check_alignment(void){
3823 static int did_fail=0;
3824 DECLARE_ALIGNED_16(int, aligned);
3826 if((long)&aligned & 15){
3828 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3829 av_log(NULL, AV_LOG_ERROR,
3830 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3831 "and may be very slow or crash. This is not a bug in libavcodec,\n"
3832 "but in the compiler. Do not report crashes to FFmpeg developers.\n");
3841 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3845 ff_check_alignment();
3847 #ifdef CONFIG_ENCODERS
3848 if(avctx->dct_algo==FF_DCT_FASTINT) {
3849 c->fdct = fdct_ifast;
3850 c->fdct248 = fdct_ifast248;
3852 else if(avctx->dct_algo==FF_DCT_FAAN) {
3853 c->fdct = ff_faandct;
3854 c->fdct248 = ff_faandct248;
3857 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3858 c->fdct248 = ff_fdct248_islow;
3860 #endif //CONFIG_ENCODERS
3862 if(avctx->lowres==1){
3863 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
3864 c->idct_put= ff_jref_idct4_put;
3865 c->idct_add= ff_jref_idct4_add;
3867 c->idct_put= ff_h264_lowres_idct_put_c;
3868 c->idct_add= ff_h264_lowres_idct_add_c;
3870 c->idct = j_rev_dct4;
3871 c->idct_permutation_type= FF_NO_IDCT_PERM;
3872 }else if(avctx->lowres==2){
3873 c->idct_put= ff_jref_idct2_put;
3874 c->idct_add= ff_jref_idct2_add;
3875 c->idct = j_rev_dct2;
3876 c->idct_permutation_type= FF_NO_IDCT_PERM;
3877 }else if(avctx->lowres==3){
3878 c->idct_put= ff_jref_idct1_put;
3879 c->idct_add= ff_jref_idct1_add;
3880 c->idct = j_rev_dct1;
3881 c->idct_permutation_type= FF_NO_IDCT_PERM;
3883 if(avctx->idct_algo==FF_IDCT_INT){
3884 c->idct_put= ff_jref_idct_put;
3885 c->idct_add= ff_jref_idct_add;
3886 c->idct = j_rev_dct;
3887 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3888 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
3889 avctx->idct_algo==FF_IDCT_VP3){
3890 c->idct_put= ff_vp3_idct_put_c;
3891 c->idct_add= ff_vp3_idct_add_c;
3892 c->idct = ff_vp3_idct_c;
3893 c->idct_permutation_type= FF_NO_IDCT_PERM;
3894 }else{ //accurate/default
3895 c->idct_put= simple_idct_put;
3896 c->idct_add= simple_idct_add;
3897 c->idct = simple_idct;
3898 c->idct_permutation_type= FF_NO_IDCT_PERM;
3902 if (ENABLE_H264_DECODER) {
3903 c->h264_idct_add= ff_h264_idct_add_c;
3904 c->h264_idct8_add= ff_h264_idct8_add_c;
3905 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3906 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3909 c->get_pixels = get_pixels_c;
3910 c->diff_pixels = diff_pixels_c;
3911 c->put_pixels_clamped = put_pixels_clamped_c;
3912 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3913 c->add_pixels_clamped = add_pixels_clamped_c;
3914 c->add_pixels8 = add_pixels8_c;
3915 c->add_pixels4 = add_pixels4_c;
3916 c->sum_abs_dctelem = sum_abs_dctelem_c;
3919 c->clear_blocks = clear_blocks_c;
3920 c->pix_sum = pix_sum_c;
3921 c->pix_norm1 = pix_norm1_c;
3923 /* TODO [0] 16 [1] 8 */
3924 c->pix_abs[0][0] = pix_abs16_c;
3925 c->pix_abs[0][1] = pix_abs16_x2_c;
3926 c->pix_abs[0][2] = pix_abs16_y2_c;
3927 c->pix_abs[0][3] = pix_abs16_xy2_c;
3928 c->pix_abs[1][0] = pix_abs8_c;
3929 c->pix_abs[1][1] = pix_abs8_x2_c;
3930 c->pix_abs[1][2] = pix_abs8_y2_c;
3931 c->pix_abs[1][3] = pix_abs8_xy2_c;
3933 #define dspfunc(PFX, IDX, NUM) \
3934 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3935 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3936 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3937 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3939 dspfunc(put, 0, 16);
3940 dspfunc(put_no_rnd, 0, 16);
3942 dspfunc(put_no_rnd, 1, 8);
3946 dspfunc(avg, 0, 16);
3947 dspfunc(avg_no_rnd, 0, 16);
3949 dspfunc(avg_no_rnd, 1, 8);
3954 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3955 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3957 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3958 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3959 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3960 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3961 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3962 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3963 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3964 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3965 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3967 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3968 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3969 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3970 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3971 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3972 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3973 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3974 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3975 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3977 #define dspfunc(PFX, IDX, NUM) \
3978 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3979 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3980 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3981 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3982 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3983 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3984 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3985 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3986 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3987 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3988 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3989 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3990 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3991 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3992 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3993 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3995 dspfunc(put_qpel, 0, 16);
3996 dspfunc(put_no_rnd_qpel, 0, 16);
3998 dspfunc(avg_qpel, 0, 16);
3999 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4001 dspfunc(put_qpel, 1, 8);
4002 dspfunc(put_no_rnd_qpel, 1, 8);
4004 dspfunc(avg_qpel, 1, 8);
4005 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4007 dspfunc(put_h264_qpel, 0, 16);
4008 dspfunc(put_h264_qpel, 1, 8);
4009 dspfunc(put_h264_qpel, 2, 4);
4010 dspfunc(put_h264_qpel, 3, 2);
4011 dspfunc(avg_h264_qpel, 0, 16);
4012 dspfunc(avg_h264_qpel, 1, 8);
4013 dspfunc(avg_h264_qpel, 2, 4);
4016 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4017 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4018 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4019 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4020 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4021 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4022 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4024 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4025 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4026 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4027 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4028 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4029 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4030 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4031 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4032 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4033 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4034 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4035 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4036 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4037 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4038 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4039 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4040 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4041 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4042 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4043 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4045 #ifdef CONFIG_CAVS_DECODER
4046 ff_cavsdsp_init(c,avctx);
4048 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4049 ff_vc1dsp_init(c,avctx);
4051 #if defined(CONFIG_H264_ENCODER)
4052 ff_h264dsp_init(c,avctx);
4055 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4056 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4057 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4058 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4059 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4060 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4061 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4062 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4064 #define SET_CMP_FUNC(name) \
4065 c->name[0]= name ## 16_c;\
4066 c->name[1]= name ## 8x8_c;
4068 SET_CMP_FUNC(hadamard8_diff)
4069 c->hadamard8_diff[4]= hadamard8_intra16_c;
4070 SET_CMP_FUNC(dct_sad)
4071 SET_CMP_FUNC(dct_max)
4073 SET_CMP_FUNC(dct264_sad)
4075 c->sad[0]= pix_abs16_c;
4076 c->sad[1]= pix_abs8_c;
4080 SET_CMP_FUNC(quant_psnr)
4083 c->vsad[0]= vsad16_c;
4084 c->vsad[4]= vsad_intra16_c;
4085 c->vsse[0]= vsse16_c;
4086 c->vsse[4]= vsse_intra16_c;
4087 c->nsse[0]= nsse16_c;
4088 c->nsse[1]= nsse8_c;
4089 #ifdef CONFIG_SNOW_ENCODER
4090 c->w53[0]= w53_16_c;
4092 c->w97[0]= w97_16_c;
4096 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4098 c->add_bytes= add_bytes_c;
4099 c->diff_bytes= diff_bytes_c;
4100 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4101 c->bswap_buf= bswap_buf;
4103 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4104 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4105 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4106 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4107 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4108 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4109 c->h264_loop_filter_strength= NULL;
4111 if (ENABLE_ANY_H263) {
4112 c->h263_h_loop_filter= h263_h_loop_filter_c;
4113 c->h263_v_loop_filter= h263_v_loop_filter_c;
4116 c->h261_loop_filter= h261_loop_filter_c;
4118 c->try_8x8basis= try_8x8basis_c;
4119 c->add_8x8basis= add_8x8basis_c;
4121 #ifdef CONFIG_SNOW_DECODER
4122 c->vertical_compose97i = ff_snow_vertical_compose97i;
4123 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4124 c->inner_add_yblock = ff_snow_inner_add_yblock;
4127 #ifdef CONFIG_VORBIS_DECODER
4128 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4130 c->vector_fmul = vector_fmul_c;
4131 c->vector_fmul_reverse = vector_fmul_reverse_c;
4132 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4133 c->float_to_int16 = ff_float_to_int16_c;
4135 c->shrink[0]= ff_img_copy_plane;
4136 c->shrink[1]= ff_shrink22;
4137 c->shrink[2]= ff_shrink44;
4138 c->shrink[3]= ff_shrink88;
4140 c->prefetch= just_return;
4142 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4143 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4145 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4146 if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
4147 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4148 if (ENABLE_SPARC) dsputil_init_vis (c, avctx);
4149 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4150 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4151 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4152 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4153 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
4155 for(i=0; i<64; i++){
4156 if(!c->put_2tap_qpel_pixels_tab[0][i])
4157 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4158 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4159 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4162 switch(c->idct_permutation_type){
4163 case FF_NO_IDCT_PERM:
4165 c->idct_permutation[i]= i;
4167 case FF_LIBMPEG2_IDCT_PERM:
4169 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4171 case FF_SIMPLE_IDCT_PERM:
4173 c->idct_permutation[i]= simple_mmx_permutation[i];
4175 case FF_TRANSPOSE_IDCT_PERM:
4177 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4179 case FF_PARTTRANS_IDCT_PERM:
4181 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4184 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");