3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
32 #include "mpegvideo.h"
33 #include "simple_idct.h"
/* NOTE(review): this extract is a sampled view of the file — each line carries
 * a fused original line number and many interior lines are missing. Code below
 * is kept byte-identical; comments only. */
/* Forward declaration: spatial discrete wavelet transform (Snow codec). */
38 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
/* Forward declaration: Vorbis magnitude/angle channel decoupling. */
41 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
/* Clipping LUT; zero-initialized here — presumably filled during init, verify. */
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square LUT (512 entries); zero-initialized here — presumably filled during
 * init and accessed via ff_squareTbl + 256 (see pix_norm1_c/sse*_c below). */
44 uint32_t ff_squareTbl[512] = {0, };
/* Standard 8x8 zigzag scan order (low-frequency coefficients first).
 * NOTE(review): the closing "};" is not visible in this extract — lines
 * appear to be missing. */
46 const uint8_t ff_zigzag_direct[64] = {
47 0, 1, 8, 16, 9, 2, 3, 10,
48 17, 24, 32, 25, 18, 11, 4, 5,
49 12, 19, 26, 33, 40, 48, 41, 34,
50 27, 20, 13, 6, 7, 14, 21, 28,
51 35, 42, 49, 56, 57, 50, 43, 36,
52 29, 22, 15, 23, 30, 37, 44, 51,
53 58, 59, 52, 45, 38, 31, 39, 46,
54 53, 60, 61, 54, 47, 55, 62, 63
57 /* Specific zigzag scan for 248 idct. NOTE that unlike the
58 specification, we interleave the fields */
59 const uint8_t ff_zigzag248_direct[64] = {
60 0, 8, 1, 9, 16, 24, 2, 10,
61 17, 25, 32, 40, 48, 56, 33, 41,
62 18, 26, 3, 11, 4, 12, 19, 27,
63 34, 42, 49, 57, 50, 58, 35, 43,
64 20, 28, 5, 13, 6, 14, 21, 29,
65 36, 44, 51, 59, 52, 60, 37, 45,
66 22, 30, 7, 15, 23, 31, 38, 46,
67 53, 61, 54, 62, 39, 47, 55, 63,
70 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
71 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate horizontal scan order for interlaced/alternate-scan coding.
 * NOTE(review): closing "};" lines not visible in this extract. */
73 const uint8_t ff_alternate_horizontal_scan[64] = {
74 0, 1, 2, 3, 8, 9, 16, 17,
75 10, 11, 4, 5, 6, 7, 15, 14,
76 13, 12, 19, 18, 24, 25, 32, 33,
77 26, 27, 20, 21, 22, 23, 28, 29,
78 30, 31, 34, 35, 40, 41, 48, 49,
79 42, 43, 36, 37, 38, 39, 44, 45,
80 46, 47, 50, 51, 56, 57, 58, 59,
81 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order (column-biased counterpart of the above). */
84 const uint8_t ff_alternate_vertical_scan[64] = {
85 0, 8, 16, 24, 1, 9, 2, 10,
86 17, 25, 32, 40, 48, 56, 57, 49,
87 41, 33, 26, 18, 3, 11, 4, 12,
88 19, 27, 34, 42, 50, 58, 35, 43,
89 51, 59, 20, 28, 5, 13, 6, 14,
90 21, 29, 36, 44, 52, 60, 37, 45,
91 53, 61, 22, 30, 7, 15, 23, 31,
92 38, 46, 54, 62, 39, 47, 55, 63,
/* Fixed-point reciprocal table: entry b holds ceil(2^32/b) so that
 * (a * ff_inverse[b]) >> 32 == a / b for the stated range (see comment). */
95 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
96 const uint32_t ff_inverse[256]={
97 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
98 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
99 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
100 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
101 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
102 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
103 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
104 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
105 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
106 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
107 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
108 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
109 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
110 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
111 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
112 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
113 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
114 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
115 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
116 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
117 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
118 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
119 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
120 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
121 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
122 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
123 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
124 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
125 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
126 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
127 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
128 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
/* Coefficient input permutation required by the MMX simple IDCT.
 * NOTE(review): closing "};" not visible in this extract. */
131 /* Input permutation for the simple_idct_mmx */
132 static const uint8_t simple_mmx_permutation[64]={
133 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
134 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
135 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
136 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
137 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
138 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
139 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
140 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Sum of pixel values over a 16x16 block (loop bounds visible below).
 * NOTE(review): body is incomplete in this extract — accumulator
 * declarations, the inner accumulation statements, closing braces and
 * the return are missing. Do not treat as compilable. */
143 static int pix_sum_c(uint8_t * pix, int line_size)
148     for (i = 0; i < 16; i++) {
149         for (j = 0; j < 16; j += 8) {
/* advance to the next row: line_size minus the 16 columns consumed */
160         pix += line_size - 16;
/* Sum of squared pixel values over a 16x16 block, using the centered
 * square LUT (sq = ff_squareTbl + 256). Reads pixels 8 at a time via a
 * 64-bit load on LP64 targets, else two 32-bit loads.
 * NOTE(review): body is incomplete in this extract — declarations, the
 * s += sq[x&0xff] terms, #else/#endif lines, braces and return are
 * missing. The unaligned casts here rely on platform tolerance —
 * strictly they are UB; the real file guards this, verify. */
165 static int pix_norm1_c(uint8_t * pix, int line_size)
168     uint32_t *sq = ff_squareTbl + 256;
171     for (i = 0; i < 16; i++) {
172         for (j = 0; j < 16; j += 8) {
183 #if LONG_MAX > 2147483647
184             register uint64_t x=*(uint64_t*)pix;
186             s += sq[(x>>8)&0xff];
187             s += sq[(x>>16)&0xff];
188             s += sq[(x>>24)&0xff];
189             s += sq[(x>>32)&0xff];
190             s += sq[(x>>40)&0xff];
191             s += sq[(x>>48)&0xff];
192             s += sq[(x>>56)&0xff];
194             register uint32_t x=*(uint32_t*)pix;
196             s += sq[(x>>8)&0xff];
197             s += sq[(x>>16)&0xff];
198             s += sq[(x>>24)&0xff];
199             x=*(uint32_t*)(pix+4);
201             s += sq[(x>>8)&0xff];
202             s += sq[(x>>16)&0xff];
203             s += sq[(x>>24)&0xff];
208         pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst; main loop is unrolled 8x,
 * with a tail loop for the remaining (w & 7) words.
 * NOTE(review): the tail-loop header, closing braces and declarations
 * are missing from this extract. */
213 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
216     for(i=0; i+8<=w; i+=8){
217         dst[i+0]= bswap_32(src[i+0]);
218         dst[i+1]= bswap_32(src[i+1]);
219         dst[i+2]= bswap_32(src[i+2]);
220         dst[i+3]= bswap_32(src[i+3]);
221         dst[i+4]= bswap_32(src[i+4]);
222         dst[i+5]= bswap_32(src[i+5]);
223         dst[i+6]= bswap_32(src[i+6]);
224         dst[i+7]= bswap_32(src[i+7]);
/* tail: swap remaining words one at a time (loop header missing here) */
227         dst[i+0]= bswap_32(src[i+0]);
/* Sum of squared errors over a 4-pixel-wide strip of height h.
 * sq is the centered square LUT so negative differences index correctly.
 * NOTE(review): declarations, pointer advance, braces and return are
 * missing from this extract. */
231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
234     uint32_t *sq = ff_squareTbl + 256;
237     for (i = 0; i < h; i++) {
238         s += sq[pix1[0] - pix2[0]];
239         s += sq[pix1[1] - pix2[1]];
240         s += sq[pix1[2] - pix2[2]];
241         s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors over an 8-pixel-wide strip of height h
 * (unrolled version of sse4_c for width 8).
 * NOTE(review): declarations, pointer advance, braces and return are
 * missing from this extract. */
248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
251     uint32_t *sq = ff_squareTbl + 256;
254     for (i = 0; i < h; i++) {
255         s += sq[pix1[0] - pix2[0]];
256         s += sq[pix1[1] - pix2[1]];
257         s += sq[pix1[2] - pix2[2]];
258         s += sq[pix1[3] - pix2[3]];
259         s += sq[pix1[4] - pix2[4]];
260         s += sq[pix1[5] - pix2[5]];
261         s += sq[pix1[6] - pix2[6]];
262         s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors over a 16-pixel-wide strip of height h
 * (fully unrolled row).
 * NOTE(review): declarations, pointer advance, braces and return are
 * missing from this extract. */
269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
272     uint32_t *sq = ff_squareTbl + 256;
275     for (i = 0; i < h; i++) {
276         s += sq[pix1[ 0] - pix2[ 0]];
277         s += sq[pix1[ 1] - pix2[ 1]];
278         s += sq[pix1[ 2] - pix2[ 2]];
279         s += sq[pix1[ 3] - pix2[ 3]];
280         s += sq[pix1[ 4] - pix2[ 4]];
281         s += sq[pix1[ 5] - pix2[ 5]];
282         s += sq[pix1[ 6] - pix2[ 6]];
283         s += sq[pix1[ 7] - pix2[ 7]];
284         s += sq[pix1[ 8] - pix2[ 8]];
285         s += sq[pix1[ 9] - pix2[ 9]];
286         s += sq[pix1[10] - pix2[10]];
287         s += sq[pix1[11] - pix2[11]];
288         s += sq[pix1[12] - pix2[12]];
289         s += sq[pix1[13] - pix2[13]];
290         s += sq[pix1[14] - pix2[14]];
291         s += sq[pix1[15] - pix2[15]];
/* Wavelet-domain distortion metric: difference pix1-pix2 is transformed
 * with ff_spatial_dwt, then subband coefficients are weighted by the
 * scale[][][][] tables (type selects 9/7 vs 5/3 per the comments).
 * NOTE(review): large portions of this function are missing from the
 * extract — most scale[] rows, the accumulation of v, normalization and
 * return, plus the tmp[] declaration. Comments only; do not treat as
 * complete. */
300 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
301 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
/* dec_count: 3 decomposition levels for 8x8, 4 for 16x16/32x32 */
303     const int dec_count= w==8 ? 3 : 4;
306     static const int scale[2][2][4][4]={
310             {268, 239, 239, 213},
314             // 9/7 16x16 or 32x32 dec=4
315             {344, 310, 310, 280},
323             {275, 245, 245, 218},
327             // 5/3 16x16 or 32x32 dec=4
328             {352, 317, 317, 286},
/* build the difference block, scaled by 16 for fixed-point headroom */
336     for (i = 0; i < h; i++) {
337         for (j = 0; j < w; j+=4) {
338             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
339             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
340             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
341             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
347     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
/* walk each decomposition level and orientation (LL only at level 0) */
351     for(level=0; level<dec_count; level++){
352         for(ori= level ? 1 : 0; ori<4; ori++){
353             int size= w>>(dec_count-level);
354             int sx= (ori&1) ? size : 0;
355             int stride= 32<<(dec_count-level);
356             int sy= (ori&2) ? stride>>1 : 0;
358             for(i=0; i<size; i++){
359                 for(j=0; j<size; j++){
360                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* thin wrappers binding width and wavelet type (1 = 5/3, 0 = 9/7) */
370 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
371     return w_c(v, pix1, pix2, line_size, 8, h, 1);
374 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
375     return w_c(v, pix1, pix2, line_size, 8, h, 0);
378 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
379     return w_c(v, pix1, pix2, line_size, 16, h, 1);
382 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
383     return w_c(v, pix1, pix2, line_size, 16, h, 0);
386 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
387     return w_c(v, pix1, pix2, line_size, 32, h, 1);
390 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
391     return w_c(v, pix1, pix2, line_size, 32, h, 0);
/* Widen one 8-pixel row of uint8_t into DCTELEM block storage.
 * NOTE(review): the enclosing 8-row loop, pointer advances, braces and
 * declarations are missing from this extract. */
395 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
399     /* read the pixels */
401         block[0] = pixels[0];
402         block[1] = pixels[1];
403         block[2] = pixels[2];
404         block[3] = pixels[3];
405         block[4] = pixels[4];
406         block[5] = pixels[5];
407         block[6] = pixels[6];
408         block[7] = pixels[7];
/* Store the per-pixel difference s1 - s2 of one 8-pixel row into block.
 * NOTE(review): the enclosing row loop, pointer advances, braces and
 * declarations are missing from this extract. */
414 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
415                           const uint8_t *s2, int stride){
418     /* read the pixels */
420         block[0] = s1[0] - s2[0];
421         block[1] = s1[1] - s2[1];
422         block[2] = s1[2] - s2[2];
423         block[3] = s1[3] - s2[3];
424         block[4] = s1[4] - s2[4];
425         block[5] = s1[5] - s2[5];
426         block[6] = s1[6] - s2[6];
427         block[7] = s1[7] - s2[7];
/* Write DCTELEM coefficients to pixels, clamped to [0,255] via the
 * centered crop table (cm = ff_cropTbl + MAX_NEG_CROP). The 8/4/2
 * variants differ only in row width.
 * NOTE(review): the row loops, pointer advances, braces and second
 * parameter (line_size) are missing from this extract. */
435 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
439     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
441     /* read the pixels */
443         pixels[0] = cm[block[0]];
444         pixels[1] = cm[block[1]];
445         pixels[2] = cm[block[2]];
446         pixels[3] = cm[block[3]];
447         pixels[4] = cm[block[4]];
448         pixels[5] = cm[block[5]];
449         pixels[6] = cm[block[6]];
450         pixels[7] = cm[block[7]];
/* 4-pixel-wide variant */
457 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
461     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
463     /* read the pixels */
465         pixels[0] = cm[block[0]];
466         pixels[1] = cm[block[1]];
467         pixels[2] = cm[block[2]];
468         pixels[3] = cm[block[3]];
/* 2-pixel-wide variant */
475 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
479     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
481     /* read the pixels */
483         pixels[0] = cm[block[0]];
484         pixels[1] = cm[block[1]];
/* Write signed coefficients as pixels biased by +128, clamping the
 * result to [0,255] (the visible branch handles *block > 127).
 * NOTE(review): the low-side clamp branch, the per-pixel increments,
 * braces and the line_size parameter are missing from this extract. */
491 static void put_signed_pixels_clamped_c(const DCTELEM *block,
492                                         uint8_t *restrict pixels,
497     for (i = 0; i < 8; i++) {
498         for (j = 0; j < 8; j++) {
501             else if (*block > 127)
504                 *pixels = (uint8_t)(*block + 128);
508         pixels += (line_size - 8);
/* Add DCTELEM coefficients to existing pixels with clamping to [0,255]
 * through the centered crop table. The 8/4/2 variants differ only in
 * row width.
 * NOTE(review): row loops, pointer advances, braces and the line_size
 * parameter are missing from this extract. */
512 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
516     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
518     /* read the pixels */
520         pixels[0] = cm[pixels[0] + block[0]];
521         pixels[1] = cm[pixels[1] + block[1]];
522         pixels[2] = cm[pixels[2] + block[2]];
523         pixels[3] = cm[pixels[3] + block[3]];
524         pixels[4] = cm[pixels[4] + block[4]];
525         pixels[5] = cm[pixels[5] + block[5]];
526         pixels[6] = cm[pixels[6] + block[6]];
527         pixels[7] = cm[pixels[7] + block[7]];
/* 4-pixel-wide variant */
533 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
537     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
539     /* read the pixels */
541         pixels[0] = cm[pixels[0] + block[0]];
542         pixels[1] = cm[pixels[1] + block[1]];
543         pixels[2] = cm[pixels[2] + block[2]];
544         pixels[3] = cm[pixels[3] + block[3]];
/* 2-pixel-wide variant */
550 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
554     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
556     /* read the pixels */
558         pixels[0] = cm[pixels[0] + block[0]];
559         pixels[1] = cm[pixels[1] + block[1]];
/* Add coefficients to pixels WITHOUT clamping (8- and 4-wide variants).
 * NOTE(review): row loops, pointer advances and braces are missing from
 * this extract. */
565 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
569         pixels[0] += block[0];
570         pixels[1] += block[1];
571         pixels[2] += block[2];
572         pixels[3] += block[3];
573         pixels[4] += block[4];
574         pixels[5] += block[5];
575         pixels[6] += block[6];
576         pixels[7] += block[7];
/* 4-pixel-wide variant */
582 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
586         pixels[0] += block[0];
587         pixels[1] += block[1];
588         pixels[2] += block[2];
589         pixels[3] += block[3];
/* Sum of absolute values of DCT coefficients.
 * NOTE(review): loop header, declarations, braces and return are
 * missing from this extract. */
595 static int sum_abs_dctelem_c(DCTELEM *block)
599         sum+= FFABS(block[i]);
/* PIXOP2, 64-bit-register variant: generates put/avg pixel copy and
 * half-pel interpolation primitives operating on whole 8-byte rows.
 * x2/y2 average two sources with the (a&b)+(((a^b)&0xFE..FE)>>1)
 * (no-rnd) or (a|b)-... (rounding) SWAR trick; xy2 averages four
 * neighbours via the 0x03/0xFC low/high 2-bit split.
 * NOTE(review): this extract omits many continuation lines of the macro
 * (loop headers, pixel/block pointer advances, closing braces), so the
 * macro text below is NOT complete. No comments are inserted inside the
 * body because every line is a backslash continuation. */
605 #define PIXOP2(OPNAME, OP) \
606 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
610         OP(*((uint64_t*)block), LD64(pixels));\
616 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
620         const uint64_t a= LD64(pixels  );\
621         const uint64_t b= LD64(pixels+1);\
622         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
628 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
632         const uint64_t a= LD64(pixels  );\
633         const uint64_t b= LD64(pixels+1);\
634         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
640 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
644         const uint64_t a= LD64(pixels          );\
645         const uint64_t b= LD64(pixels+line_size);\
646         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
652 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
656         const uint64_t a= LD64(pixels          );\
657         const uint64_t b= LD64(pixels+line_size);\
658         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
664 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
667         const uint64_t a= LD64(pixels  );\
668         const uint64_t b= LD64(pixels+1);\
669         uint64_t l0=  (a&0x0303030303030303ULL)\
670                     + (b&0x0303030303030303ULL)\
671                     + 0x0202020202020202ULL;\
672         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
673                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
677         for(i=0; i<h; i+=2){\
678             uint64_t a= LD64(pixels  );\
679             uint64_t b= LD64(pixels+1);\
680             l1=  (a&0x0303030303030303ULL)\
681                + (b&0x0303030303030303ULL);\
682             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
683               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
684             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
689             l0=  (a&0x0303030303030303ULL)\
690                + (b&0x0303030303030303ULL)\
691                + 0x0202020202020202ULL;\
692             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
693               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
694             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
700 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
703         const uint64_t a= LD64(pixels  );\
704         const uint64_t b= LD64(pixels+1);\
705         uint64_t l0=  (a&0x0303030303030303ULL)\
706                     + (b&0x0303030303030303ULL)\
707                     + 0x0101010101010101ULL;\
708         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
709                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
713         for(i=0; i<h; i+=2){\
714             uint64_t a= LD64(pixels  );\
715             uint64_t b= LD64(pixels+1);\
716             l1=  (a&0x0303030303030303ULL)\
717                + (b&0x0303030303030303ULL);\
718             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
719               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
720             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
725             l0=  (a&0x0303030303030303ULL)\
726                + (b&0x0303030303030303ULL)\
727                + 0x0101010101010101ULL;\
728             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
729               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
730             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
736 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels_c  , 8)\
737 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
738 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
739 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
740 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
741 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
742 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
744 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
745 #else // 64 bit variant
/* PIXOP2, 32-bit-register variant: generates pixels{2,4,8,16} copy,
 * 2-source (_l2) and 4-source (_l4) averaging, and half-pel x2/y2/xy2
 * primitives using rnd_avg32/no_rnd_avg32 and the 0x03/0xFC SWAR split.
 * CALL_2X_PIXELS at the end derives the 16-wide entry points.
 * NOTE(review): this extract omits many continuation lines of the macro
 * (loop headers, l0/l1 second terms, pointer advances, closing braces),
 * so the macro text below is NOT complete. No comments are inserted
 * inside the body because every line is a backslash continuation. */
747 #define PIXOP2(OPNAME, OP) \
748 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
751         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
756 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
759         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
764 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
767         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
768         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
773 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
774     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
777 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
778                                                 int src_stride1, int src_stride2, int h){\
782         a= LD32(&src1[i*src_stride1  ]);\
783         b= LD32(&src2[i*src_stride2  ]);\
784         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
785         a= LD32(&src1[i*src_stride1+4]);\
786         b= LD32(&src2[i*src_stride2+4]);\
787         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
791 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
792                                                 int src_stride1, int src_stride2, int h){\
796         a= LD32(&src1[i*src_stride1  ]);\
797         b= LD32(&src2[i*src_stride2  ]);\
798         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
799         a= LD32(&src1[i*src_stride1+4]);\
800         b= LD32(&src2[i*src_stride2+4]);\
801         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
805 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
806                                                 int src_stride1, int src_stride2, int h){\
810         a= LD32(&src1[i*src_stride1  ]);\
811         b= LD32(&src2[i*src_stride2  ]);\
812         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
816 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
817                                                 int src_stride1, int src_stride2, int h){\
821         a= LD16(&src1[i*src_stride1  ]);\
822         b= LD16(&src2[i*src_stride2  ]);\
823         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
827 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
828                                                 int src_stride1, int src_stride2, int h){\
829     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
830     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
833 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
834                                                 int src_stride1, int src_stride2, int h){\
835     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
836     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
839 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
840     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
843 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
844     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
847 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
848     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
851 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
852     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
855 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
856                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
859         uint32_t a, b, c, d, l0, l1, h0, h1;\
860         a= LD32(&src1[i*src_stride1]);\
861         b= LD32(&src2[i*src_stride2]);\
862         c= LD32(&src3[i*src_stride3]);\
863         d= LD32(&src4[i*src_stride4]);\
864         l0=  (a&0x03030303UL)\
867         h0= ((a&0xFCFCFCFCUL)>>2)\
868           + ((b&0xFCFCFCFCUL)>>2);\
869         l1=  (c&0x03030303UL)\
871         h1= ((c&0xFCFCFCFCUL)>>2)\
872           + ((d&0xFCFCFCFCUL)>>2);\
873         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
874         a= LD32(&src1[i*src_stride1+4]);\
875         b= LD32(&src2[i*src_stride2+4]);\
876         c= LD32(&src3[i*src_stride3+4]);\
877         d= LD32(&src4[i*src_stride4+4]);\
878         l0=  (a&0x03030303UL)\
881         h0= ((a&0xFCFCFCFCUL)>>2)\
882           + ((b&0xFCFCFCFCUL)>>2);\
883         l1=  (c&0x03030303UL)\
885         h1= ((c&0xFCFCFCFCUL)>>2)\
886           + ((d&0xFCFCFCFCUL)>>2);\
887         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
891 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
892     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
895 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
899 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
900     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
903 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
904     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
907 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
908                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
911         uint32_t a, b, c, d, l0, l1, h0, h1;\
912         a= LD32(&src1[i*src_stride1]);\
913         b= LD32(&src2[i*src_stride2]);\
914         c= LD32(&src3[i*src_stride3]);\
915         d= LD32(&src4[i*src_stride4]);\
916         l0=  (a&0x03030303UL)\
919         h0= ((a&0xFCFCFCFCUL)>>2)\
920           + ((b&0xFCFCFCFCUL)>>2);\
921         l1=  (c&0x03030303UL)\
923         h1= ((c&0xFCFCFCFCUL)>>2)\
924           + ((d&0xFCFCFCFCUL)>>2);\
925         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
926         a= LD32(&src1[i*src_stride1+4]);\
927         b= LD32(&src2[i*src_stride2+4]);\
928         c= LD32(&src3[i*src_stride3+4]);\
929         d= LD32(&src4[i*src_stride4+4]);\
930         l0=  (a&0x03030303UL)\
933         h0= ((a&0xFCFCFCFCUL)>>2)\
934           + ((b&0xFCFCFCFCUL)>>2);\
935         l1=  (c&0x03030303UL)\
937         h1= ((c&0xFCFCFCFCUL)>>2)\
938           + ((d&0xFCFCFCFCUL)>>2);\
939         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
942 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
943                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
944     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
945     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
947 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
948                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
949     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
950     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
953 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
955     int i, a0, b0, a1, b1;\
962     for(i=0; i<h; i+=2){\
968         block[0]= (a1+a0)>>2; /* FIXME non put */\
969         block[1]= (b1+b0)>>2;\
979         block[0]= (a1+a0)>>2;\
980         block[1]= (b1+b0)>>2;\
986 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
989         const uint32_t a= LD32(pixels  );\
990         const uint32_t b= LD32(pixels+1);\
991         uint32_t l0=  (a&0x03030303UL)\
994         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
995                    + ((b&0xFCFCFCFCUL)>>2);\
999         for(i=0; i<h; i+=2){\
1000             uint32_t a= LD32(pixels  );\
1001             uint32_t b= LD32(pixels+1);\
1002             l1=  (a&0x03030303UL)\
1003                + (b&0x03030303UL);\
1004             h1= ((a&0xFCFCFCFCUL)>>2)\
1005               + ((b&0xFCFCFCFCUL)>>2);\
1006             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1011             l0=  (a&0x03030303UL)\
1014             h0= ((a&0xFCFCFCFCUL)>>2)\
1015               + ((b&0xFCFCFCFCUL)>>2);\
1016             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1022 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1025     for(j=0; j<2; j++){\
1027         const uint32_t a= LD32(pixels  );\
1028         const uint32_t b= LD32(pixels+1);\
1029         uint32_t l0=  (a&0x03030303UL)\
1032         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1033                    + ((b&0xFCFCFCFCUL)>>2);\
1037         for(i=0; i<h; i+=2){\
1038             uint32_t a= LD32(pixels  );\
1039             uint32_t b= LD32(pixels+1);\
1040             l1=  (a&0x03030303UL)\
1041                + (b&0x03030303UL);\
1042             h1= ((a&0xFCFCFCFCUL)>>2)\
1043               + ((b&0xFCFCFCFCUL)>>2);\
1044             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1049             l0=  (a&0x03030303UL)\
1052             h0= ((a&0xFCFCFCFCUL)>>2)\
1053               + ((b&0xFCFCFCFCUL)>>2);\
1054             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1058         pixels+=4-line_size*(h+1);\
1059         block +=4-line_size*h;\
1063 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1066     for(j=0; j<2; j++){\
1068         const uint32_t a= LD32(pixels  );\
1069         const uint32_t b= LD32(pixels+1);\
1070         uint32_t l0=  (a&0x03030303UL)\
1073         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1074                    + ((b&0xFCFCFCFCUL)>>2);\
1078         for(i=0; i<h; i+=2){\
1079             uint32_t a= LD32(pixels  );\
1080             uint32_t b= LD32(pixels+1);\
1081             l1=  (a&0x03030303UL)\
1082                + (b&0x03030303UL);\
1083             h1= ((a&0xFCFCFCFCUL)>>2)\
1084               + ((b&0xFCFCFCFCUL)>>2);\
1085             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1090             l0=  (a&0x03030303UL)\
1093             h0= ((a&0xFCFCFCFCUL)>>2)\
1094               + ((b&0xFCFCFCFCUL)>>2);\
1095             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1099         pixels+=4-line_size*(h+1);\
1100         block +=4-line_size*h;\
1104 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1105 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1106 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1107 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1108 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1109 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1110 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1111 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1113 #define op_avg(a, b) a = rnd_avg32(a, b)
1115 #define op_put(a, b) a = b
1122 #define avg2(a,b) ((a+b+1)>>1)
1123 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Uniform-stride wrappers around the macro-generated _l2 averagers.
 * NOTE(review): closing braces not visible in this extract. */
1125 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1126     put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1129 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1130     put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/* One-warp-point GMC: bilinear interpolation with 1/16-pel weights.
 * A..D are the four corner weights (sum = 256, hence the >>8).
 * NOTE(review): the row loop, src/dst advances, braces and any width
 * handling are missing from this extract. */
1133 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1135     const int A=(16-x16)*(16-y16);
1136     const int B=(   x16)*(16-y16);
1137     const int C=(16-x16)*(   y16);
1138     const int D=(   x16)*(   y16);
1143         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1144         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1145         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1146         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1147         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1148         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1149         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1150         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General affine GMC with per-pixel source coordinates, fractional
 * bilinear weights and edge clamping (av_clip for out-of-frame axes;
 * the fully-outside case copies the clamped source pixel).
 * NOTE(review): the outer y loop, the vx/vy coordinate computation,
 * rounding terms (r, shift) on several branches, else/brace lines and
 * src/dst row advances are missing from this extract. */
1156 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1157               int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1160     const int s= 1<<shift;
1170         for(x=0; x<8; x++){ //XXX FIXME optimize
1171             int src_x, src_y, frac_x, frac_y, index;
1175             frac_x= src_x&(s-1);
1176             frac_y= src_y&(s-1);
1180             if((unsigned)src_x < width){
1181                 if((unsigned)src_y < height){
1182                     index= src_x + src_y*stride;
1183                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
1184                                            + src[index       +1]*   frac_x )*(s-frac_y)
1185                                         + (  src[index+stride  ]*(s-frac_x)
1186                                            + src[index+stride+1]*   frac_x )*   frac_y
1189                     index= src_x + av_clip(src_y, 0, height)*stride;
1190                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
1191                                           + src[index       +1]*   frac_x )*s
1195                 if((unsigned)src_y < height){
1196                     index= av_clip(src_x, 0, width) + src_y*stride;
1197                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
1198                                           + src[index+stride  ]*   frac_y )*s
1201                     index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1202                     dst[y*stride + x]=    src[index         ];
1214 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1216 case 2: put_pixels2_c (dst, src, stride, height); break;
1217 case 4: put_pixels4_c (dst, src, stride, height); break;
1218 case 8: put_pixels8_c (dst, src, stride, height); break;
1219 case 16:put_pixels16_c(dst, src, stride, height); break;
1223 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1225 for (i=0; i < height; i++) {
1226 for (j=0; j < width; j++) {
1227 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Thirdpel MC, x=2/3: horizontal weights (1,2)/3; 683 = round(2^11/3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Thirdpel MC, y=1/3: vertical weights (2,1)/3; 683 = round(2^11/3). */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Thirdpel MC, (1/3,1/3): 2x2 weights (4,3,3,2)/12; 2731 = round(2^15/12). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, (1/3,2/3): 2x2 weights (3,2,4,3)/12; 2731 = round(2^15/12). */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, y=2/3: vertical weights (1,2)/3; 683 = round(2^11/3). */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Thirdpel MC, (2/3,1/3): 2x2 weights (3,4,2,3)/12; 2731 = round(2^15/12). */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, (2/3,2/3): 2x2 weights (2,3,3,4)/12; 2731 = round(2^15/12). */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Thirdpel averaging MC, (0,0): full-pel average, dispatched on block width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
/* Same filter as put_tpel_pixels_mc10_c, then rounded average with dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Same filter as put_tpel_pixels_mc20_c, then rounded average with dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Same filter as put_tpel_pixels_mc01_c, then rounded average with dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Same filter as put_tpel_pixels_mc11_c, then rounded average with dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Same filter as put_tpel_pixels_mc12_c, then rounded average with dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Same filter as put_tpel_pixels_mc02_c, then rounded average with dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Same filter as put_tpel_pixels_mc21_c, then rounded average with dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Same filter as put_tpel_pixels_mc22_c, then rounded average with dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/**
 * Generates fixed-width wrappers put_tpel_pixels<width>_mcXY_c() that bind
 * the variable-width thirdpel functions above to a compile-time block width.
 * Bugfix: each body previously read "void put_tpel_pixels_mcXY_c(...);" —
 * the stray "void" turned the intended call into a (malformed) local
 * function declaration, so every generated wrapper compiled to a no-op.
 * The "void" is removed so each wrapper actually forwards its arguments.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/**
 * Generates OPNAME ## h264_chroma_mc{2,4,8}_c: bilinear chroma MC with
 * 1/8-pel offsets (x,y) in [0,8).  The four 2x2 weights A..D always sum
 * to 64; OP() is expected to renormalize (>>6) and store or average.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);/* weight of top-left sample */\
    const int B=(  x)*(8-y);/* weight of top-right sample */\
    const int C=(8-x)*(  y);/* weight of bottom-left sample */\
    const int D=(  x)*(  y);/* weight of bottom-right sample */\
    assert(x<8 && y<8 && x>=0 && y>=0);/* offsets must be valid 1/8-pel positions */\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    assert(x<8 && y<8 && x>=0 && y>=0);\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    assert(x<8 && y<8 && x>=0 && y>=0);\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* 6-bit renormalization ops for the bilinear chroma filter (weights sum to 64):
 * op_put rounds and stores; op_avg rounds, then averages with existing dst. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
/* No-rounding variant of the 8-wide bilinear chroma MC: uses bias 28
 * (= 32 - 4) instead of 32 before the >>6 renormalization.
 * NOTE(review): presumably used by a codec requiring no-rounding chroma
 * (e.g. VC-1) — confirm against the function-pointer assignments. */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y); /* 2x2 bilinear weights; A+B+C+D == 64 */
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    assert(x<8 && y<8 && x>=0 && y>=0);
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
/**
 * MPEG-4 quarter-pel MC.  The half-pel lowpass filter uses coefficients
 * (20,-6,3,-1)/32 around the center; at block edges the outer taps are
 * mirrored back into the available samples rather than reading outside.
 * Expanded once per rounding mode by the QPEL_MC() instantiations below;
 * OP() clips via the cm table and stores or averages.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* clip-to-[0,255] lookup */\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));/* left-edge taps mirrored */\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));/* right-edge taps mirrored */\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* Vertical counterpart of qpel8_h_lowpass: loads a 9-sample column, then\
 * applies the same (20,-6,3,-1)/32 filter with edge mirroring. */\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* clip-to-[0,255] lookup */\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));/* bottom-edge taps mirrored */\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* 16-wide version of the horizontal half-pel lowpass filter. */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* clip-to-[0,255] lookup */\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));/* left-edge taps mirrored */\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));/* right-edge taps mirrored */\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* 16-tall version of the vertical half-pel lowpass filter: loads a\
 * 17-sample column, then filters with edge mirroring at the bottom. */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;/* clip-to-[0,255] lookup */\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));/* bottom-edge taps mirrored */\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* 8x8 quarter-pel position wrappers (mcXY = x,y in quarter-pel units).\
 * Sub-pel results come from the lowpass filters above; quarter positions\
 * average the half-pel plane with the nearer full/half-pel plane.\
 * copy_block9() pads the source into a 16-stride scratch buffer so the\
 * vertical filter can read one extra row.  The ff_*_old_c variants keep\
 * an older interpolation (4-plane average) — NOTE(review): presumably\
 * retained for compatibility/testing; confirm against callers. */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);/* x=1/4: avg(src, half) */\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);/* x=1/2 */\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);/* x=3/4: avg(src+1, half) */\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);/* y=1/4 */\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);/* y=1/2 */\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);/* y=3/4 */\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);/* halfH := avg(halfH, full) */\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
/* Remaining 8x8 qpel positions: half-pel x combined with quarter/half y.\
 * halfH holds the horizontally filtered 8x9 plane; halfHV its vertically\
 * filtered result. */\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);/* (1/2,1/4) */\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);/* (1/2,3/4) */\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);/* halfH := avg(halfH, full) */\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);/* (1/2,1/2) */\
/* 16x16 quarter-pel position wrappers — same construction as the 8x8 set\
 * above, with a 24-stride 24x17 padded scratch buffer for the vertical\
 * filter and 16-wide intermediate planes. */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);/* x=1/4 */\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);/* x=1/2 */\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);/* x=3/4 */\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);/* y=1/4 */\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);/* y=1/2 */\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);/* y=3/4 */\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);/* halfH := avg(halfH, full) */\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);/* (1/2,1/4) */\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);/* (1/2,3/4) */\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);/* (1/2,1/2) */\
/* 5-bit renormalization ops for the qpel filters (tap sums scale by 32).
 * cm clips to [0,255]; the *_no_rnd variants bias by 15 instead of 16,
 * i.e. round down — required by the MPEG-4 rounding_control semantics. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)

#undef op_avg_no_rnd
#undef op_put_no_rnd
/* H264_LOWPASS(OPNAME, OP, OP2): generates the H.264 6-tap
 * (1,-5,20,20,-5,1) half-pel interpolation primitives for block widths
 * 2/4/8/16:
 *   - *_h_lowpass:  horizontal filter, result written through OP
 *     (OP normalizes with ">>5" via the crop table)
 *   - *_v_lowpass:  vertical filter
 *   - *_hv_lowpass: horizontal pass into the int16_t tmp[] buffer
 *     (starting 2 rows above the block), then vertical pass over tmp
 *     written through OP2 (OP2 normalizes with ">>10")
 * The 16-wide versions are composed from four 8-wide calls.
 * NOTE(review): this chunk is an extracted listing; some loop headers
 * and closing braces of the macro body are not visible here. */
2022 #define H264_LOWPASS(OPNAME, OP, OP2) \
2023 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2025 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2029 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2030 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2036 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2038 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2042 const int srcB= src[-2*srcStride];\
2043 const int srcA= src[-1*srcStride];\
2044 const int src0= src[0 *srcStride];\
2045 const int src1= src[1 *srcStride];\
2046 const int src2= src[2 *srcStride];\
2047 const int src3= src[3 *srcStride];\
2048 const int src4= src[4 *srcStride];\
2049 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2050 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2056 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2059 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2061 src -= 2*srcStride;\
2062 for(i=0; i<h+5; i++)\
2064 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2065 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2069 tmp -= tmpStride*(h+5-2);\
2072 const int tmpB= tmp[-2*tmpStride];\
2073 const int tmpA= tmp[-1*tmpStride];\
2074 const int tmp0= tmp[0 *tmpStride];\
2075 const int tmp1= tmp[1 *tmpStride];\
2076 const int tmp2= tmp[2 *tmpStride];\
2077 const int tmp3= tmp[3 *tmpStride];\
2078 const int tmp4= tmp[4 *tmpStride];\
2079 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2080 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2085 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2087 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2091 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2092 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2093 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2094 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2100 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2102 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2106 const int srcB= src[-2*srcStride];\
2107 const int srcA= src[-1*srcStride];\
2108 const int src0= src[0 *srcStride];\
2109 const int src1= src[1 *srcStride];\
2110 const int src2= src[2 *srcStride];\
2111 const int src3= src[3 *srcStride];\
2112 const int src4= src[4 *srcStride];\
2113 const int src5= src[5 *srcStride];\
2114 const int src6= src[6 *srcStride];\
2115 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2116 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2117 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2118 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2124 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2127 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2129 src -= 2*srcStride;\
2130 for(i=0; i<h+5; i++)\
2132 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2133 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2134 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2135 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2139 tmp -= tmpStride*(h+5-2);\
2142 const int tmpB= tmp[-2*tmpStride];\
2143 const int tmpA= tmp[-1*tmpStride];\
2144 const int tmp0= tmp[0 *tmpStride];\
2145 const int tmp1= tmp[1 *tmpStride];\
2146 const int tmp2= tmp[2 *tmpStride];\
2147 const int tmp3= tmp[3 *tmpStride];\
2148 const int tmp4= tmp[4 *tmpStride];\
2149 const int tmp5= tmp[5 *tmpStride];\
2150 const int tmp6= tmp[6 *tmpStride];\
2151 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2152 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2153 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2154 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2160 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2162 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2166 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2167 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2168 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2169 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2170 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2171 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2172 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2173 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2179 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2181 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2185 const int srcB= src[-2*srcStride];\
2186 const int srcA= src[-1*srcStride];\
2187 const int src0= src[0 *srcStride];\
2188 const int src1= src[1 *srcStride];\
2189 const int src2= src[2 *srcStride];\
2190 const int src3= src[3 *srcStride];\
2191 const int src4= src[4 *srcStride];\
2192 const int src5= src[5 *srcStride];\
2193 const int src6= src[6 *srcStride];\
2194 const int src7= src[7 *srcStride];\
2195 const int src8= src[8 *srcStride];\
2196 const int src9= src[9 *srcStride];\
2197 const int src10=src[10*srcStride];\
2198 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2199 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2200 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2201 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2202 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2203 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2204 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2205 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2211 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2214 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2216 src -= 2*srcStride;\
2217 for(i=0; i<h+5; i++)\
2219 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2220 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2221 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2222 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2223 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2224 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2225 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2226 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2230 tmp -= tmpStride*(h+5-2);\
2233 const int tmpB= tmp[-2*tmpStride];\
2234 const int tmpA= tmp[-1*tmpStride];\
2235 const int tmp0= tmp[0 *tmpStride];\
2236 const int tmp1= tmp[1 *tmpStride];\
2237 const int tmp2= tmp[2 *tmpStride];\
2238 const int tmp3= tmp[3 *tmpStride];\
2239 const int tmp4= tmp[4 *tmpStride];\
2240 const int tmp5= tmp[5 *tmpStride];\
2241 const int tmp6= tmp[6 *tmpStride];\
2242 const int tmp7= tmp[7 *tmpStride];\
2243 const int tmp8= tmp[8 *tmpStride];\
2244 const int tmp9= tmp[9 *tmpStride];\
2245 const int tmp10=tmp[10*tmpStride];\
2246 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2247 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2248 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2249 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2250 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2251 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2252 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2253 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2259 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2260 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2261 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2262 src += 8*srcStride;\
2263 dst += 8*dstStride;\
2264 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2265 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2268 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2269 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2270 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2271 src += 8*srcStride;\
2272 dst += 8*dstStride;\
2273 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2274 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2277 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2278 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2279 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2280 src += 8*srcStride;\
2281 dst += 8*dstStride;\
2282 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2283 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC(OPNAME, SIZE): generates the 16 quarter-pel motion
 * compensation entry points _mcXY_c (X = horizontal qpel phase 0-3,
 * Y = vertical phase 0-3) for one block SIZE, built from the
 * *_h/_v/_hv_lowpass primitives plus pixels##SIZE##_l2 averaging of two
 * half-pel planes.  full[] holds a copy of the source extended by the
 * 5 extra rows the 6-tap vertical filter needs (full_mid skips the 2
 * rows above the block); halfH/halfV/halfHV are intermediate half-pel
 * planes; tmp[] is the int16_t scratch for the hv path.
 * NOTE(review): extracted listing; some closing braces of the generated
 * functions are not visible in this chunk. */
2286 #define H264_MC(OPNAME, SIZE) \
2287 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2288 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2291 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2292 uint8_t half[SIZE*SIZE];\
2293 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2294 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2297 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2298 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2301 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2302 uint8_t half[SIZE*SIZE];\
2303 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2304 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2307 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2308 uint8_t full[SIZE*(SIZE+5)];\
2309 uint8_t * const full_mid= full + SIZE*2;\
2310 uint8_t half[SIZE*SIZE];\
2311 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2312 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2313 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2316 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2317 uint8_t full[SIZE*(SIZE+5)];\
2318 uint8_t * const full_mid= full + SIZE*2;\
2319 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2320 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2323 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2324 uint8_t full[SIZE*(SIZE+5)];\
2325 uint8_t * const full_mid= full + SIZE*2;\
2326 uint8_t half[SIZE*SIZE];\
2327 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2328 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2329 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2332 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2333 uint8_t full[SIZE*(SIZE+5)];\
2334 uint8_t * const full_mid= full + SIZE*2;\
2335 uint8_t halfH[SIZE*SIZE];\
2336 uint8_t halfV[SIZE*SIZE];\
2337 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2338 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2339 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2340 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2343 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2344 uint8_t full[SIZE*(SIZE+5)];\
2345 uint8_t * const full_mid= full + SIZE*2;\
2346 uint8_t halfH[SIZE*SIZE];\
2347 uint8_t halfV[SIZE*SIZE];\
2348 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2349 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2350 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2351 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2354 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2355 uint8_t full[SIZE*(SIZE+5)];\
2356 uint8_t * const full_mid= full + SIZE*2;\
2357 uint8_t halfH[SIZE*SIZE];\
2358 uint8_t halfV[SIZE*SIZE];\
2359 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2360 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2361 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2362 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2365 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2366 uint8_t full[SIZE*(SIZE+5)];\
2367 uint8_t * const full_mid= full + SIZE*2;\
2368 uint8_t halfH[SIZE*SIZE];\
2369 uint8_t halfV[SIZE*SIZE];\
2370 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2371 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2372 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2373 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2376 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2377 int16_t tmp[SIZE*(SIZE+5)];\
2378 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2381 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2382 int16_t tmp[SIZE*(SIZE+5)];\
2383 uint8_t halfH[SIZE*SIZE];\
2384 uint8_t halfHV[SIZE*SIZE];\
2385 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2386 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2387 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2390 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2391 int16_t tmp[SIZE*(SIZE+5)];\
2392 uint8_t halfH[SIZE*SIZE];\
2393 uint8_t halfHV[SIZE*SIZE];\
2394 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2395 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2396 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2399 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2400 uint8_t full[SIZE*(SIZE+5)];\
2401 uint8_t * const full_mid= full + SIZE*2;\
2402 int16_t tmp[SIZE*(SIZE+5)];\
2403 uint8_t halfV[SIZE*SIZE];\
2404 uint8_t halfHV[SIZE*SIZE];\
2405 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2406 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2407 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2408 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2411 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2412 uint8_t full[SIZE*(SIZE+5)];\
2413 uint8_t * const full_mid= full + SIZE*2;\
2414 int16_t tmp[SIZE*(SIZE+5)];\
2415 uint8_t halfV[SIZE*SIZE];\
2416 uint8_t halfHV[SIZE*SIZE];\
2417 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2418 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2419 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2420 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Output operators for the H.264 lowpass filters: single-pass results
 * are scaled by 32 (">>5" + crop-table clip), two-pass hv results by
 * 1024 (">>10"); *_avg additionally rounds-to-nearest against the
 * pixel already in dst. */
2423 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2424 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2425 #define op_put(a, b) a = cm[((b) + 16)>>5]
2426 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2427 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put_ and avg_ families of all H.264 lowpass filters. */
2429 H264_LOWPASS(put_ , op_put, op2_put)
2430 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* H264_WEIGHT(W,H): generates explicit weighted prediction
 * (weight_h264_pixelsWxH_c: block = (block*weight + offset) >> log2_denom,
 * with rounding folded into offset) and bi-directional weighted
 * prediction (biweight_h264_pixelsWxH_c: dst = (src*weights +
 * dst*weightd + offset) >> (log2_denom+1)), both clipped to [0,255].
 * The "if(W==n) continue;" lines let one body serve all widths; the
 * per-pixel op_scale1/op_scale2 statements between them are not
 * visible in this extracted listing. */
2445 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2446 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2447 #define H264_WEIGHT(W,H) \
2448 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2450 offset <<= log2_denom; \
2451 if(log2_denom) offset += 1<<(log2_denom-1); \
2452 for(y=0; y<H; y++, block += stride){ \
2455 if(W==2) continue; \
2458 if(W==4) continue; \
2463 if(W==8) continue; \
2474 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2476 offset = ((offset + 1) | 1) << log2_denom; \
2477 for(y=0; y<H; y++, dst += stride, src += stride){ \
2480 if(W==2) continue; \
2483 if(W==4) continue; \
2488 if(W==8) continue; \
/* WMV2 mspel horizontal half-pel filter: 4-tap (-1,9,9,-1), rounded
 * (+8) and normalized (>>4), clipped through the crop table.  Processes
 * one 8-pixel row per iteration; h rows total.  NOTE(review): the row
 * loop and closing braces are not visible in this extracted listing. */
2515 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2516 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2520 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2521 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2522 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2523 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2524 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2525 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2526 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2527 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* CAVS decoder glue: full-pel (mc00) copy/average wrappers exported so
 * cavsdsp can reuse the generic pixels8/16 primitives. */
2533 #ifdef CONFIG_CAVS_DECODER
2535 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2537 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2538 put_pixels8_c(dst, src, stride, 8);
2540 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2541 avg_pixels8_c(dst, src, stride, 8);
2543 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2544 put_pixels16_c(dst, src, stride, 16);
2546 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2547 avg_pixels16_c(dst, src, stride, 16);
2549 #endif /* CONFIG_CAVS_DECODER */
/* VC-1/WMV3 glue: dsp init prototype plus the full-pel mspel wrapper
 * (rnd parameter unused for the mc00 copy case); and the H.264 encoder
 * dsp init prototype. */
2551 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2553 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2555 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2556 put_pixels8_c(dst, src, stride, 8);
2558 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2560 #if defined(CONFIG_H264_ENCODER)
2562 void ff_h264dsp_init(DSPContext* c, AVCodecContext *avctx);
2563 #endif /* CONFIG_H264_ENCODER */
/* WMV2 mspel vertical half-pel filter: same 4-tap (-1,9,9,-1)/16 kernel
 * as the horizontal version, applied down one column of 8 output rows;
 * w columns total.  NOTE(review): the column loop and closing braces
 * are not visible in this extracted listing. */
2565 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2566 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2570 const int src_1= src[ -srcStride];
2571 const int src0 = src[0 ];
2572 const int src1 = src[ srcStride];
2573 const int src2 = src[2*srcStride];
2574 const int src3 = src[3*srcStride];
2575 const int src4 = src[4*srcStride];
2576 const int src5 = src[5*srcStride];
2577 const int src6 = src[6*srcStride];
2578 const int src7 = src[7*srcStride];
2579 const int src8 = src[8*srcStride];
2580 const int src9 = src[9*srcStride];
2581 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2582 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2583 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2584 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2585 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2586 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2587 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2588 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation entry points (put only).  Naming
 * follows the qpel convention: _mcXY where X/Y select the horizontal/
 * vertical half-pel phase.  Half-pel planes are built with the
 * wmv2_mspel8_*_lowpass filters and, where two phases are needed,
 * averaged with put_pixels8_l2.  The halfH buffer covers 11 rows
 * (starting at src-stride) so the subsequent vertical pass has its
 * context rows; halfH+8 skips that first extra row. */
2594 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2595 put_pixels8_c(dst, src, stride, 8);
2598 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2600 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2601 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2604 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2605 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2608 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2610 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2611 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2614 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2615 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2618 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2622 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2623 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2624 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2625 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2627 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2631 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2632 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2633 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2634 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2636 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2638 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2639 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8)
/* H.263 Annex J deblocking, vertical filter (horizontal edge): for each
 * column x, filters pixels p0..p3 straddling the block edge.  d is the
 * edge gradient; d1 is the piecewise-linear correction ramp driven by
 * the qscale-dependent strength.  The "p&256" trick clips p1/p2 after
 * correction: for any out-of-range 9-bit value, ~(p>>31) yields 0 for
 * negative or 255 for >255.  d2 applies a smaller correction to the
 * outer pixels p0/p3. */
2642 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2644 const int strength= ff_h263_loop_filter_strength[qscale];
2648 int p0= src[x-2*stride];
2649 int p1= src[x-1*stride];
2650 int p2= src[x+0*stride];
2651 int p3= src[x+1*stride];
2652 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2654 if (d<-2*strength) d1= 0;
2655 else if(d<- strength) d1=-2*strength - d;
2656 else if(d< strength) d1= d;
2657 else if(d< 2*strength) d1= 2*strength - d;
2662 if(p1&256) p1= ~(p1>>31);
2663 if(p2&256) p2= ~(p2>>31);
2665 src[x-1*stride] = p1;
2666 src[x+0*stride] = p2;
2670 d2= av_clip((p0-p3)/4, -ad1, ad1);
2672 src[x-2*stride] = p0 - d2;
2673 src[x+ stride] = p3 + d2;
/* H.263 Annex J deblocking, horizontal filter (vertical edge): same
 * algorithm as h263_v_loop_filter_c but operating along rows, with the
 * four filtered pixels adjacent within each row y. */
2677 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2679 const int strength= ff_h263_loop_filter_strength[qscale];
2683 int p0= src[y*stride-2];
2684 int p1= src[y*stride-1];
2685 int p2= src[y*stride+0];
2686 int p3= src[y*stride+1];
2687 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2689 if (d<-2*strength) d1= 0;
2690 else if(d<- strength) d1=-2*strength - d;
2691 else if(d< strength) d1= d;
2692 else if(d< 2*strength) d1= 2*strength - d;
2697 if(p1&256) p1= ~(p1>>31);
2698 if(p2&256) p2= ~(p2>>31);
2700 src[y*stride-1] = p1;
2701 src[y*stride+0] = p2;
2705 d2= av_clip((p0-p3)/4, -ad1, ad1);
2707 src[y*stride-2] = p0 - d2;
2708 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter over an 8x8 block: separable (1,2,1)/4 smoothing
 * kernel applied via a temp[] buffer scaled by 4 (border rows copied as
 * 4*src so edges are effectively unfiltered), then normalized back with
 * rounding (">>2" on edge columns, ">>4" for fully filtered samples). */
2712 static void h261_loop_filter_c(uint8_t *src, int stride){
2717 temp[x ] = 4*src[x ];
2718 temp[x + 7*8] = 4*src[x + 7*stride];
2722 xy = y * stride + x;
2724 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2729 src[ y*stride] = (temp[ y*8] + 2)>>2;
2730 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2732 xy = y * stride + x;
2734 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 normal (tc0-driven) luma deblocking.  xstride steps across the
 * edge, ystride along it; four 4-sample segments, each with its own
 * tc0[i] threshold (the segment loop and the tc bump per filtered
 * neighbour are partly outside this extracted listing).  Samples are
 * filtered only when the |p0-q0|<alpha and |p1-p0|,|q1-q0|<beta
 * activity checks pass; p1/q1 get an extra clipped correction when
 * their second neighbour is smooth (|p2-p0| / |q2-q0| < beta). */
2739 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2742 for( i = 0; i < 4; i++ ) {
2747 for( d = 0; d < 4; d++ ) {
2748 const int p0 = pix[-1*xstride];
2749 const int p1 = pix[-2*xstride];
2750 const int p2 = pix[-3*xstride];
2751 const int q0 = pix[0];
2752 const int q1 = pix[1*xstride];
2753 const int q2 = pix[2*xstride];
2755 if( FFABS( p0 - q0 ) < alpha &&
2756 FFABS( p1 - p0 ) < beta &&
2757 FFABS( q1 - q0 ) < beta ) {
2762 if( FFABS( p2 - p0 ) < beta ) {
2763 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2766 if( FFABS( q2 - q0 ) < beta ) {
2767 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2771 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2772 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2773 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
/* Direction wrappers: vertical edges step by stride across the edge;
 * horizontal edges step by 1 (stride/xstride arguments swapped). */
2779 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2781 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2783 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2785 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 normal chroma deblocking: like the luma filter but only p0/q0
 * are modified and each tc0[i] covers a 2-sample segment; p1/q1 are
 * read solely for the activity checks and the delta term. */
2788 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2791 for( i = 0; i < 4; i++ ) {
2792 const int tc = tc0[i];
2797 for( d = 0; d < 2; d++ ) {
2798 const int p0 = pix[-1*xstride];
2799 const int p1 = pix[-2*xstride];
2800 const int q0 = pix[0];
2801 const int q1 = pix[1*xstride];
2803 if( FFABS( p0 - q0 ) < alpha &&
2804 FFABS( p1 - p0 ) < beta &&
2805 FFABS( q1 - q0 ) < beta ) {
2807 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2809 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2810 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
/* Direction wrappers for the chroma filter (see luma wrappers above). */
2816 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2818 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2820 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2822 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 strong (intra) chroma deblocking: no tc0 clipping; when the
 * activity checks pass, p0/q0 are replaced by fixed (2,1,1)/4 averages
 * of their neighbours across all 8 edge samples. */
2825 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2828 for( d = 0; d < 8; d++ ) {
2829 const int p0 = pix[-1*xstride];
2830 const int p1 = pix[-2*xstride];
2831 const int q0 = pix[0];
2832 const int q1 = pix[1*xstride];
2834 if( FFABS( p0 - q0 ) < alpha &&
2835 FFABS( p1 - p0 ) < beta &&
2836 FFABS( q1 - q0 ) < beta ) {
2838 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2839 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
/* Direction wrappers for the intra chroma filter. */
2844 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2846 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2848 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2850 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* SAD of a 16-wide block, h rows: sum of |pix1[i]-pix2[i]| per row,
 * fully unrolled.  The void* first argument is the unused MpegEncContext
 * slot shared by all me_cmp functions. */
2853 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2859 s += abs(pix1[0] - pix2[0]);
2860 s += abs(pix1[1] - pix2[1]);
2861 s += abs(pix1[2] - pix2[2]);
2862 s += abs(pix1[3] - pix2[3]);
2863 s += abs(pix1[4] - pix2[4]);
2864 s += abs(pix1[5] - pix2[5]);
2865 s += abs(pix1[6] - pix2[6]);
2866 s += abs(pix1[7] - pix2[7]);
2867 s += abs(pix1[8] - pix2[8]);
2868 s += abs(pix1[9] - pix2[9]);
2869 s += abs(pix1[10] - pix2[10]);
2870 s += abs(pix1[11] - pix2[11]);
2871 s += abs(pix1[12] - pix2[12]);
2872 s += abs(pix1[13] - pix2[13]);
2873 s += abs(pix1[14] - pix2[14]);
2874 s += abs(pix1[15] - pix2[15]);
/* SAD of a 16-wide block against the horizontal half-pel interpolation
 * of pix2 (avg2 of each pixel and its right neighbour). */
2881 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2887 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2888 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2889 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2890 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2891 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2892 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2893 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2894 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2895 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2896 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2897 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2898 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2899 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2900 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2901 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2902 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of a 16-wide block against the vertical half-pel interpolation of
 * pix2 (avg2 of each pixel and the pixel one row below, via pix3). */
2909 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2912 uint8_t *pix3 = pix2 + line_size;
2916 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2917 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2918 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2919 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2920 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2921 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2922 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2923 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2924 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2925 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2926 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2927 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2928 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2929 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2930 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2931 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of a 16-wide block against the diagonal half-pel interpolation of
 * pix2 (avg4 of the 2x2 neighbourhood spanning the current and next
 * row). */
2939 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2942 uint8_t *pix3 = pix2 + line_size;
2946 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2947 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2948 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2949 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2950 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2951 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2952 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2953 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2954 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2955 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2956 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2957 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2958 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2959 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2960 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2961 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide variant of pix_abs16_c: full-pel SAD over h rows. */
2969 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2975 s += abs(pix1[0] - pix2[0]);
2976 s += abs(pix1[1] - pix2[1]);
2977 s += abs(pix1[2] - pix2[2]);
2978 s += abs(pix1[3] - pix2[3]);
2979 s += abs(pix1[4] - pix2[4]);
2980 s += abs(pix1[5] - pix2[5]);
2981 s += abs(pix1[6] - pix2[6]);
2982 s += abs(pix1[7] - pix2[7]);
/* 8-wide variant of pix_abs16_x2_c: SAD vs horizontal half-pel. */
2989 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2995 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2996 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2997 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2998 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2999 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3000 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3001 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3002 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide variant of pix_abs16_y2_c: SAD vs vertical half-pel. */
3009 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3012 uint8_t *pix3 = pix2 + line_size;
3016 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3017 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3018 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3019 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3020 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3021 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3022 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3023 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/*
 * SAD for an 8-pixel-wide block against the diagonally (x and y)
 * half-pel interpolated reference: avg4() of the 2x2 neighbourhood
 * spanning the current row and the next (pix3 = pix2 + line_size).
 * NOTE(review): loop scaffolding and return not visible in fragment.
 */
3031 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3034 uint8_t *pix3 = pix2 + line_size;
3038 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3039 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3040 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3041 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3042 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3043 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3044 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3045 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/*
 * "Noise shaping" SSE over a 16-pixel-wide block: score1 is the plain
 * squared error; score2 is the (signed) difference between the 2x2
 * second-order gradients of s1 and s2, so errors that preserve local
 * edge structure cost less.  The gradient term is scaled by
 * avctx->nsse_weight, or by 8 when no context is supplied.
 * NOTE(review): the per-row loop and pointer advances are not visible
 * in this fragment.
 */
3053 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3054 MpegEncContext *c = v;
3060 for(x=0; x<16; x++){
3061 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3064 for(x=0; x<15; x++){
3065 score2+= FFABS( s1[x ] - s1[x +stride]
3066 - s1[x+1] + s1[x+1+stride])
3067 -FFABS( s2[x ] - s2[x +stride]
3068 - s2[x+1] + s2[x+1+stride]);
3075 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3076 else return score1 + FFABS(score2)*8;
/*
 * 8-pixel-wide variant of nsse16_c: plain SSE (score1) plus a
 * weighted second-order-gradient difference (score2); see nsse16_c.
 * NOTE(review): loop headers over x and the per-row loop are not
 * visible in this fragment.
 */
3079 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3080 MpegEncContext *c = v;
3087 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3091 score2+= FFABS( s1[x ] - s1[x +stride]
3092 - s1[x+1] + s1[x+1+stride])
3093 -FFABS( s2[x ] - s2[x +stride]
3094 - s2[x+1] + s2[x+1+stride]);
3101 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3102 else return score1 + FFABS(score2)*8;
/*
 * Evaluates the weighted squared error that would remain if `basis`
 * scaled by `scale` were added to the residual `rem`: for each of the
 * 64 coefficients, b is rem[i] plus the rounded, rescaled basis
 * contribution (shifted from BASIS_SHIFT down to RECON_SHIFT).
 * NOTE(review): the assignment of w (presumably w = weight[i]) and
 * the return of `sum` are not visible in this fragment.
 */
3105 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3109 for(i=0; i<8*8; i++){
3110 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3113 assert(-512<b && b<512);
3115 sum += (w*b)*(w*b)>>4;
/*
 * Adds `basis` scaled by `scale` into the residual `rem`, with the
 * same rounding and BASIS_SHIFT->RECON_SHIFT rescaling used by
 * try_8x8basis_c above.
 */
3120 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3123 for(i=0; i<8*8; i++){
3124 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3129 * permutes an 8x8 block.
3130 * @param block the block which will be permuted according to the given permutation vector
3131 * @param permutation the permutation vector
3132 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3133 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3134 * (inverse) permuted to scantable order!
/*
 * Applies the IDCT permutation vector to the coefficients of an 8x8
 * block in place; only positions up to `last` (in scantable order)
 * need to be moved, which is why the scantable is consulted.
 * NOTE(review): the declaration of temp[] and the body of the first
 * loop (presumably copying block -> temp and clearing block) are not
 * visible in this fragment.
 */
3136 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3142 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3144 for(i=0; i<=last; i++){
3145 const int j= scantable[i];
3150 for(i=0; i<=last; i++){
3151 const int j= scantable[i];
3152 const int perm_j= permutation[j];
3153 block[perm_j]= temp[j];
3157 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/*
 * Fills the cmp[0..4] table with the compare functions selected by
 * `type`, taking the implementations from the DSPContext so that any
 * CPU-optimised versions installed there are used.  Only a subset of
 * the selection cases is visible in this fragment; unknown types end
 * in the error log below.
 */
3161 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3164 memset(cmp, 0, sizeof(void*)*5);
3172 cmp[i]= c->hadamard8_diff[i];
3178 cmp[i]= c->dct_sad[i];
3181 cmp[i]= c->dct264_sad[i];
3184 cmp[i]= c->dct_max[i];
3187 cmp[i]= c->quant_psnr[i];
3207 #ifdef CONFIG_SNOW_ENCODER
3216 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3222 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3224 static void clear_blocks_c(DCTELEM *blocks) /* zero the 6 64-coeff DCT blocks of one macroblock */
3226 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/*
 * dst[i] += src[i] for i in [0, w): main loop unrolled 8x, with a
 * scalar pass for the remaining 0..7 bytes.
 * NOTE(review): the declaration of i, the closing brace of the main
 * loop and the tail-loop header are not visible in this fragment.
 */
3229 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3231 for(i=0; i+7<w; i+=8){
3232 dst[i+0] += src[i+0];
3233 dst[i+1] += src[i+1];
3234 dst[i+2] += src[i+2];
3235 dst[i+3] += src[i+3];
3236 dst[i+4] += src[i+4];
3237 dst[i+5] += src[i+5];
3238 dst[i+6] += src[i+6];
3239 dst[i+7] += src[i+7];
3242 dst[i+0] += src[i+0]; /* scalar tail */
/*
 * dst[i] = src1[i] - src2[i] for i in [0, w): same 8x-unrolled main
 * loop plus scalar tail shape as add_bytes_c above.
 * NOTE(review): index declaration, main-loop close and tail-loop
 * header are not visible in this fragment.
 */
3245 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3247 for(i=0; i+7<w; i+=8){
3248 dst[i+0] = src1[i+0]-src2[i+0];
3249 dst[i+1] = src1[i+1]-src2[i+1];
3250 dst[i+2] = src1[i+2]-src2[i+2];
3251 dst[i+3] = src1[i+3]-src2[i+3];
3252 dst[i+4] = src1[i+4]-src2[i+4];
3253 dst[i+5] = src1[i+5]-src2[i+5];
3254 dst[i+6] = src1[i+6]-src2[i+6];
3255 dst[i+7] = src1[i+7]-src2[i+7];
3258 dst[i+0] = src1[i+0]-src2[i+0]; /* scalar tail */
/*
 * HuffYUV median-prediction subtract pass: for each sample the
 * predictor is mid_pred() of left (l), top (src1[i]) and the
 * left+top-topleft gradient candidate, wrapped to a byte via &0xFF.
 * NOTE(review): only the predictor computation is visible here; the
 * loop, the dst store and the left/left_top updates are not.
 */
3261 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3269 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3279 #define BUTTERFLY2(o1,o2,i1,i2) /* NOTE(review): body not visible; presumably o1=i1+i2, o2=i1-i2 */ \
3283 #define BUTTERFLY1(x,y) /* NOTE(review): body not visible; presumably in-place butterfly on x,y */ \
3292 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y))) /* |x+y| + |x-y| */
/*
 * 8x8 SATD of the src-dst difference: a radix-2 Hadamard transform is
 * applied to the rows (first loop) and then to the columns (second
 * loop); the score is the sum of absolute transform coefficients,
 * with BUTTERFLYA folding the last butterfly stage into the
 * absolute-value accumulation.
 * NOTE(review): the temp[] declaration, the loop headers over i, the
 * "sum +=" statement head and the final return are not visible in
 * this fragment.
 */
3294 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3302 //FIXME try pointer walks
3303 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3304 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3305 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3306 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3308 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3309 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3310 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3311 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3313 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3314 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3315 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3316 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3320 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3321 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3322 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3323 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3325 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3326 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3327 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3328 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3331 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3332 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3333 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3334 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3340 printf("MAX:%d\n", maxi); /* debug output — presumably inside a disabled #if block; confirm */
/*
 * Intra variant of the 8x8 SATD: same row-then-column Hadamard
 * transform as hadamard8_diff8x8_c but applied to the raw source
 * samples; the DC magnitude is subtracted at the end so the score is
 * mean-independent.  (`dummy` keeps the me_cmp_func signature.)
 * NOTE(review): temp[] declaration, loop headers, the "sum +="
 * statement head and the final return are not visible in this
 * fragment.
 */
3346 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3354 //FIXME try pointer walks
3355 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3356 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3357 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3358 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3360 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3361 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3362 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3363 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3365 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3366 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3367 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3368 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3372 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3373 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3374 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3375 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3377 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3378 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3379 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3380 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3383 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3384 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3385 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3386 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3389 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/*
 * DCT-domain SAD: takes the pixel difference of the two 8x8 blocks,
 * forward-transforms it and returns the sum of absolute DCT
 * coefficients.  NOTE(review): the s->dsp.fdct(temp) call between the
 * diff and the sum is not visible in this fragment — confirm.
 */
3394 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3395 MpegEncContext * const s= (MpegEncContext *)c;
3396 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3397 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3401 s->dsp.diff_pixels(temp, src1, src2, stride);
3403 return s->dsp.sum_abs_dctelem(temp);
3408 const int s07 = SRC(0) + SRC(7);/* even part: pairwise sums */\
3409 const int s16 = SRC(1) + SRC(6);\
3410 const int s25 = SRC(2) + SRC(5);\
3411 const int s34 = SRC(3) + SRC(4);\
3412 const int a0 = s07 + s34;\
3413 const int a1 = s16 + s25;\
3414 const int a2 = s07 - s34;\
3415 const int a3 = s16 - s25;\
3416 const int d07 = SRC(0) - SRC(7);/* odd part: pairwise differences */\
3417 const int d16 = SRC(1) - SRC(6);\
3418 const int d25 = SRC(2) - SRC(5);\
3419 const int d34 = SRC(3) - SRC(4);\
3420 const int a4 = d16 + d25 + (d07 + (d07>>1));/* odd butterflies with 1.5x terms (H.264 8x8 integer DCT style) */\
3421 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3422 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3423 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3425 DST(1, a4 + (a7>>2)) ;/* NOTE(review): DST(0)/DST(4) lines of this macro are not visible in the fragment */\
3426 DST(2, a2 + (a3>>1)) ;\
3427 DST(3, a5 + (a6>>2)) ;\
3429 DST(5, a6 - (a5>>2)) ;\
3430 DST(6, (a2>>1) - a3 ) ;\
3431 DST(7, (a4>>2) - a7 ) ;\
3434 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
/*
 * SAD in the H.264-style 8x8 integer-transform domain: the pixel
 * difference is transformed row-wise and then column-wise with the
 * DCT8_1D macro above — SRC/DST are redefined per pass, and the
 * second pass folds the |coefficient| accumulation into DST itself.
 * NOTE(review): the dct[][] declaration, the #undef lines and the
 * final return/normalisation are not visible in this fragment.
 */
3435 MpegEncContext * const s= (MpegEncContext *)c;
3440 s->dsp.diff_pixels(dct, src1, src2, stride);
3442 #define SRC(x) dct[i][x]
3443 #define DST(x,v) dct[i][x]= v
3444 for( i = 0; i < 8; i++ )
3449 #define SRC(x) dct[x][i]
3450 #define DST(x,v) sum += FFABS(v)
3451 for( i = 0; i < 8; i++ )
/*
 * Returns the largest absolute DCT coefficient of the 8x8 pixel
 * difference.  NOTE(review): the fdct call and the loop header over i
 * are not visible in this fragment.
 */
3459 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3460 MpegEncContext * const s= (MpegEncContext *)c;
3461 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3462 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3467 s->dsp.diff_pixels(temp, src1, src2, stride);
3471 sum= FFMAX(sum, FFABS(temp[i]));
/*
 * Quantisation-noise metric: forward-transforms the pixel difference,
 * saves the coefficients in `bak`, then quantises, dequantises and
 * inverse-transforms `temp`; the returned score accumulates the
 * squared coefficient error (temp[i]-bak[i])^2.
 * NOTE(review): the fdct call, intermediate steps between the listed
 * lines and the final return are not visible in this fragment —
 * verify the exact pipeline against the full file.
 */
3476 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3477 MpegEncContext * const s= (MpegEncContext *)c;
3478 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3479 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3480 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3486 s->dsp.diff_pixels(temp, src1, src2, stride);
3488 memcpy(bak, temp, 64*sizeof(DCTELEM));
3490 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3491 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3492 simple_idct(temp); //FIXME
3495 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/*
 * Rate-distortion score for one 8x8 block: quantises the transformed
 * difference, counts the VLC bits needed to code the coefficients
 * (escape-coded levels cost esc_length), reconstructs the block via
 * dequantise + idct_add onto a saved copy of src2, and combines the
 * resulting SSE with a lambda-scaled bit cost.
 * NOTE(review): several interior lines (fdct call, start_i selection,
 * run/level bookkeeping, loop closes) are not visible in this
 * fragment.  "distoration" is a historical typo for "distortion" —
 * a rename would be a code change, so it is kept here.
 */
3500 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3501 MpegEncContext * const s= (MpegEncContext *)c;
3502 const uint8_t *scantable= s->intra_scantable.permutated;
3503 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3504 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3505 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3506 uint8_t * const bak= (uint8_t*)aligned_bak;
3507 int i, last, run, bits, level, distoration, start_i;
3508 const int esc_length= s->ac_esc_length;
3510 uint8_t * last_length;
3515 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3516 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3519 s->dsp.diff_pixels(temp, src1, src2, stride);
3521 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3527 length = s->intra_ac_vlc_length;
3528 last_length= s->intra_ac_vlc_last_length;
3529 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3532 length = s->inter_ac_vlc_length;
3533 last_length= s->inter_ac_vlc_last_length;
3538 for(i=start_i; i<last; i++){
3539 int j= scantable[i];
3544 if((level&(~127)) == 0){
3545 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3554 level= temp[i] + 64;
3558 if((level&(~127)) == 0){
3559 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3567 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3569 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3572 s->dsp.idct_add(bak, stride, temp);
3574 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3576 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/*
 * Bit-cost metric for one 8x8 block: quantises the transformed
 * difference and counts the VLC bits needed to code the run/level
 * pairs (same table logic as rd8x8_c, without the distortion term).
 * NOTE(review): the fdct call, start_i selection, run/level
 * bookkeeping and the final return of `bits` are not visible in this
 * fragment.
 */
3579 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3580 MpegEncContext * const s= (MpegEncContext *)c;
3581 const uint8_t *scantable= s->intra_scantable.permutated;
3582 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3583 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3584 int i, last, run, bits, level, start_i;
3585 const int esc_length= s->ac_esc_length;
3587 uint8_t * last_length;
3591 s->dsp.diff_pixels(temp, src1, src2, stride);
3593 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3599 length = s->intra_ac_vlc_length;
3600 last_length= s->intra_ac_vlc_last_length;
3601 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3604 length = s->inter_ac_vlc_length;
3605 last_length= s->inter_ac_vlc_last_length;
3610 for(i=start_i; i<last; i++){
3611 int j= scantable[i];
3616 if((level&(~127)) == 0){
3617 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3626 level= temp[i] + 64;
3630 if((level&(~127)) == 0){
3631 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/*
 * Vertical-gradient SAD, intra: sums |s[x] - s[x+stride]| over a
 * 16-pixel-wide block, unrolled 4 samples per iteration.  (`dummy`
 * keeps the me_cmp_func signature.)
 * NOTE(review): the row loop and the return are not visible here.
 */
3639 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3644 for(x=0; x<16; x+=4){
3645 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride])
3646 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
/*
 * Vertical-gradient SAD between two blocks: sums the absolute
 * difference of the vertical gradients of s1 and s2, 16 pixels wide.
 * NOTE(review): the row loop and the return are not visible here.
 */
3654 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3659 for(x=0; x<16; x++){
3660 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3669 #define SQ(a) ((a)*(a))
/*
 * Vertical-gradient SSE, intra: like vsad_intra16_c but accumulating
 * squared (SQ) vertical differences.
 * NOTE(review): the row loop and the return are not visible here.
 */
3670 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3675 for(x=0; x<16; x+=4){
3676 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3677 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/*
 * Vertical-gradient SSE between two blocks: squared difference of the
 * vertical gradients of s1 and s2, 16 pixels wide.
 * NOTE(review): the row loop and the return are not visible here.
 */
3685 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3690 for(x=0; x<16; x++){
3691 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/*
 * Sum of squared differences between an int8 array and an int16
 * array of `size` elements.
 * NOTE(review): the declarations of i/score and the return are not
 * visible in this fragment.
 */
3700 static int ssd_int8_vs_int16_c(int8_t *pix1, int16_t *pix2, int size){
3703 for(i=0; i<size; i++)
3704 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* WARPER8_16_SQ ("wrapper", historical typo) generates the 16x16
 * version of each 8x8 compare function — presumably by summing the
 * scores of the four 8x8 sub-blocks (macro body not visible here). */
3708 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3709 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3710 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3712 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3714 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3715 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3716 WARPER8_16_SQ(rd8x8_c, rd16_c)
3717 WARPER8_16_SQ(bit8x8_c, bit16_c)
/*
 * Element-wise float multiply into dst — presumably dst[i] *= src[i]
 * (the loop body line is not visible in this fragment).
 */
3719 static void vector_fmul_c(float *dst, const float *src, int len){
3721 for(i=0; i<len; i++)
/*
 * dst[i] = src0[i] * src1 read in reverse order.  The src1[-i]
 * indexing implies src1 was first advanced to its last element
 * (that advance, presumably src1 += len-1, is not visible here).
 */
3725 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3728 for(i=0; i<len; i++)
3729 dst[i] = src0[i] * src1[-i];
/*
 * Strided multiply-accumulate: dst[i*step] = src0[i]*src1[i] +
 * src2[i] + src3, for i in [0, len).  Note src3 is an int constant
 * added to every output, not an array.
 */
3732 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3734 for(i=0; i<len; i++)
3735 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
/*
 * Float -> int16 conversion using the raw IEEE-754 bit pattern: the
 * float is reinterpreted as int32 (NOTE(review): via a pointer cast
 * that violates strict aliasing — historical code, confirm the build
 * disables that optimisation), clamped against the 0x43c0ffff
 * threshold, and biased by -0x8000 to recenter.
 * NOTE(review): the clamping lines between the test and the store are
 * not visible in this fragment.
 */
3738 void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
3740 for(i=0; i<len; i++) {
3741 int_fast32_t tmp = ((int32_t*)src)[i];
3743 tmp = (0x43c0ffff - tmp)>>31;
3744 // is this faster on some gcc/cpu combinations?
3745 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3748 dst[i] = tmp - 0x8000;
3752 /* XXX: these functions should be removed ASAP when all IDCTs are
/* jref IDCT + store: transform `block`, then write clamped pixels.
 * NOTE(review): the j_rev_dct(block) call is not visible here. */
3754 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3757 put_pixels_clamped_c(block, dest, line_size);
/* jref IDCT + accumulate: transform `block`, then add clamped pixels.
 * NOTE(review): the j_rev_dct(block) call is not visible here. */
3759 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3762 add_pixels_clamped_c(block, dest, line_size);
/* 4x4 (lowres==1) jref IDCT + store; IDCT call not visible here. */
3765 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3768 put_pixels_clamped4_c(block, dest, line_size);
/* 4x4 (lowres==1) jref IDCT + accumulate; IDCT call not visible here. */
3770 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3773 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 (lowres==2) jref IDCT + store; IDCT call not visible here. */
3776 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3779 put_pixels_clamped2_c(block, dest, line_size);
/* 2x2 (lowres==2) jref IDCT + accumulate; IDCT call not visible here. */
3781 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3784 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 (lowres==3) "IDCT": output is just the rounded, clipped DC,
 * (block[0] + 4) >> 3, clamped through the crop table. */
3787 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3789 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3791 dest[0] = cm[(block[0] + 4)>>3];
/* 1x1 (lowres==3) "IDCT", accumulate form: add the rounded DC to the
 * existing pixel and clamp through the crop table. */
3793 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3795 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3797 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3800 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
3802 /* init static data */
/*
 * One-time initialisation of static lookup tables: ff_cropTbl (a
 * clipping table — 256 identity entries flanked by MAX_NEG_CROP
 * saturated entries on each side), ff_squareTbl ((i-256)^2), and the
 * inverse zigzag table.
 * NOTE(review): the low-side ff_cropTbl entries (index i, clamped to
 * 0) and any #ifdef guards around inv_zigzag_direct16 are not visible
 * in this fragment.
 */
3803 void dsputil_static_init(void)
3807 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3808 for(i=0;i<MAX_NEG_CROP;i++) {
3810 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3813 for(i=0;i<512;i++) {
3814 ff_squareTbl[i] = (i - 256) * (i - 256);
3817 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/*
 * Checks that the compiler actually honours 16-byte stack alignment
 * (by testing the address of an aligned local) and logs a loud
 * warning on MMX/AltiVec builds if it does not — such a miscompile
 * makes the SIMD code slow or crashy.
 * NOTE(review): the did_fail guard usage and the return value are not
 * visible in this fragment.
 */
3820 int ff_check_alignment(void){
3821 static int did_fail=0;
3822 DECLARE_ALIGNED_16(int, aligned);
3824 if((long)&aligned & 15){
3826 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3827 av_log(NULL, AV_LOG_ERROR,
3828 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3829 "and may be very slow or crash. This is not a bug in libavcodec,\n"
3830 "but in the compiler. Do not report crashes to FFmpeg developers.\n");
3839 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3843 ff_check_alignment();
3845 #ifdef CONFIG_ENCODERS
3846 if(avctx->dct_algo==FF_DCT_FASTINT) {
3847 c->fdct = fdct_ifast;
3848 c->fdct248 = fdct_ifast248;
3850 else if(avctx->dct_algo==FF_DCT_FAAN) {
3851 c->fdct = ff_faandct;
3852 c->fdct248 = ff_faandct248;
3855 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3856 c->fdct248 = ff_fdct248_islow;
3858 #endif //CONFIG_ENCODERS
3860 if(avctx->lowres==1){
3861 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
3862 c->idct_put= ff_jref_idct4_put;
3863 c->idct_add= ff_jref_idct4_add;
3865 c->idct_put= ff_h264_lowres_idct_put_c;
3866 c->idct_add= ff_h264_lowres_idct_add_c;
3868 c->idct = j_rev_dct4;
3869 c->idct_permutation_type= FF_NO_IDCT_PERM;
3870 }else if(avctx->lowres==2){
3871 c->idct_put= ff_jref_idct2_put;
3872 c->idct_add= ff_jref_idct2_add;
3873 c->idct = j_rev_dct2;
3874 c->idct_permutation_type= FF_NO_IDCT_PERM;
3875 }else if(avctx->lowres==3){
3876 c->idct_put= ff_jref_idct1_put;
3877 c->idct_add= ff_jref_idct1_add;
3878 c->idct = j_rev_dct1;
3879 c->idct_permutation_type= FF_NO_IDCT_PERM;
3881 if(avctx->idct_algo==FF_IDCT_INT){
3882 c->idct_put= ff_jref_idct_put;
3883 c->idct_add= ff_jref_idct_add;
3884 c->idct = j_rev_dct;
3885 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3886 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
3887 avctx->idct_algo==FF_IDCT_VP3){
3888 c->idct_put= ff_vp3_idct_put_c;
3889 c->idct_add= ff_vp3_idct_add_c;
3890 c->idct = ff_vp3_idct_c;
3891 c->idct_permutation_type= FF_NO_IDCT_PERM;
3892 }else{ //accurate/default
3893 c->idct_put= simple_idct_put;
3894 c->idct_add= simple_idct_add;
3895 c->idct = simple_idct;
3896 c->idct_permutation_type= FF_NO_IDCT_PERM;
3900 if (ENABLE_H264_DECODER) {
3901 c->h264_idct_add= ff_h264_idct_add_c;
3902 c->h264_idct8_add= ff_h264_idct8_add_c;
3903 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3904 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3907 c->get_pixels = get_pixels_c;
3908 c->diff_pixels = diff_pixels_c;
3909 c->put_pixels_clamped = put_pixels_clamped_c;
3910 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3911 c->add_pixels_clamped = add_pixels_clamped_c;
3912 c->add_pixels8 = add_pixels8_c;
3913 c->add_pixels4 = add_pixels4_c;
3914 c->sum_abs_dctelem = sum_abs_dctelem_c;
3917 c->clear_blocks = clear_blocks_c;
3918 c->pix_sum = pix_sum_c;
3919 c->pix_norm1 = pix_norm1_c;
3921 /* TODO [0] 16 [1] 8 */
3922 c->pix_abs[0][0] = pix_abs16_c;
3923 c->pix_abs[0][1] = pix_abs16_x2_c;
3924 c->pix_abs[0][2] = pix_abs16_y2_c;
3925 c->pix_abs[0][3] = pix_abs16_xy2_c;
3926 c->pix_abs[1][0] = pix_abs8_c;
3927 c->pix_abs[1][1] = pix_abs8_x2_c;
3928 c->pix_abs[1][2] = pix_abs8_y2_c;
3929 c->pix_abs[1][3] = pix_abs8_xy2_c;
3931 #define dspfunc(PFX, IDX, NUM) \
3932 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3933 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3934 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3935 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3937 dspfunc(put, 0, 16);
3938 dspfunc(put_no_rnd, 0, 16);
3940 dspfunc(put_no_rnd, 1, 8);
3944 dspfunc(avg, 0, 16);
3945 dspfunc(avg_no_rnd, 0, 16);
3947 dspfunc(avg_no_rnd, 1, 8);
3952 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3953 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3955 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3956 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3957 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3958 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3959 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3960 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3961 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3962 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3963 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3965 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3966 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3967 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3968 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3969 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3970 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3971 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3972 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3973 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3975 #define dspfunc(PFX, IDX, NUM) \
3976 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3977 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3978 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3979 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3980 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3981 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3982 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3983 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3984 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3985 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3986 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3987 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3988 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3989 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3990 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3991 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3993 dspfunc(put_qpel, 0, 16);
3994 dspfunc(put_no_rnd_qpel, 0, 16);
3996 dspfunc(avg_qpel, 0, 16);
3997 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3999 dspfunc(put_qpel, 1, 8);
4000 dspfunc(put_no_rnd_qpel, 1, 8);
4002 dspfunc(avg_qpel, 1, 8);
4003 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4005 dspfunc(put_h264_qpel, 0, 16);
4006 dspfunc(put_h264_qpel, 1, 8);
4007 dspfunc(put_h264_qpel, 2, 4);
4008 dspfunc(put_h264_qpel, 3, 2);
4009 dspfunc(avg_h264_qpel, 0, 16);
4010 dspfunc(avg_h264_qpel, 1, 8);
4011 dspfunc(avg_h264_qpel, 2, 4);
4014 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4015 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4016 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4017 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4018 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4019 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4020 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4022 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4023 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4024 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4025 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4026 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4027 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4028 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4029 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4030 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4031 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4032 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4033 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4034 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4035 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4036 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4037 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4038 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4039 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4040 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4041 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4043 #ifdef CONFIG_CAVS_DECODER
4044 ff_cavsdsp_init(c,avctx);
4046 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4047 ff_vc1dsp_init(c,avctx);
4049 #if defined(CONFIG_H264_ENCODER)
4050 ff_h264dsp_init(c,avctx);
4053 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4054 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4055 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4056 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4057 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4058 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4059 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4060 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4062 #define SET_CMP_FUNC(name) \
4063 c->name[0]= name ## 16_c;\
4064 c->name[1]= name ## 8x8_c;
4066 SET_CMP_FUNC(hadamard8_diff)
4067 c->hadamard8_diff[4]= hadamard8_intra16_c;
4068 SET_CMP_FUNC(dct_sad)
4069 SET_CMP_FUNC(dct_max)
4071 SET_CMP_FUNC(dct264_sad)
4073 c->sad[0]= pix_abs16_c;
4074 c->sad[1]= pix_abs8_c;
4078 SET_CMP_FUNC(quant_psnr)
4081 c->vsad[0]= vsad16_c;
4082 c->vsad[4]= vsad_intra16_c;
4083 c->vsse[0]= vsse16_c;
4084 c->vsse[4]= vsse_intra16_c;
4085 c->nsse[0]= nsse16_c;
4086 c->nsse[1]= nsse8_c;
4087 #ifdef CONFIG_SNOW_ENCODER
4088 c->w53[0]= w53_16_c;
4090 c->w97[0]= w97_16_c;
4094 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4096 c->add_bytes= add_bytes_c;
4097 c->diff_bytes= diff_bytes_c;
4098 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4099 c->bswap_buf= bswap_buf;
4101 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4102 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4103 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4104 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4105 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4106 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4107 c->h264_loop_filter_strength= NULL;
4109 c->h263_h_loop_filter= h263_h_loop_filter_c;
4110 c->h263_v_loop_filter= h263_v_loop_filter_c;
4112 c->h261_loop_filter= h261_loop_filter_c;
4114 c->try_8x8basis= try_8x8basis_c;
4115 c->add_8x8basis= add_8x8basis_c;
4117 #ifdef CONFIG_SNOW_DECODER
4118 c->vertical_compose97i = ff_snow_vertical_compose97i;
4119 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4120 c->inner_add_yblock = ff_snow_inner_add_yblock;
4123 #ifdef CONFIG_VORBIS_DECODER
4124 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4126 c->vector_fmul = vector_fmul_c;
4127 c->vector_fmul_reverse = vector_fmul_reverse_c;
4128 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4129 c->float_to_int16 = ff_float_to_int16_c;
4131 c->shrink[0]= ff_img_copy_plane;
4132 c->shrink[1]= ff_shrink22;
4133 c->shrink[2]= ff_shrink44;
4134 c->shrink[3]= ff_shrink88;
4136 c->prefetch= just_return;
4138 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4139 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4141 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4142 if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
4143 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4144 if (ENABLE_SPARC) dsputil_init_vis (c, avctx);
4145 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4146 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4147 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4148 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4149 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
4151 for(i=0; i<64; i++){
4152 if(!c->put_2tap_qpel_pixels_tab[0][i])
4153 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4154 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4155 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4158 switch(c->idct_permutation_type){
4159 case FF_NO_IDCT_PERM:
4161 c->idct_permutation[i]= i;
4163 case FF_LIBMPEG2_IDCT_PERM:
4165 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4167 case FF_SIMPLE_IDCT_PERM:
4169 c->idct_permutation[i]= simple_mmx_permutation[i];
4171 case FF_TRANSPOSE_IDCT_PERM:
4173 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4175 case FF_PARTTRANS_IDCT_PERM:
4177 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4180 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");