3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
35 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
/* Byte lookup table that the *_pixels_clamped functions index through
 * cropTbl + MAX_NEG_CROP, so indices from -MAX_NEG_CROP up to
 * 255 + MAX_NEG_CROP stay inside the array.
 * Zero-initialised here. NOTE(review): the real contents are presumably
 * written by an init routine that is not visible in this part of the file. */
37 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Lookup table that pix_norm1_c and sse4_c/sse8_c/sse16_c index through
 * squareTbl + 256, i.e. with indices in -256..255 (pixel values or pixel
 * differences). NOTE(review): presumably holds the square of the index;
 * confirm against the initialisation code. */
38 uint32_t squareTbl[512] = {0, };
/* Zigzag scan order for an 8x8 block: entry k is the raster index
 * (row*8 + column) of the k-th position along the zigzag path.
 * The 64 entries are a permutation of 0..63. */
40 const uint8_t ff_zigzag_direct[64] = {
41 0, 1, 8, 16, 9, 2, 3, 10,
42 17, 24, 32, 25, 18, 11, 4, 5,
43 12, 19, 26, 33, 40, 48, 41, 34,
44 27, 20, 13, 6, 7, 14, 21, 28,
45 35, 42, 49, 56, 57, 50, 43, 36,
46 29, 22, 15, 23, 30, 37, 44, 51,
47 58, 59, 52, 45, 38, 31, 39, 46,
48 53, 60, 61, 54, 47, 55, 62, 63
51 /* Specific zigzag scan for 248 idct. NOTE that unlike the
52 specification, we interleave the fields */
53 const uint8_t ff_zigzag248_direct[64] = {
54 0, 8, 1, 9, 16, 24, 2, 10,
55 17, 25, 32, 40, 48, 56, 33, 41,
56 18, 26, 3, 11, 4, 12, 19, 27,
57 34, 42, 49, 57, 50, 58, 35, 43,
58 20, 28, 5, 13, 6, 14, 21, 29,
59 36, 44, 51, 59, 52, 60, 37, 45,
60 22, 30, 7, 15, 23, 31, 38, 46,
61 53, 61, 54, 62, 39, 47, 55, 63,
64 /* non-permuted inverse of ff_zigzag_direct, plus 1, for the MMX quantizer */
65 uint16_t __align8 inv_zigzag_direct16[64] = {0, };
/* Alternate scan order for an 8x8 block that starts along the first row
 * (0, 1, 2, 3, ...): entry k is the raster index (row*8 + column) of the
 * k-th scanned position. The 64 entries are a permutation of 0..63. */
67 const uint8_t ff_alternate_horizontal_scan[64] = {
68 0, 1, 2, 3, 8, 9, 16, 17,
69 10, 11, 4, 5, 6, 7, 15, 14,
70 13, 12, 19, 18, 24, 25, 32, 33,
71 26, 27, 20, 21, 22, 23, 28, 29,
72 30, 31, 34, 35, 40, 41, 48, 49,
73 42, 43, 36, 37, 38, 39, 44, 45,
74 46, 47, 50, 51, 56, 57, 58, 59,
75 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate scan order for an 8x8 block that starts down the first column
 * (0, 8, 16, 24, ...): entry k is the raster index (row*8 + column) of the
 * k-th scanned position. */
78 const uint8_t ff_alternate_vertical_scan[64] = {
79 0, 8, 16, 24, 1, 9, 2, 10,
80 17, 25, 32, 40, 48, 56, 57, 49,
81 41, 33, 26, 18, 3, 11, 4, 12,
82 19, 27, 34, 42, 50, 58, 35, 43,
83 51, 59, 20, 28, 5, 13, 6, 14,
84 21, 29, 36, 44, 52, 60, 37, 45,
85 53, 61, 22, 30, 7, 15, 23, 31,
86 38, 46, 54, 62, 39, 47, 55, 63,
89 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
90 const uint32_t inverse[256]={
91 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
92 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
93 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
94 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
95 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
96 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
97 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
98 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
99 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
100 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
101 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
102 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
103 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
104 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
105 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
106 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
107 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
108 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
109 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
110 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
111 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
112 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
113 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
114 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
115 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
116 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
117 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
118 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
119 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
120 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
121 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
122 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
125 /* Input permutation for the simple_idct_mmx */
126 static const uint8_t simple_mmx_permutation[64]={
127 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
128 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
129 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
130 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
131 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
132 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
133 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
134 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/**
 * Sum of all pixel values of a 16x16 block.
 *
 * @param pix       top-left pixel of the block
 * @param line_size distance in bytes between the starts of consecutive rows
 * @return sum of the 256 pixel values
 *
 * NOTE(review): only the loop skeleton (16 rows, 16 columns, row advance of
 * line_size) survived in the listing this was taken from; the accumulation
 * and the return are restored from the function name and loop shape.
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int sum = 0;
    int row, col;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += pix[col];
        pix += line_size;
    }
    return sum;
}
159 static int pix_norm1_c(uint8_t * pix, int line_size)
162 uint32_t *sq = squareTbl + 256;
165 for (i = 0; i < 16; i++) {
166 for (j = 0; j < 16; j += 8) {
177 #if LONG_MAX > 2147483647
178 register uint64_t x=*(uint64_t*)pix;
180 s += sq[(x>>8)&0xff];
181 s += sq[(x>>16)&0xff];
182 s += sq[(x>>24)&0xff];
183 s += sq[(x>>32)&0xff];
184 s += sq[(x>>40)&0xff];
185 s += sq[(x>>48)&0xff];
186 s += sq[(x>>56)&0xff];
188 register uint32_t x=*(uint32_t*)pix;
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 x=*(uint32_t*)(pix+4);
195 s += sq[(x>>8)&0xff];
196 s += sq[(x>>16)&0xff];
197 s += sq[(x>>24)&0xff];
202 pix += line_size - 16;
/**
 * Copy w 32-bit words from src to dst, passing each one through bswap_32().
 *
 * @param dst destination buffer, at least w words
 * @param src source buffer, at least w words
 * @param w   number of 32-bit words; nothing is written when w <= 0
 *
 * NOTE(review): the listing showed an 8-way unrolled loop followed by a
 * one-word tail loop whose header was missing; both are folded into a single
 * loop, which visits the same elements in the same order.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i<w; i++)
        dst[i]= bswap_32(src[i]);
}
225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
228 uint32_t *sq = squareTbl + 256;
231 for (i = 0; i < h; i++) {
232 s += sq[pix1[0] - pix2[0]];
233 s += sq[pix1[1] - pix2[1]];
234 s += sq[pix1[2] - pix2[2]];
235 s += sq[pix1[3] - pix2[3]];
242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
245 uint32_t *sq = squareTbl + 256;
248 for (i = 0; i < h; i++) {
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
253 s += sq[pix1[4] - pix2[4]];
254 s += sq[pix1[5] - pix2[5]];
255 s += sq[pix1[6] - pix2[6]];
256 s += sq[pix1[7] - pix2[7]];
263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
266 uint32_t *sq = squareTbl + 256;
269 for (i = 0; i < h; i++) {
270 s += sq[pix1[ 0] - pix2[ 0]];
271 s += sq[pix1[ 1] - pix2[ 1]];
272 s += sq[pix1[ 2] - pix2[ 2]];
273 s += sq[pix1[ 3] - pix2[ 3]];
274 s += sq[pix1[ 4] - pix2[ 4]];
275 s += sq[pix1[ 5] - pix2[ 5]];
276 s += sq[pix1[ 6] - pix2[ 6]];
277 s += sq[pix1[ 7] - pix2[ 7]];
278 s += sq[pix1[ 8] - pix2[ 8]];
279 s += sq[pix1[ 9] - pix2[ 9]];
280 s += sq[pix1[10] - pix2[10]];
281 s += sq[pix1[11] - pix2[11]];
282 s += sq[pix1[12] - pix2[12]];
283 s += sq[pix1[13] - pix2[13]];
284 s += sq[pix1[14] - pix2[14]];
285 s += sq[pix1[15] - pix2[15]];
/* Wavelet-domain comparison of two pixel blocks of width w and height h; the
 * body is only compiled when CONFIG_SNOW_ENCODER is defined.
 * Steps visible here:
 *   1. (pix1 - pix2) << 4 is stored into tmp[] with a row pitch of 16;
 *   2. ff_spatial_dwt() is called on tmp[] with `type` and 3 (w == 8) or
 *      4 decomposition levels;
 *   3. per level and orientation, coefficients are multiplied by an entry of
 *      scale[type][dec_count-3][level][ori];
 *   4. ABS() of every tmp[] entry is added to s.
 * NOTE(review): many lines of this function are missing from this listing
 * (local declarations, most of scale[], what happens to the weighted value v,
 * the row advance of pix1/pix2, the final return), so the exact result cannot
 * be stated from here. */
294 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
295 #ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
297 const int dec_count= w==8 ? 3 : 4;
301 static const int scale[2][2][4][4]={
305 {268, 239, 239, 213},
310 {344, 310, 310, 280},
318 {275, 245, 245, 218},
323 {352, 317, 317, 286},
/* difference of the two blocks, scaled by 16, four columns per iteration */
332 for (i = 0; i < h; i++) {
333 for (j = 0; j < w; j+=4) {
334 tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
335 tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
336 tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
337 tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
343 ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
/* orientation 0 is only visited at level 0; sx/sy select the subband origin */
347 for(level=0; level<dec_count; level++){
348 for(ori= level ? 1 : 0; ori<4; ori++){
349 int sx= (ori&1) ? 1<<level: 0;
350 int stride= 16<<(dec_count-level);
351 int sy= (ori&2) ? stride>>1 : 0;
354 for(i=0; i<size; i++){
355 for(j=0; j<size; j++){
356 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* accumulate absolute values of the (weighted) coefficients */
363 for (i = 0; i < h; i++) {
364 for (j = 0; j < w; j+=4) {
365 s+= ABS(tmp[16*i+j+0]);
366 s+= ABS(tmp[16*i+j+1]);
367 s+= ABS(tmp[16*i+j+2]);
368 s+= ABS(tmp[16*i+j+3]);
/**
 * Thin wrapper: w_c() on an 8-pixel-wide block with type = 1.
 * NOTE(review): by the naming, type 1 presumably selects the 5/3 wavelet;
 * confirm in ff_spatial_dwt().
 */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
/**
 * Thin wrapper: w_c() on an 8-pixel-wide block with type = 0.
 * NOTE(review): by the naming, type 0 presumably selects the 9/7 wavelet;
 * confirm in ff_spatial_dwt().
 */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
/**
 * Thin wrapper: w_c() on a 16-pixel-wide block with type = 1.
 * NOTE(review): by the naming, type 1 presumably selects the 5/3 wavelet;
 * confirm in ff_spatial_dwt().
 */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
/**
 * Thin wrapper: w_c() on a 16-pixel-wide block with type = 0.
 * NOTE(review): by the naming, type 0 presumably selects the 9/7 wavelet;
 * confirm in ff_spatial_dwt().
 */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
393 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
397 /* read the pixels */
399 block[0] = pixels[0];
400 block[1] = pixels[1];
401 block[2] = pixels[2];
402 block[3] = pixels[3];
403 block[4] = pixels[4];
404 block[5] = pixels[5];
405 block[6] = pixels[6];
406 block[7] = pixels[7];
412 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
413 const uint8_t *s2, int stride){
416 /* read the pixels */
418 block[0] = s1[0] - s2[0];
419 block[1] = s1[1] - s2[1];
420 block[2] = s1[2] - s2[2];
421 block[3] = s1[3] - s2[3];
422 block[4] = s1[4] - s2[4];
423 block[5] = s1[5] - s2[5];
424 block[6] = s1[6] - s2[6];
425 block[7] = s1[7] - s2[7];
/* Stores coefficients as pixels, 8 per row: pixels[k] = cm[block[k]] for
 * k = 0..7. cm points MAX_NEG_CROP entries into cropTbl, so coefficients down
 * to -MAX_NEG_CROP are valid indices.
 * NOTE(review): the end of the parameter list, the row loop and the pointer
 * advances are missing from this listing; presumably cm[] clamps to 0..255
 * and 8 rows are processed -- confirm. */
433 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
437 uint8_t *cm = cropTbl + MAX_NEG_CROP;
439 /* write the pixels: every coefficient is mapped through cm[] */
441 pixels[0] = cm[block[0]];
442 pixels[1] = cm[block[1]];
443 pixels[2] = cm[block[2]];
444 pixels[3] = cm[block[3]];
445 pixels[4] = cm[block[4]];
446 pixels[5] = cm[block[5]];
447 pixels[6] = cm[block[6]];
448 pixels[7] = cm[block[7]];
/* Stores coefficients as pixels, 4 per row: pixels[k] = cm[block[k]] for
 * k = 0..3, with cm = cropTbl + MAX_NEG_CROP.
 * NOTE(review): the end of the parameter list, the row loop (row count and
 * block pitch) and the pointer advances are missing from this listing. */
455 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
459 uint8_t *cm = cropTbl + MAX_NEG_CROP;
461 /* write the pixels: every coefficient is mapped through cm[] */
463 pixels[0] = cm[block[0]];
464 pixels[1] = cm[block[1]];
465 pixels[2] = cm[block[2]];
466 pixels[3] = cm[block[3]];
/* Stores coefficients as pixels, 2 per row: pixels[k] = cm[block[k]] for
 * k = 0..1, with cm = cropTbl + MAX_NEG_CROP.
 * NOTE(review): the end of the parameter list, the row loop (row count and
 * block pitch) and the pointer advances are missing from this listing. */
473 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
477 uint8_t *cm = cropTbl + MAX_NEG_CROP;
479 /* write the pixels: every coefficient is mapped through cm[] */
481 pixels[0] = cm[block[0]];
482 pixels[1] = cm[block[1]];
/* Converts an 8x8 block of signed coefficients to unsigned pixels: a value
 * that passes the range checks is stored as (uint8_t)(*block + 128).
 * After each row of 8, pixels skips ahead by line_size - 8, which implies the
 * inner loop advances pixels by one per element.
 * NOTE(review): the first range check, the values stored for out-of-range
 * coefficients and the per-element increments are missing from this listing;
 * presumably values below -128 / above 127 saturate to 0 / 255 -- confirm. */
489 static void put_signed_pixels_clamped_c(const DCTELEM *block,
490 uint8_t *restrict pixels,
495 for (i = 0; i < 8; i++) {
496 for (j = 0; j < 8; j++) {
499 else if (*block > 127)
502 *pixels = (uint8_t)(*block + 128);
506 pixels += (line_size - 8);
/* Adds coefficients to existing pixels, 8 per row:
 * pixels[k] = cm[pixels[k] + block[k]] for k = 0..7, with
 * cm = cropTbl + MAX_NEG_CROP so that sums outside 0..255 still index validly
 * (within MAX_NEG_CROP on either side).
 * NOTE(review): the end of the parameter list, the row loop and the pointer
 * advances are missing from this listing; presumably cm[] clamps to 0..255
 * and 8 rows are processed -- confirm. */
510 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
514 uint8_t *cm = cropTbl + MAX_NEG_CROP;
516 /* read-modify-write: pixel plus coefficient, mapped through cm[] */
518 pixels[0] = cm[pixels[0] + block[0]];
519 pixels[1] = cm[pixels[1] + block[1]];
520 pixels[2] = cm[pixels[2] + block[2]];
521 pixels[3] = cm[pixels[3] + block[3]];
522 pixels[4] = cm[pixels[4] + block[4]];
523 pixels[5] = cm[pixels[5] + block[5]];
524 pixels[6] = cm[pixels[6] + block[6]];
525 pixels[7] = cm[pixels[7] + block[7]];
/* Adds coefficients to existing pixels, 4 per row:
 * pixels[k] = cm[pixels[k] + block[k]] for k = 0..3, with
 * cm = cropTbl + MAX_NEG_CROP.
 * NOTE(review): the end of the parameter list, the row loop (row count and
 * block pitch) and the pointer advances are missing from this listing. */
531 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
535 uint8_t *cm = cropTbl + MAX_NEG_CROP;
537 /* read-modify-write: pixel plus coefficient, mapped through cm[] */
539 pixels[0] = cm[pixels[0] + block[0]];
540 pixels[1] = cm[pixels[1] + block[1]];
541 pixels[2] = cm[pixels[2] + block[2]];
542 pixels[3] = cm[pixels[3] + block[3]];
/* Adds coefficients to existing pixels, 2 per row:
 * pixels[k] = cm[pixels[k] + block[k]] for k = 0..1, with
 * cm = cropTbl + MAX_NEG_CROP.
 * NOTE(review): the end of the parameter list, the row loop (row count and
 * block pitch) and the pointer advances are missing from this listing. */
548 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
552 uint8_t *cm = cropTbl + MAX_NEG_CROP;
554 /* read-modify-write: pixel plus coefficient, mapped through cm[] */
556 pixels[0] = cm[pixels[0] + block[0]];
557 pixels[1] = cm[pixels[1] + block[1]];
563 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
567 pixels[0] += block[0];
568 pixels[1] += block[1];
569 pixels[2] += block[2];
570 pixels[3] += block[3];
571 pixels[4] += block[4];
572 pixels[5] += block[5];
573 pixels[6] += block[6];
574 pixels[7] += block[7];
/* Adds coefficients to pixels, 4 per row, without clamping: the sum is stored
 * back into the uint8_t pixel, so it wraps modulo 256.
 * NOTE(review): the row loop and the pointer advances are missing from this
 * listing; the row count and the pitch of block cannot be read off here. */
580 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
584 pixels[0] += block[0];
585 pixels[1] += block[1];
586 pixels[2] += block[2];
587 pixels[3] += block[3];
595 #define PIXOP2(OPNAME, OP) \
596 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
600 OP(*((uint64_t*)block), LD64(pixels));\
606 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
610 const uint64_t a= LD64(pixels );\
611 const uint64_t b= LD64(pixels+1);\
612 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
618 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
622 const uint64_t a= LD64(pixels );\
623 const uint64_t b= LD64(pixels+1);\
624 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
630 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
634 const uint64_t a= LD64(pixels );\
635 const uint64_t b= LD64(pixels+line_size);\
636 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
642 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
646 const uint64_t a= LD64(pixels );\
647 const uint64_t b= LD64(pixels+line_size);\
648 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
654 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
657 const uint64_t a= LD64(pixels );\
658 const uint64_t b= LD64(pixels+1);\
659 uint64_t l0= (a&0x0303030303030303ULL)\
660 + (b&0x0303030303030303ULL)\
661 + 0x0202020202020202ULL;\
662 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
663 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
667 for(i=0; i<h; i+=2){\
668 uint64_t a= LD64(pixels );\
669 uint64_t b= LD64(pixels+1);\
670 l1= (a&0x0303030303030303ULL)\
671 + (b&0x0303030303030303ULL);\
672 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
673 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
674 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
679 l0= (a&0x0303030303030303ULL)\
680 + (b&0x0303030303030303ULL)\
681 + 0x0202020202020202ULL;\
682 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
683 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
684 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
690 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
693 const uint64_t a= LD64(pixels );\
694 const uint64_t b= LD64(pixels+1);\
695 uint64_t l0= (a&0x0303030303030303ULL)\
696 + (b&0x0303030303030303ULL)\
697 + 0x0101010101010101ULL;\
698 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
699 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
703 for(i=0; i<h; i+=2){\
704 uint64_t a= LD64(pixels );\
705 uint64_t b= LD64(pixels+1);\
706 l1= (a&0x0303030303030303ULL)\
707 + (b&0x0303030303030303ULL);\
708 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
709 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
710 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
715 l0= (a&0x0303030303030303ULL)\
716 + (b&0x0303030303030303ULL)\
717 + 0x0101010101010101ULL;\
718 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
719 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
720 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
726 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
727 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
728 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
729 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
730 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
731 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
732 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* Store operator for the 64-bit PIXOP2 variant: replaces a with the byte-wise
 * average of a and b, rounded up, for all eight bytes at once.
 * (a|b) - ((a^b)>>1) equals ceil((a+b)/2) per byte; the 0xFE mask drops each
 * byte's low bit before the shift so nothing leaks into the neighbouring byte. */
734 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
735 #else // end of the 64 bit variant; the 32 bit variant follows
737 #define PIXOP2(OPNAME, OP) \
738 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
741 OP(*((uint16_t*)(block )), LD16(pixels ));\
746 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
749 OP(*((uint32_t*)(block )), LD32(pixels ));\
754 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
757 OP(*((uint32_t*)(block )), LD32(pixels ));\
758 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
763 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
764 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
767 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
768 int src_stride1, int src_stride2, int h){\
772 a= LD32(&src1[i*src_stride1 ]);\
773 b= LD32(&src2[i*src_stride2 ]);\
774 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
775 a= LD32(&src1[i*src_stride1+4]);\
776 b= LD32(&src2[i*src_stride2+4]);\
777 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
781 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
782 int src_stride1, int src_stride2, int h){\
786 a= LD32(&src1[i*src_stride1 ]);\
787 b= LD32(&src2[i*src_stride2 ]);\
788 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
789 a= LD32(&src1[i*src_stride1+4]);\
790 b= LD32(&src2[i*src_stride2+4]);\
791 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
795 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
796 int src_stride1, int src_stride2, int h){\
800 a= LD32(&src1[i*src_stride1 ]);\
801 b= LD32(&src2[i*src_stride2 ]);\
802 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
806 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
807 int src_stride1, int src_stride2, int h){\
811 a= LD16(&src1[i*src_stride1 ]);\
812 b= LD16(&src2[i*src_stride2 ]);\
813 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
817 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
818 int src_stride1, int src_stride2, int h){\
819 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
820 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
823 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
824 int src_stride1, int src_stride2, int h){\
825 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
826 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
829 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
830 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
833 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
834 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
837 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
838 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
841 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
842 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
845 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
846 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
849 uint32_t a, b, c, d, l0, l1, h0, h1;\
850 a= LD32(&src1[i*src_stride1]);\
851 b= LD32(&src2[i*src_stride2]);\
852 c= LD32(&src3[i*src_stride3]);\
853 d= LD32(&src4[i*src_stride4]);\
854 l0= (a&0x03030303UL)\
857 h0= ((a&0xFCFCFCFCUL)>>2)\
858 + ((b&0xFCFCFCFCUL)>>2);\
859 l1= (c&0x03030303UL)\
861 h1= ((c&0xFCFCFCFCUL)>>2)\
862 + ((d&0xFCFCFCFCUL)>>2);\
863 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
864 a= LD32(&src1[i*src_stride1+4]);\
865 b= LD32(&src2[i*src_stride2+4]);\
866 c= LD32(&src3[i*src_stride3+4]);\
867 d= LD32(&src4[i*src_stride4+4]);\
868 l0= (a&0x03030303UL)\
871 h0= ((a&0xFCFCFCFCUL)>>2)\
872 + ((b&0xFCFCFCFCUL)>>2);\
873 l1= (c&0x03030303UL)\
875 h1= ((c&0xFCFCFCFCUL)>>2)\
876 + ((d&0xFCFCFCFCUL)>>2);\
877 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
881 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
882 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
885 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
886 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
889 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
890 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
893 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
894 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
897 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
898 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
901 uint32_t a, b, c, d, l0, l1, h0, h1;\
902 a= LD32(&src1[i*src_stride1]);\
903 b= LD32(&src2[i*src_stride2]);\
904 c= LD32(&src3[i*src_stride3]);\
905 d= LD32(&src4[i*src_stride4]);\
906 l0= (a&0x03030303UL)\
909 h0= ((a&0xFCFCFCFCUL)>>2)\
910 + ((b&0xFCFCFCFCUL)>>2);\
911 l1= (c&0x03030303UL)\
913 h1= ((c&0xFCFCFCFCUL)>>2)\
914 + ((d&0xFCFCFCFCUL)>>2);\
915 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
916 a= LD32(&src1[i*src_stride1+4]);\
917 b= LD32(&src2[i*src_stride2+4]);\
918 c= LD32(&src3[i*src_stride3+4]);\
919 d= LD32(&src4[i*src_stride4+4]);\
920 l0= (a&0x03030303UL)\
923 h0= ((a&0xFCFCFCFCUL)>>2)\
924 + ((b&0xFCFCFCFCUL)>>2);\
925 l1= (c&0x03030303UL)\
927 h1= ((c&0xFCFCFCFCUL)>>2)\
928 + ((d&0xFCFCFCFCUL)>>2);\
929 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
932 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
933 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
934 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
935 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
938 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
939 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
940 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
943 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
945 int i, a0, b0, a1, b1;\
952 for(i=0; i<h; i+=2){\
958 block[0]= (a1+a0)>>2; /* FIXME non put */\
959 block[1]= (b1+b0)>>2;\
969 block[0]= (a1+a0)>>2;\
970 block[1]= (b1+b0)>>2;\
976 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
979 const uint32_t a= LD32(pixels );\
980 const uint32_t b= LD32(pixels+1);\
981 uint32_t l0= (a&0x03030303UL)\
984 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
985 + ((b&0xFCFCFCFCUL)>>2);\
989 for(i=0; i<h; i+=2){\
990 uint32_t a= LD32(pixels );\
991 uint32_t b= LD32(pixels+1);\
992 l1= (a&0x03030303UL)\
994 h1= ((a&0xFCFCFCFCUL)>>2)\
995 + ((b&0xFCFCFCFCUL)>>2);\
996 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1001 l0= (a&0x03030303UL)\
1004 h0= ((a&0xFCFCFCFCUL)>>2)\
1005 + ((b&0xFCFCFCFCUL)>>2);\
1006 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1012 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1015 for(j=0; j<2; j++){\
1017 const uint32_t a= LD32(pixels );\
1018 const uint32_t b= LD32(pixels+1);\
1019 uint32_t l0= (a&0x03030303UL)\
1022 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1023 + ((b&0xFCFCFCFCUL)>>2);\
1027 for(i=0; i<h; i+=2){\
1028 uint32_t a= LD32(pixels );\
1029 uint32_t b= LD32(pixels+1);\
1030 l1= (a&0x03030303UL)\
1031 + (b&0x03030303UL);\
1032 h1= ((a&0xFCFCFCFCUL)>>2)\
1033 + ((b&0xFCFCFCFCUL)>>2);\
1034 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1039 l0= (a&0x03030303UL)\
1042 h0= ((a&0xFCFCFCFCUL)>>2)\
1043 + ((b&0xFCFCFCFCUL)>>2);\
1044 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1048 pixels+=4-line_size*(h+1);\
1049 block +=4-line_size*h;\
1053 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1056 for(j=0; j<2; j++){\
1058 const uint32_t a= LD32(pixels );\
1059 const uint32_t b= LD32(pixels+1);\
1060 uint32_t l0= (a&0x03030303UL)\
1063 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1064 + ((b&0xFCFCFCFCUL)>>2);\
1068 for(i=0; i<h; i+=2){\
1069 uint32_t a= LD32(pixels );\
1070 uint32_t b= LD32(pixels+1);\
1071 l1= (a&0x03030303UL)\
1072 + (b&0x03030303UL);\
1073 h1= ((a&0xFCFCFCFCUL)>>2)\
1074 + ((b&0xFCFCFCFCUL)>>2);\
1075 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1080 l0= (a&0x03030303UL)\
1083 h0= ((a&0xFCFCFCFCUL)>>2)\
1084 + ((b&0xFCFCFCFCUL)>>2);\
1085 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1089 pixels+=4-line_size*(h+1);\
1090 block +=4-line_size*h;\
1094 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1095 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1096 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1098 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1099 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1100 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1101 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* Store operator for the 32-bit PIXOP2 variant: a becomes rnd_avg32(a, b).
 * NOTE(review): meant to be passed as the OP argument of PIXOP2; the
 * instantiation is not visible in this listing. Expands to a bare assignment,
 * so `a` must be an lvalue. */
1103 #define op_avg(a, b) a = rnd_avg32(a, b)
/* Store operator that simply overwrites a with b.
 * NOTE(review): meant to be passed as the OP argument of PIXOP2; the
 * instantiation is not visible in this listing. Expands to a bare assignment,
 * so `a` must be an lvalue. */
1105 #define op_put(a, b) a = b
/* Average of two values, rounded up: (a + b + 1) >> 1.
 * Every argument is parenthesised so that arguments containing operators of
 * lower precedence than '+' (>>, |, ?:, ...) expand as intended. Each argument
 * is evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
/* Average of four values, rounded to nearest: (a + b + c + d + 2) >> 2.
 * Every argument is parenthesised so that arguments containing operators of
 * lower precedence than '+' (>>, |, ?:, ...) expand as intended. Each argument
 * is evaluated exactly once. */
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/**
 * Single-stride front end for put_no_rnd_pixels16_l2(): the destination and
 * both sources all use the same row pitch.
 *
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride row pitch passed for dst, a and b
 * @param h      number of rows
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
/**
 * Single-stride front end for put_no_rnd_pixels8_l2(): the destination and
 * both sources all use the same row pitch.
 *
 * @param dst    destination block
 * @param a      first source block
 * @param b      second source block
 * @param stride row pitch passed for dst, a and b
 * @param h      number of rows
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/**
 * Bilinear interpolation of an 8-pixel-wide block at 1/16-pixel precision.
 *
 * Every output pixel is the weighted sum of its four source neighbours
 * (weights A, B, C, D always add up to 256), plus rounder, shifted right by 8.
 *
 * @param dst     destination; 8 pixels are written per row
 * @param src     source; 9 pixels per row and h+1 rows are read
 * @param stride  row pitch of dst and src
 * @param h       number of rows to produce
 * @param x16     horizontal fraction in 1/16 pixel units
 * @param y16     vertical fraction in 1/16 pixel units
 * @param rounder value added before the final shift
 *
 * NOTE(review): the listing showed the weights and the eight per-column
 * statements; the loop over h rows and the advance of dst/src by stride are
 * restored -- confirm against the upstream file.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i, x;

    for(i=0; i<h; i++){
        for(x=0; x<8; x++)
            dst[x]= (A*src[x] + B*src[x+1] + C*src[stride+x] + D*src[stride+x+1] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/* General motion compensation of an 8-pixel-wide block.
 * For each output pixel a source position (src_x, src_y) is split into an
 * integer part and a fraction (frac_x, frac_y, in units of 1/s with
 * s = 1 << shift). Four cases are visible:
 *   - both coordinates inside [0, width) x [0, height): bilinear blend of the
 *     four neighbouring source pixels;
 *   - only src_x inside: src_y is clipped and the blend is horizontal only;
 *   - only src_y inside: src_x is clipped and the blend is vertical only;
 *   - neither inside: the clipped source pixel is copied unchanged.
 * The (unsigned) casts make a negative coordinate fail the "< width/height"
 * test in the same comparison.
 * NOTE(review): many lines are missing from this listing (how src_x/src_y are
 * derived from ox/oy and dxx/dxy/dyx/dyy, the row loop, and the tail of each
 * blend expression where r and the final shift are presumably applied), so
 * rounding and coordinate stepping cannot be documented from here. */
1146 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1147 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1150 const int s= 1<<shift;
1160 for(x=0; x<8; x++){ //XXX FIXME optimize
1161 int src_x, src_y, frac_x, frac_y, index;
1165 frac_x= src_x&(s-1);
1166 frac_y= src_y&(s-1);
1170 if((unsigned)src_x < width){
1171 if((unsigned)src_y < height){
1172 index= src_x + src_y*stride;
1173 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1174 + src[index +1]* frac_x )*(s-frac_y)
1175 + ( src[index+stride ]*(s-frac_x)
1176 + src[index+stride+1]* frac_x )* frac_y
1179 index= src_x + clip(src_y, 0, height)*stride;
1180 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1181 + src[index +1]* frac_x )*s
1185 if((unsigned)src_y < height){
1186 index= clip(src_x, 0, width) + src_y*stride;
1187 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1188 + src[index+stride ]* frac_y )*s
1191 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
1192 dst[y*stride + x]= src[index ];
/* Third-pel motion compensation, position (0,0): a plain block copy that
 * dispatches to put_pixels2_c / put_pixels4_c / put_pixels8_c /
 * put_pixels16_c for the cases 2, 4, 8 and 16.
 * NOTE(review): the switch header is missing from this listing; the selector
 * is presumably `width`, the only parameter not forwarded -- confirm. */
1204 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1206 case 2: put_pixels2_c (dst, src, stride, height); break;
1207 case 4: put_pixels4_c (dst, src, stride, height); break;
1208 case 8: put_pixels8_c (dst, src, stride, height); break;
1209 case 16:put_pixels16_c(dst, src, stride, height); break;
/**
 * Third-pel interpolation, 1/3 to the right:
 * dst = (2*src[x] + src[x+1] + 1) / 3, with the division done as *683 >> 11.
 *
 * @param dst    destination block
 * @param src    source block; one extra column to the right is read
 * @param stride row pitch of dst and src
 * @param width  block width in pixels
 * @param height block height in rows
 *
 * NOTE(review): the index declaration and the per-row advance of src/dst by
 * stride were missing from the listing and are restored here.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel interpolation, 2/3 to the right:
 * dst = (src[x] + 2*src[x+1] + 1) / 3, with the division done as *683 >> 11.
 *
 * @param dst    destination block
 * @param src    source block; one extra column to the right is read
 * @param stride row pitch of dst and src
 * @param width  block width in pixels
 * @param height block height in rows
 *
 * NOTE(review): the index declaration and the per-row advance of src/dst by
 * stride were missing from the listing and are restored here.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel interpolation, 1/3 down:
 * dst = (2*src[x] + src[x+stride] + 1) / 3, with the division done as *683 >> 11.
 *
 * @param dst    destination block
 * @param src    source block; one extra row below is read
 * @param stride row pitch of dst and src
 * @param width  block width in pixels
 * @param height block height in rows
 *
 * NOTE(review): the index declaration and the per-row advance of src/dst by
 * stride were missing from the listing and are restored here.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel interpolation, 1/3 right and 1/3 down. The four neighbours are
 * weighted 4:3:3:2 (sum 12) and the division by 12 is done as *2731 >> 15:
 * dst = (4*a + 3*b + 3*c + 2*d + 6) / 12, with a = src[x], b = src[x+1],
 * c = src[x+stride], d = src[x+stride+1].
 *
 * @param dst    destination block
 * @param src    source block; one extra column and one extra row are read
 * @param stride row pitch of dst and src
 * @param width  block width in pixels
 * @param height block height in rows
 *
 * NOTE(review): the index declaration and the per-row advance of src/dst by
 * stride were missing from the listing and are restored here.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++)
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
/**
 * Two-dimensional blend of each pixel with its right, lower and
 * lower-right neighbours using weights 3:2:4:3.
 *
 * Stores dst[j] = (2731*(3*a + 2*b + 4*c + 3*d + 6)) >> 15 with
 * a=src[j], b=src[j+1], c=src[j+stride], d=src[j+stride+1];
 * 2731/32768 is approximately 1/12 (the weight sum).
 * Reads one column and one row beyond the block.
 *
 * @param dst    destination, advanced by stride after each row
 * @param src    source, advanced by stride after each row
 * @param stride row stride shared by dst and src
 * @param width  number of columns written per row
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Vertical 1:2 blend of every pixel with the pixel one row below.
 *
 * Stores dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11; 683/2048 is
 * approximately 1/3, so the result is close to (a + 2*b)/3.
 * Reads one row beyond the block.
 *
 * @param dst    destination, advanced by stride after each row
 * @param src    source, advanced by stride after each row
 * @param stride row stride shared by dst and src
 * @param width  number of columns written per row
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Two-dimensional blend of each pixel with its right, lower and
 * lower-right neighbours using weights 3:4:2:3.
 *
 * Stores dst[j] = (2731*(3*a + 4*b + 2*c + 3*d + 6)) >> 15 with
 * a=src[j], b=src[j+1], c=src[j+stride], d=src[j+stride+1];
 * 2731/32768 is approximately 1/12 (the weight sum).
 * Reads one column and one row beyond the block.
 *
 * @param dst    destination, advanced by stride after each row
 * @param src    source, advanced by stride after each row
 * @param stride row stride shared by dst and src
 * @param width  number of columns written per row
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Two-dimensional blend of each pixel with its right, lower and
 * lower-right neighbours using weights 2:3:3:4.
 *
 * Stores dst[j] = (2731*(2*a + 3*b + 3*c + 4*d + 6)) >> 15 with
 * a=src[j], b=src[j+1], c=src[j+stride], d=src[j+stride+1];
 * 2731/32768 is approximately 1/12 (the weight sum).
 * Reads one column and one row beyond the block.
 *
 * @param dst    destination, advanced by stride after each row
 * @param src    source, advanced by stride after each row
 * @param stride row stride shared by dst and src
 * @param width  number of columns written per row
 * @param height number of rows
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Zero-offset "avg": forwards to the fixed-width avg_pixelsN_c routine
 * matching width.
 *
 * @param dst    destination block (combined with src by the callee)
 * @param src    source block
 * @param stride row stride, passed through unchanged
 * @param width  block width; only 2, 4, 8 and 16 are handled, any other
 *               value leaves dst untouched
 * @param height number of rows, passed through unchanged
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: nothing is written */
    }
}
/**
 * Averaging variant of the horizontal 2:1 blend.
 *
 * Computes v = (683*(2*src[j] + src[j+1] + 1)) >> 11 (about (2*a + b)/3)
 * and stores dst[j] = (dst[j] + v + 1) >> 1, i.e. the rounded-up mean of
 * the previous destination value and v.
 * Reads one column beyond the block (src[width]) on each row.
 *
 * @param dst    destination (read and written), advanced by stride per row
 * @param src    source, advanced by stride per row
 * @param stride row stride shared by dst and src
 * @param width  number of columns processed per row
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the horizontal 1:2 blend.
 *
 * Computes v = (683*(src[j] + 2*src[j+1] + 1)) >> 11 (about (a + 2*b)/3)
 * and stores dst[j] = (dst[j] + v + 1) >> 1.
 * Reads one column beyond the block (src[width]) on each row.
 *
 * @param dst    destination (read and written), advanced by stride per row
 * @param src    source, advanced by stride per row
 * @param stride row stride shared by dst and src
 * @param width  number of columns processed per row
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the vertical 2:1 blend.
 *
 * Computes v = (683*(2*src[j] + src[j+stride] + 1)) >> 11
 * (about (2*a + b)/3) and stores dst[j] = (dst[j] + v + 1) >> 1.
 * Reads one row beyond the block.
 *
 * @param dst    destination (read and written), advanced by stride per row
 * @param src    source, advanced by stride per row
 * @param stride row stride shared by dst and src
 * @param width  number of columns processed per row
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the 4:3:3:2 two-dimensional blend.
 *
 * Computes v = (2731*(4*a + 3*b + 3*c + 2*d + 6)) >> 15 with a=src[j],
 * b=src[j+1], c=src[j+stride], d=src[j+stride+1] (2731/32768 is about
 * 1/12) and stores dst[j] = (dst[j] + v + 1) >> 1.
 * Reads one column and one row beyond the block.
 *
 * @param dst    destination (read and written), advanced by stride per row
 * @param src    source, advanced by stride per row
 * @param stride row stride shared by dst and src
 * @param width  number of columns processed per row
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the 3:2:4:3 two-dimensional blend.
 *
 * Computes v = (2731*(3*a + 2*b + 4*c + 3*d + 6)) >> 15 with a=src[j],
 * b=src[j+1], c=src[j+stride], d=src[j+stride+1] (2731/32768 is about
 * 1/12) and stores dst[j] = (dst[j] + v + 1) >> 1.
 * Reads one column and one row beyond the block.
 *
 * @param dst    destination (read and written), advanced by stride per row
 * @param src    source, advanced by stride per row
 * @param stride row stride shared by dst and src
 * @param width  number of columns processed per row
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the vertical 1:2 blend.
 *
 * Computes v = (683*(src[j] + 2*src[j+stride] + 1)) >> 11
 * (about (a + 2*b)/3) and stores dst[j] = (dst[j] + v + 1) >> 1.
 * Reads one row beyond the block.
 *
 * @param dst    destination (read and written), advanced by stride per row
 * @param src    source, advanced by stride per row
 * @param stride row stride shared by dst and src
 * @param width  number of columns processed per row
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the 3:4:2:3 two-dimensional blend.
 *
 * Computes v = (2731*(3*a + 4*b + 2*c + 3*d + 6)) >> 15 with a=src[j],
 * b=src[j+1], c=src[j+stride], d=src[j+stride+1] (2731/32768 is about
 * 1/12) and stores dst[j] = (dst[j] + v + 1) >> 1.
 * Reads one column and one row beyond the block.
 *
 * @param dst    destination (read and written), advanced by stride per row
 * @param src    source, advanced by stride per row
 * @param stride row stride shared by dst and src
 * @param width  number of columns processed per row
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the 2:3:3:4 two-dimensional blend.
 *
 * Computes v = (2731*(2*a + 3*b + 3*c + 4*d + 6)) >> 15 with a=src[j],
 * b=src[j+1], c=src[j+stride], d=src[j+stride+1] (2731/32768 is about
 * 1/12) and stores dst[j] = (dst[j] + v + 1) >> 1.
 * Reads one column and one row beyond the block.
 *
 * @param dst    destination (read and written), advanced by stride per row
 * @param src    source, advanced by stride per row
 * @param stride row stride shared by dst and src
 * @param width  number of columns processed per row
 * @param height number of rows
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Generates nine fixed-width wrappers, put_tpel_pixels<width>_mcXY_c,
 * each forwarding to the generic put_tpel_pixels_mcXY_c with the block
 * width baked in as a constant.
 *
 * Each wrapper body is a plain call statement; a leading "void" would
 * turn it into an ill-formed declaration instead of a call.
 *
 * @param width literal block width pasted into the function names and
 *              passed as the width argument
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/**
 * Generates OPNAME##h264_chroma_mc{2,4,8}_c: bilinear interpolation of a
 * 2-, 4- or 8-pixel-wide block over h rows.
 *
 * For fractional offsets x,y in 0..7 (asserted) the four weights are
 *   A=(8-x)*(8-y)  B=x*(8-y)  C=(8-x)*y  D=x*y      (A+B+C+D == 64)
 * and every output pixel receives, through OP,
 *   A*src[k] + B*src[k+1] + C*src[stride+k] + D*src[stride+k+1].
 * The sum is NOT normalised here; OP is responsible for the >>6.
 * One column to the right of and one row below the block are read even
 * when their weight is zero.
 *
 * @param OPNAME name prefix of the generated functions
 * @param OP     store macro invoked as OP(destination_lvalue, weighted_sum)
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/* Store operators for H264_CHROMA_MC: b is the 6-bit-scaled weighted sum.
 * op_put rounds and normalises it ((b+32)>>6); op_avg additionally takes
 * the rounded-up mean with the value already in a. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
/* These names are defined again, with different bodies, for the qpel
 * code further down; drop them here so that is not a conflicting
 * redefinition. */
#undef op_avg
#undef op_put
/**
 * Copies a block 2 bytes wide and h rows high.
 *
 * Each row is moved with one ST16(dst, LD16(src)); LD16/ST16 are defined
 * outside this file (presumably 16-bit load/store helpers).
 *
 * @param dst       destination, advanced by dstStride per row
 * @param src       source, advanced by srcStride per row
 * @param dstStride destination row stride
 * @param srcStride source row stride
 * @param h         number of rows to copy
 */
static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST16(dst   , LD16(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copies a block 4 bytes wide and h rows high.
 *
 * Each row is moved with one ST32(dst, LD32(src)); LD32/ST32 are defined
 * outside this file (presumably 32-bit load/store helpers).
 *
 * @param dst       destination, advanced by dstStride per row
 * @param src       source, advanced by srcStride per row
 * @param dstStride destination row stride
 * @param srcStride source row stride
 * @param h         number of rows to copy
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copies a block 8 bytes wide and h rows high.
 *
 * Each row is moved as two 4-byte ST32/LD32 transfers; LD32/ST32 are
 * defined outside this file (presumably 32-bit load/store helpers).
 *
 * @param dst       destination, advanced by dstStride per row
 * @param src       source, advanced by srcStride per row
 * @param dstStride destination row stride
 * @param srcStride source row stride
 * @param h         number of rows to copy
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copies a block 16 bytes wide and h rows high.
 *
 * Each row is moved as four 4-byte ST32/LD32 transfers; LD32/ST32 are
 * defined outside this file (presumably 32-bit load/store helpers).
 *
 * @param dst       destination, advanced by dstStride per row
 * @param src       source, advanced by srcStride per row
 * @param dstStride destination row stride
 * @param srcStride source row stride
 * @param h         number of rows to copy
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copies a block 17 bytes wide and h rows high.
 *
 * Each row is moved as four 4-byte ST32/LD32 transfers plus one single
 * byte for column 16. The 17th column matters: the 16-wide horizontal
 * lowpass filter reads src[16] of every row it is given.
 * LD32/ST32 are defined outside this file (presumably 32-bit load/store
 * helpers).
 *
 * @param dst       destination, advanced by dstStride per row
 * @param src       source, advanced by srcStride per row
 * @param dstStride destination row stride
 * @param srcStride source row stride
 * @param h         number of rows to copy
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16]; /* odd trailing column */
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copies a block 9 bytes wide and h rows high.
 *
 * Each row is moved as two 4-byte ST32/LD32 transfers plus one single
 * byte for column 8. The 9th column matters: the 8-wide horizontal
 * lowpass filter reads src[8] of every row it is given.
 * LD32/ST32 are defined outside this file (presumably 32-bit load/store
 * helpers).
 *
 * @param dst       destination, advanced by dstStride per row
 * @param src       source, advanced by srcStride per row
 * @param dstStride destination row stride
 * @param srcStride source row stride
 * @param h         number of rows to copy
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8]; /* odd trailing column */
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Generates one family of quarter-pel motion compensation functions.
 *
 * Four lowpass primitives are produced first:
 *   OPNAME##mpeg4_qpel{8,16}_h_lowpass  - filters along a row, h rows
 *   OPNAME##mpeg4_qpel{8,16}_v_lowpass  - filters along a column, 8 or 16 columns
 * Both use symmetric tap pairs weighted 20, -6, 3, -1; near the block
 * edge the indices fold back (e.g. src[8] is reused instead of src[9]).
 * They read 9 (resp. 17) input samples per filtered line, which is why
 * the callers below stage the source with copy_block9 / copy_block17.
 * The unnormalised filter sum is handed to OP, which is expected to
 * scale and clip it through the local table pointer `cm`.
 *
 * On top of those, OPNAME##qpel{8,16}_mcXY_c are generated for X,Y in
 * 0..3 (presumably the horizontal/vertical quarter-sample offsets), plus
 * non-static ff_##OPNAME##qpel{8,16}_mcXY_old_c variants that combine four
 * planes with pixelsN_l4 instead of two with pixelsN_l2.
 *
 * Scratch buffers: half/halfV/halfHV hold NxN samples (64 or 256);
 * halfH holds N x (N+1) samples (72 or 272) because the horizontal pass
 * is run over N+1 rows; full holds the staged source with a row pitch of
 * 16 (8-wide) or 24 (16-wide).
 *
 * @param r      not referenced by the macro body
 * @param OPNAME prefix of the generated function names and of the
 *               pixelsN_c / pixelsN_l2 / pixelsN_l4 helpers used for the
 *               final store
 * @param RND    infix used to select the put...lowpass / put...pixelsN_l2
 *               helpers for intermediate planes (put ## RND ## ...)
 * @param OP     store macro invoked as OP(destination_lvalue, filter_sum)
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    const int w=16;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}
2050 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2051 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2052 #define op_put(a, b) a = cm[((b) + 16)>>5]
2053 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2055 QPEL_MC(0, put_ , _ , op_put)
2056 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2057 QPEL_MC(0, avg_ , _ , op_avg)
2058 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2060 #undef op_avg_no_rnd
2062 #undef op_put_no_rnd
/**
 * Generates the 6-tap (1,-5,20,20,-5,1) lowpass filters for 2, 4, 8 and 16
 * pixel blocks.
 *
 * For every size three functions are produced:
 *   - OPNAME h264_qpelN_h_lowpass:  filters along a row, one output row per step
 *   - OPNAME h264_qpelN_v_lowpass:  filters along a column, one output column per step
 *   - OPNAME h264_qpelN_hv_lowpass: horizontal pass into the int16_t scratch
 *     buffer tmp (N+5 rows, starting 2 rows above src), then vertical pass
 *     over tmp.
 *
 * @param OPNAME prefix of the generated function names
 * @param OP     statement macro OP(dst, sum) used after a single pass
 * @param OP2    statement macro OP2(dst, sum) used after both passes
 *
 * OP/OP2 receive the raw, unnormalised filter sum.  Every generated body
 * declares `cm` for them.  The h filters read src[-2 .. N+2] on each row, the
 * v filters read rows -2 .. N+2.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    /* rewind to the third scratch row so that tmp[-2*tmpStride] is row 0 */\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
/* The 16-pixel variants tile four 8-pixel calls: left/right halves, then the lower 8 rows. */\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    /* tmp is plain scratch and is reused for the lower half */\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}
/**
 * Generates the 16 quarter-pel motion compensation functions
 * OPNAME h264_qpelSIZE_mcXY_c, where X and Y (0..3) are the horizontal and
 * vertical quarter-sample offsets.
 *
 * @param OPNAME store prefix (the final write uses OPNAME; intermediate
 *               planes are always built with the put_ filters)
 * @param SIZE   block width and height in pixels
 *
 * Half-sample planes come from the put_h264_qpelSIZE_{h,v,hv}_lowpass
 * filters; quarter-sample positions are formed by OPNAME pixelsSIZE_l2 from
 * two neighbouring planes.  For vertical filtering the source is first copied
 * into `full` (SIZE+5 rows starting two rows above the block); `full_mid`
 * points at the row matching src.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
2466 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2467 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2468 #define op_put(a, b) a = cm[((b) + 16)>>5]
2469 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2470 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2472 H264_LOWPASS(put_ , op_put, op2_put)
2473 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* Per-pixel helpers for H264_WEIGHT; they rely on the locals of the generated
 * functions (block/dst/src, weight(s), offset, log2_denom). */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/**
 * Generates weight_h264_pixelsWxH_c (in-place explicit weighting of one
 * block) and biweight_h264_pixelsWxH_c (weighted blend of src into dst).
 *
 * W must be 2, 4, 8 or 16: each row is unrolled up to 16 pixels and the
 * `if(W==n) continue;` tests cut the row short for narrower blocks.
 */
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    /* fold the rounding term of the final shift into the offset */ \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}
2558 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2559 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2563 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2564 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2565 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2566 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2567 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2568 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2569 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2570 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2576 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2577 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2581 const int src_1= src[ -srcStride];
2582 const int src0 = src[0 ];
2583 const int src1 = src[ srcStride];
2584 const int src2 = src[2*srcStride];
2585 const int src3 = src[3*srcStride];
2586 const int src4 = src[4*srcStride];
2587 const int src5 = src[5*srcStride];
2588 const int src6 = src[6*srcStride];
2589 const int src7 = src[7*srcStride];
2590 const int src8 = src[8*srcStride];
2591 const int src9 = src[9*srcStride];
2592 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2593 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2594 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2595 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2596 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2597 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2598 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2599 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/** Full-sample position: plain 8x8 copy from src to dst. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
/** Horizontal quarter position: combines src with the horizontal half-sample plane via put_pixels8_l2. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64]; /* 8x8 half-sample plane, stride 8 */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
/** Horizontal half position: the filtered plane is written straight to dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/** Horizontal three-quarter position: combines src+1 with the horizontal half-sample plane via put_pixels8_l2. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64]; /* 8x8 half-sample plane, stride 8 */
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
/** Vertical half position: the vertically filtered plane is written straight to dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/**
 * Combines the vertical half-sample plane with the horizontal+vertical
 * half-sample plane via put_pixels8_l2.
 * halfH holds 11 horizontally filtered rows starting one row above the block,
 * so halfH+8 is the row aligned with src.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows x 8 */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/**
 * Same as put_mspel8_mc12_c, but the vertical half-sample plane is taken one
 * pixel to the right (src+1).
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows x 8 */
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/** Centre position: horizontal filter into halfH (11 rows), then vertical filter into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];  /* 11 rows x 8 */
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2653 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2655 const int strength= ff_h263_loop_filter_strength[qscale];
2659 int p0= src[x-2*stride];
2660 int p1= src[x-1*stride];
2661 int p2= src[x+0*stride];
2662 int p3= src[x+1*stride];
2663 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2665 if (d<-2*strength) d1= 0;
2666 else if(d<- strength) d1=-2*strength - d;
2667 else if(d< strength) d1= d;
2668 else if(d< 2*strength) d1= 2*strength - d;
2673 if(p1&256) p1= ~(p1>>31);
2674 if(p2&256) p2= ~(p2>>31);
2676 src[x-1*stride] = p1;
2677 src[x+0*stride] = p2;
2681 d2= clip((p0-p3)/4, -ad1, ad1);
2683 src[x-2*stride] = p0 - d2;
2684 src[x+ stride] = p3 + d2;
2688 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2690 const int strength= ff_h263_loop_filter_strength[qscale];
2694 int p0= src[y*stride-2];
2695 int p1= src[y*stride-1];
2696 int p2= src[y*stride+0];
2697 int p3= src[y*stride+1];
2698 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2700 if (d<-2*strength) d1= 0;
2701 else if(d<- strength) d1=-2*strength - d;
2702 else if(d< strength) d1= d;
2703 else if(d< 2*strength) d1= 2*strength - d;
2708 if(p1&256) p1= ~(p1>>31);
2709 if(p2&256) p2= ~(p2>>31);
2711 src[y*stride-1] = p1;
2712 src[y*stride+0] = p2;
2716 d2= clip((p0-p3)/4, -ad1, ad1);
2718 src[y*stride-2] = p0 - d2;
2719 src[y*stride+1] = p3 + d2;
/**
 * In-place separable (1,2,1)/4 smoothing of an 8x8 block.
 * The vertical pass leaves the top and bottom rows unfiltered (scaled by 4 to
 * match), the horizontal pass leaves the left and right columns unfiltered,
 * so the four corners keep their value.
 * @param src    top-left pixel of the 8x8 block
 * @param stride distance between rows of src
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64]; /* vertically filtered block, 4x scaled, row length 8 */

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/**
 * Normal-strength luma deblocking of one 16-sample edge, processed as four
 * groups of four lines.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge (towards q1, q2; negative towards p)
 * @param ystride step along the edge to the next line
 * @param alpha   threshold on |p0-q0|
 * @param beta    threshold on |p1-p0|, |q1-q0|, |p2-p0| and |q2-q0|
 * @param tc0     clipping value for each group of four lines; a negative
 *                entry leaves that group unfiltered
 */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* each side that is also smoothed widens the clip range by one */
                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Luma filter with a step of `stride` across the edge and 1 along it. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/** Luma filter with a step of 1 across the edge and `stride` along it. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * Normal-strength chroma deblocking of one 8-sample edge, processed as four
 * groups of two lines.  Only p0 and q0 are modified.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge
 * @param ystride step along the edge to the next line
 * @param alpha   threshold on |p0-q0|
 * @param beta    threshold on |p1-p0| and |q1-q0|
 * @param tc0     clipping value per group; values <= 0 leave the group
 *                untouched (a clip range of 0 cannot change a sample)
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/** Chroma filter with a step of `stride` across the edge and 1 along it. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/** Chroma filter with a step of 1 across the edge and `stride` along it. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * Strong (intra) chroma deblocking of one 8-sample edge: where the
 * alpha/beta tests pass, p0 and q0 are replaced by fixed 3-tap averages.
 * @param pix     first sample on the q side of the edge
 * @param xstride step across the edge
 * @param ystride step along the edge to the next line
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( ABS( p0 - q0 ) < alpha &&
            ABS( p1 - p0 ) < beta &&
            ABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/** Intra chroma filter with a step of `stride` across the edge and 1 along it. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/** Intra chroma filter with a step of 1 across the edge and `stride` along it. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
/**
 * Sum of absolute differences between two 16-pixel-wide blocks.
 * @param v         unused
 * @param line_size distance between rows, shared by pix1 and pix2
 * @param h         number of rows
 * @return the accumulated sum
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * Sum of absolute differences between pix1 and pix2 interpolated half a pixel
 * to the right: each reference sample is avg2() of two horizontal neighbours
 * (avg2 is defined elsewhere in this file).  Reads pix2[0..16] on every row.
 * @param v unused
 * @param h number of rows
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * Sum of absolute differences between pix1 and pix2 interpolated half a pixel
 * downwards: each reference sample is avg2() of a pixel and the one below it
 * (avg2 is defined elsewhere in this file).  Reads h+1 rows of pix2.
 * @param v unused
 * @param h number of rows
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * Sum of absolute differences between pix1 and pix2 interpolated half a pixel
 * right and down: each reference sample is avg4() of a 2x2 neighbourhood
 * (avg4 is defined elsewhere in this file).  Reads 17 columns and h+1 rows of
 * pix2.
 * @param v unused
 * @param h number of rows
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * Sum of absolute differences between two 8-pixel-wide blocks.
 * @param v         unused
 * @param line_size distance between rows, shared by pix1 and pix2
 * @param h         number of rows
 * @return the accumulated sum
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * 8-pixel-wide sum of absolute differences against pix2 interpolated half a
 * pixel to the right with avg2() (defined elsewhere in this file).
 * Reads pix2[0..8] on every row.
 * @param v unused
 * @param h number of rows
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * 8-pixel-wide sum of absolute differences against pix2 interpolated half a
 * pixel downwards with avg2() (defined elsewhere in this file).
 * Reads h+1 rows of pix2.
 * @param v unused
 * @param h number of rows
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * 8-pixel-wide sum of absolute differences against pix2 interpolated half a
 * pixel right and down with avg4() (defined elsewhere in this file).
 * Reads 9 columns and h+1 rows of pix2.
 * @param v unused
 * @param h number of rows
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3064 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3065 MpegEncContext *c = v;
3071 for(x=0; x<16; x++){
3072 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3075 for(x=0; x<15; x++){
3076 score2+= ABS( s1[x ] - s1[x +stride]
3077 - s1[x+1] + s1[x+1+stride])
3078 -ABS( s2[x ] - s2[x +stride]
3079 - s2[x+1] + s2[x+1+stride]);
3086 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3087 else return score1 + ABS(score2)*8;
/*
 * Combined squared-error / local-gradient comparison of s1 and s2, using the
 * same per-sample terms as nsse16_c:
 *  - score1 collects (s1[x] - s2[x])^2,
 *  - score2 collects the difference between the two inputs' 2x2 gradient
 *    magnitudes ABS(p[x] - p[x+stride] - p[x+1] + p[x+1+stride]).
 * Returns score1 + ABS(score2) * weight, with weight taken from
 * c->avctx->nsse_weight when v is non-NULL and 8 otherwise.
 * NOTE(review): the column range is not established by these lines; the name
 * suggests 8 columns -- confirm.
 */
3090 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
/* v is an optional MpegEncContext; it is only consulted for the weight */
3091 MpegEncContext *c = v;
3098 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3102 score2+= ABS( s1[x ] - s1[x +stride]
3103 - s1[x+1] + s1[x+1+stride])
3104 -ABS( s2[x ] - s2[x +stride]
3105 - s2[x+1] + s2[x+1+stride]);
3112 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3113 else return score1 + ABS(score2)*8;
/*
 * Scores the residual that would result from adding `basis`, multiplied by
 * `scale`, to `rem`. For each of the 64 coefficients the candidate value b is
 * rem[i] plus the scaled basis term, rounded and shifted right by
 * (BASIS_SHIFT - RECON_SHIFT); the weighted square (w*b)*(w*b) >> 4 is
 * accumulated into sum. rem is only read by the statements in this block.
 * NOTE(review): w is presumably weight[i]; its definition is not established
 * here -- confirm.
 */
3116 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3120 for(i=0; i<8*8; i++){
/* 1<<(shift-1) is the round-to-nearest offset for the right shift that follows */
3121 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
/* sanity check: the candidate residual must stay strictly inside (-512, 512) */
3124 assert(-512<b && b<512);
3126 sum += (w*b)*(w*b)>>4;
/*
 * Adds `basis`, multiplied by `scale`, to the residual `rem` in place.
 * Each of the 64 products is rounded to nearest and shifted right by
 * (BASIS_SHIFT - RECON_SHIFT) before being added, the same term that
 * try_8x8basis_c evaluates.
 */
3131 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3134 for(i=0; i<8*8; i++){
3135 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3140 * Permutes an 8x8 block.
3141 * @param block the block which will be permuted according to the given permutation vector
3142 * @param permutation the permutation vector
3143 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
3144 * @param scantable the scantable in use; it is only used to speed the permutation up, the block is not
3145 * (inverse) permuted to scantable order!
/*
 * Works in two passes over the first last+1 scan positions, each visiting the
 * coefficient index j = scantable[i]. The second pass writes the saved
 * coefficient temp[j] to its permuted position block[permutation[j]].
 * NOTE(review): the first pass presumably saves block[j] into temp and clears
 * it; that body is not established here -- confirm.
 */
3147 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3153 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
/* pass 1: visit every coefficient position up to and including `last` */
3155 for(i=0; i<=last; i++){
3156 const int j= scantable[i];
/* pass 2: scatter the saved coefficients to their permuted positions */
3161 for(i=0; i<=last; i++){
3162 const int j= scantable[i];
3163 const int perm_j= permutation[j];
3164 block[perm_j]= temp[j];
/*
 * Comparison callback taking the (context, a, b, stride, h) arguments used by
 * the other comparison functions in this file.
 * NOTE(review): the body is not established here; the name suggests it always
 * reports a score of 0 -- confirm.
 */
3168 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/*
 * Fills the comparison-function table `cmp` from the DSPContext `c` according
 * to `type`. The table is cleared first, then entry i is copied from the
 * DSPContext array that matches the requested type (hadamard8_diff, dct_sad,
 * dct264_sad, dct_max, quant_psnr, ...). An unrecognised type is reported
 * through av_log.
 */
3172 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
/*
 * Clears five table entries.
 * NOTE(review): the size is computed with sizeof(void*) although the elements
 * are function pointers (me_cmp_func); sizeof(*cmp) would not rely on both
 * pointer kinds having the same size.
 */
3175 memset(cmp, 0, sizeof(void*)*5);
3183 cmp[i]= c->hadamard8_diff[i];
3189 cmp[i]= c->dct_sad[i];
3192 cmp[i]= c->dct264_sad[i];
3195 cmp[i]= c->dct_max[i];
3198 cmp[i]= c->quant_psnr[i];
/* no context is available here, so the error is logged with a NULL context */
3225 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3231 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/*
 * Zeroes six consecutive 64-coefficient blocks (6*64 DCTELEMs) starting at
 * `blocks`.
 */
3233 static void clear_blocks_c(DCTELEM *blocks)
3235 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/*
 * Adds src to dst element-wise (dst[i] += src[i]). The elements are uint8_t,
 * so each sum wraps modulo 256.
 * The main loop is unrolled to process eight bytes per iteration and runs
 * while a full group of eight remains (i+7 < w).
 */
3238 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3240 for(i=0; i+7<w; i+=8){
3241 dst[i+0] += src[i+0];
3242 dst[i+1] += src[i+1];
3243 dst[i+2] += src[i+2];
3244 dst[i+3] += src[i+3];
3245 dst[i+4] += src[i+4];
3246 dst[i+5] += src[i+5];
3247 dst[i+6] += src[i+6];
3248 dst[i+7] += src[i+7];
/* single-byte form; NOTE(review): presumably the tail for the w % 8 leftover bytes -- confirm */
3251 dst[i+0] += src[i+0];
/*
 * Stores the element-wise difference dst[i] = src1[i] - src2[i]. The result
 * is stored as uint8_t, so each difference wraps modulo 256.
 * The main loop is unrolled to process eight bytes per iteration and runs
 * while a full group of eight remains (i+7 < w).
 */
3254 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3256 for(i=0; i+7<w; i+=8){
3257 dst[i+0] = src1[i+0]-src2[i+0];
3258 dst[i+1] = src1[i+1]-src2[i+1];
3259 dst[i+2] = src1[i+2]-src2[i+2];
3260 dst[i+3] = src1[i+3]-src2[i+3];
3261 dst[i+4] = src1[i+4]-src2[i+4];
3262 dst[i+5] = src1[i+5]-src2[i+5];
3263 dst[i+6] = src1[i+6]-src2[i+6];
3264 dst[i+7] = src1[i+7]-src2[i+7];
/* single-byte form; NOTE(review): presumably the tail for the w % 8 leftover bytes -- confirm */
3267 dst[i+0] = src1[i+0]-src2[i+0];
/*
 * Median-prediction helper. For element i the predictor is mid_pred() of
 * three candidates: l, src1[i] and the gradient l + src1[i] - lt reduced to
 * eight bits with & 0xFF.
 * NOTE(review): l and lt are presumably the running left / top-left values
 * carried in through *left and *left_top, and mid_pred() presumably returns
 * the median of its arguments -- neither is established here, confirm.
 */
3270 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3278 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/*
 * Butterfly helper macros used by the hadamard8_* functions.
 *  - BUTTERFLY2(o1,o2,i1,i2) and BUTTERFLY1(x,y) are multi-line macros (note
 *    the trailing backslashes). NOTE(review): their replacement text is not
 *    established here; presumably sum/difference butterflies -- confirm.
 *  - BUTTERFLYA(x,y) yields ABS(x+y) + ABS(x-y). Both arguments are expanded
 *    twice, so they must be free of side effects.
 * Do not insert comments between these lines: each #define line ends in a
 * continuation backslash.
 */
3288 #define BUTTERFLY2(o1,o2,i1,i2) \
3292 #define BUTTERFLY1(x,y) \
3301 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/*
 * Transform-domain difference metric for one 8x8 block of src versus dst.
 * Row pass (row i): BUTTERFLY2 seeds temp[8*i .. 8*i+7] from the pixel
 * differences src - dst of neighbouring column pairs, then BUTTERFLY1 is
 * applied to elements 2 apart and to elements 4 apart.
 * Column pass (column i): BUTTERFLY1 is applied to rows 1 apart and rows
 * 2 apart; BUTTERFLYA on rows 4 apart supplies the terms that are added up.
 * NOTE(review): presumably an 8x8 Hadamard transform followed by a sum of
 * absolute values, as the name says; the BUTTERFLY1/2 bodies are not
 * established here -- confirm.
 */
3303 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3311 //FIXME try pointer walks
/* row pass, distance 1: seed temp from the src - dst differences */
3312 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3313 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3314 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3315 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
/* row pass, distance 2 */
3317 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3318 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3319 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3320 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
/* row pass, distance 4 */
3322 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3323 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3324 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3325 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* column pass, rows 1 apart */
3329 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3330 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3331 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3332 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
/* column pass, rows 2 apart */
3334 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3335 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3336 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3337 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* column pass, rows 4 apart: ABS(x+y) + ABS(x-y) of each pair is accumulated */
3340 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3341 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3342 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3343 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* NOTE(review): writes to stdout, unlike the av_log reporting used elsewhere in this file; confirm this is debug-only / compiled out */
3349 printf("MAX:%d\n", maxi);
/*
 * Transform-domain metric for one 8x8 block of src on its own (no reference
 * block is subtracted; `dummy` is not used by the statements in this block).
 * Row pass (row i): BUTTERFLY2 seeds temp[8*i .. 8*i+7] from neighbouring
 * column pairs of src, then BUTTERFLY1 is applied to elements 2 apart and to
 * elements 4 apart.
 * Column pass (column i): BUTTERFLY1 is applied to rows 1 apart and rows
 * 2 apart; BUTTERFLYA on rows 4 apart supplies the terms that are added up.
 * Finally ABS(temp[0] + temp[32]) is subtracted from the sum ("-mean").
 * NOTE(review): presumably an 8x8 Hadamard transform with the DC term
 * removed; the BUTTERFLY1/2 bodies are not established here -- confirm.
 */
3355 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3363 //FIXME try pointer walks
/* row pass, distance 1: seed temp directly from the source pixels */
3364 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3365 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3366 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3367 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
/* row pass, distance 2 */
3369 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3370 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3371 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3372 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
/* row pass, distance 4 */
3374 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3375 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3376 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3377 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* column pass, rows 1 apart */
3381 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3382 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3383 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3384 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
/* column pass, rows 2 apart */
3386 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3387 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3388 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3389 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* column pass, rows 4 apart: ABS(x+y) + ABS(x-y) of each pair is accumulated */
3392 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3393 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3394 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3395 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* remove the contribution formed from the first-column entries of rows 0 and 4 */
3398 sum -= ABS(temp[8*0] + temp[8*4]); // -mean
/*
 * Comparison function working on the difference of src1 and src2.
 * The difference block is written into `temp` by s->dsp.diff_pixels().
 * NOTE(review): presumably followed by a forward DCT and a sum of absolute
 * coefficients, as the name says; those steps are not established here.
 */
3403 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3404 MpegEncContext * const s= (MpegEncContext *)c;
/* uint64_t storage with room for exactly 64 DCTELEMs (sizeof(DCTELEM)*64 bytes), declared __align8 */
3405 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3406 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3411 s->dsp.diff_pixels(temp, src1, src2, stride);
/*
 * Line-continued macro body: one eight-input, add/shift-only 1-D transform
 * stage written in terms of two accessors that the user defines before
 * expanding it: SRC(n) reads input n, DST(n, v) emits output n.
 *  - sNM / dNM are the sum / difference of inputs N and M,
 *  - a0..a3 combine the sums, a4..a7 combine the differences,
 *  - the DST() lines form the outputs from pairs of a-terms.
 * No comment may be placed between the lines below: each one ends in a
 * continuation backslash.
 */
3422 const int s07 = SRC(0) + SRC(7);\
3423 const int s16 = SRC(1) + SRC(6);\
3424 const int s25 = SRC(2) + SRC(5);\
3425 const int s34 = SRC(3) + SRC(4);\
3426 const int a0 = s07 + s34;\
3427 const int a1 = s16 + s25;\
3428 const int a2 = s07 - s34;\
3429 const int a3 = s16 - s25;\
3430 const int d07 = SRC(0) - SRC(7);\
3431 const int d16 = SRC(1) - SRC(6);\
3432 const int d25 = SRC(2) - SRC(5);\
3433 const int d34 = SRC(3) - SRC(4);\
3434 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3435 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3436 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3437 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3439 DST(1, a4 + (a7>>2)) ;\
3440 DST(2, a2 + (a3>>1)) ;\
3441 DST(3, a5 + (a6>>2)) ;\
3443 DST(5, a6 - (a5>>2)) ;\
3444 DST(6, (a2>>1) - a3 ) ;\
3445 DST(7, (a4>>2) - a7 ) ;\
/*
 * Comparison function working on the difference of src1 and src2, stored in
 * the two-dimensional array `dct` by s->dsp.diff_pixels().
 * Two passes over i = 0..7 follow, each with its own SRC/DST accessors:
 *  - pass 1 reads dct[i][x] and writes the result back to dct[i][x];
 *  - pass 2 reads dct[x][i] (the other dimension) and, instead of storing,
 *    adds ABS() of every output to sum.
 * NOTE(review): the statement executed inside each loop is presumably the
 * 1-D transform macro defined just above -- confirm.
 */
3448 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3449 MpegEncContext * const s= (MpegEncContext *)c;
3454 s->dsp.diff_pixels(dct, src1, src2, stride);
/* pass 1: transform along the second index, in place */
3456 #define SRC(x) dct[i][x]
3457 #define DST(x,v) dct[i][x]= v
3458 for( i = 0; i < 8; i++ )
/* pass 2: transform along the first index, accumulating absolute outputs */
3463 #define SRC(x) dct[x][i]
3464 #define DST(x,v) sum += ABS(v)
3465 for( i = 0; i < 8; i++ )
/*
 * Comparison function working on the difference of src1 and src2.
 * The difference block is written into `temp` by s->dsp.diff_pixels(); the
 * result is built as a running maximum of ABS(temp[i]).
 * NOTE(review): presumably a forward DCT is applied to temp before the
 * maximum is taken, as the name says; that step is not established here.
 */
3473 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3474 MpegEncContext * const s= (MpegEncContext *)c;
/* uint64_t storage with room for exactly 64 DCTELEMs (sizeof(DCTELEM)*64 bytes), declared __align8 */
3475 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3476 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3481 s->dsp.diff_pixels(temp, src1, src2, stride);
/* keep the largest absolute value seen so far */
3485 sum= FFMAX(sum, ABS(temp[i]));
/*
 * Local forward declaration of simple_idct(), used by quant_psnr8x8_c below.
 * NOTE(review): "simple_idct.h" is already included at the top of this file;
 * this declaration is presumably redundant -- confirm and drop it.
 */
3490 void simple_idct(DCTELEM *block); //FIXME
/*
 * Measures the squared error introduced by quantising the difference block
 * of src1 and src2:
 *  1. diff_pixels() writes the difference into temp, which is copied to bak;
 *  2. temp goes through s->fast_dct_quantize(), s->dct_unquantize_inter() and
 *     simple_idct();
 *  3. the squared differences (temp[i] - bak[i])^2 are summed.
 * Side effect: s->block_last_index[0] is overwritten with the quantiser's
 * return value.
 */
3492 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3493 MpegEncContext * const s= (MpegEncContext *)c;
/* one __align8 buffer holding two 64-coefficient blocks: temp (first half) and bak (second half) */
3494 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3495 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3496 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3502 s->dsp.diff_pixels(temp, src1, src2, stride);
/* keep the unquantised difference for the error computation */
3504 memcpy(bak, temp, 64*sizeof(DCTELEM));
/* block index 0 is hard-coded (see the FIXMEs); &i receives the quantiser's output argument */
3506 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3507 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3508 simple_idct(temp); //FIXME
3511 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/*
 * Rate-distortion style score for coding the difference of src1 and src2:
 *  1. src2 rows are copied into the scratch buffer bak;
 *  2. the difference block is quantised with s->fast_dct_quantize();
 *  3. the bit cost of the quantised coefficients is accumulated in `bits`
 *     from the intra or inter VLC length tables (esc_length is available for
 *     levels outside the table range);
 *  4. the block is unquantised and added back onto bak with idct_add();
 *  5. the distortion is the s->dsp.sse[1] score between bak and src1.
 * Returns distortion + ((bits * qscale^2 * 109 + 64) >> 7).
 * Side effect: s->block_last_index[0] is overwritten.
 */
3516 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3517 MpegEncContext * const s= (MpegEncContext *)c;
3518 const uint8_t *scantable= s->intra_scantable.permutated;
3519 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
/*
 * Variable-length scratch array of `stride` 64-bit words (8*stride bytes),
 * addressed below as rows at bak + i*stride.
 * NOTE(review): sized by the caller-supplied stride; assumes stride is
 * positive and small enough for the stack -- confirm.
 */
3520 uint64_t __align8 aligned_bak[stride];
3521 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3522 uint8_t * const bak= (uint8_t*)aligned_bak;
3523 int i, last, run, bits, level, distoration, start_i;
3524 const int esc_length= s->ac_esc_length;
3526 uint8_t * last_length;
/*
 * Copy eight bytes of row i from src2 to bak as two 32-bit words.
 * NOTE(review): the uint32_t casts assume src2 + i*stride is suitably
 * aligned and access uint8_t data through another type; memcpy would avoid
 * both assumptions.
 */
3531 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3532 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3535 s->dsp.diff_pixels(temp, src1, src2, stride);
/* block index 0 is hard-coded (see the FIXMEs); `last` is the quantiser's return value */
3537 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra case: intra AC length tables plus the luma DC length for temp[0] */
3543 length = s->intra_ac_vlc_length;
3544 last_length= s->intra_ac_vlc_last_length;
3545 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter case: inter AC length tables */
3548 length = s->inter_ac_vlc_length;
3549 last_length= s->inter_ac_vlc_last_length;
/* coefficients before the last one, visited in scan order */
3554 for(i=start_i; i<last; i++){
3555 int j= scantable[i];
/* level fits the 7-bit table index range: use the table length */
3560 if((level&(~127)) == 0){
3561 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* final coefficient: offset by 64 and costed with the "last" table */
3570 level= temp[i] + 64;
3574 if((level&(~127)) == 0){
3575 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* reconstruct: unquantise with the intra or the inter routine */
3583 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3585 s->dct_unquantize_inter(s, temp, 0, s->qscale);
/* add the reconstructed residual onto the saved src2 rows */
3588 s->dsp.idct_add(bak, stride, temp);
3590 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
/* bits weighted by qscale^2 * 109 / 128, rounded */
3592 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/*
 * Bit-cost estimate for coding the difference of src1 and src2:
 *  1. diff_pixels() writes the difference block into temp;
 *  2. the block is quantised with s->fast_dct_quantize();
 *  3. the cost of the quantised coefficients is accumulated in `bits` from
 *     the intra or inter VLC length tables (esc_length is available for
 *     levels outside the table range).
 * Side effect: s->block_last_index[0] is overwritten.
 * NOTE(review): presumably returns `bits`; the return statement is not
 * established here -- confirm.
 */
3595 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3596 MpegEncContext * const s= (MpegEncContext *)c;
3597 const uint8_t *scantable= s->intra_scantable.permutated;
3598 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3599 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3600 int i, last, run, bits, level, start_i;
3601 const int esc_length= s->ac_esc_length;
3603 uint8_t * last_length;
3607 s->dsp.diff_pixels(temp, src1, src2, stride);
/* block index 0 is hard-coded (see the FIXMEs); `last` is the quantiser's return value */
3609 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra case: intra AC length tables plus the luma DC length for temp[0] */
3615 length = s->intra_ac_vlc_length;
3616 last_length= s->intra_ac_vlc_last_length;
3617 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter case: inter AC length tables */
3620 length = s->inter_ac_vlc_length;
3621 last_length= s->inter_ac_vlc_last_length;
/* coefficients before the last one, visited in scan order */
3626 for(i=start_i; i<last; i++){
3627 int j= scantable[i];
/* level fits the 7-bit table index range: use the table length */
3632 if((level&(~127)) == 0){
3633 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* final coefficient: offset by 64 and costed with the "last" table */
3642 level= temp[i] + 64;
3646 if((level&(~127)) == 0){
3647 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/*
 * Vertical activity of a single 16-column block: adds to score the absolute
 * difference between each sample s[x] and the sample one stride further on,
 * s[x+stride]. The 16 columns are processed four per iteration.
 * `dummy` is not used by the statements in this block.
 */
3655 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3660 for(x=0; x<16; x+=4){
3661 score+= ABS(s[x ] - s[x +stride]) + ABS(s[x+1] - s[x+1+stride])
3662 +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
/*
 * Vertical activity of the difference between two 16-column blocks: for each
 * column x, adds to score the absolute value of
 * (s1[x] - s2[x]) - (s1[x+stride] - s2[x+stride]),
 * i.e. the change of the s1 - s2 difference from one line to the next.
 */
3670 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3675 for(x=0; x<16; x++){
3676 score+= ABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Square of a. The argument is expanded twice, so it must be free of side effects. */
3685 #define SQ(a) ((a)*(a))
/*
 * Squared counterpart of vsad_intra16_c: adds to score the squared
 * difference between each sample s[x] and the sample one stride further on,
 * s[x+stride]. The 16 columns are processed four per iteration.
 * `dummy` is not used by the statements in this block.
 */
3686 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3691 for(x=0; x<16; x+=4){
3692 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3693 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/*
 * Squared counterpart of vsad16_c: for each of the 16 columns, adds to score
 * the square of (s1[x] - s2[x]) - (s1[x+stride] - s2[x+stride]),
 * i.e. the change of the s1 - s2 difference from one line to the next.
 */
3701 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3706 for(x=0; x<16; x++){
3707 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/*
 * Instantiates the *16_c comparison functions (second argument) from their
 * 8x8 counterparts (first argument) through the WARPER8_16_SQ macro.
 * NOTE(review): the macro is defined outside this block; presumably it
 * applies the 8x8 function to each 8x8 quadrant of a 16x16 area and adds the
 * scores -- confirm at its definition.
 */
3716 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3717 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3718 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3720 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3722 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3723 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3724 WARPER8_16_SQ(rd8x8_c, rd16_c)
3725 WARPER8_16_SQ(bit8x8_c, bit16_c)
3727 /* XXX: these functions should be removed ASAP, once all IDCTs are converted */
/*
 * "put" wrapper: stores `block` into dest (row pitch line_size) through
 * put_pixels_clamped_c().
 * NOTE(review): presumably the block is inverse-transformed first, as the
 * name says; that call is not established here -- confirm.
 */
3729 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3732 put_pixels_clamped_c(block, dest, line_size);
/*
 * "add" wrapper: adds `block` onto dest (row pitch line_size) through
 * add_pixels_clamped_c().
 * NOTE(review): presumably the block is inverse-transformed first, as the
 * name says; that call is not established here -- confirm.
 */
3734 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3737 add_pixels_clamped_c(block, dest, line_size);
/*
 * "put" wrapper for the 4-sample variant: stores `block` into dest through
 * put_pixels_clamped4_c().
 * NOTE(review): presumably preceded by the matching inverse transform; that
 * call is not established here -- confirm.
 */
3740 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3743 put_pixels_clamped4_c(block, dest, line_size);
/*
 * "add" wrapper for the 4-sample variant: adds `block` onto dest through
 * add_pixels_clamped4_c().
 * NOTE(review): presumably preceded by the matching inverse transform; that
 * call is not established here -- confirm.
 */
3745 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3748 add_pixels_clamped4_c(block, dest, line_size);
/*
 * "put" wrapper for the 2-sample variant: stores `block` into dest through
 * put_pixels_clamped2_c().
 * NOTE(review): presumably preceded by the matching inverse transform; that
 * call is not established here -- confirm.
 */
3751 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3754 put_pixels_clamped2_c(block, dest, line_size);
/*
 * "add" wrapper for the 2-sample variant: adds `block` onto dest through
 * add_pixels_clamped2_c().
 * NOTE(review): presumably preceded by the matching inverse transform; that
 * call is not established here -- confirm.
 */
3756 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3759 add_pixels_clamped2_c(block, dest, line_size);
/*
 * Single-sample "put": writes one pixel derived from block[0] to dest[0].
 * The coefficient is divided by 8 with rounding ((block[0] + 4) >> 3) and
 * mapped through the crop table. line_size is not used by these statements.
 */
3762 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
/* cm is biased by MAX_NEG_CROP so that negative indices stay inside cropTbl */
3764 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3766 dest[0] = cm[(block[0] + 4)>>3];
/*
 * Single-sample "add": adds the rounded value (block[0] + 4) >> 3 to the
 * existing pixel dest[0] and maps the sum through the crop table before
 * storing it back. line_size is not used by these statements.
 */
3768 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
/* cm is biased by MAX_NEG_CROP so that negative indices stay inside cropTbl */
3770 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3772 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3775 /* init static data */
/*
 * Fills the file-level lookup tables:
 *  - cropTbl: entries MAX_NEG_CROP .. MAX_NEG_CROP+255 map a value to itself,
 *    and the MAX_NEG_CROP entries above that range are set to 255, so that
 *    indexing with an offset of MAX_NEG_CROP saturates high values;
 *  - squareTbl: squareTbl[i] = (i - 256)^2, i.e. squares of -256..255 when
 *    indexed with an offset of 256;
 *  - inv_zigzag_direct16: inverse of ff_zigzag_direct, stored plus one.
 */
3776 void dsputil_static_init(void)
/* identity part of the crop table */
3780 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3781 for(i=0;i<MAX_NEG_CROP;i++) {
/* upper guard band saturates at 255 */
3783 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3786 for(i=0;i<512;i++) {
3787 squareTbl[i] = (i - 256) * (i - 256);
/* position (1-based) of each coefficient index within the zigzag scan */
3790 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3794 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3798 #ifdef CONFIG_ENCODERS
3799 if(avctx->dct_algo==FF_DCT_FASTINT) {
3800 c->fdct = fdct_ifast;
3801 c->fdct248 = fdct_ifast248;
3803 else if(avctx->dct_algo==FF_DCT_FAAN) {
3804 c->fdct = ff_faandct;
3805 c->fdct248 = ff_faandct248;
3808 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3809 c->fdct248 = ff_fdct248_islow;
3811 #endif //CONFIG_ENCODERS
3813 if(avctx->lowres==1){
3814 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3815 c->idct_put= ff_jref_idct4_put;
3816 c->idct_add= ff_jref_idct4_add;
3818 c->idct_put= ff_h264_lowres_idct_put_c;
3819 c->idct_add= ff_h264_lowres_idct_add_c;
3821 c->idct = j_rev_dct4;
3822 c->idct_permutation_type= FF_NO_IDCT_PERM;
3823 }else if(avctx->lowres==2){
3824 c->idct_put= ff_jref_idct2_put;
3825 c->idct_add= ff_jref_idct2_add;
3826 c->idct = j_rev_dct2;
3827 c->idct_permutation_type= FF_NO_IDCT_PERM;
3828 }else if(avctx->lowres==3){
3829 c->idct_put= ff_jref_idct1_put;
3830 c->idct_add= ff_jref_idct1_add;
3831 c->idct = j_rev_dct1;
3832 c->idct_permutation_type= FF_NO_IDCT_PERM;
3834 if(avctx->idct_algo==FF_IDCT_INT){
3835 c->idct_put= ff_jref_idct_put;
3836 c->idct_add= ff_jref_idct_add;
3837 c->idct = j_rev_dct;
3838 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3839 }else if(avctx->idct_algo==FF_IDCT_VP3){
3840 c->idct_put= ff_vp3_idct_put_c;
3841 c->idct_add= ff_vp3_idct_add_c;
3842 c->idct = ff_vp3_idct_c;
3843 c->idct_permutation_type= FF_NO_IDCT_PERM;
3844 }else{ //accurate/default
3845 c->idct_put= simple_idct_put;
3846 c->idct_add= simple_idct_add;
3847 c->idct = simple_idct;
3848 c->idct_permutation_type= FF_NO_IDCT_PERM;
3852 c->h264_idct_add= ff_h264_idct_add_c;
3853 c->h264_idct8_add= ff_h264_idct8_add_c;
3855 c->get_pixels = get_pixels_c;
3856 c->diff_pixels = diff_pixels_c;
3857 c->put_pixels_clamped = put_pixels_clamped_c;
3858 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3859 c->add_pixels_clamped = add_pixels_clamped_c;
3860 c->add_pixels8 = add_pixels8_c;
3861 c->add_pixels4 = add_pixels4_c;
3864 c->clear_blocks = clear_blocks_c;
3865 c->pix_sum = pix_sum_c;
3866 c->pix_norm1 = pix_norm1_c;
3868 /* TODO [0] 16 [1] 8 */
3869 c->pix_abs[0][0] = pix_abs16_c;
3870 c->pix_abs[0][1] = pix_abs16_x2_c;
3871 c->pix_abs[0][2] = pix_abs16_y2_c;
3872 c->pix_abs[0][3] = pix_abs16_xy2_c;
3873 c->pix_abs[1][0] = pix_abs8_c;
3874 c->pix_abs[1][1] = pix_abs8_x2_c;
3875 c->pix_abs[1][2] = pix_abs8_y2_c;
3876 c->pix_abs[1][3] = pix_abs8_xy2_c;
3878 #define dspfunc(PFX, IDX, NUM) \
3879 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3880 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3881 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3882 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3884 dspfunc(put, 0, 16);
3885 dspfunc(put_no_rnd, 0, 16);
3887 dspfunc(put_no_rnd, 1, 8);
3891 dspfunc(avg, 0, 16);
3892 dspfunc(avg_no_rnd, 0, 16);
3894 dspfunc(avg_no_rnd, 1, 8);
3899 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3900 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3902 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3903 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3904 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3905 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3906 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3907 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3908 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3909 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3910 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3912 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3913 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3914 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3915 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3916 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3917 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3918 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3919 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3920 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3922 #define dspfunc(PFX, IDX, NUM) \
3923 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3924 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3925 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3926 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3927 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3928 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3929 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3930 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3931 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3932 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3933 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3934 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3935 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3936 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3937 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3938 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3940 dspfunc(put_qpel, 0, 16);
3941 dspfunc(put_no_rnd_qpel, 0, 16);
3943 dspfunc(avg_qpel, 0, 16);
3944 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3946 dspfunc(put_qpel, 1, 8);
3947 dspfunc(put_no_rnd_qpel, 1, 8);
3949 dspfunc(avg_qpel, 1, 8);
3950 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3952 dspfunc(put_h264_qpel, 0, 16);
3953 dspfunc(put_h264_qpel, 1, 8);
3954 dspfunc(put_h264_qpel, 2, 4);
3955 dspfunc(put_h264_qpel, 3, 2);
3956 dspfunc(avg_h264_qpel, 0, 16);
3957 dspfunc(avg_h264_qpel, 1, 8);
3958 dspfunc(avg_h264_qpel, 2, 4);
3961 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3962 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3963 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3964 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3965 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3966 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3968 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3969 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3970 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3971 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3972 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3973 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3974 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3975 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3976 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3977 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3978 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3979 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3980 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3981 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3982 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3983 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3984 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3985 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3986 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3987 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3989 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3990 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3991 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3992 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3993 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3994 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3995 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3996 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3998 #define SET_CMP_FUNC(name) \
3999 c->name[0]= name ## 16_c;\
4000 c->name[1]= name ## 8x8_c;
4002 SET_CMP_FUNC(hadamard8_diff)
4003 c->hadamard8_diff[4]= hadamard8_intra16_c;
4004 SET_CMP_FUNC(dct_sad)
4005 SET_CMP_FUNC(dct_max)
4007 SET_CMP_FUNC(dct264_sad)
4009 c->sad[0]= pix_abs16_c;
4010 c->sad[1]= pix_abs8_c;
4014 SET_CMP_FUNC(quant_psnr)
4017 c->vsad[0]= vsad16_c;
4018 c->vsad[4]= vsad_intra16_c;
4019 c->vsse[0]= vsse16_c;
4020 c->vsse[4]= vsse_intra16_c;
4021 c->nsse[0]= nsse16_c;
4022 c->nsse[1]= nsse8_c;
4023 c->w53[0]= w53_16_c;
4025 c->w97[0]= w97_16_c;
4028 c->add_bytes= add_bytes_c;
4029 c->diff_bytes= diff_bytes_c;
4030 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4031 c->bswap_buf= bswap_buf;
4033 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4034 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4035 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4036 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4037 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4038 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4040 c->h263_h_loop_filter= h263_h_loop_filter_c;
4041 c->h263_v_loop_filter= h263_v_loop_filter_c;
4043 c->h261_loop_filter= h261_loop_filter_c;
4045 c->try_8x8basis= try_8x8basis_c;
4046 c->add_8x8basis= add_8x8basis_c;
4049 dsputil_init_mmx(c, avctx);
4052 dsputil_init_armv4l(c, avctx);
4055 dsputil_init_mlib(c, avctx);
4058 dsputil_init_vis(c,avctx);
4061 dsputil_init_alpha(c, avctx);
4064 dsputil_init_ppc(c, avctx);
4067 dsputil_init_mmi(c, avctx);
4070 dsputil_init_sh4(c,avctx);
4073 switch(c->idct_permutation_type){
4074 case FF_NO_IDCT_PERM:
4076 c->idct_permutation[i]= i;
4078 case FF_LIBMPEG2_IDCT_PERM:
4080 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4082 case FF_SIMPLE_IDCT_PERM:
4084 c->idct_permutation[i]= simple_mmx_permutation[i];
4086 case FF_TRANSPOSE_IDCT_PERM:
4088 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4090 case FF_PARTTRANS_IDCT_PERM:
4092 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4095 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");