3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
34 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
35 uint32_t squareTbl[512] = {0, };
/* Standard (progressive) zigzag scan order: maps scan position -> raster index. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 2-4-8 idct. NOTE that unlike the
   specification, we interleave the fields. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
61 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
62 uint16_t __align8 inv_zigzag_direct16[64] = {0, };
/* Alternate horizontal scan order (used for interlaced material). */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (used for interlaced material). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
   (reciprocal table so divisions can be replaced by a multiply + shift). */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/**
 * Sum of all pixels of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size stride in bytes between rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16; /* advance to next row */
    }
    return s;
}
156 static int pix_norm1_c(uint8_t * pix, int line_size)
159 uint32_t *sq = squareTbl + 256;
162 for (i = 0; i < 16; i++) {
163 for (j = 0; j < 16; j += 8) {
174 #if LONG_MAX > 2147483647
175 register uint64_t x=*(uint64_t*)pix;
177 s += sq[(x>>8)&0xff];
178 s += sq[(x>>16)&0xff];
179 s += sq[(x>>24)&0xff];
180 s += sq[(x>>32)&0xff];
181 s += sq[(x>>40)&0xff];
182 s += sq[(x>>48)&0xff];
183 s += sq[(x>>56)&0xff];
185 register uint32_t x=*(uint32_t*)pix;
187 s += sq[(x>>8)&0xff];
188 s += sq[(x>>16)&0xff];
189 s += sq[(x>>24)&0xff];
190 x=*(uint32_t*)(pix+4);
192 s += sq[(x>>8)&0xff];
193 s += sq[(x>>16)&0xff];
194 s += sq[(x>>24)&0xff];
199 pix += line_size - 16;
/**
 * Byte-swap w 32-bit words from src into dst (dst may equal src).
 * Unrolled by 8 for the bulk, with a scalar tail loop for the remainder.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(   ; i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
222 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
225 uint32_t *sq = squareTbl + 256;
228 for (i = 0; i < h; i++) {
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
239 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
242 uint32_t *sq = squareTbl + 256;
245 for (i = 0; i < h; i++) {
246 s += sq[pix1[0] - pix2[0]];
247 s += sq[pix1[1] - pix2[1]];
248 s += sq[pix1[2] - pix2[2]];
249 s += sq[pix1[3] - pix2[3]];
250 s += sq[pix1[4] - pix2[4]];
251 s += sq[pix1[5] - pix2[5]];
252 s += sq[pix1[6] - pix2[6]];
253 s += sq[pix1[7] - pix2[7]];
260 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
263 uint32_t *sq = squareTbl + 256;
266 for (i = 0; i < h; i++) {
267 s += sq[pix1[ 0] - pix2[ 0]];
268 s += sq[pix1[ 1] - pix2[ 1]];
269 s += sq[pix1[ 2] - pix2[ 2]];
270 s += sq[pix1[ 3] - pix2[ 3]];
271 s += sq[pix1[ 4] - pix2[ 4]];
272 s += sq[pix1[ 5] - pix2[ 5]];
273 s += sq[pix1[ 6] - pix2[ 6]];
274 s += sq[pix1[ 7] - pix2[ 7]];
275 s += sq[pix1[ 8] - pix2[ 8]];
276 s += sq[pix1[ 9] - pix2[ 9]];
277 s += sq[pix1[10] - pix2[10]];
278 s += sq[pix1[11] - pix2[11]];
279 s += sq[pix1[12] - pix2[12]];
280 s += sq[pix1[13] - pix2[13]];
281 s += sq[pix1[14] - pix2[14]];
282 s += sq[pix1[15] - pix2[15]];
291 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
293 const int dec_count= w==8 ? 3 : 4;
297 static const int scale[2][2][4][4]={
301 {268, 239, 239, 213},
306 {344, 310, 310, 280},
314 {275, 245, 245, 218},
319 {352, 317, 317, 286},
328 for (i = 0; i < h; i++) {
329 for (j = 0; j < w; j+=4) {
330 tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
331 tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
332 tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
333 tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
338 ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
342 for(level=0; level<dec_count; level++){
343 for(ori= level ? 1 : 0; ori<4; ori++){
344 int sx= (ori&1) ? 1<<level: 0;
345 int stride= 16<<(dec_count-level);
346 int sy= (ori&2) ? stride>>1 : 0;
349 for(i=0; i<size; i++){
350 for(j=0; j<size; j++){
351 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
358 for (i = 0; i < h; i++) {
359 for (j = 0; j < w; j+=4) {
360 s+= ABS(tmp[16*i+j+0]);
361 s+= ABS(tmp[16*i+j+1]);
362 s+= ABS(tmp[16*i+j+2]);
363 s+= ABS(tmp[16*i+j+3]);
/** 8-wide wavelet score, 5/3 filter (type 1); thin wrapper around w_c(). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
/** 8-wide wavelet score, 9/7 filter (type 0); thin wrapper around w_c(). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
/** 16-wide wavelet score, 5/3 filter (type 1); thin wrapper around w_c(). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
/** 16-wide wavelet score, 9/7 filter (type 0); thin wrapper around w_c(). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
387 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
391 /* read the pixels */
393 block[0] = pixels[0];
394 block[1] = pixels[1];
395 block[2] = pixels[2];
396 block[3] = pixels[3];
397 block[4] = pixels[4];
398 block[5] = pixels[5];
399 block[6] = pixels[6];
400 block[7] = pixels[7];
406 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
407 const uint8_t *s2, int stride){
410 /* read the pixels */
412 block[0] = s1[0] - s2[0];
413 block[1] = s1[1] - s2[1];
414 block[2] = s1[2] - s2[2];
415 block[3] = s1[3] - s2[3];
416 block[4] = s1[4] - s2[4];
417 block[5] = s1[5] - s2[5];
418 block[6] = s1[6] - s2[6];
419 block[7] = s1[7] - s2[7];
427 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
431 uint8_t *cm = cropTbl + MAX_NEG_CROP;
433 /* read the pixels */
435 pixels[0] = cm[block[0]];
436 pixels[1] = cm[block[1]];
437 pixels[2] = cm[block[2]];
438 pixels[3] = cm[block[3]];
439 pixels[4] = cm[block[4]];
440 pixels[5] = cm[block[5]];
441 pixels[6] = cm[block[6]];
442 pixels[7] = cm[block[7]];
449 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
453 uint8_t *cm = cropTbl + MAX_NEG_CROP;
455 /* read the pixels */
457 pixels[0] = cm[block[0]];
458 pixels[1] = cm[block[1]];
459 pixels[2] = cm[block[2]];
460 pixels[3] = cm[block[3]];
467 static void put_signed_pixels_clamped_c(const DCTELEM *block,
468 uint8_t *restrict pixels,
473 for (i = 0; i < 8; i++) {
474 for (j = 0; j < 8; j++) {
477 else if (*block > 127)
480 *pixels = (uint8_t)(*block + 128);
484 pixels += (line_size - 8);
488 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
492 uint8_t *cm = cropTbl + MAX_NEG_CROP;
494 /* read the pixels */
496 pixels[0] = cm[pixels[0] + block[0]];
497 pixels[1] = cm[pixels[1] + block[1]];
498 pixels[2] = cm[pixels[2] + block[2]];
499 pixels[3] = cm[pixels[3] + block[3]];
500 pixels[4] = cm[pixels[4] + block[4]];
501 pixels[5] = cm[pixels[5] + block[5]];
502 pixels[6] = cm[pixels[6] + block[6]];
503 pixels[7] = cm[pixels[7] + block[7]];
509 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
513 uint8_t *cm = cropTbl + MAX_NEG_CROP;
515 /* read the pixels */
517 pixels[0] = cm[pixels[0] + block[0]];
518 pixels[1] = cm[pixels[1] + block[1]];
519 pixels[2] = cm[pixels[2] + block[2]];
520 pixels[3] = cm[pixels[3] + block[3]];
527 #define PIXOP2(OPNAME, OP) \
528 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
532 OP(*((uint64_t*)block), LD64(pixels));\
538 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
542 const uint64_t a= LD64(pixels );\
543 const uint64_t b= LD64(pixels+1);\
544 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
550 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
554 const uint64_t a= LD64(pixels );\
555 const uint64_t b= LD64(pixels+1);\
556 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
562 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
566 const uint64_t a= LD64(pixels );\
567 const uint64_t b= LD64(pixels+line_size);\
568 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
574 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
578 const uint64_t a= LD64(pixels );\
579 const uint64_t b= LD64(pixels+line_size);\
580 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
586 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
589 const uint64_t a= LD64(pixels );\
590 const uint64_t b= LD64(pixels+1);\
591 uint64_t l0= (a&0x0303030303030303ULL)\
592 + (b&0x0303030303030303ULL)\
593 + 0x0202020202020202ULL;\
594 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
595 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
599 for(i=0; i<h; i+=2){\
600 uint64_t a= LD64(pixels );\
601 uint64_t b= LD64(pixels+1);\
602 l1= (a&0x0303030303030303ULL)\
603 + (b&0x0303030303030303ULL);\
604 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
605 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
606 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
611 l0= (a&0x0303030303030303ULL)\
612 + (b&0x0303030303030303ULL)\
613 + 0x0202020202020202ULL;\
614 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
615 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
616 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
622 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
625 const uint64_t a= LD64(pixels );\
626 const uint64_t b= LD64(pixels+1);\
627 uint64_t l0= (a&0x0303030303030303ULL)\
628 + (b&0x0303030303030303ULL)\
629 + 0x0101010101010101ULL;\
630 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
631 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
635 for(i=0; i<h; i+=2){\
636 uint64_t a= LD64(pixels );\
637 uint64_t b= LD64(pixels+1);\
638 l1= (a&0x0303030303030303ULL)\
639 + (b&0x0303030303030303ULL);\
640 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
641 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
642 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
647 l0= (a&0x0303030303030303ULL)\
648 + (b&0x0303030303030303ULL)\
649 + 0x0101010101010101ULL;\
650 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
651 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
652 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
658 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
659 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
660 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
661 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
662 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
663 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
664 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
666 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
667 #else // 64 bit variant
669 #define PIXOP2(OPNAME, OP) \
670 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
673 OP(*((uint16_t*)(block )), LD16(pixels ));\
678 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
681 OP(*((uint32_t*)(block )), LD32(pixels ));\
686 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
689 OP(*((uint32_t*)(block )), LD32(pixels ));\
690 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
695 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
696 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
699 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
700 int src_stride1, int src_stride2, int h){\
704 a= LD32(&src1[i*src_stride1 ]);\
705 b= LD32(&src2[i*src_stride2 ]);\
706 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
707 a= LD32(&src1[i*src_stride1+4]);\
708 b= LD32(&src2[i*src_stride2+4]);\
709 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
713 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
714 int src_stride1, int src_stride2, int h){\
718 a= LD32(&src1[i*src_stride1 ]);\
719 b= LD32(&src2[i*src_stride2 ]);\
720 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
721 a= LD32(&src1[i*src_stride1+4]);\
722 b= LD32(&src2[i*src_stride2+4]);\
723 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
727 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
728 int src_stride1, int src_stride2, int h){\
732 a= LD32(&src1[i*src_stride1 ]);\
733 b= LD32(&src2[i*src_stride2 ]);\
734 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
738 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
739 int src_stride1, int src_stride2, int h){\
743 a= LD16(&src1[i*src_stride1 ]);\
744 b= LD16(&src2[i*src_stride2 ]);\
745 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
749 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
750 int src_stride1, int src_stride2, int h){\
751 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
752 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
755 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
756 int src_stride1, int src_stride2, int h){\
757 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
758 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
761 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
762 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
765 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
766 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
769 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
770 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
773 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
774 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
777 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
778 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
781 uint32_t a, b, c, d, l0, l1, h0, h1;\
782 a= LD32(&src1[i*src_stride1]);\
783 b= LD32(&src2[i*src_stride2]);\
784 c= LD32(&src3[i*src_stride3]);\
785 d= LD32(&src4[i*src_stride4]);\
786 l0= (a&0x03030303UL)\
789 h0= ((a&0xFCFCFCFCUL)>>2)\
790 + ((b&0xFCFCFCFCUL)>>2);\
791 l1= (c&0x03030303UL)\
793 h1= ((c&0xFCFCFCFCUL)>>2)\
794 + ((d&0xFCFCFCFCUL)>>2);\
795 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
796 a= LD32(&src1[i*src_stride1+4]);\
797 b= LD32(&src2[i*src_stride2+4]);\
798 c= LD32(&src3[i*src_stride3+4]);\
799 d= LD32(&src4[i*src_stride4+4]);\
800 l0= (a&0x03030303UL)\
803 h0= ((a&0xFCFCFCFCUL)>>2)\
804 + ((b&0xFCFCFCFCUL)>>2);\
805 l1= (c&0x03030303UL)\
807 h1= ((c&0xFCFCFCFCUL)>>2)\
808 + ((d&0xFCFCFCFCUL)>>2);\
809 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
813 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
814 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
817 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
818 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
821 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
822 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
825 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
826 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
829 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
830 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
833 uint32_t a, b, c, d, l0, l1, h0, h1;\
834 a= LD32(&src1[i*src_stride1]);\
835 b= LD32(&src2[i*src_stride2]);\
836 c= LD32(&src3[i*src_stride3]);\
837 d= LD32(&src4[i*src_stride4]);\
838 l0= (a&0x03030303UL)\
841 h0= ((a&0xFCFCFCFCUL)>>2)\
842 + ((b&0xFCFCFCFCUL)>>2);\
843 l1= (c&0x03030303UL)\
845 h1= ((c&0xFCFCFCFCUL)>>2)\
846 + ((d&0xFCFCFCFCUL)>>2);\
847 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
848 a= LD32(&src1[i*src_stride1+4]);\
849 b= LD32(&src2[i*src_stride2+4]);\
850 c= LD32(&src3[i*src_stride3+4]);\
851 d= LD32(&src4[i*src_stride4+4]);\
852 l0= (a&0x03030303UL)\
855 h0= ((a&0xFCFCFCFCUL)>>2)\
856 + ((b&0xFCFCFCFCUL)>>2);\
857 l1= (c&0x03030303UL)\
859 h1= ((c&0xFCFCFCFCUL)>>2)\
860 + ((d&0xFCFCFCFCUL)>>2);\
861 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
864 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
865 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
866 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
867 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
869 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
870 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
871 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
872 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
875 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
877 int i, a0, b0, a1, b1;\
884 for(i=0; i<h; i+=2){\
890 block[0]= (a1+a0)>>2; /* FIXME non put */\
891 block[1]= (b1+b0)>>2;\
901 block[0]= (a1+a0)>>2;\
902 block[1]= (b1+b0)>>2;\
908 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
911 const uint32_t a= LD32(pixels );\
912 const uint32_t b= LD32(pixels+1);\
913 uint32_t l0= (a&0x03030303UL)\
916 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
917 + ((b&0xFCFCFCFCUL)>>2);\
921 for(i=0; i<h; i+=2){\
922 uint32_t a= LD32(pixels );\
923 uint32_t b= LD32(pixels+1);\
924 l1= (a&0x03030303UL)\
926 h1= ((a&0xFCFCFCFCUL)>>2)\
927 + ((b&0xFCFCFCFCUL)>>2);\
928 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
933 l0= (a&0x03030303UL)\
936 h0= ((a&0xFCFCFCFCUL)>>2)\
937 + ((b&0xFCFCFCFCUL)>>2);\
938 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
944 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
949 const uint32_t a= LD32(pixels );\
950 const uint32_t b= LD32(pixels+1);\
951 uint32_t l0= (a&0x03030303UL)\
954 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
955 + ((b&0xFCFCFCFCUL)>>2);\
959 for(i=0; i<h; i+=2){\
960 uint32_t a= LD32(pixels );\
961 uint32_t b= LD32(pixels+1);\
962 l1= (a&0x03030303UL)\
964 h1= ((a&0xFCFCFCFCUL)>>2)\
965 + ((b&0xFCFCFCFCUL)>>2);\
966 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
971 l0= (a&0x03030303UL)\
974 h0= ((a&0xFCFCFCFCUL)>>2)\
975 + ((b&0xFCFCFCFCUL)>>2);\
976 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
980 pixels+=4-line_size*(h+1);\
981 block +=4-line_size*h;\
985 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
990 const uint32_t a= LD32(pixels );\
991 const uint32_t b= LD32(pixels+1);\
992 uint32_t l0= (a&0x03030303UL)\
995 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
996 + ((b&0xFCFCFCFCUL)>>2);\
1000 for(i=0; i<h; i+=2){\
1001 uint32_t a= LD32(pixels );\
1002 uint32_t b= LD32(pixels+1);\
1003 l1= (a&0x03030303UL)\
1004 + (b&0x03030303UL);\
1005 h1= ((a&0xFCFCFCFCUL)>>2)\
1006 + ((b&0xFCFCFCFCUL)>>2);\
1007 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1012 l0= (a&0x03030303UL)\
1015 h0= ((a&0xFCFCFCFCUL)>>2)\
1016 + ((b&0xFCFCFCFCUL)>>2);\
1017 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1021 pixels+=4-line_size*(h+1);\
1022 block +=4-line_size*h;\
1026 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1027 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1028 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1029 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1030 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1031 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1032 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1033 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* Per-instantiation ops for PIXOP2: avg = rounded average into dst, put = plain store. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#define op_put(a, b) a = b
/* Scalar rounded averages; arguments parenthesized to avoid precedence bugs
   when called with compound expressions. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/** Equal-stride wrapper for the no-round 16-wide two-source average. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
/** Equal-stride wrapper for the no-round 8-wide two-source average. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/**
 * 1/16-pel bilinear global motion compensation for an 8-wide block.
 * @param x16,y16  fractional position in 1/16 pel units (0..16)
 * @param rounder  rounding constant added before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* bilinear weights; A+B+C+D == 256 */
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
1078 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1079 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1082 const int s= 1<<shift;
1092 for(x=0; x<8; x++){ //XXX FIXME optimize
1093 int src_x, src_y, frac_x, frac_y, index;
1097 frac_x= src_x&(s-1);
1098 frac_y= src_y&(s-1);
1102 if((unsigned)src_x < width){
1103 if((unsigned)src_y < height){
1104 index= src_x + src_y*stride;
1105 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1106 + src[index +1]* frac_x )*(s-frac_y)
1107 + ( src[index+stride ]*(s-frac_x)
1108 + src[index+stride+1]* frac_x )* frac_y
1111 index= src_x + clip(src_y, 0, height)*stride;
1112 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1113 + src[index +1]* frac_x )*s
1117 if((unsigned)src_y < height){
1118 index= clip(src_x, 0, width) + src_y*stride;
1119 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1120 + src[index+stride ]* frac_y )*s
1123 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
1124 dst[y*stride + x]= src[index ];
/** Thirdpel MC, zero fractional offset: plain copy, dispatched on block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/** Thirdpel MC, horizontal 1/3 phase: (2*a+b)/3 approximated by *683>>11. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/** Thirdpel MC, horizontal 2/3 phase: (a+2*b)/3 approximated by *683>>11. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/** Thirdpel MC, vertical 1/3 phase: (2*a+b)/3 approximated by *683>>11. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/** Thirdpel MC, (1/3,1/3) phase: bilinear 4/12,3/12,3/12,2/12 weights via *2731>>15. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/** Thirdpel MC, (1/3,2/3) phase: bilinear 3,2,4,3 /12 weights via *2731>>15. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/** Thirdpel MC, vertical 2/3 phase: (a+2*b)/3 approximated by *683>>11. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/** Thirdpel MC, (2/3,1/3) phase: bilinear 3,4,2,3 /12 weights via *2731>>15. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/** Thirdpel MC, (2/3,2/3) phase: bilinear 2,3,3,4 /12 weights via *2731>>15. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC, integer position, averaging store: dispatches to the
 * fixed-width avg_pixelsN_c() helpers by block width (2/4/8/16; other
 * widths are silently ignored, as in the put variant).
 * Fix: restores the switch statement and braces lost to truncation.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/**
 * Thirdpel MC, horizontal offset 1/3, averaging store (SVQ3):
 * dst = (dst + interpolated + 1) >> 1, rounded average with the
 * existing destination.
 * Fix: restores truncated loop scaffolding (declarations, row advance,
 * braces).
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Thirdpel MC, horizontal offset 2/3, averaging store (SVQ3).
 * Fix: restores truncated loop scaffolding (declarations, row advance,
 * braces).
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Thirdpel MC, vertical offset 1/3, averaging store (SVQ3).
 * Fix: restores truncated loop scaffolding (declarations, row advance,
 * braces).
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Thirdpel MC, offset (1/3, 1/3), averaging store (SVQ3).
 * Fix: restores truncated loop scaffolding (declarations, row advance,
 * braces).
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Thirdpel MC, offset (1/3, 2/3), averaging store (SVQ3).
 * Fix: restores truncated loop scaffolding (declarations, row advance,
 * braces).
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Thirdpel MC, vertical offset 2/3, averaging store (SVQ3).
 * Fix: restores truncated loop scaffolding (declarations, row advance,
 * braces).
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Thirdpel MC, offset (2/3, 1/3), averaging store (SVQ3).
 * Fix: restores truncated loop scaffolding (declarations, row advance,
 * braces).
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Thirdpel MC, offset (2/3, 2/3), averaging store (SVQ3).
 * Fix: restores truncated loop scaffolding (declarations, row advance,
 * braces).
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * TPEL_WIDTH(width): instantiates fixed-width thirdpel wrappers around
 * the generic put_tpel_pixels_mcXY_c() helpers above.
 * Fix: each wrapper body previously began with a stray "void", which
 * made the line a local function *declaration* instead of a call, so the
 * wrappers were no-ops; the "void" is removed so the helpers are
 * actually invoked.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/**
 * H264_CHROMA_MC(OPNAME, OP): generates the H.264 chroma MC functions
 * for 2-, 4- and 8-pixel-wide blocks.  (x,y) is the eighth-pel
 * fractional position (0..7); each output pixel is the bilinear blend
 * A*tl + B*tr + C*bl + D*br with A+B+C+D == 64, stored through OP
 * (put: (v+32)>>6; avg: rounded average with the old destination).
 * Fix: restores the per-row loop, pointer advance and closing braces
 * lost to truncation, and re-adds the trailing #undefs so the later
 * op_put/op_avg definitions used by QPEL_MC do not clash.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/* put: round and scale by 1/64; avg: rounded average with destination */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_ , op_put)
H264_CHROMA_MC(avg_ , op_avg)

#undef op_avg
#undef op_put
/**
 * Copies a 4-byte-wide block of h rows using the project's unaligned
 * 32-bit load/store macros (LD32/ST32, declared in dsputil.h).
 * Fix: restores the row loop and pointer advance lost to truncation.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copies an 8-byte-wide block of h rows (two unaligned 32-bit
 * load/stores per row via LD32/ST32 from dsputil.h).
 * Fix: restores the row loop and pointer advance lost to truncation.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copies a 16-byte-wide block of h rows (four unaligned 32-bit
 * load/stores per row via LD32/ST32 from dsputil.h).
 * Fix: restores the row loop and pointer advance lost to truncation.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copies a 17-byte-wide block of h rows: 16 bytes via four unaligned
 * 32-bit load/stores plus the odd 17th byte individually.  Used to set
 * up the padded source for the 16-tap qpel filters (which read one
 * extra column).
 * Fix: restores the row loop, the trailing-byte copy and the pointer
 * advance lost to truncation.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copies a 9-byte-wide block of h rows: 8 bytes via two unaligned
 * 32-bit load/stores plus the odd 9th byte individually.  Used to set
 * up the padded source for the 8-tap qpel filters.
 * Fix: restores the row loop, the trailing-byte copy and the pointer
 * advance lost to truncation.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * QPEL_MC(r, OPNAME, RND, OP): generator for the MPEG-4 quarter-pel
 * motion-compensation function family.  It emits:
 *   - OPNAME##mpeg4_qpel{8,16}_{h,v}_lowpass: the 8-tap half-pel
 *     interpolation filters with taps (20,-6,3,-1)/32, edge taps
 *     mirrored at the block borders;
 *   - OPNAME##qpel{8,16}_mcXY_c (X,Y in 0..3): the 16 quarter-pel
 *     positions, built by blending the relevant half-pel planes with
 *     pixelsN_l2 (2-plane average) helpers;
 *   - ff_##OPNAME##qpel{8,16}_mcXY_old_c: older 4-plane-average
 *     variants (pixelsN_l4), kept for reference/compatibility.
 * OP is the final store macro (put/avg, rounding or not, defined just
 * below the macro); RND selects the rounding flavour of the helper
 * calls used for the intermediate planes.
 * NOTE(review): a number of continuation lines of this macro (loop
 * heads, temporaries, closing braces) are missing from this copy of
 * the file; the code is kept byte-for-byte, only this comment added.
 */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
    OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
    OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
    OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
    OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
    OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
    OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
    OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    const int src0= src[0*srcStride];\
    const int src1= src[1*srcStride];\
    const int src2= src[2*srcStride];\
    const int src3= src[3*srcStride];\
    const int src4= src[4*srcStride];\
    const int src5= src[5*srcStride];\
    const int src6= src[6*srcStride];\
    const int src7= src[7*srcStride];\
    const int src8= src[8*srcStride];\
    OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
    OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
    OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
    OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
    OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
    OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
    OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
    OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
    OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
    OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
    OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
    OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
    OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
    OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
    OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
    OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
    OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
    OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
    OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
    OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
    OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
    OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
    OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    const int src0= src[0*srcStride];\
    const int src1= src[1*srcStride];\
    const int src2= src[2*srcStride];\
    const int src3= src[3*srcStride];\
    const int src4= src[4*srcStride];\
    const int src5= src[5*srcStride];\
    const int src6= src[6*srcStride];\
    const int src7= src[7*srcStride];\
    const int src8= src[8*srcStride];\
    const int src9= src[9*srcStride];\
    const int src10= src[10*srcStride];\
    const int src11= src[11*srcStride];\
    const int src12= src[12*srcStride];\
    const int src13= src[13*srcStride];\
    const int src14= src[14*srcStride];\
    const int src15= src[15*srcStride];\
    const int src16= src[16*srcStride];\
    OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
    OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
    OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
    OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
    OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
    OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
    OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
    OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
    OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
    OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
    OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
    OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
    OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
    OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
    OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
    OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Final-stage store macros for QPEL_MC: the 8-tap filter sum b is scaled
 * by 1/32; "+16" rounds to nearest, the *_no_rnd variants use "+15"
 * (rounding towards zero, required by the no-rounding prediction mode).
 * cm points into cropTbl and clips the filtered value to 0..255. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* instantiate the put, no-rounding put and avg qpel function families */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
/* an avg_no_rnd variant was never needed, hence left disabled: */
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg_no_rnd
#undef op_put_no_rnd
/*
 * H264_LOWPASS(OPNAME, OP, OP2): template generating the H.264 half-pel
 * interpolation primitives.  Each filter applies the 6-tap kernel
 * (1, -5, 20, 20, -5, 1) — visible in the (a+b)*20 - (c+d)*5 + (e+f)
 * expressions — horizontally (_h_), vertically (_v_), or both (_hv_,
 * via an int16_t intermediate buffer), for 4x4, 8x8 and 16x16 blocks
 * (the 16x16 variants tile four 8x8 calls).
 * OP stores a value filtered in one dimension; OP2 stores a value
 * filtered in both dimensions (different normalization, see the op2_*
 * macros where this template is instantiated).  `cm` points into the
 * clipping table so results are clamped to the 0..255 pixel range.
 */
1986 #define H264_LOWPASS(OPNAME, OP, OP2) \
1987 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1989     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1993         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1994         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1995         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1996         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2002 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2004     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2008         const int srcB= src[-2*srcStride];\
2009         const int srcA= src[-1*srcStride];\
2010         const int src0= src[0 *srcStride];\
2011         const int src1= src[1 *srcStride];\
2012         const int src2= src[2 *srcStride];\
2013         const int src3= src[3 *srcStride];\
2014         const int src4= src[4 *srcStride];\
2015         const int src5= src[5 *srcStride];\
2016         const int src6= src[6 *srcStride];\
2017         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2018         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2019         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2020         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2026 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2029     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2031     src -= 2*srcStride;\
2032     for(i=0; i<h+5; i++)\
2034         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2035         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2036         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2037         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2041     tmp -= tmpStride*(h+5-2);\
2044         const int tmpB= tmp[-2*tmpStride];\
2045         const int tmpA= tmp[-1*tmpStride];\
2046         const int tmp0= tmp[0 *tmpStride];\
2047         const int tmp1= tmp[1 *tmpStride];\
2048         const int tmp2= tmp[2 *tmpStride];\
2049         const int tmp3= tmp[3 *tmpStride];\
2050         const int tmp4= tmp[4 *tmpStride];\
2051         const int tmp5= tmp[5 *tmpStride];\
2052         const int tmp6= tmp[6 *tmpStride];\
2053         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2054         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2055         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2056         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2062 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2064     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2068         OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2069         OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2070         OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2071         OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2072         OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2073         OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2074         OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2075         OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2081 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2083     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2087         const int srcB= src[-2*srcStride];\
2088         const int srcA= src[-1*srcStride];\
2089         const int src0= src[0 *srcStride];\
2090         const int src1= src[1 *srcStride];\
2091         const int src2= src[2 *srcStride];\
2092         const int src3= src[3 *srcStride];\
2093         const int src4= src[4 *srcStride];\
2094         const int src5= src[5 *srcStride];\
2095         const int src6= src[6 *srcStride];\
2096         const int src7= src[7 *srcStride];\
2097         const int src8= src[8 *srcStride];\
2098         const int src9= src[9 *srcStride];\
2099         const int src10=src[10*srcStride];\
2100         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2101         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2102         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2103         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2104         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2105         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2106         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2107         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2113 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2116     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2118     src -= 2*srcStride;\
2119     for(i=0; i<h+5; i++)\
2121         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2122         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2123         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2124         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2125         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2126         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2127         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2128         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2132     tmp -= tmpStride*(h+5-2);\
2135         const int tmpB= tmp[-2*tmpStride];\
2136         const int tmpA= tmp[-1*tmpStride];\
2137         const int tmp0= tmp[0 *tmpStride];\
2138         const int tmp1= tmp[1 *tmpStride];\
2139         const int tmp2= tmp[2 *tmpStride];\
2140         const int tmp3= tmp[3 *tmpStride];\
2141         const int tmp4= tmp[4 *tmpStride];\
2142         const int tmp5= tmp[5 *tmpStride];\
2143         const int tmp6= tmp[6 *tmpStride];\
2144         const int tmp7= tmp[7 *tmpStride];\
2145         const int tmp8= tmp[8 *tmpStride];\
2146         const int tmp9= tmp[9 *tmpStride];\
2147         const int tmp10=tmp[10*tmpStride];\
2148         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2149         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2150         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2151         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2152         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2153         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2154         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2155         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2161 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2162     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2163     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2164     src += 8*srcStride;\
2165     dst += 8*dstStride;\
2166     OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
2167     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2170 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2171     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2172     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2173     src += 8*srcStride;\
2174     dst += 8*dstStride;\
2175     OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
2176     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2179 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2180     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2181     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2182     src += 8*srcStride;\
2183     dst += 8*dstStride;\
2184     OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
2185     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/*
 * H264_MC(OPNAME, SIZE): template generating the 16 quarter-pel H.264
 * motion-compensation entry points _mcXY_c, where X/Y are the quarter-pel
 * fractional offsets (0..3) in x and y.  Full-pel (mc00) is a plain pixel
 * copy/average; half-pel positions use the _h_/_v_/_hv_ lowpass filters
 * generated by H264_LOWPASS; quarter-pel positions average (via
 * pixels##SIZE##_l2) a half-pel filtered block with either the source or
 * a second filtered block.  `full` buffers hold SIZE+5 rows copied with a
 * 2-row top margin (full_mid = full + SIZE*2) so the vertical 6-tap
 * filter has the context rows it needs.
 */
2188 #define H264_MC(OPNAME, SIZE) \
2189 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2190     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2193 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2194     uint8_t half[SIZE*SIZE];\
2195     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2196     OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2199 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2200     OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2203 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2204     uint8_t half[SIZE*SIZE];\
2205     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2206     OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2209 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2210     uint8_t full[SIZE*(SIZE+5)];\
2211     uint8_t * const full_mid= full + SIZE*2;\
2212     uint8_t half[SIZE*SIZE];\
2213     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2214     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2215     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2218 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2219     uint8_t full[SIZE*(SIZE+5)];\
2220     uint8_t * const full_mid= full + SIZE*2;\
2221     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2222     OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2225 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2226     uint8_t full[SIZE*(SIZE+5)];\
2227     uint8_t * const full_mid= full + SIZE*2;\
2228     uint8_t half[SIZE*SIZE];\
2229     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2230     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2231     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2234 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2235     uint8_t full[SIZE*(SIZE+5)];\
2236     uint8_t * const full_mid= full + SIZE*2;\
2237     uint8_t halfH[SIZE*SIZE];\
2238     uint8_t halfV[SIZE*SIZE];\
2239     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2240     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2241     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2242     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2245 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2246     uint8_t full[SIZE*(SIZE+5)];\
2247     uint8_t * const full_mid= full + SIZE*2;\
2248     uint8_t halfH[SIZE*SIZE];\
2249     uint8_t halfV[SIZE*SIZE];\
2250     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2251     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2252     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2253     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2256 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2257     uint8_t full[SIZE*(SIZE+5)];\
2258     uint8_t * const full_mid= full + SIZE*2;\
2259     uint8_t halfH[SIZE*SIZE];\
2260     uint8_t halfV[SIZE*SIZE];\
2261     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2262     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2263     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2264     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2267 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2268     uint8_t full[SIZE*(SIZE+5)];\
2269     uint8_t * const full_mid= full + SIZE*2;\
2270     uint8_t halfH[SIZE*SIZE];\
2271     uint8_t halfV[SIZE*SIZE];\
2272     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2273     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2274     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2275     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2278 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2279     int16_t tmp[SIZE*(SIZE+5)];\
2280     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2283 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2284     int16_t tmp[SIZE*(SIZE+5)];\
2285     uint8_t halfH[SIZE*SIZE];\
2286     uint8_t halfHV[SIZE*SIZE];\
2287     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2288     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2289     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2292 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2293     int16_t tmp[SIZE*(SIZE+5)];\
2294     uint8_t halfH[SIZE*SIZE];\
2295     uint8_t halfHV[SIZE*SIZE];\
2296     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2297     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2298     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2301 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2302     uint8_t full[SIZE*(SIZE+5)];\
2303     uint8_t * const full_mid= full + SIZE*2;\
2304     int16_t tmp[SIZE*(SIZE+5)];\
2305     uint8_t halfV[SIZE*SIZE];\
2306     uint8_t halfHV[SIZE*SIZE];\
2307     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2308     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2309     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2310     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2313 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2314     uint8_t full[SIZE*(SIZE+5)];\
2315     uint8_t * const full_mid= full + SIZE*2;\
2316     int16_t tmp[SIZE*(SIZE+5)];\
2317     uint8_t halfV[SIZE*SIZE];\
2318     uint8_t halfHV[SIZE*SIZE];\
2319     copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
2320     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2321     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2322     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Per-pixel store operators for the H264_LOWPASS instantiations below.
 * op_put/op_avg normalize a value filtered in ONE dimension:
 * (+16)>>5 undoes the 6-tap kernel gain of 32.
 * op2_put/op2_avg normalize a value filtered in BOTH dimensions:
 * (+512)>>10 undoes the squared gain of 1024.  The avg variants then
 * round-average with the existing destination pixel; all results are
 * clamped through the cm[] clip table. */
2325 #define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2326 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2327 #define op_put(a, b)  a = cm[((b) + 16)>>5]
2328 #define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2329 #define op2_put(a, b)  a = cm[((b) + 512)>>10]
2331 H264_LOWPASS(put_       , op_put, op2_put)
2332 H264_LOWPASS(avg_       , op_avg, op2_avg)
/* WMV2 horizontal sub-pel filter: 4-tap kernel (-1, 9, 9, -1) with
 * rounding (+8)>>4, applied across one 8-pixel row; results are clamped
 * to 0..255 via the cm clip table.  The per-row loop over h and the
 * dst/src stride advance are not visible in this view. */
2346 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2347     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2351         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2352         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2353         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2354         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2355         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2356         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2357         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2358         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* WMV2 vertical sub-pel filter: same (-1, 9, 9, -1) kernel as the
 * horizontal variant, applied down one column for 8 output rows; the
 * loop over the w columns is not visible in this view.  Reads one row
 * above (src[-srcStride]) and two rows below the 8-row block. */
2364 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2365     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2369         const int src_1= src[ -srcStride];
2370         const int src0 = src[0 ];
2371         const int src1 = src[ srcStride];
2372         const int src2 = src[2*srcStride];
2373         const int src3 = src[3*srcStride];
2374         const int src4 = src[4*srcStride];
2375         const int src5 = src[5*srcStride];
2376         const int src6 = src[6*srcStride];
2377         const int src7 = src[7*srcStride];
2378         const int src8 = src[8*srcStride];
2379         const int src9 = src[9*srcStride];
2380         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2381         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2382         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2383         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2384         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2385         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2386         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2387         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
/* WMV2 mspel full-pel position: plain 8x8 copy. */
2393 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2394     put_pixels8_c(dst, src, stride, 8);
/* WMV2 mspel quarter-pel left: average of source and the horizontal
 * half-pel filtered block (the `half` buffer declaration is elided
 * from this view). */
2397 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2399     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2400     put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
/* WMV2 mspel horizontal half-pel: filter straight into dst. */
2403 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2404     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* WMV2 mspel quarter-pel right: average of src+1 and the horizontal
 * half-pel block (the `half` buffer declaration is elided from
 * this view). */
2407 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2409     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2410     put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
/* WMV2 mspel vertical half-pel: filter straight into dst. */
2413 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2414     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* WMV2 mspel position (1,2): average of the vertically filtered source
 * and the h-then-v filtered block.  halfH holds 11 filtered rows
 * (starting one row above src) so the vertical pass has context; the
 * halfH/halfV/halfHV buffer declarations are elided from this view. */
2417 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2421     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2422     wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2423     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2424     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* WMV2 mspel position (3,2): as mc12 but the vertical-only pass starts
 * at src+1 (right-shifted column); buffer declarations are elided from
 * this view. */
2426 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2430     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2431     wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2432     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2433     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* WMV2 mspel center (2,2): horizontal pass into halfH (11 rows, one
 * above src), then vertical pass straight into dst; the halfH buffer
 * declaration is elided from this view. */
2435 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2437     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2438     wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 Annex J deblocking filter across a horizontal block edge
 * (filters vertically).  For each column x it reads the four pixels
 * p0..p3 straddling the edge, derives a strength-limited correction d1
 * from the gradient d, writes back the filtered p1/p2 (the elided lines
 * apply +/-d1 before the clamp), and applies a secondary correction d2
 * to the outer pixels p0/p3.  `if(p&256) p= ~(p>>31)` is a branch-light
 * clamp: it maps any value that left 0..255 to 0 (negative) or 255
 * (overflow).  The per-column loop header is elided from this view. */
2441 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2443     const int strength= ff_h263_loop_filter_strength[qscale];
2447         int p0= src[x-2*stride];
2448         int p1= src[x-1*stride];
2449         int p2= src[x+0*stride];
2450         int p3= src[x+1*stride];
2451         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2453         if     (d<-2*strength) d1= 0;
2454         else if(d<-  strength) d1=-2*strength - d;
2455         else if(d<   strength) d1= d;
2456         else if(d< 2*strength) d1= 2*strength - d;
2461         if(p1&256) p1= ~(p1>>31);
2462         if(p2&256) p2= ~(p2>>31);
2464         src[x-1*stride] = p1;
2465         src[x+0*stride] = p2;
2469         d2= clip((p0-p3)/4, -ad1, ad1);
2471         src[x-2*stride] = p0 - d2;
2472         src[x+  stride] = p3 + d2;
/* H.263 Annex J deblocking filter across a vertical block edge
 * (filters horizontally).  Mirror image of h263_v_loop_filter_c: the
 * same strength-limited d1/d2 corrections are applied to the four
 * pixels left/right of the edge on each row y; the per-row loop header
 * is elided from this view. */
2476 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2478     const int strength= ff_h263_loop_filter_strength[qscale];
2482         int p0= src[y*stride-2];
2483         int p1= src[y*stride-1];
2484         int p2= src[y*stride+0];
2485         int p3= src[y*stride+1];
2486         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2488         if     (d<-2*strength) d1= 0;
2489         else if(d<-  strength) d1=-2*strength - d;
2490         else if(d<   strength) d1= d;
2491         else if(d< 2*strength) d1= 2*strength - d;
2496         if(p1&256) p1= ~(p1>>31);
2497         if(p2&256) p2= ~(p2>>31);
2499         src[y*stride-1] = p1;
2500         src[y*stride+0] = p2;
2504         d2= clip((p0-p3)/4, -ad1, ad1);
2506         src[y*stride-2] = p0 - d2;
2507         src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter on an 8x8 block: a separable (1,2,1)/4 smoothing
 * filter.  First pass builds `temp` (vertical 1-2-1, with the first and
 * last rows copied scaled by 4 so edges are unfiltered), second pass
 * applies the horizontal 1-2-1 and the final (+8)>>4 normalization back
 * into src.  Loop headers, the temp[] declaration and the yz index
 * computation are elided from this view. */
2511 static void h261_loop_filter_c(uint8_t *src, int stride){
2516         temp[x      ] = 4*src[x           ];
2517         temp[x + 7*8] = 4*src[x + 7*stride];
2521             xy = y * stride + x;
2523             temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2528         src[  y*stride] = (temp[  y*8] + 2)>>2;
2529         src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2531             xy = y * stride + x;
2533             src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* SAD of a 16-wide block: sums |pix1[i]-pix2[i]| over 16 columns per
 * row; the accumulator declaration, the h-row loop and the final
 * `return s` are elided from this view.  The unused void* first
 * argument matches the me_cmp_func signature. */
2538 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2544         s += abs(pix1[0] - pix2[0]);
2545         s += abs(pix1[1] - pix2[1]);
2546         s += abs(pix1[2] - pix2[2]);
2547         s += abs(pix1[3] - pix2[3]);
2548         s += abs(pix1[4] - pix2[4]);
2549         s += abs(pix1[5] - pix2[5]);
2550         s += abs(pix1[6] - pix2[6]);
2551         s += abs(pix1[7] - pix2[7]);
2552         s += abs(pix1[8] - pix2[8]);
2553         s += abs(pix1[9] - pix2[9]);
2554         s += abs(pix1[10] - pix2[10]);
2555         s += abs(pix1[11] - pix2[11]);
2556         s += abs(pix1[12] - pix2[12]);
2557         s += abs(pix1[13] - pix2[13]);
2558         s += abs(pix1[14] - pix2[14]);
2559         s += abs(pix1[15] - pix2[15]);
/* SAD of a 16-wide block against a horizontally half-pel interpolated
 * reference: pix2 is averaged with its right neighbour (avg2) before
 * the absolute difference.  Row loop and return are elided from
 * this view. */
2566 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2572         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2573         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2574         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2575         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2576         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2577         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2578         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2579         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2580         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2581         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2582         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2583         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2584         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2585         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2586         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2587         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of a 16-wide block against a vertically half-pel interpolated
 * reference: pix2 is averaged with the pixel one row below (pix3).
 * Row loop and return are elided from this view. */
2594 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2597     uint8_t *pix3 = pix2 + line_size;
2601         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2602         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2603         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2604         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2605         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2606         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2607         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2608         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2609         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2610         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2611         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2612         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2613         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2614         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2615         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2616         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of a 16-wide block against a diagonally half-pel interpolated
 * reference: avg4 combines the 2x2 neighbourhood of pix2/pix3.
 * Row loop and return are elided from this view. */
2624 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2627     uint8_t *pix3 = pix2 + line_size;
2631         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2632         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2633         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2634         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2635         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2636         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2637         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2638         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2639         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2640         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2641         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2642         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2643         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2644         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2645         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2646         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide variant of pix_abs16_c (plain SAD); row loop and return are
 * elided from this view. */
2654 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2660         s += abs(pix1[0] - pix2[0]);
2661         s += abs(pix1[1] - pix2[1]);
2662         s += abs(pix1[2] - pix2[2]);
2663         s += abs(pix1[3] - pix2[3]);
2664         s += abs(pix1[4] - pix2[4]);
2665         s += abs(pix1[5] - pix2[5]);
2666         s += abs(pix1[6] - pix2[6]);
2667         s += abs(pix1[7] - pix2[7]);
/* 8-wide variant of pix_abs16_x2_c (SAD vs horizontal half-pel);
 * row loop and return are elided from this view. */
2674 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2680         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2681         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2682         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2683         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2684         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2685         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2686         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2687         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide variant of pix_abs16_y2_c (SAD vs vertical half-pel);
 * row loop and return are elided from this view. */
2694 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2697     uint8_t *pix3 = pix2 + line_size;
2701         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2702         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2703         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2704         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2705         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2706         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2707         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2708         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide variant of pix_abs16_xy2_c (SAD vs diagonal half-pel);
 * row loop and return are elided from this view. */
2716 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2719     uint8_t *pix3 = pix2 + line_size;
2723         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2724         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2725         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2726         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2727         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2728         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2729         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2730         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-aware SSE metric, 16-wide: score1 is the plain sum of squared
 * pixel differences; score2 accumulates the difference between the two
 * blocks' local 2x2 gradient structure, weighted by
 * avctx->nsse_weight (8 when no context is given).  Loop scaffolding
 * is partially elided from this view. */
2738 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2744         for(x=0; x<16; x++){
2745             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2748         for(x=0; x<15; x++){
2749             score2+= ABS(  s1[x  ] - s1[x  +stride]
2750                          - s1[x+1] + s1[x+1+stride])
2751                     -ABS(  s2[x  ] - s2[x  +stride]
2752                          - s2[x+1] + s2[x+1+stride]);
2759     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2760     else  return score1 + ABS(score2)*8;
/* 8-wide variant of nsse16_c; the x-loop headers are elided from
 * this view. */
2763 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2770             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
2774             score2+= ABS(  s1[x  ] - s1[x  +stride]
2775                          - s1[x+1] + s1[x+1+stride])
2776                     -ABS(  s2[x  ] - s2[x  +stride]
2777                          - s2[x+1] + s2[x+1+stride]);
2784     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2785     else  return score1 + ABS(score2)*8;
/* Evaluates the weighted squared error of adding `scale` times a basis
 * function to the residual: b is the candidate residual after the
 * rounded fixed-point basis update (BASIS_SHIFT -> RECON_SHIFT), w the
 * per-coefficient weight; asserts the candidate stays in (-512, 512).
 * The accumulator/return and the w assignment are elided from
 * this view. */
2788 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2792     for(i=0; i<8*8; i++){
2793         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2796         assert(-512<b && b<512);
2798         sum += (w*b)*(w*b)>>4;
/* Adds `scale` times a basis function to the residual in place, with
 * the same rounded fixed-point shift as try_8x8basis_c. */
2803 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2806     for(i=0; i<8*8; i++){
2807         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
 * permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permutated to scantable order!
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
    /* first pass: walk the nonzero coefficients in scantable order;
     * presumably copies them into a temp buffer and clears block[j] —
     * interior lines are elided from this view */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
    /* second pass: scatter the saved coefficients to their permuted
     * positions */
    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
/* Trivial me_cmp_func that ignores its inputs (body elided from this
 * view); used as the "no comparison" placeholder in ff_set_cmp. */
2840 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fills the 5-entry me_cmp_func table `cmp` with the DSPContext
 * implementations selected by `type` (hadamard / dct_sad / quant_psnr
 * cases visible here; the switch scaffolding and other cases are
 * elided from this view); logs an error for unknown types. */
2844 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2847     memset(cmp, 0, sizeof(void*)*5);
2855             cmp[i]= c->hadamard8_diff[i];
2861             cmp[i]= c->dct_sad[i];
2864             cmp[i]= c->quant_psnr[i];
2891             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2897  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* Zeroes the six 64-coefficient DCT blocks of one macroblock. */
2899 static void clear_blocks_c(DCTELEM *blocks)
2901     memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for w bytes: unrolled by 8, with a scalar tail loop
 * (the tail-loop header between the two parts is elided from
 * this view). */
2904 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
2906     for(i=0; i+7<w; i+=8){
2907         dst[i+0] += src[i+0];
2908         dst[i+1] += src[i+1];
2909         dst[i+2] += src[i+2];
2910         dst[i+3] += src[i+3];
2911         dst[i+4] += src[i+4];
2912         dst[i+5] += src[i+5];
2913         dst[i+6] += src[i+6];
2914         dst[i+7] += src[i+7];
2917         dst[i+0] += src[i+0];
/* dst[i] = src1[i] - src2[i] for w bytes: unrolled by 8 with a scalar
 * tail (tail-loop header elided from this view).  Used for byte-wise
 * prediction residuals. */
2920 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
2922     for(i=0; i+7<w; i+=8){
2923         dst[i+0] = src1[i+0]-src2[i+0];
2924         dst[i+1] = src1[i+1]-src2[i+1];
2925         dst[i+2] = src1[i+2]-src2[i+2];
2926         dst[i+3] = src1[i+3]-src2[i+3];
2927         dst[i+4] = src1[i+4]-src2[i+4];
2928         dst[i+5] = src1[i+5]-src2[i+5];
2929         dst[i+6] = src1[i+6]-src2[i+6];
2930         dst[i+7] = src1[i+7]-src2[i+7];
2933         dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction subtraction: per byte, the predictor is
 * the median of left (l), above (src1[i]) and left+above-aboveleft
 * (masked to 8 bits); most of the loop body is elided from this view.
 * left/left_top carry the running left and above-left values across
 * calls. */
2936 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
2944         const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Butterfly helpers for the 8x8 Hadamard transform below.
 * BUTTERFLY2 writes sum/difference of i1,i2 into o1,o2; BUTTERFLY1
 * does the same in place on x,y (bodies elided from this view);
 * BUTTERFLYA returns |x+y| + |x-y| for the final accumulation. */
2954 #define BUTTERFLY2(o1,o2,i1,i2) \
2958 #define BUTTERFLY1(x,y) \
2967 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the src-dst difference, summing
 * absolute transform coefficients.  First the per-row (horizontal)
 * butterflies over the differences, then the per-column (vertical)
 * butterflies, with BUTTERFLYA folding the last stage into the sum.
 * Loop headers, temp[] declaration and the return are elided from
 * this view; the printf("MAX:...") fragment is debug code. */
2969 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2977         //FIXME try pointer walks
2978         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2979         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2980         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2981         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2983         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2984         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2985         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2986         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2988         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2989         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2990         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2991         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2995         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2996         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2997         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2998         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3000         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3001         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3002         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3003         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3006             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3007             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3008             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3009             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3015     printf("MAX:%d\n", maxi);
/* Intra SATD: like hadamard8_diff8x8_c but transforms src directly
 * (no reference block; `dummy` is unused) and subtracts the absolute
 * DC term |temp[0]+temp[32]| at the end so the score ignores the
 * block mean.  Loop headers, temp[] declaration and the return are
 * elided from this view. */
3021 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3029         //FIXME try pointer walks
3030         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3031         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3032         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3033         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3035         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3036         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3037         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3038         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3040         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3041         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3042         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3043         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3047         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3048         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3049         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3050         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3052         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3053         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3054         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3055         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3058             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3059             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3060             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3061             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3064     sum -= ABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-SAD metric: forward-transforms the src1-src2 difference into an
 * aligned DCTELEM buffer (the fdct call and the coefficient-sum loop
 * are elided from this view). */
3069 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3070     MpegEncContext * const s= (MpegEncContext *)c;
3071     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3072     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3077     s->dsp.diff_pixels(temp, src1, src2, stride);
3086 void simple_idct(DCTELEM *block); //FIXME
/* Quantization-PSNR metric: measures the squared error introduced by
 * quantize -> dequantize -> idct on the difference block, by comparing
 * the reconstructed coefficients against the saved pre-quantization
 * copy `bak`.  The forward DCT call and the summation loop header are
 * elided from this view. */
3088 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3089     MpegEncContext * const s= (MpegEncContext *)c;
3090     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3091     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3092     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3098     s->dsp.diff_pixels(temp, src1, src2, stride);
3100     memcpy(bak, temp, 64*sizeof(DCTELEM));
3102     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3103     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3104     simple_idct(temp); //FIXME
3107         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for an 8x8 block: estimates the coding cost
 * (bits, via the codec's AC VLC length tables and esc_length for
 * escape-coded levels) of the quantized difference, reconstructs the
 * block (dequantize + idct_add onto a saved copy of src2), measures
 * the distortion against src1 with sse[1], and combines both as
 * distortion + lambda*bits with lambda ~ qscale^2*109/128.
 * Several loop headers, the run/level bookkeeping and the intra/inter
 * branch scaffolding are elided from this view.  ("distoration" is a
 * historical typo for distortion kept for byte-identity.) */
3112 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3113     MpegEncContext * const s= (MpegEncContext *)c;
3114     const uint8_t *scantable= s->intra_scantable.permutated;
3115     uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3116     uint64_t __align8 aligned_bak[stride];
3117     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3118     uint8_t * const bak= (uint8_t*)aligned_bak;
3119     int i, last, run, bits, level, distoration, start_i;
3120     const int esc_length= s->ac_esc_length;
3122     uint8_t * last_length;
3127         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3128         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3131     s->dsp.diff_pixels(temp, src1, src2, stride);
3133     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3139         length     = s->intra_ac_vlc_length;
3140         last_length= s->intra_ac_vlc_last_length;
3141         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3144         length     = s->inter_ac_vlc_length;
3145         last_length= s->inter_ac_vlc_last_length;
3150         for(i=start_i; i<last; i++){
3151             int j= scantable[i];
3156                 if((level&(~127)) == 0){
3157                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3166         level= temp[i] + 64;
3170         if((level&(~127)) == 0){
3171             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3179         s->dct_unquantize_intra(s, temp, 0, s->qscale);
3181         s->dct_unquantize_inter(s, temp, 0, s->qscale);
3184     s->dsp.idct_add(bak, stride, temp);
3186     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3188     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/*
 * Bit-cost metric for one 8x8 block: quantize the DCT of the difference
 * and return the number of VLC bits needed to code the coefficients
 * (rate only — no distortion term, unlike rd8x8_c).
 * NOTE(review): bits/run initialization, the intra/inter branch, escape
 * handling and the final return are elided from this view — TODO confirm.
 */
3191 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3192 MpegEncContext * const s= (MpegEncContext *)c;
3193 const uint8_t *scantable= s->intra_scantable.permutated;
3194 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3195 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3196 int i, last, run, bits, level, start_i;
/* bit cost charged for an escape-coded coefficient */
3197 const int esc_length= s->ac_esc_length;
3199 uint8_t * last_length;
3203 s->dsp.diff_pixels(temp, src1, src2, stride);
3205 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: intra AC tables plus luma DC cost (DC biased by +256) */
3211 length = s->intra_ac_vlc_length;
3212 last_length= s->intra_ac_vlc_last_length;
3213 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter path: inter AC tables only */
3216 length = s->inter_ac_vlc_length;
3217 last_length= s->inter_ac_vlc_last_length;
/* cost every coefficient before the last in scan order */
3222 for(i=start_i; i<last; i++){
3223 int j= scantable[i];
/* +64-biased level in [0,127] is table-codable; otherwise escape (elided) */
3228 if((level&(~127)) == 0){
3229 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* final coefficient uses the "last" VLC table */
3238 level= temp[i] + 64;
3242 if((level&(~127)) == 0){
3243 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/*
 * Vertical SAD of a 16-pixel-wide block against itself: sum of absolute
 * differences between each pixel and the pixel directly below it.
 * Used as an interlace/activity measure ("intra": only one plane, the
 * second operand is a dummy).
 * NOTE(review): the outer row loop, score accumulator declaration,
 * s += stride advance and return are elided from this view — TODO confirm.
 */
3251 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
/* unrolled 4-wide across the 16 columns */
3256 for(x=0; x<16; x+=4){
3257 score+= ABS(s[x ] - s[x +stride]) + ABS(s[x+1] - s[x+1+stride])
3258 +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
/*
 * Vertical SAD of the difference signal between two 16-wide blocks:
 * |(s1 - s2) - (s1 - s2) one row below| summed over the block, i.e. how
 * much the prediction error changes from row to row.
 * NOTE(review): outer row loop, accumulator, pointer advances and return
 * are elided from this view — TODO confirm.
 */
3266 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3271 for(x=0; x<16; x++){
3272 score+= ABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Square helper for the vsse metrics below.
   NOTE(review): macro evaluates its argument twice — safe here because all
   call sites pass side-effect-free expressions, but do not pass x++. */
3281 #define SQ(a) ((a)*(a))
/*
 * Vertical SSE of a 16-wide block against itself: squared difference
 * between each pixel and the pixel directly below (squared-error variant
 * of vsad_intra16_c).
 * NOTE(review): outer row loop, accumulator, s += stride and return are
 * elided from this view — TODO confirm.
 */
3282 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
/* unrolled 4-wide across the 16 columns */
3287 for(x=0; x<16; x+=4){
3288 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3289 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/*
 * Vertical SSE of the difference signal between two 16-wide blocks
 * (squared-error variant of vsad16_c).
 * NOTE(review): outer row loop, accumulator, pointer advances and return
 * are elided from this view — TODO confirm.
 */
3297 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3302 for(x=0; x<16; x++){
3303 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Instantiate 16x16 comparison functions from the 8x8 ones: each wrapper
   ("WARPER" is an upstream typo for WRAPPER) sums the 8x8 metric over the
   four 8x8 quadrants of a 16x16 area. The macro itself is defined earlier
   in the file (not visible in this chunk). */
3312 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3313 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3314 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3315 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3316 WARPER8_16_SQ(rd8x8_c, rd16_c)
3317 WARPER8_16_SQ(bit8x8_c, bit16_c)
3319 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* Glue for the jpeg-reference integer IDCT: transform the block and store
   the clamped result into dest.
   NOTE(review): the IDCT call itself (presumably j_rev_dct(block)) is
   elided from this view — TODO confirm. */
3321 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3324 put_pixels_clamped_c(block, dest, line_size);
/* Glue for the jpeg-reference integer IDCT: transform the block and add
   the clamped result onto dest (residual reconstruction).
   NOTE(review): the IDCT call itself is elided from this view — TODO
   confirm. */
3326 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3329 add_pixels_clamped_c(block, dest, line_size);
/* 4x4 (lowres) variant of ff_jref_idct_put: transform and store the
   clamped 4x4 result.
   NOTE(review): the IDCT call itself (presumably j_rev_dct4(block)) is
   elided from this view — TODO confirm. */
3332 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3335 put_pixels_clamped4_c(block, dest, line_size);
/* 4x4 (lowres) variant of ff_jref_idct_add: transform and add the clamped
   4x4 result onto dest.
   NOTE(review): the IDCT call itself is elided from this view — TODO
   confirm. */
3337 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3340 add_pixels_clamped4_c(block, dest, line_size);
3343 /* init static data */
/*
 * One-time initialization of the file-scope lookup tables declared at the
 * top of the file (cropTbl, squareTbl, inv_zigzag_direct16).
 * NOTE(review): braces/loop bodies are partly elided here; in particular
 * the store that saturates the low side of cropTbl to 0 is not visible —
 * TODO confirm.
 */
3344 void dsputil_static_init(void)
/* cropTbl: clamp-to-[0,255] table with MAX_NEG_CROP slack on both sides;
   middle section is the identity mapping */
3348 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3349 for(i=0;i<MAX_NEG_CROP;i++) {
/* values above 255 saturate to 255 (the below-0 -> 0 store is elided) */
3351 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* squareTbl[i] = (i-256)^2, i.e. square of a signed difference in
   [-256,255] looked up with a +256 bias */
3354 for(i=0;i<512;i++) {
3355 squareTbl[i] = (i - 256) * (i - 256);
/* inverse zigzag for the MMX quantizer; +1 so that 0 can mean "absent" */
3358 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3362 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3366 #ifdef CONFIG_ENCODERS
3367 if(avctx->dct_algo==FF_DCT_FASTINT) {
3368 c->fdct = fdct_ifast;
3369 c->fdct248 = fdct_ifast248;
3371 else if(avctx->dct_algo==FF_DCT_FAAN) {
3372 c->fdct = ff_faandct;
3373 c->fdct248 = ff_faandct248;
3376 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3377 c->fdct248 = ff_fdct248_islow;
3379 #endif //CONFIG_ENCODERS
3381 if(avctx->lowres==1){
3382 c->idct_put= ff_jref_idct4_put;
3383 c->idct_add= ff_jref_idct4_add;
3384 c->idct = j_rev_dct4;
3385 c->idct_permutation_type= FF_NO_IDCT_PERM;
3387 if(avctx->idct_algo==FF_IDCT_INT){
3388 c->idct_put= ff_jref_idct_put;
3389 c->idct_add= ff_jref_idct_add;
3390 c->idct = j_rev_dct;
3391 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3392 }else{ //accurate/default
3393 c->idct_put= simple_idct_put;
3394 c->idct_add= simple_idct_add;
3395 c->idct = simple_idct;
3396 c->idct_permutation_type= FF_NO_IDCT_PERM;
3400 /* VP3 DSP support */
3401 c->vp3_dsp_init = vp3_dsp_init_c;
3402 c->vp3_idct = vp3_idct_c;
3404 c->get_pixels = get_pixels_c;
3405 c->diff_pixels = diff_pixels_c;
3406 c->put_pixels_clamped = put_pixels_clamped_c;
3407 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3408 c->add_pixels_clamped = add_pixels_clamped_c;
3411 c->clear_blocks = clear_blocks_c;
3412 c->pix_sum = pix_sum_c;
3413 c->pix_norm1 = pix_norm1_c;
3415 /* TODO [0] 16 [1] 8 */
3416 c->pix_abs[0][0] = pix_abs16_c;
3417 c->pix_abs[0][1] = pix_abs16_x2_c;
3418 c->pix_abs[0][2] = pix_abs16_y2_c;
3419 c->pix_abs[0][3] = pix_abs16_xy2_c;
3420 c->pix_abs[1][0] = pix_abs8_c;
3421 c->pix_abs[1][1] = pix_abs8_x2_c;
3422 c->pix_abs[1][2] = pix_abs8_y2_c;
3423 c->pix_abs[1][3] = pix_abs8_xy2_c;
3425 #define dspfunc(PFX, IDX, NUM) \
3426 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3427 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3428 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3429 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3431 dspfunc(put, 0, 16);
3432 dspfunc(put_no_rnd, 0, 16);
3434 dspfunc(put_no_rnd, 1, 8);
3438 dspfunc(avg, 0, 16);
3439 dspfunc(avg_no_rnd, 0, 16);
3441 dspfunc(avg_no_rnd, 1, 8);
3446 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3447 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3449 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3450 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3451 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3452 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3453 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3454 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3455 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3456 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3457 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3459 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3460 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3461 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3462 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3463 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3464 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3465 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3466 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3467 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3469 #define dspfunc(PFX, IDX, NUM) \
3470 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3471 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3472 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3473 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3474 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3475 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3476 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3477 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3478 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3479 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3480 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3481 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3482 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3483 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3484 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3485 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3487 dspfunc(put_qpel, 0, 16);
3488 dspfunc(put_no_rnd_qpel, 0, 16);
3490 dspfunc(avg_qpel, 0, 16);
3491 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3493 dspfunc(put_qpel, 1, 8);
3494 dspfunc(put_no_rnd_qpel, 1, 8);
3496 dspfunc(avg_qpel, 1, 8);
3497 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3499 dspfunc(put_h264_qpel, 0, 16);
3500 dspfunc(put_h264_qpel, 1, 8);
3501 dspfunc(put_h264_qpel, 2, 4);
3502 dspfunc(avg_h264_qpel, 0, 16);
3503 dspfunc(avg_h264_qpel, 1, 8);
3504 dspfunc(avg_h264_qpel, 2, 4);
3507 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3508 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3509 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3510 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3511 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3512 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3514 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3515 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3516 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3517 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3518 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3519 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3520 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3521 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3523 #define SET_CMP_FUNC(name) \
3524 c->name[0]= name ## 16_c;\
3525 c->name[1]= name ## 8x8_c;
3527 SET_CMP_FUNC(hadamard8_diff)
3528 c->hadamard8_diff[4]= hadamard8_intra16_c;
3529 SET_CMP_FUNC(dct_sad)
3530 c->sad[0]= pix_abs16_c;
3531 c->sad[1]= pix_abs8_c;
3535 SET_CMP_FUNC(quant_psnr)
3538 c->vsad[0]= vsad16_c;
3539 c->vsad[4]= vsad_intra16_c;
3540 c->vsse[0]= vsse16_c;
3541 c->vsse[4]= vsse_intra16_c;
3542 c->nsse[0]= nsse16_c;
3543 c->nsse[1]= nsse8_c;
3544 c->w53[0]= w53_16_c;
3546 c->w97[0]= w97_16_c;
3549 c->add_bytes= add_bytes_c;
3550 c->diff_bytes= diff_bytes_c;
3551 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3552 c->bswap_buf= bswap_buf;
3554 c->h263_h_loop_filter= h263_h_loop_filter_c;
3555 c->h263_v_loop_filter= h263_v_loop_filter_c;
3557 c->h261_loop_filter= h261_loop_filter_c;
3559 c->try_8x8basis= try_8x8basis_c;
3560 c->add_8x8basis= add_8x8basis_c;
3563 dsputil_init_mmx(c, avctx);
3566 dsputil_init_armv4l(c, avctx);
3569 dsputil_init_mlib(c, avctx);
3572 dsputil_init_vis(c,avctx);
3575 dsputil_init_alpha(c, avctx);
3578 dsputil_init_ppc(c, avctx);
3581 dsputil_init_mmi(c, avctx);
3584 dsputil_init_sh4(c,avctx);
3587 switch(c->idct_permutation_type){
3588 case FF_NO_IDCT_PERM:
3590 c->idct_permutation[i]= i;
3592 case FF_LIBMPEG2_IDCT_PERM:
3594 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3596 case FF_SIMPLE_IDCT_PERM:
3598 c->idct_permutation[i]= simple_mmx_permutation[i];
3600 case FF_TRANSPOSE_IDCT_PERM:
3602 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3605 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");