3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
/* Forward declaration: in-place spatial discrete wavelet transform, defined
 * elsewhere; used by the wavelet comparison metric w_c() below. */
35 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
/* cropTbl: clamp-to-[0,255] lookup table with MAX_NEG_CROP guard entries on
 * each side so out-of-range indices stay in bounds; squareTbl: x*x lookup for
 * x in [-256,255]. Both are zero-initialized here and presumably filled by an
 * init routine not visible in this chunk — TODO confirm. */
37 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
38 uint32_t squareTbl[512] = {0, };
/* Standard (progressive) 8x8 zigzag scan order: maps scan position to raster
 * index within the block. */
40 const uint8_t ff_zigzag_direct[64] = {
41 0, 1, 8, 16, 9, 2, 3, 10,
42 17, 24, 32, 25, 18, 11, 4, 5,
43 12, 19, 26, 33, 40, 48, 41, 34,
44 27, 20, 13, 6, 7, 14, 21, 28,
45 35, 42, 49, 56, 57, 50, 43, 36,
46 29, 22, 15, 23, 30, 37, 44, 51,
47 58, 59, 52, 45, 38, 31, 39, 46,
48 53, 60, 61, 54, 47, 55, 62, 63
51 /* Specific zigzag scan for 248 idct. NOTE that unlike the
52 specification, we interleave the fields */
53 const uint8_t ff_zigzag248_direct[64] = {
54 0, 8, 1, 9, 16, 24, 2, 10,
55 17, 25, 32, 40, 48, 56, 33, 41,
56 18, 26, 3, 11, 4, 12, 19, 27,
57 34, 42, 49, 57, 50, 58, 35, 43,
58 20, 28, 5, 13, 6, 14, 21, 29,
59 36, 44, 51, 59, 52, 60, 37, 45,
60 22, 30, 7, 15, 23, 31, 38, 46,
61 53, 61, 54, 62, 39, 47, 55, 63,
64 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Zero-initialized here; filled at runtime (init code not visible in this view). */
65 uint16_t __align8 inv_zigzag_direct16[64] = {0, };
/* Alternate horizontal scan order (used for predominantly horizontal detail). */
67 const uint8_t ff_alternate_horizontal_scan[64] = {
68 0, 1, 2, 3, 8, 9, 16, 17,
69 10, 11, 4, 5, 6, 7, 15, 14,
70 13, 12, 19, 18, 24, 25, 32, 33,
71 26, 27, 20, 21, 22, 23, 28, 29,
72 30, 31, 34, 35, 40, 41, 48, 49,
73 42, 43, 36, 37, 38, 39, 44, 45,
74 46, 47, 50, 51, 56, 57, 58, 59,
75 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order (used for predominantly vertical detail,
 * e.g. interlaced content). */
78 const uint8_t ff_alternate_vertical_scan[64] = {
79 0, 8, 16, 24, 1, 9, 2, 10,
80 17, 25, 32, 40, 48, 56, 57, 49,
81 41, 33, 26, 18, 3, 11, 4, 12,
82 19, 27, 34, 42, 50, 58, 35, 43,
83 51, 59, 20, 28, 5, 13, 6, 14,
84 21, 29, 36, 44, 52, 60, 37, 45,
85 53, 61, 22, 30, 7, 15, 23, 31,
86 38, 46, 54, 62, 39, 47, 55, 63,
89 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Fixed-point reciprocal table: inverse[b] ~= 2^32/b, used to replace integer
 * division by a multiply + shift. Entries 0 and 1 are sentinels
 * (0 and 0xFFFFFFFF) — the documented range starts at b==2. */
90 const uint32_t inverse[256]={
91 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
92 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
93 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
94 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
95 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
96 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
97 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
98 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
99 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
100 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
101 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
102 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
103 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
104 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
105 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
106 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
107 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
108 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
109 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
110 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
111 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
112 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
113 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
114 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
115 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
116 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
117 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
118 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
119 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
120 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
121 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
122 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
125 /* Input permutation for the simple_idct_mmx */
/* Each entry maps a coefficient position to the layout expected by the MMX
 * IDCT; values are given in hex (row in high nibble, column in low nibble). */
126 static const uint8_t simple_mmx_permutation[64]={
127 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
128 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
129 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
130 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
131 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
132 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
133 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
134 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Sum of all pixel values of a 16x16 block (inner accumulation elided in this
 * view). pix walks 8 pixels at a time per row; the stride correction below
 * steps to the next row after the 16 pixels of the current one. */
137 static int pix_sum_c(uint8_t * pix, int line_size)
142 for (i = 0; i < 16; i++) {
143 for (j = 0; j < 16; j += 8) {
154 pix += line_size - 16;
/* Sum of squared pixel values of a 16x16 block, using the squareTbl lookup.
 * sq points to the middle of squareTbl so signed indices work.
 * NOTE(review): the word-at-a-time loads via casted pointers assume suitable
 * alignment and tolerate strict-aliasing (uint8_t* -> uint32/64_t*) —
 * legacy-compiler idiom, would be UB-prone under modern strict aliasing. */
159 static int pix_norm1_c(uint8_t * pix, int line_size)
162 uint32_t *sq = squareTbl + 256;
165 for (i = 0; i < 16; i++) {
166 for (j = 0; j < 16; j += 8) {
/* 64-bit path: read 8 pixels in one load and square each byte lane. */
177 #if LONG_MAX > 2147483647
178 register uint64_t x=*(uint64_t*)pix;
180 s += sq[(x>>8)&0xff];
181 s += sq[(x>>16)&0xff];
182 s += sq[(x>>24)&0xff];
183 s += sq[(x>>32)&0xff];
184 s += sq[(x>>40)&0xff];
185 s += sq[(x>>48)&0xff];
186 s += sq[(x>>56)&0xff];
/* 32-bit path: two 4-byte loads cover the same 8 pixels. */
188 register uint32_t x=*(uint32_t*)pix;
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 x=*(uint32_t*)(pix+4);
195 s += sq[(x>>8)&0xff];
196 s += sq[(x>>16)&0xff];
197 s += sq[(x>>24)&0xff];
202 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst. The main loop is unrolled by 8;
 * the final line below presumably belongs to a tail loop handling the
 * remaining w%8 words (loop header elided in this view). */
207 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
210 for(i=0; i+8<=w; i+=8){
211 dst[i+0]= bswap_32(src[i+0]);
212 dst[i+1]= bswap_32(src[i+1]);
213 dst[i+2]= bswap_32(src[i+2]);
214 dst[i+3]= bswap_32(src[i+3]);
215 dst[i+4]= bswap_32(src[i+4]);
216 dst[i+5]= bswap_32(src[i+5]);
217 dst[i+6]= bswap_32(src[i+6]);
218 dst[i+7]= bswap_32(src[i+7]);
221 dst[i+0]= bswap_32(src[i+0]);
/* Sum of squared errors between two 4-pixel-wide blocks over h rows.
 * sq points to the middle of squareTbl so the (possibly negative) pixel
 * difference indexes it directly. The unused void* matches the common
 * comparison-function signature. */
225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
228 uint32_t *sq = squareTbl + 256;
231 for (i = 0; i < h; i++) {
232 s += sq[pix1[0] - pix2[0]];
233 s += sq[pix1[1] - pix2[1]];
234 s += sq[pix1[2] - pix2[2]];
235 s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors between two 8-pixel-wide blocks over h rows
 * (same scheme as sse4_c, unrolled for 8 pixels per row). */
242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
245 uint32_t *sq = squareTbl + 256;
248 for (i = 0; i < h; i++) {
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
253 s += sq[pix1[4] - pix2[4]];
254 s += sq[pix1[5] - pix2[5]];
255 s += sq[pix1[6] - pix2[6]];
256 s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors between two 16-pixel-wide blocks over h rows
 * (same scheme as sse4_c/sse8_c, unrolled for 16 pixels per row). */
263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
266 uint32_t *sq = squareTbl + 256;
269 for (i = 0; i < h; i++) {
270 s += sq[pix1[ 0] - pix2[ 0]];
271 s += sq[pix1[ 1] - pix2[ 1]];
272 s += sq[pix1[ 2] - pix2[ 2]];
273 s += sq[pix1[ 3] - pix2[ 3]];
274 s += sq[pix1[ 4] - pix2[ 4]];
275 s += sq[pix1[ 5] - pix2[ 5]];
276 s += sq[pix1[ 6] - pix2[ 6]];
277 s += sq[pix1[ 7] - pix2[ 7]];
278 s += sq[pix1[ 8] - pix2[ 8]];
279 s += sq[pix1[ 9] - pix2[ 9]];
280 s += sq[pix1[10] - pix2[10]];
281 s += sq[pix1[11] - pix2[11]];
282 s += sq[pix1[12] - pix2[12]];
283 s += sq[pix1[13] - pix2[13]];
284 s += sq[pix1[14] - pix2[14]];
285 s += sq[pix1[15] - pix2[15]];
/* Wavelet-domain block comparison metric: takes the pixel difference of a
 * w x h block (scaled by 16), runs a spatial DWT on it, weights each subband
 * by the scale[] table, and sums absolute values. type selects the wavelet
 * (callers below pass 1 for 5/3, 0 for 9/7 — inferred from wrapper names;
 * confirm against ff_spatial_dwt). */
294 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
296 const int dec_count= w==8 ? 3 : 4;
/* Per-subband weights, indexed [type][dec_count-3][level][orientation]. */
300 static const int scale[2][2][4][4]={
304 {268, 239, 239, 213},
309 {344, 310, 310, 280},
317 {275, 245, 245, 218},
322 {352, 317, 317, 286},
/* Build the scaled difference block into tmp (stride 16). */
331 for (i = 0; i < h; i++) {
332 for (j = 0; j < w; j+=4) {
333 tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
334 tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
335 tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
336 tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
341 ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
/* Weight each decomposition level/orientation; ori&1 selects the horizontal
 * high band, ori&2 the vertical one (sx/sy locate the subband in tmp). */
345 for(level=0; level<dec_count; level++){
346 for(ori= level ? 1 : 0; ori<4; ori++){
347 int sx= (ori&1) ? 1<<level: 0;
348 int stride= 16<<(dec_count-level);
349 int sy= (ori&2) ? stride>>1 : 0;
352 for(i=0; i<size; i++){
353 for(j=0; j<size; j++){
354 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* Final accumulation of absolute weighted coefficients. */
361 for (i = 0; i < h; i++) {
362 for (j = 0; j < w; j+=4) {
363 s+= ABS(tmp[16*i+j+0]);
364 s+= ABS(tmp[16*i+j+1]);
365 s+= ABS(tmp[16*i+j+2]);
366 s+= ABS(tmp[16*i+j+3]);
/* 8-wide wavelet metric, type 1 (presumably the 5/3 wavelet — see w_c). */
374 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
375 return w_c(v, pix1, pix2, line_size, 8, h, 1);
/* 8-wide wavelet metric, type 0 (presumably the 9/7 wavelet — see w_c). */
378 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
379 return w_c(v, pix1, pix2, line_size, 8, h, 0);
/* 16-wide wavelet metric, type 1 (presumably the 5/3 wavelet — see w_c). */
382 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
383 return w_c(v, pix1, pix2, line_size, 16, h, 1);
/* 16-wide wavelet metric, type 0 (presumably the 9/7 wavelet — see w_c). */
386 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
387 return w_c(v, pix1, pix2, line_size, 16, h, 0);
/* Copy an 8x8 block of pixels into a DCT coefficient block (widening
 * uint8_t -> DCTELEM); per-row pointer advance elided in this view. */
390 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
394 /* read the pixels */
396 block[0] = pixels[0];
397 block[1] = pixels[1];
398 block[2] = pixels[2];
399 block[3] = pixels[3];
400 block[4] = pixels[4];
401 block[5] = pixels[5];
402 block[6] = pixels[6];
403 block[7] = pixels[7];
/* Store the per-pixel difference s1 - s2 of two 8x8 blocks into a DCT
 * coefficient block; per-row pointer advance elided in this view. */
409 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
410 const uint8_t *s2, int stride){
413 /* read the pixels */
415 block[0] = s1[0] - s2[0];
416 block[1] = s1[1] - s2[1];
417 block[2] = s1[2] - s2[2];
418 block[3] = s1[3] - s2[3];
419 block[4] = s1[4] - s2[4];
420 block[5] = s1[5] - s2[5];
421 block[6] = s1[6] - s2[6];
422 block[7] = s1[7] - s2[7];
/* Write an 8-wide block of DCT coefficients to pixels, clamping each value to
 * [0,255] via the cropTbl lookup (cm is offset so negative indices work). */
430 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
434 uint8_t *cm = cropTbl + MAX_NEG_CROP;
436 /* read the pixels */
438 pixels[0] = cm[block[0]];
439 pixels[1] = cm[block[1]];
440 pixels[2] = cm[block[2]];
441 pixels[3] = cm[block[3]];
442 pixels[4] = cm[block[4]];
443 pixels[5] = cm[block[5]];
444 pixels[6] = cm[block[6]];
445 pixels[7] = cm[block[7]];
/* 4-wide variant of put_pixels_clamped_c: clamp and store 4 coefficients
 * per row. */
452 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
456 uint8_t *cm = cropTbl + MAX_NEG_CROP;
458 /* read the pixels */
460 pixels[0] = cm[block[0]];
461 pixels[1] = cm[block[1]];
462 pixels[2] = cm[block[2]];
463 pixels[3] = cm[block[3]];
/* 2-wide variant of put_pixels_clamped_c: clamp and store 2 coefficients
 * per row. */
470 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
474 uint8_t *cm = cropTbl + MAX_NEG_CROP;
476 /* read the pixels */
478 pixels[0] = cm[block[0]];
479 pixels[1] = cm[block[1]];
/* Write an 8x8 block of signed DCT coefficients as unsigned pixels: values
 * are offset by +128 and clamped (branches for <-128 / >127 partly elided
 * in this view). */
486 static void put_signed_pixels_clamped_c(const DCTELEM *block,
487 uint8_t *restrict pixels,
492 for (i = 0; i < 8; i++) {
493 for (j = 0; j < 8; j++) {
496 else if (*block > 127)
499 *pixels = (uint8_t)(*block + 128);
503 pixels += (line_size - 8);
/* Add an 8-wide block of DCT coefficients to existing pixels, clamping the
 * result to [0,255] via cropTbl (IDCT reconstruction add). */
507 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
511 uint8_t *cm = cropTbl + MAX_NEG_CROP;
513 /* read the pixels */
515 pixels[0] = cm[pixels[0] + block[0]];
516 pixels[1] = cm[pixels[1] + block[1]];
517 pixels[2] = cm[pixels[2] + block[2]];
518 pixels[3] = cm[pixels[3] + block[3]];
519 pixels[4] = cm[pixels[4] + block[4]];
520 pixels[5] = cm[pixels[5] + block[5]];
521 pixels[6] = cm[pixels[6] + block[6]];
522 pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of add_pixels_clamped_c. */
528 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
532 uint8_t *cm = cropTbl + MAX_NEG_CROP;
534 /* read the pixels */
536 pixels[0] = cm[pixels[0] + block[0]];
537 pixels[1] = cm[pixels[1] + block[1]];
538 pixels[2] = cm[pixels[2] + block[2]];
539 pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of add_pixels_clamped_c. */
545 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
549 uint8_t *cm = cropTbl + MAX_NEG_CROP;
551 /* read the pixels */
553 pixels[0] = cm[pixels[0] + block[0]];
554 pixels[1] = cm[pixels[1] + block[1]];
/* 64-bit PIXOP2 variant: generates the put/avg pixel-copy and half-pel
 * interpolation primitives (x2/y2/xy2, rounding and no-rounding forms) using
 * one 8-byte load per 8 pixels. The bit tricks:
 *   (a&b) + (((a^b)&0xFE..FE)>>1)  = per-byte average rounding down,
 *   (a|b) - (((a^b)&0xFE..FE)>>1)  = per-byte average rounding up,
 * and the xy2 forms split each byte into low 2 bits (l0/l1) and high 6 bits
 * (h0/h1) so four-sample averages don't overflow byte lanes.
 * OP is either an assignment (put) or an averaging store (avg). */
561 #define PIXOP2(OPNAME, OP) \
562 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
566 OP(*((uint64_t*)block), LD64(pixels));\
572 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
576 const uint64_t a= LD64(pixels );\
577 const uint64_t b= LD64(pixels+1);\
578 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
584 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
588 const uint64_t a= LD64(pixels );\
589 const uint64_t b= LD64(pixels+1);\
590 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
596 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
600 const uint64_t a= LD64(pixels );\
601 const uint64_t b= LD64(pixels+line_size);\
602 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
608 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
612 const uint64_t a= LD64(pixels );\
613 const uint64_t b= LD64(pixels+line_size);\
614 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
620 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
623 const uint64_t a= LD64(pixels );\
624 const uint64_t b= LD64(pixels+1);\
625 uint64_t l0= (a&0x0303030303030303ULL)\
626 + (b&0x0303030303030303ULL)\
627 + 0x0202020202020202ULL;\
628 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
629 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
633 for(i=0; i<h; i+=2){\
634 uint64_t a= LD64(pixels );\
635 uint64_t b= LD64(pixels+1);\
636 l1= (a&0x0303030303030303ULL)\
637 + (b&0x0303030303030303ULL);\
638 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
639 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
640 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
645 l0= (a&0x0303030303030303ULL)\
646 + (b&0x0303030303030303ULL)\
647 + 0x0202020202020202ULL;\
648 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
649 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
650 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
656 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
659 const uint64_t a= LD64(pixels );\
660 const uint64_t b= LD64(pixels+1);\
661 uint64_t l0= (a&0x0303030303030303ULL)\
662 + (b&0x0303030303030303ULL)\
663 + 0x0101010101010101ULL;\
664 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
665 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
669 for(i=0; i<h; i+=2){\
670 uint64_t a= LD64(pixels );\
671 uint64_t b= LD64(pixels+1);\
672 l1= (a&0x0303030303030303ULL)\
673 + (b&0x0303030303030303ULL);\
674 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
675 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
676 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
681 l0= (a&0x0303030303030303ULL)\
682 + (b&0x0303030303030303ULL)\
683 + 0x0101010101010101ULL;\
684 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
685 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
686 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
692 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
693 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
694 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
695 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
696 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
697 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
698 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
700 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
701 #else // 64 bit variant
/* 32-bit PIXOP2 variant: same generated family as the 64-bit version above,
 * built from 2/4-byte loads. The _l2 helpers average two source rows/columns
 * via rnd_avg32 / no_rnd_avg32; the _l4 helpers combine four sources for
 * quarter-pel interpolation using the low-2-bit / high-6-bit split
 * (l0,l1 / h0,h1) so per-byte sums don't overflow their lanes. */
703 #define PIXOP2(OPNAME, OP) \
704 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
707 OP(*((uint16_t*)(block )), LD16(pixels ));\
712 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
715 OP(*((uint32_t*)(block )), LD32(pixels ));\
720 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
723 OP(*((uint32_t*)(block )), LD32(pixels ));\
724 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
729 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
730 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
733 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
734 int src_stride1, int src_stride2, int h){\
738 a= LD32(&src1[i*src_stride1 ]);\
739 b= LD32(&src2[i*src_stride2 ]);\
740 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
741 a= LD32(&src1[i*src_stride1+4]);\
742 b= LD32(&src2[i*src_stride2+4]);\
743 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
747 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
748 int src_stride1, int src_stride2, int h){\
752 a= LD32(&src1[i*src_stride1 ]);\
753 b= LD32(&src2[i*src_stride2 ]);\
754 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
755 a= LD32(&src1[i*src_stride1+4]);\
756 b= LD32(&src2[i*src_stride2+4]);\
757 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
761 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
762 int src_stride1, int src_stride2, int h){\
766 a= LD32(&src1[i*src_stride1 ]);\
767 b= LD32(&src2[i*src_stride2 ]);\
768 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
772 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
773 int src_stride1, int src_stride2, int h){\
777 a= LD16(&src1[i*src_stride1 ]);\
778 b= LD16(&src2[i*src_stride2 ]);\
779 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
783 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
784 int src_stride1, int src_stride2, int h){\
785 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
786 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
789 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
790 int src_stride1, int src_stride2, int h){\
791 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
792 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
795 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
796 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
799 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
800 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
803 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
804 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
807 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
808 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
811 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
812 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
815 uint32_t a, b, c, d, l0, l1, h0, h1;\
816 a= LD32(&src1[i*src_stride1]);\
817 b= LD32(&src2[i*src_stride2]);\
818 c= LD32(&src3[i*src_stride3]);\
819 d= LD32(&src4[i*src_stride4]);\
820 l0= (a&0x03030303UL)\
823 h0= ((a&0xFCFCFCFCUL)>>2)\
824 + ((b&0xFCFCFCFCUL)>>2);\
825 l1= (c&0x03030303UL)\
827 h1= ((c&0xFCFCFCFCUL)>>2)\
828 + ((d&0xFCFCFCFCUL)>>2);\
829 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
830 a= LD32(&src1[i*src_stride1+4]);\
831 b= LD32(&src2[i*src_stride2+4]);\
832 c= LD32(&src3[i*src_stride3+4]);\
833 d= LD32(&src4[i*src_stride4+4]);\
834 l0= (a&0x03030303UL)\
837 h0= ((a&0xFCFCFCFCUL)>>2)\
838 + ((b&0xFCFCFCFCUL)>>2);\
839 l1= (c&0x03030303UL)\
841 h1= ((c&0xFCFCFCFCUL)>>2)\
842 + ((d&0xFCFCFCFCUL)>>2);\
843 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
847 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
848 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
851 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
852 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
855 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
856 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
859 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
860 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
863 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
864 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
867 uint32_t a, b, c, d, l0, l1, h0, h1;\
868 a= LD32(&src1[i*src_stride1]);\
869 b= LD32(&src2[i*src_stride2]);\
870 c= LD32(&src3[i*src_stride3]);\
871 d= LD32(&src4[i*src_stride4]);\
872 l0= (a&0x03030303UL)\
875 h0= ((a&0xFCFCFCFCUL)>>2)\
876 + ((b&0xFCFCFCFCUL)>>2);\
877 l1= (c&0x03030303UL)\
879 h1= ((c&0xFCFCFCFCUL)>>2)\
880 + ((d&0xFCFCFCFCUL)>>2);\
881 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
882 a= LD32(&src1[i*src_stride1+4]);\
883 b= LD32(&src2[i*src_stride2+4]);\
884 c= LD32(&src3[i*src_stride3+4]);\
885 d= LD32(&src4[i*src_stride4+4]);\
886 l0= (a&0x03030303UL)\
889 h0= ((a&0xFCFCFCFCUL)>>2)\
890 + ((b&0xFCFCFCFCUL)>>2);\
891 l1= (c&0x03030303UL)\
893 h1= ((c&0xFCFCFCFCUL)>>2)\
894 + ((d&0xFCFCFCFCUL)>>2);\
895 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
898 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
899 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
900 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
901 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
903 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
904 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
905 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
906 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
909 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
911 int i, a0, b0, a1, b1;\
918 for(i=0; i<h; i+=2){\
924 block[0]= (a1+a0)>>2; /* FIXME non put */\
925 block[1]= (b1+b0)>>2;\
935 block[0]= (a1+a0)>>2;\
936 block[1]= (b1+b0)>>2;\
942 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
945 const uint32_t a= LD32(pixels );\
946 const uint32_t b= LD32(pixels+1);\
947 uint32_t l0= (a&0x03030303UL)\
950 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
951 + ((b&0xFCFCFCFCUL)>>2);\
955 for(i=0; i<h; i+=2){\
956 uint32_t a= LD32(pixels );\
957 uint32_t b= LD32(pixels+1);\
958 l1= (a&0x03030303UL)\
960 h1= ((a&0xFCFCFCFCUL)>>2)\
961 + ((b&0xFCFCFCFCUL)>>2);\
962 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
967 l0= (a&0x03030303UL)\
970 h0= ((a&0xFCFCFCFCUL)>>2)\
971 + ((b&0xFCFCFCFCUL)>>2);\
972 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
978 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
983 const uint32_t a= LD32(pixels );\
984 const uint32_t b= LD32(pixels+1);\
985 uint32_t l0= (a&0x03030303UL)\
988 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
989 + ((b&0xFCFCFCFCUL)>>2);\
993 for(i=0; i<h; i+=2){\
994 uint32_t a= LD32(pixels );\
995 uint32_t b= LD32(pixels+1);\
996 l1= (a&0x03030303UL)\
998 h1= ((a&0xFCFCFCFCUL)>>2)\
999 + ((b&0xFCFCFCFCUL)>>2);\
1000 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1005 l0= (a&0x03030303UL)\
1008 h0= ((a&0xFCFCFCFCUL)>>2)\
1009 + ((b&0xFCFCFCFCUL)>>2);\
1010 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1014 pixels+=4-line_size*(h+1);\
1015 block +=4-line_size*h;\
1019 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1022 for(j=0; j<2; j++){\
1024 const uint32_t a= LD32(pixels );\
1025 const uint32_t b= LD32(pixels+1);\
1026 uint32_t l0= (a&0x03030303UL)\
1029 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1030 + ((b&0xFCFCFCFCUL)>>2);\
1034 for(i=0; i<h; i+=2){\
1035 uint32_t a= LD32(pixels );\
1036 uint32_t b= LD32(pixels+1);\
1037 l1= (a&0x03030303UL)\
1038 + (b&0x03030303UL);\
1039 h1= ((a&0xFCFCFCFCUL)>>2)\
1040 + ((b&0xFCFCFCFCUL)>>2);\
1041 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1046 l0= (a&0x03030303UL)\
1049 h0= ((a&0xFCFCFCFCUL)>>2)\
1050 + ((b&0xFCFCFCFCUL)>>2);\
1051 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1055 pixels+=4-line_size*(h+1);\
1056 block +=4-line_size*h;\
1060 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1061 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1062 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1063 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1064 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1065 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1066 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1067 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1069 #define op_avg(a, b) a = rnd_avg32(a, b)
1071 #define op_put(a, b) a = b
/* Rounded per-value averages used by the tpel/gmc helpers below:
 * avg2: (a+b+1)/2, rounding half up; avg4: (a+b+c+d+2)/4, rounding half up.
 * Fix: macro parameters are now fully parenthesized (CERT PRE01-C) so
 * arguments containing operators of lower precedence than '+' (e.g. a|b,
 * a<<1) expand correctly; behavior for existing plain-identifier call sites
 * is unchanged. Arguments are still evaluated once each. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* Adapter: average two 16-wide sources into dst with the no-rounding form,
 * using a single common stride. */
1081 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1082 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
/* Adapter: average two 8-wide sources into dst with the no-rounding form,
 * using a single common stride. */
1085 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1086 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/* Single-warp-point GMC: bilinear interpolation at a 1/16-pel offset
 * (x16, y16 in [0,16]). A..D are the four bilinear weights (they sum to 256,
 * hence the >>8 after adding rounder). Row advance elided in this view. */
1089 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1091 const int A=(16-x16)*(16-y16);
1092 const int B=( x16)*(16-y16);
1093 const int C=(16-x16)*( y16);
1094 const int D=( x16)*( y16);
1099 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1100 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1101 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1102 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1103 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1104 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1105 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1106 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General GMC: per-pixel affine source position (ox,oy advanced by
 * dxx/dxy/dyx/dyy), fractional-pel bilinear interpolation with subpel
 * resolution s = 1<<shift. The unsigned comparisons fold the <0 and >=limit
 * bounds checks into one; out-of-range coordinates are clamped and the
 * interpolation degenerates to 1-D or nearest-sample accordingly. */
1112 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1113 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1116 const int s= 1<<shift;
1126 for(x=0; x<8; x++){ //XXX FIXME optimize
1127 int src_x, src_y, frac_x, frac_y, index;
1131 frac_x= src_x&(s-1);
1132 frac_y= src_y&(s-1);
1136 if((unsigned)src_x < width){
1137 if((unsigned)src_y < height){
/* Fully inside: 2-D bilinear interpolation. */
1138 index= src_x + src_y*stride;
1139 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1140 + src[index +1]* frac_x )*(s-frac_y)
1141 + ( src[index+stride ]*(s-frac_x)
1142 + src[index+stride+1]* frac_x )* frac_y
/* y out of range: clamp y, interpolate horizontally only. */
1145 index= src_x + clip(src_y, 0, height)*stride;
1146 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1147 + src[index +1]* frac_x )*s
1151 if((unsigned)src_y < height){
/* x out of range: clamp x, interpolate vertically only. */
1152 index= clip(src_x, 0, width) + src_y*stride;
1153 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1154 + src[index+stride ]* frac_y )*s
/* Both out of range: clamp both, nearest sample. */
1157 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
1158 dst[y*stride + x]= src[index ];
/* Thirdpel MC, no subpel offset: plain block copy dispatched on width. */
1170 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1172 case 2: put_pixels2_c (dst, src, stride, height); break;
1173 case 4: put_pixels4_c (dst, src, stride, height); break;
1174 case 8: put_pixels8_c (dst, src, stride, height); break;
1175 case 16:put_pixels16_c(dst, src, stride, height); break;
/* Thirdpel MC (dx=1/3): horizontal weights 2/3,1/3 via the 683/2^11
 * fixed-point approximation (683*3 = 2049 ~ 2048). */
1179 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1181 for (i=0; i < height; i++) {
1182 for (j=0; j < width; j++) {
1183 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Thirdpel MC (dx=2/3): horizontal weights 1/3,2/3. */
1190 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1192 for (i=0; i < height; i++) {
1193 for (j=0; j < width; j++) {
1194 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Thirdpel MC (dy=1/3): vertical weights 2/3,1/3. */
1201 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1203 for (i=0; i < height; i++) {
1204 for (j=0; j < width; j++) {
1205 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Thirdpel MC (dx=1/3, dy=1/3): 2-D weights 4,3,3,2 over 12, via
 * 2731/2^15 ~ 1/12. */
1212 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1214 for (i=0; i < height; i++) {
1215 for (j=0; j < width; j++) {
1216 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC (dx=1/3, dy=2/3): 2-D weights 3,2,4,3 over 12. */
1223 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1225 for (i=0; i < height; i++) {
1226 for (j=0; j < width; j++) {
1227 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC (dy=2/3): vertical weights 1/3,2/3. */
1234 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1236 for (i=0; i < height; i++) {
1237 for (j=0; j < width; j++) {
1238 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/**
 * Thirdpel interpolation at (2/3, 1/3): 2x2 weights 3,4,2,3 (sum 12),
 * divided by 12 via 2731/32768 with rounding.  Loop scaffolding restored.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel interpolation at (2/3, 2/3): 2x2 weights 2,3,3,4 (sum 12),
 * divided by 12 via 2731/32768 with rounding.  Loop scaffolding restored.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Whole-pel (0,0) thirdpel average: dispatch to the fixed-width averaging
 * helper.  The listing had dropped the switch statement/braces; restored.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    default: break; /* unsupported width: nothing to do */
    }
}
/**
 * Averaging variant of the (1/3, 0) thirdpel filter: the interpolated value
 * is averaged into the existing dst pixel with rounding.
 * Loop scaffolding restored (was dropped by the listing).
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the (2/3, 0) thirdpel filter (rounded average into
 * dst).  Loop scaffolding restored.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the (0, 1/3) thirdpel filter (rounded average into
 * dst).  Loop scaffolding restored.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the (1/3, 1/3) thirdpel filter: 2x2 weights 4,3,3,2
 * (sum 12, divided via 2731/32768), then rounded average into dst.
 * Loop scaffolding restored.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the (1/3, 2/3) thirdpel filter: 2x2 weights 3,2,4,3,
 * then rounded average into dst.  Loop scaffolding restored.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the (0, 2/3) thirdpel filter (rounded average into
 * dst).  Loop scaffolding restored.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the (2/3, 1/3) thirdpel filter: 2x2 weights 3,4,2,3,
 * then rounded average into dst.  Loop scaffolding restored.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging variant of the (2/3, 2/3) thirdpel filter: 2x2 weights 2,3,3,4,
 * then rounded average into dst.  Loop scaffolding restored.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/*
 * TPEL_WIDTH(width): wraps the variable-width thirdpel helpers above into
 * fixed-width functions.  BUG FIX: the original body prefixed every forwarded
 * call with `void', which makes each line an old-style function *declaration*
 * rather than a call, so the generated wrappers would do nothing; the `void'
 * is removed so the helpers are actually invoked.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/*
 * H264_CHROMA_MC: generates eighth-pel bilinear chroma MC for 2/4/8-wide
 * blocks.  Weights A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy sum to 64, and OP
 * performs the final /64 store (put) or rounded average into dst (avg).
 * The listing had dropped the loop scaffolding, per-row advance, closing
 * braces and the trailing #undefs (which are required: op_avg/op_put are
 * redefined later for the qpel code); all restored.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}

/* put: (weighted sum + 32) >> 6; avg: rounded average of the put result
   with the pixel already in dst. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
/**
 * Copy a 4-byte-wide block of h rows via the unaligned 32-bit load/store
 * helpers (LD32/ST32 come from the dsputil headers).  The listing had
 * dropped the loop and pointer-advance lines; restored.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copy an 8-byte-wide block of h rows (two unaligned 32-bit moves per row).
 * Loop scaffolding restored.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copy a 16-byte-wide block of h rows (four unaligned 32-bit moves per row).
 * Loop scaffolding restored.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copy a 17-byte-wide block of h rows (16+1 columns are needed by the
 * 16-wide qpel filters): four unaligned 32-bit moves plus the odd 17th byte.
 * Loop scaffolding and the dst[16] byte copy restored.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16];
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copy a 9-byte-wide block of h rows (8+1 columns are needed by the 8-wide
 * qpel filters): two unaligned 32-bit moves plus the odd 9th byte.
 * Loop scaffolding and the dst[8] byte copy restored.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8];
        dst+=dstStride;
        src+=srcStride;
    }
}
/*
 * QPEL_MC: generates the complete family of MPEG-4 quarter-pel motion
 * compensation functions for 8x8 and 16x16 blocks.  OPNAME prefixes the
 * emitted names (put_/put_no_rnd_/avg_), RND selects the rounding variant of
 * the intermediate "put" helpers, and OP performs the final clamped store
 * (see the op_put/op_avg definitions that follow this macro).
 * NOTE(review): this listing appears to have dropped some original lines
 * (local declarations, closing braces); the code below is kept byte-identical
 * to the listing, only comments were added.
 */
1522 #define QPEL_MC(r, OPNAME, RND, OP) \
/* 8-wide horizontal lowpass: tap weights +20,-6,+3,-1 (sum 32); near the\
 * right border the outer taps reuse in-block samples (src[8], src[7], ...)\
 * instead of reading past the 9 available columns. */\
1523 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1524 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1528 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1529 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1530 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1531 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1532 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1533 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1534 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1535 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* 8-tall vertical lowpass: same kernel applied down columns, reading the 9\
 * input rows once into locals. */\
1541 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1543 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1547 const int src0= src[0*srcStride];\
1548 const int src1= src[1*srcStride];\
1549 const int src2= src[2*srcStride];\
1550 const int src3= src[3*srcStride];\
1551 const int src4= src[4*srcStride];\
1552 const int src5= src[5*srcStride];\
1553 const int src6= src[6*srcStride];\
1554 const int src7= src[7*srcStride];\
1555 const int src8= src[8*srcStride];\
1556 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1557 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1558 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1559 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1560 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1561 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1562 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1563 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* 16-wide horizontal lowpass (17 input columns, same edge reuse). */\
1569 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1570 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1575 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1576 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1577 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1578 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1579 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1580 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1581 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1582 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1583 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1584 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1585 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1586 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1587 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1588 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1589 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1590 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* 16-tall vertical lowpass (17 input rows). */\
1596 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1597 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1602 const int src0= src[0*srcStride];\
1603 const int src1= src[1*srcStride];\
1604 const int src2= src[2*srcStride];\
1605 const int src3= src[3*srcStride];\
1606 const int src4= src[4*srcStride];\
1607 const int src5= src[5*srcStride];\
1608 const int src6= src[6*srcStride];\
1609 const int src7= src[7*srcStride];\
1610 const int src8= src[8*srcStride];\
1611 const int src9= src[9*srcStride];\
1612 const int src10= src[10*srcStride];\
1613 const int src11= src[11*srcStride];\
1614 const int src12= src[12*srcStride];\
1615 const int src13= src[13*srcStride];\
1616 const int src14= src[14*srcStride];\
1617 const int src15= src[15*srcStride];\
1618 const int src16= src[16*srcStride];\
1619 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1620 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1621 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1622 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1623 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1624 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1625 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1626 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1627 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1628 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1629 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1630 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1631 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1632 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1633 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1634 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* qpelN_mcXY: XY is the quarter-pel offset (X = horizontal, Y = vertical,\
 * each 0..3).  Positions are built from the lowpass passes above plus\
 * pixelsN_l2 2-way averages; the exported ff_*_old_c variants instead use a\
 * 4-way average (pixelsN_l4) of full/halfH/halfV/halfHV planes --\
 * NOTE(review): presumably kept for compatibility with older encodes;\
 * confirm before removing. */\
1640 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1641 OPNAME ## pixels8_c(dst, src, stride, 8);\
1644 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1646 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1647 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1650 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1651 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1654 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1656 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1657 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1660 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1661 uint8_t full[16*9];\
1663 copy_block9(full, src, 16, stride, 9);\
1664 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1665 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1668 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1669 uint8_t full[16*9];\
1670 copy_block9(full, src, 16, stride, 9);\
1671 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1674 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1675 uint8_t full[16*9];\
1677 copy_block9(full, src, 16, stride, 9);\
1678 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1679 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1681 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1682 uint8_t full[16*9];\
1685 uint8_t halfHV[64];\
1686 copy_block9(full, src, 16, stride, 9);\
1687 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1688 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1689 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1690 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1692 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1693 uint8_t full[16*9];\
1695 uint8_t halfHV[64];\
1696 copy_block9(full, src, 16, stride, 9);\
1697 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1698 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1699 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1700 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1702 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1703 uint8_t full[16*9];\
1706 uint8_t halfHV[64];\
1707 copy_block9(full, src, 16, stride, 9);\
1708 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1709 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1710 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1711 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1713 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1714 uint8_t full[16*9];\
1716 uint8_t halfHV[64];\
1717 copy_block9(full, src, 16, stride, 9);\
1718 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1719 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1720 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1721 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1723 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1724 uint8_t full[16*9];\
1727 uint8_t halfHV[64];\
1728 copy_block9(full, src, 16, stride, 9);\
1729 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1730 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1731 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1732 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1734 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1735 uint8_t full[16*9];\
1737 uint8_t halfHV[64];\
1738 copy_block9(full, src, 16, stride, 9);\
1739 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1740 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1741 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1742 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1744 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1745 uint8_t full[16*9];\
1748 uint8_t halfHV[64];\
1749 copy_block9(full, src, 16, stride, 9);\
1750 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1751 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1752 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1753 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1755 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1756 uint8_t full[16*9];\
1758 uint8_t halfHV[64];\
1759 copy_block9(full, src, 16, stride, 9);\
1760 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1761 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1762 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1763 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1765 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1767 uint8_t halfHV[64];\
1768 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1769 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1770 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1772 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1774 uint8_t halfHV[64];\
1775 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1776 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1777 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1779 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1780 uint8_t full[16*9];\
1783 uint8_t halfHV[64];\
1784 copy_block9(full, src, 16, stride, 9);\
1785 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1786 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1787 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1788 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1790 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1791 uint8_t full[16*9];\
1793 copy_block9(full, src, 16, stride, 9);\
1794 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1795 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1796 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1798 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1799 uint8_t full[16*9];\
1802 uint8_t halfHV[64];\
1803 copy_block9(full, src, 16, stride, 9);\
1804 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1805 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1806 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1807 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1809 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1810 uint8_t full[16*9];\
1812 copy_block9(full, src, 16, stride, 9);\
1813 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1814 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1815 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1817 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1819 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1820 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* 16x16 versions of the same quarter-pel position builders (17-wide/-tall\
 * intermediates, 24-byte scratch stride). */\
1822 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1823 OPNAME ## pixels16_c(dst, src, stride, 16);\
1826 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1828 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1829 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1832 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1833 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1836 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1838 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1839 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1842 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1843 uint8_t full[24*17];\
1845 copy_block17(full, src, 24, stride, 17);\
1846 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1847 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1850 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1851 uint8_t full[24*17];\
1852 copy_block17(full, src, 24, stride, 17);\
1853 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1856 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1857 uint8_t full[24*17];\
1859 copy_block17(full, src, 24, stride, 17);\
1860 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1861 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1863 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1864 uint8_t full[24*17];\
1865 uint8_t halfH[272];\
1866 uint8_t halfV[256];\
1867 uint8_t halfHV[256];\
1868 copy_block17(full, src, 24, stride, 17);\
1869 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1870 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1871 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1872 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1874 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1875 uint8_t full[24*17];\
1876 uint8_t halfH[272];\
1877 uint8_t halfHV[256];\
1878 copy_block17(full, src, 24, stride, 17);\
1879 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1880 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1881 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1882 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1884 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1885 uint8_t full[24*17];\
1886 uint8_t halfH[272];\
1887 uint8_t halfV[256];\
1888 uint8_t halfHV[256];\
1889 copy_block17(full, src, 24, stride, 17);\
1890 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1891 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1892 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1893 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1895 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1896 uint8_t full[24*17];\
1897 uint8_t halfH[272];\
1898 uint8_t halfHV[256];\
1899 copy_block17(full, src, 24, stride, 17);\
1900 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1901 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1902 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1903 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1905 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1906 uint8_t full[24*17];\
1907 uint8_t halfH[272];\
1908 uint8_t halfV[256];\
1909 uint8_t halfHV[256];\
1910 copy_block17(full, src, 24, stride, 17);\
1911 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1912 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1913 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1914 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1916 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1917 uint8_t full[24*17];\
1918 uint8_t halfH[272];\
1919 uint8_t halfHV[256];\
1920 copy_block17(full, src, 24, stride, 17);\
1921 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1922 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1923 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1924 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1926 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1927 uint8_t full[24*17];\
1928 uint8_t halfH[272];\
1929 uint8_t halfV[256];\
1930 uint8_t halfHV[256];\
1931 copy_block17(full, src, 24, stride, 17);\
1932 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1933 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1934 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1935 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1937 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1938 uint8_t full[24*17];\
1939 uint8_t halfH[272];\
1940 uint8_t halfHV[256];\
1941 copy_block17(full, src, 24, stride, 17);\
1942 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1943 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1944 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1945 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1947 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1948 uint8_t halfH[272];\
1949 uint8_t halfHV[256];\
1950 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1951 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1952 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1954 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1955 uint8_t halfH[272];\
1956 uint8_t halfHV[256];\
1957 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1958 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1959 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1961 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1962 uint8_t full[24*17];\
1963 uint8_t halfH[272];\
1964 uint8_t halfV[256];\
1965 uint8_t halfHV[256];\
1966 copy_block17(full, src, 24, stride, 17);\
1967 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1968 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1969 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1970 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1972 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1973 uint8_t full[24*17];\
1974 uint8_t halfH[272];\
1975 copy_block17(full, src, 24, stride, 17);\
1976 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1977 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1978 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1980 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1981 uint8_t full[24*17];\
1982 uint8_t halfH[272];\
1983 uint8_t halfV[256];\
1984 uint8_t halfHV[256];\
1985 copy_block17(full, src, 24, stride, 17);\
1986 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1987 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1988 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1989 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1991 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1992 uint8_t full[24*17];\
1993 uint8_t halfH[272];\
1994 copy_block17(full, src, 24, stride, 17);\
1995 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1996 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1997 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1999 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2000 uint8_t halfH[272];\
2001 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2002 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2005 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2006 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2007 #define op_put(a, b) a = cm[((b) + 16)>>5]
2008 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2010 QPEL_MC(0, put_ , _ , op_put)
2011 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2012 QPEL_MC(0, avg_ , _ , op_avg)
2013 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2015 #undef op_avg_no_rnd
2017 #undef op_put_no_rnd
/* H.264 half-pel interpolation: 6-tap (1,-5,20,20,-5,1) lowpass filters for
   4x4, 8x8 and 16x16 blocks, horizontal (_h_), vertical (_v_) and 2-D (_hv_).
   OP stores a value scaled by 32 (>>5 with rounding); OP2 stores the 2-D
   intermediate scaled by 1024 (>>10). The hv variants keep the horizontal
   pass in the int16_t `tmp` buffer before filtering vertically.
   NOTE(review): this listing is elided — loop headers, pointer advances and
   closing braces between the numbered lines are not shown here. */
2020 #define H264_LOWPASS(OPNAME, OP, OP2) \
2021 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2023 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2027 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2028 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2029 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2030 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2036 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2038 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2042 const int srcB= src[-2*srcStride];\
2043 const int srcA= src[-1*srcStride];\
2044 const int src0= src[0 *srcStride];\
2045 const int src1= src[1 *srcStride];\
2046 const int src2= src[2 *srcStride];\
2047 const int src3= src[3 *srcStride];\
2048 const int src4= src[4 *srcStride];\
2049 const int src5= src[5 *srcStride];\
2050 const int src6= src[6 *srcStride];\
2051 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2052 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2053 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2054 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2060 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2063 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2065 src -= 2*srcStride; /* 2 context rows above are needed by the 6-tap filter */\
2066 for(i=0; i<h+5; i++)\
2068 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2069 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2070 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2071 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2075 tmp -= tmpStride*(h+5-2); /* rewind to the first output row of tmp */\
2078 const int tmpB= tmp[-2*tmpStride];\
2079 const int tmpA= tmp[-1*tmpStride];\
2080 const int tmp0= tmp[0 *tmpStride];\
2081 const int tmp1= tmp[1 *tmpStride];\
2082 const int tmp2= tmp[2 *tmpStride];\
2083 const int tmp3= tmp[3 *tmpStride];\
2084 const int tmp4= tmp[4 *tmpStride];\
2085 const int tmp5= tmp[5 *tmpStride];\
2086 const int tmp6= tmp[6 *tmpStride];\
2087 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2088 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2089 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2090 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2096 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2098 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2102 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2103 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2104 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2105 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2106 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2107 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2108 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2109 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2115 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2117 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2121 const int srcB= src[-2*srcStride];\
2122 const int srcA= src[-1*srcStride];\
2123 const int src0= src[0 *srcStride];\
2124 const int src1= src[1 *srcStride];\
2125 const int src2= src[2 *srcStride];\
2126 const int src3= src[3 *srcStride];\
2127 const int src4= src[4 *srcStride];\
2128 const int src5= src[5 *srcStride];\
2129 const int src6= src[6 *srcStride];\
2130 const int src7= src[7 *srcStride];\
2131 const int src8= src[8 *srcStride];\
2132 const int src9= src[9 *srcStride];\
2133 const int src10=src[10*srcStride];\
2134 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2135 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2136 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2137 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2138 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2139 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2140 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2141 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2147 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2150 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2152 src -= 2*srcStride; /* 2 context rows above are needed by the 6-tap filter */\
2153 for(i=0; i<h+5; i++)\
2155 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2156 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2157 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2158 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2159 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2160 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2161 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2162 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2166 tmp -= tmpStride*(h+5-2); /* rewind to the first output row of tmp */\
2169 const int tmpB= tmp[-2*tmpStride];\
2170 const int tmpA= tmp[-1*tmpStride];\
2171 const int tmp0= tmp[0 *tmpStride];\
2172 const int tmp1= tmp[1 *tmpStride];\
2173 const int tmp2= tmp[2 *tmpStride];\
2174 const int tmp3= tmp[3 *tmpStride];\
2175 const int tmp4= tmp[4 *tmpStride];\
2176 const int tmp5= tmp[5 *tmpStride];\
2177 const int tmp6= tmp[6 *tmpStride];\
2178 const int tmp7= tmp[7 *tmpStride];\
2179 const int tmp8= tmp[8 *tmpStride];\
2180 const int tmp9= tmp[9 *tmpStride];\
2181 const int tmp10=tmp[10*tmpStride];\
2182 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2183 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2184 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2185 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2186 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2187 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2188 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2189 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* The 16x16 variants are built from four 8x8 calls (2x2 tiling). */\
2195 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2196 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2197 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2198 src += 8*srcStride;\
2199 dst += 8*dstStride;\
2200 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2201 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2204 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2205 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2206 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2207 src += 8*srcStride;\
2208 dst += 8*dstStride;\
2209 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2210 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2213 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2214 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2215 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2216 src += 8*srcStride;\
2217 dst += 8*dstStride;\
2218 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2219 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H.264 quarter-pel motion compensation dispatch: one _mcXY_ function per
   quarter-pel position (X,Y in {0,1,2,3}). Half-pel positions call the
   lowpass filters directly; quarter-pel positions average (pixels_l2) the
   nearest integer/half-pel planes. `full` holds a copy of the source block
   with 2 extra rows above and 3 below (SIZE+5 rows) for vertical filtering;
   `full_mid` points at the first real row inside it.
   NOTE(review): elided listing — closing braces between the numbered lines
   are not shown here. */
2222 #define H264_MC(OPNAME, SIZE) \
2223 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2224 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2227 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2228 uint8_t half[SIZE*SIZE];\
2229 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2230 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2233 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2234 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2237 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2238 uint8_t half[SIZE*SIZE];\
2239 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2240 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2243 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2244 uint8_t full[SIZE*(SIZE+5)];\
2245 uint8_t * const full_mid= full + SIZE*2;\
2246 uint8_t half[SIZE*SIZE];\
2247 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2248 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2249 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2252 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2253 uint8_t full[SIZE*(SIZE+5)];\
2254 uint8_t * const full_mid= full + SIZE*2;\
2255 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2256 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2259 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2260 uint8_t full[SIZE*(SIZE+5)];\
2261 uint8_t * const full_mid= full + SIZE*2;\
2262 uint8_t half[SIZE*SIZE];\
2263 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2264 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2265 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2268 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2269 uint8_t full[SIZE*(SIZE+5)];\
2270 uint8_t * const full_mid= full + SIZE*2;\
2271 uint8_t halfH[SIZE*SIZE];\
2272 uint8_t halfV[SIZE*SIZE];\
2273 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2274 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2275 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2276 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2279 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2280 uint8_t full[SIZE*(SIZE+5)];\
2281 uint8_t * const full_mid= full + SIZE*2;\
2282 uint8_t halfH[SIZE*SIZE];\
2283 uint8_t halfV[SIZE*SIZE];\
2284 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2285 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2286 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2287 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2290 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2291 uint8_t full[SIZE*(SIZE+5)];\
2292 uint8_t * const full_mid= full + SIZE*2;\
2293 uint8_t halfH[SIZE*SIZE];\
2294 uint8_t halfV[SIZE*SIZE];\
2295 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2296 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2297 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2298 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2301 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2302 uint8_t full[SIZE*(SIZE+5)];\
2303 uint8_t * const full_mid= full + SIZE*2;\
2304 uint8_t halfH[SIZE*SIZE];\
2305 uint8_t halfV[SIZE*SIZE];\
2306 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2307 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2308 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2309 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2312 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2313 int16_t tmp[SIZE*(SIZE+5)];\
2314 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2317 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2318 int16_t tmp[SIZE*(SIZE+5)];\
2319 uint8_t halfH[SIZE*SIZE];\
2320 uint8_t halfHV[SIZE*SIZE];\
2321 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2322 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2323 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2326 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2327 int16_t tmp[SIZE*(SIZE+5)];\
2328 uint8_t halfH[SIZE*SIZE];\
2329 uint8_t halfHV[SIZE*SIZE];\
2330 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2331 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2332 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2335 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2336 uint8_t full[SIZE*(SIZE+5)];\
2337 uint8_t * const full_mid= full + SIZE*2;\
2338 int16_t tmp[SIZE*(SIZE+5)];\
2339 uint8_t halfV[SIZE*SIZE];\
2340 uint8_t halfHV[SIZE*SIZE];\
2341 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2342 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2343 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2344 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2347 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2348 uint8_t full[SIZE*(SIZE+5)];\
2349 uint8_t * const full_mid= full + SIZE*2;\
2350 int16_t tmp[SIZE*(SIZE+5)];\
2351 uint8_t halfV[SIZE*SIZE];\
2352 uint8_t halfHV[SIZE*SIZE];\
2353 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2354 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2355 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2356 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store operators for H264_LOWPASS: op*/ /* for the >>5-scaled 1-D pass,
   op2_* for the >>10-scaled 2-D (hv) pass; _avg variants round-average into
   the destination. Instantiated once for "put_" and once for "avg_". */
2359 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2360 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2361 #define op_put(a, b) a = cm[((b) + 16)>>5]
2362 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2363 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2365 H264_LOWPASS(put_ , op_put, op2_put)
2366 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* H.264 (explicit/implicit) weighted prediction, WxH blocks.
   weight_*: block[x] = clip((block[x]*weight + offset) >> log2_denom)
   biweight_*: dst[x] = clip((src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
   The if(W==N) continue; chain unrolls per-row work for each supported width.
   NOTE(review): elided listing — the op_scale1/op_scale2 application lines
   and closing braces between the numbered lines are not shown here. */
2380 #define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
2381 #define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2382 #define H264_WEIGHT(W,H) \
2383 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2384 int attribute_unused x, y; \
2385 offset <<= log2_denom; \
2386 if(log2_denom) offset += 1<<(log2_denom-1); /* rounding term */ \
2387 for(y=0; y<H; y++, block += stride){ \
2390 if(W==2) continue; \
2393 if(W==4) continue; \
2398 if(W==8) continue; \
2409 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
2410 int attribute_unused x, y; \
2411 int offset = (offsets + offsetd + 1) >> 1; \
2412 offset = ((offset << 1) + 1) << log2_denom; \
2413 for(y=0; y<H; y++, dst += stride, src += stride){ \
2416 if(W==2) continue; \
2419 if(W==4) continue; \
2424 if(W==8) continue; \
/* WMV2 mspel 4-tap (-1,9,9,-1)/16 half-pel filters, width-8 horizontal and
   vertical passes. Results are rounded (+8, >>4) and clipped via cm.
   NOTE(review): elided listing — the row/column loops and closing braces
   are not shown between the numbered lines. */
2451 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2452 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2456 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2457 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2458 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2459 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2460 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2461 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2462 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2463 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2469 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2470 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2474 const int src_1= src[ -srcStride];
2475 const int src0 = src[0 ];
2476 const int src1 = src[ srcStride];
2477 const int src2 = src[2*srcStride];
2478 const int src3 = src[3*srcStride];
2479 const int src4 = src[4*srcStride];
2480 const int src5 = src[5*srcStride];
2481 const int src6 = src[6*srcStride];
2482 const int src7 = src[7*srcStride];
2483 const int src8 = src[8*srcStride];
2484 const int src9 = src[9*srcStride];
2485 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2486 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2487 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2488 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2489 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2490 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2491 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2492 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation entry points (8x8, put only): _mcXY_
   selects the sub-pel position; half-pel positions combine the h/v lowpass
   filters with pixels8_l2 averaging. The 2-D cases filter horizontally on
   rows src-stride..src+8*stride (h=11) first, then vertically on halfH+8.
   NOTE(review): elided listing — local buffer declarations (half/halfH/
   halfV/halfHV) and closing braces are not shown between numbered lines. */
2498 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2499 put_pixels8_c(dst, src, stride, 8);
2502 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2504 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2505 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2508 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2509 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2512 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2514 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2515 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2518 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2519 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2522 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2526 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2527 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2528 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2529 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2531 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2535 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2536 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2537 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2538 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2540 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2542 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2543 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 Annex J deblocking filter across a vertical (v_: horizontal edge)
   and horizontal (h_: vertical edge) block boundary. p0..p3 are the four
   pixels straddling the edge; d is the edge gradient, d1 its strength-
   limited correction (triangular function of `strength`), and d2 a smaller
   correction applied to the outer pixels.
   NOTE(review): elided listing — loop headers, the d1-based update of
   p1/p2, and closing braces are not shown between the numbered lines. */
2546 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2548 const int strength= ff_h263_loop_filter_strength[qscale];
2552 int p0= src[x-2*stride];
2553 int p1= src[x-1*stride];
2554 int p2= src[x+0*stride];
2555 int p3= src[x+1*stride];
2556 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2558 if (d<-2*strength) d1= 0;
2559 else if(d<- strength) d1=-2*strength - d;
2560 else if(d< strength) d1= d;
2561 else if(d< 2*strength) d1= 2*strength - d;
2566 if(p1&256) p1= ~(p1>>31); /* branchless clip of p1 to 0..255 */
2567 if(p2&256) p2= ~(p2>>31); /* branchless clip of p2 to 0..255 */
2569 src[x-1*stride] = p1;
2570 src[x+0*stride] = p2;
2574 d2= clip((p0-p3)/4, -ad1, ad1);
2576 src[x-2*stride] = p0 - d2;
2577 src[x+ stride] = p3 + d2;
2581 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2583 const int strength= ff_h263_loop_filter_strength[qscale];
2587 int p0= src[y*stride-2];
2588 int p1= src[y*stride-1];
2589 int p2= src[y*stride+0];
2590 int p3= src[y*stride+1];
2591 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2593 if (d<-2*strength) d1= 0;
2594 else if(d<- strength) d1=-2*strength - d;
2595 else if(d< strength) d1= d;
2596 else if(d< 2*strength) d1= 2*strength - d;
2601 if(p1&256) p1= ~(p1>>31); /* branchless clip of p1 to 0..255 */
2602 if(p2&256) p2= ~(p2>>31); /* branchless clip of p2 to 0..255 */
2604 src[y*stride-1] = p1;
2605 src[y*stride+0] = p2;
2609 d2= clip((p0-p3)/4, -ad1, ad1);
2611 src[y*stride-2] = p0 - d2;
2612 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter on an 8x8 block: separable (1,2,1)/4 smoothing.
   `temp` holds the vertically filtered block scaled by 4 (edge rows are
   copied with *4); the second pass filters horizontally and renormalizes
   (>>2 on the untouched first/last columns, >>4 elsewhere).
   NOTE(review): elided listing — the loop headers and `yz` bookkeeping
   between the numbered lines are not shown here. */
2616 static void h261_loop_filter_c(uint8_t *src, int stride){
2621 temp[x ] = 4*src[x ];
2622 temp[x + 7*8] = 4*src[x + 7*stride];
2626 xy = y * stride + x;
2628 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2633 src[ y*stride] = (temp[ y*8] + 2)>>2;
2634 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2636 xy = y * stride + x;
2638 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* Sums of absolute differences (SAD) over 16-wide rows, h rows tall, used
   by motion estimation. Variants: plain, x2 (horizontal half-pel via avg2),
   y2 (vertical half-pel, averaging with the next row pix3), xy2 (both, via
   the 4-pixel average avg4).
   NOTE(review): elided listing — `s=0`, the per-row loop and pointer
   advances, and the `return s;` lines are not shown between numbered lines. */
2643 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2649 s += abs(pix1[0] - pix2[0]);
2650 s += abs(pix1[1] - pix2[1]);
2651 s += abs(pix1[2] - pix2[2]);
2652 s += abs(pix1[3] - pix2[3]);
2653 s += abs(pix1[4] - pix2[4]);
2654 s += abs(pix1[5] - pix2[5]);
2655 s += abs(pix1[6] - pix2[6]);
2656 s += abs(pix1[7] - pix2[7]);
2657 s += abs(pix1[8] - pix2[8]);
2658 s += abs(pix1[9] - pix2[9]);
2659 s += abs(pix1[10] - pix2[10]);
2660 s += abs(pix1[11] - pix2[11]);
2661 s += abs(pix1[12] - pix2[12]);
2662 s += abs(pix1[13] - pix2[13]);
2663 s += abs(pix1[14] - pix2[14]);
2664 s += abs(pix1[15] - pix2[15]);
2671 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2677 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2678 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2679 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2680 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2681 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2682 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2683 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2684 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2685 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2686 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2687 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2688 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2689 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2690 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2691 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2692 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2699 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2702 uint8_t *pix3 = pix2 + line_size; /* row below pix2 for vertical averaging */
2706 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2707 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2708 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2709 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2710 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2711 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2712 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2713 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2714 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2715 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2716 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2717 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2718 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2719 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2720 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2721 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2729 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2732 uint8_t *pix3 = pix2 + line_size; /* row below pix2 for 2-D averaging */
2736 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2737 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2738 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2739 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2740 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2741 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2742 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2743 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2744 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2745 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2746 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2747 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2748 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2749 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2750 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2751 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD variants, same structure as the pix_abs16* family above:
   plain, x2 (horizontal half-pel), y2 (vertical half-pel) and xy2.
   NOTE(review): elided listing — `s=0`, loops and `return s;` not shown. */
2759 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2765 s += abs(pix1[0] - pix2[0]);
2766 s += abs(pix1[1] - pix2[1]);
2767 s += abs(pix1[2] - pix2[2]);
2768 s += abs(pix1[3] - pix2[3]);
2769 s += abs(pix1[4] - pix2[4]);
2770 s += abs(pix1[5] - pix2[5]);
2771 s += abs(pix1[6] - pix2[6]);
2772 s += abs(pix1[7] - pix2[7]);
2779 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2785 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2786 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2787 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2788 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2789 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2790 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2791 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2792 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2799 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2802 uint8_t *pix3 = pix2 + line_size; /* row below pix2 for vertical averaging */
2806 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2807 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2808 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2809 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2810 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2811 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2812 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2813 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2821 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2824 uint8_t *pix3 = pix2 + line_size; /* row below pix2 for 2-D averaging */
2828 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2829 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2830 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2831 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2832 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2833 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2834 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2835 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE comparison (16- and 8-wide): score1 is plain SSE,
   score2 accumulates the difference in local 2x2 gradient "noise" between
   s1 and s2; the final score weights |score2| by avctx->nsse_weight (or 8
   when no context is supplied).
   NOTE(review): elided listing — loop headers/pointer advances not shown. */
2843 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2849 for(x=0; x<16; x++){
2850 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2853 for(x=0; x<15; x++){
2854 score2+= ABS( s1[x ] - s1[x +stride]
2855 - s1[x+1] + s1[x+1+stride])
2856 -ABS( s2[x ] - s2[x +stride]
2857 - s2[x+1] + s2[x+1+stride]);
2864 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2865 else return score1 + ABS(score2)*8;
2868 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2875 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2879 score2+= ABS( s1[x ] - s1[x +stride]
2880 - s1[x+1] + s1[x+1+stride])
2881 -ABS( s2[x ] - s2[x +stride]
2882 - s2[x+1] + s2[x+1+stride]);
2889 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2890 else return score1 + ABS(score2)*8;
/* Rate-distortion helpers for trellis quantization: try_8x8basis_c returns
   the weighted squared error of rem after adding `basis*scale` (rescaled
   from BASIS_SHIFT to RECON_SHIFT with rounding); add_8x8basis_c applies
   that same rescaled update to rem in place. */
2893 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2897 for(i=0; i<8*8; i++){
2898 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2901 assert(-512<b && b<512);
2903 sum += (w*b)*(w*b)>>4;
2908 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2911 for(i=0; i<8*8; i++){
2912 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2917 * permutes an 8x8 block.
2918 * @param block the block which will be permuted according to the given permutation vector
2919 * @param permutation the permutation vector
2920 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2921 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2922 * (inverse) permutated to scantable order!
2924 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2930 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
/* First pass: stash (and presumably clear) coefficients 0..last in scantable
   order into `temp`; second pass scatters them to their permuted slots.
   NOTE(review): elided listing — `temp` declaration and the copy/clear
   statements between the numbered lines are not shown here. */
2932 for(i=0; i<=last; i++){
2933 const int j= scantable[i];
2938 for(i=0; i<=last; i++){
2939 const int j= scantable[i];
2940 const int perm_j= permutation[j];
2941 block[perm_j]= temp[j];
/* zero_cmp: trivial comparison function (always-equal stub used when a
   comparison is disabled). ff_set_cmp fills the 5-entry cmp[] table from
   the DSPContext according to the requested metric `type`, logging an
   error for unknown types.
   NOTE(review): elided listing — the switch/case selection lines between
   the numbered lines are not shown here. */
2945 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
2949 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2952 memset(cmp, 0, sizeof(void*)*5);
2960 cmp[i]= c->hadamard8_diff[i];
2966 cmp[i]= c->dct_sad[i];
2969 cmp[i]= c->dct_max[i];
2972 cmp[i]= c->quant_psnr[i];
2999 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* clear_blocks_c: zero the six 64-coefficient DCT blocks of a macroblock. */
3005 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3007 static void clear_blocks_c(DCTELEM *blocks)
3009 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* Byte-array helpers for HuffYUV-style prediction: add_bytes_c adds src to
   dst, diff_bytes_c stores src1-src2 into dst (both 8-way unrolled with a
   scalar tail loop, arithmetic wraps mod 256), and
   sub_hfyu_median_prediction_c subtracts the median predictor
   mid_pred(left, top, left+top-topleft) from src1.
   NOTE(review): elided listing — the tail-loop headers and
   sub_hfyu_median_prediction_c's loop body/state updates are not shown. */
3012 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3014 for(i=0; i+7<w; i+=8){
3015 dst[i+0] += src[i+0];
3016 dst[i+1] += src[i+1];
3017 dst[i+2] += src[i+2];
3018 dst[i+3] += src[i+3];
3019 dst[i+4] += src[i+4];
3020 dst[i+5] += src[i+5];
3021 dst[i+6] += src[i+6];
3022 dst[i+7] += src[i+7];
3025 dst[i+0] += src[i+0];
3028 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3030 for(i=0; i+7<w; i+=8){
3031 dst[i+0] = src1[i+0]-src2[i+0];
3032 dst[i+1] = src1[i+1]-src2[i+1];
3033 dst[i+2] = src1[i+2]-src2[i+2];
3034 dst[i+3] = src1[i+3]-src2[i+3];
3035 dst[i+4] = src1[i+4]-src2[i+4];
3036 dst[i+5] = src1[i+5]-src2[i+5];
3037 dst[i+6] = src1[i+6]-src2[i+6];
3038 dst[i+7] = src1[i+7]-src2[i+7];
3041 dst[i+0] = src1[i+0]-src2[i+0];
3044 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3052 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* 8x8 Hadamard-transform SATD metrics. BUTTERFLY2 writes sum/difference of
   two inputs, BUTTERFLY1 does an in-place butterfly, BUTTERFLYA returns
   |x+y|+|x-y| for the final accumulation. hadamard8_diff8x8_c transforms
   the src-dst residual; hadamard8_intra8x8_c transforms src directly and
   subtracts the DC term (-mean).
   NOTE(review): elided listing — macro bodies (3063-3074), temp/sum
   declarations and loop headers are not shown between the numbered lines. */
3062 #define BUTTERFLY2(o1,o2,i1,i2) \
3066 #define BUTTERFLY1(x,y) \
3075 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3077 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3085 //FIXME try pointer walks
3086 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3087 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3088 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3089 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3091 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3092 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3093 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3094 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3096 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3097 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3098 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3099 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3103 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3104 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3105 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3106 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3108 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3109 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3110 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3111 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3114 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3115 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3116 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3117 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3123 printf("MAX:%d\n", maxi);
3129 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3137 //FIXME try pointer walks
3138 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3139 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3140 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3141 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3143 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3144 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3145 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3146 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3148 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3149 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3150 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3151 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3155 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3156 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3157 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3158 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3160 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3161 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3162 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3163 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3166 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3167 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3168 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3169 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3172 sum -= ABS(temp[8*0] + temp[8*4]); // -mean
/*
 * DCT-based SAD: forward-transforms the pixel difference src1-src2 and
 * scores it (accumulation happens below this excerpt — presumably a sum
 * of absolute coefficients; TODO confirm against full source).
 */
3177 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3178 MpegEncContext * const s= (MpegEncContext *)c;
/* uint64_t array used purely to guarantee 8-byte alignment for DCTELEMs */
3179 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3180 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3185 s->dsp.diff_pixels(temp, src1, src2, stride);
/*
 * DCT-max metric: forward-transforms the difference src1-src2 and
 * returns the largest absolute DCT coefficient (see FFMAX below).
 */
3194 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3195 MpegEncContext * const s= (MpegEncContext *)c;
/* alignment trick: uint64_t backing store => 8-byte aligned DCTELEM block */
3196 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3197 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3202 s->dsp.diff_pixels(temp, src1, src2, stride);
/* track the peak |coefficient| over all 64 entries */
3206 sum= FFMAX(sum, ABS(temp[i]));
3211 void simple_idct(DCTELEM *block); //FIXME
/*
 * Quantization-noise metric: transforms the difference block, saves the
 * raw coefficients in "bak", runs it through quantize -> dequantize ->
 * simple_idct, then returns the squared error between the result and the
 * saved copy.  NOTE(review): comparing an idct'ed block against raw DCT
 * coefficients looks inconsistent — matches the upstream //FIXME marker.
 */
3213 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3214 MpegEncContext * const s= (MpegEncContext *)c;
/* one aligned buffer holds both blocks: temp = [0..63], bak = [64..127] */
3215 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3216 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3217 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3223 s->dsp.diff_pixels(temp, src1, src2, stride);
/* snapshot the un-quantized coefficients for the error comparison */
3225 memcpy(bak, temp, 64*sizeof(DCTELEM));
3227 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3228 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3229 simple_idct(temp); //FIXME
/* accumulate squared reconstruction error */
3232 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/*
 * Rate-distortion cost of coding the 8x8 difference src1-src2:
 *   1. back up the src2 rows, transform+quantize the difference,
 *   2. walk the quantized coefficients in scan order and total the VLC
 *      bit cost (escape-coded levels cost esc_length),
 *   3. dequantize, idct_add onto the backup to reconstruct,
 *   4. return SSE(reconstruction, src1) + lambda-scaled bit cost.
 */
3237 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3238 MpegEncContext * const s= (MpegEncContext *)c;
3239 const uint8_t *scantable= s->intra_scantable.permutated;
3240 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
/* NOTE(review): VLA of `stride` uint64_t = 8*stride bytes, but only 8 rows
   of `stride` bytes are used below — over-allocated, kept for alignment */
3241 uint64_t __align8 aligned_bak[stride];
3242 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3243 uint8_t * const bak= (uint8_t*)aligned_bak;
3244 int i, last, run, bits, level, distoration, start_i;
3245 const int esc_length= s->ac_esc_length;
3247 uint8_t * last_length;
/* copy 8 bytes of each src2 row (two 32-bit stores) into the backup */
3252 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3253 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3256 s->dsp.diff_pixels(temp, src1, src2, stride);
3258 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra blocks: AC tables plus the luma DC VLC (offset by 256 to index
   negative DC levels); inter blocks use the inter AC tables only */
3264 length = s->intra_ac_vlc_length;
3265 last_length= s->intra_ac_vlc_last_length;
3266 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3269 length = s->inter_ac_vlc_length;
3270 last_length= s->inter_ac_vlc_last_length;
/* bit-cost every nonzero coefficient in scan order */
3275 for(i=start_i; i<last; i++){
3276 int j= scantable[i];
/* level fits the [-64,63] range biased by +64 => regular VLC lookup */
3281 if((level&(~127)) == 0){
3282 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3291 level= temp[i] + 64;
3295 if((level&(~127)) == 0){
/* final coefficient uses the dedicated "last" VLC table */
3296 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3304 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3306 s->dct_unquantize_inter(s, temp, 0, s->qscale);
/* reconstruct into the backed-up src2 rows */
3309 s->dsp.idct_add(bak, stride, temp);
3311 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
/* 109/128 * qscale^2 approximates the encoder's lambda for the rate term */
3313 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/*
 * Rate-only cost: same transform/quantize/VLC-walk as rd8x8_c above but
 * without reconstruction — returns just the estimated bit count for
 * coding the 8x8 difference src1-src2 (return not visible in excerpt).
 */
3316 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3317 MpegEncContext * const s= (MpegEncContext *)c;
3318 const uint8_t *scantable= s->intra_scantable.permutated;
3319 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3320 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3321 int i, last, run, bits, level, start_i;
3322 const int esc_length= s->ac_esc_length;
3324 uint8_t * last_length;
3328 s->dsp.diff_pixels(temp, src1, src2, stride);
3330 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* pick intra vs inter VLC tables; intra also pays the luma DC cost */
3336 length = s->intra_ac_vlc_length;
3337 last_length= s->intra_ac_vlc_last_length;
3338 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3341 length = s->inter_ac_vlc_length;
3342 last_length= s->inter_ac_vlc_last_length;
/* sum VLC lengths of all nonzero coefficients in scan order */
3347 for(i=start_i; i<last; i++){
3348 int j= scantable[i];
3353 if((level&(~127)) == 0){
3354 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3363 level= temp[i] + 64;
3367 if((level&(~127)) == 0){
/* last coefficient is coded from the separate "last" table */
3368 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/*
 * Vertical-SAD intra metric: sums |s[y][x] - s[y+1][x]| over a 16-wide
 * block, i.e. measures vertical pixel activity of the source alone
 * ("dummy" is unused; outer row loop not visible in this excerpt).
 */
3376 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
/* unrolled 4 columns per iteration */
3381 for(x=0; x<16; x+=4){
3382 score+= ABS(s[x ] - s[x +stride]) + ABS(s[x+1] - s[x+1+stride])
3383 +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
/*
 * Vertical-SAD of the difference signal: sums the absolute vertical
 * gradient of (s1 - s2) over a 16-wide block — low when the prediction
 * error is vertically smooth.
 */
3391 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3396 for(x=0; x<16; x++){
3397 score+= ABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* square helper for the vertical-SSE metrics below */
3406 #define SQ(a) ((a)*(a))
/*
 * Vertical-SSE intra metric: squared version of vsad_intra16_c — sums
 * squared vertical differences of the source block.
 */
3407 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
/* unrolled 4 columns per iteration */
3412 for(x=0; x<16; x+=4){
3413 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3414 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/*
 * Vertical-SSE of the difference signal: squared counterpart of
 * vsad16_c over a 16-wide block.
 */
3422 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3427 for(x=0; x<16; x++){
3428 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Generate 16x16 comparison functions from the 8x8 ones: the
   WARPER8_16_SQ macro (defined earlier in this file) calls the 8x8
   function on each of the four 8x8 quadrants and combines the results. */
3437 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3438 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3439 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3440 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3441 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3442 WARPER8_16_SQ(rd8x8_c, rd16_c)
3443 WARPER8_16_SQ(bit8x8_c, bit16_c)
3445 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* Reference (jref) IDCT glue: each wrapper runs the integer IDCT on
   "block" (IDCT call lines not visible in this excerpt) and then writes
   the result into "dest" — _put overwrites, _add accumulates.  The 4/2/1
   variants are the reduced-resolution (lowres) versions. */
3447 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3450 put_pixels_clamped_c(block, dest, line_size);
3452 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3455 add_pixels_clamped_c(block, dest, line_size);
3458 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3461 put_pixels_clamped4_c(block, dest, line_size);
3463 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3466 add_pixels_clamped4_c(block, dest, line_size);
3469 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3472 put_pixels_clamped2_c(block, dest, line_size);
3474 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3477 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 lowres: only the DC coefficient survives; (dc+4)>>3 rounds the
   3-bit downscale and cropTbl clamps to [0,255] */
3480 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3482 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3484 dest[0] = cm[(block[0] + 4)>>3];
3486 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3488 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3490 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3493 /* init static data */
/* Fill the process-wide lookup tables declared at the top of this file:
   cropTbl (clamp to [0,255] with MAX_NEG_CROP headroom on both sides),
   squareTbl ((i-256)^2 for fast SSE), and inv_zigzag_direct16 (inverse
   zigzag + 1, used by the MMX quantizer).  Call once before encoding. */
3494 void dsputil_static_init(void)
/* identity region: values 0..255 map to themselves */
3498 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3499 for(i=0;i<MAX_NEG_CROP;i++) {
/* overflow region above 255 saturates to 255 (underflow handled in the
   lines not visible in this excerpt) */
3501 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3504 for(i=0;i<512;i++) {
/* squareTbl[i] = (i-256)^2, indexed by a biased difference */
3505 squareTbl[i] = (i - 256) * (i - 256);
/* +1 bias lets zero mark "empty" slots for the MMX quantizer */
3508 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3512 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3516 #ifdef CONFIG_ENCODERS
3517 if(avctx->dct_algo==FF_DCT_FASTINT) {
3518 c->fdct = fdct_ifast;
3519 c->fdct248 = fdct_ifast248;
3521 else if(avctx->dct_algo==FF_DCT_FAAN) {
3522 c->fdct = ff_faandct;
3523 c->fdct248 = ff_faandct248;
3526 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3527 c->fdct248 = ff_fdct248_islow;
3529 #endif //CONFIG_ENCODERS
3531 if(avctx->lowres==1){
3532 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3533 c->idct_put= ff_jref_idct4_put;
3534 c->idct_add= ff_jref_idct4_add;
3536 c->idct_put= ff_h264_lowres_idct_put_c;
3537 c->idct_add= ff_h264_lowres_idct_add_c;
3539 c->idct = j_rev_dct4;
3540 c->idct_permutation_type= FF_NO_IDCT_PERM;
3541 }else if(avctx->lowres==2){
3542 c->idct_put= ff_jref_idct2_put;
3543 c->idct_add= ff_jref_idct2_add;
3544 c->idct = j_rev_dct2;
3545 c->idct_permutation_type= FF_NO_IDCT_PERM;
3546 }else if(avctx->lowres==3){
3547 c->idct_put= ff_jref_idct1_put;
3548 c->idct_add= ff_jref_idct1_add;
3549 c->idct = j_rev_dct1;
3550 c->idct_permutation_type= FF_NO_IDCT_PERM;
3552 if(avctx->idct_algo==FF_IDCT_INT){
3553 c->idct_put= ff_jref_idct_put;
3554 c->idct_add= ff_jref_idct_add;
3555 c->idct = j_rev_dct;
3556 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3557 }else{ //accurate/default
3558 c->idct_put= simple_idct_put;
3559 c->idct_add= simple_idct_add;
3560 c->idct = simple_idct;
3561 c->idct_permutation_type= FF_NO_IDCT_PERM;
3565 c->h264_idct_add= ff_h264_idct_add_c;
3567 /* VP3 DSP support */
3568 c->vp3_dsp_init = vp3_dsp_init_c;
3569 c->vp3_idct = vp3_idct_c;
3571 c->get_pixels = get_pixels_c;
3572 c->diff_pixels = diff_pixels_c;
3573 c->put_pixels_clamped = put_pixels_clamped_c;
3574 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3575 c->add_pixels_clamped = add_pixels_clamped_c;
3578 c->clear_blocks = clear_blocks_c;
3579 c->pix_sum = pix_sum_c;
3580 c->pix_norm1 = pix_norm1_c;
3582 /* TODO [0] 16 [1] 8 */
3583 c->pix_abs[0][0] = pix_abs16_c;
3584 c->pix_abs[0][1] = pix_abs16_x2_c;
3585 c->pix_abs[0][2] = pix_abs16_y2_c;
3586 c->pix_abs[0][3] = pix_abs16_xy2_c;
3587 c->pix_abs[1][0] = pix_abs8_c;
3588 c->pix_abs[1][1] = pix_abs8_x2_c;
3589 c->pix_abs[1][2] = pix_abs8_y2_c;
3590 c->pix_abs[1][3] = pix_abs8_xy2_c;
3592 #define dspfunc(PFX, IDX, NUM) \
3593 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3594 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3595 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3596 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3598 dspfunc(put, 0, 16);
3599 dspfunc(put_no_rnd, 0, 16);
3601 dspfunc(put_no_rnd, 1, 8);
3605 dspfunc(avg, 0, 16);
3606 dspfunc(avg_no_rnd, 0, 16);
3608 dspfunc(avg_no_rnd, 1, 8);
3613 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3614 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3616 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3617 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3618 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3619 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3620 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3621 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3622 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3623 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3624 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3626 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3627 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3628 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3629 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3630 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3631 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3632 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3633 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3634 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3636 #define dspfunc(PFX, IDX, NUM) \
3637 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3638 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3639 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3640 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3641 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3642 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3643 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3644 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3645 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3646 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3647 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3648 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3649 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3650 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3651 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3652 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3654 dspfunc(put_qpel, 0, 16);
3655 dspfunc(put_no_rnd_qpel, 0, 16);
3657 dspfunc(avg_qpel, 0, 16);
3658 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3660 dspfunc(put_qpel, 1, 8);
3661 dspfunc(put_no_rnd_qpel, 1, 8);
3663 dspfunc(avg_qpel, 1, 8);
3664 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3666 dspfunc(put_h264_qpel, 0, 16);
3667 dspfunc(put_h264_qpel, 1, 8);
3668 dspfunc(put_h264_qpel, 2, 4);
3669 dspfunc(avg_h264_qpel, 0, 16);
3670 dspfunc(avg_h264_qpel, 1, 8);
3671 dspfunc(avg_h264_qpel, 2, 4);
3674 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3675 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3676 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3677 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3678 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3679 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3681 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3682 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3683 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3684 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3685 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3686 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3687 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3688 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3689 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3690 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3691 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3692 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3693 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3694 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3695 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3696 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3697 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3698 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3699 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3700 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3702 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3703 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3704 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3705 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3706 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3707 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3708 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3709 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3711 #define SET_CMP_FUNC(name) \
3712 c->name[0]= name ## 16_c;\
3713 c->name[1]= name ## 8x8_c;
3715 SET_CMP_FUNC(hadamard8_diff)
3716 c->hadamard8_diff[4]= hadamard8_intra16_c;
3717 SET_CMP_FUNC(dct_sad)
3718 SET_CMP_FUNC(dct_max)
3719 c->sad[0]= pix_abs16_c;
3720 c->sad[1]= pix_abs8_c;
3724 SET_CMP_FUNC(quant_psnr)
3727 c->vsad[0]= vsad16_c;
3728 c->vsad[4]= vsad_intra16_c;
3729 c->vsse[0]= vsse16_c;
3730 c->vsse[4]= vsse_intra16_c;
3731 c->nsse[0]= nsse16_c;
3732 c->nsse[1]= nsse8_c;
3733 c->w53[0]= w53_16_c;
3735 c->w97[0]= w97_16_c;
3738 c->add_bytes= add_bytes_c;
3739 c->diff_bytes= diff_bytes_c;
3740 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3741 c->bswap_buf= bswap_buf;
3743 c->h263_h_loop_filter= h263_h_loop_filter_c;
3744 c->h263_v_loop_filter= h263_v_loop_filter_c;
3746 c->h261_loop_filter= h261_loop_filter_c;
3748 c->try_8x8basis= try_8x8basis_c;
3749 c->add_8x8basis= add_8x8basis_c;
3752 dsputil_init_mmx(c, avctx);
3755 dsputil_init_armv4l(c, avctx);
3758 dsputil_init_mlib(c, avctx);
3761 dsputil_init_vis(c,avctx);
3764 dsputil_init_alpha(c, avctx);
3767 dsputil_init_ppc(c, avctx);
3770 dsputil_init_mmi(c, avctx);
3773 dsputil_init_sh4(c,avctx);
3776 switch(c->idct_permutation_type){
3777 case FF_NO_IDCT_PERM:
3779 c->idct_permutation[i]= i;
3781 case FF_LIBMPEG2_IDCT_PERM:
3783 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3785 case FF_SIMPLE_IDCT_PERM:
3787 c->idct_permutation[i]= simple_mmx_permutation[i];
3789 case FF_TRANSPOSE_IDCT_PERM:
3791 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3794 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");