/*
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 */
29 #include "mpegvideo.h"
30 #include "simple_idct.h"
/* Clipping lookup table: indexed via (cropTbl + MAX_NEG_CROP), so indices in
 * [-MAX_NEG_CROP, 255 + MAX_NEG_CROP] are valid (see put/add_pixels_clamped_c).
 * NOTE(review): presumably filled by an init routine elsewhere in this file —
 * confirm before relying on its contents. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square lookup table: used as sq = squareTbl + 256, i.e. sq[x] for
 * x in [-256, 255] (see pix_norm1_c / sse8_c). NOTE(review): initialization
 * is not visible here — confirm it holds x*x. */
uint32_t squareTbl[512];
/* Standard (progressive) zigzag scan order: maps scan position to raster
 * index in an 8x8 block. Fix: the closing "};" of the initializer was
 * missing in this copy. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields.
   Fix: the closing "};" of the initializer was missing in this copy. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): __align8 is a project-defined alignment attribute macro
 * (not visible in this chunk); the table itself is presumably filled at
 * init time elsewhere — confirm against the init code. */
uint16_t __align8 inv_zigzag_direct16[64];
/* Alternate horizontal scan order (interlaced material).
 * Fix: the closing "};" of the initializer was missing in this copy. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (interlaced material).
 * Fix: the closing "};" of the initializer was missing in this copy. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * (fixed-point reciprocal table used to replace division by a uint8 value).
 * Fix: the closing "};" of the initializer was missing in this copy. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for the simple_idct_mmx
 * (maps natural 8x8 coefficient positions to the layout the MMX IDCT expects).
 * Fix: the closing "};" of the initializer was missing in this copy. */
static const uint8_t simple_mmx_permutation[64]={
	0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D, 
	0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D, 
	0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D, 
	0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F, 
	0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F, 
	0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D, 
	0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F, 
	0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/**
 * Sum of all 256 pixel values of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size stride between rows, in bytes (>= 16)
 * @return sum of the pixel values
 * Fix: the truncated unrolled accumulation and loop closing were restored.
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            /* manually unrolled by 8 to match the rest of the file */
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16; /* advance to the next row */
    }
    return s;
}
155 static int pix_norm1_c(uint8_t * pix, int line_size)
158 uint32_t *sq = squareTbl + 256;
161 for (i = 0; i < 16; i++) {
162 for (j = 0; j < 16; j += 8) {
173 #if LONG_MAX > 2147483647
174 register uint64_t x=*(uint64_t*)pix;
176 s += sq[(x>>8)&0xff];
177 s += sq[(x>>16)&0xff];
178 s += sq[(x>>24)&0xff];
179 s += sq[(x>>32)&0xff];
180 s += sq[(x>>40)&0xff];
181 s += sq[(x>>48)&0xff];
182 s += sq[(x>>56)&0xff];
184 register uint32_t x=*(uint32_t*)pix;
186 s += sq[(x>>8)&0xff];
187 s += sq[(x>>16)&0xff];
188 s += sq[(x>>24)&0xff];
189 x=*(uint32_t*)(pix+4);
191 s += sq[(x>>8)&0xff];
192 s += sq[(x>>16)&0xff];
193 s += sq[(x>>24)&0xff];
198 pix += line_size - 16;
/**
 * Byte-swap w 32-bit words from src into dst (dst may equal src).
 * Main loop is unrolled by 8; a scalar tail loop handles the remainder.
 * Fix: restored the loop scaffolding and the tail loop lost in this copy.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for( ; i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
221 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
224 uint32_t *sq = squareTbl + 256;
227 for (i = 0; i < 8; i++) {
228 s += sq[pix1[0] - pix2[0]];
229 s += sq[pix1[1] - pix2[1]];
230 s += sq[pix1[2] - pix2[2]];
231 s += sq[pix1[3] - pix2[3]];
232 s += sq[pix1[4] - pix2[4]];
233 s += sq[pix1[5] - pix2[5]];
234 s += sq[pix1[6] - pix2[6]];
235 s += sq[pix1[7] - pix2[7]];
242 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
245 uint32_t *sq = squareTbl + 256;
248 for (i = 0; i < 16; i++) {
249 s += sq[pix1[ 0] - pix2[ 0]];
250 s += sq[pix1[ 1] - pix2[ 1]];
251 s += sq[pix1[ 2] - pix2[ 2]];
252 s += sq[pix1[ 3] - pix2[ 3]];
253 s += sq[pix1[ 4] - pix2[ 4]];
254 s += sq[pix1[ 5] - pix2[ 5]];
255 s += sq[pix1[ 6] - pix2[ 6]];
256 s += sq[pix1[ 7] - pix2[ 7]];
257 s += sq[pix1[ 8] - pix2[ 8]];
258 s += sq[pix1[ 9] - pix2[ 9]];
259 s += sq[pix1[10] - pix2[10]];
260 s += sq[pix1[11] - pix2[11]];
261 s += sq[pix1[12] - pix2[12]];
262 s += sq[pix1[13] - pix2[13]];
263 s += sq[pix1[14] - pix2[14]];
264 s += sq[pix1[15] - pix2[15]];
272 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
276 /* read the pixels */
278 block[0] = pixels[0];
279 block[1] = pixels[1];
280 block[2] = pixels[2];
281 block[3] = pixels[3];
282 block[4] = pixels[4];
283 block[5] = pixels[5];
284 block[6] = pixels[6];
285 block[7] = pixels[7];
291 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
292 const uint8_t *s2, int stride){
295 /* read the pixels */
297 block[0] = s1[0] - s2[0];
298 block[1] = s1[1] - s2[1];
299 block[2] = s1[2] - s2[2];
300 block[3] = s1[3] - s2[3];
301 block[4] = s1[4] - s2[4];
302 block[5] = s1[5] - s2[5];
303 block[6] = s1[6] - s2[6];
304 block[7] = s1[7] - s2[7];
312 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
316 uint8_t *cm = cropTbl + MAX_NEG_CROP;
318 /* read the pixels */
320 pixels[0] = cm[block[0]];
321 pixels[1] = cm[block[1]];
322 pixels[2] = cm[block[2]];
323 pixels[3] = cm[block[3]];
324 pixels[4] = cm[block[4]];
325 pixels[5] = cm[block[5]];
326 pixels[6] = cm[block[6]];
327 pixels[7] = cm[block[7]];
334 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
338 uint8_t *cm = cropTbl + MAX_NEG_CROP;
340 /* read the pixels */
342 pixels[0] = cm[pixels[0] + block[0]];
343 pixels[1] = cm[pixels[1] + block[1]];
344 pixels[2] = cm[pixels[2] + block[2]];
345 pixels[3] = cm[pixels[3] + block[3]];
346 pixels[4] = cm[pixels[4] + block[4]];
347 pixels[5] = cm[pixels[5] + block[5]];
348 pixels[6] = cm[pixels[6] + block[6]];
349 pixels[7] = cm[pixels[7] + block[7]];
/* PIXOP2, 64-bit-register variant: generates the put/avg pixel-copy and
 * half-pel averaging primitives (x2 = horizontal, y2 = vertical,
 * xy2 = diagonal) operating on 8 pixels per 64-bit word. The (a&b)+carry and
 * (a|b)-carry forms are the classic SWAR rounded/truncated byte averages.
 * NOTE(review): this copy is truncated — loop headers, pointer advances and
 * closing braces of the generated functions are missing; the surviving lines
 * are left byte-identical pending recovery of the full macro text. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
OP(*((uint64_t*)block), LD64(pixels));\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels  );\
const uint64_t b= LD64(pixels+1);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels  );\
const uint64_t b= LD64(pixels+1);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels          );\
const uint64_t b= LD64(pixels+line_size);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels          );\
const uint64_t b= LD64(pixels+line_size);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels  );\
const uint64_t b= LD64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0202020202020202ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
           + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
for(i=0; i<h; i+=2){\
uint64_t a= LD64(pixels  );\
uint64_t b= LD64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
  + (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
l0= (a&0x0303030303030303ULL)\
  + (b&0x0303030303030303ULL)\
  + 0x0202020202020202ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels  );\
const uint64_t b= LD64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
           + (b&0x0303030303030303ULL)\
           + 0x0101010101010101ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
           + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
for(i=0; i<h; i+=2){\
uint64_t a= LD64(pixels  );\
uint64_t b= LD64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
  + (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
l0= (a&0x0303030303030303ULL)\
  + (b&0x0303030303030303ULL)\
  + 0x0101010101010101ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
  + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* rounded SWAR average over 8 bytes: (a|b) - ((a^b)&0xFE..)>>1 */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
/* PIXOP2, 32-bit-register variant: generates the put/avg copy primitives for
 * widths 2/4/8/16, the _l2 (average of two sources) and _l4 (average of four
 * sources) helpers, and the half-pel x2/y2/xy2 variants built on them.
 * NOTE(review): this copy is truncated — loop headers, pointer advances and
 * closing braces of the generated functions are missing; the surviving lines
 * are left byte-identical pending recovery of the full macro text. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint16_t*)(block  )), LD16(pixels  ));\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint32_t*)(block  )), LD32(pixels  ));\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint32_t*)(block  )), LD32(pixels  ));\
OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_c(block, pixels, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1  ]);\
b= LD32(&src2[i*src_stride2  ]);\
OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1  ]);\
b= LD32(&src2[i*src_stride2  ]);\
OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1  ]);\
b= LD32(&src2[i*src_stride2  ]);\
OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD16(&src1[i*src_stride1  ]);\
b= LD16(&src2[i*src_stride2  ]);\
OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
uint32_t a, b, c, d, l0, l1, h0, h1;\
a= LD32(&src1[i*src_stride1]);\
b= LD32(&src2[i*src_stride2]);\
c= LD32(&src3[i*src_stride3]);\
d= LD32(&src4[i*src_stride4]);\
l0=  (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
  + ((b&0xFCFCFCFCUL)>>2);\
l1=  (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
  + ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
c= LD32(&src3[i*src_stride3+4]);\
d= LD32(&src4[i*src_stride4+4]);\
l0=  (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
  + ((b&0xFCFCFCFCUL)>>2);\
l1=  (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
  + ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
uint32_t a, b, c, d, l0, l1, h0, h1;\
a= LD32(&src1[i*src_stride1]);\
b= LD32(&src2[i*src_stride2]);\
c= LD32(&src3[i*src_stride3]);\
d= LD32(&src4[i*src_stride4]);\
l0=  (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
  + ((b&0xFCFCFCFCUL)>>2);\
l1=  (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
  + ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
c= LD32(&src3[i*src_stride3+4]);\
d= LD32(&src4[i*src_stride4+4]);\
l0=  (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
  + ((b&0xFCFCFCFCUL)>>2);\
l1=  (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
  + ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
int i, a0, b0, a1, b1;\
for(i=0; i<h; i+=2){\
block[0]= (a1+a0)>>2; /* FIXME non put */\
block[1]= (b1+b0)>>2;\
block[0]= (a1+a0)>>2;\
block[1]= (b1+b0)>>2;\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint32_t a= LD32(pixels  );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
           + ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels  );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
h1= ((a&0xFCFCFCFCUL)>>2)\
  + ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
  + ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint32_t a= LD32(pixels  );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
           + ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels  );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
h1= ((a&0xFCFCFCFCUL)>>2)\
  + ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
  + ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
pixels+=4-line_size*(h+1);\
block +=4-line_size*h;\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint32_t a= LD32(pixels  );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
           + ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels  );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
h1= ((a&0xFCFCFCFCUL)>>2)\
  + ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
  + ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
pixels+=4-line_size*(h+1);\
block +=4-line_size*h;\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* per-word rounded average / plain store, plugged into PIXOP2 as OP */
#define op_avg(a, b) a = rnd_avg32(a, b)
#define op_put(a, b) a = b
/* Rounded averages of 2 and 4 values.
 * Fix: macro arguments are now parenthesized so expression arguments
 * (e.g. avg2(x+1, y)) expand with the intended precedence; expansions of
 * plain-identifier arguments are unchanged. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/**
 * GMC (global motion compensation) with a single 1/16-pel motion vector:
 * bilinear interpolation of an 8-pixel-wide, h-row block.
 * @param x16,y16  fractional position in 1/16 pel units (weights A..D sum to 256)
 * @param rounder  rounding constant added before the >>8
 * Fix: restored the row loop and per-row pointer advances lost in this copy.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/* Generic GMC with a full affine motion field (per-pixel sub-pel positions).
 * NOTE(review): this copy appears truncated — the variable declarations, the
 * outer y loop, the per-pixel position/fraction updates and several closing
 * braces are missing; the surviving fragments are left byte-identical and
 * annotated below. clip() is a project helper (not visible in this chunk). */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
    /* s = number of fractional positions per pixel (sub-pel scale) */
    const int s= 1<<shift;
    for(x=0; x<8; x++){ //XXX FIXME optimize
        int src_x, src_y, frac_x, frac_y, index;
        /* sample fully inside the source: bilinear blend of 4 neighbours */
        if((unsigned)src_x < width){
            if((unsigned)src_y < height){
                index= src_x + src_y*stride;
                dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
                    + src[index +1]* frac_x )*(s-frac_y)
                    + ( src[index+stride ]*(s-frac_x)
                    + src[index+stride+1]* frac_x )* frac_y
                /* x inside, y outside: clamp y, interpolate horizontally only */
                index= src_x + clip(src_y, 0, height)*stride;
                dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
                    + src[index +1]* frac_x )*s
            /* x outside the source */
            if((unsigned)src_y < height){
                /* clamp x, interpolate vertically only */
                index= clip(src_x, 0, width) + src_y*stride;
                dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
                    + src[index+stride ]* frac_y )*s
                /* both coordinates outside: nearest clamped sample, no blend */
                index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                dst[y*stride + x]= src[index ];
/**
 * Thirds-pel MC, integer position (no interpolation): dispatch a plain block
 * copy for the supported widths. Unsupported widths are silently ignored
 * (matching the original dispatch).
 * Fix: restored the switch scaffolding lost in this copy.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/**
 * Thirds-pel interpolation, horizontal 1/3 position:
 * dst ~= (2*a + b)/3 computed as (683*(2*a + b + 1)) >> 11 (683 ~ 2048/3).
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel interpolation, horizontal 2/3 position:
 * dst ~= (a + 2*b)/3 computed as (683*(a + 2*b + 1)) >> 11.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel interpolation, vertical 1/3 position:
 * dst ~= (2*a + below)/3 computed as (683*(2*a + below + 1)) >> 11.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel interpolation, diagonal (1/3, 1/3) position: weighted blend of
 * the 4 neighbours (weights 4/3/3/2, sum 12), computed as
 * (2731*(4a + 3b + 3c + 2d + 6)) >> 15 with 2731 ~ 32768/12.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel interpolation, (1/3, 2/3) position: weights 3/2/4/3 (sum 12)
 * over the 4 neighbours, computed with the 2731 ~ 32768/12 reciprocal.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel interpolation, vertical 2/3 position:
 * dst ~= (a + 2*below)/3 computed as (683*(a + 2*below + 1)) >> 11.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel interpolation, (2/3, 1/3) position: weights 3/4/2/3 (sum 12)
 * over the 4 neighbours, computed with the 2731 ~ 32768/12 reciprocal.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel interpolation, diagonal (2/3, 2/3) position: weights 2/3/3/4
 * (sum 12) over the 4 neighbours, computed with the 2731 ~ 32768/12 reciprocal.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel MC with averaging, integer position: dispatch a block average
 * for the supported widths. Unsupported widths are silently ignored
 * (matching the original dispatch).
 * Fix: restored the switch scaffolding lost in this copy.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/**
 * Thirds-pel horizontal 1/3 interpolation, rounded-averaged into dst:
 * dst = (dst + interp + 1) >> 1 with interp = (683*(2a + b + 1)) >> 11.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel horizontal 2/3 interpolation, rounded-averaged into dst:
 * dst = (dst + interp + 1) >> 1 with interp = (683*(a + 2b + 1)) >> 11.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel vertical 1/3 interpolation, rounded-averaged into dst:
 * dst = (dst + interp + 1) >> 1 with interp = (683*(2a + below + 1)) >> 11.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel diagonal (1/3,1/3) interpolation (weights 4/3/3/2),
 * rounded-averaged into dst.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel (1/3,2/3) interpolation (weights 3/2/4/3),
 * rounded-averaged into dst.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel vertical 2/3 interpolation, rounded-averaged into dst:
 * dst = (dst + interp + 1) >> 1 with interp = (683*(a + 2*below + 1)) >> 11.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel (2/3,1/3) interpolation (weights 3/4/2/3),
 * rounded-averaged into dst.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirds-pel diagonal (2/3,2/3) interpolation (weights 2/3/3/4),
 * rounded-averaged into dst.
 * Fix: restored the loop scaffolding and per-row pointer advances.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Generates fixed-width wrappers around the generic thirds-pel functions
 * above (one wrapper per fractional position).
 * Fix: each expansion read "void put_tpel_pixels_mcXX_c(dst, ...);" — the
 * stray "void" turned the intended call into an ill-formed block-scope
 * declaration (an identifier list is only allowed in a function definition,
 * C99 6.7.5.3), so the wrappers never forwarded anything. The "void" is
 * removed so each wrapper actually calls the generic implementation.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H264_CHROMA_MC(OPNAME, OP): generate H.264 chroma motion-compensation
   functions for 2-, 4- and 8-pixel-wide blocks.  A,B,C,D are the bilinear
   weights derived from the 1/8-pel fractional offsets x,y (A+B+C+D == 64,
   enforced by the asserts); OP applies the final normalisation — put stores
   (v+32)>>6, avg additionally rounding-averages with the existing dst.
   Commentary is kept outside the macro body because every line below is a
   backslash continuation.
   NOTE(review): the loop variable, per-row loops and dst/src stride
   advancement between rows are not visible in this excerpt. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
const int A=(8-x)*(8-y);\
const int B=( x)*(8-y);\
const int C=(8-x)*( y);\
const int D=( x)*( y);\
assert(x<8 && y<8 && x>=0 && y>=0);\
OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
const int A=(8-x)*(8-y);\
const int B=( x)*(8-y);\
const int C=(8-x)*( y);\
const int D=( x)*( y);\
assert(x<8 && y<8 && x>=0 && y>=0);\
OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
const int A=(8-x)*(8-y);\
const int B=( x)*(8-y);\
const int C=(8-x)*( y);\
const int D=( x)*( y);\
assert(x<8 && y<8 && x>=0 && y>=0);\
OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* op_avg: normalise the 6-bit-scaled value ((b)+32)>>6 and rounding-average
   it with the existing dst pixel. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
/* op_put: plain normalise-and-store. */
#define op_put(a, b) a = (((b) + 32)>>6)
/* Instantiate put_ and avg_ flavours of h264_chroma_mc{2,4,8}_c. */
H264_CHROMA_MC(put_ , op_put)
H264_CHROMA_MC(avg_ , op_avg)
/* Copy an h-row, 4-byte-wide block with one 32-bit load/store per row
   (LD32/ST32 are the unaligned-access helpers).
   NOTE(review): the row loop and the dst/src stride advancement are not
   visible in this excerpt. */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
ST32(dst , LD32(src ));
/* Copy an h-row, 8-byte-wide block using two 32-bit load/store pairs per
   row (LD32/ST32 handle unaligned access).
   NOTE(review): the row loop and the dst/src stride advancement are not
   visible in this excerpt. */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
ST32(dst , LD32(src ));
ST32(dst+4 , LD32(src+4 ));
/* Copy an h-row, 16-byte-wide block using four 32-bit load/store pairs per
   row (LD32/ST32 handle unaligned access).
   NOTE(review): the row loop and the dst/src stride advancement are not
   visible in this excerpt. */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
ST32(dst , LD32(src ));
ST32(dst+4 , LD32(src+4 ));
ST32(dst+8 , LD32(src+8 ));
ST32(dst+12, LD32(src+12));
/* Copy an h-row, 17-byte-wide block (one extra column, as needed by the
   16-tap qpel helpers): four 32-bit load/store pairs cover bytes 0..15.
   NOTE(review): the copy of the 17th byte, the row loop and the stride
   advancement are not visible in this excerpt. */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
ST32(dst , LD32(src ));
ST32(dst+4 , LD32(src+4 ));
ST32(dst+8 , LD32(src+8 ));
ST32(dst+12, LD32(src+12));
/* Copy an h-row, 9-byte-wide block (one extra column, as needed by the
   8-tap qpel helpers): two 32-bit load/store pairs cover bytes 0..7.
   NOTE(review): the copy of the 9th byte, the row loop and the stride
   advancement are not visible in this excerpt. */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
ST32(dst , LD32(src ));
ST32(dst+4 , LD32(src+4 ));
/* QPEL_MC(r, OPNAME, RND, OP): generate the complete set of MPEG-4
   quarter-pel motion-compensation functions for 8x8 and 16x16 blocks.
   The h/v lowpass helpers apply the MPEG-4 8-tap half-pel filter
   (taps 20,-6,3,-1 on symmetric sample pairs) with edge mirroring — note
   how src[8] (resp. src[16]) is reused near the right/bottom block edge.
   The mcXY wrappers reach each quarter-pel position by combining a
   copy_block* edge copy, one or two lowpass passes, and pixel averaging via
   pixels*_l2/pixels*_l4; OP clips the scaled filter output through cm[]
   (cropTbl).  The ff_*_old_c variants average four planes with pixels*_l4
   instead of the two-stage l2 scheme — presumably kept for reference /
   compatibility with the older behaviour (TODO confirm against callers).
   Commentary is kept outside the macro because every line below is a
   backslash continuation.
   NOTE(review): several interior lines (loop heads, local declarations such
   as halfH/half buffers in some wrappers, closing braces, pointer
   advancement) are not visible in this excerpt. */
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
uint8_t *cm = cropTbl + MAX_NEG_CROP;\
OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
uint8_t *cm = cropTbl + MAX_NEG_CROP;\
const int src0= src[0*srcStride];\
const int src1= src[1*srcStride];\
const int src2= src[2*srcStride];\
const int src3= src[3*srcStride];\
const int src4= src[4*srcStride];\
const int src5= src[5*srcStride];\
const int src6= src[6*srcStride];\
const int src7= src[7*srcStride];\
const int src8= src[8*srcStride];\
OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
uint8_t *cm = cropTbl + MAX_NEG_CROP;\
OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
uint8_t *cm = cropTbl + MAX_NEG_CROP;\
const int src0= src[0*srcStride];\
const int src1= src[1*srcStride];\
const int src2= src[2*srcStride];\
const int src3= src[3*srcStride];\
const int src4= src[4*srcStride];\
const int src5= src[5*srcStride];\
const int src6= src[6*srcStride];\
const int src7= src[7*srcStride];\
const int src8= src[8*srcStride];\
const int src9= src[9*srcStride];\
const int src10= src[10*srcStride];\
const int src11= src[11*srcStride];\
const int src12= src[12*srcStride];\
const int src13= src[13*srcStride];\
const int src14= src[14*srcStride];\
const int src15= src[15*srcStride];\
const int src16= src[16*srcStride];\
OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels8_c(dst, src, stride, 8);\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
copy_block9(full, src, 16, stride, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfHV[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfHV[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels16_c(dst, src, stride, 16);\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
copy_block17(full, src, 24, stride, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfV[256];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfV[256];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfV[256];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfV[256];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[272];\
uint8_t halfHV[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[272];\
uint8_t halfHV[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfV[256];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfV[256];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[272];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel ops used to instantiate QPEL_MC: cm[] is the clipping (crop) table;
   "(v + 16)>>5" normalises the 32-scaled filter output with rounding, while
   the no_rnd variants bias by 15 to round down; op_avg additionally
   rounding-averages with the existing dst pixel. */
#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate put, put_no_rnd and avg flavours of the qpel function set. */
QPEL_MC(0, put_ , _ , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_ , _ , op_avg)
//QPEL_MC(1, avg_no_rnd , _ , op_avg)
#undef op_avg_no_rnd
#undef op_put_no_rnd
/* NOTE(review): matching "#undef op_avg" / "#undef op_put" are not visible
   in this excerpt — confirm they exist so the H.264 redefinitions below do
   not warn/clash. */
1808 #define H264_LOWPASS(OPNAME, OP, OP2) \
1809 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1811 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1815 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1816 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1817 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1818 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
1824 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1826 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1830 const int srcB= src[-2*srcStride];\
1831 const int srcA= src[-1*srcStride];\
1832 const int src0= src[0 *srcStride];\
1833 const int src1= src[1 *srcStride];\
1834 const int src2= src[2 *srcStride];\
1835 const int src3= src[3 *srcStride];\
1836 const int src4= src[4 *srcStride];\
1837 const int src5= src[5 *srcStride];\
1838 const int src6= src[6 *srcStride];\
1839 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1840 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1841 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1842 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1848 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1851 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1853 src -= 2*srcStride;\
1854 for(i=0; i<h+5; i++)\
1856 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1857 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1858 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1859 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1863 tmp -= tmpStride*(h+5-2);\
1866 const int tmpB= tmp[-2*tmpStride];\
1867 const int tmpA= tmp[-1*tmpStride];\
1868 const int tmp0= tmp[0 *tmpStride];\
1869 const int tmp1= tmp[1 *tmpStride];\
1870 const int tmp2= tmp[2 *tmpStride];\
1871 const int tmp3= tmp[3 *tmpStride];\
1872 const int tmp4= tmp[4 *tmpStride];\
1873 const int tmp5= tmp[5 *tmpStride];\
1874 const int tmp6= tmp[6 *tmpStride];\
1875 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1876 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1877 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1878 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1884 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1886 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1890 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1891 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1892 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1893 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1894 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1895 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1896 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1897 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1903 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1905 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1909 const int srcB= src[-2*srcStride];\
1910 const int srcA= src[-1*srcStride];\
1911 const int src0= src[0 *srcStride];\
1912 const int src1= src[1 *srcStride];\
1913 const int src2= src[2 *srcStride];\
1914 const int src3= src[3 *srcStride];\
1915 const int src4= src[4 *srcStride];\
1916 const int src5= src[5 *srcStride];\
1917 const int src6= src[6 *srcStride];\
1918 const int src7= src[7 *srcStride];\
1919 const int src8= src[8 *srcStride];\
1920 const int src9= src[9 *srcStride];\
1921 const int src10=src[10*srcStride];\
1922 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1923 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1924 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1925 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1926 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1927 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1928 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1929 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
/* Combined horizontal+vertical qpel lowpass: first a horizontal 6-tap pass */\
/* into the 16-bit tmp buffer (h+5 rows, to cover the vertical taps), then  */\
/* a vertical 6-tap pass over tmp, written out with OP2 (wider rounding,    */\
/* since tmp holds unclipped intermediate values).                          */\
/* NOTE(review): elided listing -- loop braces/counters not shown.          */\
1935 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1938 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1940 src -= 2*srcStride;\
1941 for(i=0; i<h+5; i++)\
1943 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1944 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1945 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1946 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1947 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1948 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1949 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1950 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
/* Rewind tmp to the first output row (2 rows of lead-in were produced).   */\
1954 tmp -= tmpStride*(h+5-2);\
1957 const int tmpB= tmp[-2*tmpStride];\
1958 const int tmpA= tmp[-1*tmpStride];\
1959 const int tmp0= tmp[0 *tmpStride];\
1960 const int tmp1= tmp[1 *tmpStride];\
1961 const int tmp2= tmp[2 *tmpStride];\
1962 const int tmp3= tmp[3 *tmpStride];\
1963 const int tmp4= tmp[4 *tmpStride];\
1964 const int tmp5= tmp[5 *tmpStride];\
1965 const int tmp6= tmp[6 *tmpStride];\
1966 const int tmp7= tmp[7 *tmpStride];\
1967 const int tmp8= tmp[8 *tmpStride];\
1968 const int tmp9= tmp[9 *tmpStride];\
1969 const int tmp10=tmp[10*tmpStride];\
1970 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1971 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1972 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1973 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1974 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
1975 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
1976 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
1977 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16-wide vertical lowpass assembled from four 8x8 calls (2x2 quadrants). */\
1983 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1984 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1985 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1986 src += 8*srcStride;\
1987 dst += 8*dstStride;\
1988 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1989 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
/* 16-wide horizontal lowpass assembled from four 8x8 calls (2x2 quadrants). */\
1992 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1993 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
1994 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1995 src += 8*srcStride;\
1996 dst += 8*dstStride;\
1997 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
1998 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
/* 16-wide combined h+v lowpass from four 8x8 calls; tmp advances in step. */\
2001 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2002 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2003 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2004 src += 8*srcStride;\
2005 tmp += 8*tmpStride;\
2006 dst += 8*dstStride;\
2007 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2008 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC(OPNAME, SIZE): instantiates the 16 quarter-pel motion-compensation
 * entry points mcXY for one op (put/avg) and one block size. X and Y in mcXY
 * are the quarter-pel phases (0..3) in x and y; mc00 is the full-pel copy.
 * Quarter positions are built by averaging (_l2) a half-pel filtered plane
 * with a neighbouring plane. "full" holds a copy with 2 extra rows above and
 * 3 below so the vertical 6-tap filter has context.
 * NOTE(review): elided listing -- the closing braces of each generated
 * function are not shown; comments only, code untouched. */
2011 #define H264_MC(OPNAME, SIZE) \
2012 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2013 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
/* (1,0): average of source and horizontal half-pel. */\
2016 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2017 uint8_t half[SIZE*SIZE];\
2018 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2019 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
/* (2,0): pure horizontal half-pel. */\
2022 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2023 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2026 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2027 uint8_t half[SIZE*SIZE];\
2028 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2029 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
/* (0,1): average of source and vertical half-pel (needs the padded copy). */\
2032 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2033 uint8_t full[SIZE*(SIZE+5)];\
2034 uint8_t * const full_mid= full + SIZE*2;\
2035 uint8_t half[SIZE*SIZE];\
2036 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2037 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2038 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
/* (0,2): pure vertical half-pel. */\
2041 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2042 uint8_t full[SIZE*(SIZE+5)];\
2043 uint8_t * const full_mid= full + SIZE*2;\
2044 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2045 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2048 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2049 uint8_t full[SIZE*(SIZE+5)];\
2050 uint8_t * const full_mid= full + SIZE*2;\
2051 uint8_t half[SIZE*SIZE];\
2052 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2053 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2054 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
/* Diagonal quarter positions: average of an H half-pel and a V half-pel.  */\
2057 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2058 uint8_t full[SIZE*(SIZE+5)];\
2059 uint8_t * const full_mid= full + SIZE*2;\
2060 uint8_t halfH[SIZE*SIZE];\
2061 uint8_t halfV[SIZE*SIZE];\
2062 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2063 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2064 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2065 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2068 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2069 uint8_t full[SIZE*(SIZE+5)];\
2070 uint8_t * const full_mid= full + SIZE*2;\
2071 uint8_t halfH[SIZE*SIZE];\
2072 uint8_t halfV[SIZE*SIZE];\
2073 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2074 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2075 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2076 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2079 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2080 uint8_t full[SIZE*(SIZE+5)];\
2081 uint8_t * const full_mid= full + SIZE*2;\
2082 uint8_t halfH[SIZE*SIZE];\
2083 uint8_t halfV[SIZE*SIZE];\
2084 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2085 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2086 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2087 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2090 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2091 uint8_t full[SIZE*(SIZE+5)];\
2092 uint8_t * const full_mid= full + SIZE*2;\
2093 uint8_t halfH[SIZE*SIZE];\
2094 uint8_t halfV[SIZE*SIZE];\
2095 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2096 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2097 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2098 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* (2,2): centre half-pel via the combined hv lowpass (16-bit tmp plane). */\
2101 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2102 int16_t tmp[SIZE*(SIZE+5)];\
2103 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
/* Remaining quarter positions: average of an H or V half-pel with the HV. */\
2106 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2107 int16_t tmp[SIZE*(SIZE+5)];\
2108 uint8_t halfH[SIZE*SIZE];\
2109 uint8_t halfHV[SIZE*SIZE];\
2110 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2111 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2112 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2115 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2116 int16_t tmp[SIZE*(SIZE+5)];\
2117 uint8_t halfH[SIZE*SIZE];\
2118 uint8_t halfHV[SIZE*SIZE];\
2119 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2120 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2121 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2124 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2125 uint8_t full[SIZE*(SIZE+5)];\
2126 uint8_t * const full_mid= full + SIZE*2;\
2127 int16_t tmp[SIZE*(SIZE+5)];\
2128 uint8_t halfV[SIZE*SIZE];\
2129 uint8_t halfHV[SIZE*SIZE];\
2130 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2131 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2132 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2133 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2136 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2137 uint8_t full[SIZE*(SIZE+5)];\
2138 uint8_t * const full_mid= full + SIZE*2;\
2139 int16_t tmp[SIZE*(SIZE+5)];\
2140 uint8_t halfV[SIZE*SIZE];\
2141 uint8_t halfHV[SIZE*SIZE];\
2142 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2143 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2144 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2145 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Rounding/clipping ops plugged into H264_LOWPASS:
 * op_put/op_avg: (b+16)>>5 clip -- used after a single 6-tap pass (gain 32);
 * op2_put/op2_avg: (b+512)>>10 clip -- used after two passes (gain 1024).
 * cropTbl clamps to [0,255]; avg variants round-average with the old dst. */
2148 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2149 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2150 #define op_put(a, b) a = cm[((b) + 16)>>5]
2151 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2152 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2154 H264_LOWPASS(put_ , op_put, op2_put)
2155 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* WMV2 mspel horizontal filter: 4-tap (-1,9,9,-1)/16 with rounding, one
 * 8-wide row per iteration, clipped through cropTbl.
 * NOTE(review): elided listing -- the row loop (h iterations) and the
 * dst/src stride advances are not shown here; comments only. */
2169 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2170 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2174 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2175 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2176 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2177 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2178 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2179 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2180 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2181 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* WMV2 mspel vertical filter: same 4-tap (-1,9,9,-1)/16 kernel applied down
 * one column of 8 outputs; w columns total.
 * NOTE(review): elided listing -- the column loop and pointer advances are
 * not shown here; comments only. */
2187 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2188 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2192 const int src_1= src[ -srcStride];
2193 const int src0 = src[0 ];
2194 const int src1 = src[ srcStride];
2195 const int src2 = src[2*srcStride];
2196 const int src3 = src[3*srcStride];
2197 const int src4 = src[4*srcStride];
2198 const int src5 = src[5*srcStride];
2199 const int src6 = src[6*srcStride];
2200 const int src7 = src[7*srcStride];
2201 const int src8 = src[8*srcStride];
2202 const int src9 = src[9*srcStride];
2203 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2204 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2205 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2206 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2207 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2208 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2209 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2210 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation entry points (8x8). mcXY naming follows
 * the qpel convention: X/Y are the sub-pel phases.
 * NOTE(review): elided listing -- local half/halfH/halfV/halfHV buffer
 * declarations and closing braces are not shown; comments only. */
2216 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2217 put_pixels8_c(dst, src, stride, 8);
/* (1,0): average of source and horizontal mspel filter. */
2220 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2222 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2223 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2226 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2227 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2230 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2232 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2233 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2236 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2237 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* Diagonal cases: H filter over 11 rows (one above, two below) feeds a
 * V filter; result is averaged with the plain V-filtered plane. */
2240 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2244 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2245 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2246 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2247 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2249 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2253 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2254 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2255 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2256 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2258 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2260 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2261 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/**
 * Sum of absolute differences between two 16x16 pixel blocks.
 * @param pix1 first block
 * @param pix2 second block
 * @param line_size row stride in bytes, shared by both blocks
 * @return the 16x16 SAD
 */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int row, col;
    int sum = 0;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of a 16x16 block against the horizontal half-pel interpolation of pix2.
 * The reference sample at column j is the rounded average of pix2[j] and
 * pix2[j+1] (so each row of pix2 must be 17 samples wide).
 */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int row, col;
    int sum = 0;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++) {
            /* avg2(a, b) == (a + b + 1) >> 1, inlined */
            int ref = (pix2[col] + pix2[col + 1] + 1) >> 1;
            sum += abs(pix1[col] - ref);
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of a 16x16 block against the vertical half-pel interpolation of pix2.
 * The reference sample is the rounded average of a pixel and the one
 * directly below it (pix2 must provide 17 rows).
 */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int row, col;
    int sum = 0;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++) {
            /* avg2(a, b) == (a + b + 1) >> 1, inlined */
            int ref = (pix2[col] + below[col] + 1) >> 1;
            sum += abs(pix1[col] - ref);
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * SAD of a 16x16 block against the diagonal half-pel interpolation of pix2:
 * each reference sample is the rounded average of a 2x2 neighbourhood
 * (pix2 must provide 17 rows of 17 samples).
 */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int row, col;
    int sum = 0;

    for (row = 0; row < 16; row++) {
        for (col = 0; col < 16; col++) {
            /* avg4(a, b, c, d) == (a + b + c + d + 2) >> 2, inlined */
            int ref = (pix2[col] + pix2[col + 1] +
                       below[col] + below[col + 1] + 2) >> 2;
            sum += abs(pix1[col] - ref);
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * Sum of absolute differences between two 8x8 pixel blocks.
 * @param line_size row stride in bytes, shared by both blocks
 */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int row, col;
    int sum = 0;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            sum += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of an 8x8 block against the horizontal half-pel interpolation of pix2
 * (rows of pix2 must be 9 samples wide).
 */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int row, col;
    int sum = 0;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            /* avg2(a, b) == (a + b + 1) >> 1, inlined */
            int ref = (pix2[col] + pix2[col + 1] + 1) >> 1;
            sum += abs(pix1[col] - ref);
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/**
 * SAD of an 8x8 block against the vertical half-pel interpolation of pix2
 * (pix2 must provide 9 rows).
 */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int row, col;
    int sum = 0;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            /* avg2(a, b) == (a + b + 1) >> 1, inlined */
            int ref = (pix2[col] + below[col] + 1) >> 1;
            sum += abs(pix1[col] - ref);
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/**
 * SAD of an 8x8 block against the diagonal half-pel interpolation of pix2
 * (pix2 must provide 9 rows of 9 samples).
 */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    uint8_t *below = pix2 + line_size;
    int row, col;
    int sum = 0;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++) {
            /* avg4(a, b, c, d) == (a + b + c + d + 2) >> 2, inlined */
            int ref = (pix2[col] + pix2[col + 1] +
                       below[col] + below[col + 1] + 2) >> 2;
            sum += abs(pix1[col] - ref);
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
/* DSPContext-signature wrapper: the context pointer is unused; forwards to
 * the plain 16x16 SAD helper. */
2465 static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
2466 return pix_abs16x16_c(a,b,stride);
/* DSPContext-signature wrapper: the context pointer is unused; forwards to
 * the plain 8x8 SAD helper. */
2469 static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
2470 return pix_abs8x8_c(a,b,stride);
2474 * Permutes an 8x8 block of DCT coefficients.
2475 * @param block the block which will be permuted according to the given permutation vector
2476 * @param permutation the permutation vector
2477 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
2478 * @param scantable the scantable used; this is only used to speed the permutation up, the block is not
2479 * (inverse) permuted to scantable order!
/* Two-phase in-place permutation: first copy the coefficients touched by the
 * scantable (up to 'last') into a temp array, then write each one back at
 * its permuted position.
 * NOTE(review): elided listing -- the temp[] declaration, the copy into
 * temp, and the zeroing of vacated slots are not shown; comments only. */
2481 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2487 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2489 for(i=0; i<=last; i++){
2490 const int j= scantable[i];
2495 for(i=0; i<=last; i++){
2496 const int j= scantable[i];
2497 const int perm_j= permutation[j];
2498 block[perm_j]= temp[j];
2503 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* Zero the six 8x8 DCT coefficient blocks of a macroblock in one memset. */
2505 static void clear_blocks_c(DCTELEM *blocks)
2507 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/**
 * Byte-wise in-place addition: dst[i] += src[i] for 0 <= i < w.
 * Arithmetic wraps modulo 256 (uint8_t).
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for (i = 0; i < w; i++)
        dst[i] = (uint8_t)(dst[i] + src[i]);
}
/**
 * Byte-wise difference: dst[i] = src1[i] - src2[i] for 0 <= i < w.
 * Arithmetic wraps modulo 256 (uint8_t).
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for (i = 0; i < w; i++)
        dst[i] = (uint8_t)(src1[i] - src2[i]);
}
/* HuffYUV median-predictor subtraction: each sample is predicted as the
 * median of left, above, and (left + above - above-left), and the residual
 * is stored. NOTE(review): elided listing -- only the prediction line of
 * the loop body survives; loop, residual store, and the left/left_top
 * write-back are not shown. */
2542 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
2550 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Hadamard butterfly helpers. BUTTERFLY2 writes sum/difference of i1,i2 to
 * o1,o2; BUTTERFLY1 does the same in place on x,y; BUTTERFLYA returns
 * |x+y| + |x-y| for the final accumulation stage.
 * NOTE(review): elided listing -- the BUTTERFLY2/BUTTERFLY1 macro bodies
 * are not shown here. */
2560 #define BUTTERFLY2(o1,o2,i1,i2) \
2564 #define BUTTERFLY1(x,y) \
2573 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/* 8x8 SATD: Hadamard transform of the src-dst difference, summing absolute
 * transform coefficients. Three butterfly stages per row, then three per
 * column, with BUTTERFLYA folding the last column stage into the sum.
 * NOTE(review): elided listing -- temp[] declaration, loop headers, the
 * sum accumulation, and the return are not shown; comments only. */
2575 static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
2581 //FIXME try pointer walks
/* Row pass: stage 1 on adjacent pairs of the pixel differences... */
2582 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2583 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2584 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2585 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
/* ...stage 2 at distance 2... */
2587 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2588 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2589 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2590 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
/* ...stage 3 at distance 4. */
2592 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2593 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2594 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2595 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column pass: stages 1 and 2... */
2599 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2600 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2601 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2602 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2604 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2605 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2606 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2607 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* ...stage 3 fused with the absolute-value accumulation. */
2610 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2611 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2612 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2613 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2619 printf("MAX:%d\n", maxi);
/* 8x8 Hadamard "energy" of src minus a constant mean -- same butterfly
 * network as hadamard8_diff_c but against a scalar instead of a second
 * block. NOTE(review): elided listing -- declarations, loop headers, the
 * accumulation, and the return are not shown; comments only. */
2625 static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
2629 //FIXME OOOPS ignore 0 term instead of mean mess
2631 //FIXME try pointer walks
2632 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
2633 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
2634 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
2635 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);
2637 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2638 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2639 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2640 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2642 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2643 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2644 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2645 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2649 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2650 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2651 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2652 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2654 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2655 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2656 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2657 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2660 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2661 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2662 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2663 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Comparison function: forward-DCT the 8x8 pixel difference and sum the
 * absolute coefficients. The __align8 uint64_t array guarantees 8-byte
 * alignment for the DCTELEM view. NOTE(review): elided listing -- the fdct
 * call, the coefficient summation, and the return are not shown. */
2669 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2670 MpegEncContext * const s= (MpegEncContext *)c;
2671 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2672 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2675 s->dsp.diff_pixels(temp, src1, src2, stride);
2684 void simple_idct(DCTELEM *block); //FIXME
/* Comparison function: squared error introduced by the quantize ->
 * dequantize -> idct round trip on the DCT of the pixel difference.
 * bak keeps the pre-quantization coefficients for the comparison.
 * NOTE(review): elided listing -- the fdct call, loop header, and return
 * are not shown; comments only. */
2686 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2687 MpegEncContext * const s= (MpegEncContext *)c;
2688 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2689 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2690 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
2695 s->dsp.diff_pixels(temp, src1, src2, stride);
2697 memcpy(bak, temp, 64*sizeof(DCTELEM));
2699 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2700 s->dct_unquantize(s, temp, 0, s->qscale);
2701 simple_idct(temp); //FIXME
/* Accumulate squared coefficient error against the unquantized copy. */
2704 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion comparison: estimates bits for the quantized block via
 * the AC VLC length tables, reconstructs it, and returns
 * distortion + lambda*bits. ("distoration" is a historical typo for
 * "distortion" kept as-is since only comments may change here.)
 * NOTE(review): elided listing -- fdct call, start_i selection, the
 * run/level loop structure and escapes are partially missing. */
2709 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2710 MpegEncContext * const s= (MpegEncContext *)c;
2711 const uint8_t *scantable= s->intra_scantable.permutated;
2712 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2713 uint64_t __align8 aligned_bak[stride];
2714 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2715 uint8_t * const bak= (uint8_t*)aligned_bak;
2716 int i, last, run, bits, level, distoration, start_i;
2717 const int esc_length= s->ac_esc_length;
2719 uint8_t * last_length;
/* Save the 8x8 reference block (two 32-bit copies per row). */
2722 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2723 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2726 s->dsp.diff_pixels(temp, src1, src2, stride);
2728 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* Pick intra or inter VLC tables; intra also pays for the luma DC coeff. */
2734 length = s->intra_ac_vlc_length;
2735 last_length= s->intra_ac_vlc_last_length;
2736 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2739 length = s->inter_ac_vlc_length;
2740 last_length= s->inter_ac_vlc_last_length;
/* Count bits for all coefficients before the last one... */
2745 for(i=start_i; i<last; i++){
2746 int j= scantable[i];
2751 if((level&(~127)) == 0){
2752 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* ...then the last coefficient with the "last" VLC table. */
2761 level= temp[i] + 64;
2765 if((level&(~127)) == 0){
2766 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Reconstruct and measure SSE against the original block. */
2773 s->dct_unquantize(s, temp, 0, s->qscale);
2776 s->dsp.idct_add(bak, stride, temp);
2778 distoration= s->dsp.sse[1](NULL, bak, src1, stride);
2780 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Rate-only comparison: same bit-counting path as rd8x8_c but without the
 * reconstruction/distortion step -- returns the estimated bit count.
 * NOTE(review): elided listing -- fdct call, start_i selection, escape
 * handling, and the return are not shown; comments only. */
2783 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2784 MpegEncContext * const s= (MpegEncContext *)c;
2785 const uint8_t *scantable= s->intra_scantable.permutated;
2786 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2787 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2788 int i, last, run, bits, level, start_i;
2789 const int esc_length= s->ac_esc_length;
2791 uint8_t * last_length;
2793 s->dsp.diff_pixels(temp, src1, src2, stride);
2795 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2801 length = s->intra_ac_vlc_length;
2802 last_length= s->intra_ac_vlc_last_length;
2803 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2806 length = s->inter_ac_vlc_length;
2807 last_length= s->inter_ac_vlc_last_length;
2812 for(i=start_i; i<last; i++){
2813 int j= scantable[i];
2818 if((level&(~127)) == 0){
2819 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2828 level= temp[i] + 64;
2832 if((level&(~127)) == 0){
2833 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* WARPER88_1616 (sic: historical spelling of "wrapper") builds a 16x16
 * comparison function from an 8x8 one by summing the four 8x8 quadrants. */
2842 WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
2843 WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
2844 WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
2845 WARPER88_1616(rd8x8_c, rd16x16_c)
2846 WARPER88_1616(bit8x8_c, bit16x16_c)
2848 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* Reference-IDCT glue: run the IDCT (call elided in this listing), then
 * store/add the clamped result into the pixel plane. */
2850 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2853 put_pixels_clamped_c(block, dest, line_size);
2855 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2858 add_pixels_clamped_c(block, dest, line_size);
2861 /* init static data */
/* Builds the global lookup tables: cropTbl (clamp-to-[0,255] with
 * MAX_NEG_CROP guard bands on both sides), squareTbl ((i-256)^2 for SSE
 * computations), and the MMX quantizer's inverse zigzag table. */
2862 void dsputil_static_init(void)
2866 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
2867 for(i=0;i<MAX_NEG_CROP;i++) {
2869 cropTbl[i + MAX_NEG_CROP + 256] = 255;
2872 for(i=0;i<512;i++) {
2873 squareTbl[i] = (i - 256) * (i - 256);
2876 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2880 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2884 #ifdef CONFIG_ENCODERS
2885 if(avctx->dct_algo==FF_DCT_FASTINT) {
2886 c->fdct = fdct_ifast;
2887 c->fdct248 = fdct_ifast248;
2889 else if(avctx->dct_algo==FF_DCT_FAAN) {
2890 c->fdct = ff_faandct;
2891 c->fdct248 = ff_faandct248;
2894 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2895 c->fdct248 = ff_fdct248_islow;
2897 #endif //CONFIG_ENCODERS
2899 if(avctx->idct_algo==FF_IDCT_INT){
2900 c->idct_put= ff_jref_idct_put;
2901 c->idct_add= ff_jref_idct_add;
2902 c->idct = j_rev_dct;
2903 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2904 }else{ //accurate/default
2905 c->idct_put= simple_idct_put;
2906 c->idct_add= simple_idct_add;
2907 c->idct = simple_idct;
2908 c->idct_permutation_type= FF_NO_IDCT_PERM;
2911 c->get_pixels = get_pixels_c;
2912 c->diff_pixels = diff_pixels_c;
2913 c->put_pixels_clamped = put_pixels_clamped_c;
2914 c->add_pixels_clamped = add_pixels_clamped_c;
2917 c->clear_blocks = clear_blocks_c;
2918 c->pix_sum = pix_sum_c;
2919 c->pix_norm1 = pix_norm1_c;
2923 /* TODO [0] 16 [1] 8 */
2924 c->pix_abs16x16 = pix_abs16x16_c;
2925 c->pix_abs16x16_x2 = pix_abs16x16_x2_c;
2926 c->pix_abs16x16_y2 = pix_abs16x16_y2_c;
2927 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
2928 c->pix_abs8x8 = pix_abs8x8_c;
2929 c->pix_abs8x8_x2 = pix_abs8x8_x2_c;
2930 c->pix_abs8x8_y2 = pix_abs8x8_y2_c;
/* NOTE(review): this span is the interior of dsputil_init(); the function's
 * signature and closing brace are outside this view, and several lines
 * (loop headers, break statements, #ifdef guards, an #undef) appear to be
 * elided from this listing — confirm against the full file. */
2931 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
/* Fill one row of a half-pel motion-comp table: [0]=copy, [1]=x half-pel,
 * [2]=y half-pel, [3]=xy half-pel, for block size NUM. */
2933 #define dspfunc(PFX, IDX, NUM) \
2934 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
2935 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
2936 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
2937 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
/* IDX 0 = 16x16 blocks, IDX 1 = 8x8 blocks. */
2939 dspfunc(put, 0, 16);
2940 dspfunc(put_no_rnd, 0, 16);
2942 dspfunc(put_no_rnd, 1, 8);
2946 dspfunc(avg, 0, 16);
2947 dspfunc(avg_no_rnd, 0, 16);
2949 dspfunc(avg_no_rnd, 1, 8);
/* tpel (presumably third-pel, e.g. for SVQ3 — TODO confirm) tables.
 * Index encoding follows mcXY: index = X + 4*Y for fractions X,Y in 0..2;
 * indices 3, 7 and 11..15 are intentionally left unset here. */
2954 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2955 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2956 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2957 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2958 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2959 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2960 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2961 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2962 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2964 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2965 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2966 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2967 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2968 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2969 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2970 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2971 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2972 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* Quarter-pel table: all 16 mcXY positions, index = X + 4*Y.
 * NOTE(review): dspfunc is redefined here; the full file presumably has an
 * "#undef dspfunc" between the two definitions (elided in this listing)
 * — confirm, otherwise this triggers a macro-redefinition warning. */
2974 #define dspfunc(PFX, IDX, NUM) \
2975 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2976 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2977 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2978 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2979 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2980 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2981 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2982 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2983 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2984 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2985 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2986 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2987 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2988 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2989 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2990 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2992 dspfunc(put_qpel, 0, 16);
2993 dspfunc(put_no_rnd_qpel, 0, 16);
2995 dspfunc(avg_qpel, 0, 16);
/* avg_no_rnd qpel variants deliberately disabled (no C implementations). */
2996 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2998 dspfunc(put_qpel, 1, 8);
2999 dspfunc(put_no_rnd_qpel, 1, 8);
3001 dspfunc(avg_qpel, 1, 8);
3002 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* H.264 luma qpel: IDX 0/1/2 = 16x16, 8x8, 4x4 block sizes. */
3004 dspfunc(put_h264_qpel, 0, 16);
3005 dspfunc(put_h264_qpel, 1, 8);
3006 dspfunc(put_h264_qpel, 2, 4);
3007 dspfunc(avg_h264_qpel, 0, 16);
3008 dspfunc(avg_h264_qpel, 1, 8);
3009 dspfunc(avg_h264_qpel, 2, 4);
/* H.264 chroma MC: [0]=8-wide, [1]=4-wide, [2]=2-wide. */
3012 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3013 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3014 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3015 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3016 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3017 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
/* mspel (WMV2/MSMPEG4-style MC — TODO confirm): only the 8 positions with
 * vertical fraction 0 or 2 are provided (mcX0 and mcX2). */
3019 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3020 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3021 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3022 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3023 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3024 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3025 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3026 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* Block-comparison / rate-distortion metrics: [0]=16x16, [1]=8x8. */
3028 c->hadamard8_diff[0]= hadamard8_diff16_c;
3029 c->hadamard8_diff[1]= hadamard8_diff_c;
3030 c->hadamard8_abs = hadamard8_abs_c;
3032 c->dct_sad[0]= dct_sad16x16_c;
3033 c->dct_sad[1]= dct_sad8x8_c;
3035 c->sad[0]= sad16x16_c;
3036 c->sad[1]= sad8x8_c;
3038 c->quant_psnr[0]= quant_psnr16x16_c;
3039 c->quant_psnr[1]= quant_psnr8x8_c;
/* NOTE(review): rd[1] is not assigned in this listing (numbering jumps
 * 3041 -> 3044); presumably rd8x8_c is set on an elided line — confirm. */
3041 c->rd[0]= rd16x16_c;
3044 c->bit[0]= bit16x16_c;
3045 c->bit[1]= bit8x8_c;
/* Misc byte-wise helpers (HuffYUV prediction, endian swap). */
3047 c->add_bytes= add_bytes_c;
3048 c->diff_bytes= diff_bytes_c;
3049 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3050 c->bswap_buf= bswap_buf;
/* Architecture-specific overrides of the C defaults above; in the full file
 * each call is presumably wrapped in its HAVE_*/ /* #ifdef (guards elided
 * from this listing) — confirm. */
3053 dsputil_init_mmx(c, avctx);
3056 dsputil_init_armv4l(c, avctx);
3059 dsputil_init_mlib(c, avctx);
3062 dsputil_init_alpha(c, avctx);
3065 dsputil_init_ppc(c, avctx);
3068 dsputil_init_mmi(c, avctx);
3071 dsputil_init_sh4(c,avctx);
/* Build the coefficient permutation matching whichever IDCT implementation
 * the arch init selected above. The per-case for(i=0;i<64;i++) loop headers
 * and break statements are elided from this listing. */
3074 switch(c->idct_permutation_type){
3075 case FF_NO_IDCT_PERM:
/* Identity: coefficients stay in natural order. */
3077 c->idct_permutation[i]= i;
3079 case FF_LIBMPEG2_IDCT_PERM:
/* Swap bits 1-2 with bit 0 of the column index (libmpeg2 MMX layout). */
3081 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3083 case FF_SIMPLE_IDCT_PERM:
/* Table-driven permutation for the simple_idct MMX variant. */
3085 c->idct_permutation[i]= simple_mmx_permutation[i];
3087 case FF_TRANSPOSE_IDCT_PERM:
/* 8x8 transpose: swap the 3-bit row and column indices. */
3089 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3092 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");