3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
/* Clipping LUT: index with cropTbl[MAX_NEG_CROP + x]; out-of-range x clamps
 * to 0/255.  NOTE(review): filled at runtime by init code not visible in
 * this chunk. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square LUT: presumably squareTbl[256 + x] == x*x for x in [-256,255]
 * (used below as sq = squareTbl + 256) — filled by init code elsewhere. */
uint32_t squareTbl[512];
/* Standard zigzag scan order: scan position -> raster index inside an 8x8
 * block.  (Closing brace restored — it was lost in this chunk.) */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields.
   (Closing brace restored — it was lost in this chunk.) */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): filled at runtime (init code not visible in this chunk);
 * __align8 is a project alignment macro, not standard C. */
uint16_t __align8 inv_zigzag_direct16[64];
/* Alternate horizontal scan order (interlaced coding).
 * (Closing brace restored — it was lost in this chunk.) */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (interlaced coding).
 * (Closing brace restored — it was lost in this chunk.) */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * (reciprocal table for division-free quantization; entry b is
 * ceil(2^32/b)).  Closing brace restored — it was lost in this chunk. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for the simple_idct_mmx (maps natural coefficient
 * order to the order that IDCT expects).  Closing brace restored — it
 * was lost in this chunk. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/**
 * Sum of all 256 pixels of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size bytes per row of the source image
 * @return sum of the pixel values (max 16*16*255, fits in int)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16; /* step to the start of the next row */
    }
    return s;
}
156 static int pix_norm1_c(uint8_t * pix, int line_size)
159 uint32_t *sq = squareTbl + 256;
162 for (i = 0; i < 16; i++) {
163 for (j = 0; j < 16; j += 8) {
174 #if LONG_MAX > 2147483647
175 register uint64_t x=*(uint64_t*)pix;
177 s += sq[(x>>8)&0xff];
178 s += sq[(x>>16)&0xff];
179 s += sq[(x>>24)&0xff];
180 s += sq[(x>>32)&0xff];
181 s += sq[(x>>40)&0xff];
182 s += sq[(x>>48)&0xff];
183 s += sq[(x>>56)&0xff];
185 register uint32_t x=*(uint32_t*)pix;
187 s += sq[(x>>8)&0xff];
188 s += sq[(x>>16)&0xff];
189 s += sq[(x>>24)&0xff];
190 x=*(uint32_t*)(pix+4);
192 s += sq[(x>>8)&0xff];
193 s += sq[(x>>16)&0xff];
194 s += sq[(x>>24)&0xff];
199 pix += line_size - 16;
/**
 * Byte-swap w 32-bit words from src into dst (dst may equal src).
 * Unrolled by 8 for speed, with a scalar tail loop for w not a
 * multiple of 8.  bswap_32 is a project macro (bswap.h).
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w;i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
222 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
225 uint32_t *sq = squareTbl + 256;
228 for (i = 0; i < h; i++) {
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
233 s += sq[pix1[4] - pix2[4]];
234 s += sq[pix1[5] - pix2[5]];
235 s += sq[pix1[6] - pix2[6]];
236 s += sq[pix1[7] - pix2[7]];
243 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
246 uint32_t *sq = squareTbl + 256;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[ 0] - pix2[ 0]];
251 s += sq[pix1[ 1] - pix2[ 1]];
252 s += sq[pix1[ 2] - pix2[ 2]];
253 s += sq[pix1[ 3] - pix2[ 3]];
254 s += sq[pix1[ 4] - pix2[ 4]];
255 s += sq[pix1[ 5] - pix2[ 5]];
256 s += sq[pix1[ 6] - pix2[ 6]];
257 s += sq[pix1[ 7] - pix2[ 7]];
258 s += sq[pix1[ 8] - pix2[ 8]];
259 s += sq[pix1[ 9] - pix2[ 9]];
260 s += sq[pix1[10] - pix2[10]];
261 s += sq[pix1[11] - pix2[11]];
262 s += sq[pix1[12] - pix2[12]];
263 s += sq[pix1[13] - pix2[13]];
264 s += sq[pix1[14] - pix2[14]];
265 s += sq[pix1[15] - pix2[15]];
273 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
277 /* read the pixels */
279 block[0] = pixels[0];
280 block[1] = pixels[1];
281 block[2] = pixels[2];
282 block[3] = pixels[3];
283 block[4] = pixels[4];
284 block[5] = pixels[5];
285 block[6] = pixels[6];
286 block[7] = pixels[7];
292 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
293 const uint8_t *s2, int stride){
296 /* read the pixels */
298 block[0] = s1[0] - s2[0];
299 block[1] = s1[1] - s2[1];
300 block[2] = s1[2] - s2[2];
301 block[3] = s1[3] - s2[3];
302 block[4] = s1[4] - s2[4];
303 block[5] = s1[5] - s2[5];
304 block[6] = s1[6] - s2[6];
305 block[7] = s1[7] - s2[7];
313 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
317 uint8_t *cm = cropTbl + MAX_NEG_CROP;
319 /* read the pixels */
321 pixels[0] = cm[block[0]];
322 pixels[1] = cm[block[1]];
323 pixels[2] = cm[block[2]];
324 pixels[3] = cm[block[3]];
325 pixels[4] = cm[block[4]];
326 pixels[5] = cm[block[5]];
327 pixels[6] = cm[block[6]];
328 pixels[7] = cm[block[7]];
335 static void put_signed_pixels_clamped_c(const DCTELEM *block,
336 uint8_t *restrict pixels,
341 for (i = 0; i < 8; i++) {
342 for (j = 0; j < 8; j++) {
345 else if (*block > 127)
348 *pixels = (uint8_t)(*block + 128);
352 pixels += (line_size - 8);
356 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
360 uint8_t *cm = cropTbl + MAX_NEG_CROP;
362 /* read the pixels */
364 pixels[0] = cm[pixels[0] + block[0]];
365 pixels[1] = cm[pixels[1] + block[1]];
366 pixels[2] = cm[pixels[2] + block[2]];
367 pixels[3] = cm[pixels[3] + block[3]];
368 pixels[4] = cm[pixels[4] + block[4]];
369 pixels[5] = cm[pixels[5] + block[5]];
370 pixels[6] = cm[pixels[6] + block[6]];
371 pixels[7] = cm[pixels[7] + block[7]];
/* 64-bit-word PIXOP2: generates the put/avg pixel-copy and half-pel
 * averaging primitives (x2/y2/xy2, rounding and no-rounding variants)
 * 8 bytes at a time.  OPNAME selects the function-name prefix, OP the
 * store operation (op_put / op_avg defined below).
 * NOTE(review): this chunk shows only fragments of the macro body. */
#define PIXOP2(OPNAME, OP) \
379 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
383 OP(*((uint64_t*)block), LD64(pixels));\
389 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
393 const uint64_t a= LD64(pixels );\
394 const uint64_t b= LD64(pixels+1);\
395 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
401 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
405 const uint64_t a= LD64(pixels );\
406 const uint64_t b= LD64(pixels+1);\
407 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
413 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
417 const uint64_t a= LD64(pixels );\
418 const uint64_t b= LD64(pixels+line_size);\
419 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
425 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
429 const uint64_t a= LD64(pixels );\
430 const uint64_t b= LD64(pixels+line_size);\
431 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
437 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
440 const uint64_t a= LD64(pixels );\
441 const uint64_t b= LD64(pixels+1);\
442 uint64_t l0= (a&0x0303030303030303ULL)\
443 + (b&0x0303030303030303ULL)\
444 + 0x0202020202020202ULL;\
445 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
446 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
450 for(i=0; i<h; i+=2){\
451 uint64_t a= LD64(pixels );\
452 uint64_t b= LD64(pixels+1);\
453 l1= (a&0x0303030303030303ULL)\
454 + (b&0x0303030303030303ULL);\
455 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
456 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
457 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
462 l0= (a&0x0303030303030303ULL)\
463 + (b&0x0303030303030303ULL)\
464 + 0x0202020202020202ULL;\
465 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
466 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
467 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
473 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
476 const uint64_t a= LD64(pixels );\
477 const uint64_t b= LD64(pixels+1);\
478 uint64_t l0= (a&0x0303030303030303ULL)\
479 + (b&0x0303030303030303ULL)\
480 + 0x0101010101010101ULL;\
481 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
482 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
486 for(i=0; i<h; i+=2){\
487 uint64_t a= LD64(pixels );\
488 uint64_t b= LD64(pixels+1);\
489 l1= (a&0x0303030303030303ULL)\
490 + (b&0x0303030303030303ULL);\
491 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
492 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
493 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
498 l0= (a&0x0303030303030303ULL)\
499 + (b&0x0303030303030303ULL)\
500 + 0x0101010101010101ULL;\
501 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
502 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
503 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
509 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
510 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
511 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
512 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
513 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
514 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
515 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* Rounding-down byte-wise average of two 8-byte words without carry
 * between lanes: (a|b) - ((a^b)&0xFE..)>>1 == (a+b+1)>>1 per byte. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
/* end of the 64-bit register variant */
#else // 64 bit variant
/* 32-bit-word PIXOP2: same generated primitive set as the 64-bit
 * variant above, working 4 bytes at a time, plus the 2/4-pixel-wide
 * helpers and the *_l2/*_l4 multi-source averagers.
 * NOTE(review): this chunk shows only fragments of the macro body. */
#define PIXOP2(OPNAME, OP) \
521 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
524 OP(*((uint16_t*)(block )), LD16(pixels ));\
529 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
532 OP(*((uint32_t*)(block )), LD32(pixels ));\
537 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
540 OP(*((uint32_t*)(block )), LD32(pixels ));\
541 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
546 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
547 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
550 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
551 int src_stride1, int src_stride2, int h){\
555 a= LD32(&src1[i*src_stride1 ]);\
556 b= LD32(&src2[i*src_stride2 ]);\
557 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
558 a= LD32(&src1[i*src_stride1+4]);\
559 b= LD32(&src2[i*src_stride2+4]);\
560 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
564 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
565 int src_stride1, int src_stride2, int h){\
569 a= LD32(&src1[i*src_stride1 ]);\
570 b= LD32(&src2[i*src_stride2 ]);\
571 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
572 a= LD32(&src1[i*src_stride1+4]);\
573 b= LD32(&src2[i*src_stride2+4]);\
574 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
578 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
579 int src_stride1, int src_stride2, int h){\
583 a= LD32(&src1[i*src_stride1 ]);\
584 b= LD32(&src2[i*src_stride2 ]);\
585 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
589 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
590 int src_stride1, int src_stride2, int h){\
594 a= LD16(&src1[i*src_stride1 ]);\
595 b= LD16(&src2[i*src_stride2 ]);\
596 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
600 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
601 int src_stride1, int src_stride2, int h){\
602 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
603 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
606 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
607 int src_stride1, int src_stride2, int h){\
608 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
609 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
612 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
613 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
616 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
617 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
620 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
621 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
624 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
625 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
628 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
629 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
632 uint32_t a, b, c, d, l0, l1, h0, h1;\
633 a= LD32(&src1[i*src_stride1]);\
634 b= LD32(&src2[i*src_stride2]);\
635 c= LD32(&src3[i*src_stride3]);\
636 d= LD32(&src4[i*src_stride4]);\
637 l0= (a&0x03030303UL)\
640 h0= ((a&0xFCFCFCFCUL)>>2)\
641 + ((b&0xFCFCFCFCUL)>>2);\
642 l1= (c&0x03030303UL)\
644 h1= ((c&0xFCFCFCFCUL)>>2)\
645 + ((d&0xFCFCFCFCUL)>>2);\
646 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
647 a= LD32(&src1[i*src_stride1+4]);\
648 b= LD32(&src2[i*src_stride2+4]);\
649 c= LD32(&src3[i*src_stride3+4]);\
650 d= LD32(&src4[i*src_stride4+4]);\
651 l0= (a&0x03030303UL)\
654 h0= ((a&0xFCFCFCFCUL)>>2)\
655 + ((b&0xFCFCFCFCUL)>>2);\
656 l1= (c&0x03030303UL)\
658 h1= ((c&0xFCFCFCFCUL)>>2)\
659 + ((d&0xFCFCFCFCUL)>>2);\
660 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
664 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
665 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
668 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
669 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
672 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
673 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
676 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
677 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
680 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
681 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
684 uint32_t a, b, c, d, l0, l1, h0, h1;\
685 a= LD32(&src1[i*src_stride1]);\
686 b= LD32(&src2[i*src_stride2]);\
687 c= LD32(&src3[i*src_stride3]);\
688 d= LD32(&src4[i*src_stride4]);\
689 l0= (a&0x03030303UL)\
692 h0= ((a&0xFCFCFCFCUL)>>2)\
693 + ((b&0xFCFCFCFCUL)>>2);\
694 l1= (c&0x03030303UL)\
696 h1= ((c&0xFCFCFCFCUL)>>2)\
697 + ((d&0xFCFCFCFCUL)>>2);\
698 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
699 a= LD32(&src1[i*src_stride1+4]);\
700 b= LD32(&src2[i*src_stride2+4]);\
701 c= LD32(&src3[i*src_stride3+4]);\
702 d= LD32(&src4[i*src_stride4+4]);\
703 l0= (a&0x03030303UL)\
706 h0= ((a&0xFCFCFCFCUL)>>2)\
707 + ((b&0xFCFCFCFCUL)>>2);\
708 l1= (c&0x03030303UL)\
710 h1= ((c&0xFCFCFCFCUL)>>2)\
711 + ((d&0xFCFCFCFCUL)>>2);\
712 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
715 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
716 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
717 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
718 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
720 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
721 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
722 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
723 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
726 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
728 int i, a0, b0, a1, b1;\
735 for(i=0; i<h; i+=2){\
741 block[0]= (a1+a0)>>2; /* FIXME non put */\
742 block[1]= (b1+b0)>>2;\
752 block[0]= (a1+a0)>>2;\
753 block[1]= (b1+b0)>>2;\
759 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
762 const uint32_t a= LD32(pixels );\
763 const uint32_t b= LD32(pixels+1);\
764 uint32_t l0= (a&0x03030303UL)\
767 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
768 + ((b&0xFCFCFCFCUL)>>2);\
772 for(i=0; i<h; i+=2){\
773 uint32_t a= LD32(pixels );\
774 uint32_t b= LD32(pixels+1);\
775 l1= (a&0x03030303UL)\
777 h1= ((a&0xFCFCFCFCUL)>>2)\
778 + ((b&0xFCFCFCFCUL)>>2);\
779 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
784 l0= (a&0x03030303UL)\
787 h0= ((a&0xFCFCFCFCUL)>>2)\
788 + ((b&0xFCFCFCFCUL)>>2);\
789 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
795 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
800 const uint32_t a= LD32(pixels );\
801 const uint32_t b= LD32(pixels+1);\
802 uint32_t l0= (a&0x03030303UL)\
805 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
806 + ((b&0xFCFCFCFCUL)>>2);\
810 for(i=0; i<h; i+=2){\
811 uint32_t a= LD32(pixels );\
812 uint32_t b= LD32(pixels+1);\
813 l1= (a&0x03030303UL)\
815 h1= ((a&0xFCFCFCFCUL)>>2)\
816 + ((b&0xFCFCFCFCUL)>>2);\
817 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
822 l0= (a&0x03030303UL)\
825 h0= ((a&0xFCFCFCFCUL)>>2)\
826 + ((b&0xFCFCFCFCUL)>>2);\
827 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
831 pixels+=4-line_size*(h+1);\
832 block +=4-line_size*h;\
836 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
841 const uint32_t a= LD32(pixels );\
842 const uint32_t b= LD32(pixels+1);\
843 uint32_t l0= (a&0x03030303UL)\
846 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
847 + ((b&0xFCFCFCFCUL)>>2);\
851 for(i=0; i<h; i+=2){\
852 uint32_t a= LD32(pixels );\
853 uint32_t b= LD32(pixels+1);\
854 l1= (a&0x03030303UL)\
856 h1= ((a&0xFCFCFCFCUL)>>2)\
857 + ((b&0xFCFCFCFCUL)>>2);\
858 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
863 l0= (a&0x03030303UL)\
866 h0= ((a&0xFCFCFCFCUL)>>2)\
867 + ((b&0xFCFCFCFCUL)>>2);\
868 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
872 pixels+=4-line_size*(h+1);\
873 block +=4-line_size*h;\
877 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
878 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
879 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
880 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
881 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
882 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
883 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
884 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* Store operations plugged into PIXOP2: 'avg' averages the new value
 * into the destination word (rnd_avg32 rounds up), 'put' overwrites. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#define op_put(a, b) a = b
/* Rounded 2- and 4-way pixel averages.  Arguments are now fully
 * parenthesized: the old bodies expanded e.g. avg2(x|y, z) as
 * (x|y+z+1)>>1, silently changing precedence for compound arguments. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* Wrapper: average two 16-wide sources into dst with the no-rounding
 * rule, all three using the same stride.  (Closing brace restored.) */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
/* Wrapper: average two 8-wide sources into dst with the no-rounding
 * rule, all three using the same stride.  (Closing brace restored.) */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/**
 * One-warp-point global motion compensation: bilinear interpolation of
 * an 8-wide column strip at fixed 1/16-pel fractional offsets.
 * @param dst,src  destination / source (src needs h+1 readable rows,
 *                 9 readable columns)
 * @param stride   bytes per row for both dst and src
 * @param h        number of rows
 * @param x16,y16  fractional position in 1/16 pel, 0..15
 * @param rounder  rounding constant added before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* bilinear weights; A+B+C+D == 256, hence the >>8 */
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/**
 * Affine global motion compensation (MPEG-4 GMC) for an 8-wide strip of
 * h rows.  For each destination pixel, a sub-pel source coordinate is
 * accumulated from the affine parameters (dxx,dyx per column, dxy,dyy
 * per row, 16.16-style fixed point with 1<<shift sub-pel steps), then
 * the pixel is bilinearly interpolated; coordinates outside the image
 * clamp to the edge via clip() (project helper).
 * NOTE(review): interior statements were missing from this chunk and
 * were restored to the standard implementation — verify against the
 * original file.
 * @param r     rounding constant added before the >>(2*shift)
 * @param width,height source dimensions (exclusive bounds on src_x/src_y)
 */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*  frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*  frac_x )*  frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: interpolate horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*  frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: interpolate vertically only */
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*  frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* outside both axes: nearest edge pixel */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* Third-pel MC, no fractional offset: plain block copy, dispatched on
 * width to the fixed-width copy primitives.  (Switch restored.) */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/* Third-pel MC, x=1/3 y=0: out = round((2*a + b)/3); 683/2048 ~= 1/3. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, x=2/3 y=0: out = round((a + 2*b)/3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, x=0 y=1/3: out = round((2*a + c)/3), c one row below. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, x=1/3 y=1/3: weighted 2x2 average; 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, x=1/3 y=2/3: weighted 2x2 average. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, x=0 y=2/3: out = round((a + 2*c)/3), c one row below. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, x=2/3 y=1/3: weighted 2x2 average. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, x=2/3 y=2/3: weighted 2x2 average. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC (averaging), no fractional offset: average the plain
 * copy into dst, dispatched on width.  (Switch restored.) */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/* Averaging variant of mc10: dst = rounded average of dst and the
 * interpolated value. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc20. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc01. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc11. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc12. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc02. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc21. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Averaging variant of mc22. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* TPEL_WIDTH(width): generates fixed-width thirdpel wrappers
 * put_tpel_pixels##width##_mcXY_c that forward to the variable-width
 * put_tpel_pixels_mcXY_c helpers defined above.
 * FIX(review): each forwarding statement was written as
 *   "void put_tpel_pixels_mcXY_c(dst, src, stride, width, height);"
 * The leading "void" turns the intended function call into a
 * (malformed) local function declaration, so every wrapper compiled to
 * a no-op.  The "void" has been removed so the wrappers actually call
 * the helpers. */
1181 #define TPEL_WIDTH(width)\
1182 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1183 put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1184 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1185 put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1186 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1187 put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1188 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1189 put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1190 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1191 put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1192 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1193 put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1194 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1195 put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1196 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1197 put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1198 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1199 put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H264_CHROMA_MC(OPNAME, OP): generates OPNAME##h264_chroma_mc{2,4,8}_c,
 * the 2/4/8-wide H.264 chroma motion compensation kernels.  x,y are the
 * eighth-pel fractional offsets; the bilinear weights are
 *   A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy   (A+B+C+D == 64),
 * and OP performs the final rounding/store (see op_put/op_avg below).
 * NOTE(review): the declarations of i, the h loops and the dst/src
 * stride advances are not visible in this excerpt. */
1202 #define H264_CHROMA_MC(OPNAME, OP)\
1203 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1204 const int A=(8-x)*(8-y);\
1205 const int B=( x)*(8-y);\
1206 const int C=(8-x)*( y);\
1207 const int D=( x)*( y);\
1210 assert(x<8 && y<8 && x>=0 && y>=0);\
1214 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1215 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1221 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1222 const int A=(8-x)*(8-y);\
1223 const int B=( x)*(8-y);\
1224 const int C=(8-x)*( y);\
1225 const int D=( x)*( y);\
1228 assert(x<8 && y<8 && x>=0 && y>=0);\
1232 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1233 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1234 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1235 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1241 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1242 const int A=(8-x)*(8-y);\
1243 const int B=( x)*(8-y);\
1244 const int C=(8-x)*( y);\
1245 const int D=( x)*( y);\
1248 assert(x<8 && y<8 && x>=0 && y>=0);\
1252 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1253 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1254 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1255 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1256 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1257 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1258 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1259 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* Store ops for H264_CHROMA_MC.  The bilinear weights sum to 64, so
 * ((b)+32)>>6 rounds to nearest; the maximum (64*255+32)>>6 is 255, so
 * no clip table is needed here.  op_avg additionally averages with the
 * value already in dst (rounded up). */
1265 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1266 #define op_put(a, b) a = (((b) + 32)>>6)
/* instantiate the put_ and avg_ chroma MC families */
1268 H264_CHROMA_MC(put_ , op_put)
1269 H264_CHROMA_MC(avg_ , op_avg)
/* Copy a 4-wide, h-high block using one 32-bit load/store per row.
 * LD32/ST32 are defined elsewhere — presumably unaligned-safe 32-bit
 * accessors; confirm.  NOTE(review): the h loop and the dst/src stride
 * advances are not visible in this excerpt. */
1273 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1278 ST32(dst , LD32(src ));
/* Copy an 8-wide, h-high block, two 32-bit load/stores per row.
 * NOTE(review): loop scaffolding not visible in this excerpt. */
1284 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1289 ST32(dst , LD32(src ));
1290 ST32(dst+4 , LD32(src+4 ));
/* Copy a 16-wide, h-high block, four 32-bit load/stores per row.
 * NOTE(review): loop scaffolding not visible in this excerpt. */
1296 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1301 ST32(dst , LD32(src ));
1302 ST32(dst+4 , LD32(src+4 ));
1303 ST32(dst+8 , LD32(src+8 ));
1304 ST32(dst+12, LD32(src+12));
/* Copy a 17-wide, h-high block (16+1 source area for qpel16 filtering).
 * Four 32-bit stores cover bytes 0..15; the 17th byte is presumably
 * copied by a line not visible in this excerpt — confirm. */
1310 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1315 ST32(dst , LD32(src ));
1316 ST32(dst+4 , LD32(src+4 ));
1317 ST32(dst+8 , LD32(src+8 ));
1318 ST32(dst+12, LD32(src+12));
/* Copy a 9-wide, h-high block (8+1 source area for qpel8 filtering).
 * Two 32-bit stores cover bytes 0..7; the 9th byte is presumably copied
 * by a line not visible in this excerpt — confirm. */
1325 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1330 ST32(dst , LD32(src ));
1331 ST32(dst+4 , LD32(src+4 ));
/* QPEL_MC(r, OPNAME, RND, OP): generates the MPEG-4 quarter-pel MC
 * function family.  The lowpass kernels below apply the 8-tap filter
 * (-1, 3, -6, 20, 20, -6, 3, -1) (weights sum to 32); OP performs the
 * rounding and clipping via the cm table (see op_put/op_avg after the
 * macro).  The mcXY wrapper names encode the quarter-pel offset:
 * X = horizontal, Y = vertical quarter-pixel position. */
1339 #define QPEL_MC(r, OPNAME, RND, OP) \
/* horizontal 8-wide half-pel lowpass.  Note the tap indices near both \
   edges are mirrored back into the block (e.g. src[0] reused at the \
   left, src[8] at the right) instead of reading outside it — the \
   MPEG-4 block-edge handling, not a typo. */\
1340 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1341 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1345 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1346 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1347 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1348 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1349 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1350 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1351 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1352 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* vertical 8-high half-pel lowpass (same mirrored edge handling as the \
   horizontal kernel).  Part of the QPEL_MC macro. */\
1358 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1360 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1364 const int src0= src[0*srcStride];\
1365 const int src1= src[1*srcStride];\
1366 const int src2= src[2*srcStride];\
1367 const int src3= src[3*srcStride];\
1368 const int src4= src[4*srcStride];\
1369 const int src5= src[5*srcStride];\
1370 const int src6= src[6*srcStride];\
1371 const int src7= src[7*srcStride];\
1372 const int src8= src[8*srcStride];\
1373 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1374 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1375 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1376 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1377 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1378 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1379 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1380 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* horizontal 16-wide half-pel lowpass; same filter and mirrored edge \
   handling as the 8-wide kernel.  Part of the QPEL_MC macro. */\
1386 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1387 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1392 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1393 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1394 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1395 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1396 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1397 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1398 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1399 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1400 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1401 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1402 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1403 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1404 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1405 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1406 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1407 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* vertical 16-high half-pel lowpass; same filter and mirrored edge \
   handling as the 8-high kernel.  Part of the QPEL_MC macro. */\
1413 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1414 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1419 const int src0= src[0*srcStride];\
1420 const int src1= src[1*srcStride];\
1421 const int src2= src[2*srcStride];\
1422 const int src3= src[3*srcStride];\
1423 const int src4= src[4*srcStride];\
1424 const int src5= src[5*srcStride];\
1425 const int src6= src[6*srcStride];\
1426 const int src7= src[7*srcStride];\
1427 const int src8= src[8*srcStride];\
1428 const int src9= src[9*srcStride];\
1429 const int src10= src[10*srcStride];\
1430 const int src11= src[11*srcStride];\
1431 const int src12= src[12*srcStride];\
1432 const int src13= src[13*srcStride];\
1433 const int src14= src[14*srcStride];\
1434 const int src15= src[15*srcStride];\
1435 const int src16= src[16*srcStride];\
1436 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1437 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1438 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1439 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1440 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1441 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1442 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1443 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1444 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1445 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1446 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1447 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1448 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1449 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1450 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1451 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* 8x8 quarter-pel wrappers for pure-axis positions.  mc00 = full-pel \
   copy; mc20/mc02 = exact half-pel (lowpass only); mc10/mc30 and \
   mc01/mc03 = quarter-pel, built by averaging (pixels8_l2) the \
   half-pel plane with the nearer full-pel plane.  "full" is a 16x9 \
   staging copy (copy_block9) giving the v-lowpass its 9th source row. \
   NOTE(review): some local declarations (half[64] etc.) are missing \
   from this excerpt. */\
1457 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1458 OPNAME ## pixels8_c(dst, src, stride, 8);\
1461 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1463 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1464 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1467 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1468 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1471 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1473 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1474 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1477 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1478 uint8_t full[16*9];\
1480 copy_block9(full, src, 16, stride, 9);\
1481 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1482 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1485 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1486 uint8_t full[16*9];\
1487 copy_block9(full, src, 16, stride, 9);\
1488 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1491 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1492 uint8_t full[16*9];\
1494 copy_block9(full, src, 16, stride, 9);\
1495 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1496 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
/* 8x8 quarter-pel wrappers for diagonal (corner) positions \
   mc11/mc31/mc13/mc33.  The non-static ff_*_old_c versions average \
   four planes at once (pixels8_l4: full, halfH, halfV, halfHV); the \
   current static versions instead pre-average halfH with the full-pel \
   rows (pixels8_l2) before the vertical pass — presumably kept because \
   the two orderings are not bit-identical; confirm.  NOTE(review): \
   halfH/halfV buffer declarations are missing from this excerpt. */\
1498 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499 uint8_t full[16*9];\
1502 uint8_t halfHV[64];\
1503 copy_block9(full, src, 16, stride, 9);\
1504 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1505 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1506 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1507 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1509 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1510 uint8_t full[16*9];\
1512 uint8_t halfHV[64];\
1513 copy_block9(full, src, 16, stride, 9);\
1514 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1516 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1519 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520 uint8_t full[16*9];\
1523 uint8_t halfHV[64];\
1524 copy_block9(full, src, 16, stride, 9);\
1525 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1526 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1527 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1528 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1530 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1531 uint8_t full[16*9];\
1533 uint8_t halfHV[64];\
1534 copy_block9(full, src, 16, stride, 9);\
1535 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1537 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1540 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1541 uint8_t full[16*9];\
1544 uint8_t halfHV[64];\
1545 copy_block9(full, src, 16, stride, 9);\
1546 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1547 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1548 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1549 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1551 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1552 uint8_t full[16*9];\
1554 uint8_t halfHV[64];\
1555 copy_block9(full, src, 16, stride, 9);\
1556 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1557 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1558 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1559 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1561 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1562 uint8_t full[16*9];\
1565 uint8_t halfHV[64];\
1566 copy_block9(full, src, 16, stride, 9);\
1567 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1568 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1569 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1570 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1572 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1573 uint8_t full[16*9];\
1575 uint8_t halfHV[64];\
1576 copy_block9(full, src, 16, stride, 9);\
1577 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1578 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1579 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1580 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
/* 8x8 quarter-pel wrappers for half-horizontal / quarter-vertical and \
   half-vertical / quarter-horizontal positions, plus mc22 (half/half). \
   Same old-vs-new averaging split as the corner wrappers above. \
   NOTE(review): some halfH/halfV declarations are missing from this \
   excerpt. */\
1582 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1584 uint8_t halfHV[64];\
1585 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1586 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1587 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1589 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1591 uint8_t halfHV[64];\
1592 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1593 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1594 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1596 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1597 uint8_t full[16*9];\
1600 uint8_t halfHV[64];\
1601 copy_block9(full, src, 16, stride, 9);\
1602 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1603 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1604 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1605 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1607 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1608 uint8_t full[16*9];\
1610 copy_block9(full, src, 16, stride, 9);\
1611 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1612 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1613 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1615 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1616 uint8_t full[16*9];\
1619 uint8_t halfHV[64];\
1620 copy_block9(full, src, 16, stride, 9);\
1621 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1622 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1623 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1624 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1626 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1627 uint8_t full[16*9];\
1629 copy_block9(full, src, 16, stride, 9);\
1630 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1631 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1632 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1634 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1636 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1637 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* 16x16 quarter-pel wrappers for pure-axis positions; identical \
   structure to the 8x8 versions but with 24x17 staging (copy_block17) \
   and 16-wide helpers.  NOTE(review): some half[256] declarations are \
   missing from this excerpt. */\
1639 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1640 OPNAME ## pixels16_c(dst, src, stride, 16);\
1643 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1645 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1646 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1649 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1650 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1653 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1655 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1656 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1659 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1660 uint8_t full[24*17];\
1662 copy_block17(full, src, 24, stride, 17);\
1663 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1664 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1667 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1668 uint8_t full[24*17];\
1669 copy_block17(full, src, 24, stride, 17);\
1670 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1673 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1674 uint8_t full[24*17];\
1676 copy_block17(full, src, 24, stride, 17);\
1677 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1678 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
/* 16x16 quarter-pel wrappers for diagonal positions; same old \
   (4-plane pixels16_l4) vs. new (pre-merged pixels16_l2) split as the \
   8x8 corner wrappers. */\
1680 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681 uint8_t full[24*17];\
1682 uint8_t halfH[272];\
1683 uint8_t halfV[256];\
1684 uint8_t halfHV[256];\
1685 copy_block17(full, src, 24, stride, 17);\
1686 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1687 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1688 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1689 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1691 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1692 uint8_t full[24*17];\
1693 uint8_t halfH[272];\
1694 uint8_t halfHV[256];\
1695 copy_block17(full, src, 24, stride, 17);\
1696 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1698 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1701 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702 uint8_t full[24*17];\
1703 uint8_t halfH[272];\
1704 uint8_t halfV[256];\
1705 uint8_t halfHV[256];\
1706 copy_block17(full, src, 24, stride, 17);\
1707 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1708 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1709 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1710 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1712 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1713 uint8_t full[24*17];\
1714 uint8_t halfH[272];\
1715 uint8_t halfHV[256];\
1716 copy_block17(full, src, 24, stride, 17);\
1717 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1719 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1722 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1723 uint8_t full[24*17];\
1724 uint8_t halfH[272];\
1725 uint8_t halfV[256];\
1726 uint8_t halfHV[256];\
1727 copy_block17(full, src, 24, stride, 17);\
1728 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1729 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1730 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1731 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1733 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1734 uint8_t full[24*17];\
1735 uint8_t halfH[272];\
1736 uint8_t halfHV[256];\
1737 copy_block17(full, src, 24, stride, 17);\
1738 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1739 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1740 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1741 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1743 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1744 uint8_t full[24*17];\
1745 uint8_t halfH[272];\
1746 uint8_t halfV[256];\
1747 uint8_t halfHV[256];\
1748 copy_block17(full, src, 24, stride, 17);\
1749 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1750 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1751 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1752 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1754 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1755 uint8_t full[24*17];\
1756 uint8_t halfH[272];\
1757 uint8_t halfHV[256];\
1758 copy_block17(full, src, 24, stride, 17);\
1759 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1760 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1761 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1762 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* 16x16 quarter-pel wrappers for mixed half/quarter positions and \
   mc22 (half/half); same structure as the 8x8 versions. */\
1764 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1765 uint8_t halfH[272];\
1766 uint8_t halfHV[256];\
1767 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1768 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1769 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1771 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1772 uint8_t halfH[272];\
1773 uint8_t halfHV[256];\
1774 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1775 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1776 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1778 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1779 uint8_t full[24*17];\
1780 uint8_t halfH[272];\
1781 uint8_t halfV[256];\
1782 uint8_t halfHV[256];\
1783 copy_block17(full, src, 24, stride, 17);\
1784 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1785 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1786 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1787 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1789 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1790 uint8_t full[24*17];\
1791 uint8_t halfH[272];\
1792 copy_block17(full, src, 24, stride, 17);\
1793 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1794 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1795 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1797 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1798 uint8_t full[24*17];\
1799 uint8_t halfH[272];\
1800 uint8_t halfV[256];\
1801 uint8_t halfHV[256];\
1802 copy_block17(full, src, 24, stride, 17);\
1803 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1804 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1805 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1806 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1808 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1809 uint8_t full[24*17];\
1810 uint8_t halfH[272];\
1811 copy_block17(full, src, 24, stride, 17);\
1812 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1813 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1814 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1816 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1817 uint8_t halfH[272];\
1818 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1819 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Final-stage store ops for QPEL_MC.  The filter weights sum to 32, so
 * ((b)+16)>>5 rounds to nearest and cm[] clips to 0..255; the _no_rnd
 * variants use +15 — presumably the MPEG-4 no-rounding mode; confirm. */
1822 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1823 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1824 #define op_put(a, b) a = cm[((b) + 16)>>5]
1825 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* instantiate rounding put/avg and no-rounding put families */
1827 QPEL_MC(0, put_ , _ , op_put)
1828 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1829 QPEL_MC(0, avg_ , _ , op_avg)
1830 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1832 #undef op_avg_no_rnd
1834 #undef op_put_no_rnd
1837 #define H264_LOWPASS(OPNAME, OP, OP2) \
1838 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1840 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1844 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1845 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1846 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1847 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
1853 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1855 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1859 const int srcB= src[-2*srcStride];\
1860 const int srcA= src[-1*srcStride];\
1861 const int src0= src[0 *srcStride];\
1862 const int src1= src[1 *srcStride];\
1863 const int src2= src[2 *srcStride];\
1864 const int src3= src[3 *srcStride];\
1865 const int src4= src[4 *srcStride];\
1866 const int src5= src[5 *srcStride];\
1867 const int src6= src[6 *srcStride];\
1868 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1869 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1870 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1871 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1877 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1880 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1882 src -= 2*srcStride;\
1883 for(i=0; i<h+5; i++)\
1885 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1886 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1887 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1888 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1892 tmp -= tmpStride*(h+5-2);\
1895 const int tmpB= tmp[-2*tmpStride];\
1896 const int tmpA= tmp[-1*tmpStride];\
1897 const int tmp0= tmp[0 *tmpStride];\
1898 const int tmp1= tmp[1 *tmpStride];\
1899 const int tmp2= tmp[2 *tmpStride];\
1900 const int tmp3= tmp[3 *tmpStride];\
1901 const int tmp4= tmp[4 *tmpStride];\
1902 const int tmp5= tmp[5 *tmpStride];\
1903 const int tmp6= tmp[6 *tmpStride];\
1904 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1905 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1906 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1907 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1913 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1915 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1919 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1920 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1921 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1922 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1923 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1924 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1925 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1926 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1932 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1934 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1938 const int srcB= src[-2*srcStride];\
1939 const int srcA= src[-1*srcStride];\
1940 const int src0= src[0 *srcStride];\
1941 const int src1= src[1 *srcStride];\
1942 const int src2= src[2 *srcStride];\
1943 const int src3= src[3 *srcStride];\
1944 const int src4= src[4 *srcStride];\
1945 const int src5= src[5 *srcStride];\
1946 const int src6= src[6 *srcStride];\
1947 const int src7= src[7 *srcStride];\
1948 const int src8= src[8 *srcStride];\
1949 const int src9= src[9 *srcStride];\
1950 const int src10=src[10*srcStride];\
1951 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1952 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1953 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1954 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1955 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1956 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1957 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1958 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1964 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1967 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1969 src -= 2*srcStride;\
1970 for(i=0; i<h+5; i++)\
1972 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1973 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1974 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1975 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1976 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1977 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1978 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1979 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
1983 tmp -= tmpStride*(h+5-2);\
1986 const int tmpB= tmp[-2*tmpStride];\
1987 const int tmpA= tmp[-1*tmpStride];\
1988 const int tmp0= tmp[0 *tmpStride];\
1989 const int tmp1= tmp[1 *tmpStride];\
1990 const int tmp2= tmp[2 *tmpStride];\
1991 const int tmp3= tmp[3 *tmpStride];\
1992 const int tmp4= tmp[4 *tmpStride];\
1993 const int tmp5= tmp[5 *tmpStride];\
1994 const int tmp6= tmp[6 *tmpStride];\
1995 const int tmp7= tmp[7 *tmpStride];\
1996 const int tmp8= tmp[8 *tmpStride];\
1997 const int tmp9= tmp[9 *tmpStride];\
1998 const int tmp10=tmp[10*tmpStride];\
1999 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2000 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2001 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2002 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2003 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2004 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2005 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2006 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2012 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2013 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2014 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2015 src += 8*srcStride;\
2016 dst += 8*dstStride;\
2017 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2018 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2021 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2022 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2023 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2024 src += 8*srcStride;\
2025 dst += 8*dstStride;\
2026 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2027 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2030 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2031 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2032 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2033 src += 8*srcStride;\
2034 tmp += 8*tmpStride;\
2035 dst += 8*dstStride;\
2036 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2037 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC(OPNAME, SIZE): instantiates the 16 H.264 quarter-pel motion
 * compensation functions _mcXY_c (X,Y in 0..3 = horizontal/vertical
 * quarter-pel phase) for one block size, built from the 6-tap
 * _h/_v/_hv lowpass helpers generated by H264_LOWPASS, the copy_block*
 * edge-extension helpers and the pixels*_l2 averaging helpers. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

2177 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2178 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2179 #define op_put(a, b) a = cm[((b) + 16)>>5]
2180 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2181 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2183 H264_LOWPASS(put_ , op_put, op2_put)
2184 H264_LOWPASS(avg_ , op_avg, op2_avg)
2198 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2199 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2203 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2204 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2205 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2206 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2207 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2208 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2209 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2210 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2216 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2217 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2221 const int src_1= src[ -srcStride];
2222 const int src0 = src[0 ];
2223 const int src1 = src[ srcStride];
2224 const int src2 = src[2*srcStride];
2225 const int src3 = src[3*srcStride];
2226 const int src4 = src[4*srcStride];
2227 const int src5 = src[5*srcStride];
2228 const int src6 = src[6*srcStride];
2229 const int src7 = src[7*srcStride];
2230 const int src8 = src[8*srcStride];
2231 const int src9 = src[9*srcStride];
2232 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2233 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2234 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2235 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2236 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2237 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2238 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2239 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2/MSMPEG4 half-pel motion compensation entry points (mcXY: X/Y is
 * the horizontal/vertical half-pel phase), built from the mspel lowpass
 * helpers above and the pixels8 copy/average helpers. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];

    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];   /* 8x11: one extra row above and two below for the vertical pass */
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];

    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];

    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2293 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2295 const int strength= ff_h263_loop_filter_strength[qscale];
2299 int p0= src[x-2*stride];
2300 int p1= src[x-1*stride];
2301 int p2= src[x+0*stride];
2302 int p3= src[x+1*stride];
2303 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2305 if (d<-2*strength) d1= 0;
2306 else if(d<- strength) d1=-2*strength - d;
2307 else if(d< strength) d1= d;
2308 else if(d< 2*strength) d1= 2*strength - d;
2313 if(p1&256) p1= ~(p1>>31);
2314 if(p2&256) p2= ~(p2>>31);
2316 src[x-1*stride] = p1;
2317 src[x+0*stride] = p2;
2321 d2= clip((p0-p3)/4, -ad1, ad1);
2323 src[x-2*stride] = p0 - d2;
2324 src[x+ stride] = p3 + d2;
2328 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2330 const int strength= ff_h263_loop_filter_strength[qscale];
2334 int p0= src[y*stride-2];
2335 int p1= src[y*stride-1];
2336 int p2= src[y*stride+0];
2337 int p3= src[y*stride+1];
2338 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2340 if (d<-2*strength) d1= 0;
2341 else if(d<- strength) d1=-2*strength - d;
2342 else if(d< strength) d1= d;
2343 else if(d< 2*strength) d1= 2*strength - d;
2348 if(p1&256) p1= ~(p1>>31);
2349 if(p2&256) p2= ~(p2>>31);
2351 src[y*stride-1] = p1;
2352 src[y*stride+0] = p2;
2356 d2= clip((p0-p3)/4, -ad1, ad1);
2358 src[y*stride-2] = p0 - d2;
2359 src[y*stride+1] = p3 + d2;
/**
 * H.261 in-loop filter over one 8x8 block: separable (1,2,1)/4 smoothing
 * applied vertically into temp[] (border rows copied with weight 4),
 * then horizontally back into src (border columns only renormalized).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* vertical (1,2,1) pass; top/bottom rows just scaled by 4 */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal (1,2,1) pass with rounding; left/right columns only
       undo the vertical scaling */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/**
 * Sum of absolute differences of a 16-wide block over h rows.
 * @param v unused context pointer (me_cmp_func signature)
 * @return SAD of pix1 vs pix2
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* SAD of pix1 vs the horizontal half-pel interpolation of pix2
 * (avg2 of each pixel and its right neighbour). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/* SAD of pix1 vs the vertical half-pel interpolation of pix2
 * (avg2 of each pixel and the one below). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

/* SAD of pix1 vs the 2D (x+y) half-pel interpolation of pix2
 * (avg4 of the 2x2 neighbourhood). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * Sum of absolute differences of an 8-wide block over h rows.
 * @param v unused context pointer (me_cmp_func signature)
 * @return SAD of pix1 vs pix2
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* 8-wide SAD vs horizontal half-pel interpolation of pix2. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

/* 8-wide SAD vs vertical half-pel interpolation of pix2. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

/* 8-wide SAD vs 2D half-pel interpolation of pix2 (avg4 of 2x2). */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
2590 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2596 for(x=0; x<16; x++){
2597 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2600 for(x=0; x<15; x++){
2601 score2+= ABS( s1[x ] - s1[x +stride]
2602 - s1[x+1] + s1[x+1+stride])
2603 -ABS( s2[x ] - s2[x +stride]
2604 - s2[x+1] + s2[x+1+stride]);
2611 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2612 else return score1 + ABS(score2)*8;
2615 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2622 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2626 score2+= ABS( s1[x ] - s1[x +stride]
2627 - s1[x+1] + s1[x+1+stride])
2628 -ABS( s2[x ] - s2[x +stride]
2629 - s2[x+1] + s2[x+1+stride]);
2636 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2637 else return score1 + ABS(score2)*8;
2640 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2644 for(i=0; i<8*8; i++){
2645 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2648 assert(-512<b && b<512);
2650 sum += (w*b)*(w*b)>>4;
2655 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2658 for(i=0; i<8*8; i++){
2659 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2664 * permutes an 8x8 block.
2665 * @param block the block which will be permuted according to the given permutation vector
2666 * @param permutation the permutation vector
2667 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2668 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2669 * (inverse) permutated to scantable order!
2671 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2677 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2679 for(i=0; i<=last; i++){
2680 const int j= scantable[i];
2685 for(i=0; i<=last; i++){
2686 const int j= scantable[i];
2687 const int perm_j= permutation[j];
2688 block[perm_j]= temp[j];
/* cmp function that always returns 0; used to disable a comparison
 * (FF_CMP_ZERO) while keeping the me_cmp_func signature. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
/* Fills cmp[0..4] with the comparison functions from *c selected by
 * 'type' (FF_CMP_*). NOTE(review): this listing is a truncated
 * extract -- most of the per-type dispatch is missing here. */
2696 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2699 memset(cmp, 0, sizeof(void*)*5);
/* Hadamard-transform (SATD) comparison */
2707 cmp[i]= c->hadamard8_diff[i];
/* DCT-domain SAD */
2713 cmp[i]= c->dct_sad[i];
/* quantization-noise PSNR metric */
2716 cmp[i]= c->quant_psnr[i];
/* unrecognized 'type' is reported as an internal error */
2737 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2743 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2745 static void clear_blocks_c(DCTELEM *blocks)
2747 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for 0 <= i < w; 8-way unrolled main loop plus a
 * scalar tail for the remaining w % 8 bytes. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
/* dst[i] = src1[i] - src2[i] for 0 <= i < w; 8-way unrolled main loop
 * plus a scalar tail (arithmetic wraps modulo 256 as uint8_t). */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
/**
 * HuffYUV median predictor: dst[i] = src2[i] - mid_pred(left, top,
 * left+top-topleft), updating *left / *left_top for the next call.
 * NOTE(review): reconstructed from a truncated listing -- verify the
 * exact update order against upstream libavcodec.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
2800 #define BUTTERFLY2(o1,o2,i1,i2) \
2804 #define BUTTERFLY1(x,y) \
2813 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/* SATD of the 8x8 residual src-dst: 8x8 Hadamard transform of the
 * difference (rows then columns), returning the sum of absolute
 * transform coefficients. */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard of each difference row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical 8-point Hadamard of each column; last stage folded into
       the absolute sum via BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
/* Intra SATD of an 8x8 block: Hadamard transform of the pixels
 * themselves (no reference), with the DC term subtracted at the end
 * so the score is mean-independent. */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal 8-point Hadamard of each row */
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical 8-point Hadamard; last stage folded into the sum */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= ABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
2915 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2916 MpegEncContext * const s= (MpegEncContext *)c;
2917 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2918 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2923 s->dsp.diff_pixels(temp, src1, src2, stride);
2932 void simple_idct(DCTELEM *block); //FIXME
2934 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2935 MpegEncContext * const s= (MpegEncContext *)c;
2936 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2937 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2938 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
2944 s->dsp.diff_pixels(temp, src1, src2, stride);
2946 memcpy(bak, temp, 64*sizeof(DCTELEM));
2948 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2949 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2950 simple_idct(temp); //FIXME
2953 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion score of one 8x8 block: DCT-quantize the residual,
 * estimate the VLC bit cost of the coefficients, reconstruct, and
 * return distortion + lambda*bits.
 * NOTE(review): truncated listing -- loop headers and several
 * statements are missing from this extract. */
2958 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2959 MpegEncContext * const s= (MpegEncContext *)c;
2960 const uint8_t *scantable= s->intra_scantable.permutated;
2961 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2962 uint64_t __align8 aligned_bak[stride];
2963 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2964 uint8_t * const bak= (uint8_t*)aligned_bak;
2965 int i, last, run, bits, level, distoration, start_i;
2966 const int esc_length= s->ac_esc_length;
2968 uint8_t * last_length;
/* save the prediction (src2) so the block can be reconstructed onto it */
2973 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2974 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2977 s->dsp.diff_pixels(temp, src1, src2, stride);
2979 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* choose intra/inter VLC tables; intra additionally pays for luma DC */
2985 length = s->intra_ac_vlc_length;
2986 last_length= s->intra_ac_vlc_last_length;
2987 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2990 length = s->inter_ac_vlc_length;
2991 last_length= s->inter_ac_vlc_last_length;
/* accumulate bit cost of each nonzero AC coefficient (run/level coded) */
2996 for(i=start_i; i<last; i++){
2997 int j= scantable[i];
3002 if((level&(~127)) == 0){
3003 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the final coefficient uses the "last" VLC table */
3012 level= temp[i] + 64;
3016 if((level&(~127)) == 0){
3017 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* reconstruct: dequantize and IDCT-add onto the saved prediction */
3025 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3027 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3030 s->dsp.idct_add(bak, stride, temp);
/* distortion = SSE between reconstruction and source */
3032 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3034 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-cost-only 8x8 metric: like rd8x8_c but returns just the
 * estimated VLC bit count, with no reconstruction/distortion term.
 * NOTE(review): truncated listing -- several statements are missing. */
3037 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3038 MpegEncContext * const s= (MpegEncContext *)c;
3039 const uint8_t *scantable= s->intra_scantable.permutated;
3040 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3041 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3042 int i, last, run, bits, level, start_i;
3043 const int esc_length= s->ac_esc_length;
3045 uint8_t * last_length;
3049 s->dsp.diff_pixels(temp, src1, src2, stride);
3051 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* choose intra/inter VLC tables; intra additionally pays for luma DC */
3057 length = s->intra_ac_vlc_length;
3058 last_length= s->intra_ac_vlc_last_length;
3059 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3062 length = s->inter_ac_vlc_length;
3063 last_length= s->inter_ac_vlc_last_length;
/* accumulate bit cost of each nonzero AC coefficient (run/level coded) */
3068 for(i=start_i; i<last; i++){
3069 int j= scantable[i];
3074 if((level&(~127)) == 0){
3075 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the final coefficient uses the "last" VLC table */
3084 level= temp[i] + 64;
3088 if((level&(~127)) == 0){
3089 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical SAD, intra variant: sums |s[x] - s[x+stride]| over a 16-pixel-wide
 * region, i.e. the absolute difference between vertically adjacent rows of a
 * single image (second operand 'dummy' is unused). Outer row loop and the
 * return are in gaps of this sampled view. */
3097 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
/* Inner loop manually unrolled by 4 across the 16 columns. */
3102         for(x=0; x<16; x+=4){
3103             score+= ABS(s[x  ] - s[x  +stride]) + ABS(s[x+1] - s[x+1+stride])
3104                    +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
/* Vertical SAD between two images: for each column, takes the vertical
 * gradient of the difference image (s1-s2) and accumulates its absolute
 * value. Outer row loop and return are in gaps of this sampled view. */
3112 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3117         for(x=0; x<16; x++){
3118             score+= ABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Square helper for the vertical-SSE metrics below; argument fully
 * parenthesized, but note 'a' is evaluated twice (fine for the pure
 * pixel-expression arguments used here). */
3127 #define SQ(a) ((a)*(a))
/* Vertical SSE, intra variant: like vsad_intra16_c but accumulates squared
 * row-to-row differences instead of absolute ones ('dummy' unused). Outer
 * row loop and return are in gaps of this sampled view. */
3128 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
/* Inner loop manually unrolled by 4 across the 16 columns. */
3133         for(x=0; x<16; x+=4){
3134             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
3135                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* Vertical SSE between two images: squared vertical gradient of the
 * difference image (s1-s2), summed per column. Outer row loop and return
 * are in gaps of this sampled view. */
3143 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3148         for(x=0; x<16; x++){
3149             score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Instantiate 16x16 comparison functions from the 8x8 primitives.
 * WARPER8_16_SQ (sic — presumably "wrapper"; macro defined outside this
 * sampled view, likely summing four 8x8 sub-block calls — TODO confirm)
 * generates the second-named function from the first. */
3158 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3159 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3160 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3161 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3162 WARPER8_16_SQ(rd8x8_c, rd16_c)
3163 WARPER8_16_SQ(bit8x8_c, bit16_c)
3165 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* IDCT + store wrapper for the JPEG reference IDCT: transform 'block' and
 * write the clamped result to 'dest'. The actual IDCT call (presumably
 * j_rev_dct(block) — line is in a gap of this sampled view) precedes the
 * clamped store. */
3167 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3170     put_pixels_clamped_c(block, dest, line_size);
/* IDCT + add wrapper for the JPEG reference IDCT: transform 'block' and add
 * the clamped result onto the existing pixels in 'dest' (the IDCT call line
 * is in a gap of this sampled view). */
3172 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3175     add_pixels_clamped_c(block, dest, line_size);
3178 /* init static data */
/* One-time initialization of the global lookup tables declared at the top of
 * the file: cropTbl (clamp-to-[0,255] with MAX_NEG_CROP guard bands on both
 * sides), squareTbl ((i-256)^2 for signed-difference squaring), and the
 * MMX-quantizer inverse zigzag table. Some loop lines (e.g. the low-side
 * cropTbl fill) are in gaps of this sampled view. */
3179 void dsputil_static_init(void)
/* Identity region: values 0..255 map to themselves. */
3183     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
/* Guard bands: indices past 255 saturate to 255 (the below-zero band,
 * saturating to 0, is presumably filled in a line missing from this view). */
3184     for(i=0;i<MAX_NEG_CROP;i++) {
3186         cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* squareTbl[d+256] = d*d for d in [-256, 255]. */
3189     for(i=0;i<512;i++) {
3190         squareTbl[i] = (i - 256) * (i - 256);
/* inv_zigzag_direct16: position of each coefficient in zigzag order, +1 so
 * the MMX quantizer can use 0 as "absent". */
3193     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3197 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3201 #ifdef CONFIG_ENCODERS
3202 if(avctx->dct_algo==FF_DCT_FASTINT) {
3203 c->fdct = fdct_ifast;
3204 c->fdct248 = fdct_ifast248;
3206 else if(avctx->dct_algo==FF_DCT_FAAN) {
3207 c->fdct = ff_faandct;
3208 c->fdct248 = ff_faandct248;
3211 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3212 c->fdct248 = ff_fdct248_islow;
3214 #endif //CONFIG_ENCODERS
3216 if(avctx->idct_algo==FF_IDCT_INT){
3217 c->idct_put= ff_jref_idct_put;
3218 c->idct_add= ff_jref_idct_add;
3219 c->idct = j_rev_dct;
3220 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3221 }else{ //accurate/default
3222 c->idct_put= simple_idct_put;
3223 c->idct_add= simple_idct_add;
3224 c->idct = simple_idct;
3225 c->idct_permutation_type= FF_NO_IDCT_PERM;
3228 /* VP3 DSP support */
3229 c->vp3_dsp_init = vp3_dsp_init_c;
3230 c->vp3_idct = vp3_idct_c;
3232 c->get_pixels = get_pixels_c;
3233 c->diff_pixels = diff_pixels_c;
3234 c->put_pixels_clamped = put_pixels_clamped_c;
3235 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3236 c->add_pixels_clamped = add_pixels_clamped_c;
3239 c->clear_blocks = clear_blocks_c;
3240 c->pix_sum = pix_sum_c;
3241 c->pix_norm1 = pix_norm1_c;
3243 /* TODO [0] 16 [1] 8 */
3244 c->pix_abs[0][0] = pix_abs16_c;
3245 c->pix_abs[0][1] = pix_abs16_x2_c;
3246 c->pix_abs[0][2] = pix_abs16_y2_c;
3247 c->pix_abs[0][3] = pix_abs16_xy2_c;
3248 c->pix_abs[1][0] = pix_abs8_c;
3249 c->pix_abs[1][1] = pix_abs8_x2_c;
3250 c->pix_abs[1][2] = pix_abs8_y2_c;
3251 c->pix_abs[1][3] = pix_abs8_xy2_c;
3253 #define dspfunc(PFX, IDX, NUM) \
3254 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3255 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3256 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3257 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3259 dspfunc(put, 0, 16);
3260 dspfunc(put_no_rnd, 0, 16);
3262 dspfunc(put_no_rnd, 1, 8);
3266 dspfunc(avg, 0, 16);
3267 dspfunc(avg_no_rnd, 0, 16);
3269 dspfunc(avg_no_rnd, 1, 8);
3274 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3275 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3277 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3278 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3279 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3280 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3281 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3282 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3283 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3284 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3285 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3287 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3288 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3289 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3290 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3291 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3292 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3293 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3294 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3295 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3297 #define dspfunc(PFX, IDX, NUM) \
3298 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3299 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3300 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3301 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3302 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3303 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3304 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3305 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3306 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3307 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3308 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3309 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3310 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3311 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3312 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3313 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3315 dspfunc(put_qpel, 0, 16);
3316 dspfunc(put_no_rnd_qpel, 0, 16);
3318 dspfunc(avg_qpel, 0, 16);
3319 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3321 dspfunc(put_qpel, 1, 8);
3322 dspfunc(put_no_rnd_qpel, 1, 8);
3324 dspfunc(avg_qpel, 1, 8);
3325 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3327 dspfunc(put_h264_qpel, 0, 16);
3328 dspfunc(put_h264_qpel, 1, 8);
3329 dspfunc(put_h264_qpel, 2, 4);
3330 dspfunc(avg_h264_qpel, 0, 16);
3331 dspfunc(avg_h264_qpel, 1, 8);
3332 dspfunc(avg_h264_qpel, 2, 4);
3335 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3336 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3337 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3338 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3339 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3340 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3342 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3343 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3344 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3345 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3346 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3347 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3348 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3349 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3351 #define SET_CMP_FUNC(name) \
3352 c->name[0]= name ## 16_c;\
3353 c->name[1]= name ## 8x8_c;
3355 SET_CMP_FUNC(hadamard8_diff)
3356 c->hadamard8_diff[4]= hadamard8_intra16_c;
3357 SET_CMP_FUNC(dct_sad)
3358 c->sad[0]= pix_abs16_c;
3359 c->sad[1]= pix_abs8_c;
3362 SET_CMP_FUNC(quant_psnr)
3365 c->vsad[0]= vsad16_c;
3366 c->vsad[4]= vsad_intra16_c;
3367 c->vsse[0]= vsse16_c;
3368 c->vsse[4]= vsse_intra16_c;
3369 c->nsse[0]= nsse16_c;
3370 c->nsse[1]= nsse8_c;
3372 c->add_bytes= add_bytes_c;
3373 c->diff_bytes= diff_bytes_c;
3374 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3375 c->bswap_buf= bswap_buf;
3377 c->h263_h_loop_filter= h263_h_loop_filter_c;
3378 c->h263_v_loop_filter= h263_v_loop_filter_c;
3380 c->h261_loop_filter= h261_loop_filter_c;
3382 c->try_8x8basis= try_8x8basis_c;
3383 c->add_8x8basis= add_8x8basis_c;
3386 dsputil_init_mmx(c, avctx);
3389 dsputil_init_armv4l(c, avctx);
3392 dsputil_init_mlib(c, avctx);
3395 dsputil_init_vis(c,avctx);
3398 dsputil_init_alpha(c, avctx);
3401 dsputil_init_ppc(c, avctx);
3404 dsputil_init_mmi(c, avctx);
3407 dsputil_init_sh4(c,avctx);
3410 switch(c->idct_permutation_type){
3411 case FF_NO_IDCT_PERM:
3413 c->idct_permutation[i]= i;
3415 case FF_LIBMPEG2_IDCT_PERM:
3417 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3419 case FF_SIMPLE_IDCT_PERM:
3421 c->idct_permutation[i]= simple_mmx_permutation[i];
3423 case FF_TRANSPOSE_IDCT_PERM:
3425 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3428 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");