3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
/* Clamping LUT: accessed via cropTbl + MAX_NEG_CROP so that indices in
 * [-MAX_NEG_CROP, 255+MAX_NEG_CROP) are valid (see put/add_pixels_clamped_c). */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* Square LUT: accessed via squareTbl + 256 so negative differences index it
 * safely (see pix_norm1_c, sse8_c, sse16_c). Both tables are runtime-filled. */
uint32_t squareTbl[512];
/* Classic JPEG/MPEG zigzag scan: scan position -> raster index in an 8x8 block. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
/* Specific zigzag scan for the 2x4x8 (248) IDCT. NOTE that unlike the
   specification, the two fields are interleaved in this table. */
const uint8_t ff_zigzag248_direct[64] = {
    0,   8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
/* Not-permutated inverse of zigzag_direct, +1, for the MMX quantizer.
 * 8-byte aligned; no initializer here, so it is filled at runtime. */
uint16_t __align8 inv_zigzag_direct16[64];
/* Alternate (horizontal-priority) scan order for an 8x8 block. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate (vertical-priority) scan order for an 8x8 block. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
/* Fixed-point reciprocal table: a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255.
 * Entries are ceil(2^32 / b); entry 0 is unused, entry 1 is the saturated max. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
/* Input coefficient permutation required by simple_idct_mmx
 * (values are 8x8 raster indices encoded as 0xRC-style bytes). */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Sum of the pixels of a 16x16 block, processed 8 pixels per inner step;
 * pix advances by line_size-16 to step to the next row.
 * NOTE(review): interior accumulation lines are elided in this view. */
static int pix_sum_c(uint8_t * pix, int line_size)
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
    pix += line_size - 16;
/* Sum of squares of the pixels of a 16x16 block, using the squareTbl LUT
 * (biased by +256). Uses one 64-bit load per 8 pixels when long is wider
 * than 32 bits, otherwise two 32-bit loads.
 * NOTE(review): the #else/#endif and accumulator setup are elided in this view. */
static int pix_norm1_c(uint8_t * pix, int line_size)
    uint32_t *sq = squareTbl + 256;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
            register uint32_t x=*(uint32_t*)pix;
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
    pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst. Main loop is unrolled by 8;
 * the trailing line handles the remainder one word at a time.
 * NOTE(review): remainder-loop header and closing braces are elided here. */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    dst[i+0]= bswap_32(src[i+0]);
/* Sum of squared errors of an 8-wide block over h rows. The squareTbl+256
 * bias makes negative per-pixel differences valid indices.
 * NOTE(review): accumulator init and per-row pointer advance are elided here. */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
    uint32_t *sq = squareTbl + 256;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors of a 16-wide block over h rows (see sse8_c).
 * NOTE(review): accumulator init and per-row pointer advance are elided here. */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    uint32_t *sq = squareTbl + 256;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];
/* Widen an 8x8 pixel block (uint8_t) into a DCTELEM block, row by row.
 * NOTE(review): the row loop and pointer advances are elided in this view. */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
    /* read the pixels */
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
/* block = s1 - s2 for an 8x8 region (one row shown; loop elided in this view). */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    /* read the pixels */
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
/* Store DCTELEM block values as pixels, clamped to [0,255] via cropTbl.
 * NOTE(review): row loop and pointer advances are elided in this view. */
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    /* read the pixels */
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];
/* Add DCTELEM block values to existing pixels, clamping to [0,255] via cropTbl.
 * NOTE(review): row loop and pointer advances are elided in this view. */
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = cropTbl + MAX_NEG_CROP;
    /* read the pixels */
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
/* 64-bit PIXOP2: generates put/avg pixel primitives working on whole 8-byte
 * rows. x2/y2/xy2 variants are half-pel interpolations done with per-byte SWAR
 * arithmetic: 0xFE.. mask for rounded/truncated /2, 0x03../0xFC.. split for /4.
 * no_rnd variants use a smaller rounding constant (round down).
 * NOTE(review): several macro continuation lines are elided in this view. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        OP(*((uint64_t*)block), LD64(pixels));\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= LD64(pixels          );\
        const uint64_t b= LD64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0= (a&0x0303030303030303ULL)\
                   + (b&0x0303030303030303ULL)\
                   + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1= (a&0x0303030303030303ULL)\
              + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            l0= (a&0x0303030303030303ULL)\
              + (b&0x0303030303030303ULL)\
              + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= LD64(pixels  );\
        const uint64_t b= LD64(pixels+1);\
        uint64_t l0= (a&0x0303030303030303ULL)\
                   + (b&0x0303030303030303ULL)\
                   + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        for(i=0; i<h; i+=2){\
            uint64_t a= LD64(pixels  );\
            uint64_t b= LD64(pixels+1);\
            l1= (a&0x0303030303030303ULL)\
              + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            l0= (a&0x0303030303030303ULL)\
              + (b&0x0303030303030303ULL)\
              + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* 64-bit rounded per-byte average: (a+b+1)>>1 on each byte, via SWAR. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
/* 32-bit PIXOP2: same primitives as the 64-bit variant but built from 16/32-bit
 * loads. Generates put/avg for widths 2/4/8 plus l2 (2-source) and l4 (4-source)
 * averaging helpers used by the half-pel x2/y2/xy2 wrappers; 16-wide versions
 * are composed via CALL_2X_PIXELS or paired 8-wide calls.
 * NOTE(review): several macro continuation lines are elided in this view. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
        OP(*((uint16_t*)(block  )), LD16(pixels  ));\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
        OP(*((uint32_t*)(block  )), LD32(pixels  ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
        a= LD32(&src1[i*src_stride1  ]);\
        b= LD32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
        a= LD16(&src1[i*src_stride1  ]);\
        b= LD16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0= (a&0x03030303UL)\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0= (a&0x03030303UL)\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0= (a&0x03030303UL)\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0= (a&0x03030303UL)\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    int i, a0, b0, a1, b1;\
    for(i=0; i<h; i+=2){\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1= (a&0x03030303UL)\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            l0= (a&0x03030303UL)\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1= (a&0x03030303UL)\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            l0= (a&0x03030303UL)\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint32_t a= LD32(pixels  );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels  );\
            uint32_t b= LD32(pixels+1);\
            l1= (a&0x03030303UL)\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            l0= (a&0x03030303UL)\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c        , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* Row-combining ops plugged into PIXOP2: avg = rounded per-byte average via
 * rnd_avg32, put = plain store. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#define op_put(a, b) a = b
/* Rounded 2-tap and 4-tap pixel averages.
 * Arguments are fully parenthesized: the original unparenthesized form
 * (a+b+1) misassociated compound arguments (e.g. avg2(1<<1, 0) parsed as
 * 1<<(1+0+1)), a latent precedence bug for any non-trivial caller expression. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* Uniform-stride wrappers over the generic no_rnd l2 (two-source average) helpers. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/* GMC with a single 1/16-pel motion vector: bilinear interpolation with
 * weights A..D summing to 256, rounded by `rounder` then >>8.
 * NOTE(review): the h-row loop and pointer advances are elided in this view. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General GMC: per-pixel fractional source coordinates (derived from
 * ox/oy and the dxx/dxy/dyx/dyy increments; derivation lines elided here).
 * Each output pixel is a bilinear blend of up to 4 source pixels; when the
 * source coordinate falls outside [0,width]x[0,height] the out-of-range axis
 * is clamped (edge replication) and only the in-range axis is interpolated. */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
    const int s= 1<<shift;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                if((unsigned)src_y < height){
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index       ]*(s-frac_y)
                                          + src[index+stride]*   frac_y )*s
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=   src[index         ];
/* Thirdpel MC, no interpolation: dispatch plain copy by block width.
 * NOTE(review): the switch header and closing braces are elided in this view. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
/* Thirdpel MC, horizontal: rounded (2*a + b)/3 via the 683/2048 approximation. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Thirdpel MC, horizontal: rounded (a + 2*b)/3 via the 683/2048 approximation. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Thirdpel MC, vertical: rounded (2*a + below)/3 via the 683/2048 approximation. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Thirdpel MC, diagonal: weighted 4-tap blend (4,3,3,2)/12 via 2731/32768. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, diagonal: weighted 4-tap blend (3,2,4,3)/12 via 2731/32768. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, vertical: rounded (a + 2*below)/3 via the 683/2048 approximation. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Thirdpel MC, diagonal: weighted 4-tap blend (3,4,2,3)/12 via 2731/32768. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, diagonal: weighted 4-tap blend (2,3,3,4)/12 via 2731/32768. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC (averaging): dispatch plain averaging copy by block width.
 * NOTE(review): the switch header and closing braces are elided in this view. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
/* Averaging variant of put_tpel_pixels_mc10_c: result is rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc20_c: result is rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc01_c: result is rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc11_c: result is rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc12_c: result is rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc02_c: result is rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc21_c: result is rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc22_c: result is rounded-averaged into dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Generate width-specific thirdpel MC wrappers: each put_tpel_pixels<W>_mcXY_c
 * forwards to the generic put_tpel_pixels_mcXY_c with the width baked in.
 * Fix: the original had a stray `void` before each forwarding expression,
 * which made it an old-style local function *declaration* instead of a call
 * (a constraint violation in C11, and a silent no-op wrapper otherwise). */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H264_CHROMA_MC(OPNAME, OP): generates 2/4/8-wide H.264 chroma MC
 * functions for one output op (put or avg).  (x,y) is the eighth-pel
 * fractional position; A..D are the bilinear corner weights and
 * A+B+C+D == 64, so OP's (+32)>>6 renormalizes with rounding. */
1181 #define H264_CHROMA_MC(OPNAME, OP)\
1182 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1183 const int A=(8-x)*(8-y);\
1184 const int B=( x)*(8-y);\
1185 const int C=(8-x)*( y);\
1186 const int D=( x)*( y);\
1189 assert(x<8 && y<8 && x>=0 && y>=0);\
1193 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1194 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
/* 4-wide variant: same bilinear weights, four columns per row. */\
1200 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1201 const int A=(8-x)*(8-y);\
1202 const int B=( x)*(8-y);\
1203 const int C=(8-x)*( y);\
1204 const int D=( x)*( y);\
1207 assert(x<8 && y<8 && x>=0 && y>=0);\
1211 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1212 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1213 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1214 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
/* 8-wide variant: eight columns per row. */\
1220 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1221 const int A=(8-x)*(8-y);\
1222 const int B=( x)*(8-y);\
1223 const int C=(8-x)*( y);\
1224 const int D=( x)*( y);\
1227 assert(x<8 && y<8 && x>=0 && y>=0);\
1231 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1232 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1233 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1234 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1235 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1236 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1237 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1238 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* Output ops for the chroma MC: the weighted sum b carries a factor of 64,
 * so (b+32)>>6 is the rounded result.  op_avg additionally averages that
 * result into the existing dst value with +1 rounding. */
1244 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1245 #define op_put(a, b) a = (((b) + 32)>>6)
/* Instantiate put_/avg_ h264_chroma_mc{2,4,8}_c. */
1247 H264_CHROMA_MC(put_ , op_put)
1248 H264_CHROMA_MC(avg_ , op_avg)
/* Copy a 4-byte-wide block, h rows, using one 32-bit load/store per row.
 * LD32/ST32 are the project's 32-bit access macros — presumably
 * unaligned-safe; confirm against their definitions. */
1252 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1257 ST32(dst , LD32(src ));
/* Copy an 8-byte-wide block, h rows, as two 32-bit transfers per row. */
1263 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1268 ST32(dst , LD32(src ));
1269 ST32(dst+4 , LD32(src+4 ));
/* Copy a 16-byte-wide block, h rows, as four 32-bit transfers per row. */
1275 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1280 ST32(dst , LD32(src ));
1281 ST32(dst+4 , LD32(src+4 ));
1282 ST32(dst+8 , LD32(src+8 ));
1283 ST32(dst+12, LD32(src+12));
/* Copy a 17-byte-wide block (qpel16 source with one extra edge column),
 * h rows.  The first 16 bytes go as four 32-bit transfers; the 17th
 * byte's copy is presumably a separate byte store — not visible here. */
1289 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1294 ST32(dst , LD32(src ));
1295 ST32(dst+4 , LD32(src+4 ));
1296 ST32(dst+8 , LD32(src+8 ));
1297 ST32(dst+12, LD32(src+12));
/* Copy a 9-byte-wide block (qpel8 source with one extra edge column),
 * h rows.  Two 32-bit transfers cover bytes 0..7; the 9th byte's copy is
 * presumably a separate byte store — not visible here. */
1304 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1309 ST32(dst , LD32(src ));
1310 ST32(dst+4 , LD32(src+4 ));
/* QPEL_MC(r, OPNAME, RND, OP): generates the complete MPEG-4 quarter-pel
 * motion-compensation family (8x8 and 16x16, all 16 sub-pel positions)
 * for one rounding variant and output op.
 *
 * Horizontal halfpel lowpass, 8 wide: tap weights (20,-6,3,-1), sum 32
 * (OP renormalizes with >>5); source indices are mirrored at the block
 * edges instead of reading outside src[0..8]. */
1318 #define QPEL_MC(r, OPNAME, RND, OP) \
1319 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1320 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1324 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1325 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1326 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1327 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1328 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1329 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1330 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1331 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* Vertical halfpel lowpass, 8 tall: same (20,-6,3,-1) weights as the
 * horizontal filter, applied down a column; one column of src is loaded
 * into src0..src8 and edge taps are mirrored. */\
1337 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1339 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1343 const int src0= src[0*srcStride];\
1344 const int src1= src[1*srcStride];\
1345 const int src2= src[2*srcStride];\
1346 const int src3= src[3*srcStride];\
1347 const int src4= src[4*srcStride];\
1348 const int src5= src[5*srcStride];\
1349 const int src6= src[6*srcStride];\
1350 const int src7= src[7*srcStride];\
1351 const int src8= src[8*srcStride];\
1352 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1353 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1354 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1355 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1356 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1357 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1358 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1359 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* Horizontal halfpel lowpass, 16 wide: same (20,-6,3,-1) filter as the
 * 8-wide version, with mirroring at indices near 16. */\
1365 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1366 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1371 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1372 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1373 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1374 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1375 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1376 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1377 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1378 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1379 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1380 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1381 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1382 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1383 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1384 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1385 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1386 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* Vertical halfpel lowpass, 16 tall: per-column application of the
 * (20,-6,3,-1) filter with mirrored edge taps. */\
1392 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1393 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1398 const int src0= src[0*srcStride];\
1399 const int src1= src[1*srcStride];\
1400 const int src2= src[2*srcStride];\
1401 const int src3= src[3*srcStride];\
1402 const int src4= src[4*srcStride];\
1403 const int src5= src[5*srcStride];\
1404 const int src6= src[6*srcStride];\
1405 const int src7= src[7*srcStride];\
1406 const int src8= src[8*srcStride];\
1407 const int src9= src[9*srcStride];\
1408 const int src10= src[10*srcStride];\
1409 const int src11= src[11*srcStride];\
1410 const int src12= src[12*srcStride];\
1411 const int src13= src[13*srcStride];\
1412 const int src14= src[14*srcStride];\
1413 const int src15= src[15*srcStride];\
1414 const int src16= src[16*srcStride];\
1415 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1416 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1417 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1418 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1419 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1420 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1421 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1422 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1423 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1424 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1425 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1426 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1427 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1428 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1429 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1430 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* 8x8 sub-pel positions, axis-aligned cases.  mcXY: X = horizontal
 * quarter-pel offset, Y = vertical.  mc00 is full-pel copy; mc10/mc30
 * average the halfpel-H result with the nearer integer column; mc20 is
 * pure halfpel-H; mc01/mc03/mc02 are the vertical analogues (using a
 * 16-stride "full" staging buffer with one extra row). */\
1436 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1437 OPNAME ## pixels8_c(dst, src, stride, 8);\
1440 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1442 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1443 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1446 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1447 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1450 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1452 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1453 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1456 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1457 uint8_t full[16*9];\
1459 copy_block9(full, src, 16, stride, 9);\
1460 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1461 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1464 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1465 uint8_t full[16*9];\
1466 copy_block9(full, src, 16, stride, 9);\
1467 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1470 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1471 uint8_t full[16*9];\
1473 copy_block9(full, src, 16, stride, 9);\
1474 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1475 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
/* 8x8 diagonal quarter-pel cases.  Each position has two variants: the
 * exported ff_*_old_c functions implement the original 4-plane average
 * (full + halfH + halfV + halfHV), while the static non-old versions use
 * the cheaper 2-plane form: halfH is first averaged with the integer
 * rows (pixels8_l2 against "full"), then lowpassed vertically. */\
1477 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1478 uint8_t full[16*9];\
1481 uint8_t halfHV[64];\
1482 copy_block9(full, src, 16, stride, 9);\
1483 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1484 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1485 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1486 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1488 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1489 uint8_t full[16*9];\
1491 uint8_t halfHV[64];\
1492 copy_block9(full, src, 16, stride, 9);\
1493 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1494 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1495 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1496 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1498 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499 uint8_t full[16*9];\
1502 uint8_t halfHV[64];\
1503 copy_block9(full, src, 16, stride, 9);\
1504 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1505 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1506 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1507 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1509 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1510 uint8_t full[16*9];\
1512 uint8_t halfHV[64];\
1513 copy_block9(full, src, 16, stride, 9);\
1514 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1516 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1519 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520 uint8_t full[16*9];\
1523 uint8_t halfHV[64];\
1524 copy_block9(full, src, 16, stride, 9);\
1525 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1526 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1527 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1528 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1530 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1531 uint8_t full[16*9];\
1533 uint8_t halfHV[64];\
1534 copy_block9(full, src, 16, stride, 9);\
1535 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1537 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1540 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1541 uint8_t full[16*9];\
1544 uint8_t halfHV[64];\
1545 copy_block9(full, src, 16, stride, 9);\
1546 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1547 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1548 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1549 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1551 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1552 uint8_t full[16*9];\
1554 uint8_t halfHV[64];\
1555 copy_block9(full, src, 16, stride, 9);\
1556 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1557 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1558 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1559 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
/* 8x8 remaining cases: mc21/mc23 average halfH with the H+V-filtered
 * plane (halfHV); mc12/mc32 pre-average halfH with the integer columns
 * then apply the vertical lowpass; mc22 is the pure center halfpel
 * (H lowpass then V lowpass). */\
1561 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1563 uint8_t halfHV[64];\
1564 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1565 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1566 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1568 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1570 uint8_t halfHV[64];\
1571 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1572 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1573 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1575 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1576 uint8_t full[16*9];\
1579 uint8_t halfHV[64];\
1580 copy_block9(full, src, 16, stride, 9);\
1581 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1582 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1583 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1584 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1586 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1587 uint8_t full[16*9];\
1589 copy_block9(full, src, 16, stride, 9);\
1590 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1591 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1592 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1594 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1595 uint8_t full[16*9];\
1598 uint8_t halfHV[64];\
1599 copy_block9(full, src, 16, stride, 9);\
1600 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1601 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1602 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1603 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1605 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1606 uint8_t full[16*9];\
1608 copy_block9(full, src, 16, stride, 9);\
1609 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1610 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1611 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1613 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1615 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1616 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* 16x16 axis-aligned cases: identical structure to the 8x8 versions but
 * with a 24-stride, 17-row staging buffer (copy_block17) and the 16-wide
 * lowpass filters. */\
1618 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1619 OPNAME ## pixels16_c(dst, src, stride, 16);\
1622 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1624 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1625 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1628 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1629 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1632 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1634 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1635 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1638 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1639 uint8_t full[24*17];\
1641 copy_block17(full, src, 24, stride, 17);\
1642 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1643 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1646 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1647 uint8_t full[24*17];\
1648 copy_block17(full, src, 24, stride, 17);\
1649 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1652 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1653 uint8_t full[24*17];\
1655 copy_block17(full, src, 24, stride, 17);\
1656 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1657 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
/* 16x16 diagonal cases: same old/new split as the 8x8 versions —
 * ff_*_old_c uses the 4-plane average, the static versions use the
 * cheaper pre-averaged halfH + vertical lowpass form. */\
1659 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1660 uint8_t full[24*17];\
1661 uint8_t halfH[272];\
1662 uint8_t halfV[256];\
1663 uint8_t halfHV[256];\
1664 copy_block17(full, src, 24, stride, 17);\
1665 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1666 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1667 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1668 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1670 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1671 uint8_t full[24*17];\
1672 uint8_t halfH[272];\
1673 uint8_t halfHV[256];\
1674 copy_block17(full, src, 24, stride, 17);\
1675 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1676 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1677 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1678 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1680 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681 uint8_t full[24*17];\
1682 uint8_t halfH[272];\
1683 uint8_t halfV[256];\
1684 uint8_t halfHV[256];\
1685 copy_block17(full, src, 24, stride, 17);\
1686 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1687 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1688 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1689 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1691 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1692 uint8_t full[24*17];\
1693 uint8_t halfH[272];\
1694 uint8_t halfHV[256];\
1695 copy_block17(full, src, 24, stride, 17);\
1696 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1698 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1701 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702 uint8_t full[24*17];\
1703 uint8_t halfH[272];\
1704 uint8_t halfV[256];\
1705 uint8_t halfHV[256];\
1706 copy_block17(full, src, 24, stride, 17);\
1707 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1708 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1709 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1710 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1712 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1713 uint8_t full[24*17];\
1714 uint8_t halfH[272];\
1715 uint8_t halfHV[256];\
1716 copy_block17(full, src, 24, stride, 17);\
1717 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1719 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1722 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1723 uint8_t full[24*17];\
1724 uint8_t halfH[272];\
1725 uint8_t halfV[256];\
1726 uint8_t halfHV[256];\
1727 copy_block17(full, src, 24, stride, 17);\
1728 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1729 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1730 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1731 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1733 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1734 uint8_t full[24*17];\
1735 uint8_t halfH[272];\
1736 uint8_t halfHV[256];\
1737 copy_block17(full, src, 24, stride, 17);\
1738 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1739 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1740 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1741 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* 16x16 remaining cases, mirroring the 8x8 mc21/mc23/mc12/mc32/mc22
 * strategies at double the block size. */\
1743 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1744 uint8_t halfH[272];\
1745 uint8_t halfHV[256];\
1746 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1747 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1748 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1750 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1751 uint8_t halfH[272];\
1752 uint8_t halfHV[256];\
1753 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1754 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1755 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1757 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1758 uint8_t full[24*17];\
1759 uint8_t halfH[272];\
1760 uint8_t halfV[256];\
1761 uint8_t halfHV[256];\
1762 copy_block17(full, src, 24, stride, 17);\
1763 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1764 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1765 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1766 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1768 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1769 uint8_t full[24*17];\
1770 uint8_t halfH[272];\
1771 copy_block17(full, src, 24, stride, 17);\
1772 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1773 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1774 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1776 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1777 uint8_t full[24*17];\
1778 uint8_t halfH[272];\
1779 uint8_t halfV[256];\
1780 uint8_t halfHV[256];\
1781 copy_block17(full, src, 24, stride, 17);\
1782 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1783 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1784 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1785 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1787 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1788 uint8_t full[24*17];\
1789 uint8_t halfH[272];\
1790 copy_block17(full, src, 24, stride, 17);\
1791 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1792 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1793 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1795 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1796 uint8_t halfH[272];\
1797 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1798 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Output ops for QPEL_MC: the filter sums carry a factor of 32, so
 * (+16)>>5 is the rounded result and (+15)>>5 the no-round variant; cm[]
 * (cropTbl + MAX_NEG_CROP) clamps to 0..255.  op_avg folds the result
 * into dst with +1 rounding.  NOTE(review): these names were also
 * defined for the chroma MC above — presumably #undef'd in between. */
1801 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1802 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1803 #define op_put(a, b) a = cm[((b) + 16)>>5]
1804 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate rounded put, no-round put, and rounded avg families. */
1806 QPEL_MC(0, put_ , _ , op_put)
1807 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1808 QPEL_MC(0, avg_ , _ , op_avg)
1809 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1811 #undef op_avg_no_rnd
1813 #undef op_put_no_rnd
1816 #define H264_LOWPASS(OPNAME, OP, OP2) \
1817 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1819 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1823 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1824 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1825 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1826 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
1832 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1834 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1838 const int srcB= src[-2*srcStride];\
1839 const int srcA= src[-1*srcStride];\
1840 const int src0= src[0 *srcStride];\
1841 const int src1= src[1 *srcStride];\
1842 const int src2= src[2 *srcStride];\
1843 const int src3= src[3 *srcStride];\
1844 const int src4= src[4 *srcStride];\
1845 const int src5= src[5 *srcStride];\
1846 const int src6= src[6 *srcStride];\
1847 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1848 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1849 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1850 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1856 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1859 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1861 src -= 2*srcStride;\
1862 for(i=0; i<h+5; i++)\
1864 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1865 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1866 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1867 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1871 tmp -= tmpStride*(h+5-2);\
1874 const int tmpB= tmp[-2*tmpStride];\
1875 const int tmpA= tmp[-1*tmpStride];\
1876 const int tmp0= tmp[0 *tmpStride];\
1877 const int tmp1= tmp[1 *tmpStride];\
1878 const int tmp2= tmp[2 *tmpStride];\
1879 const int tmp3= tmp[3 *tmpStride];\
1880 const int tmp4= tmp[4 *tmpStride];\
1881 const int tmp5= tmp[5 *tmpStride];\
1882 const int tmp6= tmp[6 *tmpStride];\
1883 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1884 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1885 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1886 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1892 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1894 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1898 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1899 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1900 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1901 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1902 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1903 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1904 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1905 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1911 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1913 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1917 const int srcB= src[-2*srcStride];\
1918 const int srcA= src[-1*srcStride];\
1919 const int src0= src[0 *srcStride];\
1920 const int src1= src[1 *srcStride];\
1921 const int src2= src[2 *srcStride];\
1922 const int src3= src[3 *srcStride];\
1923 const int src4= src[4 *srcStride];\
1924 const int src5= src[5 *srcStride];\
1925 const int src6= src[6 *srcStride];\
1926 const int src7= src[7 *srcStride];\
1927 const int src8= src[8 *srcStride];\
1928 const int src9= src[9 *srcStride];\
1929 const int src10=src[10*srcStride];\
1930 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1931 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1932 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1933 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1934 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1935 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1936 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1937 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
/* Two-pass H.264 half-pel filter (center position): first a horizontal   \
   6-tap pass into a 16-bit intermediate `tmp` (unclipped), then a        \
   vertical 6-tap pass on tmp stored through OP2 ((x+512)>>10 rounding,   \
   see op2_put/op2_avg below).                                            \
   NOTE(review): loop headers and brace lines are elided in this extract. */\
1943 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1946 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
/* Start two rows early so the vertical pass has its full 6-tap support. */\
1948 src -= 2*srcStride;\
1949 for(i=0; i<h+5; i++)\
1951 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1952 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1953 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1954 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1955 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1956 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1957 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1958 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
/* Rewind tmp to two rows above the first output row for the vertical pass. */\
1962 tmp -= tmpStride*(h+5-2);\
1965 const int tmpB= tmp[-2*tmpStride];\
1966 const int tmpA= tmp[-1*tmpStride];\
1967 const int tmp0= tmp[0 *tmpStride];\
1968 const int tmp1= tmp[1 *tmpStride];\
1969 const int tmp2= tmp[2 *tmpStride];\
1970 const int tmp3= tmp[3 *tmpStride];\
1971 const int tmp4= tmp[4 *tmpStride];\
1972 const int tmp5= tmp[5 *tmpStride];\
1973 const int tmp6= tmp[6 *tmpStride];\
1974 const int tmp7= tmp[7 *tmpStride];\
1975 const int tmp8= tmp[8 *tmpStride];\
1976 const int tmp9= tmp[9 *tmpStride];\
1977 const int tmp10=tmp[10*tmpStride];\
1978 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1979 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1980 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1981 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1982 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
1983 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
1984 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
1985 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16x16 vertical half-pel filter built from four 8x8 quadrant calls. */\
1991 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1992 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1993 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1994 src += 8*srcStride;\
1995 dst += 8*dstStride;\
1996 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1997 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
/* 16x16 horizontal half-pel filter built from four 8x8 quadrant calls. */\
2000 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2001 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2002 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2003 src += 8*srcStride;\
2004 dst += 8*dstStride;\
2005 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2006 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
/* 16x16 center (hv) half-pel filter built from four 8x8 quadrant calls; \
   `tmp` is the shared 16-bit intermediate buffer. */\
2009 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2010 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2011 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2012 src += 8*srcStride;\
2013 tmp += 8*tmpStride;\
2014 dst += 8*dstStride;\
2015 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2016 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* Generates the H.264 quarter-pel motion-compensation entry points
   (mcXY, where X/Y are the horizontal/vertical quarter-pel offsets 0..3)
   for one OPNAME (put/avg) and block SIZE.  Quarter-pel positions are
   built by averaging a half-pel-filtered plane with a neighboring
   full- or half-pel plane via pixelsN_l2.  `full`/`full_mid` hold a
   copy of the source extended by 5 rows (2 above, 3 below) so the
   vertical 6-tap filter has its full support.
   NOTE(review): several closing-brace lines are elided in this extract. */
2019 #define H264_MC(OPNAME, SIZE) \
/* mc00: integer-pel position, plain copy/average. */\
2020 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2021 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
/* mc10/mc30: horizontal quarter-pel = average of h-half-pel and the
   nearer integer column (src or src+1). */\
2024 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2025 uint8_t half[SIZE*SIZE];\
2026 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2027 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2030 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2031 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2034 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2035 uint8_t half[SIZE*SIZE];\
2036 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2037 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
/* mc01/mc02/mc03: vertical positions, analogous to the horizontal ones
   but using the vertically extended copy in full/full_mid. */\
2040 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2041 uint8_t full[SIZE*(SIZE+5)];\
2042 uint8_t * const full_mid= full + SIZE*2;\
2043 uint8_t half[SIZE*SIZE];\
2044 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2045 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2046 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2049 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2050 uint8_t full[SIZE*(SIZE+5)];\
2051 uint8_t * const full_mid= full + SIZE*2;\
2052 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2053 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2056 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2057 uint8_t full[SIZE*(SIZE+5)];\
2058 uint8_t * const full_mid= full + SIZE*2;\
2059 uint8_t half[SIZE*SIZE];\
2060 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2061 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2062 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
/* mc11/mc31/mc13/mc33: diagonal positions = average of a horizontal
   half-pel plane and a vertical half-pel plane taken at the nearer
   integer row/column. */\
2065 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2066 uint8_t full[SIZE*(SIZE+5)];\
2067 uint8_t * const full_mid= full + SIZE*2;\
2068 uint8_t halfH[SIZE*SIZE];\
2069 uint8_t halfV[SIZE*SIZE];\
2070 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2071 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2072 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2073 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2076 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2077 uint8_t full[SIZE*(SIZE+5)];\
2078 uint8_t * const full_mid= full + SIZE*2;\
2079 uint8_t halfH[SIZE*SIZE];\
2080 uint8_t halfV[SIZE*SIZE];\
2081 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2082 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2083 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2084 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2087 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2088 uint8_t full[SIZE*(SIZE+5)];\
2089 uint8_t * const full_mid= full + SIZE*2;\
2090 uint8_t halfH[SIZE*SIZE];\
2091 uint8_t halfV[SIZE*SIZE];\
2092 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2093 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2094 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2095 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2098 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2099 uint8_t full[SIZE*(SIZE+5)];\
2100 uint8_t * const full_mid= full + SIZE*2;\
2101 uint8_t halfH[SIZE*SIZE];\
2102 uint8_t halfV[SIZE*SIZE];\
2103 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2104 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2105 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2106 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* mc22: exact center, single hv two-pass filter. */\
2109 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2110 int16_t tmp[SIZE*(SIZE+5)];\
2111 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
/* mc21/mc23/mc12/mc32: quarter-pel next to the center = average of the
   hv (center) plane with the adjacent horizontal or vertical half-pel
   plane. */\
2114 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2115 int16_t tmp[SIZE*(SIZE+5)];\
2116 uint8_t halfH[SIZE*SIZE];\
2117 uint8_t halfHV[SIZE*SIZE];\
2118 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2119 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2120 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2123 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2124 int16_t tmp[SIZE*(SIZE+5)];\
2125 uint8_t halfH[SIZE*SIZE];\
2126 uint8_t halfHV[SIZE*SIZE];\
2127 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2128 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2129 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2132 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2133 uint8_t full[SIZE*(SIZE+5)];\
2134 uint8_t * const full_mid= full + SIZE*2;\
2135 int16_t tmp[SIZE*(SIZE+5)];\
2136 uint8_t halfV[SIZE*SIZE];\
2137 uint8_t halfHV[SIZE*SIZE];\
2138 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2139 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2140 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2141 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2144 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2145 uint8_t full[SIZE*(SIZE+5)];\
2146 uint8_t * const full_mid= full + SIZE*2;\
2147 int16_t tmp[SIZE*(SIZE+5)];\
2148 uint8_t halfV[SIZE*SIZE];\
2149 uint8_t halfHV[SIZE*SIZE];\
2150 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2151 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2152 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2153 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store macros plugged into H264_LOWPASS as OP/OP2:
   - op_put/op_avg: single-pass results, rounded with (x+16)>>5
     (filter gain 32) and clipped via the cm crop table.
   - op2_put/op2_avg: two-pass (hv) results, rounded with (x+512)>>10
     (gain 32*32).
   The avg variants round-average with the existing dst pixel. */
2156 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2157 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2158 #define op_put(a, b) a = cm[((b) + 16)>>5]
2159 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2160 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the full put_/avg_ H.264 lowpass function families. */
2162 H264_LOWPASS(put_ , op_put, op2_put)
2163 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* WMV2 horizontal half-pel filter: 4-tap (-1,9,9,-1)/16 with +8 rounding,
   clipped through the cm crop table; 8 pixels per row, h rows.
   NOTE(review): the row loop and pointer-advance lines are elided in
   this extract. */
2177 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2178 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2182 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2183 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2184 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2185 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2186 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2187 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2188 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2189 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* WMV2 vertical half-pel filter: same 4-tap (-1,9,9,-1)/16 kernel as the
   horizontal version, applied down a column; 8 rows per column, w columns.
   NOTE(review): the column loop and pointer-advance lines are elided in
   this extract. */
2195 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2196 uint8_t *cm = cropTbl + MAX_NEG_CROP;
/* One row above and one below the 8 outputs (4-tap support). */
2200 const int src_1= src[ -srcStride];
2201 const int src0 = src[0 ];
2202 const int src1 = src[ srcStride];
2203 const int src2 = src[2*srcStride];
2204 const int src3 = src[3*srcStride];
2205 const int src4 = src[4*srcStride];
2206 const int src5 = src[5*srcStride];
2207 const int src6 = src[6*srcStride];
2208 const int src7 = src[7*srcStride];
2209 const int src8 = src[8*srcStride];
2210 const int src9 = src[9*srcStride];
2211 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2212 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2213 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2214 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2215 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2216 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2217 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2218 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* mc00: integer-pel position — plain 8x8 copy. */
2224 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2225 put_pixels8_c(dst, src, stride, 8);
/* mc10: horizontal quarter-pel — average src with the h-lowpass half-pel.
   NOTE(review): the `half` buffer declaration is elided in this extract. */
2228 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2230 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2231 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
/* mc20: horizontal half-pel — single h-lowpass pass straight to dst. */
2234 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2235 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* mc30: horizontal three-quarter-pel — average src+1 with the half-pel.
   NOTE(review): the `half` buffer declaration is elided in this extract. */
2238 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2240 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2241 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
/* mc02: vertical half-pel — single v-lowpass pass straight to dst. */
2244 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2245 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* mc12: average of the vertical half-pel plane and the hv plane
   (h-filter 11 rows starting one above, then v-filter that result).
   NOTE(review): halfH/halfV/halfHV buffer declarations are elided in
   this extract. */
2248 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2252 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2253 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2254 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2255 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* mc32: like mc12 but the vertical half-pel plane is taken at src+1.
   NOTE(review): halfH/halfV/halfHV buffer declarations are elided in
   this extract. */
2257 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2261 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2262 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2263 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2264 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* mc22: center position — h-filter 11 rows, then v-filter into dst.
   NOTE(review): the halfH buffer declaration is elided in this extract. */
2266 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2268 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2269 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking filter across a horizontal block edge (filters
   vertically, pixels p0..p3 straddling the edge at each column x).
   Strength is looked up from qscale.
   NOTE(review): the column loop, d1/d2/ad1 declarations and some
   clamping lines are elided in this extract. */
2272 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2274 const int strength= ff_h263_loop_filter_strength[qscale];
2278 int p0= src[x-2*stride];
2279 int p1= src[x-1*stride];
2280 int p2= src[x+0*stride];
2281 int p3= src[x+1*stride];
2282 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* Piecewise-linear ramp: d1 follows d near zero, tapers to zero
   beyond 2*strength in either direction. */
2284 if (d<-2*strength) d1= 0;
2285 else if(d<- strength) d1=-2*strength - d;
2286 else if(d< strength) d1= d;
2287 else if(d< 2*strength) d1= 2*strength - d;
/* If p1/p2 over- or underflowed 0..255 (bit 8 set), clamp: ~(p>>31)
   yields 255 for overflow (p>0) and 0 for underflow (p<0). */
2292 if(p1&256) p1= ~(p1>>31);
2293 if(p2&256) p2= ~(p2>>31);
2295 src[x-1*stride] = p1;
2296 src[x+0*stride] = p2;
/* Secondary, weaker correction of the outer pixels, clipped to +-ad1. */
2300 d2= clip((p0-p3)/4, -ad1, ad1);
2302 src[x-2*stride] = p0 - d2;
2303 src[x+ stride] = p3 + d2;
/* H.263 deblocking filter across a vertical block edge (filters
   horizontally, pixels p0..p3 straddling the edge at each row y).
   Mirror of h263_v_loop_filter_c with x/y roles swapped.
   NOTE(review): the row loop, d1/d2/ad1 declarations and some clamping
   lines are elided in this extract. */
2307 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2309 const int strength= ff_h263_loop_filter_strength[qscale];
2313 int p0= src[y*stride-2];
2314 int p1= src[y*stride-1];
2315 int p2= src[y*stride+0];
2316 int p3= src[y*stride+1];
2317 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2319 if (d<-2*strength) d1= 0;
2320 else if(d<- strength) d1=-2*strength - d;
2321 else if(d< strength) d1= d;
2322 else if(d< 2*strength) d1= 2*strength - d;
/* Clamp p1/p2 to 0..255 on over/underflow (see vertical variant). */
2327 if(p1&256) p1= ~(p1>>31);
2328 if(p2&256) p2= ~(p2>>31);
2330 src[y*stride-1] = p1;
2331 src[y*stride+0] = p2;
2335 d2= clip((p0-p3)/4, -ad1, ad1);
2337 src[y*stride-2] = p0 - d2;
2338 src[y*stride+1] = p3 + d2;
/* Sum of absolute differences (SAD) over a 16-wide block, h rows.
   NOTE(review): accumulator init, row loop, pointer advances and the
   return are elided in this extract. */
2342 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2348 s += abs(pix1[0] - pix2[0]);
2349 s += abs(pix1[1] - pix2[1]);
2350 s += abs(pix1[2] - pix2[2]);
2351 s += abs(pix1[3] - pix2[3]);
2352 s += abs(pix1[4] - pix2[4]);
2353 s += abs(pix1[5] - pix2[5]);
2354 s += abs(pix1[6] - pix2[6]);
2355 s += abs(pix1[7] - pix2[7]);
2356 s += abs(pix1[8] - pix2[8]);
2357 s += abs(pix1[9] - pix2[9]);
2358 s += abs(pix1[10] - pix2[10]);
2359 s += abs(pix1[11] - pix2[11]);
2360 s += abs(pix1[12] - pix2[12]);
2361 s += abs(pix1[13] - pix2[13]);
2362 s += abs(pix1[14] - pix2[14]);
2363 s += abs(pix1[15] - pix2[15]);
/* SAD of pix1 against pix2 half-pel-interpolated horizontally
   (avg2 of each pixel and its right neighbor), 16 wide, h rows.
   NOTE(review): loop/return scaffolding is elided in this extract. */
2370 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2376 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2377 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2378 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2379 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2380 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2381 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2382 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2383 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2384 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2385 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2386 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2387 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2388 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2389 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2390 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2391 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of pix1 against pix2 half-pel-interpolated vertically
   (avg2 of each pixel and the one a row below, via pix3), 16 wide.
   NOTE(review): loop/return scaffolding is elided in this extract. */
2398 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2401 uint8_t *pix3 = pix2 + line_size;
2405 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2406 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2407 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2408 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2409 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2410 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2411 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2412 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2413 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2414 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2415 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2416 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2417 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2418 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2419 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2420 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of pix1 against pix2 half-pel-interpolated both ways (avg4 of the
   2x2 neighborhood spanning this row and the next), 16 wide.
   NOTE(review): loop/return scaffolding is elided in this extract. */
2428 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2431 uint8_t *pix3 = pix2 + line_size;
2435 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2436 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2437 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2438 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2439 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2440 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2441 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2442 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2443 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2444 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2445 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2446 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2447 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2448 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2449 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2450 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* SAD over an 8-wide block, h rows (8-pixel variant of pix_abs16_c).
   NOTE(review): loop/return scaffolding is elided in this extract. */
2458 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2464 s += abs(pix1[0] - pix2[0]);
2465 s += abs(pix1[1] - pix2[1]);
2466 s += abs(pix1[2] - pix2[2]);
2467 s += abs(pix1[3] - pix2[3]);
2468 s += abs(pix1[4] - pix2[4]);
2469 s += abs(pix1[5] - pix2[5]);
2470 s += abs(pix1[6] - pix2[6]);
2471 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD against horizontally half-pel-interpolated pix2.
   NOTE(review): loop/return scaffolding is elided in this extract. */
2478 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2484 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2485 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2486 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2487 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2488 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2489 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2490 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2491 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD against vertically half-pel-interpolated pix2.
   NOTE(review): loop/return scaffolding is elided in this extract. */
2498 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2501 uint8_t *pix3 = pix2 + line_size;
2505 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2506 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2507 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2508 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2509 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2510 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2511 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2512 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD against 2x2-averaged (half-pel both ways) pix2.
   NOTE(review): loop/return scaffolding is elided in this extract. */
2520 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2523 uint8_t *pix3 = pix2 + line_size;
2527 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2528 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2529 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2530 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2531 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2532 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2533 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2534 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Weighted squared error of the residual after adding a scaled basis
   function: b = rem + round(basis*scale >> (BASIS_SHIFT-RECON_SHIFT)).
   NOTE(review): the `w` weight load, sum init and return are elided in
   this extract. */
2542 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2546 for(i=0; i<8*8; i++){
2547 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2550 assert(-512<b && b<512);
2552 sum += (w*b)*(w*b)>>4;
/* Add a scaled basis function into the residual, with the same rounding
   as try_8x8basis_c. */
2557 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2560 for(i=0; i<8*8; i++){
2561 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2566  * permutes an 8x8 block.
2567  * @param block the block which will be permuted according to the given permutation vector
2568  * @param permutation the permutation vector
2569  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2570  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2571  * (inverse) permuted to scantable order!
2573 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2579 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
/* Copy the significant coefficients aside, then write them back at their
   permuted positions.  NOTE(review): the `temp` buffer declaration and
   the copy-into-temp statement are elided in this extract. */
2581 for(i=0; i<=last; i++){
2582 const int j= scantable[i];
2587 for(i=0; i<=last; i++){
2588 const int j= scantable[i];
2589 const int perm_j= permutation[j];
2590 block[perm_j]= temp[j];
/* Trivial compare function used as the "no-op" metric in ff_set_cmp;
   NOTE(review): body (presumably `return 0;`) is elided in this extract. */
2594 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fill a 5-entry compare-function table from the DSPContext according to
   the requested comparison `type` (SAD/Hadamard/DCT-SAD/quant-PSNR/...).
   NOTE(review): most of the switch statement is elided in this extract;
   only a few case bodies and the error path remain visible. */
2598 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2601 memset(cmp, 0, sizeof(void*)*5);
2609 cmp[i]= c->hadamard8_diff[i];
2615 cmp[i]= c->dct_sad[i];
2618 cmp[i]= c->quant_psnr[i];
2636 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2642  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2644 static void clear_blocks_c(DCTELEM *blocks)
2646     memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for i in [0,w): manually unrolled by 8, with a
   scalar tail loop for the remaining bytes.
   NOTE(review): the tail-loop header before the last statement is
   elided in this extract. */
2649 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
2651 for(i=0; i+7<w; i+=8){
2652 dst[i+0] += src[i+0];
2653 dst[i+1] += src[i+1];
2654 dst[i+2] += src[i+2];
2655 dst[i+3] += src[i+3];
2656 dst[i+4] += src[i+4];
2657 dst[i+5] += src[i+5];
2658 dst[i+6] += src[i+6];
2659 dst[i+7] += src[i+7];
2662 dst[i+0] += src[i+0];
/* dst[i] = src1[i] - src2[i] for i in [0,w): unrolled by 8 plus a
   scalar tail, mirroring add_bytes_c.
   NOTE(review): the tail-loop header before the last statement is
   elided in this extract. */
2665 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
2667 for(i=0; i+7<w; i+=8){
2668 dst[i+0] = src1[i+0]-src2[i+0];
2669 dst[i+1] = src1[i+1]-src2[i+1];
2670 dst[i+2] = src1[i+2]-src2[i+2];
2671 dst[i+3] = src1[i+3]-src2[i+3];
2672 dst[i+4] = src1[i+4]-src2[i+4];
2673 dst[i+5] = src1[i+5]-src2[i+5];
2674 dst[i+6] = src1[i+6]-src2[i+6];
2675 dst[i+7] = src1[i+7]-src2[i+7];
2678 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median-prediction residual: subtracts the median of left,
   above and (left+above-above_left) from each sample.
   NOTE(review): almost the entire body (loop, l/lt updates, left/left_top
   write-back) is elided in this extract. */
2681 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
2689 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Butterfly helpers for the 8x8 Hadamard transform below.
   NOTE(review): the bodies of BUTTERFLY2 and BUTTERFLY1 are elided in
   this extract; BUTTERFLYA computes |x+y| + |x-y| (final-stage sum of
   absolute transform values). */
2699 #define BUTTERFLY2(o1,o2,i1,i2) \
2703 #define BUTTERFLY1(x,y) \
2712 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the src-dst difference, summing the
   absolute transform coefficients.  Rows are transformed first, then
   columns, with BUTTERFLYA accumulating the final stage.
   NOTE(review): loop headers, `temp`/`sum` declarations, the return and
   what appears to be disabled debug code around the printf are elided in
   this extract. */
2714 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2722 //FIXME try pointer walks
2723 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2724 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2725 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2726 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2728 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2729 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2730 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2731 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2733 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2734 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2735 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2736 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column pass over the row-transformed coefficients. */
2740 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2741 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2742 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2743 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2745 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2746 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2747 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2748 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2751 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2752 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2753 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2754 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2760 printf("MAX:%d\n", maxi);
/* Intra SATD: 8x8 Hadamard transform of src itself (no reference),
   summing absolute coefficients and subtracting the DC term (mean) at
   the end so the score reflects AC energy only.
   NOTE(review): loop headers, `temp`/`sum` declarations and the return
   are elided in this extract. */
2766 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2774 //FIXME try pointer walks
2775 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2776 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2777 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2778 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2780 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2781 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2782 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2783 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2785 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2786 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2787 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2788 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2792 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2793 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2794 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2795 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2797 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2798 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2799 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2800 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2803 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2804 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2805 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2806 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2809 sum -= ABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-SAD metric: forward-DCT the 8x8 pixel difference, then sum the
   absolute coefficients.
   NOTE(review): the fdct call and accumulation/return are elided in
   this extract. */
2814 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2815 MpegEncContext * const s= (MpegEncContext *)c;
/* __align8 keeps the DCT buffer 8-byte aligned for SIMD fdct/idct. */
2816 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2817 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2822 s->dsp.diff_pixels(temp, src1, src2, stride);
2831 void simple_idct(DCTELEM *block); //FIXME
/* Quant-PSNR metric: DCT the difference, keep an unquantized copy (bak),
   quantize + dequantize + IDCT, then sum the squared coefficient error
   introduced by quantization.
   NOTE(review): the fdct call, loop header and return are elided in this
   extract. */
2833 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2834 MpegEncContext * const s= (MpegEncContext *)c;
2835 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2836 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2837 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
2843 s->dsp.diff_pixels(temp, src1, src2, stride);
2845 memcpy(bak, temp, 64*sizeof(DCTELEM));
2847 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2848 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2849 simple_idct(temp); //FIXME
2852 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric: encodes the 8x8 difference (DCT + quantize),
   estimates the bit cost from the VLC length tables, reconstructs
   (dequantize + idct_add onto a backup of src2) and returns
   distortion + lambda*bits.  ("distoration" is a historical typo kept
   for byte-compatibility.)
   NOTE(review): loop headers, intra/inter branching and several
   statements are elided in this extract. */
2857 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2858 MpegEncContext * const s= (MpegEncContext *)c;
2859 const uint8_t *scantable= s->intra_scantable.permutated;
2860 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2861 uint64_t __align8 aligned_bak[stride];
2862 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2863 uint8_t * const bak= (uint8_t*)aligned_bak;
2864 int i, last, run, bits, level, distoration, start_i;
2865 const int esc_length= s->ac_esc_length;
2867 uint8_t * last_length;
/* Back up the 8x8 src2 block (two 32-bit copies per row) so we can
   reconstruct into it without touching the caller's data. */
2872 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2873 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2876 s->dsp.diff_pixels(temp, src1, src2, stride);
2878 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2884 length = s->intra_ac_vlc_length;
2885 last_length= s->intra_ac_vlc_last_length;
2886 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2889 length = s->inter_ac_vlc_length;
2890 last_length= s->inter_ac_vlc_last_length;
/* Walk the scan order counting run/level VLC bits; levels outside
   -64..63 cost the escape length instead. */
2895 for(i=start_i; i<last; i++){
2896 int j= scantable[i];
2901 if((level&(~127)) == 0){
2902 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2911 level= temp[i] + 64;
2915 if((level&(~127)) == 0){
2916 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2924 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2926 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2929 s->dsp.idct_add(bak, stride, temp);
2931 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
2933 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-count metric: same DCT + quantize + VLC-length walk as rd8x8_c,
   but returns only the estimated bit cost (no reconstruction/SSE).
   NOTE(review): loop headers, intra/inter branching and the return are
   elided in this extract. */
2936 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2937 MpegEncContext * const s= (MpegEncContext *)c;
2938 const uint8_t *scantable= s->intra_scantable.permutated;
2939 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2940 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2941 int i, last, run, bits, level, start_i;
2942 const int esc_length= s->ac_esc_length;
2944 uint8_t * last_length;
2948 s->dsp.diff_pixels(temp, src1, src2, stride);
2950 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2956 length = s->intra_ac_vlc_length;
2957 last_length= s->intra_ac_vlc_last_length;
2958 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2961 length = s->inter_ac_vlc_length;
2962 last_length= s->inter_ac_vlc_last_length;
2967 for(i=start_i; i<last; i++){
2968 int j= scantable[i];
2973 if((level&(~127)) == 0){
2974 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2983 level= temp[i] + 64;
2987 if((level&(~127)) == 0){
2988 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-SAD (intra): sum of |s[x] - s[x+stride]| over a 16-wide
   block — a measure of vertical activity within one frame.
   NOTE(review): the row loop and return are elided in this extract. */
2996 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3001 for(x=0; x<16; x+=4){
3002 score+= ABS(s[x ] - s[x +stride]) + ABS(s[x+1] - s[x+1+stride])
3003 +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
/* Vertical-SAD (inter): sum of |vertical gradient of (s1 - s2)| over a
   16-wide block.
   NOTE(review): the row loop and return are elided in this extract. */
3011 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3016 for(x=0; x<16; x++){
3017 score+= ABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Squared-difference helper for the vsse metrics below. */
3026 #define SQ(a) ((a)*(a))
/* Vertical-SSE (intra): squared-error variant of vsad_intra16_c.
   NOTE(review): the row loop and return are elided in this extract. */
3027 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3032 for(x=0; x<16; x+=4){
3033 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3034 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* Vertical-SSE (inter): squared-error variant of vsad16_c.
   NOTE(review): the row loop and return are elided in this extract. */
3042 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3047 for(x=0; x<16; x++){
3048 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Build 16x16 versions of the 8x8 compare metrics: WARPER8_16_SQ calls
   the 8x8 function on each quadrant and combines the scores. */
3057 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3058 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3059 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3060 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3061 WARPER8_16_SQ(rd8x8_c, rd16_c)
3062 WARPER8_16_SQ(bit8x8_c, bit16_c)
3064 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* Reference (jpeg-style) IDCT wrappers: run the IDCT, then store/add the
   clamped result into the destination picture.
   NOTE(review): the j_rev_dct calls inside each wrapper are elided in
   this extract. */
3066 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3069 put_pixels_clamped_c(block, dest, line_size);
3071 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3074 add_pixels_clamped_c(block, dest, line_size);
3077 /* init static data */
/* One-time initialization of the shared lookup tables:
   - cropTbl: clamp-to-[0,255] table with MAX_NEG_CROP guard bands on
     both sides (underflow maps to 0, overflow to 255).
   - squareTbl: (i-256)^2 for fast squared-difference lookups.
   - inv_zigzag_direct16: inverse zigzag + 1, used by the MMX quantizer. */
3078 void dsputil_static_init(void)
3082 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3083 for(i=0;i<MAX_NEG_CROP;i++) {
3085 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3088 for(i=0;i<512;i++) {
3089 squareTbl[i] = (i - 256) * (i - 256);
3092 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/**
 * Populate a DSPContext with the portable C implementations of every DSP
 * routine, choosing the (F)DCT/IDCT algorithm from avctx, then hand the
 * context to the architecture-specific initializers so they can override
 * individual function pointers with optimized versions.
 * NOTE(review): this excerpt is heavily elided -- many closing braces,
 * else branches, #ifdef guards and table entries fall between the visible
 * lines; the function also continues past the end of this excerpt.
 */
void dsputil_init(DSPContext* c, AVCodecContext *avctx)
#ifdef CONFIG_ENCODERS
    /* ---- forward DCT selection (encoders only) ---- */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct = ff_faandct;
        c->fdct248 = ff_faandct248;
        c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
#endif //CONFIG_ENCODERS
    /* ---- inverse DCT selection; permutation_type records the coefficient
     * ordering the chosen IDCT expects ---- */
    if(avctx->idct_algo==FF_IDCT_INT){
        c->idct_put= ff_jref_idct_put;
        c->idct_add= ff_jref_idct_add;
        c->idct    = j_rev_dct;
        c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
    }else{ //accurate/default
        c->idct_put= simple_idct_put;
        c->idct_add= simple_idct_add;
        c->idct    = simple_idct;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    /* VP3 DSP support */
    c->vp3_dsp_init = vp3_dsp_init_c;
    c->vp3_idct_put = vp3_idct_put_c;
    c->vp3_idct_add = vp3_idct_add_c;
    /* ---- basic pixel block helpers ---- */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;
    /* TODO [0] 16 [1] 8 */
    /* pix_abs[size][mode]: size 0 = 16-wide, 1 = 8-wide;
     * mode 0/1/2/3 = none / x-half / y-half / xy-half pel interpolation */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;
/* Fill one half-pel motion-compensation table row: [0..3] = copy, x-half,
 * y-half, xy-half variants, built by token-pasting the prefix and size. */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c;     \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c;  \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c;  \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg_no_rnd, 1, 8);
    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
    /* ---- third-pel MC table; index encodes the (x,y) third-pel offset as
     * y*4 + x, so indices 3, 7 and 11+ are unused ---- */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* Redefine dspfunc for quarter-pel MC: all 16 (x,y) quarter-pel positions,
 * index = y*4 + x. */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);
    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */
    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);
    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */
    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);
    /* H.264 chroma MC: index 0/1/2 = 8/4/2-pixel-wide blocks */
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    /* WMV2 ("mspel") 8-pixel MC variants */
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* Install a comparison function pair: slot [0] = 16-wide, [1] = 8x8. */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;
    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    SET_CMP_FUNC(dct_sad)
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    SET_CMP_FUNC(quant_psnr)
    /* slot [4] holds the intra (single-block) variant */
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    /* ---- HuffYUV / misc byte helpers ---- */
    c->add_bytes= add_bytes_c;
    c->diff_bytes= diff_bytes_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->bswap_buf= bswap_buf;
    c->h263_h_loop_filter= h263_h_loop_filter_c;
    c->h263_v_loop_filter= h263_v_loop_filter_c;
    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;
    /* ---- architecture-specific overrides.
     * NOTE(review): these calls are normally wrapped in per-arch #ifdef
     * guards (HAVE_MMX, ARCH_ARMV4L, ...) which are elided from this
     * excerpt -- confirm against the full file. ---- */
    dsputil_init_mmx(c, avctx);
    dsputil_init_armv4l(c, avctx);
    dsputil_init_mlib(c, avctx);
    dsputil_init_alpha(c, avctx);
    dsputil_init_ppc(c, avctx);
    dsputil_init_mmi(c, avctx);
    dsputil_init_sh4(c,avctx);
    /* ---- build idct_permutation[] to match the coefficient order the
     * selected IDCT expects (loop headers elided in this excerpt) ---- */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        /* identity mapping */
        c->idct_permutation[i]= i;
    case FF_LIBMPEG2_IDCT_PERM:
        /* swap the two low bit-pairs of the column index (libmpeg2 order) */
        c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
    case FF_SIMPLE_IDCT_PERM:
        c->idct_permutation[i]= simple_mmx_permutation[i];
    case FF_TRANSPOSE_IDCT_PERM:
        /* transpose the 8x8 index: row/column bit fields swapped */
        c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");