3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
/* Clamp lookup table: cropTbl + MAX_NEG_CROP maps an out-of-range signed
   value to the nearest uint8 (filled at init elsewhere in this file). */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* squareTbl + 256 maps x in [-256,255] to x*x (filled at init elsewhere);
   used by the pix_norm/sse functions below. */
uint32_t squareTbl[512];
/* Standard JPEG/MPEG zigzag scan order: index = scan position,
   value = raster (row*8+col) coefficient index.
   Fix: the visible span was missing the closing "};" terminator. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields.
   Fix: the visible span was missing the closing "};" terminator. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer;
   filled in at runtime by the MMX init code.
   NOTE(review): __align8 is presumably a compiler-specific 8-byte
   alignment macro defined in a project header — confirm. */
uint16_t __align8 inv_zigzag_direct16[64];
/* Alternate horizontal scan order (MPEG-2 alternate scan, horizontal
   variant). Fix: the visible span was missing the closing "};". */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (MPEG-2 alternate scan, used for
   interlaced content). Fix: the visible span was missing the
   closing "};". */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* Fixed-point reciprocal table: a*inverse[b]>>32 == a/b for all
   0<=a<=65536 && 2<=b<=255 (inverse[b] ~= 2^32/b, rounded up).
   Used to replace integer division in the quantizer hot path.
   Fix: the visible span was missing the closing "};". */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for the simple_idct_mmx: maps natural coefficient
   order to the order simple_idct_mmx expects (a permutation of 0..63).
   Fix: the visible span was missing the closing "};". */
static const uint8_t simple_mmx_permutation[64]={
	0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
	0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
	0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
	0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
	0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
	0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
	0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
	0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Sum of all 256 pixels of a 16x16 block.
 * pix: top-left of the block; line_size: stride in bytes.
 * Returns the plain (unsigned byte) sum, max 255*256.
 * Fix: the visible span was missing the accumulator, the unrolled
 * inner sums, the pointer advance and the return. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
	for (j = 0; j < 16; j += 8) {
	    s += pix[0];
	    s += pix[1];
	    s += pix[2];
	    s += pix[3];
	    s += pix[4];
	    s += pix[5];
	    s += pix[6];
	    s += pix[7];
	    pix += 8;
	}
	pix += line_size - 16;
    }
    return s;
}
156 static int pix_norm1_c(uint8_t * pix, int line_size)
159 uint32_t *sq = squareTbl + 256;
162 for (i = 0; i < 16; i++) {
163 for (j = 0; j < 16; j += 8) {
174 #if LONG_MAX > 2147483647
175 register uint64_t x=*(uint64_t*)pix;
177 s += sq[(x>>8)&0xff];
178 s += sq[(x>>16)&0xff];
179 s += sq[(x>>24)&0xff];
180 s += sq[(x>>32)&0xff];
181 s += sq[(x>>40)&0xff];
182 s += sq[(x>>48)&0xff];
183 s += sq[(x>>56)&0xff];
185 register uint32_t x=*(uint32_t*)pix;
187 s += sq[(x>>8)&0xff];
188 s += sq[(x>>16)&0xff];
189 s += sq[(x>>24)&0xff];
190 x=*(uint32_t*)(pix+4);
192 s += sq[(x>>8)&0xff];
193 s += sq[(x>>16)&0xff];
194 s += sq[(x>>24)&0xff];
199 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst (dst may equal src).
 * Main loop is unrolled by 8; a scalar tail handles w%8 words.
 * Fix: the visible span was missing the loop braces and the tail
 * loop header around the final bswap_32 line. */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w;i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
222 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
225 uint32_t *sq = squareTbl + 256;
228 for (i = 0; i < h; i++) {
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
233 s += sq[pix1[4] - pix2[4]];
234 s += sq[pix1[5] - pix2[5]];
235 s += sq[pix1[6] - pix2[6]];
236 s += sq[pix1[7] - pix2[7]];
243 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
246 uint32_t *sq = squareTbl + 256;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[ 0] - pix2[ 0]];
251 s += sq[pix1[ 1] - pix2[ 1]];
252 s += sq[pix1[ 2] - pix2[ 2]];
253 s += sq[pix1[ 3] - pix2[ 3]];
254 s += sq[pix1[ 4] - pix2[ 4]];
255 s += sq[pix1[ 5] - pix2[ 5]];
256 s += sq[pix1[ 6] - pix2[ 6]];
257 s += sq[pix1[ 7] - pix2[ 7]];
258 s += sq[pix1[ 8] - pix2[ 8]];
259 s += sq[pix1[ 9] - pix2[ 9]];
260 s += sq[pix1[10] - pix2[10]];
261 s += sq[pix1[11] - pix2[11]];
262 s += sq[pix1[12] - pix2[12]];
263 s += sq[pix1[13] - pix2[13]];
264 s += sq[pix1[14] - pix2[14]];
265 s += sq[pix1[15] - pix2[15]];
273 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
277 /* read the pixels */
279 block[0] = pixels[0];
280 block[1] = pixels[1];
281 block[2] = pixels[2];
282 block[3] = pixels[3];
283 block[4] = pixels[4];
284 block[5] = pixels[5];
285 block[6] = pixels[6];
286 block[7] = pixels[7];
292 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
293 const uint8_t *s2, int stride){
296 /* read the pixels */
298 block[0] = s1[0] - s2[0];
299 block[1] = s1[1] - s2[1];
300 block[2] = s1[2] - s2[2];
301 block[3] = s1[3] - s2[3];
302 block[4] = s1[4] - s2[4];
303 block[5] = s1[5] - s2[5];
304 block[6] = s1[6] - s2[6];
305 block[7] = s1[7] - s2[7];
313 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
317 uint8_t *cm = cropTbl + MAX_NEG_CROP;
319 /* read the pixels */
321 pixels[0] = cm[block[0]];
322 pixels[1] = cm[block[1]];
323 pixels[2] = cm[block[2]];
324 pixels[3] = cm[block[3]];
325 pixels[4] = cm[block[4]];
326 pixels[5] = cm[block[5]];
327 pixels[6] = cm[block[6]];
328 pixels[7] = cm[block[7]];
335 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
339 uint8_t *cm = cropTbl + MAX_NEG_CROP;
341 /* read the pixels */
343 pixels[0] = cm[pixels[0] + block[0]];
344 pixels[1] = cm[pixels[1] + block[1]];
345 pixels[2] = cm[pixels[2] + block[2]];
346 pixels[3] = cm[pixels[3] + block[3]];
347 pixels[4] = cm[pixels[4] + block[4]];
348 pixels[5] = cm[pixels[5] + block[5]];
349 pixels[6] = cm[pixels[6] + block[6]];
350 pixels[7] = cm[pixels[7] + block[7]];
/* 64-bit-register variants of the put/avg pixel primitives, generated
 * per operation by PIXOP2(OPNAME, OP): straight copy, x2/y2 half-pel
 * averages (with and without rounding) and xy2 quarter-sample
 * averaging, all operating 8 bytes at a time via LD64.
 * NOTE(review): this span appears truncated by extraction — loop
 * headers ("int i; for(i=0; i<h; i++){"), pointer advances and closing
 * braces of the generated functions are missing relative to upstream;
 * restore from the original file before compiling. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
OP(*((uint64_t*)block), LD64(pixels));\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+line_size);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+line_size);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0202020202020202ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
for(i=0; i<h; i+=2){\
uint64_t a= LD64(pixels );\
uint64_t b= LD64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0202020202020202ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0101010101010101ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
for(i=0; i<h; i+=2){\
uint64_t a= LD64(pixels );\
uint64_t b= LD64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0101010101010101ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* Rounding average of two packed-byte words: (a|b) - ((a^b)&0xFE..)>>1
 * == per-byte (a+b+1)>>1 without carry between lanes. */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
/* 32-bit variant of PIXOP2: the same family of put/avg motion-
 * compensation primitives (copy, x2/y2 half-pel, xy2 quarter-sample,
 * 2-, 4-, 8- and 16-wide, plus the _l2/_l4 multi-source averagers used
 * by qpel), operating 4 bytes at a time via LD32/rnd_avg32.
 * NOTE(review): this span appears truncated by extraction — loop
 * headers ("int i; for(i=0; i<h; i++){"), local declarations, the
 * "+ (b&0x03030303UL) + 0x02020202UL" terms of the l0/l1 sums, pixel/
 * block pointer advances and closing braces are missing relative to
 * upstream; restore from the original file before compiling. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint16_t*)(block )), LD16(pixels ));\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint32_t*)(block )), LD32(pixels ));\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint32_t*)(block )), LD32(pixels ));\
OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_c(block, pixels, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD16(&src1[i*src_stride1 ]);\
b= LD16(&src2[i*src_stride2 ]);\
OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
uint32_t a, b, c, d, l0, l1, h0, h1;\
a= LD32(&src1[i*src_stride1]);\
b= LD32(&src2[i*src_stride2]);\
c= LD32(&src3[i*src_stride3]);\
d= LD32(&src4[i*src_stride4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
c= LD32(&src3[i*src_stride3+4]);\
d= LD32(&src4[i*src_stride4+4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
uint32_t a, b, c, d, l0, l1, h0, h1;\
a= LD32(&src1[i*src_stride1]);\
b= LD32(&src2[i*src_stride2]);\
c= LD32(&src3[i*src_stride3]);\
d= LD32(&src4[i*src_stride4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
c= LD32(&src3[i*src_stride3+4]);\
d= LD32(&src4[i*src_stride4+4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
int i, a0, b0, a1, b1;\
for(i=0; i<h; i+=2){\
block[0]= (a1+a0)>>2; /* FIXME non put */\
block[1]= (b1+b0)>>2;\
block[0]= (a1+a0)>>2;\
block[1]= (b1+b0)>>2;\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
pixels+=4-line_size*(h+1);\
block +=4-line_size*h;\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
pixels+=4-line_size*(h+1);\
block +=4-line_size*h;\
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

/* Per-byte rounding average of two packed 32-bit words. */
#define op_avg(a, b) a = rnd_avg32(a, b)
/* Plain store (the "put" operation). */
#define op_put(a, b) a = b
/* Scalar rounding averages used by the qpel/chroma C code:
 * avg2 = (a+b+1)/2, avg4 = (a+b+c+d+2)/4, both rounding to nearest. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* One-warp-point global motion compensation: bilinear interpolation of
 * an 8-wide block at 1/16-pel fractional position (x16,y16), with
 * A..D the four bilinear weights (summing to 256) and rounder the
 * rounding bias added before the >>8.
 * Fix: the visible span was missing the row loop and the per-row
 * dst/src advances. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/* General (affine, per-pixel vector) global motion compensation for an
 * 8-wide block: for each destination pixel a sub-pel source position is
 * derived from ox/oy and the dxx/dxy/dyx/dyy gradients, sampled
 * bilinearly with edge clipping when the position leaves the
 * width x height source area.
 * NOTE(review): this span appears truncated by extraction — the
 * opening brace, the x/y loop variables, the vx/vy accumulation,
 * src_x/src_y/frac_x/frac_y derivation, the rounding term using r,
 * and several closing braces are missing relative to upstream;
 * restore from the original file before compiling. */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
    /* s is the sub-pel scale: positions are in units of 1/s pixels. */
    const int s= 1<<shift;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;
            /* Fully inside the source: plain bilinear sample. */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                    /* x inside, y outside: clip y, interpolate in x only. */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                /* x outside: clip x; interpolate in y if y is inside. */
                if((unsigned)src_y < height){
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                    /* Both outside: clip both, copy nearest edge pixel. */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
/* Third-pel MC, full-pel case: dispatch a plain block copy by width.
 * Fix: the visible span was missing the switch(width) header and
 * closing braces around the dispatch. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/* Third-pel MC, 1/3 horizontal: dst = round((2*a + b)/3) via the
 * fixed-point identity 683/2048 ~= 1/3.
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, 2/3 horizontal: dst = round((a + 2*b)/3).
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, 1/3 vertical: dst = round((2*a + below)/3).
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, (1/3,1/3): bilinear third-pel weights 4/3/3/2 over the
 * 2x2 neighborhood, normalized by 2731/32768 ~= 1/12.
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, (1/3,2/3): weights 3/2/4/3 over the 2x2 neighborhood,
 * normalized by 2731/32768 ~= 1/12.
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, 2/3 vertical: dst = round((a + 2*below)/3).
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, (2/3,1/3): weights 3/4/2/3 over the 2x2 neighborhood,
 * normalized by 2731/32768 ~= 1/12.
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, (2/3,2/3): weights 2/3/3/4 over the 2x2 neighborhood,
 * normalized by 2731/32768 ~= 1/12.
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, full-pel averaging case: dispatch a block average by
 * width. Fix: the visible span was missing the switch(width) header
 * and closing braces around the dispatch. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/* Averaging variant of mc10: rounds the third-pel prediction into the
 * existing dst value, (dst + pred + 1)>>1.
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc20: (dst + pred + 1)>>1.
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc01: (dst + pred + 1)>>1.
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc11: (dst + pred + 1)>>1.
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc12: (dst + pred + 1)>>1.
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc02: (dst + pred + 1)>>1.
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc21: (dst + pred + 1)>>1.
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc22: (dst + pred + 1)>>1.
 * Fix: the visible span was missing the loop locals and the per-row
 * src/dst advances. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Generates width-specialized wrappers (put_tpel_pixels<width>_mc??_c)
 * that forward to the generic third-pel routines above.
 * Bug fix: each forwarding call was prefixed with a stray `void`,
 * which makes the statement a (constraint-violating) block-scope
 * function declaration instead of a call — the wrappers did nothing.
 * The `void` keywords are removed so the wrappers actually forward. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/**
 * H.264 chroma MC generator: 2x2 bilinear interpolation with 1/8-pel
 * fractional offsets x,y in [0,8). A..D are the bilinear weights
 * (they sum to 64); OP normalises (>>6 with rounding) and either
 * stores or averages into dst.
 * Fix: the row loop (`int i;`, for-loop, per-row pointer advance and
 * closing braces) was missing from the truncated fragment; restored.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        dst+= stride;\
        src+= stride;\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
        OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
        OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
        OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
        OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
        OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        dst+= stride;\
        src+= stride;\
    }\
}
1237 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1238 #define op_put(a, b) a = (((b) + 32)>>6)
1240 H264_CHROMA_MC(put_ , op_put)
1241 H264_CHROMA_MC(avg_ , op_avg)
/**
 * Copy h rows of 4 bytes from src to dst.
 * memcpy replaces the LD32/ST32 type-punning macros: it is
 * strict-aliasing and alignment safe, and compilers emit the same
 * single 32-bit move for a constant 4-byte memcpy.
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        memcpy(dst, src, 4);
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copy h rows of 8 bytes from src to dst.
 * memcpy replaces the pair of LD32/ST32 type-punning macros: aliasing
 * and alignment safe, identical codegen for a constant 8-byte copy.
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        memcpy(dst, src, 8);
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copy h rows of 16 bytes from src to dst.
 * memcpy replaces four LD32/ST32 type-punning macros: aliasing and
 * alignment safe, same codegen for a constant 16-byte copy.
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        memcpy(dst, src, 16);
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copy h rows of 17 bytes from src to dst (16+1: the extra column is
 * needed by the qpel16 edge taps; the original did four ST32/LD32
 * plus a dst[16]=src[16] byte — the byte line was lost in truncation).
 * memcpy is aliasing/alignment safe and covers all 17 bytes.
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        memcpy(dst, src, 17);
        dst+=dstStride;
        src+=srcStride;
    }
}
/**
 * Copy h rows of 9 bytes from src to dst (8+1: the extra column is
 * needed by the qpel8 edge taps; the original did two ST32/LD32 plus
 * a dst[8]=src[8] byte — the byte line was lost in truncation).
 * memcpy is aliasing/alignment safe and covers all 9 bytes.
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        memcpy(dst, src, 9);
        dst+=dstStride;
        src+=srcStride;
    }
}
/*
 * QPEL_MC: generates the MPEG-4 quarter-pel MC function set for one
 * rounding mode (RND) and one output op (OP).  The horizontal 8-tap
 * lowpass uses taps 20/-6/3/-1; note the mirrored source indices near
 * the block edges (src[8], src[0] reused).
 * NOTE(review): this fragment is an elided numbered listing — the
 * loop scaffolding lines (orig. 1314-1316, 1325-1328) are missing;
 * code lines below are kept byte-identical.
 */
1311 #define QPEL_MC(r, OPNAME, RND, OP) \
1312 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1313 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1317 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1318 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1319 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1320 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1321 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1322 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1323 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1324 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* Vertical 8-wide lowpass: same 20/-6/3/-1 taps applied down each
 * column; the nine source rows of the column are loaded once into
 * locals src0..src8, with edge mirroring at the bottom.
 * NOTE(review): column loop header / closing braces were elided. */\
1330 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1332 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1336 const int src0= src[0*srcStride];\
1337 const int src1= src[1*srcStride];\
1338 const int src2= src[2*srcStride];\
1339 const int src3= src[3*srcStride];\
1340 const int src4= src[4*srcStride];\
1341 const int src5= src[5*srcStride];\
1342 const int src6= src[6*srcStride];\
1343 const int src7= src[7*srcStride];\
1344 const int src8= src[8*srcStride];\
1345 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1346 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1347 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1348 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1349 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1350 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1351 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1352 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* 16-wide horizontal lowpass: same taps as the 8-wide version,
 * mirrored at indices 0 and 16.
 * NOTE(review): row loop header / closing braces were elided. */\
1358 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1359 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1364 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1365 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1366 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1367 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1368 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1369 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1370 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1371 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1372 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1373 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1374 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1375 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1376 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1377 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1378 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1379 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* 16-wide vertical lowpass: seventeen rows of the column cached in
 * src0..src16, taps 20/-6/3/-1, mirrored at the bottom edge.
 * NOTE(review): column loop header / closing braces were elided. */\
1385 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1386 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1391 const int src0= src[0*srcStride];\
1392 const int src1= src[1*srcStride];\
1393 const int src2= src[2*srcStride];\
1394 const int src3= src[3*srcStride];\
1395 const int src4= src[4*srcStride];\
1396 const int src5= src[5*srcStride];\
1397 const int src6= src[6*srcStride];\
1398 const int src7= src[7*srcStride];\
1399 const int src8= src[8*srcStride];\
1400 const int src9= src[9*srcStride];\
1401 const int src10= src[10*srcStride];\
1402 const int src11= src[11*srcStride];\
1403 const int src12= src[12*srcStride];\
1404 const int src13= src[13*srcStride];\
1405 const int src14= src[14*srcStride];\
1406 const int src15= src[15*srcStride];\
1407 const int src16= src[16*srcStride];\
1408 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1409 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1410 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1411 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1412 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1413 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1414 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1415 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1416 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1417 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1418 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1419 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1420 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1421 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1422 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1423 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* qpel8 mcXY set: X,Y are the quarter-pel offsets (0..3). Halfpel
 * intermediates come from the lowpass filters above; quarter-pel
 * positions are formed by pixels8_l2 (2-way) averages.
 * NOTE(review): local declarations (e.g. `uint8_t half[64];`,
 * `uint8_t halfH[72];`) and the closing braces between functions were
 * elided from this listing; code lines are kept byte-identical. */\
1429 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1430 OPNAME ## pixels8_c(dst, src, stride, 8);\
1433 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1435 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1436 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1439 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1440 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1443 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1445 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1446 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1449 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1450 uint8_t full[16*9];\
1452 copy_block9(full, src, 16, stride, 9);\
1453 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1454 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1457 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1458 uint8_t full[16*9];\
1459 copy_block9(full, src, 16, stride, 9);\
1460 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1463 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1464 uint8_t full[16*9];\
1466 copy_block9(full, src, 16, stride, 9);\
1467 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1468 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
/* _old variants: diagonal positions built from a 4-way l4 average of
 * full/halfH/halfV/halfHV, kept alongside the newer 2-way path. */\
1470 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1471 uint8_t full[16*9];\
1474 uint8_t halfHV[64];\
1475 copy_block9(full, src, 16, stride, 9);\
1476 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1477 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1478 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1479 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1481 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1482 uint8_t full[16*9];\
1484 uint8_t halfHV[64];\
1485 copy_block9(full, src, 16, stride, 9);\
1486 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1487 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1488 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1489 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1491 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1492 uint8_t full[16*9];\
1495 uint8_t halfHV[64];\
1496 copy_block9(full, src, 16, stride, 9);\
1497 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1498 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1499 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1500 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1502 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1503 uint8_t full[16*9];\
1505 uint8_t halfHV[64];\
1506 copy_block9(full, src, 16, stride, 9);\
1507 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1508 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1509 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1510 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1512 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1513 uint8_t full[16*9];\
1516 uint8_t halfHV[64];\
1517 copy_block9(full, src, 16, stride, 9);\
1518 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1519 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1520 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1521 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1523 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1524 uint8_t full[16*9];\
1526 uint8_t halfHV[64];\
1527 copy_block9(full, src, 16, stride, 9);\
1528 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1529 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1530 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1531 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1533 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1534 uint8_t full[16*9];\
1537 uint8_t halfHV[64];\
1538 copy_block9(full, src, 16, stride, 9);\
1539 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1540 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1541 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1542 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1544 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1545 uint8_t full[16*9];\
1547 uint8_t halfHV[64];\
1548 copy_block9(full, src, 16, stride, 9);\
1549 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1550 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1551 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1552 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1554 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1556 uint8_t halfHV[64];\
1557 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1558 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1559 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1561 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1563 uint8_t halfHV[64];\
1564 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1565 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1566 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1568 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1569 uint8_t full[16*9];\
1572 uint8_t halfHV[64];\
1573 copy_block9(full, src, 16, stride, 9);\
1574 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1575 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1576 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1577 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1579 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1580 uint8_t full[16*9];\
1582 copy_block9(full, src, 16, stride, 9);\
1583 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1584 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1585 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1587 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1588 uint8_t full[16*9];\
1591 uint8_t halfHV[64];\
1592 copy_block9(full, src, 16, stride, 9);\
1593 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1594 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1595 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1596 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1598 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1599 uint8_t full[16*9];\
1601 copy_block9(full, src, 16, stride, 9);\
1602 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1603 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1604 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1606 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1608 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1609 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* qpel16 mcXY set: 16x16 analogue of the qpel8 set above; working
 * buffers are 24-wide (full[24*17]) and the l2/l4 averages use the
 * pixels16 helpers.
 * NOTE(review): local declarations (e.g. `uint8_t half[256];`) and
 * the closing braces between functions were elided from this listing;
 * code lines are kept byte-identical. */\
1611 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1612 OPNAME ## pixels16_c(dst, src, stride, 16);\
1615 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1617 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1618 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1621 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1622 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1625 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1627 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1628 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1631 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1632 uint8_t full[24*17];\
1634 copy_block17(full, src, 24, stride, 17);\
1635 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1636 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1639 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1640 uint8_t full[24*17];\
1641 copy_block17(full, src, 24, stride, 17);\
1642 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1645 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1646 uint8_t full[24*17];\
1648 copy_block17(full, src, 24, stride, 17);\
1649 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1650 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
/* _old variants: 4-way l4 average of full/halfH/halfV/halfHV. */\
1652 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1653 uint8_t full[24*17];\
1654 uint8_t halfH[272];\
1655 uint8_t halfV[256];\
1656 uint8_t halfHV[256];\
1657 copy_block17(full, src, 24, stride, 17);\
1658 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1659 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1660 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1661 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1663 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1664 uint8_t full[24*17];\
1665 uint8_t halfH[272];\
1666 uint8_t halfHV[256];\
1667 copy_block17(full, src, 24, stride, 17);\
1668 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1669 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1670 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1671 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1673 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1674 uint8_t full[24*17];\
1675 uint8_t halfH[272];\
1676 uint8_t halfV[256];\
1677 uint8_t halfHV[256];\
1678 copy_block17(full, src, 24, stride, 17);\
1679 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1680 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1681 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1682 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1684 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1685 uint8_t full[24*17];\
1686 uint8_t halfH[272];\
1687 uint8_t halfHV[256];\
1688 copy_block17(full, src, 24, stride, 17);\
1689 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1690 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1691 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1692 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1694 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1695 uint8_t full[24*17];\
1696 uint8_t halfH[272];\
1697 uint8_t halfV[256];\
1698 uint8_t halfHV[256];\
1699 copy_block17(full, src, 24, stride, 17);\
1700 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1701 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1702 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1703 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1705 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1706 uint8_t full[24*17];\
1707 uint8_t halfH[272];\
1708 uint8_t halfHV[256];\
1709 copy_block17(full, src, 24, stride, 17);\
1710 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1711 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1712 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1713 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1715 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1716 uint8_t full[24*17];\
1717 uint8_t halfH[272];\
1718 uint8_t halfV[256];\
1719 uint8_t halfHV[256];\
1720 copy_block17(full, src, 24, stride, 17);\
1721 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1722 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1723 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1724 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1726 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1727 uint8_t full[24*17];\
1728 uint8_t halfH[272];\
1729 uint8_t halfHV[256];\
1730 copy_block17(full, src, 24, stride, 17);\
1731 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1732 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1733 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1734 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1736 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1737 uint8_t halfH[272];\
1738 uint8_t halfHV[256];\
1739 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1740 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1741 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1743 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1744 uint8_t halfH[272];\
1745 uint8_t halfHV[256];\
1746 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1747 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1748 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1750 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1751 uint8_t full[24*17];\
1752 uint8_t halfH[272];\
1753 uint8_t halfV[256];\
1754 uint8_t halfHV[256];\
1755 copy_block17(full, src, 24, stride, 17);\
1756 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1757 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1758 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1759 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1761 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1762 uint8_t full[24*17];\
1763 uint8_t halfH[272];\
1764 copy_block17(full, src, 24, stride, 17);\
1765 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1766 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1767 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1769 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1770 uint8_t full[24*17];\
1771 uint8_t halfH[272];\
1772 uint8_t halfV[256];\
1773 uint8_t halfHV[256];\
1774 copy_block17(full, src, 24, stride, 17);\
1775 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1776 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1777 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1778 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1780 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1781 uint8_t full[24*17];\
1782 uint8_t halfH[272];\
1783 copy_block17(full, src, 24, stride, 17);\
1784 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1785 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1786 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1788 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1789 uint8_t halfH[272];\
1790 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1791 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1794 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1795 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1796 #define op_put(a, b) a = cm[((b) + 16)>>5]
1797 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1799 QPEL_MC(0, put_ , _ , op_put)
1800 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1801 QPEL_MC(0, avg_ , _ , op_avg)
1802 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1804 #undef op_avg_no_rnd
1806 #undef op_put_no_rnd
1809 #define H264_LOWPASS(OPNAME, OP, OP2) \
1810 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1812 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1816 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1817 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1818 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1819 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
1825 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1827 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1831 const int srcB= src[-2*srcStride];\
1832 const int srcA= src[-1*srcStride];\
1833 const int src0= src[0 *srcStride];\
1834 const int src1= src[1 *srcStride];\
1835 const int src2= src[2 *srcStride];\
1836 const int src3= src[3 *srcStride];\
1837 const int src4= src[4 *srcStride];\
1838 const int src5= src[5 *srcStride];\
1839 const int src6= src[6 *srcStride];\
1840 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1841 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1842 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1843 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1849 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1852 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1854 src -= 2*srcStride;\
1855 for(i=0; i<h+5; i++)\
1857 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1858 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1859 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1860 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1864 tmp -= tmpStride*(h+5-2);\
1867 const int tmpB= tmp[-2*tmpStride];\
1868 const int tmpA= tmp[-1*tmpStride];\
1869 const int tmp0= tmp[0 *tmpStride];\
1870 const int tmp1= tmp[1 *tmpStride];\
1871 const int tmp2= tmp[2 *tmpStride];\
1872 const int tmp3= tmp[3 *tmpStride];\
1873 const int tmp4= tmp[4 *tmpStride];\
1874 const int tmp5= tmp[5 *tmpStride];\
1875 const int tmp6= tmp[6 *tmpStride];\
1876 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1877 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1878 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1879 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1885 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1887 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1891 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1892 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1893 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1894 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1895 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1896 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1897 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1898 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1904 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1906 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1910 const int srcB= src[-2*srcStride];\
1911 const int srcA= src[-1*srcStride];\
1912 const int src0= src[0 *srcStride];\
1913 const int src1= src[1 *srcStride];\
1914 const int src2= src[2 *srcStride];\
1915 const int src3= src[3 *srcStride];\
1916 const int src4= src[4 *srcStride];\
1917 const int src5= src[5 *srcStride];\
1918 const int src6= src[6 *srcStride];\
1919 const int src7= src[7 *srcStride];\
1920 const int src8= src[8 *srcStride];\
1921 const int src9= src[9 *srcStride];\
1922 const int src10=src[10*srcStride];\
1923 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1924 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1925 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1926 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1927 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1928 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1929 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1930 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
/* 8x8 separable half-pel: horizontal 6-tap pass into the int16 tmp \
   buffer (h+5 rows for vertical support), then vertical 6-tap over \
   tmp; OP2() applies the wider (+512)>>10 rounding for the combined \
   passes. NOTE(review): loop headers are elided in this listing. */\
1936 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1939 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
/* back up two rows so the filter support is available */\
1941 src -= 2*srcStride;\
1942 for(i=0; i<h+5; i++)\
1944 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1945 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1946 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1947 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1948 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1949 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1950 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1951 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
/* rewind tmp to the first output row of the vertical pass */\
1955 tmp -= tmpStride*(h+5-2);\
1958 const int tmpB= tmp[-2*tmpStride];\
1959 const int tmpA= tmp[-1*tmpStride];\
1960 const int tmp0= tmp[0 *tmpStride];\
1961 const int tmp1= tmp[1 *tmpStride];\
1962 const int tmp2= tmp[2 *tmpStride];\
1963 const int tmp3= tmp[3 *tmpStride];\
1964 const int tmp4= tmp[4 *tmpStride];\
1965 const int tmp5= tmp[5 *tmpStride];\
1966 const int tmp6= tmp[6 *tmpStride];\
1967 const int tmp7= tmp[7 *tmpStride];\
1968 const int tmp8= tmp[8 *tmpStride];\
1969 const int tmp9= tmp[9 *tmpStride];\
1970 const int tmp10=tmp[10*tmpStride];\
1971 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1972 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1973 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1974 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1975 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
1976 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
1977 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
1978 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16x16 vertical lowpass built from four 8x8 quadrant calls */\
1984 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1985 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1986 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1987 src += 8*srcStride;\
1988 dst += 8*dstStride;\
1989 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1990 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
/* 16x16 horizontal lowpass built from four 8x8 quadrant calls */\
1993 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1994 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
1995 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1996 src += 8*srcStride;\
1997 dst += 8*dstStride;\
1998 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
1999 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
/* 16x16 separable h+v lowpass built from four 8x8 quadrant calls */\
2002 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2003 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2004 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2005 src += 8*srcStride;\
2006 tmp += 8*tmpStride;\
2007 dst += 8*dstStride;\
2008 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2009 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC: generates the 16 quarter-pel MC entry points for one op \
   (put/avg) and one block size. _mcXY_c: X = horizontal quarter-pel \
   phase (0..3), Y = vertical phase. Half-pel phases use the lowpass \
   filters directly; quarter phases average a half-pel plane with the \
   nearest integer/half plane via pixelsN_l2. */\
2012 #define H264_MC(OPNAME, SIZE) \
2013 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2014 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
/* (1/4,0): average source with the horizontal half-pel plane */\
2017 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2018 uint8_t half[SIZE*SIZE];\
2019 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2020 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
/* (1/2,0): pure horizontal half-pel */\
2023 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2024 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
/* (3/4,0): average src+1 with the horizontal half-pel plane */\
2027 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2028 uint8_t half[SIZE*SIZE];\
2029 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2030 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
/* (0,1/4): copy SIZE+5 rows (2 above, 3 below) so the vertical \
   filter has full support, then average with its output */\
2033 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2034 uint8_t full[SIZE*(SIZE+5)];\
2035 uint8_t * const full_mid= full + SIZE*2;\
2036 uint8_t half[SIZE*SIZE];\
2037 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2038 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2039 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
/* (0,1/2): pure vertical half-pel */\
2042 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2043 uint8_t full[SIZE*(SIZE+5)];\
2044 uint8_t * const full_mid= full + SIZE*2;\
2045 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2046 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
/* (0,3/4): like mc01 but averages with the row below (full_mid+SIZE) */\
2049 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2050 uint8_t full[SIZE*(SIZE+5)];\
2051 uint8_t * const full_mid= full + SIZE*2;\
2052 uint8_t half[SIZE*SIZE];\
2053 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2054 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2055 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
/* Diagonal quarter-pel positions: average a horizontal half-pel \
   plane (halfH) with a vertical half-pel plane (halfV); the +1 / \
   +stride offsets select which neighbouring half-pel planes. */\
2058 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2059 uint8_t full[SIZE*(SIZE+5)];\
2060 uint8_t * const full_mid= full + SIZE*2;\
2061 uint8_t halfH[SIZE*SIZE];\
2062 uint8_t halfV[SIZE*SIZE];\
2063 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2064 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2065 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2066 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* (3/4,1/4): vertical plane taken one column to the right */\
2069 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2070 uint8_t full[SIZE*(SIZE+5)];\
2071 uint8_t * const full_mid= full + SIZE*2;\
2072 uint8_t halfH[SIZE*SIZE];\
2073 uint8_t halfV[SIZE*SIZE];\
2074 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2075 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2076 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2077 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* (1/4,3/4): horizontal plane taken one row down */\
2080 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2081 uint8_t full[SIZE*(SIZE+5)];\
2082 uint8_t * const full_mid= full + SIZE*2;\
2083 uint8_t halfH[SIZE*SIZE];\
2084 uint8_t halfV[SIZE*SIZE];\
2085 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2086 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2087 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2088 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* (3/4,3/4): shifted in both directions */\
2091 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2092 uint8_t full[SIZE*(SIZE+5)];\
2093 uint8_t * const full_mid= full + SIZE*2;\
2094 uint8_t halfH[SIZE*SIZE];\
2095 uint8_t halfV[SIZE*SIZE];\
2096 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2097 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2098 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2099 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* Center and mixed positions involving the separable h+v (hv) plane */\
2102 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2103 int16_t tmp[SIZE*(SIZE+5)];\
2104 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
/* (1/2,1/4): average hv plane with horizontal half-pel plane */\
2107 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2108 int16_t tmp[SIZE*(SIZE+5)];\
2109 uint8_t halfH[SIZE*SIZE];\
2110 uint8_t halfHV[SIZE*SIZE];\
2111 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2112 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2113 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
/* (1/2,3/4): horizontal plane one row down */\
2116 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2117 int16_t tmp[SIZE*(SIZE+5)];\
2118 uint8_t halfH[SIZE*SIZE];\
2119 uint8_t halfHV[SIZE*SIZE];\
2120 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2121 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2122 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
/* (1/4,1/2): average hv plane with vertical half-pel plane */\
2125 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2126 uint8_t full[SIZE*(SIZE+5)];\
2127 uint8_t * const full_mid= full + SIZE*2;\
2128 int16_t tmp[SIZE*(SIZE+5)];\
2129 uint8_t halfV[SIZE*SIZE];\
2130 uint8_t halfHV[SIZE*SIZE];\
2131 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2132 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2133 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2134 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* (3/4,1/2): vertical plane one column right */\
2137 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2138 uint8_t full[SIZE*(SIZE+5)];\
2139 uint8_t * const full_mid= full + SIZE*2;\
2140 int16_t tmp[SIZE*(SIZE+5)];\
2141 uint8_t halfV[SIZE*SIZE];\
2142 uint8_t halfHV[SIZE*SIZE];\
2143 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2144 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2145 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2146 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Rounding/store ops plugged into H264_LOWPASS: single-pass filters \
   round with (+16)>>5, the combined h+v pass with (+512)>>10; the \
   avg variants blend with the existing dst value (+1 rounding). */\
2149 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2150 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2151 #define op_put(a, b) a = cm[((b) + 16)>>5]
2152 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2153 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* instantiate the put_* and avg_* qpel function families */
2155 H264_LOWPASS(put_ , op_put, op2_put)
2156 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* WMV2 horizontal half-pel filter: 4-tap (-1,9,9,-1)/16 with rounding,
 * clipped through cm[]. One 8-pixel row shown; the h-row loop is
 * elided in this listing -- confirm against the complete source. */
2170 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2171 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2175 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2176 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2177 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2178 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2179 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2180 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2181 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2182 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* WMV2 vertical half-pel filter: same 4-tap (-1,9,9,-1)/16 kernel
 * applied along one column of 8 outputs; the w-column loop is elided
 * in this listing. */
2188 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2189 uint8_t *cm = cropTbl + MAX_NEG_CROP;
/* one sample above and two below the 8 outputs feed the 4-tap support */
2193 const int src_1= src[ -srcStride];
2194 const int src0 = src[0 ];
2195 const int src1 = src[ srcStride];
2196 const int src2 = src[2*srcStride];
2197 const int src3 = src[3*srcStride];
2198 const int src4 = src[4*srcStride];
2199 const int src5 = src[5*srcStride];
2200 const int src6 = src[6*srcStride];
2201 const int src7 = src[7*srcStride];
2202 const int src8 = src[8*srcStride];
2203 const int src9 = src[9*srcStride];
2204 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2205 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2206 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2207 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2208 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2209 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2210 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2211 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 8x8 half-pel MC entry points (put only). mcXY: X horizontal,
 * Y vertical half-pel phase. Local buffer declarations and closing
 * braces are elided in this listing. */
2217 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2218 put_pixels8_c(dst, src, stride, 8);
/* (1/4,0)-style: average source with horizontal half-pel plane */
2221 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2223 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2224 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
/* pure horizontal half-pel */
2227 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2228 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* average src+1 with horizontal half-pel plane */
2231 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2233 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2234 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
/* pure vertical half-pel */
2237 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2238 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* diagonal: h-filter 11 rows (one above, two below), v-filter the
 * result, then average with the plain vertical half-pel plane */
2241 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2245 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2246 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2247 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2248 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* same but with the vertical plane one column right */
2250 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2254 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2255 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2256 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2257 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* center: h-filter then v-filter, no extra averaging */
2259 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2261 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2262 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking across a horizontal block edge: filters the four
 * pixels p0..p3 straddling the edge in each column x. d1 is a ramp
 * of d limited by the qscale-dependent strength (assignments partly
 * elided in this listing). */
2265 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2267 const int strength= ff_h263_loop_filter_strength[qscale];
2271 int p0= src[x-2*stride];
2272 int p1= src[x-1*stride];
2273 int p2= src[x+0*stride];
2274 int p3= src[x+1*stride];
/* edge activity measure */
2275 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2277 if (d<-2*strength) d1= 0;
2278 else if(d<- strength) d1=-2*strength - d;
2279 else if(d< strength) d1= d;
2280 else if(d< 2*strength) d1= 2*strength - d;
/* clamp p1/p2 to 0..255: bit 8 set means out of range; ~(p>>31)
 * yields 0 for negative, 255 for overflow (assumes arithmetic
 * right shift -- implementation-defined, TODO confirm) */
2285 if(p1&256) p1= ~(p1>>31);
2286 if(p2&256) p2= ~(p2>>31);
2288 src[x-1*stride] = p1;
2289 src[x+0*stride] = p2;
/* secondary correction of the outer pixels, limited by |d1| */
2293 d2= clip((p0-p3)/4, -ad1, ad1);
2295 src[x-2*stride] = p0 - d2;
2296 src[x+ stride] = p3 + d2;
/* H.263 deblocking across a vertical block edge: same filter as
 * h263_v_loop_filter_c but applied horizontally (p0..p3 are the four
 * pixels left/right of the edge in each row y). */
2300 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2302 const int strength= ff_h263_loop_filter_strength[qscale];
2306 int p0= src[y*stride-2];
2307 int p1= src[y*stride-1];
2308 int p2= src[y*stride+0];
2309 int p3= src[y*stride+1];
2310 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2312 if (d<-2*strength) d1= 0;
2313 else if(d<- strength) d1=-2*strength - d;
2314 else if(d< strength) d1= d;
2315 else if(d< 2*strength) d1= 2*strength - d;
/* 0/255 clamp trick, see h263_v_loop_filter_c */
2320 if(p1&256) p1= ~(p1>>31);
2321 if(p2&256) p2= ~(p2>>31);
2323 src[y*stride-1] = p1;
2324 src[y*stride+0] = p2;
2328 d2= clip((p0-p3)/4, -ad1, ad1);
2330 src[y*stride-2] = p0 - d2;
2331 src[y*stride+1] = p3 + d2;
/* SAD of a 16-wide block against pix2; one unrolled row shown, the
 * h-row loop and accumulator init/return are elided in this listing. */
2335 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2341 s += abs(pix1[0] - pix2[0]);
2342 s += abs(pix1[1] - pix2[1]);
2343 s += abs(pix1[2] - pix2[2]);
2344 s += abs(pix1[3] - pix2[3]);
2345 s += abs(pix1[4] - pix2[4]);
2346 s += abs(pix1[5] - pix2[5]);
2347 s += abs(pix1[6] - pix2[6]);
2348 s += abs(pix1[7] - pix2[7]);
2349 s += abs(pix1[8] - pix2[8]);
2350 s += abs(pix1[9] - pix2[9]);
2351 s += abs(pix1[10] - pix2[10]);
2352 s += abs(pix1[11] - pix2[11]);
2353 s += abs(pix1[12] - pix2[12]);
2354 s += abs(pix1[13] - pix2[13]);
2355 s += abs(pix1[14] - pix2[14]);
2356 s += abs(pix1[15] - pix2[15]);
/* SAD against the horizontal half-pel reference (avg2 of adjacent
 * columns; avg2 is defined earlier in the file, not visible here) */
2363 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2369 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2370 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2371 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2372 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2373 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2374 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2375 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2376 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2377 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2378 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2379 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2380 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2381 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2382 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2383 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2384 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD against the vertical half-pel reference (avg2 of current and
 * next row, pix3 = pix2 + line_size) */
2391 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2394 uint8_t *pix3 = pix2 + line_size;
2398 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2399 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2400 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2401 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2402 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2403 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2404 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2405 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2406 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2407 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2408 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2409 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2410 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2411 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2412 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2413 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD against the diagonal half-pel reference (avg4 of the 2x2
 * neighbourhood; avg4 defined earlier in the file, not visible here) */
2421 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2424 uint8_t *pix3 = pix2 + line_size;
2428 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2429 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2430 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2431 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2432 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2433 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2434 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2435 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2436 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2437 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2438 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2439 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2440 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2441 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2442 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2443 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide variants of the SAD functions above; same structure, same
 * elided row loop in this listing. */
2451 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2457 s += abs(pix1[0] - pix2[0]);
2458 s += abs(pix1[1] - pix2[1]);
2459 s += abs(pix1[2] - pix2[2]);
2460 s += abs(pix1[3] - pix2[3]);
2461 s += abs(pix1[4] - pix2[4]);
2462 s += abs(pix1[5] - pix2[5]);
2463 s += abs(pix1[6] - pix2[6]);
2464 s += abs(pix1[7] - pix2[7]);
/* horizontal half-pel SAD, 8 wide */
2471 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2477 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2478 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2479 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2480 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2481 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2482 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2483 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2484 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* vertical half-pel SAD, 8 wide */
2491 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2494 uint8_t *pix3 = pix2 + line_size;
2498 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2499 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2500 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2501 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2502 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2503 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2504 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2505 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* diagonal half-pel SAD, 8 wide */
2513 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2516 uint8_t *pix3 = pix2 + line_size;
2520 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2521 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2522 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2523 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2524 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2525 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2526 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2527 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2536 * permutes an 8x8 block.
2537 * @param block the block which will be permuted according to the given permutation vector
2538 * @param permutation the permutation vector
2539 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2540 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2541 * (inverse) permuted to scantable order!
/* Permutes the coefficients of an 8x8 block in place according to
 * permutation[]. Only the first last+1 positions (in scantable order)
 * are touched, so trailing zero coefficients are skipped for speed.
 * The gather-into-temp loop body is elided in this listing. */
2543 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2549 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
/* first pass: copy the used coefficients out of block[] */
2551 for(i=0; i<=last; i++){
2552 const int j= scantable[i];
/* second pass: scatter them back at their permuted positions */
2557 for(i=0; i<=last; i++){
2558 const int j= scantable[i];
2559 const int perm_j= permutation[j];
2560 block[perm_j]= temp[j];
/* Dummy comparison function (body elided in this listing; presumably
 * returns a constant -- confirm against the complete source). */
2564 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fills the cmp[] table (five size variants) with the comparison
 * functions matching the requested FF_CMP_* type; the selection
 * switch is largely elided in this listing. */
2568 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
2571 memset(cmp, 0, sizeof(void*)*5);
2579 cmp[i]= c->hadamard8_diff[i];
2585 cmp[i]= c->dct_sad[i];
2588 cmp[i]= c->quant_psnr[i];
/* unknown type: nothing valid was selected */
2606 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2612 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* zeroes all six 8x8 DCT blocks of one macroblock */
2614 static void clear_blocks_c(DCTELEM *blocks)
2616 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for w bytes; unrolled by 8 with a scalar tail
 * loop (tail loop header elided in this listing). */
2619 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
2621 for(i=0; i+7<w; i+=8){
2622 dst[i+0] += src[i+0];
2623 dst[i+1] += src[i+1];
2624 dst[i+2] += src[i+2];
2625 dst[i+3] += src[i+3];
2626 dst[i+4] += src[i+4];
2627 dst[i+5] += src[i+5];
2628 dst[i+6] += src[i+6];
2629 dst[i+7] += src[i+7];
/* remaining 0..7 bytes */
2632 dst[i+0] += src[i+0];
/* dst[i] = src1[i] - src2[i] for w bytes; unrolled by 8 with a
 * scalar tail loop (tail loop header elided in this listing). */
2635 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
2637 for(i=0; i+7<w; i+=8){
2638 dst[i+0] = src1[i+0]-src2[i+0];
2639 dst[i+1] = src1[i+1]-src2[i+1];
2640 dst[i+2] = src1[i+2]-src2[i+2];
2641 dst[i+3] = src1[i+3]-src2[i+3];
2642 dst[i+4] = src1[i+4]-src2[i+4];
2643 dst[i+5] = src1[i+5]-src2[i+5];
2644 dst[i+6] = src1[i+6]-src2[i+6];
2645 dst[i+7] = src1[i+7]-src2[i+7];
/* remaining 0..7 bytes */
2648 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median-prediction residual: pred is the median of left (l),
 * above (src1[i]) and left+above-aboveleft (lt); surrounding loop and
 * state updates are elided in this listing. */
2651 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
2659 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Hadamard butterfly helpers: BUTTERFLY2 writes sum/difference of two
 * inputs, BUTTERFLY1 does it in place (bodies elided in this listing);
 * BUTTERFLYA returns |x+y| + |x-y| without storing. */
2669 #define BUTTERFLY2(o1,o2,i1,i2) \
2673 #define BUTTERFLY1(x,y) \
2682 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/* 8x8 Hadamard-transformed SAD of (src - dst): 3-stage butterfly per
 * row, then per column, summing |.| of the final stage. Loop headers
 * and the sum accumulation wrapper are elided in this listing. */
2684 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
/* horizontal transform of each row of differences */
2692 //FIXME try pointer walks
2693 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2694 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2695 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2696 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2698 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2699 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2700 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2701 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2703 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2704 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2705 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2706 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical transform of each column */
2710 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2711 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2712 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2713 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2715 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2716 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2717 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2718 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* last butterfly stage folded into the absolute-value sum */
2721 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2722 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2723 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2724 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2730 printf("MAX:%d\n", maxi);
/* Intra variant of hadamard8_diff8x8_c: transforms the source pixels
 * themselves (no reference) and subtracts the DC term at the end so
 * the score reflects AC energy only. */
2736 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2744 //FIXME try pointer walks
2745 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2746 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2747 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2748 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2750 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2751 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2752 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2753 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2755 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2756 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2757 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2758 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
2762 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2763 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2764 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2765 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2767 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2768 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2769 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2770 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2773 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2774 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2775 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2776 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2779 sum -= ABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: takes the pixel differences, forward-transforms
 * them (fdct call elided in this listing) and sums the coefficient
 * magnitudes. aligned_temp keeps the DCTELEM buffer 8-byte aligned
 * for SIMD fdct implementations. */
2784 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2785 MpegEncContext * const s= (MpegEncContext *)c;
2786 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2787 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2792 s->dsp.diff_pixels(temp, src1, src2, stride);
2801 void simple_idct(DCTELEM *block); //FIXME
/* Quantization-noise metric: DCT the difference block, quantize +
 * dequantize + IDCT it, and return the squared error against the
 * saved unquantized coefficients. */
2803 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2804 MpegEncContext * const s= (MpegEncContext *)c;
2805 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2806 DCTELEM * const temp= (DCTELEM*)aligned_temp;
/* second half of the aligned buffer holds the pre-quantization copy */
2807 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
2813 s->dsp.diff_pixels(temp, src1, src2, stride);
2815 memcpy(bak, temp, 64*sizeof(DCTELEM));
2817 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2818 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2819 simple_idct(temp); //FIXME
2822 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for one 8x8 block: estimates the VLC bit
 * cost of the quantized coefficients and adds it (scaled by qscale^2)
 * to the reconstruction SSE. Several lines (run/level bookkeeping,
 * intra/inter branch) are elided in this listing. */
2827 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2828 MpegEncContext * const s= (MpegEncContext *)c;
2829 const uint8_t *scantable= s->intra_scantable.permutated;
2830 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2831 uint64_t __align8 aligned_bak[stride];
2832 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2833 uint8_t * const bak= (uint8_t*)aligned_bak;
2834 int i, last, run, bits, level, distoration, start_i; /* sic: "distoration" */
2835 const int esc_length= s->ac_esc_length;
2837 uint8_t * last_length;
/* save the reference rows (8 bytes per row via two 32-bit stores) */
2842 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2843 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2846 s->dsp.diff_pixels(temp, src1, src2, stride);
2848 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* choose intra or inter VLC tables */
2854 length = s->intra_ac_vlc_length;
2855 last_length= s->intra_ac_vlc_last_length;
2856 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2859 length = s->inter_ac_vlc_length;
2860 last_length= s->inter_ac_vlc_last_length;
/* accumulate VLC bits for all non-final coefficients */
2865 for(i=start_i; i<last; i++){
2866 int j= scantable[i];
/* level fits the unified table (|level| small enough)? */
2871 if((level&(~127)) == 0){
2872 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* final coefficient uses the "last" table */
2881 level= temp[i] + 64;
2885 if((level&(~127)) == 0){
2886 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* reconstruct and measure the distortion */
2894 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2896 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2899 s->dsp.idct_add(bak, stride, temp);
2901 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
2903 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-count metric: same VLC cost estimation as rd8x8_c but without
 * the reconstruction/SSE part; returns the estimated bits only.
 * (Return statement elided in this listing.) */
2906 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2907 MpegEncContext * const s= (MpegEncContext *)c;
2908 const uint8_t *scantable= s->intra_scantable.permutated;
2909 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2910 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2911 int i, last, run, bits, level, start_i;
2912 const int esc_length= s->ac_esc_length;
2914 uint8_t * last_length;
2918 s->dsp.diff_pixels(temp, src1, src2, stride);
2920 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* choose intra or inter VLC tables */
2926 length = s->intra_ac_vlc_length;
2927 last_length= s->intra_ac_vlc_last_length;
2928 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2931 length = s->inter_ac_vlc_length;
2932 last_length= s->inter_ac_vlc_last_length;
2937 for(i=start_i; i<last; i++){
2938 int j= scantable[i];
2943 if((level&(~127)) == 0){
2944 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2953 level= temp[i] + 64;
2957 if((level&(~127)) == 0){
2958 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-gradient SAD of the source itself (intra): sums |s[y] -
 * s[y+1]| over a 16-wide block; the row loop is elided here. */
2966 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
2971 for(x=0; x<16; x+=4){
2972 score+= ABS(s[x ] - s[x +stride]) + ABS(s[x+1] - s[x+1+stride])
2973 +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
/* same measure on the difference signal s1-s2 (inter) */
2981 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2986 for(x=0; x<16; x++){
2987 score+= ABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* squared-error counterparts of the vsad functions above */
2996 #define SQ(a) ((a)*(a))
2997 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3002 for(x=0; x<16; x+=4){
3003 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3004 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* squared vertical gradient of the difference signal (inter) */
3012 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3017 for(x=0; x<16; x++){
3018 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* build 16x16 comparison functions from the 8x8 kernels above
 * (WARPER8_16_SQ is defined elsewhere in this file) */
3027 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3028 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3029 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3030 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3031 WARPER8_16_SQ(rd8x8_c, rd16_c)
3032 WARPER8_16_SQ(bit8x8_c, bit16_c)
3034 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* IDCT the block (call elided in this listing) and store the clamped
 * result into dest */
3036 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3039 put_pixels_clamped_c(block, dest, line_size);
/* IDCT the block and add the clamped result onto dest */
3041 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3044 add_pixels_clamped_c(block, dest, line_size);
3047 /* init static data */
/* Builds the global lookup tables: cropTbl clamps MAX_NEG_CROP-
 * extended indices to 0..255, squareTbl holds (i-256)^2, and
 * inv_zigzag_direct16 is the 1-based inverse zigzag used by the
 * MMX quantizer. */
3048 void dsputil_static_init(void)
3052 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3053 for(i=0;i<MAX_NEG_CROP;i++) {
/* saturate the above-range tail to 255 */
3055 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3058 for(i=0;i<512;i++) {
3059 squareTbl[i] = (i - 256) * (i - 256);
3062 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Fill in the DSPContext function-pointer table with the generic C
 * implementations, honoring the (f)DCT/IDCT algorithm choices in avctx,
 * then let the per-architecture init functions override entries with
 * optimized versions, and finally build the IDCT coefficient-permutation
 * table matching whichever IDCT was selected.
 * NOTE(review): many structural lines (closing braces, #ifdef guards
 * around the per-arch calls, loop headers and break statements in the
 * final switch) are elided in this view of the file. */
3066 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
/* Encoder-only: select the forward DCT from avctx->dct_algo. */
3070 #ifdef CONFIG_ENCODERS
3071 if(avctx->dct_algo==FF_DCT_FASTINT) {
3072 c->fdct = fdct_ifast;
3073 c->fdct248 = fdct_ifast248;
3075 else if(avctx->dct_algo==FF_DCT_FAAN) {
3076 c->fdct = ff_faandct;
3077 c->fdct248 = ff_faandct248;
3080 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3081 c->fdct248 = ff_fdct248_islow;
3083 #endif //CONFIG_ENCODERS
/* Select the inverse DCT from avctx->idct_algo; the permutation type
 * recorded here drives the switch at the bottom of this function. */
3085 if(avctx->idct_algo==FF_IDCT_INT){
3086 c->idct_put= ff_jref_idct_put;
3087 c->idct_add= ff_jref_idct_add;
3088 c->idct = j_rev_dct;
3089 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3090 }else{ //accurate/default
3091 c->idct_put= simple_idct_put;
3092 c->idct_add= simple_idct_add;
3093 c->idct = simple_idct;
3094 c->idct_permutation_type= FF_NO_IDCT_PERM;
/* Basic block/pixel helpers. */
3097 c->get_pixels = get_pixels_c;
3098 c->diff_pixels = diff_pixels_c;
3099 c->put_pixels_clamped = put_pixels_clamped_c;
3100 c->add_pixels_clamped = add_pixels_clamped_c;
3103 c->clear_blocks = clear_blocks_c;
3104 c->pix_sum = pix_sum_c;
3105 c->pix_norm1 = pix_norm1_c;
/* Sum-of-absolute-differences, plain and half-pel interpolated:
 * index [0] is the 16-wide variant, [1] the 8-wide variant;
 * second index: 0 = full-pel, 1 = x half-pel, 2 = y half-pel, 3 = xy. */
3107 /* TODO [0] 16 [1] 8 */
3108 c->pix_abs[0][0] = pix_abs16_c;
3109 c->pix_abs[0][1] = pix_abs16_x2_c;
3110 c->pix_abs[0][2] = pix_abs16_y2_c;
3111 c->pix_abs[0][3] = pix_abs16_xy2_c;
3112 c->pix_abs[1][0] = pix_abs8_c;
3113 c->pix_abs[1][1] = pix_abs8_x2_c;
3114 c->pix_abs[1][2] = pix_abs8_y2_c;
3115 c->pix_abs[1][3] = pix_abs8_xy2_c;
/* Fill one 4-entry half-pel motion-compensation table:
 * [0]=full-pel, [1]=x2 (x half-pel), [2]=y2, [3]=xy2. */
3117 #define dspfunc(PFX, IDX, NUM) \
3118 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3119 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3120 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3121 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3123 dspfunc(put, 0, 16);
3124 dspfunc(put_no_rnd, 0, 16);
3126 dspfunc(put_no_rnd, 1, 8);
3130 dspfunc(avg, 0, 16);
3131 dspfunc(avg_no_rnd, 0, 16);
3133 dspfunc(avg_no_rnd, 1, 8);
/* Third-pel (tpel) MC table; index encodes the mcXY fraction (entries
 * 3, 7 and 11 are not set here — NOTE(review): presumably unused or set
 * elsewhere; confirm against DSPContext's users). */
3138 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3139 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3140 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3141 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3142 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3143 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3144 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3145 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3146 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3148 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3149 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3150 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3151 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3152 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3153 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3154 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3155 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3156 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* Redefine dspfunc to fill a full 16-entry quarter-pel MC table;
 * entry index is 4*y_frac + x_frac (mc00 .. mc33). */
3158 #define dspfunc(PFX, IDX, NUM) \
3159 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3160 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3161 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3162 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3163 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3164 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3165 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3166 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3167 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3168 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3169 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3170 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3171 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3172 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3173 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3174 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3176 dspfunc(put_qpel, 0, 16);
3177 dspfunc(put_no_rnd_qpel, 0, 16);
3179 dspfunc(avg_qpel, 0, 16);
3180 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3182 dspfunc(put_qpel, 1, 8);
3183 dspfunc(put_no_rnd_qpel, 1, 8);
3185 dspfunc(avg_qpel, 1, 8);
3186 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3188 dspfunc(put_h264_qpel, 0, 16);
3189 dspfunc(put_h264_qpel, 1, 8);
3190 dspfunc(put_h264_qpel, 2, 4);
3191 dspfunc(avg_h264_qpel, 0, 16);
3192 dspfunc(avg_h264_qpel, 1, 8);
3193 dspfunc(avg_h264_qpel, 2, 4);
/* H.264 chroma MC: [0]=8-wide, [1]=4-wide, [2]=2-wide. */
3196 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3197 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3198 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3199 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3200 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3201 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
/* WMV2/MSMPEG4 (mspel) MC table — note it only covers x-fractions at
 * y = 0 and y = 2 (mc00..mc30, mc02..mc32). */
3203 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3204 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3205 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3206 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3207 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3208 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3209 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3210 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* Comparison functions for motion estimation / RD decisions:
 * slot [0] = 16-wide, [1] = 8x8 (and [4] = 16-wide intra where set). */
3212 #define SET_CMP_FUNC(name) \
3213 c->name[0]= name ## 16_c;\
3214 c->name[1]= name ## 8x8_c;
3216 SET_CMP_FUNC(hadamard8_diff)
3217 c->hadamard8_diff[4]= hadamard8_intra16_c;
3218 SET_CMP_FUNC(dct_sad)
3219 c->sad[0]= pix_abs16_c;
3220 c->sad[1]= pix_abs8_c;
3223 SET_CMP_FUNC(quant_psnr)
3226 c->vsad[0]= vsad16_c;
3227 c->vsad[4]= vsad_intra16_c;
3228 c->vsse[0]= vsse16_c;
3229 c->vsse[4]= vsse_intra16_c;
/* HuffYUV / lossless helpers and byte-order utility. */
3231 c->add_bytes= add_bytes_c;
3232 c->diff_bytes= diff_bytes_c;
3233 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3234 c->bswap_buf= bswap_buf;
3236 c->h263_h_loop_filter= h263_h_loop_filter_c;
3237 c->h263_v_loop_filter= h263_v_loop_filter_c;
/* Per-architecture overrides of the C defaults set above.
 * NOTE(review): the #ifdef HAVE_*/ /* guards around each call are elided
 * in this view — confirm against the full source. */
3240 dsputil_init_mmx(c, avctx);
3243 dsputil_init_armv4l(c, avctx);
3246 dsputil_init_mlib(c, avctx);
3249 dsputil_init_alpha(c, avctx);
3252 dsputil_init_ppc(c, avctx);
3255 dsputil_init_mmi(c, avctx);
3258 dsputil_init_sh4(c,avctx);
/* Build the 64-entry coefficient permutation table used to reorder scan
 * tables for the selected IDCT.  NOTE(review): the for-loop headers and
 * break statements of the cases are elided in this view. */
3261 switch(c->idct_permutation_type){
3262 case FF_NO_IDCT_PERM:
3264 c->idct_permutation[i]= i;
3266 case FF_LIBMPEG2_IDCT_PERM:
/* Swap bit 0 with bits 1-2 of the column index (libmpeg2 layout). */
3268 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3270 case FF_SIMPLE_IDCT_PERM:
3272 c->idct_permutation[i]= simple_mmx_permutation[i];
3274 case FF_TRANSPOSE_IDCT_PERM:
/* Transpose: swap the 3-bit row and column indices. */
3276 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3279 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");