3 * Copyright (c) 2000, 2001 Fabrice Bellard.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
29 #include "mpegvideo.h"
30 #include "simple_idct.h"
/* Clipping table indexed via (cropTbl + MAX_NEG_CROP)[v]: clamps v to 0..255
   with MAX_NEG_CROP guard entries on each side. NOTE(review): filled by init
   code not visible in this chunk. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* (squareTbl + 256)[x] == x*x for -256 <= x < 256; used by the SSE/norm
   helpers below. NOTE(review): presumably filled at init — not visible here. */
uint32_t squareTbl[512];
/* Standard zigzag scan order for 8x8 transform coefficients.
   Fix: the array initializer was missing its closing brace. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): filled at runtime by init code not visible in this chunk;
   __align8 is a project alignment attribute macro. */
uint16_t __align8 inv_zigzag_direct16[64];
/* Alternate horizontal scan order (interlaced material).
   Fix: the array initializer was missing its closing brace. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (interlaced material).
   Fix: the array initializer was missing its closing brace. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Fix: the table initializer was missing its closing brace. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for the simple_idct_mmx */
/* Fix: the table initializer was missing its closing brace. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Sum of all pixels of a 16x16 block.
 * pix: top-left of the block; line_size: stride in bytes.
 * Returns the sum (max 255*256, fits easily in int).
 * Fix: the truncated body (declarations, unrolled sums, return) restored. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        /* advance to the next row (16 bytes already consumed) */
        pix += line_size - 16;
    }
    return s;
}
142 static int pix_norm1_c(uint8_t * pix, int line_size)
145 uint32_t *sq = squareTbl + 256;
148 for (i = 0; i < 16; i++) {
149 for (j = 0; j < 16; j += 8) {
160 #if LONG_MAX > 2147483647
161 register uint64_t x=*(uint64_t*)pix;
163 s += sq[(x>>8)&0xff];
164 s += sq[(x>>16)&0xff];
165 s += sq[(x>>24)&0xff];
166 s += sq[(x>>32)&0xff];
167 s += sq[(x>>40)&0xff];
168 s += sq[(x>>48)&0xff];
169 s += sq[(x>>56)&0xff];
171 register uint32_t x=*(uint32_t*)pix;
173 s += sq[(x>>8)&0xff];
174 s += sq[(x>>16)&0xff];
175 s += sq[(x>>24)&0xff];
176 x=*(uint32_t*)(pix+4);
178 s += sq[(x>>8)&0xff];
179 s += sq[(x>>16)&0xff];
180 s += sq[(x>>24)&0xff];
185 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst (may alias).
 * Main loop is unrolled by 8; a tail loop handles the remaining words.
 * Fix: the truncated tail loop and closing braces restored. */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
208 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
211 uint32_t *sq = squareTbl + 256;
214 for (i = 0; i < 8; i++) {
215 s += sq[pix1[0] - pix2[0]];
216 s += sq[pix1[1] - pix2[1]];
217 s += sq[pix1[2] - pix2[2]];
218 s += sq[pix1[3] - pix2[3]];
219 s += sq[pix1[4] - pix2[4]];
220 s += sq[pix1[5] - pix2[5]];
221 s += sq[pix1[6] - pix2[6]];
222 s += sq[pix1[7] - pix2[7]];
229 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
232 uint32_t *sq = squareTbl + 256;
235 for (i = 0; i < 16; i++) {
236 s += sq[pix1[ 0] - pix2[ 0]];
237 s += sq[pix1[ 1] - pix2[ 1]];
238 s += sq[pix1[ 2] - pix2[ 2]];
239 s += sq[pix1[ 3] - pix2[ 3]];
240 s += sq[pix1[ 4] - pix2[ 4]];
241 s += sq[pix1[ 5] - pix2[ 5]];
242 s += sq[pix1[ 6] - pix2[ 6]];
243 s += sq[pix1[ 7] - pix2[ 7]];
244 s += sq[pix1[ 8] - pix2[ 8]];
245 s += sq[pix1[ 9] - pix2[ 9]];
246 s += sq[pix1[10] - pix2[10]];
247 s += sq[pix1[11] - pix2[11]];
248 s += sq[pix1[12] - pix2[12]];
249 s += sq[pix1[13] - pix2[13]];
250 s += sq[pix1[14] - pix2[14]];
251 s += sq[pix1[15] - pix2[15]];
259 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
263 /* read the pixels */
265 block[0] = pixels[0];
266 block[1] = pixels[1];
267 block[2] = pixels[2];
268 block[3] = pixels[3];
269 block[4] = pixels[4];
270 block[5] = pixels[5];
271 block[6] = pixels[6];
272 block[7] = pixels[7];
278 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
279 const uint8_t *s2, int stride){
282 /* read the pixels */
284 block[0] = s1[0] - s2[0];
285 block[1] = s1[1] - s2[1];
286 block[2] = s1[2] - s2[2];
287 block[3] = s1[3] - s2[3];
288 block[4] = s1[4] - s2[4];
289 block[5] = s1[5] - s2[5];
290 block[6] = s1[6] - s2[6];
291 block[7] = s1[7] - s2[7];
299 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
303 uint8_t *cm = cropTbl + MAX_NEG_CROP;
305 /* read the pixels */
307 pixels[0] = cm[block[0]];
308 pixels[1] = cm[block[1]];
309 pixels[2] = cm[block[2]];
310 pixels[3] = cm[block[3]];
311 pixels[4] = cm[block[4]];
312 pixels[5] = cm[block[5]];
313 pixels[6] = cm[block[6]];
314 pixels[7] = cm[block[7]];
321 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
325 uint8_t *cm = cropTbl + MAX_NEG_CROP;
327 /* read the pixels */
329 pixels[0] = cm[pixels[0] + block[0]];
330 pixels[1] = cm[pixels[1] + block[1]];
331 pixels[2] = cm[pixels[2] + block[2]];
332 pixels[3] = cm[pixels[3] + block[3]];
333 pixels[4] = cm[pixels[4] + block[4]];
334 pixels[5] = cm[pixels[5] + block[5]];
335 pixels[6] = cm[pixels[6] + block[6]];
336 pixels[7] = cm[pixels[7] + block[7]];
343 #define PIXOP2(OPNAME, OP) \
344 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
348 OP(*((uint64_t*)block), LD64(pixels));\
354 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
358 const uint64_t a= LD64(pixels );\
359 const uint64_t b= LD64(pixels+1);\
360 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
366 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
370 const uint64_t a= LD64(pixels );\
371 const uint64_t b= LD64(pixels+1);\
372 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
378 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
382 const uint64_t a= LD64(pixels );\
383 const uint64_t b= LD64(pixels+line_size);\
384 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
390 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
394 const uint64_t a= LD64(pixels );\
395 const uint64_t b= LD64(pixels+line_size);\
396 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
402 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
405 const uint64_t a= LD64(pixels );\
406 const uint64_t b= LD64(pixels+1);\
407 uint64_t l0= (a&0x0303030303030303ULL)\
408 + (b&0x0303030303030303ULL)\
409 + 0x0202020202020202ULL;\
410 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
411 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
415 for(i=0; i<h; i+=2){\
416 uint64_t a= LD64(pixels );\
417 uint64_t b= LD64(pixels+1);\
418 l1= (a&0x0303030303030303ULL)\
419 + (b&0x0303030303030303ULL);\
420 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
421 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
422 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
427 l0= (a&0x0303030303030303ULL)\
428 + (b&0x0303030303030303ULL)\
429 + 0x0202020202020202ULL;\
430 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
431 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
432 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
438 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
441 const uint64_t a= LD64(pixels );\
442 const uint64_t b= LD64(pixels+1);\
443 uint64_t l0= (a&0x0303030303030303ULL)\
444 + (b&0x0303030303030303ULL)\
445 + 0x0101010101010101ULL;\
446 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
447 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
451 for(i=0; i<h; i+=2){\
452 uint64_t a= LD64(pixels );\
453 uint64_t b= LD64(pixels+1);\
454 l1= (a&0x0303030303030303ULL)\
455 + (b&0x0303030303030303ULL);\
456 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
457 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
458 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
463 l0= (a&0x0303030303030303ULL)\
464 + (b&0x0303030303030303ULL)\
465 + 0x0101010101010101ULL;\
466 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
467 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
468 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
474 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
475 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
476 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
477 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
478 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
479 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
480 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
482 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
483 #else // 64 bit variant
485 #define PIXOP2(OPNAME, OP) \
486 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
489 OP(*((uint16_t*)(block )), LD16(pixels ));\
494 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
497 OP(*((uint32_t*)(block )), LD32(pixels ));\
502 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
505 OP(*((uint32_t*)(block )), LD32(pixels ));\
506 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
511 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
512 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
515 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
516 int src_stride1, int src_stride2, int h){\
520 a= LD32(&src1[i*src_stride1 ]);\
521 b= LD32(&src2[i*src_stride2 ]);\
522 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
523 a= LD32(&src1[i*src_stride1+4]);\
524 b= LD32(&src2[i*src_stride2+4]);\
525 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
529 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
530 int src_stride1, int src_stride2, int h){\
534 a= LD32(&src1[i*src_stride1 ]);\
535 b= LD32(&src2[i*src_stride2 ]);\
536 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
537 a= LD32(&src1[i*src_stride1+4]);\
538 b= LD32(&src2[i*src_stride2+4]);\
539 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
543 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
544 int src_stride1, int src_stride2, int h){\
548 a= LD32(&src1[i*src_stride1 ]);\
549 b= LD32(&src2[i*src_stride2 ]);\
550 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
554 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
555 int src_stride1, int src_stride2, int h){\
559 a= LD16(&src1[i*src_stride1 ]);\
560 b= LD16(&src2[i*src_stride2 ]);\
561 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
565 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
566 int src_stride1, int src_stride2, int h){\
567 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
568 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
571 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
572 int src_stride1, int src_stride2, int h){\
573 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
574 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
577 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
578 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
581 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
582 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
585 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
586 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
589 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
590 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
593 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
594 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
597 uint32_t a, b, c, d, l0, l1, h0, h1;\
598 a= LD32(&src1[i*src_stride1]);\
599 b= LD32(&src2[i*src_stride2]);\
600 c= LD32(&src3[i*src_stride3]);\
601 d= LD32(&src4[i*src_stride4]);\
602 l0= (a&0x03030303UL)\
605 h0= ((a&0xFCFCFCFCUL)>>2)\
606 + ((b&0xFCFCFCFCUL)>>2);\
607 l1= (c&0x03030303UL)\
609 h1= ((c&0xFCFCFCFCUL)>>2)\
610 + ((d&0xFCFCFCFCUL)>>2);\
611 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
612 a= LD32(&src1[i*src_stride1+4]);\
613 b= LD32(&src2[i*src_stride2+4]);\
614 c= LD32(&src3[i*src_stride3+4]);\
615 d= LD32(&src4[i*src_stride4+4]);\
616 l0= (a&0x03030303UL)\
619 h0= ((a&0xFCFCFCFCUL)>>2)\
620 + ((b&0xFCFCFCFCUL)>>2);\
621 l1= (c&0x03030303UL)\
623 h1= ((c&0xFCFCFCFCUL)>>2)\
624 + ((d&0xFCFCFCFCUL)>>2);\
625 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
629 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
630 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
633 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
634 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
637 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
638 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
641 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
642 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
645 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
646 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
649 uint32_t a, b, c, d, l0, l1, h0, h1;\
650 a= LD32(&src1[i*src_stride1]);\
651 b= LD32(&src2[i*src_stride2]);\
652 c= LD32(&src3[i*src_stride3]);\
653 d= LD32(&src4[i*src_stride4]);\
654 l0= (a&0x03030303UL)\
657 h0= ((a&0xFCFCFCFCUL)>>2)\
658 + ((b&0xFCFCFCFCUL)>>2);\
659 l1= (c&0x03030303UL)\
661 h1= ((c&0xFCFCFCFCUL)>>2)\
662 + ((d&0xFCFCFCFCUL)>>2);\
663 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
664 a= LD32(&src1[i*src_stride1+4]);\
665 b= LD32(&src2[i*src_stride2+4]);\
666 c= LD32(&src3[i*src_stride3+4]);\
667 d= LD32(&src4[i*src_stride4+4]);\
668 l0= (a&0x03030303UL)\
671 h0= ((a&0xFCFCFCFCUL)>>2)\
672 + ((b&0xFCFCFCFCUL)>>2);\
673 l1= (c&0x03030303UL)\
675 h1= ((c&0xFCFCFCFCUL)>>2)\
676 + ((d&0xFCFCFCFCUL)>>2);\
677 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
680 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
681 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
682 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
683 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
685 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
686 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
687 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
688 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
691 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
693 int i, a0, b0, a1, b1;\
700 for(i=0; i<h; i+=2){\
706 block[0]= (a1+a0)>>2; /* FIXME non put */\
707 block[1]= (b1+b0)>>2;\
717 block[0]= (a1+a0)>>2;\
718 block[1]= (b1+b0)>>2;\
724 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
727 const uint32_t a= LD32(pixels );\
728 const uint32_t b= LD32(pixels+1);\
729 uint32_t l0= (a&0x03030303UL)\
732 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
733 + ((b&0xFCFCFCFCUL)>>2);\
737 for(i=0; i<h; i+=2){\
738 uint32_t a= LD32(pixels );\
739 uint32_t b= LD32(pixels+1);\
740 l1= (a&0x03030303UL)\
742 h1= ((a&0xFCFCFCFCUL)>>2)\
743 + ((b&0xFCFCFCFCUL)>>2);\
744 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
749 l0= (a&0x03030303UL)\
752 h0= ((a&0xFCFCFCFCUL)>>2)\
753 + ((b&0xFCFCFCFCUL)>>2);\
754 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
760 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
765 const uint32_t a= LD32(pixels );\
766 const uint32_t b= LD32(pixels+1);\
767 uint32_t l0= (a&0x03030303UL)\
770 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
771 + ((b&0xFCFCFCFCUL)>>2);\
775 for(i=0; i<h; i+=2){\
776 uint32_t a= LD32(pixels );\
777 uint32_t b= LD32(pixels+1);\
778 l1= (a&0x03030303UL)\
780 h1= ((a&0xFCFCFCFCUL)>>2)\
781 + ((b&0xFCFCFCFCUL)>>2);\
782 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
787 l0= (a&0x03030303UL)\
790 h0= ((a&0xFCFCFCFCUL)>>2)\
791 + ((b&0xFCFCFCFCUL)>>2);\
792 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
796 pixels+=4-line_size*(h+1);\
797 block +=4-line_size*h;\
801 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
806 const uint32_t a= LD32(pixels );\
807 const uint32_t b= LD32(pixels+1);\
808 uint32_t l0= (a&0x03030303UL)\
811 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
812 + ((b&0xFCFCFCFCUL)>>2);\
816 for(i=0; i<h; i+=2){\
817 uint32_t a= LD32(pixels );\
818 uint32_t b= LD32(pixels+1);\
819 l1= (a&0x03030303UL)\
821 h1= ((a&0xFCFCFCFCUL)>>2)\
822 + ((b&0xFCFCFCFCUL)>>2);\
823 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
828 l0= (a&0x03030303UL)\
831 h0= ((a&0xFCFCFCFCUL)>>2)\
832 + ((b&0xFCFCFCFCUL)>>2);\
833 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
837 pixels+=4-line_size*(h+1);\
838 block +=4-line_size*h;\
842 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
843 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
844 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
845 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
846 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
847 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
848 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
849 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
851 #define op_avg(a, b) a = rnd_avg32(a, b)
853 #define op_put(a, b) a = b
/* Rounded 2- and 4-sample averages used by the MC helpers below.
 * Fix: macro arguments are now parenthesized — the original expansions
 * (a+b+1) mis-associate when called with low-precedence expressions
 * such as avg2(x|y, z). */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* One-point global motion compensation: bilinear interpolation of an
 * 8-wide block at 1/16-pel position (x16, y16), over h rows.
 * Weights A..D sum to 256, so the result is >>8 after adding rounder.
 * Fix: the truncated row loop, pointer advances and braces restored. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/* Affine global motion compensation with per-pixel fractional source
 * coordinates; dxx/dxy/dyx/dyy are the affine gradients, shift the
 * fixed-point precision, r the rounding term.
 * NOTE(review): several original lines (coordinate accumulation, the outer
 * y loop, frac extraction, rounding/shift tails) are missing from this
 * chunk; the surviving lines are kept byte-identical below. */
static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
const int s= 1<<shift;
for(x=0; x<8; x++){ //XXX FIXME optimize
int src_x, src_y, frac_x, frac_y, index;
/* sample fully inside the source: bilinear blend of the 4 neighbours */
if((unsigned)src_x < width){
if((unsigned)src_y < height){
index= src_x + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*(s-frac_y)
+ ( src[index+stride ]*(s-frac_x)
+ src[index+stride+1]* frac_x )* frac_y
/* y outside: clamp the row, interpolate horizontally only */
index= src_x + clip(src_y, 0, height)*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*s
/* x outside: clamp the column, interpolate vertically only */
if((unsigned)src_y < height){
index= clip(src_x, 0, width) + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
+ src[index+stride ]* frac_y )*s
/* both outside: nearest clamped sample */
index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
dst[y*stride + x]= src[index ];
/* Third-pel MC, no sub-pel offset (mc00): dispatch to the plain
 * put_pixels helper matching the block width.
 * Fix: the truncated switch scaffold and braces restored. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/* Third-pel MC, horizontal 1/3 position: dst ~= (2*a + b)/3, rounded
 * (683/2048 approximates 1/3). Fix: truncated loop tail/braces restored. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, horizontal 2/3 position: dst ~= (a + 2*b)/3, rounded.
 * Fix: truncated loop tail/braces restored. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, vertical 1/3 position: dst ~= (2*a + below)/3, rounded.
 * Fix: truncated loop tail/braces restored. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, diagonal (1/3,1/3): bilinear taps 4/3/3/2 over 12,
 * rounded (2731/32768 approximates 1/12).
 * Fix: truncated loop tail/braces restored. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, (1/3,2/3): bilinear taps 3/2/4/3 over 12, rounded.
 * Fix: truncated loop tail/braces restored. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, vertical 2/3 position: dst ~= (a + 2*below)/3, rounded.
 * Fix: truncated loop tail/braces restored. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, (2/3,1/3): bilinear taps 3/4/2/3 over 12, rounded.
 * Fix: truncated loop tail/braces restored. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, diagonal (2/3,2/3): bilinear taps 2/3/3/4 over 12, rounded.
 * Fix: truncated loop tail/braces restored. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging third-pel MC, no sub-pel offset: dispatch to the avg_pixels
 * helper matching the block width.
 * Fix: the truncated switch scaffold and braces restored. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/* Averaging variant of put_tpel mc10: rounded average of dst with the
 * interpolated value. Fix: truncated loop tail/braces restored. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel mc20. Fix: truncated loop tail restored. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel mc01. Fix: truncated loop tail restored. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel mc11. Fix: truncated loop tail restored. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel mc12. Fix: truncated loop tail restored. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel mc02. Fix: truncated loop tail restored. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel mc21. Fix: truncated loop tail restored. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of put_tpel mc22. Fix: truncated loop tail restored. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Generate fixed-width third-pel wrappers around the generic
 * put_tpel_pixels_mcXY_c helpers above.
 * Fix: each forwarded call had a stray `void` keyword in front of it
 * (`void put_tpel_pixels_mc00_c(dst, ...)`), which is not a valid C
 * statement once the macro expands — removed. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H264_CHROMA_MC(OPNAME, OP): generator for H.264 chroma MC at block
 * widths 2, 4 and 8.  (x,y) in [0,8) is the eighth-pel position; A..D
 * are the four bilinear taps (8-x)(8-y), x(8-y), (8-x)y, xy, which sum
 * to 64 — OP (op_put/op_avg below) performs the /64 rounding.
 * NOTE(review): this listing elides the per-row loop headers, the
 * dst/src stride advances and the closing braces of each function. */
1160 #define H264_CHROMA_MC(OPNAME, OP)\
1161 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1162 const int A=(8-x)*(8-y);\
1163 const int B=( x)*(8-y);\
1164 const int C=(8-x)*( y);\
1165 const int D=( x)*( y);\
1168 assert(x<8 && y<8 && x>=0 && y>=0);\
1172 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1173 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
/* 4-wide variant: same bilinear taps, unrolled over 4 columns. */\
1179 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1180 const int A=(8-x)*(8-y);\
1181 const int B=( x)*(8-y);\
1182 const int C=(8-x)*( y);\
1183 const int D=( x)*( y);\
1186 assert(x<8 && y<8 && x>=0 && y>=0);\
1190 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1191 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1192 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1193 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
/* 8-wide variant: unrolled over 8 columns. */\
1199 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1200 const int A=(8-x)*(8-y);\
1201 const int B=( x)*(8-y);\
1202 const int C=(8-x)*( y);\
1203 const int D=( x)*( y);\
1206 assert(x<8 && y<8 && x>=0 && y>=0);\
1210 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1211 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1212 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1213 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1214 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1215 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1216 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1217 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* Rounding ops for H264_CHROMA_MC: the bilinear weights sum to 64, so
 * (b+32)>>6 is the rounded /64.  op_avg additionally averages (rounding
 * up) with the value already in dst.  Two instantiations: put_ and avg_. */
1223 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1224 #define op_put(a, b) a = (((b) + 32)>>6)
1226 H264_CHROMA_MC(put_ , op_put)
1227 H264_CHROMA_MC(avg_ , op_avg)
/* copy_block{4,8,16,17,9}: rectangular byte-block copies done as 32-bit
 * loads/stores via the LD32/ST32 macros (which handle unaligned access).
 * The 17- and 9-wide variants exist for qpel: they copy one extra edge
 * column — presumably via a trailing per-row byte copy that this listing
 * elides (TODO confirm against the full file).
 * NOTE(review): the listing also elides each function's h-loop, the
 * per-row dst/src stride advances and the closing braces. */
1231 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1236 ST32(dst , LD32(src ));
1242 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1247 ST32(dst , LD32(src ));
1248 ST32(dst+4 , LD32(src+4 ));
1254 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1259 ST32(dst , LD32(src ));
1260 ST32(dst+4 , LD32(src+4 ));
1261 ST32(dst+8 , LD32(src+8 ));
1262 ST32(dst+12, LD32(src+12));
1268 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1273 ST32(dst , LD32(src ));
1274 ST32(dst+4 , LD32(src+4 ));
1275 ST32(dst+8 , LD32(src+8 ));
1276 ST32(dst+12, LD32(src+12));
1283 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1288 ST32(dst , LD32(src ));
1289 ST32(dst+4 , LD32(src+4 ));
/* QPEL_MC(r, OPNAME, RND, OP): generator for MPEG-4 quarter-pel MC.
 * The horizontal 8-wide lowpass applies the halfpel filter with taps
 * (20,-6,3,-1)/32; taps that fall outside the block are mirrored back
 * inside (visible in how dst[0] reuses src[0..2] and dst[7] reuses
 * src[4..8]).  OP (bound at instantiation, see op_put/op_avg below)
 * does the /32 rounding through the cropTbl clamp `cm`.
 * NOTE(review): this listing elides the h-loop headers, dst/src
 * advances and closing braces throughout the macro. */
1297 #define QPEL_MC(r, OPNAME, RND, OP) \
1298 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1299 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1303 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1304 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1305 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1306 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1307 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1308 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1309 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1310 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* Vertical counterpart of the 8-wide MPEG-4 halfpel filter: one column
 * at a time, all 9 source rows loaded up front, fully unrolled over the
 * 8 output rows with the same mirrored edge handling. */\
1316 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1318 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1322 const int src0= src[0*srcStride];\
1323 const int src1= src[1*srcStride];\
1324 const int src2= src[2*srcStride];\
1325 const int src3= src[3*srcStride];\
1326 const int src4= src[4*srcStride];\
1327 const int src5= src[5*srcStride];\
1328 const int src6= src[6*srcStride];\
1329 const int src7= src[7*srcStride];\
1330 const int src8= src[8*srcStride];\
1331 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1332 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1333 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1334 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1335 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1336 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1337 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1338 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* 16-wide horizontal MPEG-4 halfpel filter: same (20,-6,3,-1)/32 taps
 * as the 8-wide version, mirrored at both block edges. */\
1344 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1345 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1350 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1351 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1352 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1353 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1354 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1355 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1356 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1357 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1358 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1359 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1360 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1361 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1362 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1363 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1364 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1365 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* 16-wide vertical MPEG-4 halfpel filter: all 17 source rows of the
 * current column loaded up front, fully unrolled over 16 output rows,
 * with mirrored edge taps at top and bottom. */\
1371 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1372 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1377 const int src0= src[0*srcStride];\
1378 const int src1= src[1*srcStride];\
1379 const int src2= src[2*srcStride];\
1380 const int src3= src[3*srcStride];\
1381 const int src4= src[4*srcStride];\
1382 const int src5= src[5*srcStride];\
1383 const int src6= src[6*srcStride];\
1384 const int src7= src[7*srcStride];\
1385 const int src8= src[8*srcStride];\
1386 const int src9= src[9*srcStride];\
1387 const int src10= src[10*srcStride];\
1388 const int src11= src[11*srcStride];\
1389 const int src12= src[12*srcStride];\
1390 const int src13= src[13*srcStride];\
1391 const int src14= src[14*srcStride];\
1392 const int src15= src[15*srcStride];\
1393 const int src16= src[16*srcStride];\
1394 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1395 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1396 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1397 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1398 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1399 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1400 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1401 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1402 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1403 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1404 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1405 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1406 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1407 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1408 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1409 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* qpel8 wrappers for the axis-aligned quarter-pel positions mcXY
 * (X = horizontal, Y = vertical quarter offset).  mc00 = plain copy;
 * mc20/mc02 = pure halfpel filter; mc10/30 and mc01/03 average the
 * halfpel result with the nearer integer samples (pixels8_l2).
 * Vertical cases first copy 9 rows into `full` (stride 16) so the
 * filter can read one row past the block.
 * NOTE(review): listing elides the `uint8_t half[64];` declarations
 * and the closing braces of each wrapper. */\
1415 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1416 OPNAME ## pixels8_c(dst, src, stride, 8);\
1419 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1421 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1422 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1425 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1426 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1429 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1431 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1432 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1435 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1436 uint8_t full[16*9];\
1438 copy_block9(full, src, 16, stride, 9);\
1439 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1440 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1443 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1444 uint8_t full[16*9];\
1445 copy_block9(full, src, 16, stride, 9);\
1446 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1449 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1450 uint8_t full[16*9];\
1452 copy_block9(full, src, 16, stride, 9);\
1453 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1454 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
/* qpel8 diagonal positions mc11/mc31/mc13/mc33.  The exported
 * ff_*_old_c variants build the result as a 4-way average (pixels8_l4)
 * of integer, halfH, halfV and halfHV planes; the current static
 * versions instead fold the vertical neighbor into halfH with an
 * averaged horizontal pass (pixels8_l2) before the vertical filter.
 * NOTE(review): listing elides the halfH/halfV array declarations and
 * closing braces. */\
1456 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1457 uint8_t full[16*9];\
1460 uint8_t halfHV[64];\
1461 copy_block9(full, src, 16, stride, 9);\
1462 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1463 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1464 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1465 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1467 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1468 uint8_t full[16*9];\
1470 uint8_t halfHV[64];\
1471 copy_block9(full, src, 16, stride, 9);\
1472 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1473 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1474 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1475 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1477 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1478 uint8_t full[16*9];\
1481 uint8_t halfHV[64];\
1482 copy_block9(full, src, 16, stride, 9);\
1483 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1484 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1485 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1486 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1488 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1489 uint8_t full[16*9];\
1491 uint8_t halfHV[64];\
1492 copy_block9(full, src, 16, stride, 9);\
1493 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1494 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1495 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1496 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1498 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1499 uint8_t full[16*9];\
1502 uint8_t halfHV[64];\
1503 copy_block9(full, src, 16, stride, 9);\
1504 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1505 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1506 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1507 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1509 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1510 uint8_t full[16*9];\
1512 uint8_t halfHV[64];\
1513 copy_block9(full, src, 16, stride, 9);\
1514 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1515 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1516 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1517 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1519 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1520 uint8_t full[16*9];\
1523 uint8_t halfHV[64];\
1524 copy_block9(full, src, 16, stride, 9);\
1525 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1526 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1527 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1528 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1530 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1531 uint8_t full[16*9];\
1533 uint8_t halfHV[64];\
1534 copy_block9(full, src, 16, stride, 9);\
1535 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1536 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1537 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1538 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
/* qpel8 center-row/column positions.  mc21/mc23 average the
 * horizontal-then-vertical halfpel (halfHV) with the top/bottom half of
 * halfH; mc12/mc32 run the vertical filter over an averaged halfH;
 * mc22 is the plain separable halfpel-halfpel case.  The exported
 * *_old_c variants use the separate halfV plane instead.
 * NOTE(review): listing elides halfH/halfV declarations and braces. */\
1540 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1542 uint8_t halfHV[64];\
1543 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1544 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1545 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1547 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1549 uint8_t halfHV[64];\
1550 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1551 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1552 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1554 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1555 uint8_t full[16*9];\
1558 uint8_t halfHV[64];\
1559 copy_block9(full, src, 16, stride, 9);\
1560 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1561 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1562 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1563 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1565 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1566 uint8_t full[16*9];\
1568 copy_block9(full, src, 16, stride, 9);\
1569 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1570 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1571 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1573 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1574 uint8_t full[16*9];\
1577 uint8_t halfHV[64];\
1578 copy_block9(full, src, 16, stride, 9);\
1579 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1580 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1581 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1582 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1584 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1585 uint8_t full[16*9];\
1587 copy_block9(full, src, 16, stride, 9);\
1588 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1589 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1590 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1592 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1594 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1595 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* qpel16 wrappers for the axis-aligned positions — same structure as
 * the qpel8 set, with 24-stride 17-row `full` buffers and 16-wide
 * lowpass/average helpers.
 * NOTE(review): listing elides the `uint8_t half[256];` declarations
 * and the closing braces. */\
1597 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1598 OPNAME ## pixels16_c(dst, src, stride, 16);\
1601 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1603 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1604 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1607 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1608 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1611 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1613 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1614 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1617 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1618 uint8_t full[24*17];\
1620 copy_block17(full, src, 24, stride, 17);\
1621 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1622 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1625 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1626 uint8_t full[24*17];\
1627 copy_block17(full, src, 24, stride, 17);\
1628 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1631 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1632 uint8_t full[24*17];\
1634 copy_block17(full, src, 24, stride, 17);\
1635 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1636 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
/* qpel16 diagonal positions mc11/mc31/mc13/mc33 — the 16-wide analogs
 * of the qpel8 diagonal set: exported *_old_c variants do the 4-way
 * pixels16_l4 average; the current static versions fold the integer
 * neighbor into halfH (pixels16_l2) before the vertical pass. */\
1638 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1639 uint8_t full[24*17];\
1640 uint8_t halfH[272];\
1641 uint8_t halfV[256];\
1642 uint8_t halfHV[256];\
1643 copy_block17(full, src, 24, stride, 17);\
1644 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1645 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1646 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1647 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1649 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1650 uint8_t full[24*17];\
1651 uint8_t halfH[272];\
1652 uint8_t halfHV[256];\
1653 copy_block17(full, src, 24, stride, 17);\
1654 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1655 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1656 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1657 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1659 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1660 uint8_t full[24*17];\
1661 uint8_t halfH[272];\
1662 uint8_t halfV[256];\
1663 uint8_t halfHV[256];\
1664 copy_block17(full, src, 24, stride, 17);\
1665 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1666 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1667 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1668 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1670 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1671 uint8_t full[24*17];\
1672 uint8_t halfH[272];\
1673 uint8_t halfHV[256];\
1674 copy_block17(full, src, 24, stride, 17);\
1675 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1676 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1677 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1678 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1680 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1681 uint8_t full[24*17];\
1682 uint8_t halfH[272];\
1683 uint8_t halfV[256];\
1684 uint8_t halfHV[256];\
1685 copy_block17(full, src, 24, stride, 17);\
1686 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1687 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1688 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1689 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1691 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1692 uint8_t full[24*17];\
1693 uint8_t halfH[272];\
1694 uint8_t halfHV[256];\
1695 copy_block17(full, src, 24, stride, 17);\
1696 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1697 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1698 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1699 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1701 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1702 uint8_t full[24*17];\
1703 uint8_t halfH[272];\
1704 uint8_t halfV[256];\
1705 uint8_t halfHV[256];\
1706 copy_block17(full, src, 24, stride, 17);\
1707 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1708 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1709 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1710 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1712 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1713 uint8_t full[24*17];\
1714 uint8_t halfH[272];\
1715 uint8_t halfHV[256];\
1716 copy_block17(full, src, 24, stride, 17);\
1717 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1718 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1719 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1720 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* qpel16 center-row/column positions — 16-wide analogs of the qpel8
 * mc21/mc23/mc12/mc32/mc22 wrappers; mc22 is the plain separable
 * halfpel case. */\
1722 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1723 uint8_t halfH[272];\
1724 uint8_t halfHV[256];\
1725 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1726 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1727 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1729 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1730 uint8_t halfH[272];\
1731 uint8_t halfHV[256];\
1732 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1733 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1734 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1736 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1737 uint8_t full[24*17];\
1738 uint8_t halfH[272];\
1739 uint8_t halfV[256];\
1740 uint8_t halfHV[256];\
1741 copy_block17(full, src, 24, stride, 17);\
1742 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1743 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1744 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1745 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1747 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1748 uint8_t full[24*17];\
1749 uint8_t halfH[272];\
1750 copy_block17(full, src, 24, stride, 17);\
1751 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1752 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1753 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1755 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1756 uint8_t full[24*17];\
1757 uint8_t halfH[272];\
1758 uint8_t halfV[256];\
1759 uint8_t halfHV[256];\
1760 copy_block17(full, src, 24, stride, 17);\
1761 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1762 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1763 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1764 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1766 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1767 uint8_t full[24*17];\
1768 uint8_t halfH[272];\
1769 copy_block17(full, src, 24, stride, 17);\
1770 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1771 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1772 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1774 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1775 uint8_t halfH[272];\
1776 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1777 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Rounding ops for QPEL_MC: the filter taps sum to 32, so cm[(b+16)>>5]
 * is the rounded, clamped /32; the no_rnd variants bias with +15 (round
 * down) for the MPEG-4 "no rounding" mode.  Three instantiations: put,
 * put_no_rnd, avg; avg_no_rnd was left disabled.
 * NOTE(review): the listing elides two lines here — presumably
 * `#undef op_avg` and `#undef op_put` between the visible undefs;
 * confirm against the full file. */
1780 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1781 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1782 #define op_put(a, b) a = cm[((b) + 16)>>5]
1783 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
1785 QPEL_MC(0, put_ , _ , op_put)
1786 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1787 QPEL_MC(0, avg_ , _ , op_avg)
1788 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1790 #undef op_avg_no_rnd
1792 #undef op_put_no_rnd
/* H264_LOWPASS(OPNAME, OP, OP2): generator for the H.264 luma 6-tap
 * (1,-5,20,20,-5,1) interpolation filters.  Unlike the MPEG-4 filter
 * above, out-of-block taps really read src[-2..] — the caller must
 * provide valid surrounding pixels.  OP rounds a single pass; OP2 is
 * used after the two-pass hv path (definitions not visible in this
 * chunk).  NOTE(review): loop headers, stride advances and closing
 * braces are elided throughout this listing. */
1795 #define H264_LOWPASS(OPNAME, OP, OP2) \
1796 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1798 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1802 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1803 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1804 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1805 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
/* Vertical 4-wide H.264 6-tap filter: loads rows -2..6 of the current
 * column (srcB/srcA are the two rows above the block), unrolled over
 * the 4 output rows. */\
1811 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1813 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1817 const int srcB= src[-2*srcStride];\
1818 const int srcA= src[-1*srcStride];\
1819 const int src0= src[0 *srcStride];\
1820 const int src1= src[1 *srcStride];\
1821 const int src2= src[2 *srcStride];\
1822 const int src3= src[3 *srcStride];\
1823 const int src4= src[4 *srcStride];\
1824 const int src5= src[5 *srcStride];\
1825 const int src6= src[6 *srcStride];\
1826 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1827 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1828 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1829 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
/* Two-pass 4-wide H.264 filter: horizontal pass at full precision into
 * the int16 `tmp` buffer (starting 2 rows above the block), then the
 * vertical 6-tap pass over tmp with OP2 (which must undo the double
 * scaling — definition not visible here).  `tmp -= tmpStride*(h+5-2)`
 * rewinds tmp past the 2 lead-in rows; `h` is defined in an elided
 * line (presumably the constant 4 — TODO confirm). */\
1835 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1838 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1840 src -= 2*srcStride;\
1841 for(i=0; i<h+5; i++)\
1843 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1844 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1845 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1846 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1850 tmp -= tmpStride*(h+5-2);\
1853 const int tmpB= tmp[-2*tmpStride];\
1854 const int tmpA= tmp[-1*tmpStride];\
1855 const int tmp0= tmp[0 *tmpStride];\
1856 const int tmp1= tmp[1 *tmpStride];\
1857 const int tmp2= tmp[2 *tmpStride];\
1858 const int tmp3= tmp[3 *tmpStride];\
1859 const int tmp4= tmp[4 *tmpStride];\
1860 const int tmp5= tmp[5 *tmpStride];\
1861 const int tmp6= tmp[6 *tmpStride];\
1862 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1863 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1864 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1865 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
/* 8-wide horizontal H.264 6-tap filter; reads src[-2..10] per row. */\
1871 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1873 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1877 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1878 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1879 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1880 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1881 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1882 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1883 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1884 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
/* 8-wide vertical H.264 6-tap filter: loads rows -2..10 of the current
 * column, unrolled over the 8 output rows. */\
1890 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1892 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1896 const int srcB= src[-2*srcStride];\
1897 const int srcA= src[-1*srcStride];\
1898 const int src0= src[0 *srcStride];\
1899 const int src1= src[1 *srcStride];\
1900 const int src2= src[2 *srcStride];\
1901 const int src3= src[3 *srcStride];\
1902 const int src4= src[4 *srcStride];\
1903 const int src5= src[5 *srcStride];\
1904 const int src6= src[6 *srcStride];\
1905 const int src7= src[7 *srcStride];\
1906 const int src8= src[8 *srcStride];\
1907 const int src9= src[9 *srcStride];\
1908 const int src10=src[10*srcStride];\
1909 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1910 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1911 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1912 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1913 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1914 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1915 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1916 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
/* Two-pass 8-wide H.264 filter: horizontal pass into int16 tmp (rows
 * -2..h+2), rewind tmp, then the vertical 6-tap pass with OP2.  As in
 * the 4-wide version, `h` comes from an elided declaration line
 * (presumably 8 — TODO confirm). */\
1922 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1925 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1927 src -= 2*srcStride;\
1928 for(i=0; i<h+5; i++)\
1930 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1931 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1932 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1933 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1934 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1935 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1936 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1937 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
1941 tmp -= tmpStride*(h+5-2);\
1944 const int tmpB= tmp[-2*tmpStride];\
1945 const int tmpA= tmp[-1*tmpStride];\
1946 const int tmp0= tmp[0 *tmpStride];\
1947 const int tmp1= tmp[1 *tmpStride];\
1948 const int tmp2= tmp[2 *tmpStride];\
1949 const int tmp3= tmp[3 *tmpStride];\
1950 const int tmp4= tmp[4 *tmpStride];\
1951 const int tmp5= tmp[5 *tmpStride];\
1952 const int tmp6= tmp[6 *tmpStride];\
1953 const int tmp7= tmp[7 *tmpStride];\
1954 const int tmp8= tmp[8 *tmpStride];\
1955 const int tmp9= tmp[9 *tmpStride];\
1956 const int tmp10=tmp[10*tmpStride];\
1957 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1958 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1959 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1960 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1961 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
1962 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
1963 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
1964 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
1970 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1971 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1972 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1973 src += 8*srcStride;\
1974 dst += 8*dstStride;\
1975 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1976 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1979 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1980 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
1981 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1982 src += 8*srcStride;\
1983 dst += 8*dstStride;\
1984 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
1985 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1988 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1989 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
1990 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
1991 src += 8*srcStride;\
1992 tmp += 8*tmpStride;\
1993 dst += 8*dstStride;\
1994 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
1995 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/**
 * H264_MC(OPNAME, SIZE): instantiates the 16 H.264 quarter-pel luma motion
 * compensation functions (mc00..mc33) for one block size.  The digit pair in
 * the name is the (x,y) quarter-pel phase.  Half-pel positions come from the
 * 6-tap lowpass helpers; quarter-pel positions average a half-pel result with
 * the nearest integer/half-pel sample via OPNAME##pixels##SIZE##_l2.
 * OPNAME is "put_" or "avg_" (write vs. average into dst).
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
2135 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2136 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2137 #define op_put(a, b) a = cm[((b) + 16)>>5]
2138 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2139 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2141 H264_LOWPASS(put_ , op_put, op2_put)
2142 H264_LOWPASS(avg_ , op_avg, op2_avg)
2156 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2157 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2161 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2162 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2163 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2164 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2165 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2166 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2167 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2168 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2174 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2175 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2179 const int src_1= src[ -srcStride];
2180 const int src0 = src[0 ];
2181 const int src1 = src[ srcStride];
2182 const int src2 = src[2*srcStride];
2183 const int src3 = src[3*srcStride];
2184 const int src4 = src[4*srcStride];
2185 const int src5 = src[5*srcStride];
2186 const int src6 = src[6*srcStride];
2187 const int src7 = src[7*srcStride];
2188 const int src8 = src[8*srcStride];
2189 const int src9 = src[9*srcStride];
2190 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2191 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2192 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2193 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2194 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2195 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2196 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2197 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* mspel (0,0): plain 8x8 copy, no interpolation. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
/* mspel (1,0): quarter-pel left — average source with the horizontal half-pel result. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
/* mspel (2,0): horizontal half-pel, filtered directly into dst. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/* mspel (3,0): quarter-pel right — average src+1 with the horizontal half-pel result. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
/* mspel (0,2): vertical half-pel, filtered directly into dst. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/*
 * mspel (1,2): average of the vertical half-pel (halfV) and the
 * horizontal-then-vertical half-pel (halfHV).  halfH holds 11 rows
 * (8 + 3 extra for the vertical filter's margin), starting one row above.
 */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/*
 * mspel (3,2): like mc12 but the vertical half-pel is taken one pixel
 * to the right (src+1) before averaging with the h+v filtered block.
 */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* mspel (2,2): horizontal half-pel (11 rows) then vertical half-pel into dst. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
/* Sum of absolute differences over a 16x16 block (integer-pel motion metric). */
static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row;

    for (row = 0; row < 16; row++) {
        sum += abs(pix1[0] - pix2[0]);
        sum += abs(pix1[1] - pix2[1]);
        sum += abs(pix1[2] - pix2[2]);
        sum += abs(pix1[3] - pix2[3]);
        sum += abs(pix1[4] - pix2[4]);
        sum += abs(pix1[5] - pix2[5]);
        sum += abs(pix1[6] - pix2[6]);
        sum += abs(pix1[7] - pix2[7]);
        sum += abs(pix1[8] - pix2[8]);
        sum += abs(pix1[9] - pix2[9]);
        sum += abs(pix1[10] - pix2[10]);
        sum += abs(pix1[11] - pix2[11]);
        sum += abs(pix1[12] - pix2[12]);
        sum += abs(pix1[13] - pix2[13]);
        sum += abs(pix1[14] - pix2[14]);
        sum += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/* 16x16 SAD against a horizontally half-pel interpolated reference (avg2 of x and x+1). */
static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row;

    for (row = 0; row < 16; row++) {
        sum += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        sum += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        sum += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        sum += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        sum += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        sum += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        sum += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        sum += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        sum += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        sum += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        sum += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        sum += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        sum += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        sum += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        sum += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        sum += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/* 16x16 SAD against a vertically half-pel interpolated reference (avg2 of row y and y+1). */
static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for (row = 0; row < 16; row++) {
        sum += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        sum += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        sum += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        sum += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        sum += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        sum += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        sum += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        sum += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        sum += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        sum += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        sum += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        sum += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        sum += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        sum += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        sum += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        sum += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
/* 16x16 SAD against a diagonally half-pel interpolated reference (avg4 of a 2x2 neighbourhood). */
static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for (row = 0; row < 16; row++) {
        sum += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        sum += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        sum += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        sum += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        sum += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        sum += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        sum += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        sum += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        sum += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        sum += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        sum += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        sum += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        sum += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        sum += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        sum += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        sum += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
/* Sum of absolute differences over an 8x8 block (integer-pel motion metric). */
static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row;

    for (row = 0; row < 8; row++) {
        sum += abs(pix1[0] - pix2[0]);
        sum += abs(pix1[1] - pix2[1]);
        sum += abs(pix1[2] - pix2[2]);
        sum += abs(pix1[3] - pix2[3]);
        sum += abs(pix1[4] - pix2[4]);
        sum += abs(pix1[5] - pix2[5]);
        sum += abs(pix1[6] - pix2[6]);
        sum += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/* 8x8 SAD against a horizontally half-pel interpolated reference. */
static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row;

    for (row = 0; row < 8; row++) {
        sum += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        sum += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        sum += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        sum += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        sum += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        sum += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        sum += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        sum += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
/* 8x8 SAD against a vertically half-pel interpolated reference. */
static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for (row = 0; row < 8; row++) {
        sum += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        sum += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        sum += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        sum += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        sum += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        sum += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        sum += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        sum += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
/* 8x8 SAD against a diagonally half-pel interpolated reference. */
static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int sum = 0;
    int row;
    uint8_t *pix3 = pix2 + line_size; /* row below pix2 */

    for (row = 0; row < 8; row++) {
        sum += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        sum += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        sum += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        sum += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        sum += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        sum += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        sum += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        sum += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return sum;
}
/* DSPContext-style 16x16 SAD wrapper; the context pointer `s` is unused. */
static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs16x16_c(a, b, stride);
}
/* DSPContext-style 8x8 SAD wrapper; the context pointer `s` is unused. */
static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
    return pix_abs8x8_c(a, b, stride);
}
2461 * permutes an 8x8 block.
2462 * @param block the block which will be permuted according to the given permutation vector
2463 * @param permutation the permutation vector
2464 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2465 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2466 * (inverse) permutated to scantable order!
2468 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2474 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2476 for(i=0; i<=last; i++){
2477 const int j= scantable[i];
2482 for(i=0; i<=last; i++){
2483 const int j= scantable[i];
2484 const int perm_j= permutation[j];
2485 block[perm_j]= temp[j];
2490 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2492 static void clear_blocks_c(DCTELEM *blocks)
2494 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for i in [0, w); unrolled by 8 with a scalar tail loop.
 * uint8_t arithmetic, so sums wrap modulo 256. */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i = 0;
    while (i + 7 < w) {
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
        i += 8;
    }
    for (; i < w; i++)
        dst[i] += src[i];
}
/* dst[i] = src1[i] - src2[i] for i in [0, w); unrolled by 8 with a scalar tail.
 * uint8_t arithmetic, so differences wrap modulo 256. */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i = 0;
    while (i + 7 < w) {
        dst[i+0] = src1[i+0] - src2[i+0];
        dst[i+1] = src1[i+1] - src2[i+1];
        dst[i+2] = src1[i+2] - src2[i+2];
        dst[i+3] = src1[i+3] - src2[i+3];
        dst[i+4] = src1[i+4] - src2[i+4];
        dst[i+5] = src1[i+5] - src2[i+5];
        dst[i+6] = src1[i+6] - src2[i+6];
        dst[i+7] = src1[i+7] - src2[i+7];
        i += 8;
    }
    for (; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
/* Butterfly primitives for the 8x8 Hadamard transform:
 * BUTTERFLY2 writes sum/difference of two inputs to two outputs;
 * BUTTERFLY1 replaces (x,y) with (x+y, x-y) in place;
 * BUTTERFLYA returns |x+y| + |x-y| (final accumulation stage). */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int tmp_a, tmp_b;\
    tmp_a = x;\
    tmp_b = y;\
    x = tmp_a + tmp_b;\
    y = tmp_a - tmp_b;\
}

#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/* Sum of absolute 8x8 Hadamard-transformed differences between src and dst
 * blocks (SATD).  Rows are transformed first, then columns; the last column
 * butterfly stage is folded into the BUTTERFLYA absolute-value accumulation. */
static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    int i;
    int temp[64];
    int sum=0;

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
    /* debug: track the largest SATD seen so far */
    {
        static int maxi=0;
        if(sum>maxi){
            maxi=sum;
            printf("MAX:%d\n", maxi);
        }
    }
#endif
    return sum;
}
/* Sum of absolute 8x8 Hadamard coefficients of (src - mean): an intra
 * activity/complexity metric.  Same transform structure as hadamard8_diff_c
 * but against a constant mean instead of a second block. */
static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    int i;
    int temp[64];
    int sum=0;
//FIXME OOOPS ignore 0 term instead of mean mess
    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    return sum;
}
2638 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2639 MpegEncContext * const s= (MpegEncContext *)c;
2640 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2641 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2644 s->dsp.diff_pixels(temp, src1, src2, stride);
2653 void simple_idct(DCTELEM *block); //FIXME
2655 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2656 MpegEncContext * const s= (MpegEncContext *)c;
2657 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2658 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2659 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
2664 s->dsp.diff_pixels(temp, src1, src2, stride);
2666 memcpy(bak, temp, 64*sizeof(DCTELEM));
2668 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2669 s->dct_unquantize(s, temp, 0, s->qscale);
2670 simple_idct(temp); //FIXME
2673 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2678 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2679 MpegEncContext * const s= (MpegEncContext *)c;
2680 const uint8_t *scantable= s->intra_scantable.permutated;
2681 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2682 uint64_t __align8 aligned_bak[stride];
2683 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2684 uint8_t * const bak= (uint8_t*)aligned_bak;
2685 int i, last, run, bits, level, distoration, start_i;
2686 const int esc_length= s->ac_esc_length;
2688 uint8_t * last_length;
2691 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2692 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2695 s->dsp.diff_pixels(temp, src1, src2, stride);
2697 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2703 length = s->intra_ac_vlc_length;
2704 last_length= s->intra_ac_vlc_last_length;
2705 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2708 length = s->inter_ac_vlc_length;
2709 last_length= s->inter_ac_vlc_last_length;
2714 for(i=start_i; i<last; i++){
2715 int j= scantable[i];
2720 if((level&(~127)) == 0){
2721 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2730 level= temp[i] + 64;
2734 if((level&(~127)) == 0){
2735 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2742 s->dct_unquantize(s, temp, 0, s->qscale);
2745 s->dsp.idct_add(bak, stride, temp);
2747 distoration= s->dsp.sse[1](NULL, bak, src1, stride);
2749 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2752 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
2753 MpegEncContext * const s= (MpegEncContext *)c;
2754 const uint8_t *scantable= s->intra_scantable.permutated;
2755 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2756 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2757 int i, last, run, bits, level, start_i;
2758 const int esc_length= s->ac_esc_length;
2760 uint8_t * last_length;
2762 s->dsp.diff_pixels(temp, src1, src2, stride);
2764 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2770 length = s->intra_ac_vlc_length;
2771 last_length= s->intra_ac_vlc_last_length;
2772 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2775 length = s->inter_ac_vlc_length;
2776 last_length= s->inter_ac_vlc_last_length;
2781 for(i=start_i; i<last; i++){
2782 int j= scantable[i];
2787 if((level&(~127)) == 0){
2788 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2797 level= temp[i] + 64;
2801 if((level&(~127)) == 0){
2802 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
2811 WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
2812 WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
2813 WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
2814 WARPER88_1616(rd8x8_c, rd16x16_c)
2815 WARPER88_1616(bit8x8_c, bit16x16_c)
2817 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2819 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2822 put_pixels_clamped_c(block, dest, line_size);
2824 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2827 add_pixels_clamped_c(block, dest, line_size);
2830 /* init static data */
2831 void dsputil_static_init(void)
2835 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
2836 for(i=0;i<MAX_NEG_CROP;i++) {
2838 cropTbl[i + MAX_NEG_CROP + 256] = 255;
2841 for(i=0;i<512;i++) {
2842 squareTbl[i] = (i - 256) * (i - 256);
2845 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2849 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2853 #ifdef CONFIG_ENCODERS
2854 if(avctx->dct_algo==FF_DCT_FASTINT)
2855 c->fdct = fdct_ifast;
2857 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2858 #endif //CONFIG_ENCODERS
2860 if(avctx->idct_algo==FF_IDCT_INT){
2861 c->idct_put= ff_jref_idct_put;
2862 c->idct_add= ff_jref_idct_add;
2863 c->idct = j_rev_dct;
2864 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2865 }else{ //accurate/default
2866 c->idct_put= simple_idct_put;
2867 c->idct_add= simple_idct_add;
2868 c->idct = simple_idct;
2869 c->idct_permutation_type= FF_NO_IDCT_PERM;
2872 c->get_pixels = get_pixels_c;
2873 c->diff_pixels = diff_pixels_c;
2874 c->put_pixels_clamped = put_pixels_clamped_c;
2875 c->add_pixels_clamped = add_pixels_clamped_c;
2878 c->clear_blocks = clear_blocks_c;
2879 c->pix_sum = pix_sum_c;
2880 c->pix_norm1 = pix_norm1_c;
2884 /* TODO [0] 16 [1] 8 */
2885 c->pix_abs16x16 = pix_abs16x16_c;
2886 c->pix_abs16x16_x2 = pix_abs16x16_x2_c;
2887 c->pix_abs16x16_y2 = pix_abs16x16_y2_c;
2888 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
2889 c->pix_abs8x8 = pix_abs8x8_c;
2890 c->pix_abs8x8_x2 = pix_abs8x8_x2_c;
2891 c->pix_abs8x8_y2 = pix_abs8x8_y2_c;
2892 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
2894 #define dspfunc(PFX, IDX, NUM) \
2895 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
2896 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
2897 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
2898 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
2900 dspfunc(put, 0, 16);
2901 dspfunc(put_no_rnd, 0, 16);
2903 dspfunc(put_no_rnd, 1, 8);
2907 dspfunc(avg, 0, 16);
2908 dspfunc(avg_no_rnd, 0, 16);
2910 dspfunc(avg_no_rnd, 1, 8);
2915 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2916 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2917 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2918 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2919 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2920 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2921 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2922 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2923 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2925 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2926 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2927 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2928 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2929 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2930 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2931 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2932 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2933 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2935 #define dspfunc(PFX, IDX, NUM) \
2936 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2937 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2938 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2939 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2940 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2941 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2942 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2943 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2944 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2945 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2946 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2947 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2948 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2949 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2950 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2951 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2953 dspfunc(put_qpel, 0, 16);
2954 dspfunc(put_no_rnd_qpel, 0, 16);
2956 dspfunc(avg_qpel, 0, 16);
2957 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2959 dspfunc(put_qpel, 1, 8);
2960 dspfunc(put_no_rnd_qpel, 1, 8);
2962 dspfunc(avg_qpel, 1, 8);
2963 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
2965 dspfunc(put_h264_qpel, 0, 16);
2966 dspfunc(put_h264_qpel, 1, 8);
2967 dspfunc(put_h264_qpel, 2, 4);
2968 dspfunc(avg_h264_qpel, 0, 16);
2969 dspfunc(avg_h264_qpel, 1, 8);
2970 dspfunc(avg_h264_qpel, 2, 4);
2973 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
2974 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
2975 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
2976 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
2977 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
2978 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
2980 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
2981 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2982 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2983 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2984 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2985 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2986 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2987 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
2989 c->hadamard8_diff[0]= hadamard8_diff16_c;
2990 c->hadamard8_diff[1]= hadamard8_diff_c;
2991 c->hadamard8_abs = hadamard8_abs_c;
2993 c->dct_sad[0]= dct_sad16x16_c;
2994 c->dct_sad[1]= dct_sad8x8_c;
2996 c->sad[0]= sad16x16_c;
2997 c->sad[1]= sad8x8_c;
2999 c->quant_psnr[0]= quant_psnr16x16_c;
3000 c->quant_psnr[1]= quant_psnr8x8_c;
3002 c->rd[0]= rd16x16_c;
3005 c->bit[0]= bit16x16_c;
3006 c->bit[1]= bit8x8_c;
3008 c->add_bytes= add_bytes_c;
3009 c->diff_bytes= diff_bytes_c;
3010 c->bswap_buf= bswap_buf;
3013 dsputil_init_mmx(c, avctx);
3016 dsputil_init_armv4l(c, avctx);
3019 dsputil_init_mlib(c, avctx);
3022 dsputil_init_alpha(c, avctx);
3025 dsputil_init_ppc(c, avctx);
3028 dsputil_init_mmi(c, avctx);
3031 dsputil_init_sh4(c,avctx);
3034 switch(c->idct_permutation_type){
3035 case FF_NO_IDCT_PERM:
3037 c->idct_permutation[i]= i;
3039 case FF_LIBMPEG2_IDCT_PERM:
3041 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3043 case FF_SIMPLE_IDCT_PERM:
3045 c->idct_permutation[i]= simple_mmx_permutation[i];
3047 case FF_TRANSPOSE_IDCT_PERM:
3049 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3052 fprintf(stderr, "Internal error, IDCT permutation not set\n");