3 * Copyright (c) 2000, 2001 Fabrice Bellard.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
#include <string.h>      /* memcpy() used by the copy_block* helpers */
#include "mpegvideo.h"
#include "simple_idct.h"
/* clipping lookup table, padded on both sides so indices in
   [-MAX_NEG_CROP, 255+MAX_NEG_CROP] are valid; filled at init time
   (init code not visible in this chunk) */
33 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
/* square lookup table used by pix_norm1/sse — NOTE(review): presumably
   squareTbl[256+i] == i*i, confirm against the init code (not in view) */
34 uint32_t squareTbl[512];
/* zigzag scan pattern for 8x8 DCT coefficient blocks: entry i is the
   raster index of the i-th coefficient in scan order.
   (restored the closing brace lost in extraction) */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
47 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* 8-byte-aligned scratch table; filled at init time (init code not in view) */
48 uint16_t __align8 inv_zigzag_direct16[64];
/* alternate (horizontal-biased) scan pattern, used for interlaced content;
   entry i is the raster index of the i-th coefficient in scan order.
   (restored the closing brace lost in extraction) */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* alternate (vertical-biased) scan pattern, used for interlaced content;
   entry i is the raster index of the i-th coefficient in scan order.
   (restored the closing brace lost in extraction) */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* reciprocal table for replacing integer division by a multiply+shift;
   inverse[b] is essentially ceil(2^32 / b). Entries 0 and 1 are not covered
   by the invariant above. (restored the closing brace lost in extraction) */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for the simple_idct_mmx: coefficient i of a block must
   be stored at position simple_mmx_permutation[i] before the MMX IDCT runs.
   It is a permutation of 0..63. (restored the closing brace lost in extraction) */
static const uint8_t simple_mmx_permutation[64]={
	0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
	0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
	0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
	0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
	0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
	0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
	0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
	0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/**
 * Sum all 256 pixel values of a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size byte stride between rows
 * @return sum of the 256 pixels (max 255*256, fits easily in int)
 * (restored the accumulator, unrolled adds and return lost in extraction)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            /* unrolled by 8, matching the style of the other C reference loops */
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;  /* step to the start of the next row */
    }
    return s;
}
142 static int pix_norm1_c(uint8_t * pix, int line_size)
145 uint32_t *sq = squareTbl + 256;
148 for (i = 0; i < 16; i++) {
149 for (j = 0; j < 16; j += 8) {
160 #if LONG_MAX > 2147483647
161 register uint64_t x=*(uint64_t*)pix;
163 s += sq[(x>>8)&0xff];
164 s += sq[(x>>16)&0xff];
165 s += sq[(x>>24)&0xff];
166 s += sq[(x>>32)&0xff];
167 s += sq[(x>>40)&0xff];
168 s += sq[(x>>48)&0xff];
169 s += sq[(x>>56)&0xff];
171 register uint32_t x=*(uint32_t*)pix;
173 s += sq[(x>>8)&0xff];
174 s += sq[(x>>16)&0xff];
175 s += sq[(x>>24)&0xff];
176 x=*(uint32_t*)(pix+4);
178 s += sq[(x>>8)&0xff];
179 s += sq[(x>>16)&0xff];
180 s += sq[(x>>24)&0xff];
185 pix += line_size - 16;
191 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
194 uint32_t *sq = squareTbl + 256;
197 for (i = 0; i < 8; i++) {
198 s += sq[pix1[0] - pix2[0]];
199 s += sq[pix1[1] - pix2[1]];
200 s += sq[pix1[2] - pix2[2]];
201 s += sq[pix1[3] - pix2[3]];
202 s += sq[pix1[4] - pix2[4]];
203 s += sq[pix1[5] - pix2[5]];
204 s += sq[pix1[6] - pix2[6]];
205 s += sq[pix1[7] - pix2[7]];
212 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
215 uint32_t *sq = squareTbl + 256;
218 for (i = 0; i < 16; i++) {
219 s += sq[pix1[ 0] - pix2[ 0]];
220 s += sq[pix1[ 1] - pix2[ 1]];
221 s += sq[pix1[ 2] - pix2[ 2]];
222 s += sq[pix1[ 3] - pix2[ 3]];
223 s += sq[pix1[ 4] - pix2[ 4]];
224 s += sq[pix1[ 5] - pix2[ 5]];
225 s += sq[pix1[ 6] - pix2[ 6]];
226 s += sq[pix1[ 7] - pix2[ 7]];
227 s += sq[pix1[ 8] - pix2[ 8]];
228 s += sq[pix1[ 9] - pix2[ 9]];
229 s += sq[pix1[10] - pix2[10]];
230 s += sq[pix1[11] - pix2[11]];
231 s += sq[pix1[12] - pix2[12]];
232 s += sq[pix1[13] - pix2[13]];
233 s += sq[pix1[14] - pix2[14]];
234 s += sq[pix1[15] - pix2[15]];
242 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
246 /* read the pixels */
248 block[0] = pixels[0];
249 block[1] = pixels[1];
250 block[2] = pixels[2];
251 block[3] = pixels[3];
252 block[4] = pixels[4];
253 block[5] = pixels[5];
254 block[6] = pixels[6];
255 block[7] = pixels[7];
261 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
262 const uint8_t *s2, int stride){
265 /* read the pixels */
267 block[0] = s1[0] - s2[0];
268 block[1] = s1[1] - s2[1];
269 block[2] = s1[2] - s2[2];
270 block[3] = s1[3] - s2[3];
271 block[4] = s1[4] - s2[4];
272 block[5] = s1[5] - s2[5];
273 block[6] = s1[6] - s2[6];
274 block[7] = s1[7] - s2[7];
282 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
286 uint8_t *cm = cropTbl + MAX_NEG_CROP;
288 /* read the pixels */
290 pixels[0] = cm[block[0]];
291 pixels[1] = cm[block[1]];
292 pixels[2] = cm[block[2]];
293 pixels[3] = cm[block[3]];
294 pixels[4] = cm[block[4]];
295 pixels[5] = cm[block[5]];
296 pixels[6] = cm[block[6]];
297 pixels[7] = cm[block[7]];
304 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
308 uint8_t *cm = cropTbl + MAX_NEG_CROP;
310 /* read the pixels */
312 pixels[0] = cm[pixels[0] + block[0]];
313 pixels[1] = cm[pixels[1] + block[1]];
314 pixels[2] = cm[pixels[2] + block[2]];
315 pixels[3] = cm[pixels[3] + block[3]];
316 pixels[4] = cm[pixels[4] + block[4]];
317 pixels[5] = cm[pixels[5] + block[5]];
318 pixels[6] = cm[pixels[6] + block[6]];
319 pixels[7] = cm[pixels[7] + block[7]];
/*
 * 64-bit PIXOP2 variant: put/avg pixel primitives that process 8 pixels per
 * iteration with a single uint64_t load (LD64) and one OP store. The
 * 0xFEFE.../0x0303.../0xFCFC.../0x0F0F... masks implement byte-parallel
 * averaging without inter-byte carries: (a&b)+(((a^b)&0xFE..)>>1) rounds
 * down, (a|b)-(((a^b)&0xFE..)>>1) rounds up; the xy2 forms average four
 * neighbours keeping 2 low bits of precision per byte (l0/l1) and the high
 * 6 bits separately (h0/h1), with +0x0202.. (rounding) vs +0x0101.. (no_rnd).
 * NOTE(review): this listing lost many lines during extraction (loop
 * headers, closing braces, some function bodies) — the surviving macro
 * fragments below are kept verbatim.
 */
326 #define PIXOP2(OPNAME, OP) \
327 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
331 OP(*((uint64_t*)block), LD64(pixels));\
337 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
341 const uint64_t a= LD64(pixels );\
342 const uint64_t b= LD64(pixels+1);\
343 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
349 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
353 const uint64_t a= LD64(pixels );\
354 const uint64_t b= LD64(pixels+1);\
355 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
361 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
365 const uint64_t a= LD64(pixels );\
366 const uint64_t b= LD64(pixels+line_size);\
367 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
373 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
377 const uint64_t a= LD64(pixels );\
378 const uint64_t b= LD64(pixels+line_size);\
379 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
385 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
388 const uint64_t a= LD64(pixels );\
389 const uint64_t b= LD64(pixels+1);\
390 uint64_t l0= (a&0x0303030303030303ULL)\
391 + (b&0x0303030303030303ULL)\
392 + 0x0202020202020202ULL;\
393 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
394 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
398 for(i=0; i<h; i+=2){\
399 uint64_t a= LD64(pixels );\
400 uint64_t b= LD64(pixels+1);\
401 l1= (a&0x0303030303030303ULL)\
402 + (b&0x0303030303030303ULL);\
403 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
404 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
405 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
410 l0= (a&0x0303030303030303ULL)\
411 + (b&0x0303030303030303ULL)\
412 + 0x0202020202020202ULL;\
413 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
414 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
415 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
421 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
424 const uint64_t a= LD64(pixels );\
425 const uint64_t b= LD64(pixels+1);\
426 uint64_t l0= (a&0x0303030303030303ULL)\
427 + (b&0x0303030303030303ULL)\
428 + 0x0101010101010101ULL;\
429 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
430 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
434 for(i=0; i<h; i+=2){\
435 uint64_t a= LD64(pixels );\
436 uint64_t b= LD64(pixels+1);\
437 l1= (a&0x0303030303030303ULL)\
438 + (b&0x0303030303030303ULL);\
439 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
440 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
441 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
446 l0= (a&0x0303030303030303ULL)\
447 + (b&0x0303030303030303ULL)\
448 + 0x0101010101010101ULL;\
449 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
450 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
451 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
457 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
458 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
459 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
460 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
461 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
462 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
463 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* byte-parallel round-up average, used as OP by the avg_* functions (64-bit) */
465 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
466 #else // 64 bit variant
/*
 * 32-bit PIXOP2 variant: same put/avg pixel primitives as the 64-bit
 * version above, but processing 4 pixels per uint32_t load (LD32).
 * _l2 helpers average two sources, _l4 helpers average four (used for
 * quarter-pel interpolation); _no_rnd_ forms round down, the plain forms
 * round to nearest. The byte-parallel mask arithmetic is as described for
 * the 64-bit variant, with 32-bit constants.
 * NOTE(review): this listing lost many lines during extraction (loop
 * headers, some l0/l1 continuation lines, closing braces) — the surviving
 * macro fragments below are kept verbatim.
 */
468 #define PIXOP2(OPNAME, OP) \
469 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
472 OP(*((uint32_t*)(block )), LD32(pixels ));\
477 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
480 OP(*((uint32_t*)(block )), LD32(pixels ));\
481 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
486 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
487 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
490 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
491 int src_stride1, int src_stride2, int h){\
495 a= LD32(&src1[i*src_stride1 ]);\
496 b= LD32(&src2[i*src_stride2 ]);\
497 OP(*((uint32_t*)&dst[i*dst_stride ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
498 a= LD32(&src1[i*src_stride1+4]);\
499 b= LD32(&src2[i*src_stride2+4]);\
500 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
504 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
505 int src_stride1, int src_stride2, int h){\
509 a= LD32(&src1[i*src_stride1 ]);\
510 b= LD32(&src2[i*src_stride2 ]);\
511 OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
512 a= LD32(&src1[i*src_stride1+4]);\
513 b= LD32(&src2[i*src_stride2+4]);\
514 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
518 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
519 int src_stride1, int src_stride2, int h){\
523 a= LD32(&src1[i*src_stride1 ]);\
524 b= LD32(&src2[i*src_stride2 ]);\
525 OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
529 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
530 int src_stride1, int src_stride2, int h){\
531 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
532 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
535 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
536 int src_stride1, int src_stride2, int h){\
537 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
538 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
541 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
542 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
545 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
546 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
549 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
550 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
553 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
554 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
557 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
558 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
561 uint32_t a, b, c, d, l0, l1, h0, h1;\
562 a= LD32(&src1[i*src_stride1]);\
563 b= LD32(&src2[i*src_stride2]);\
564 c= LD32(&src3[i*src_stride3]);\
565 d= LD32(&src4[i*src_stride4]);\
566 l0= (a&0x03030303UL)\
569 h0= ((a&0xFCFCFCFCUL)>>2)\
570 + ((b&0xFCFCFCFCUL)>>2);\
571 l1= (c&0x03030303UL)\
573 h1= ((c&0xFCFCFCFCUL)>>2)\
574 + ((d&0xFCFCFCFCUL)>>2);\
575 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
576 a= LD32(&src1[i*src_stride1+4]);\
577 b= LD32(&src2[i*src_stride2+4]);\
578 c= LD32(&src3[i*src_stride3+4]);\
579 d= LD32(&src4[i*src_stride4+4]);\
580 l0= (a&0x03030303UL)\
583 h0= ((a&0xFCFCFCFCUL)>>2)\
584 + ((b&0xFCFCFCFCUL)>>2);\
585 l1= (c&0x03030303UL)\
587 h1= ((c&0xFCFCFCFCUL)>>2)\
588 + ((d&0xFCFCFCFCUL)>>2);\
589 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
592 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
593 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
596 uint32_t a, b, c, d, l0, l1, h0, h1;\
597 a= LD32(&src1[i*src_stride1]);\
598 b= LD32(&src2[i*src_stride2]);\
599 c= LD32(&src3[i*src_stride3]);\
600 d= LD32(&src4[i*src_stride4]);\
601 l0= (a&0x03030303UL)\
604 h0= ((a&0xFCFCFCFCUL)>>2)\
605 + ((b&0xFCFCFCFCUL)>>2);\
606 l1= (c&0x03030303UL)\
608 h1= ((c&0xFCFCFCFCUL)>>2)\
609 + ((d&0xFCFCFCFCUL)>>2);\
610 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
611 a= LD32(&src1[i*src_stride1+4]);\
612 b= LD32(&src2[i*src_stride2+4]);\
613 c= LD32(&src3[i*src_stride3+4]);\
614 d= LD32(&src4[i*src_stride4+4]);\
615 l0= (a&0x03030303UL)\
618 h0= ((a&0xFCFCFCFCUL)>>2)\
619 + ((b&0xFCFCFCFCUL)>>2);\
620 l1= (c&0x03030303UL)\
622 h1= ((c&0xFCFCFCFCUL)>>2)\
623 + ((d&0xFCFCFCFCUL)>>2);\
624 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
627 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
628 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
629 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
630 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
632 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
633 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
634 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
635 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
638 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
643 const uint32_t a= LD32(pixels );\
644 const uint32_t b= LD32(pixels+1);\
645 uint32_t l0= (a&0x03030303UL)\
648 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
649 + ((b&0xFCFCFCFCUL)>>2);\
653 for(i=0; i<h; i+=2){\
654 uint32_t a= LD32(pixels );\
655 uint32_t b= LD32(pixels+1);\
656 l1= (a&0x03030303UL)\
658 h1= ((a&0xFCFCFCFCUL)>>2)\
659 + ((b&0xFCFCFCFCUL)>>2);\
660 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
665 l0= (a&0x03030303UL)\
668 h0= ((a&0xFCFCFCFCUL)>>2)\
669 + ((b&0xFCFCFCFCUL)>>2);\
670 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
674 pixels+=4-line_size*(h+1);\
675 block +=4-line_size*h;\
679 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
684 const uint32_t a= LD32(pixels );\
685 const uint32_t b= LD32(pixels+1);\
686 uint32_t l0= (a&0x03030303UL)\
689 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
690 + ((b&0xFCFCFCFCUL)>>2);\
694 for(i=0; i<h; i+=2){\
695 uint32_t a= LD32(pixels );\
696 uint32_t b= LD32(pixels+1);\
697 l1= (a&0x03030303UL)\
699 h1= ((a&0xFCFCFCFCUL)>>2)\
700 + ((b&0xFCFCFCFCUL)>>2);\
701 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
706 l0= (a&0x03030303UL)\
709 h0= ((a&0xFCFCFCFCUL)>>2)\
710 + ((b&0xFCFCFCFCUL)>>2);\
711 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
715 pixels+=4-line_size*(h+1);\
716 block +=4-line_size*h;\
720 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
721 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
722 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
723 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
724 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
725 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
726 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
727 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
729 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
/* op_put: plain store, no rounding involved */
731 #define op_put(a, b) a = b
/* scalar averages with round-to-nearest, used by the qpel/MC code below */
738 #define avg2(a,b) ((a+b+1)>>1)
739 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/**
 * One-warp-point global motion compensation (MPEG-4 GMC): bilinear
 * interpolation of an 8-pixel-wide stripe with 1/16-pel weights.
 * @param dst,src  destination / source (same stride)
 * @param h        number of rows
 * @param x16,y16  sub-pel position, expected in 0..16 (weights sum to 256)
 * @param rounder  added before the final >>8
 * (restored the row loop and pointer advances lost in extraction)
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/*
 * General (affine) global motion compensation for an 8-pixel-wide stripe.
 * (ox,oy) is the fixed-point source start position; dxx/dxy/dyx/dyy is the
 * per-pixel/per-row transform, 'shift' the sub-pel precision (s = 1<<shift
 * sub-pel units per pixel), 'r' the rounder added before >>(shift*2).
 * Samples falling outside the picture are clamped to the border via clip()
 * (defined elsewhere in the project).
 * NOTE(review): this listing lost several lines during extraction (the y
 * loop header, src_x/src_y/frac derivation, vx/vy updates and closing
 * braces) — the fragments below are kept verbatim.
 */
765 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
766 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
769 const int s= 1<<shift; /* one full pixel in sub-pel units */
779 for(x=0; x<8; x++){ //XXX FIXME optimize
780 int src_x, src_y, frac_x, frac_y, index;
789 if((unsigned)src_x < width){ /* unsigned compare also rejects src_x < 0 */
790 if((unsigned)src_y < height){
791 index= src_x + src_y*stride;
/* fully inside the picture: 2-D bilinear blend of 4 neighbours */
792 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
793 + src[index +1]* frac_x )*(s-frac_y)
794 + ( src[index+stride ]*(s-frac_x)
795 + src[index+stride+1]* frac_x )* frac_y
/* vertically outside: clamp y, interpolate horizontally only */
798 index= src_x + clip(src_y, 0, height)*stride;
799 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
800 + src[index +1]* frac_x )*s
804 if((unsigned)src_y < height){
/* horizontally outside: clamp x, interpolate vertically only */
805 index= clip(src_x, 0, width) + src_y*stride;
806 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
807 + src[index+stride ]* frac_y )*s
/* outside in both directions: nearest border pixel, no interpolation */
810 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
811 dst[y*stride + x]= src[index ];
/*
 * H.264 chroma motion compensation for 2-, 4- and 8-pixel-wide blocks.
 * (x,y) is the 1/8-pel sub-position (asserted to be in 0..7); A..D are the
 * four bilinear weights, which sum to 64 — hence the (+32)>>6 normalization
 * in op_put and the additional round-to-nearest halving in op_avg.
 * NOTE(review): this listing lost lines during extraction (loop headers
 * over h, src/dst row advances, closing braces) — fragments kept verbatim.
 */
822 #define H264_CHROMA_MC(OPNAME, OP)\
823 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
824 const int A=(8-x)*(8-y);\
825 const int B=( x)*(8-y);\
826 const int C=(8-x)*( y);\
827 const int D=( x)*( y);\
830 assert(x<8 && y<8 && x>=0 && y>=0);\
834 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
835 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
841 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
842 const int A=(8-x)*(8-y);\
843 const int B=( x)*(8-y);\
844 const int C=(8-x)*( y);\
845 const int D=( x)*( y);\
848 assert(x<8 && y<8 && x>=0 && y>=0);\
852 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
853 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
854 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
855 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
861 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
862 const int A=(8-x)*(8-y);\
863 const int B=( x)*(8-y);\
864 const int C=(8-x)*( y);\
865 const int D=( x)*( y);\
868 assert(x<8 && y<8 && x>=0 && y>=0);\
872 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
873 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
874 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
875 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
876 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
877 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
878 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
879 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
885 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
886 #define op_put(a, b) a = (((b) + 32)>>6)
888 H264_CHROMA_MC(put_ , op_put)
889 H264_CHROMA_MC(avg_ , op_avg)
/**
 * Copy a 4-byte-wide block of h rows from src to dst.
 * memcpy() replaces the former ST32/LD32 unaligned-word macros: identical
 * result for these non-overlapping temporary buffers, without alignment or
 * strict-aliasing undefined behavior.
 * (restored the row loop and pointer advances lost in extraction)
 */
static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memcpy(dst, src, 4);
        dst += dstStride;
        src += srcStride;
    }
}
/**
 * Copy an 8-byte-wide block of h rows from src to dst.
 * memcpy() replaces the former ST32/LD32 unaligned-word macros: identical
 * result for these non-overlapping temporary buffers, without alignment or
 * strict-aliasing undefined behavior.
 * (restored the row loop and pointer advances lost in extraction)
 */
static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memcpy(dst, src, 8);
        dst += dstStride;
        src += srcStride;
    }
}
/**
 * Copy a 16-byte-wide block of h rows from src to dst.
 * memcpy() replaces the former ST32/LD32 unaligned-word macros: identical
 * result for these non-overlapping temporary buffers, without alignment or
 * strict-aliasing undefined behavior.
 * (restored the row loop and pointer advances lost in extraction)
 */
static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memcpy(dst, src, 16);
        dst += dstStride;
        src += srcStride;
    }
}
/**
 * Copy a 17-byte-wide block of h rows from src to dst (16+1 bytes per row,
 * as needed by the qpel edge handling). memcpy() replaces the former
 * ST32/LD32 word macros plus the trailing single-byte copy: identical
 * result without alignment or strict-aliasing undefined behavior.
 * (restored the row loop, 17th-byte copy and pointer advances lost in extraction)
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memcpy(dst, src, 17);
        dst += dstStride;
        src += srcStride;
    }
}
/**
 * Copy a 9-byte-wide block of h rows from src to dst (8+1 bytes per row,
 * as needed by the qpel edge handling). memcpy() replaces the former
 * ST32/LD32 word macros plus the trailing single-byte copy: identical
 * result without alignment or strict-aliasing undefined behavior.
 * (restored the row loop, 9th-byte copy and pointer advances lost in extraction)
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;

    for (i = 0; i < h; i++) {
        memcpy(dst, src, 9);
        dst += dstStride;
        src += srcStride;
    }
}
959 #define QPEL_MC(r, OPNAME, RND, OP) \
960 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
961 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
965 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
966 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
967 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
968 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
969 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
970 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
971 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
972 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
978 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
980 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
984 const int src0= src[0*srcStride];\
985 const int src1= src[1*srcStride];\
986 const int src2= src[2*srcStride];\
987 const int src3= src[3*srcStride];\
988 const int src4= src[4*srcStride];\
989 const int src5= src[5*srcStride];\
990 const int src6= src[6*srcStride];\
991 const int src7= src[7*srcStride];\
992 const int src8= src[8*srcStride];\
993 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
994 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
995 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
996 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
997 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
998 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
999 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1000 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1006 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1007 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1012 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1013 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1014 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1015 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1016 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1017 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1018 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1019 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1020 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1021 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1022 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1023 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1024 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1025 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1026 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1027 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* MPEG-4 quarter-pel: vertical lowpass over one 17-sample column of a 16-row\
   block, third-order taps (20,-6,3,-1).  Part of the QPEL_MC macro body, so\
   every line is backslash-continued; OPNAME/OP are macro parameters and the\
   per-column loop header is outside this excerpt.  The first/last three rows\
   deliberately reuse edge samples (src0.., ..src16) to mirror past the block\
   boundary — the repeated src16/src15/src14 below are NOT typos.  The cm\
   pointer is consumed by the OP() expansions (see op_put/op_avg defined\
   further down), which round, shift by 5 and clamp to 0..255. */\
1033 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1034 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
/* Pre-load the whole column into int locals so each tap is read once. */\
1039 const int src0= src[0*srcStride];\
1040 const int src1= src[1*srcStride];\
1041 const int src2= src[2*srcStride];\
1042 const int src3= src[3*srcStride];\
1043 const int src4= src[4*srcStride];\
1044 const int src5= src[5*srcStride];\
1045 const int src6= src[6*srcStride];\
1046 const int src7= src[7*srcStride];\
1047 const int src8= src[8*srcStride];\
1048 const int src9= src[9*srcStride];\
1049 const int src10= src[10*srcStride];\
1050 const int src11= src[11*srcStride];\
1051 const int src12= src[12*srcStride];\
1052 const int src13= src[13*srcStride];\
1053 const int src14= src[14*srcStride];\
1054 const int src15= src[15*srcStride];\
1055 const int src16= src[16*srcStride];\
/* Rows 0-2 mirror above the top edge; rows 13-15 mirror below the bottom. */\
1056 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1057 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1058 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1059 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1060 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1061 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1062 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1063 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1064 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1065 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1066 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1067 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1068 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1069 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1070 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1071 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* 8x8 quarter-pel motion-compensation entry points, one per quarter-pel\
   position mcXY (X = horizontal, Y = vertical quarter-pel offset).  Still\
   inside the QPEL_MC macro body, instantiated later for put_/avg_/\
   put_no_rnd_.  Strategy: 'full' is a 16-stride 9-row copy of the source\
   (one extra row/column for the filters), 'half*' buffers hold half-pel\
   filtered planes, and pixels8_l2/_l4 average 2 or 4 planes to reach the\
   quarter-pel position.  The exported ff_*_old_c variants keep the older\
   4-plane-average behaviour; the static *_c variants are the current,\
   cheaper 2-plane formulation. */\
1077 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1078 OPNAME ## pixels8_c(dst, src, stride, 8);\
/* mc10/mc30: average the source (or its right neighbour) with the h-filtered half-pel plane. */\
1081 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1083 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1084 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1087 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1088 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1091 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1093 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1094 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
/* mc01/mc02/mc03: vertical-only positions, filtered out of the copied 'full' strip. */\
1097 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1098 uint8_t full[16*9];\
1100 copy_block9(full, src, 16, stride, 9);\
1101 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1102 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1105 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1106 uint8_t full[16*9];\
1107 copy_block9(full, src, 16, stride, 9);\
1108 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1111 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1112 uint8_t full[16*9];\
1114 copy_block9(full, src, 16, stride, 9);\
1115 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1116 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
/* Diagonal quarter-pel positions.  _old variants: 4-way average of source,\
   H-plane, V-plane and HV-plane.  Current variants: fold the source into\
   halfH first, then average with the HV plane. */\
1118 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1119 uint8_t full[16*9];\
1122 uint8_t halfHV[64];\
1123 copy_block9(full, src, 16, stride, 9);\
1124 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1125 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1126 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1127 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1129 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1130 uint8_t full[16*9];\
1132 uint8_t halfHV[64];\
1133 copy_block9(full, src, 16, stride, 9);\
1134 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1135 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1136 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1137 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1139 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1140 uint8_t full[16*9];\
1143 uint8_t halfHV[64];\
1144 copy_block9(full, src, 16, stride, 9);\
1145 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1146 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1147 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1148 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1150 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1151 uint8_t full[16*9];\
1153 uint8_t halfHV[64];\
1154 copy_block9(full, src, 16, stride, 9);\
1155 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1156 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1157 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1158 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1160 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1161 uint8_t full[16*9];\
1164 uint8_t halfHV[64];\
1165 copy_block9(full, src, 16, stride, 9);\
1166 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1167 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1168 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1169 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1171 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1172 uint8_t full[16*9];\
1174 uint8_t halfHV[64];\
1175 copy_block9(full, src, 16, stride, 9);\
1176 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1177 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1178 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1179 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1181 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1182 uint8_t full[16*9];\
1185 uint8_t halfHV[64];\
1186 copy_block9(full, src, 16, stride, 9);\
1187 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1188 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1189 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1190 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1192 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1193 uint8_t full[16*9];\
1195 uint8_t halfHV[64];\
1196 copy_block9(full, src, 16, stride, 9);\
1197 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1198 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1199 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1200 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
/* mc21/mc23: half-pel H plus quarter-pel V — average H plane with HV plane\
   (offset by one row, +8, for the 3/4 vertical position). */\
1202 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1204 uint8_t halfHV[64];\
1205 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1206 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1207 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1209 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1211 uint8_t halfHV[64];\
1212 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1213 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1214 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1216 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1217 uint8_t full[16*9];\
1220 uint8_t halfHV[64];\
1221 copy_block9(full, src, 16, stride, 9);\
1222 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1223 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1224 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1225 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
/* mc12/mc32: quarter-pel H plus half-pel V — fold source (or right\
   neighbour) into halfH, then run the vertical filter straight to dst. */\
1227 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1228 uint8_t full[16*9];\
1230 copy_block9(full, src, 16, stride, 9);\
1231 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1232 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1233 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1235 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1236 uint8_t full[16*9];\
1239 uint8_t halfHV[64];\
1240 copy_block9(full, src, 16, stride, 9);\
1241 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1242 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1243 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1244 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1246 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1247 uint8_t full[16*9];\
1249 copy_block9(full, src, 16, stride, 9);\
1250 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1251 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1252 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* mc22: pure half-pel/half-pel — H filter then V filter, no averaging. */\
1254 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1256 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1257 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* 16x16 quarter-pel motion-compensation entry points — same mcXY scheme as\
   the 8x8 set above, with sizes scaled up: 'full' is a 24-stride 17-row\
   source copy, halfH is 16x17 (272), halfV/halfHV are 16x16 (256).\
   Generated per OPNAME by the enclosing QPEL_MC macro; ff_*_old_c variants\
   keep the older 4-plane-average behaviour. */\
1259 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1260 OPNAME ## pixels16_c(dst, src, stride, 16);\
1263 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1265 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1266 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1269 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1270 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1273 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1275 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1276 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1279 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1280 uint8_t full[24*17];\
1282 copy_block17(full, src, 24, stride, 17);\
1283 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1284 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1287 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1288 uint8_t full[24*17];\
1289 copy_block17(full, src, 24, stride, 17);\
1290 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1293 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1294 uint8_t full[24*17];\
1296 copy_block17(full, src, 24, stride, 17);\
1297 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1298 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
/* Diagonal positions: _old = 4-plane average, current = 2-plane average\
   after folding the source into halfH. */\
1300 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1301 uint8_t full[24*17];\
1302 uint8_t halfH[272];\
1303 uint8_t halfV[256];\
1304 uint8_t halfHV[256];\
1305 copy_block17(full, src, 24, stride, 17);\
1306 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1307 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1308 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1309 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1311 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1312 uint8_t full[24*17];\
1313 uint8_t halfH[272];\
1314 uint8_t halfHV[256];\
1315 copy_block17(full, src, 24, stride, 17);\
1316 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1317 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1318 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1319 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1321 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1322 uint8_t full[24*17];\
1323 uint8_t halfH[272];\
1324 uint8_t halfV[256];\
1325 uint8_t halfHV[256];\
1326 copy_block17(full, src, 24, stride, 17);\
1327 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1328 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1329 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1330 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1332 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1333 uint8_t full[24*17];\
1334 uint8_t halfH[272];\
1335 uint8_t halfHV[256];\
1336 copy_block17(full, src, 24, stride, 17);\
1337 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1338 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1339 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1340 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1342 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1343 uint8_t full[24*17];\
1344 uint8_t halfH[272];\
1345 uint8_t halfV[256];\
1346 uint8_t halfHV[256];\
1347 copy_block17(full, src, 24, stride, 17);\
1348 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1349 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1350 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1351 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1353 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1354 uint8_t full[24*17];\
1355 uint8_t halfH[272];\
1356 uint8_t halfHV[256];\
1357 copy_block17(full, src, 24, stride, 17);\
1358 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1359 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1360 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1361 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1363 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1364 uint8_t full[24*17];\
1365 uint8_t halfH[272];\
1366 uint8_t halfV[256];\
1367 uint8_t halfHV[256];\
1368 copy_block17(full, src, 24, stride, 17);\
1369 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1370 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1371 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1372 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1374 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1375 uint8_t full[24*17];\
1376 uint8_t halfH[272];\
1377 uint8_t halfHV[256];\
1378 copy_block17(full, src, 24, stride, 17);\
1379 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1380 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1381 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1382 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* mc21/mc23: half-pel H, quarter-pel V (halfH+16 skips one filtered row). */\
1384 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1385 uint8_t halfH[272];\
1386 uint8_t halfHV[256];\
1387 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1388 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1389 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1391 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1392 uint8_t halfH[272];\
1393 uint8_t halfHV[256];\
1394 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1395 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1396 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1398 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1399 uint8_t full[24*17];\
1400 uint8_t halfH[272];\
1401 uint8_t halfV[256];\
1402 uint8_t halfHV[256];\
1403 copy_block17(full, src, 24, stride, 17);\
1404 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1405 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1406 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1407 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
/* mc12/mc32: fold source into halfH, then vertical filter directly to dst. */\
1409 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1410 uint8_t full[24*17];\
1411 uint8_t halfH[272];\
1412 copy_block17(full, src, 24, stride, 17);\
1413 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1414 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1415 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1417 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1418 uint8_t full[24*17];\
1419 uint8_t halfH[272];\
1420 uint8_t halfV[256];\
1421 uint8_t halfHV[256];\
1422 copy_block17(full, src, 24, stride, 17);\
1423 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1424 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1425 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1426 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1428 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1429 uint8_t full[24*17];\
1430 uint8_t halfH[272];\
1431 copy_block17(full, src, 24, stride, 17);\
1432 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1433 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1434 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* mc22: plain half/half — horizontal pass into halfH, vertical pass out. */\
1436 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1437 uint8_t halfH[272];\
1438 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1439 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel write-back operators plugged into QPEL_MC as the OP parameter.
 * 'b' is the raw filter sum scaled by 32: (b+16)>>5 rounds to nearest,
 * (b+15)>>5 is the no-rounding variant required by some bitstream modes;
 * cm[] (cropTbl+MAX_NEG_CROP) clamps the result to 0..255.  op_avg
 * additionally averages with the pixel already in the destination,
 * rounding up. */
1442 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1443 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1444 #define op_put(a, b) a = cm[((b) + 16)>>5]
1445 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the full qpel function families: put_, put_no_rnd_ and avg_.
 * The avg_no_rnd family is intentionally not generated (commented out). */
1447 QPEL_MC(0, put_ , _ , op_put)
1448 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1449 QPEL_MC(0, avg_ , _ , op_avg)
1450 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* The op_* helpers are local to the instantiations above; undef so the
 * H.264 section below can redefine op_avg/op_put. */
1452 #undef op_avg_no_rnd
1454 #undef op_put_no_rnd
/* H.264 luma half-pel interpolation: the standard 6-tap FIR
 * (1,-5,20,20,-5,1) applied horizontally (_h), vertically (_v) or in both
 * directions (_hv), for 4-, 8- and 16-wide blocks.  OP writes a single-pass
 * result (sum scaled by 32); OP2 writes the two-pass _hv result (scaled by
 * 1024) — see op_put/op2_put defined after this macro.  The _hv variants
 * first filter horizontally into the caller-provided int16_t tmp[] plane
 * (h+5 rows, since the vertical pass needs 2 rows above and 3 below), then
 * filter that plane vertically. */
1457 #define H264_LOWPASS(OPNAME, OP, OP2) \
1458 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1460 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1464 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
1465 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
1466 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
1467 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
/* Vertical 4-wide pass: the column (2 above .. 3 below) is pre-loaded. */\
1473 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1475 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1479 const int srcB= src[-2*srcStride];\
1480 const int srcA= src[-1*srcStride];\
1481 const int src0= src[0 *srcStride];\
1482 const int src1= src[1 *srcStride];\
1483 const int src2= src[2 *srcStride];\
1484 const int src3= src[3 *srcStride];\
1485 const int src4= src[4 *srcStride];\
1486 const int src5= src[5 *srcStride];\
1487 const int src6= src[6 *srcStride];\
1488 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1489 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1490 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1491 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
/* 2D pass: horizontal filter into tmp[] (h+5 rows, unclipped int16), then\
 * rewind tmp and filter it vertically with OP2's wider rounding. */\
1497 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1500 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1502 src -= 2*srcStride;\
1503 for(i=0; i<h+5; i++)\
1505 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
1506 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
1507 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
1508 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
1512 tmp -= tmpStride*(h+5-2);\
1515 const int tmpB= tmp[-2*tmpStride];\
1516 const int tmpA= tmp[-1*tmpStride];\
1517 const int tmp0= tmp[0 *tmpStride];\
1518 const int tmp1= tmp[1 *tmpStride];\
1519 const int tmp2= tmp[2 *tmpStride];\
1520 const int tmp3= tmp[3 *tmpStride];\
1521 const int tmp4= tmp[4 *tmpStride];\
1522 const int tmp5= tmp[5 *tmpStride];\
1523 const int tmp6= tmp[6 *tmpStride];\
1524 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1525 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1526 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1527 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
/* 8-wide versions of the same three passes. */\
1533 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1535 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1539 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
1540 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
1541 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
1542 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
1543 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
1544 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
1545 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
1546 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
1552 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1554 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1558 const int srcB= src[-2*srcStride];\
1559 const int srcA= src[-1*srcStride];\
1560 const int src0= src[0 *srcStride];\
1561 const int src1= src[1 *srcStride];\
1562 const int src2= src[2 *srcStride];\
1563 const int src3= src[3 *srcStride];\
1564 const int src4= src[4 *srcStride];\
1565 const int src5= src[5 *srcStride];\
1566 const int src6= src[6 *srcStride];\
1567 const int src7= src[7 *srcStride];\
1568 const int src8= src[8 *srcStride];\
1569 const int src9= src[9 *srcStride];\
1570 const int src10=src[10*srcStride];\
1571 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
1572 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
1573 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
1574 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
1575 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
1576 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
1577 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
1578 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
1584 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1587 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1589 src -= 2*srcStride;\
1590 for(i=0; i<h+5; i++)\
1592 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
1593 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
1594 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
1595 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
1596 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
1597 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
1598 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
1599 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
1603 tmp -= tmpStride*(h+5-2);\
1606 const int tmpB= tmp[-2*tmpStride];\
1607 const int tmpA= tmp[-1*tmpStride];\
1608 const int tmp0= tmp[0 *tmpStride];\
1609 const int tmp1= tmp[1 *tmpStride];\
1610 const int tmp2= tmp[2 *tmpStride];\
1611 const int tmp3= tmp[3 *tmpStride];\
1612 const int tmp4= tmp[4 *tmpStride];\
1613 const int tmp5= tmp[5 *tmpStride];\
1614 const int tmp6= tmp[6 *tmpStride];\
1615 const int tmp7= tmp[7 *tmpStride];\
1616 const int tmp8= tmp[8 *tmpStride];\
1617 const int tmp9= tmp[9 *tmpStride];\
1618 const int tmp10=tmp[10*tmpStride];\
1619 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
1620 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
1621 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
1622 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
1623 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
1624 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
1625 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
1626 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16-wide versions: built by tiling four 8-wide calls (2x2 quadrants). */\
1632 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1633 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1634 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1635 src += 8*srcStride;\
1636 dst += 8*dstStride;\
1637 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
1638 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
1641 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1642 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
1643 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1644 src += 8*srcStride;\
1645 dst += 8*dstStride;\
1646 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
1647 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
1650 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
1651 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
1652 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
1653 src += 8*srcStride;\
1654 tmp += 8*tmpStride;\
1655 dst += 8*dstStride;\
1656 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
1657 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H.264 quarter-pel MC entry points, parameterized by block SIZE (instanti-
 * ated elsewhere for 4/8/16).  mcXY: X,Y are the quarter-pel offsets.
 * Quarter-pel samples are the rounded average (pixels*_l2) of the two
 * nearest integer/half-pel planes, per the H.264 interpolation rules.
 * 'full' holds a (SIZE+5)-row copy of the source starting 2 rows above the
 * block, so the 6-tap vertical filter has its margins; full_mid points at
 * the block origin inside that copy. */
1660 #define H264_MC(OPNAME, SIZE) \
1661 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1662 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
/* Horizontal-only positions: average source (or right neighbour) with the half-pel H plane. */\
1665 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1666 uint8_t half[SIZE*SIZE];\
1667 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
1668 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
1671 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1672 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
1675 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1676 uint8_t half[SIZE*SIZE];\
1677 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
1678 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
/* Vertical-only positions, filtered from the margin-extended 'full' copy. */\
1681 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1682 uint8_t full[SIZE*(SIZE+5)];\
1683 uint8_t * const full_mid= full + SIZE*2;\
1684 uint8_t half[SIZE*SIZE];\
1685 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
1686 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
1687 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
1690 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1691 uint8_t full[SIZE*(SIZE+5)];\
1692 uint8_t * const full_mid= full + SIZE*2;\
1693 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
1694 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
1697 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1698 uint8_t full[SIZE*(SIZE+5)];\
1699 uint8_t * const full_mid= full + SIZE*2;\
1700 uint8_t half[SIZE*SIZE];\
1701 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
1702 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
1703 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
/* Diagonal positions: average the H half-pel plane with the V half-pel\
 * plane; the +stride / +1 offsets select which neighbouring half-pel\
 * planes feed each quarter-pel point. */\
1706 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1707 uint8_t full[SIZE*(SIZE+5)];\
1708 uint8_t * const full_mid= full + SIZE*2;\
1709 uint8_t halfH[SIZE*SIZE];\
1710 uint8_t halfV[SIZE*SIZE];\
1711 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
1712 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
1713 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
1714 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
1717 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1718 uint8_t full[SIZE*(SIZE+5)];\
1719 uint8_t * const full_mid= full + SIZE*2;\
1720 uint8_t halfH[SIZE*SIZE];\
1721 uint8_t halfV[SIZE*SIZE];\
1722 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
1723 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
1724 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
1725 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
1728 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1729 uint8_t full[SIZE*(SIZE+5)];\
1730 uint8_t * const full_mid= full + SIZE*2;\
1731 uint8_t halfH[SIZE*SIZE];\
1732 uint8_t halfV[SIZE*SIZE];\
1733 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
1734 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
1735 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
1736 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
1739 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1740 uint8_t full[SIZE*(SIZE+5)];\
1741 uint8_t * const full_mid= full + SIZE*2;\
1742 uint8_t halfH[SIZE*SIZE];\
1743 uint8_t halfV[SIZE*SIZE];\
1744 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
1745 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
1746 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
1747 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* Centre and half/quarter mixes using the 2D (hv) filter; tmp[] is the\
 * int16_t intermediate plane required by *_hv_lowpass. */\
1750 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1751 int16_t tmp[SIZE*(SIZE+5)];\
1752 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
1755 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1756 int16_t tmp[SIZE*(SIZE+5)];\
1757 uint8_t halfH[SIZE*SIZE];\
1758 uint8_t halfHV[SIZE*SIZE];\
1759 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
1760 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
1761 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
1764 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1765 int16_t tmp[SIZE*(SIZE+5)];\
1766 uint8_t halfH[SIZE*SIZE];\
1767 uint8_t halfHV[SIZE*SIZE];\
1768 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
1769 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
1770 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
1773 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1774 uint8_t full[SIZE*(SIZE+5)];\
1775 uint8_t * const full_mid= full + SIZE*2;\
1776 int16_t tmp[SIZE*(SIZE+5)];\
1777 uint8_t halfV[SIZE*SIZE];\
1778 uint8_t halfHV[SIZE*SIZE];\
1779 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
1780 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
1781 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
1782 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
1785 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1786 uint8_t full[SIZE*(SIZE+5)];\
1787 uint8_t * const full_mid= full + SIZE*2;\
1788 int16_t tmp[SIZE*(SIZE+5)];\
1789 uint8_t halfV[SIZE*SIZE];\
1790 uint8_t halfHV[SIZE*SIZE];\
1791 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
1792 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
1793 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
1794 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* H.264 write-back operators.  OP (op_put/op_avg) consumes single-pass
 * filter sums scaled by 32 → round with +16, shift 5, clamp via cm[].
 * OP2 (op2_put/op2_avg) consumes two-pass (hv) sums scaled by 32*32 →
 * round with +512, shift 10.  The *_avg forms average with the existing
 * destination pixel, rounding up.  The weighted-average op_avg2 is kept
 * only as a commented-out reference. */
1797 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1798 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
1799 #define op_put(a, b) a = cm[((b) + 16)>>5]
1800 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
1801 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put_ and avg_ families of all H.264 qpel lowpass helpers. */
1803 H264_LOWPASS(put_ , op_put, op2_put)
1804 H264_LOWPASS(avg_ , op_avg, op2_avg)
1818 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    /* WMV2 mspel horizontal half-pel filter: 4-tap (-1, 9, 9, -1) with +8
     * rounding and >>4 normalisation, clamped to 0..255 via the crop table.
     * NOTE(review): the per-row loop over 'h' and the dst/src stride
     * advances are elided in this extract; the lines below are one row. */
1819 uint8_t *cm = cropTbl + MAX_NEG_CROP;
1823 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1824 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1825 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1826 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1827 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1828 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1829 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1830 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1836 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    /* Vertical counterpart of wmv2_mspel8_h_lowpass: same 4-tap
     * (-1, 9, 9, -1)/16 filter applied along columns.
     * NOTE(review): the per-column loop over 'w' and the src++/dst++
     * advances are elided in this extract; the lines below are one column. */
1837 uint8_t *cm = cropTbl + MAX_NEG_CROP;
    /* Load the ten vertically adjacent samples needed for eight outputs. */
1841 const int src_1= src[ -srcStride];
1842 const int src0 = src[0 ];
1843 const int src1 = src[ srcStride];
1844 const int src2 = src[2*srcStride];
1845 const int src3 = src[3*srcStride];
1846 const int src4 = src[4*srcStride];
1847 const int src5 = src[5*srcStride];
1848 const int src6 = src[6*srcStride];
1849 const int src7 = src[7*srcStride];
1850 const int src8 = src[8*srcStride];
1851 const int src9 = src[9*srcStride];
1852 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1853 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1854 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1855 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1856 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1857 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1858 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1859 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
1865 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    /* mspel (0,0): integer-pel position, plain 8x8 copy.
     * (closing brace elided in this extract) */
1866 put_pixels8_c(dst, src, stride, 8);
1869 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    /* mspel (1/4, 0): horizontal filter into a temp buffer, then average
     * with the unfiltered source.  NOTE(review): the declaration of the
     * 8x8 'half' buffer and the closing brace are elided in this extract. */
1871 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1872 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
1875 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    /* mspel (1/2, 0): pure horizontal half-pel filter, written directly
     * to dst.  (closing brace elided in this extract) */
1876 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
1879 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    /* mspel (3/4, 0): horizontal filter averaged with src+1 (the pixel to
     * the right).  NOTE(review): the 'half' buffer declaration and closing
     * brace are elided in this extract. */
1881 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1882 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
1885 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    /* mspel (0, 1/2): pure vertical half-pel filter.
     * (closing brace elided in this extract) */
1886 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
1889 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    /* mspel (1/4, 1/2): vertically filter both the raw source and the
     * horizontally filtered plane, then average the two results.
     * NOTE(review): declarations of halfH (8x11), halfV and halfHV (8x8)
     * and the closing brace are elided in this extract. */
1893 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1894 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1895 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1896 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
1898 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    /* mspel (3/4, 1/2): like mc12 but the vertical-only pass starts at
     * src+1 (right neighbour).  NOTE(review): halfH/halfV/halfHV buffer
     * declarations and the closing brace are elided in this extract. */
1902 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1903 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1904 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1905 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
1907 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    /* mspel (1/2, 1/2): horizontal filter (with one extra row above for
     * the vertical taps), then vertical filter straight into dst.
     * NOTE(review): the halfH buffer declaration and closing brace are
     * elided in this extract. */
1909 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1910 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
1914 static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1920 s += abs(pix1[0] - pix2[0]);
1921 s += abs(pix1[1] - pix2[1]);
1922 s += abs(pix1[2] - pix2[2]);
1923 s += abs(pix1[3] - pix2[3]);
1924 s += abs(pix1[4] - pix2[4]);
1925 s += abs(pix1[5] - pix2[5]);
1926 s += abs(pix1[6] - pix2[6]);
1927 s += abs(pix1[7] - pix2[7]);
1928 s += abs(pix1[8] - pix2[8]);
1929 s += abs(pix1[9] - pix2[9]);
1930 s += abs(pix1[10] - pix2[10]);
1931 s += abs(pix1[11] - pix2[11]);
1932 s += abs(pix1[12] - pix2[12]);
1933 s += abs(pix1[13] - pix2[13]);
1934 s += abs(pix1[14] - pix2[14]);
1935 s += abs(pix1[15] - pix2[15]);
1942 static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1948 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1949 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1950 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1951 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1952 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1953 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1954 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1955 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1956 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1957 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1958 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1959 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1960 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1961 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1962 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1963 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1970 static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1973 uint8_t *pix3 = pix2 + line_size;
1977 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1978 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1979 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1980 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1981 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1982 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1983 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1984 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1985 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1986 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1987 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1988 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1989 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1990 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1991 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1992 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2000 static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
2003 uint8_t *pix3 = pix2 + line_size;
2007 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2008 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2009 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2010 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2011 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2012 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2013 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2014 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2015 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2016 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2017 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2018 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2019 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2020 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2021 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2022 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2030 static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
2036 s += abs(pix1[0] - pix2[0]);
2037 s += abs(pix1[1] - pix2[1]);
2038 s += abs(pix1[2] - pix2[2]);
2039 s += abs(pix1[3] - pix2[3]);
2040 s += abs(pix1[4] - pix2[4]);
2041 s += abs(pix1[5] - pix2[5]);
2042 s += abs(pix1[6] - pix2[6]);
2043 s += abs(pix1[7] - pix2[7]);
2050 static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
2056 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2057 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2058 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2059 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2060 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2061 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2062 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2063 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2070 static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
2073 uint8_t *pix3 = pix2 + line_size;
2077 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2078 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2079 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2080 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2081 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2082 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2083 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2084 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2092 static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
2095 uint8_t *pix3 = pix2 + line_size;
2099 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2100 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2101 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2102 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2103 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2104 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2105 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2106 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2114 static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
2115 return pix_abs16x16_c(a,b,stride);
2118 static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
2119 return pix_abs8x8_c(a,b,stride);
2123 * permutes an 8x8 block.
2124 * @param block the block which will be permuted according to the given permutation vector
2125 * @param permutation the permutation vector
2126 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
2127 * @param scantable the scantable in use; this is only used to speed the permutation up, the block is not
2128 * (inverse) permuted to scantable order!
2130 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
    /* Permutes the coefficients of an 8x8 block in place according to
     * 'permutation', touching only the positions reachable through the
     * first last+1 entries of 'scantable'.
     * NOTE(review): the declarations (temp[64], i), the early-return guard,
     * the gather-and-clear statements of the first loop, and the closing
     * braces are elided in this extract. */
2136 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
    /* First pass: copy the affected coefficients out of 'block' (the
     * temp[j]=block[j]; block[j]=0; statements are elided here). */
2138 for(i=0; i<=last; i++){
2139 const int j= scantable[i];
    /* Second pass: scatter them back to their permuted positions. */
2144 for(i=0; i<=last; i++){
2145 const int j= scantable[i];
2146 const int perm_j= permutation[j];
2147 block[perm_j]= temp[j];
2152 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2154 static void clear_blocks_c(DCTELEM *blocks)
    /* Zero a full set of six 64-coefficient blocks (one macroblock's worth).
     * (opening/closing braces elided in this extract) */
2156 memset(blocks, 0, sizeof(DCTELEM)*6*64);
2159 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
2161 for(i=0; i+7<w; i+=8){
2162 dst[i+0] += src[i+0];
2163 dst[i+1] += src[i+1];
2164 dst[i+2] += src[i+2];
2165 dst[i+3] += src[i+3];
2166 dst[i+4] += src[i+4];
2167 dst[i+5] += src[i+5];
2168 dst[i+6] += src[i+6];
2169 dst[i+7] += src[i+7];
2172 dst[i+0] += src[i+0];
2175 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
2177 for(i=0; i+7<w; i+=8){
2178 dst[i+0] = src1[i+0]-src2[i+0];
2179 dst[i+1] = src1[i+1]-src2[i+1];
2180 dst[i+2] = src1[i+2]-src2[i+2];
2181 dst[i+3] = src1[i+3]-src2[i+3];
2182 dst[i+4] = src1[i+4]-src2[i+4];
2183 dst[i+5] = src1[i+5]-src2[i+5];
2184 dst[i+6] = src1[i+6]-src2[i+6];
2185 dst[i+7] = src1[i+7]-src2[i+7];
2188 dst[i+0] = src1[i+0]-src2[i+0];
/* Butterfly helpers for the 8x8 Hadamard transforms below.
 * NOTE(review): the '\'-continued bodies of BUTTERFLY2 and BUTTERFLY1 are
 * elided in this extract (no comments may be inserted between the define
 * line and its continuation).  BUTTERFLYA folds a final butterfly stage
 * directly into the magnitude sum: |x+y| + |x-y|. */
2191 #define BUTTERFLY2(o1,o2,i1,i2) \
2195 #define BUTTERFLY1(x,y) \
2204 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
2206 static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
    /* SATD: 8x8 Hadamard transform of the difference (src - dst), returning
     * the sum of absolute transformed coefficients.  Used as a motion
     * estimation comparison function; the context argument is unused here.
     * NOTE(review): the temp[64]/sum declarations, both 'for(i=...)' loop
     * headers, closing braces and the final return are elided in this
     * extract. */
2212 //FIXME try pointer walks
    /* Horizontal pass: 3-stage butterfly over each row of differences. */
2213 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2214 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2215 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2216 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
2218 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2219 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2220 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2221 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2223 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2224 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2225 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2226 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    /* Vertical pass: butterflies down each column; the last stage is folded
     * into the BUTTERFLYA magnitude accumulation below. */
2230 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2231 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2232 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2233 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2235 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2236 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2237 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2238 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
    /* Accumulate |coefficient| (the 'sum +=' prefix is elided). */
2241 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2242 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2243 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2244 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    /* Debug-only maximum tracking; presumably guarded by #if 0 or similar
     * in the elided lines — TODO confirm. */
2250 printf("MAX:%d\n", maxi);
2256 static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
    /* Like hadamard8_diff_c, but transforms (src - mean) instead of a
     * difference of two blocks: SATD of the mean-removed source.
     * NOTE(review): declarations, loop headers, closing braces and the
     * return are elided in this extract. */
2260 //FIXME OOOPS ignore 0 term instead of mean mess
2262 //FIXME try pointer walks
    /* Horizontal pass over each row of mean-subtracted samples. */
2263 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
2264 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
2265 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
2266 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);
2268 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2269 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2270 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2271 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2273 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2274 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2275 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2276 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    /* Vertical pass; final stage folded into BUTTERFLYA below. */
2280 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2281 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2282 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2283 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2285 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2286 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2287 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2288 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
    /* Accumulate |coefficient| (the 'sum +=' prefix is elided). */
2291 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2292 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2293 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2294 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2300 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    /* DCT-domain SAD comparison function: takes the pixel difference of
     * two 8x8 blocks, forward-transforms it and sums the absolute
     * coefficients.  NOTE(review): the fdct call, the |coeff| summation
     * loop, the return and closing brace are elided in this extract. */
2301 MpegEncContext * const s= (MpegEncContext *)c;
    /* 64 DCTELEMs stored in a uint64_t array to guarantee 8-byte alignment
     * for the (possibly SIMD) fdct. */
2302 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2303 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2306 s->dsp.diff_pixels(temp, src1, src2, stride);
/* Forward declaration so the C IDCT can be called directly below. */
2315 void simple_idct(DCTELEM *block); //FIXME
2317 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    /* Quantization-error comparison function: DCT the 8x8 pixel difference,
     * keep a copy, quantize + dequantize + IDCT it, and return the summed
     * squared difference against the saved copy.
     * NOTE(review): comparing IDCT output against pre-quantization DCT
     * coefficients looks inconsistent and is flagged FIXME in the code —
     * documented as-is, not asserted as intended behavior.
     * NOTE(review): the fdct call, sum declaration, loop header over the
     * 64 coefficients, return and closing brace are elided here. */
2318 MpegEncContext * const s= (MpegEncContext *)c;
    /* Room for two 64-coefficient blocks, 8-byte aligned via uint64_t. */
2319 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
2320 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2321 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
2326 s->dsp.diff_pixels(temp, src1, src2, stride);
    /* Snapshot of the coefficients before quantization. */
2328 memcpy(bak, temp, 64*sizeof(DCTELEM));
2330 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2331 s->dct_unquantize(s, temp, 0, s->qscale);
2332 simple_idct(temp); //FIXME
    /* Accumulate squared error per coefficient. */
2335 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
2340 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    /* Rate-distortion comparison function: encodes the 8x8 difference
     * block (DCT + quantize, counting the VLC bits needed), reconstructs
     * it (dequantize + idct_add onto a saved copy of src2) and returns
     * distortion + lambda-weighted bit cost.
     * NOTE(review): many lines are elided in this extract — the fdct call,
     * bits/run initialisation, the start_i/intra branch structure, the
     * run-length bookkeeping inside the coefficient loop, escape-cost
     * additions, loop headers and closing braces. */
2341 MpegEncContext * const s= (MpegEncContext *)c;
2342 const uint8_t *scantable= s->intra_scantable.permutated;
    /* 8-byte-aligned scratch: 64 DCTELEMs, plus a VLA used as the backup
     * of the 8 reference rows (sized in uint64_t units by 'stride'). */
2343 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2344 uint64_t __align8 aligned_bak[stride];
2345 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2346 uint8_t * const bak= (uint8_t*)aligned_bak;
    /* (sic) 'distoration' is a historical misspelling of 'distortion';
     * kept because it is an identifier, not a comment. */
2347 int i, last, run, bits, level, distoration, start_i;
2348 const int esc_length= s->ac_esc_length;
2350 uint8_t * last_length;
    /* Back up the 8x8 reference area (two 32-bit words per row) so it can
     * be reconstructed onto and compared without touching src2. */
2353 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
2354 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
2357 s->dsp.diff_pixels(temp, src1, src2, stride);
2359 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    /* Intra: DC coded separately with its own VLC length table. */
2365 length = s->intra_ac_vlc_length;
2366 last_length= s->intra_ac_vlc_last_length;
2367 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    /* Inter: AC tables only. */
2370 length = s->inter_ac_vlc_length;
2371 last_length= s->inter_ac_vlc_last_length;
    /* Count bits for all non-final coefficients in scan order. */
2376 for(i=start_i; i<last; i++){
2377 int j= scantable[i];
    /* level is biased by +64 above (elided); (level & ~127) == 0 means it
     * fits the VLC length table, otherwise escape coding is charged. */
2382 if((level&(~127)) == 0){
2383 bits+= length[UNI_AC_ENC_INDEX(run, level)];
    /* The final (last) coefficient uses the 'last' VLC length table. */
2392 level= temp[i] + 64;
2396 if((level&(~127)) == 0){
2397 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
    /* Reconstruct: dequantize and add the IDCT result onto the backup. */
2404 s->dct_unquantize(s, temp, 0, s->qscale);
2407 s->dsp.idct_add(bak, stride, temp);
    /* 8x8 SSE between reconstruction and the original source block. */
2409 distoration= s->dsp.sse[1](NULL, bak, src1, stride);
    /* Lagrangian combination; 109/128 approximates the ln(2)-based rate
     * scale factor — TODO confirm the constant's derivation. */
2411 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
2414 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
    /* Rate-only comparison function: like rd8x8_c but returns just the
     * estimated VLC bit count for coding the DCT of the 8x8 difference
     * block (no reconstruction, no distortion term).
     * NOTE(review): the fdct call, bits/run initialisation, intra/inter
     * branch structure, run bookkeeping, escape-cost additions, the final
     * 'return bits' and closing braces are elided in this extract. */
2415 MpegEncContext * const s= (MpegEncContext *)c;
2416 const uint8_t *scantable= s->intra_scantable.permutated;
    /* 64 DCTELEMs with guaranteed 8-byte alignment. */
2417 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
2418 DCTELEM * const temp= (DCTELEM*)aligned_temp;
2419 int i, last, run, bits, level, start_i;
2420 const int esc_length= s->ac_esc_length;
2422 uint8_t * last_length;
2424 s->dsp.diff_pixels(temp, src1, src2, stride);
2426 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
    /* Intra: DC coefficient charged via its own VLC length table. */
2432 length = s->intra_ac_vlc_length;
2433 last_length= s->intra_ac_vlc_last_length;
2434 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
    /* Inter: AC tables only. */
2437 length = s->inter_ac_vlc_length;
2438 last_length= s->inter_ac_vlc_last_length;
    /* Charge bits for every non-final coefficient in scan order. */
2443 for(i=start_i; i<last; i++){
2444 int j= scantable[i];
    /* biased level fits the table iff (level & ~127) == 0; otherwise the
     * escape length is charged (elided). */
2449 if((level&(~127)) == 0){
2450 bits+= length[UNI_AC_ENC_INDEX(run, level)];
    /* Final coefficient uses the 'last' VLC length table. */
2459 level= temp[i] + 64;
2463 if((level&(~127)) == 0){
2464 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Generate 16x16 comparison functions from the 8x8 ones by applying the
 * 8x8 function to the four quadrants and summing (WARPER88_1616 macro,
 * defined elsewhere in this file). */
2473 WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
2474 WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
2475 WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
2476 WARPER88_1616(rd8x8_c, rd16x16_c)
2477 WARPER88_1616(bit8x8_c, bit16x16_c)
2479 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2481 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
    /* JPEG-reference IDCT followed by a clamped store into dest.
     * NOTE(review): the IDCT call itself (presumably j_rev_dct(block)) and
     * the braces are elided in this extract. */
2484 put_pixels_clamped_c(block, dest, line_size);
2486 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
    /* JPEG-reference IDCT followed by a clamped add onto dest.
     * NOTE(review): the IDCT call and braces are elided in this extract. */
2489 add_pixels_clamped_c(block, dest, line_size);
2492 /* init static data */
2493 void dsputil_static_init(void)
    /* Build the process-wide lookup tables used throughout this file.
     * NOTE(review): the 'int i;' declaration, the underflow-guard store
     * (cropTbl entries below MAX_NEG_CROP) and closing braces are elided
     * in this extract. */
    /* Crop table: identity on 0..255, with guard bands on both sides so
     * out-of-range filter results clamp instead of indexing out of bounds
     * (the low-side zero fill is on an elided line). */
2497 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
2498 for(i=0;i<MAX_NEG_CROP;i++) {
2500 cropTbl[i + MAX_NEG_CROP + 256] = 255;
    /* squareTbl[i] = (i-256)^2, i.e. squared difference for biased bytes. */
2503 for(i=0;i<512;i++) {
2504 squareTbl[i] = (i - 256) * (i - 256);
    /* Inverse zigzag (+1 bias) used by the MMX quantizer. */
2507 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2511 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
    /* Fill the DSPContext function-pointer tables with the portable C
     * implementations, then let the per-architecture init calls near the
     * end override individual entries with optimized versions, and finally
     * build the IDCT coefficient-permutation table.
     * NOTE(review): this extract elides many lines of the function —
     * declarations, 'else', several #ifdef/#elif guards, 'break's, loop
     * headers and closing braces; comments below mark the surviving
     * sections. */
    /* --- forward DCT selection (encoders only) --- */
2515 #ifdef CONFIG_ENCODERS
2516 if(avctx->dct_algo==FF_DCT_FASTINT)
2517 c->fdct = fdct_ifast;
2519 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2520 #endif //CONFIG_ENCODERS
    /* --- inverse DCT selection and its permutation type --- */
2522 if(avctx->idct_algo==FF_IDCT_INT){
2523 c->idct_put= ff_jref_idct_put;
2524 c->idct_add= ff_jref_idct_add;
2525 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2526 }else{ //accurate/default
2527 c->idct_put= simple_idct_put;
2528 c->idct_add= simple_idct_add;
2529 c->idct_permutation_type= FF_NO_IDCT_PERM;
    /* --- basic block/pixel helpers --- */
2532 c->get_pixels = get_pixels_c;
2533 c->diff_pixels = diff_pixels_c;
2534 c->put_pixels_clamped = put_pixels_clamped_c;
2535 c->add_pixels_clamped = add_pixels_clamped_c;
2538 c->clear_blocks = clear_blocks_c;
2539 c->pix_sum = pix_sum_c;
2540 c->pix_norm1 = pix_norm1_c;
    /* --- SAD-style pixel comparison functions --- */
2544 /* TODO [0] 16 [1] 8 */
2545 c->pix_abs16x16 = pix_abs16x16_c;
2546 c->pix_abs16x16_x2 = pix_abs16x16_x2_c;
2547 c->pix_abs16x16_y2 = pix_abs16x16_y2_c;
2548 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
2549 c->pix_abs8x8 = pix_abs8x8_c;
2550 c->pix_abs8x8_x2 = pix_abs8x8_x2_c;
2551 c->pix_abs8x8_y2 = pix_abs8x8_y2_c;
2552 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
    /* Helper macro: fills one half-pel pixel-op table row
     * ([0]=full-pel, [1]=x2, [2]=y2, [3]=xy2).
     * (no comments may be inserted inside the '\'-continued definition) */
2554 #define dspfunc(PFX, IDX, NUM) \
2555 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
2556 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
2557 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
2558 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
    /* Half-pel tables: index 0 is 16x16, index 1 is 8x8. */
2560 dspfunc(put, 0, 16);
2561 dspfunc(put_no_rnd, 0, 16);
2563 dspfunc(put_no_rnd, 1, 8);
2565 dspfunc(avg, 0, 16);
2566 dspfunc(avg_no_rnd, 0, 16);
2568 dspfunc(avg_no_rnd, 1, 8);
    /* Redefined helper: fills one 16-entry quarter-pel table
     * (entries mcXY for X,Y in 0..3).
     * (no comments may be inserted inside the '\'-continued definition) */
2571 #define dspfunc(PFX, IDX, NUM) \
2572 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2573 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2574 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2575 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2576 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2577 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2578 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2579 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2580 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2581 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2582 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2583 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2584 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2585 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2586 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2587 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
    /* Quarter-pel (MPEG-4 style) motion compensation tables. */
2589 dspfunc(put_qpel, 0, 16);
2590 dspfunc(put_no_rnd_qpel, 0, 16);
2592 dspfunc(avg_qpel, 0, 16);
2593 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2595 dspfunc(put_qpel, 1, 8);
2596 dspfunc(put_no_rnd_qpel, 1, 8);
2598 dspfunc(avg_qpel, 1, 8);
2599 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
    /* H.264 quarter-pel luma tables: 16x16, 8x8 and 4x4. */
2601 dspfunc(put_h264_qpel, 0, 16);
2602 dspfunc(put_h264_qpel, 1, 8);
2603 dspfunc(put_h264_qpel, 2, 4);
2604 dspfunc(avg_h264_qpel, 0, 16);
2605 dspfunc(avg_h264_qpel, 1, 8);
2606 dspfunc(avg_h264_qpel, 2, 4);
    /* H.264 chroma motion compensation: 8-, 4- and 2-wide variants. */
2609 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
2610 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
2611 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
2612 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
2613 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
2614 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    /* WMV2 mspel motion compensation table. */
2616 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
2617 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2618 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2619 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2620 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2621 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2622 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2623 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
    /* Block comparison metrics ([0]=16x16, [1]=8x8 throughout). */
2625 c->hadamard8_diff[0]= hadamard8_diff16_c;
2626 c->hadamard8_diff[1]= hadamard8_diff_c;
2627 c->hadamard8_abs = hadamard8_abs_c;
2629 c->dct_sad[0]= dct_sad16x16_c;
2630 c->dct_sad[1]= dct_sad8x8_c;
2632 c->sad[0]= sad16x16_c;
2633 c->sad[1]= sad8x8_c;
2635 c->quant_psnr[0]= quant_psnr16x16_c;
2636 c->quant_psnr[1]= quant_psnr8x8_c;
2638 c->rd[0]= rd16x16_c;
2641 c->bit[0]= bit16x16_c;
2642 c->bit[1]= bit8x8_c;
2644 c->add_bytes= add_bytes_c;
2645 c->diff_bytes= diff_bytes_c;
    /* Architecture-specific overrides; the surrounding #ifdef HAVE_*
     * guards are elided in this extract. */
2648 dsputil_init_mmx(c, avctx);
2651 dsputil_init_armv4l(c, avctx);
2654 dsputil_init_mlib(c, avctx);
2657 dsputil_init_alpha(c, avctx);
2660 dsputil_init_ppc(c, avctx);
2663 dsputil_init_mmi(c, avctx);
    /* Build the 64-entry coefficient permutation matching the selected
     * IDCT; 'case' bodies' loop headers and 'break's are elided. */
2666 switch(c->idct_permutation_type){
2667 case FF_NO_IDCT_PERM:
2669 c->idct_permutation[i]= i;
2671 case FF_LIBMPEG2_IDCT_PERM:
2673 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
2675 case FF_SIMPLE_IDCT_PERM:
2677 c->idct_permutation[i]= simple_mmx_permutation[i];
2679 case FF_TRANSPOSE_IDCT_PERM:
2681 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
2684 fprintf(stderr, "Internal error, IDCT permutation not set\n");