3 * Copyright (c) 2000, 2001 Fabrice Bellard.
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Lesser General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Lesser General Public License for more details.
15 * You should have received a copy of the GNU Lesser General Public
16 * License along with this library; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
29 #include "mpegvideo.h"
30 #include "simple_idct.h"
/* Global lookup tables shared by the DSP helpers below.
 * cropTbl: indexed as cropTbl[MAX_NEG_CROP + x]; clamps x into 0..255.
 * squareTbl: indexed as (squareTbl + 256)[d]; holds d*d for -256 <= d < 256,
 * used by the SSE/norm functions further down.
 * NOTE(review): both tables appear to be filled in by an init routine outside
 * this chunk -- confirm they are initialized before first use. */
33 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
34 uint32_t squareTbl[512];
/* Classic zig-zag scan order for 8x8 DCT coefficients (JPEG/MPEG):
 * maps scan position -> raster index within the block. */
36 const uint8_t ff_zigzag_direct[64] = {
37 0, 1, 8, 16, 9, 2, 3, 10,
38 17, 24, 32, 25, 18, 11, 4, 5,
39 12, 19, 26, 33, 40, 48, 41, 34,
40 27, 20, 13, 6, 7, 14, 21, 28,
41 35, 42, 49, 56, 57, 50, 43, 36,
42 29, 22, 15, 23, 30, 37, 44, 51,
43 58, 59, 52, 45, 38, 31, 39, 46,
44 53, 60, 61, 54, 47, 55, 62, 63
47 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): presumably filled in by the dsputil init code (not visible
 * in this chunk); the __align8 attribute suggests it is consumed by MMX
 * assembly -- confirm alignment requirements before moving it. */
48 uint16_t __align8 inv_zigzag_direct16[64];
/* Alternate horizontal scan order (scan position -> raster index),
 * used for horizontally-dominant coefficient distributions. */
50 const uint8_t ff_alternate_horizontal_scan[64] = {
51 0, 1, 2, 3, 8, 9, 16, 17,
52 10, 11, 4, 5, 6, 7, 15, 14,
53 13, 12, 19, 18, 24, 25, 32, 33,
54 26, 27, 20, 21, 22, 23, 28, 29,
55 30, 31, 34, 35, 40, 41, 48, 49,
56 42, 43, 36, 37, 38, 39, 44, 45,
57 46, 47, 50, 51, 56, 57, 58, 59,
58 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order (scan position -> raster index),
 * the vertical counterpart of the table above (e.g. interlaced content). */
61 const uint8_t ff_alternate_vertical_scan[64] = {
62 0, 8, 16, 24, 1, 9, 2, 10,
63 17, 25, 32, 40, 48, 56, 57, 49,
64 41, 33, 26, 18, 3, 11, 4, 12,
65 19, 27, 34, 42, 50, 58, 35, 43,
66 51, 59, 20, 28, 5, 13, 6, 14,
67 21, 29, 36, 44, 52, 60, 37, 45,
68 53, 61, 22, 30, 7, 15, 23, 31,
69 38, 46, 54, 62, 39, 47, 55, 63,
/* Fixed-point reciprocal table: inverse[b] ~= 2^32 / b, rounded so the
 * identity stated below holds exactly for the given ranges.  Lets hot paths
 * replace an integer division by b with a multiply and a >>32. */
72 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
73 const uint32_t inverse[256]={
74 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
75 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
76 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
77 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
78 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
79 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
80 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
81 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
82 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
83 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
84 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
85 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
86 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
87 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
88 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
89 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
90 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
91 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
92 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
93 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
94 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
95 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
96 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
97 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
98 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
99 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
100 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
101 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
102 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
103 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
104 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
105 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
/* Coefficient reordering applied to quantizer tables/blocks so that the
 * MMX simple IDCT can read its input in its preferred layout; each entry
 * is the source index (0x00..0x3F) for the corresponding destination slot.
 * NOTE(review): the exact layout is dictated by simple_idct_mmx -- see
 * simple_idct.h rather than reasoning from this table alone. */
108 /* Input permutation for the simple_idct_mmx */
109 static const uint8_t simple_mmx_permutation[64]={
110 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
111 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
112 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
113 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
114 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
115 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
116 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
117 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/**
 * Sum of all 256 pixel values of a 16x16 block.
 * @param pix       pointer to the top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return the sum of the 256 bytes (max 255*256, fits easily in an int)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        /* two unrolled groups of 8 pixels per row */
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16; /* step to the start of the next row */
    }
    return s;
}
142 static int pix_norm1_c(uint8_t * pix, int line_size)
145 uint32_t *sq = squareTbl + 256;
148 for (i = 0; i < 16; i++) {
149 for (j = 0; j < 16; j += 8) {
160 #if LONG_MAX > 2147483647
161 register uint64_t x=*(uint64_t*)pix;
163 s += sq[(x>>8)&0xff];
164 s += sq[(x>>16)&0xff];
165 s += sq[(x>>24)&0xff];
166 s += sq[(x>>32)&0xff];
167 s += sq[(x>>40)&0xff];
168 s += sq[(x>>48)&0xff];
169 s += sq[(x>>56)&0xff];
171 register uint32_t x=*(uint32_t*)pix;
173 s += sq[(x>>8)&0xff];
174 s += sq[(x>>16)&0xff];
175 s += sq[(x>>24)&0xff];
176 x=*(uint32_t*)(pix+4);
178 s += sq[(x>>8)&0xff];
179 s += sq[(x>>16)&0xff];
180 s += sq[(x>>24)&0xff];
185 pix += line_size - 16;
191 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size)
194 uint32_t *sq = squareTbl + 256;
197 for (i = 0; i < 8; i++) {
198 s += sq[pix1[0] - pix2[0]];
199 s += sq[pix1[1] - pix2[1]];
200 s += sq[pix1[2] - pix2[2]];
201 s += sq[pix1[3] - pix2[3]];
202 s += sq[pix1[4] - pix2[4]];
203 s += sq[pix1[5] - pix2[5]];
204 s += sq[pix1[6] - pix2[6]];
205 s += sq[pix1[7] - pix2[7]];
212 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
215 uint32_t *sq = squareTbl + 256;
218 for (i = 0; i < 16; i++) {
219 s += sq[pix1[ 0] - pix2[ 0]];
220 s += sq[pix1[ 1] - pix2[ 1]];
221 s += sq[pix1[ 2] - pix2[ 2]];
222 s += sq[pix1[ 3] - pix2[ 3]];
223 s += sq[pix1[ 4] - pix2[ 4]];
224 s += sq[pix1[ 5] - pix2[ 5]];
225 s += sq[pix1[ 6] - pix2[ 6]];
226 s += sq[pix1[ 7] - pix2[ 7]];
227 s += sq[pix1[ 8] - pix2[ 8]];
228 s += sq[pix1[ 9] - pix2[ 9]];
229 s += sq[pix1[10] - pix2[10]];
230 s += sq[pix1[11] - pix2[11]];
231 s += sq[pix1[12] - pix2[12]];
232 s += sq[pix1[13] - pix2[13]];
233 s += sq[pix1[14] - pix2[14]];
234 s += sq[pix1[15] - pix2[15]];
242 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
246 /* read the pixels */
248 block[0] = pixels[0];
249 block[1] = pixels[1];
250 block[2] = pixels[2];
251 block[3] = pixels[3];
252 block[4] = pixels[4];
253 block[5] = pixels[5];
254 block[6] = pixels[6];
255 block[7] = pixels[7];
261 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
262 const uint8_t *s2, int stride){
265 /* read the pixels */
267 block[0] = s1[0] - s2[0];
268 block[1] = s1[1] - s2[1];
269 block[2] = s1[2] - s2[2];
270 block[3] = s1[3] - s2[3];
271 block[4] = s1[4] - s2[4];
272 block[5] = s1[5] - s2[5];
273 block[6] = s1[6] - s2[6];
274 block[7] = s1[7] - s2[7];
282 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
286 uint8_t *cm = cropTbl + MAX_NEG_CROP;
288 /* read the pixels */
290 pixels[0] = cm[block[0]];
291 pixels[1] = cm[block[1]];
292 pixels[2] = cm[block[2]];
293 pixels[3] = cm[block[3]];
294 pixels[4] = cm[block[4]];
295 pixels[5] = cm[block[5]];
296 pixels[6] = cm[block[6]];
297 pixels[7] = cm[block[7]];
304 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
308 uint8_t *cm = cropTbl + MAX_NEG_CROP;
310 /* read the pixels */
312 pixels[0] = cm[pixels[0] + block[0]];
313 pixels[1] = cm[pixels[1] + block[1]];
314 pixels[2] = cm[pixels[2] + block[2]];
315 pixels[3] = cm[pixels[3] + block[3]];
316 pixels[4] = cm[pixels[4] + block[4]];
317 pixels[5] = cm[pixels[5] + block[5]];
318 pixels[6] = cm[pixels[6] + block[6]];
319 pixels[7] = cm[pixels[7] + block[7]];
/* 64-bit variant of the pixel-op generator macro: each instantiation
 * (with OPNAME/OP = put or avg, see op_put/op_avg below) emits the full
 * family of 8/16-wide copy and half-pel averaging primitives, operating
 * on whole 8-byte words via LD64.  The 0xFEFE.../0x0303.../0xFCFC...
 * masks implement packed per-byte averaging without unpacking.
 * NOTE(review): left byte-identical -- interior lines of this macro are
 * not visible in this chunk, so no restyle is attempted. */
326 #define PIXOP2(OPNAME, OP) \
327 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
331 OP(*((uint64_t*)block), LD64(pixels));\
337 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
341 const uint64_t a= LD64(pixels );\
342 const uint64_t b= LD64(pixels+1);\
343 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
349 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
353 const uint64_t a= LD64(pixels );\
354 const uint64_t b= LD64(pixels+1);\
355 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
361 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
365 const uint64_t a= LD64(pixels );\
366 const uint64_t b= LD64(pixels+line_size);\
367 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
373 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
377 const uint64_t a= LD64(pixels );\
378 const uint64_t b= LD64(pixels+line_size);\
379 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
385 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
388 const uint64_t a= LD64(pixels );\
389 const uint64_t b= LD64(pixels+1);\
390 uint64_t l0= (a&0x0303030303030303ULL)\
391 + (b&0x0303030303030303ULL)\
392 + 0x0202020202020202ULL;\
393 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
394 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
398 for(i=0; i<h; i+=2){\
399 uint64_t a= LD64(pixels );\
400 uint64_t b= LD64(pixels+1);\
401 l1= (a&0x0303030303030303ULL)\
402 + (b&0x0303030303030303ULL);\
403 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
404 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
405 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
410 l0= (a&0x0303030303030303ULL)\
411 + (b&0x0303030303030303ULL)\
412 + 0x0202020202020202ULL;\
413 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
414 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
415 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
421 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
424 const uint64_t a= LD64(pixels );\
425 const uint64_t b= LD64(pixels+1);\
426 uint64_t l0= (a&0x0303030303030303ULL)\
427 + (b&0x0303030303030303ULL)\
428 + 0x0101010101010101ULL;\
429 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
430 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
434 for(i=0; i<h; i+=2){\
435 uint64_t a= LD64(pixels );\
436 uint64_t b= LD64(pixels+1);\
437 l1= (a&0x0303030303030303ULL)\
438 + (b&0x0303030303030303ULL);\
439 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
440 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
441 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
446 l0= (a&0x0303030303030303ULL)\
447 + (b&0x0303030303030303ULL)\
448 + 0x0101010101010101ULL;\
449 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
450 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
451 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
457 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
458 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
459 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
460 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
461 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
462 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
463 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* Packed-byte rounded average: per-byte (a+b+1)>>1 without unpacking. */
465 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
466 #else // 64 bit variant
/* 32-bit variant of the pixel-op generator macro: same family of copy /
 * half-pel averaging primitives as the 64-bit branch above, but built
 * from pairs of 4-byte LD32 loads.  The _l2/_l4 helpers average 2 or 4
 * source rows/blocks; _x2/_y2/_xy2 are the MPEG half-pel positions.
 * NOTE(review): left byte-identical -- interior lines of this macro are
 * not visible in this chunk, so no restyle is attempted. */
468 #define PIXOP2(OPNAME, OP) \
469 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
472 OP(*((uint32_t*)(block )), LD32(pixels ));\
473 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
478 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
479 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
482 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
483 int src_stride1, int src_stride2, int h){\
487 a= LD32(&src1[i*src_stride1 ]);\
488 b= LD32(&src2[i*src_stride2 ]);\
489 OP(*((uint32_t*)&dst[i*dst_stride ]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
490 a= LD32(&src1[i*src_stride1+4]);\
491 b= LD32(&src2[i*src_stride2+4]);\
492 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a&b) + (((a^b)&0xFEFEFEFEUL)>>1));\
496 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
497 int src_stride1, int src_stride2, int h){\
501 a= LD32(&src1[i*src_stride1 ]);\
502 b= LD32(&src2[i*src_stride2 ]);\
503 OP(*((uint32_t*)&dst[i*dst_stride ]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
504 a= LD32(&src1[i*src_stride1+4]);\
505 b= LD32(&src2[i*src_stride2+4]);\
506 OP(*((uint32_t*)&dst[i*dst_stride+4]), (a|b) - (((a^b)&0xFEFEFEFEUL)>>1));\
510 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
511 int src_stride1, int src_stride2, int h){\
512 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
513 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
516 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
517 int src_stride1, int src_stride2, int h){\
518 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
519 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
522 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
523 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
526 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
527 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
530 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
531 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
534 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
535 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
538 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
539 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
542 uint32_t a, b, c, d, l0, l1, h0, h1;\
543 a= LD32(&src1[i*src_stride1]);\
544 b= LD32(&src2[i*src_stride2]);\
545 c= LD32(&src3[i*src_stride3]);\
546 d= LD32(&src4[i*src_stride4]);\
547 l0= (a&0x03030303UL)\
550 h0= ((a&0xFCFCFCFCUL)>>2)\
551 + ((b&0xFCFCFCFCUL)>>2);\
552 l1= (c&0x03030303UL)\
554 h1= ((c&0xFCFCFCFCUL)>>2)\
555 + ((d&0xFCFCFCFCUL)>>2);\
556 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
557 a= LD32(&src1[i*src_stride1+4]);\
558 b= LD32(&src2[i*src_stride2+4]);\
559 c= LD32(&src3[i*src_stride3+4]);\
560 d= LD32(&src4[i*src_stride4+4]);\
561 l0= (a&0x03030303UL)\
564 h0= ((a&0xFCFCFCFCUL)>>2)\
565 + ((b&0xFCFCFCFCUL)>>2);\
566 l1= (c&0x03030303UL)\
568 h1= ((c&0xFCFCFCFCUL)>>2)\
569 + ((d&0xFCFCFCFCUL)>>2);\
570 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
573 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
574 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
577 uint32_t a, b, c, d, l0, l1, h0, h1;\
578 a= LD32(&src1[i*src_stride1]);\
579 b= LD32(&src2[i*src_stride2]);\
580 c= LD32(&src3[i*src_stride3]);\
581 d= LD32(&src4[i*src_stride4]);\
582 l0= (a&0x03030303UL)\
585 h0= ((a&0xFCFCFCFCUL)>>2)\
586 + ((b&0xFCFCFCFCUL)>>2);\
587 l1= (c&0x03030303UL)\
589 h1= ((c&0xFCFCFCFCUL)>>2)\
590 + ((d&0xFCFCFCFCUL)>>2);\
591 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
592 a= LD32(&src1[i*src_stride1+4]);\
593 b= LD32(&src2[i*src_stride2+4]);\
594 c= LD32(&src3[i*src_stride3+4]);\
595 d= LD32(&src4[i*src_stride4+4]);\
596 l0= (a&0x03030303UL)\
599 h0= ((a&0xFCFCFCFCUL)>>2)\
600 + ((b&0xFCFCFCFCUL)>>2);\
601 l1= (c&0x03030303UL)\
603 h1= ((c&0xFCFCFCFCUL)>>2)\
604 + ((d&0xFCFCFCFCUL)>>2);\
605 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
608 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
609 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
610 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
611 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
613 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
614 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
615 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
616 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
619 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
624 const uint32_t a= LD32(pixels );\
625 const uint32_t b= LD32(pixels+1);\
626 uint32_t l0= (a&0x03030303UL)\
629 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
630 + ((b&0xFCFCFCFCUL)>>2);\
634 for(i=0; i<h; i+=2){\
635 uint32_t a= LD32(pixels );\
636 uint32_t b= LD32(pixels+1);\
637 l1= (a&0x03030303UL)\
639 h1= ((a&0xFCFCFCFCUL)>>2)\
640 + ((b&0xFCFCFCFCUL)>>2);\
641 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
646 l0= (a&0x03030303UL)\
649 h0= ((a&0xFCFCFCFCUL)>>2)\
650 + ((b&0xFCFCFCFCUL)>>2);\
651 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
655 pixels+=4-line_size*(h+1);\
656 block +=4-line_size*h;\
660 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
665 const uint32_t a= LD32(pixels );\
666 const uint32_t b= LD32(pixels+1);\
667 uint32_t l0= (a&0x03030303UL)\
670 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
671 + ((b&0xFCFCFCFCUL)>>2);\
675 for(i=0; i<h; i+=2){\
676 uint32_t a= LD32(pixels );\
677 uint32_t b= LD32(pixels+1);\
678 l1= (a&0x03030303UL)\
680 h1= ((a&0xFCFCFCFCUL)>>2)\
681 + ((b&0xFCFCFCFCUL)>>2);\
682 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
687 l0= (a&0x03030303UL)\
690 h0= ((a&0xFCFCFCFCUL)>>2)\
691 + ((b&0xFCFCFCFCUL)>>2);\
692 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
696 pixels+=4-line_size*(h+1);\
697 block +=4-line_size*h;\
701 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
702 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
703 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
704 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
705 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
706 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
707 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
708 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* Packed-byte rounded average for 32-bit words: per-byte (a+b+1)>>1. */
710 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
712 #define op_put(a, b) a = b
/* Rounded averages used by the scalar motion-compensation helpers:
 * avg2 = round((a+b)/2), avg4 = round((a+b+c+d)/4).
 * Arguments are now fully parenthesized so expressions with operators of
 * lower precedence than '+' (e.g. avg2(x<<1, y)) expand correctly; the
 * originals silently mis-expanded such calls. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/**
 * 1/16-pel bilinear interpolation for an 8-pixel-wide block ("GMC1",
 * used by global motion compensation).
 * dst[i] = ((16-x16)(16-y16)*p00 + x16(16-y16)*p01 +
 *           (16-x16)y16*p10 + x16*y16*p11 + rounder) >> 8
 * @param dst,src  destination / source top-left pixels (same stride)
 * @param stride   byte stride between rows
 * @param h        number of rows to produce
 * @param x16,y16  fractional position in sixteenths of a pixel, 0..15
 * @param rounder  rounding constant added before the >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* bilinear weights for the four surrounding pixels; A+B+C+D == 256 */
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/* Generic global-motion-compensation interpolation: for each destination
 * pixel of an 8-wide block, an affine transform (ox,oy with per-pixel
 * increments dxx/dxy/dyx/dyy, in units of 1<<shift) selects a source
 * position; the four surrounding source pixels are blended bilinearly.
 * The nested (unsigned) comparisons split the work into fully-inside,
 * edge-clamped-in-one-axis, and corner-clamped cases, using clip()
 * (defined elsewhere) for the out-of-picture coordinates.
 * NOTE(review): left byte-identical -- several interior lines (frac_x/y
 * computation, else branches, loop tails) are not visible in this chunk. */
746 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
747 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
750 const int s= 1<<shift;
760 for(x=0; x<8; x++){ //XXX FIXME optimize
761 int src_x, src_y, frac_x, frac_y, index;
770 if((unsigned)src_x < width){
771 if((unsigned)src_y < height){
772 index= src_x + src_y*stride;
773 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
774 + src[index +1]* frac_x )*(s-frac_y)
775 + ( src[index+stride ]*(s-frac_x)
776 + src[index+stride+1]* frac_x )* frac_y
779 index= src_x + clip(src_y, 0, height)*stride;
780 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
781 + src[index +1]* frac_x )*s
785 if((unsigned)src_y < height){
786 index= clip(src_x, 0, width) + src_y*stride;
787 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
788 + src[index+stride ]* frac_y )*s
791 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
792 dst[y*stride + x]= src[index ];
/**
 * Copy a 17-pixel-wide block of height h: four 32-bit word copies per row
 * plus the odd 17th byte.  Used to build padded temporaries for the
 * 16-tap qpel filters, which read one extra column.
 * @param dst,src             destinations / source top-left pixels
 * @param dstStride,srcStride independent row strides
 */
static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        ST32(dst+8 , LD32(src+8 ));
        ST32(dst+12, LD32(src+12));
        dst[16]= src[16]; /* the odd 17th column */
        src+= srcStride;
        dst+= dstStride;
    }
}
/**
 * Copy a 9-pixel-wide block of height h: two 32-bit word copies per row
 * plus the odd 9th byte.  Used to build padded temporaries for the
 * 8-tap qpel filters, which read one extra column.
 * @param dst,src             destination / source top-left pixels
 * @param dstStride,srcStride independent row strides
 */
static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
{
    int i;
    for(i=0; i<h; i++)
    {
        ST32(dst   , LD32(src   ));
        ST32(dst+4 , LD32(src+4 ));
        dst[8]= src[8]; /* the odd 9th column */
        src+= srcStride;
        dst+= dstStride;
    }
}
/* Quarter-pel motion-compensation generator macro.  Each instantiation
 * emits the 8- and 16-wide horizontal/vertical MPEG-4 qpel lowpass
 * filters (the (20,-6,3,-1)-tap kernel with mirrored edge handling) and
 * the qpelN_mcXY_c position functions that combine them with the
 * pixel-averaging helpers generated by PIXOP2 above.  The *_old_c
 * variants keep the historical 4-way-average formulation for reference.
 * NOTE(review): left byte-identical -- this macro's interior lines are
 * partially missing from this chunk and its tail continues past the end
 * of the visible region, so no restyle is attempted. */
833 #define QPEL_MC(r, OPNAME, RND, OP) \
834 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
835 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
839 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
840 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
841 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
842 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
843 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
844 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
845 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
846 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
852 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
854 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
858 const int src0= src[0*srcStride];\
859 const int src1= src[1*srcStride];\
860 const int src2= src[2*srcStride];\
861 const int src3= src[3*srcStride];\
862 const int src4= src[4*srcStride];\
863 const int src5= src[5*srcStride];\
864 const int src6= src[6*srcStride];\
865 const int src7= src[7*srcStride];\
866 const int src8= src[8*srcStride];\
867 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
868 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
869 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
870 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
871 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
872 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
873 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
874 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
880 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
881 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
886 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
887 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
888 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
889 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
890 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
891 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
892 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
893 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
894 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
895 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
896 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
897 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
898 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
899 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
900 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
901 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
907 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
908 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
913 const int src0= src[0*srcStride];\
914 const int src1= src[1*srcStride];\
915 const int src2= src[2*srcStride];\
916 const int src3= src[3*srcStride];\
917 const int src4= src[4*srcStride];\
918 const int src5= src[5*srcStride];\
919 const int src6= src[6*srcStride];\
920 const int src7= src[7*srcStride];\
921 const int src8= src[8*srcStride];\
922 const int src9= src[9*srcStride];\
923 const int src10= src[10*srcStride];\
924 const int src11= src[11*srcStride];\
925 const int src12= src[12*srcStride];\
926 const int src13= src[13*srcStride];\
927 const int src14= src[14*srcStride];\
928 const int src15= src[15*srcStride];\
929 const int src16= src[16*srcStride];\
930 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
931 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
932 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
933 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
934 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
935 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
936 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
937 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
938 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
939 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
940 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
941 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
942 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
943 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
944 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
945 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
951 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
952 OPNAME ## pixels8_c(dst, src, stride, 8);\
955 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
957 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
958 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
961 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
962 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
965 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
967 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
968 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
971 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
974 copy_block9(full, src, 16, stride, 9);\
975 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
976 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
979 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
981 copy_block9(full, src, 16, stride, 9);\
982 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
985 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
988 copy_block9(full, src, 16, stride, 9);\
989 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
990 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
992 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
997 copy_block9(full, src, 16, stride, 9);\
998 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
999 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1000 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1001 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1003 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1004 uint8_t full[16*9];\
1006 uint8_t halfHV[64];\
1007 copy_block9(full, src, 16, stride, 9);\
1008 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1009 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1010 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1011 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1013 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1014 uint8_t full[16*9];\
1017 uint8_t halfHV[64];\
1018 copy_block9(full, src, 16, stride, 9);\
1019 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1020 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1021 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1022 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1024 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1025 uint8_t full[16*9];\
1027 uint8_t halfHV[64];\
1028 copy_block9(full, src, 16, stride, 9);\
1029 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1030 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1031 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1032 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1034 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1035 uint8_t full[16*9];\
1038 uint8_t halfHV[64];\
1039 copy_block9(full, src, 16, stride, 9);\
1040 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1041 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1042 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1043 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1045 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1046 uint8_t full[16*9];\
1048 uint8_t halfHV[64];\
1049 copy_block9(full, src, 16, stride, 9);\
1050 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1051 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1052 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1053 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1055 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1056 uint8_t full[16*9];\
1059 uint8_t halfHV[64];\
1060 copy_block9(full, src, 16, stride, 9);\
1061 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1062 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1063 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1064 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1066 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1067 uint8_t full[16*9];\
1069 uint8_t halfHV[64];\
1070 copy_block9(full, src, 16, stride, 9);\
1071 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1072 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1073 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1074 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1076 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1078 uint8_t halfHV[64];\
1079 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1080 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1081 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1083 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1085 uint8_t halfHV[64];\
1086 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1087 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1088 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1090 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1091 uint8_t full[16*9];\
1094 uint8_t halfHV[64];\
1095 copy_block9(full, src, 16, stride, 9);\
1096 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1097 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1098 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1099 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1101 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1102 uint8_t full[16*9];\
1104 copy_block9(full, src, 16, stride, 9);\
1105 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1106 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1107 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1109 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1110 uint8_t full[16*9];\
1113 uint8_t halfHV[64];\
1114 copy_block9(full, src, 16, stride, 9);\
1115 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1116 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1117 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1118 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1120 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1121 uint8_t full[16*9];\
1123 copy_block9(full, src, 16, stride, 9);\
1124 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1125 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1126 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1128 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1130 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1131 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1133 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1134 OPNAME ## pixels16_c(dst, src, stride, 16);\
1137 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1139 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1140 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1143 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1144 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1147 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1149 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1150 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1153 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1154 uint8_t full[24*17];\
1156 copy_block17(full, src, 24, stride, 17);\
1157 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1158 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1161 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1162 uint8_t full[24*17];\
1163 copy_block17(full, src, 24, stride, 17);\
1164 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1167 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1168 uint8_t full[24*17];\
1170 copy_block17(full, src, 24, stride, 17);\
1171 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1172 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1174 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1175 uint8_t full[24*17];\
1176 uint8_t halfH[272];\
1177 uint8_t halfV[256];\
1178 uint8_t halfHV[256];\
1179 copy_block17(full, src, 24, stride, 17);\
1180 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1181 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1182 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1183 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1185 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1186 uint8_t full[24*17];\
1187 uint8_t halfH[272];\
1188 uint8_t halfHV[256];\
1189 copy_block17(full, src, 24, stride, 17);\
1190 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1191 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1192 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1193 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1195 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1196 uint8_t full[24*17];\
1197 uint8_t halfH[272];\
1198 uint8_t halfV[256];\
1199 uint8_t halfHV[256];\
1200 copy_block17(full, src, 24, stride, 17);\
1201 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1202 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1203 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1204 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1206 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1207 uint8_t full[24*17];\
1208 uint8_t halfH[272];\
1209 uint8_t halfHV[256];\
1210 copy_block17(full, src, 24, stride, 17);\
1211 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1212 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1213 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1214 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1216 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1217 uint8_t full[24*17];\
1218 uint8_t halfH[272];\
1219 uint8_t halfV[256];\
1220 uint8_t halfHV[256];\
1221 copy_block17(full, src, 24, stride, 17);\
1222 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1223 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1224 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1225 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1227 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1228 uint8_t full[24*17];\
1229 uint8_t halfH[272];\
1230 uint8_t halfHV[256];\
1231 copy_block17(full, src, 24, stride, 17);\
1232 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1233 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1234 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1235 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1237 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1238 uint8_t full[24*17];\
1239 uint8_t halfH[272];\
1240 uint8_t halfV[256];\
1241 uint8_t halfHV[256];\
1242 copy_block17(full, src, 24, stride, 17);\
1243 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1244 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1245 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1246 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1248 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1249 uint8_t full[24*17];\
1250 uint8_t halfH[272];\
1251 uint8_t halfHV[256];\
1252 copy_block17(full, src, 24, stride, 17);\
1253 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1254 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1255 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1256 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1258 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1259 uint8_t halfH[272];\
1260 uint8_t halfHV[256];\
1261 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1262 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1263 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1265 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1266 uint8_t halfH[272];\
1267 uint8_t halfHV[256];\
1268 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1269 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1270 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1272 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1273 uint8_t full[24*17];\
1274 uint8_t halfH[272];\
1275 uint8_t halfV[256];\
1276 uint8_t halfHV[256];\
1277 copy_block17(full, src, 24, stride, 17);\
1278 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1279 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1280 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1281 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1283 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1284 uint8_t full[24*17];\
1285 uint8_t halfH[272];\
1286 copy_block17(full, src, 24, stride, 17);\
1287 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1288 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1289 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1291 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1292 uint8_t full[24*17];\
1293 uint8_t halfH[272];\
1294 uint8_t halfV[256];\
1295 uint8_t halfHV[256];\
1296 copy_block17(full, src, 24, stride, 17);\
1297 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1298 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1299 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1300 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1302 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1303 uint8_t full[24*17];\
1304 uint8_t halfH[272];\
1305 copy_block17(full, src, 24, stride, 17);\
1306 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1307 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1308 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1310 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1311 uint8_t halfH[272];\
1312 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1313 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Output ops used to instantiate the qpel MC template: the lowpass sum b is
 * rounded and rescaled by (b + 16) >> 5 (the _no_rnd forms add 15 instead,
 * biasing rounding down), clipped through the cm[] crop table; the avg forms
 * then average the clipped value with the existing destination pixel. */
1316 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
1317 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1318 #define op_put(a, b) a = cm[((b) + 16)>>5]
1319 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the full sets of quarter-pel MC functions from the QPEL_MC
 * template above: put (rounding), put_no_rnd, and avg variants. */
1321 QPEL_MC(0, put_ , _ , op_put)
1322 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1323 QPEL_MC(0, avg_ , _ , op_avg)
1324 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* The per-instantiation output ops are template-local; remove them again. */
1326 #undef op_avg_no_rnd
1328 #undef op_put_no_rnd
/**
 * WMV2/mspel horizontal half-pel lowpass for an 8-pixel-wide block:
 * 4-tap filter (-1, 9, 9, -1) with rounding (+8, >>4), clipped through
 * the cm[] crop table.  A row reads src[-1]..src[9], so the caller must
 * supply one pixel of left margin and two of right margin.
 * h is the number of rows; the per-row stepping by srcStride/dstStride
 * is handled by the enclosing row loop.
 */
1330 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1331 uint8_t *cm = cropTbl + MAX_NEG_CROP;
1335 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1336 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1337 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1338 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1339 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1340 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1341 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1342 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/**
 * WMV2/mspel vertical half-pel lowpass, the column-wise counterpart of
 * wmv2_mspel8_h_lowpass: same (-1, 9, 9, -1)/16 filter with +8 rounding
 * and cm[] clipping, applied down a column.  Reads src[-srcStride] up to
 * src[9*srcStride] (one row of top margin, two of bottom margin) and
 * writes 8 output rows.  w is the number of columns; the per-column
 * stepping is handled by the enclosing loop.
 */
1348 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1349 uint8_t *cm = cropTbl + MAX_NEG_CROP;
/* Load the whole column once so each tap is read exactly one time. */
1353 const int src_1= src[ -srcStride];
1354 const int src0 = src[0 ];
1355 const int src1 = src[ srcStride];
1356 const int src2 = src[2*srcStride];
1357 const int src3 = src[3*srcStride];
1358 const int src4 = src[4*srcStride];
1359 const int src5 = src[5*srcStride];
1360 const int src6 = src[6*srcStride];
1361 const int src7 = src[7*srcStride];
1362 const int src8 = src[8*srcStride];
1363 const int src9 = src[9*srcStride];
1364 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1365 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1366 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1367 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1368 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1369 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1370 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1371 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* mspel (0,0): no subpel offset — plain 8x8 pixel copy. */
1377 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
1378 put_pixels8_c(dst, src, stride, 8);
/* mspel (1,0): quarter-pel left of center — filter a horizontal half-pel
 * plane into half[], then average it with the full-pel source. */
1381 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1383 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1384 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
/* mspel (2,0): pure horizontal half-pel, written straight to dst. */
1387 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1388 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* mspel (3,0): quarter-pel right of center — average the horizontal
 * half-pel plane with the source shifted one pixel right (src+1). */
1391 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1393 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1394 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
/* mspel (0,2): pure vertical half-pel, written straight to dst. */
1397 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1398 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* mspel (1,2): horizontal lowpass over 11 rows starting one row above
 * (src-stride) feeds a vertical pass (halfH+8 skips that extra top row),
 * giving the HV plane; the result is the average of the pure-V and HV
 * half-pel planes. */
1401 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1405 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1406 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1407 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1408 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* mspel (3,2): like mc12 but the pure-vertical plane is taken one pixel
 * to the right (src+1), placing the sample on the right quarter-pel. */
1410 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1414 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1415 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1416 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1417 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* mspel (2,2): full HV half-pel — horizontal lowpass over 11 rows, then
 * vertical lowpass on halfH+8 (skipping the extra top row) into dst. */
1419 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1421 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1422 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/**
 * Sum of absolute differences (SAD) between two 16x16 blocks; the body
 * shown is one fully unrolled 16-pixel row, accumulated into s by the
 * enclosing row loop.
 */
1426 static inline int pix_abs16x16_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1432 s += abs(pix1[0] - pix2[0]);
1433 s += abs(pix1[1] - pix2[1]);
1434 s += abs(pix1[2] - pix2[2]);
1435 s += abs(pix1[3] - pix2[3]);
1436 s += abs(pix1[4] - pix2[4]);
1437 s += abs(pix1[5] - pix2[5]);
1438 s += abs(pix1[6] - pix2[6]);
1439 s += abs(pix1[7] - pix2[7]);
1440 s += abs(pix1[8] - pix2[8]);
1441 s += abs(pix1[9] - pix2[9]);
1442 s += abs(pix1[10] - pix2[10]);
1443 s += abs(pix1[11] - pix2[11]);
1444 s += abs(pix1[12] - pix2[12]);
1445 s += abs(pix1[13] - pix2[13]);
1446 s += abs(pix1[14] - pix2[14]);
1447 s += abs(pix1[15] - pix2[15]);
/**
 * 16x16 SAD against the horizontal half-pel interpolation of pix2:
 * each reference sample is avg2() of two horizontally adjacent pixels
 * (reads pix2[0..16]).  One unrolled row shown.
 */
1454 static int pix_abs16x16_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1460 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1461 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1462 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1463 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1464 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1465 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1466 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1467 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1468 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1469 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1470 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1471 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1472 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1473 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1474 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1475 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/**
 * 16x16 SAD against the vertical half-pel interpolation of pix2:
 * each reference sample is avg2() of vertically adjacent pixels, with
 * pix3 tracking the row below pix2.  One unrolled row shown.
 */
1482 static int pix_abs16x16_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1485 uint8_t *pix3 = pix2 + line_size;
1489 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1490 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1491 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1492 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1493 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1494 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1495 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1496 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1497 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1498 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1499 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1500 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1501 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1502 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1503 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1504 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/**
 * 16x16 SAD against the diagonal half-pel interpolation of pix2:
 * each reference sample is avg4() of the 2x2 neighborhood spanning
 * pix2/pix3 (row below) and columns i/i+1.  One unrolled row shown.
 */
1512 static int pix_abs16x16_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1515 uint8_t *pix3 = pix2 + line_size;
1519 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1520 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1521 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1522 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1523 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1524 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1525 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1526 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1527 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1528 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1529 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1530 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1531 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1532 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1533 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1534 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/** 8x8 SAD — same scheme as pix_abs16x16_c with an 8-pixel row. */
1542 static inline int pix_abs8x8_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1548 s += abs(pix1[0] - pix2[0]);
1549 s += abs(pix1[1] - pix2[1]);
1550 s += abs(pix1[2] - pix2[2]);
1551 s += abs(pix1[3] - pix2[3]);
1552 s += abs(pix1[4] - pix2[4]);
1553 s += abs(pix1[5] - pix2[5]);
1554 s += abs(pix1[6] - pix2[6]);
1555 s += abs(pix1[7] - pix2[7]);
/** 8x8 SAD vs horizontal half-pel reference (avg2 of pix2[i], pix2[i+1]). */
1562 static int pix_abs8x8_x2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1568 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1569 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1570 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1571 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1572 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1573 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1574 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1575 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/** 8x8 SAD vs vertical half-pel reference (avg2 of pix2 and next row pix3). */
1582 static int pix_abs8x8_y2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1585 uint8_t *pix3 = pix2 + line_size;
1589 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1590 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1591 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1592 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1593 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1594 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1595 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1596 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/** 8x8 SAD vs diagonal half-pel reference (avg4 of the 2x2 neighborhood). */
1604 static int pix_abs8x8_xy2_c(uint8_t *pix1, uint8_t *pix2, int line_size)
1607 uint8_t *pix3 = pix2 + line_size;
1611 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1612 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1613 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1614 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1615 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1616 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1617 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1618 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* me_cmp-style adapter: the context pointer s is unused here. */
1626 static int sad16x16_c(void *s, uint8_t *a, uint8_t *b, int stride){
1627 return pix_abs16x16_c(a,b,stride);
/* me_cmp-style adapter for the 8x8 SAD; context pointer s is unused. */
1630 static int sad8x8_c(void *s, uint8_t *a, uint8_t *b, int stride){
1631 return pix_abs8x8_c(a,b,stride);
1635 * permutes an 8x8 block.
1636 * @param block the block which will be permuted according to the given permutation vector
1637 * @param permutation the permutation vector
1638 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1639 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1640 * (inverse) permuted to scantable order!
1642 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1648 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
/* First pass: walk the nonzero coefficients in scantable order, saving them
 * into temp[] (and clearing their old block positions). */
1650 for(i=0; i<=last; i++){
1651 const int j= scantable[i];
/* Second pass: write each saved coefficient back at its permuted index. */
1656 for(i=0; i<=last; i++){
1657 const int j= scantable[i];
1658 const int perm_j= permutation[j];
1659 block[perm_j]= temp[j];
1664 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* Zero all six 64-coefficient DCT blocks of one macroblock in one call. */
1666 static void clear_blocks_c(DCTELEM *blocks)
1668 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/**
 * dst[i] += src[i] for w bytes; the main loop is unrolled eight at a time,
 * with a scalar tail loop picking up the remaining w % 8 bytes.
 */
1671 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1673 for(i=0; i+7<w; i+=8){
1674 dst[i+0] += src[i+0];
1675 dst[i+1] += src[i+1];
1676 dst[i+2] += src[i+2];
1677 dst[i+3] += src[i+3];
1678 dst[i+4] += src[i+4];
1679 dst[i+5] += src[i+5];
1680 dst[i+6] += src[i+6];
1681 dst[i+7] += src[i+7];
/* scalar tail for the last w % 8 bytes */
1684 dst[i+0] += src[i+0];
/**
 * dst[i] = src1[i] - src2[i] for w bytes; unrolled eight at a time with a
 * scalar tail, mirroring add_bytes_c.
 */
1687 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1689 for(i=0; i+7<w; i+=8){
1690 dst[i+0] = src1[i+0]-src2[i+0];
1691 dst[i+1] = src1[i+1]-src2[i+1];
1692 dst[i+2] = src1[i+2]-src2[i+2];
1693 dst[i+3] = src1[i+3]-src2[i+3];
1694 dst[i+4] = src1[i+4]-src2[i+4];
1695 dst[i+5] = src1[i+5]-src2[i+5];
1696 dst[i+6] = src1[i+6]-src2[i+6];
1697 dst[i+7] = src1[i+7]-src2[i+7];
/* scalar tail for the last w % 8 bytes */
1700 dst[i+0] = src1[i+0]-src2[i+0];
/* Butterfly helpers for the 8x8 Hadamard transforms below.  BUTTERFLYA
 * contributes |x+y| + |x-y| to the SATD sum; BUTTERFLY1/BUTTERFLY2 perform
 * in-place / two-output sum-and-difference steps (bodies continue on the
 * macro continuation lines). */
1703 #define BUTTERFLY2(o1,o2,i1,i2) \
1707 #define BUTTERFLY1(x,y) \
1716 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/**
 * SATD: 8x8 Hadamard transform of the pixel difference src - dst, then the
 * sum of absolute transform coefficients.  Rows are transformed first
 * (three butterfly stages per row), then columns; the final stage is fused
 * into the BUTTERFLYA absolute-value accumulation.
 */
1718 static int hadamard8_diff_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride){
/* horizontal (per-row) transform of the src-dst differences */
1724 //FIXME try pointer walks
1725 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
1726 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
1727 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
1728 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
1730 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
1731 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
1732 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
1733 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
1735 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
1736 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
1737 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
1738 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical (per-column) transform */
1742 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
1743 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
1744 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
1745 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
1747 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
1748 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
1749 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
1750 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* last vertical stage fused with the |.| accumulation */
1753 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
1754 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
1755 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
1756 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* NOTE(review): diagnostic print of a running maximum — presumably inside
 * disabled debug code; confirm before shipping. */
1762 printf("MAX:%d\n", maxi);
/**
 * Like hadamard8_diff_c, but computes the SATD of src against a constant
 * mean value instead of a second pixel block (used where only a DC
 * prediction is available).  Same three-stage row/column butterfly
 * structure with the final stage fused into BUTTERFLYA.
 */
1768 static int hadamard8_abs_c(uint8_t *src, int stride, int mean){
1772 //FIXME OOOPS ignore 0 term instead of mean mess
/* horizontal (per-row) transform of src - mean */
1774 //FIXME try pointer walks
1775 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-mean,src[stride*i+1]-mean);
1776 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-mean,src[stride*i+3]-mean);
1777 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-mean,src[stride*i+5]-mean);
1778 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-mean,src[stride*i+7]-mean);
1780 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
1781 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
1782 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
1783 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
1785 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
1786 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
1787 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
1788 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical (per-column) transform */
1792 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
1793 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
1794 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
1795 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
1797 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
1798 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
1799 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
1800 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* last vertical stage fused with the |.| accumulation */
1803 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
1804 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
1805 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
1806 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/**
 * DCT-domain comparison: computes the pixel difference of the two 8x8
 * blocks into an 8-byte-aligned DCTELEM buffer; the remainder of the
 * function (presumably fdct + sum of |coefficients| — confirm against the
 * full source) produces the score.
 */
1812 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
1813 MpegEncContext * const s= (MpegEncContext *)c;
/* uint64_t backing array guarantees 8-byte alignment for the DCTELEM view */
1814 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
1815 DCTELEM * const temp= (DCTELEM*)aligned_temp;
1818 s->dsp.diff_pixels(temp, src1, src2, stride);
/* forward declaration — the IDCT is hardcoded here (see FIXME) */
1827 void simple_idct(DCTELEM *block); //FIXME
/**
 * Quantization-PSNR comparison: diff the blocks, keep an unquantized copy
 * in bak, quantize + unquantize + IDCT the working copy, then accumulate
 * the squared error between the round-tripped and original coefficients.
 */
1829 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
1830 MpegEncContext * const s= (MpegEncContext *)c;
/* one aligned buffer holds both the working block and its backup */
1831 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
1832 DCTELEM * const temp= (DCTELEM*)aligned_temp;
1833 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
1838 s->dsp.diff_pixels(temp, src1, src2, stride);
1840 memcpy(bak, temp, 64*sizeof(DCTELEM));
1842 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
1843 s->dct_unquantize(s, temp, 0, s->qscale);
1844 simple_idct(temp); //FIXME
/* squared error between round-tripped and original difference blocks */
1847 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/**
 * Rate-distortion comparison for one 8x8 block: backs up src2, computes the
 * DCT of the difference, quantizes it, counts the VLC bits the coefficients
 * would cost, then unquantizes + IDCTs back onto the backup and measures
 * the SSE against src1.  Returns distortion + lambda-weighted bit cost.
 */
1852 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
1853 MpegEncContext * const s= (MpegEncContext *)c;
1854 const uint8_t *scantable= s->intra_scantable.permutated;
1855 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
/* NOTE(review): VLA of stride uint64_t's (8*stride bytes) used as 8 rows of
 * stride bytes — sized by a runtime parameter; confirm stride bounds. */
1856 uint64_t __align8 aligned_bak[stride];
1857 DCTELEM * const temp= (DCTELEM*)aligned_temp;
1858 uint8_t * const bak= (uint8_t*)aligned_bak;
/* NOTE(review): "distoration" is a long-standing typo for distortion */
1859 int i, last, run, bits, level, distoration, start_i;
1860 const int esc_length= s->ac_esc_length;
1862 uint8_t * last_length;
/* save the 8x8 src2 block (two 32-bit copies per row) so it can be
 * reconstructed into without touching the caller's data */
1865 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
1866 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
1869 s->dsp.diff_pixels(temp, src1, src2, stride);
1871 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* pick intra/inter VLC tables; intra additionally pays for the luma DC */
1877 length = s->intra_ac_vlc_length;
1878 last_length= s->intra_ac_vlc_last_length;
1879 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
1882 length = s->inter_ac_vlc_length;
1883 last_length= s->inter_ac_vlc_last_length;
/* count bits for all coefficients before the last one */
1888 for(i=start_i; i<last; i++){
1889 int j= scantable[i];
/* levels within +-63 (after +64 bias) use the table, others cost esc_length */
1894 if((level&(~127)) == 0){
1895 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the final coefficient uses the "last" VLC table */
1904 level= temp[i] + 64;
1908 if((level&(~127)) == 0){
1909 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
1916 s->dct_unquantize(s, temp, 0, s->qscale);
/* reconstruct into the backup copy and measure SSE against src1 */
1919 s->dsp.idct_add(bak, stride, temp);
1921 distoration= s->dsp.sse[1](NULL, bak, src1, stride);
/* qscale^2 * 109 / 128 approximates the encoder's lambda weighting */
1923 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/**
 * Bit-cost comparison: same quantize + VLC bit-counting path as rd8x8_c,
 * but without the reconstruction/SSE step — scores only the number of bits
 * the quantized 8x8 difference block would take to code.
 */
1926 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride){
1927 MpegEncContext * const s= (MpegEncContext *)c;
1928 const uint8_t *scantable= s->intra_scantable.permutated;
1929 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
1930 DCTELEM * const temp= (DCTELEM*)aligned_temp;
1931 int i, last, run, bits, level, start_i;
1932 const int esc_length= s->ac_esc_length;
1934 uint8_t * last_length;
1936 s->dsp.diff_pixels(temp, src1, src2, stride);
1938 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* pick intra/inter VLC tables; intra additionally pays for the luma DC */
1944 length = s->intra_ac_vlc_length;
1945 last_length= s->intra_ac_vlc_last_length;
1946 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
1949 length = s->inter_ac_vlc_length;
1950 last_length= s->inter_ac_vlc_last_length;
/* count bits for all coefficients before the last one */
1955 for(i=start_i; i<last; i++){
1956 int j= scantable[i];
/* levels within +-63 (after +64 bias) use the table, others cost esc_length */
1961 if((level&(~127)) == 0){
1962 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the final coefficient uses the "last" VLC table */
1971 level= temp[i] + 64;
1975 if((level&(~127)) == 0){
1976 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Generate 16x16 variants of the 8x8 comparison functions via the
 * WARPER88_1616 wrapper macro (defined earlier in this file). */
1985 WARPER88_1616(hadamard8_diff_c, hadamard8_diff16_c)
1986 WARPER88_1616(dct_sad8x8_c, dct_sad16x16_c)
1987 WARPER88_1616(quant_psnr8x8_c, quant_psnr16x16_c)
1988 WARPER88_1616(rd8x8_c, rd16x16_c)
1989 WARPER88_1616(bit8x8_c, bit16x16_c)
1991 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* Reference-IDCT "put" adapter: runs the IDCT on block (call on an elided
 * line) and writes the clamped result to dest. */
1993 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1996 put_pixels_clamped_c(block, dest, line_size);
/* Reference-IDCT "add" adapter: adds the clamped IDCT result onto dest. */
1998 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2001 add_pixels_clamped_c(block, dest, line_size);
/* Initialize a DSPContext: build the shared lookup tables, install the
 * portable C implementations of every DSP function pointer, let the
 * architecture-specific initializers override them, and finally build
 * the IDCT coefficient-permutation table.
 * NOTE(review): this view of the file elides many lines (braces, else
 * branches, loop headers, the function's end); comments below annotate
 * only the visible code. */
2004 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
/* guard so the global tables are only built once */
2006 static int init_done = 0;
/* cropTbl: clamp-to-[0,255] lookup table; identity in the middle,
 * saturated on both MAX_NEG_CROP-wide flanks */
2010 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
2011 for(i=0;i<MAX_NEG_CROP;i++) {
2013 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* squareTbl[i] = (i-256)^2, i.e. squared error for differences in
 * [-256,255] indexed with a +256 bias */
2016 for(i=0;i<512;i++) {
2017 squareTbl[i] = (i - 256) * (i - 256);
/* inverse zigzag (+1) used by the MMX quantizer */
2020 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* forward DCT selection (encoder builds only): fast integer vs. the
 * slow/accurate IJG DCT as default */
2025 #ifdef CONFIG_ENCODERS
2026 if(avctx->dct_algo==FF_DCT_FASTINT)
2027 c->fdct = fdct_ifast;
2029 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2030 #endif //CONFIG_ENCODERS
/* IDCT selection; each choice also records which coefficient
 * permutation its input expects */
2032 if(avctx->idct_algo==FF_IDCT_INT){
2033 c->idct_put= ff_jref_idct_put;
2034 c->idct_add= ff_jref_idct_add;
2035 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2036 }else{ //accurate/default
2037 c->idct_put= simple_idct_put;
2038 c->idct_add= simple_idct_add;
2039 c->idct_permutation_type= FF_NO_IDCT_PERM;
/* portable C defaults for the basic block/pixel helpers */
2042 c->get_pixels = get_pixels_c;
2043 c->diff_pixels = diff_pixels_c;
2044 c->put_pixels_clamped = put_pixels_clamped_c;
2045 c->add_pixels_clamped = add_pixels_clamped_c;
2048 c->clear_blocks = clear_blocks_c;
2049 c->pix_sum = pix_sum_c;
2050 c->pix_norm1 = pix_norm1_c;
/* SAD functions for full-, half- (x2/y2) and quarter-sample (xy2)
 * positions, 16x16 and 8x8 variants */
2054 /* TODO [0] 16 [1] 8 */
2055 c->pix_abs16x16 = pix_abs16x16_c;
2056 c->pix_abs16x16_x2 = pix_abs16x16_x2_c;
2057 c->pix_abs16x16_y2 = pix_abs16x16_y2_c;
2058 c->pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
2059 c->pix_abs8x8 = pix_abs8x8_c;
2060 c->pix_abs8x8_x2 = pix_abs8x8_x2_c;
2061 c->pix_abs8x8_y2 = pix_abs8x8_y2_c;
2062 c->pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
/* helper to fill one row of a half-pel pixels table:
 * [0]=copy, [1]=x half-pel, [2]=y half-pel, [3]=xy half-pel */
2064 #define dspfunc(PFX, IDX, NUM) \
2065 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
2066 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
2067 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
2068 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
/* half-pel tables: IDX 0 = 16-pixel wide, IDX 1 = 8-pixel wide */
2070 dspfunc(put, 0, 16);
2071 dspfunc(put_no_rnd, 0, 16);
2073 dspfunc(put_no_rnd, 1, 8);
2075 dspfunc(avg, 0, 16);
2076 dspfunc(avg_no_rnd, 0, 16);
2078 dspfunc(avg_no_rnd, 1, 8);
/* redefined helper for the quarter-pel tables: 16 sub-pixel positions
 * mcXY where X,Y are the quarter-sample offsets */
2081 #define dspfunc(PFX, IDX, NUM) \
2082 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2083 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2084 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2085 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2086 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2087 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2088 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2089 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2090 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2091 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2092 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2093 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2094 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2095 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2096 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2097 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
/* quarter-pel tables (avg_no_rnd variants deliberately left out) */
2099 dspfunc(put_qpel, 0, 16);
2100 dspfunc(put_no_rnd_qpel, 0, 16);
2102 dspfunc(avg_qpel, 0, 16);
2103 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
2105 dspfunc(put_qpel, 1, 8);
2106 dspfunc(put_no_rnd_qpel, 1, 8);
2108 dspfunc(avg_qpel, 1, 8);
2109 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* mspel (WMV2-style motion compensation) table: only the positions
 * that codec uses are filled */
2112 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
2113 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
2114 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
2115 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
2116 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
2117 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
2118 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
2119 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* block comparison functions: [0] = 16x16 variant, [1] = 8x8 variant */
2121 c->hadamard8_diff[0]= hadamard8_diff16_c;
2122 c->hadamard8_diff[1]= hadamard8_diff_c;
2123 c->hadamard8_abs = hadamard8_abs_c;
2125 c->dct_sad[0]= dct_sad16x16_c;
2126 c->dct_sad[1]= dct_sad8x8_c;
2128 c->sad[0]= sad16x16_c;
2129 c->sad[1]= sad8x8_c;
2131 c->quant_psnr[0]= quant_psnr16x16_c;
2132 c->quant_psnr[1]= quant_psnr8x8_c;
2134 c->rd[0]= rd16x16_c;
2137 c->bit[0]= bit16x16_c;
2138 c->bit[1]= bit8x8_c;
2140 c->add_bytes= add_bytes_c;
2141 c->diff_bytes= diff_bytes_c;
/* architecture-specific initializers override the C defaults above
 * (each call is presumably guarded by an #ifdef elided from this view) */
2144 dsputil_init_mmx(c, avctx);
2147 dsputil_init_armv4l(c, avctx);
2150 dsputil_init_mlib(c, avctx);
2153 dsputil_init_alpha(c, avctx);
2156 dsputil_init_ppc(c, avctx);
2159 dsputil_init_mmi(c, avctx);
/* build the 64-entry coefficient permutation matching the IDCT chosen
 * above (possibly overridden by an arch-specific init) */
2162 switch(c->idct_permutation_type){
/* identity permutation */
2163 case FF_NO_IDCT_PERM:
2165 c->idct_permutation[i]= i;
/* libmpeg2-style permutation: swaps bit groups within each row index */
2167 case FF_LIBMPEG2_IDCT_PERM:
2169 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
/* table-driven permutation for the MMX simple IDCT */
2171 case FF_SIMPLE_IDCT_PERM:
2173 c->idct_permutation[i]= simple_mmx_permutation[i];
/* transpose: swap the 3-bit row and column indices */
2175 case FF_TRANSPOSE_IDCT_PERM:
2177 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
/* unknown permutation type: programming error, report it */
2180 fprintf(stderr, "Internal error, IDCT permutation not set\n");