3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
/* Clipping lookup table; indexed via cropTbl + MAX_NEG_CROP so out-of-range
 * values map to 0/255. Zero here; presumably filled at init — TODO confirm. */
34 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* squareTbl[256+x] presumably holds x*x for x in [-256,255]; zero here,
 * filled at init time — NOTE(review): confirm against the init code. */
35 uint32_t squareTbl[512] = {0, };
/* Standard 8x8 zigzag scan order: maps coefficient index -> raster position.
 * (Truncated excerpt lacked the closing brace; table completed.) */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields.
   (Truncated excerpt lacked the closing brace; table completed.) */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
61 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): zero-initialized here; presumably filled at init from
 * ff_zigzag_direct — confirm in the dsputil init code. __align8 is a
 * project alignment macro. */
62 uint16_t __align8 inv_zigzag_direct16[64] = {0, };
/* Alternate horizontal scan order (MPEG-2 style) for 8x8 blocks.
 * (Truncated excerpt lacked the closing brace; table completed.) */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (MPEG-2 style) for 8x8 blocks.
 * (Truncated excerpt lacked the closing brace; table completed.) */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * i.e. inverse[b] = ceil(2^32 / b); entries 0 and 1 are sentinels.
 * (Truncated excerpt lacked the closing brace; table completed.) */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for the simple_idct_mmx
 * (Truncated excerpt lacked the closing brace; table completed.) */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/**
 * Sum of all pixels of a 16x16 block.
 * (Truncated excerpt was missing declarations and closing code; body completed.)
 * @param pix       top-left pixel of the block
 * @param line_size byte stride between successive rows
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16; /* advance to the next row */
    }
    return s;
}
156 static int pix_norm1_c(uint8_t * pix, int line_size)
159 uint32_t *sq = squareTbl + 256;
162 for (i = 0; i < 16; i++) {
163 for (j = 0; j < 16; j += 8) {
174 #if LONG_MAX > 2147483647
175 register uint64_t x=*(uint64_t*)pix;
177 s += sq[(x>>8)&0xff];
178 s += sq[(x>>16)&0xff];
179 s += sq[(x>>24)&0xff];
180 s += sq[(x>>32)&0xff];
181 s += sq[(x>>40)&0xff];
182 s += sq[(x>>48)&0xff];
183 s += sq[(x>>56)&0xff];
185 register uint32_t x=*(uint32_t*)pix;
187 s += sq[(x>>8)&0xff];
188 s += sq[(x>>16)&0xff];
189 s += sq[(x>>24)&0xff];
190 x=*(uint32_t*)(pix+4);
192 s += sq[(x>>8)&0xff];
193 s += sq[(x>>16)&0xff];
194 s += sq[(x>>24)&0xff];
199 pix += line_size - 16;
/**
 * Byte-swap a buffer of w 32-bit words from src into dst (may alias).
 * Unrolled by 8 for speed; a tail loop handles the remaining 0..7 words.
 * (Truncated excerpt was missing the tail loop structure; body completed.)
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(; i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
222 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
225 uint32_t *sq = squareTbl + 256;
228 for (i = 0; i < h; i++) {
229 s += sq[pix1[0] - pix2[0]];
230 s += sq[pix1[1] - pix2[1]];
231 s += sq[pix1[2] - pix2[2]];
232 s += sq[pix1[3] - pix2[3]];
239 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
242 uint32_t *sq = squareTbl + 256;
245 for (i = 0; i < h; i++) {
246 s += sq[pix1[0] - pix2[0]];
247 s += sq[pix1[1] - pix2[1]];
248 s += sq[pix1[2] - pix2[2]];
249 s += sq[pix1[3] - pix2[3]];
250 s += sq[pix1[4] - pix2[4]];
251 s += sq[pix1[5] - pix2[5]];
252 s += sq[pix1[6] - pix2[6]];
253 s += sq[pix1[7] - pix2[7]];
260 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
263 uint32_t *sq = squareTbl + 256;
266 for (i = 0; i < h; i++) {
267 s += sq[pix1[ 0] - pix2[ 0]];
268 s += sq[pix1[ 1] - pix2[ 1]];
269 s += sq[pix1[ 2] - pix2[ 2]];
270 s += sq[pix1[ 3] - pix2[ 3]];
271 s += sq[pix1[ 4] - pix2[ 4]];
272 s += sq[pix1[ 5] - pix2[ 5]];
273 s += sq[pix1[ 6] - pix2[ 6]];
274 s += sq[pix1[ 7] - pix2[ 7]];
275 s += sq[pix1[ 8] - pix2[ 8]];
276 s += sq[pix1[ 9] - pix2[ 9]];
277 s += sq[pix1[10] - pix2[10]];
278 s += sq[pix1[11] - pix2[11]];
279 s += sq[pix1[12] - pix2[12]];
280 s += sq[pix1[13] - pix2[13]];
281 s += sq[pix1[14] - pix2[14]];
282 s += sq[pix1[15] - pix2[15]];
/* NOTE(review): wavelet-domain distortion metric — computes pixel
 * differences, runs ff_spatial_dwt (defined elsewhere), then accumulates
 * per-subband scaled absolute values. This excerpt is truncated: many
 * interior lines (declarations, closing braces, parts of the scale table)
 * are missing, so the code is left byte-identical rather than rewritten.
 * `type` presumably selects 9/7 (0) vs 5/3 (1) wavelet — confirm against
 * ff_spatial_dwt. */
291 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
293 const int dec_count= w==8 ? 3 : 4;
297 static const int scale[2][2][4][4]={
301 {268, 239, 239, 213},
306 {344, 310, 310, 280},
314 {275, 245, 245, 218},
319 {352, 317, 317, 286},
328 for (i = 0; i < h; i++) {
329 for (j = 0; j < w; j+=4) {
330 tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
331 tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
332 tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
333 tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
338 ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
342 for(level=0; level<dec_count; level++){
343 for(ori= level ? 1 : 0; ori<4; ori++){
344 int sx= (ori&1) ? 1<<level: 0;
345 int stride= 16<<(dec_count-level);
346 int sy= (ori&2) ? stride>>1 : 0;
349 for(i=0; i<size; i++){
350 for(j=0; j<size; j++){
351 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
358 for (i = 0; i < h; i++) {
359 for (j = 0; j < w; j+=4) {
360 s+= ABS(tmp[16*i+j+0]);
361 s+= ABS(tmp[16*i+j+1]);
362 s+= ABS(tmp[16*i+j+2]);
363 s+= ABS(tmp[16*i+j+3]);
/* 8-wide 5/3-wavelet distortion: thin wrapper around w_c() (type 1). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
/* 8-wide 9/7-wavelet distortion: thin wrapper around w_c() (type 0). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
/* 16-wide 5/3-wavelet distortion: thin wrapper around w_c() (type 1). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
/* 16-wide 9/7-wavelet distortion: thin wrapper around w_c() (type 0). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
387 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
391 /* read the pixels */
393 block[0] = pixels[0];
394 block[1] = pixels[1];
395 block[2] = pixels[2];
396 block[3] = pixels[3];
397 block[4] = pixels[4];
398 block[5] = pixels[5];
399 block[6] = pixels[6];
400 block[7] = pixels[7];
406 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
407 const uint8_t *s2, int stride){
410 /* read the pixels */
412 block[0] = s1[0] - s2[0];
413 block[1] = s1[1] - s2[1];
414 block[2] = s1[2] - s2[2];
415 block[3] = s1[3] - s2[3];
416 block[4] = s1[4] - s2[4];
417 block[5] = s1[5] - s2[5];
418 block[6] = s1[6] - s2[6];
419 block[7] = s1[7] - s2[7];
427 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
431 uint8_t *cm = cropTbl + MAX_NEG_CROP;
433 /* read the pixels */
435 pixels[0] = cm[block[0]];
436 pixels[1] = cm[block[1]];
437 pixels[2] = cm[block[2]];
438 pixels[3] = cm[block[3]];
439 pixels[4] = cm[block[4]];
440 pixels[5] = cm[block[5]];
441 pixels[6] = cm[block[6]];
442 pixels[7] = cm[block[7]];
449 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
453 uint8_t *cm = cropTbl + MAX_NEG_CROP;
455 /* read the pixels */
457 pixels[0] = cm[block[0]];
458 pixels[1] = cm[block[1]];
459 pixels[2] = cm[block[2]];
460 pixels[3] = cm[block[3]];
467 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
471 uint8_t *cm = cropTbl + MAX_NEG_CROP;
473 /* read the pixels */
475 pixels[0] = cm[block[0]];
476 pixels[1] = cm[block[1]];
483 static void put_signed_pixels_clamped_c(const DCTELEM *block,
484 uint8_t *restrict pixels,
489 for (i = 0; i < 8; i++) {
490 for (j = 0; j < 8; j++) {
493 else if (*block > 127)
496 *pixels = (uint8_t)(*block + 128);
500 pixels += (line_size - 8);
504 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
508 uint8_t *cm = cropTbl + MAX_NEG_CROP;
510 /* read the pixels */
512 pixels[0] = cm[pixels[0] + block[0]];
513 pixels[1] = cm[pixels[1] + block[1]];
514 pixels[2] = cm[pixels[2] + block[2]];
515 pixels[3] = cm[pixels[3] + block[3]];
516 pixels[4] = cm[pixels[4] + block[4]];
517 pixels[5] = cm[pixels[5] + block[5]];
518 pixels[6] = cm[pixels[6] + block[6]];
519 pixels[7] = cm[pixels[7] + block[7]];
525 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
529 uint8_t *cm = cropTbl + MAX_NEG_CROP;
531 /* read the pixels */
533 pixels[0] = cm[pixels[0] + block[0]];
534 pixels[1] = cm[pixels[1] + block[1]];
535 pixels[2] = cm[pixels[2] + block[2]];
536 pixels[3] = cm[pixels[3] + block[3]];
542 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
546 uint8_t *cm = cropTbl + MAX_NEG_CROP;
548 /* read the pixels */
550 pixels[0] = cm[pixels[0] + block[0]];
551 pixels[1] = cm[pixels[1] + block[1]];
/* NOTE(review): 64-bit-word variant of the PIXOP2 pixel-averaging macro
 * family (put/avg with x2/y2/xy2 half-pel interpolation, rounded and
 * non-rounded, using LD64 bitwise-average tricks). This excerpt is
 * truncated — interior macro lines (loop framing, braces) are missing —
 * so everything below is left byte-identical rather than rewritten. */
558 #define PIXOP2(OPNAME, OP) \
559 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
563 OP(*((uint64_t*)block), LD64(pixels));\
569 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
573 const uint64_t a= LD64(pixels );\
574 const uint64_t b= LD64(pixels+1);\
575 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
581 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
585 const uint64_t a= LD64(pixels );\
586 const uint64_t b= LD64(pixels+1);\
587 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
593 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
597 const uint64_t a= LD64(pixels );\
598 const uint64_t b= LD64(pixels+line_size);\
599 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
605 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
609 const uint64_t a= LD64(pixels );\
610 const uint64_t b= LD64(pixels+line_size);\
611 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
617 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
620 const uint64_t a= LD64(pixels );\
621 const uint64_t b= LD64(pixels+1);\
622 uint64_t l0= (a&0x0303030303030303ULL)\
623 + (b&0x0303030303030303ULL)\
624 + 0x0202020202020202ULL;\
625 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
626 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
630 for(i=0; i<h; i+=2){\
631 uint64_t a= LD64(pixels );\
632 uint64_t b= LD64(pixels+1);\
633 l1= (a&0x0303030303030303ULL)\
634 + (b&0x0303030303030303ULL);\
635 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
636 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
637 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
642 l0= (a&0x0303030303030303ULL)\
643 + (b&0x0303030303030303ULL)\
644 + 0x0202020202020202ULL;\
645 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
646 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
647 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
653 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
656 const uint64_t a= LD64(pixels );\
657 const uint64_t b= LD64(pixels+1);\
658 uint64_t l0= (a&0x0303030303030303ULL)\
659 + (b&0x0303030303030303ULL)\
660 + 0x0101010101010101ULL;\
661 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
662 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
666 for(i=0; i<h; i+=2){\
667 uint64_t a= LD64(pixels );\
668 uint64_t b= LD64(pixels+1);\
669 l1= (a&0x0303030303030303ULL)\
670 + (b&0x0303030303030303ULL);\
671 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
672 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
673 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
678 l0= (a&0x0303030303030303ULL)\
679 + (b&0x0303030303030303ULL)\
680 + 0x0101010101010101ULL;\
681 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
682 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
683 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
689 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
690 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
691 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
692 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
693 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
694 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
695 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
697 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
698 #else // 64 bit variant
/* NOTE(review): 32-bit-word variant of the PIXOP2 macro family: pixels2/4/8
 * copies, l2/l4 multi-source averages, and x2/y2/xy2 half-pel interpolators
 * built on rnd_avg32/no_rnd_avg32 and LD16/LD32 (defined elsewhere).
 * This excerpt is truncated — interior macro lines are missing — so the
 * code below is left byte-identical rather than rewritten. */
700 #define PIXOP2(OPNAME, OP) \
701 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
704 OP(*((uint16_t*)(block )), LD16(pixels ));\
709 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
712 OP(*((uint32_t*)(block )), LD32(pixels ));\
717 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
720 OP(*((uint32_t*)(block )), LD32(pixels ));\
721 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
726 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
727 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
730 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
731 int src_stride1, int src_stride2, int h){\
735 a= LD32(&src1[i*src_stride1 ]);\
736 b= LD32(&src2[i*src_stride2 ]);\
737 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
738 a= LD32(&src1[i*src_stride1+4]);\
739 b= LD32(&src2[i*src_stride2+4]);\
740 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
744 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
745 int src_stride1, int src_stride2, int h){\
749 a= LD32(&src1[i*src_stride1 ]);\
750 b= LD32(&src2[i*src_stride2 ]);\
751 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
752 a= LD32(&src1[i*src_stride1+4]);\
753 b= LD32(&src2[i*src_stride2+4]);\
754 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
758 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
759 int src_stride1, int src_stride2, int h){\
763 a= LD32(&src1[i*src_stride1 ]);\
764 b= LD32(&src2[i*src_stride2 ]);\
765 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
769 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
770 int src_stride1, int src_stride2, int h){\
774 a= LD16(&src1[i*src_stride1 ]);\
775 b= LD16(&src2[i*src_stride2 ]);\
776 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
780 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
781 int src_stride1, int src_stride2, int h){\
782 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
783 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
786 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
787 int src_stride1, int src_stride2, int h){\
788 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
789 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
792 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
793 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
796 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
797 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
800 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
801 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
804 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
805 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
808 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
809 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
812 uint32_t a, b, c, d, l0, l1, h0, h1;\
813 a= LD32(&src1[i*src_stride1]);\
814 b= LD32(&src2[i*src_stride2]);\
815 c= LD32(&src3[i*src_stride3]);\
816 d= LD32(&src4[i*src_stride4]);\
817 l0= (a&0x03030303UL)\
820 h0= ((a&0xFCFCFCFCUL)>>2)\
821 + ((b&0xFCFCFCFCUL)>>2);\
822 l1= (c&0x03030303UL)\
824 h1= ((c&0xFCFCFCFCUL)>>2)\
825 + ((d&0xFCFCFCFCUL)>>2);\
826 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
827 a= LD32(&src1[i*src_stride1+4]);\
828 b= LD32(&src2[i*src_stride2+4]);\
829 c= LD32(&src3[i*src_stride3+4]);\
830 d= LD32(&src4[i*src_stride4+4]);\
831 l0= (a&0x03030303UL)\
834 h0= ((a&0xFCFCFCFCUL)>>2)\
835 + ((b&0xFCFCFCFCUL)>>2);\
836 l1= (c&0x03030303UL)\
838 h1= ((c&0xFCFCFCFCUL)>>2)\
839 + ((d&0xFCFCFCFCUL)>>2);\
840 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
844 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
845 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
848 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
849 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
852 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
853 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
856 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
857 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
860 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
861 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
864 uint32_t a, b, c, d, l0, l1, h0, h1;\
865 a= LD32(&src1[i*src_stride1]);\
866 b= LD32(&src2[i*src_stride2]);\
867 c= LD32(&src3[i*src_stride3]);\
868 d= LD32(&src4[i*src_stride4]);\
869 l0= (a&0x03030303UL)\
872 h0= ((a&0xFCFCFCFCUL)>>2)\
873 + ((b&0xFCFCFCFCUL)>>2);\
874 l1= (c&0x03030303UL)\
876 h1= ((c&0xFCFCFCFCUL)>>2)\
877 + ((d&0xFCFCFCFCUL)>>2);\
878 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
879 a= LD32(&src1[i*src_stride1+4]);\
880 b= LD32(&src2[i*src_stride2+4]);\
881 c= LD32(&src3[i*src_stride3+4]);\
882 d= LD32(&src4[i*src_stride4+4]);\
883 l0= (a&0x03030303UL)\
886 h0= ((a&0xFCFCFCFCUL)>>2)\
887 + ((b&0xFCFCFCFCUL)>>2);\
888 l1= (c&0x03030303UL)\
890 h1= ((c&0xFCFCFCFCUL)>>2)\
891 + ((d&0xFCFCFCFCUL)>>2);\
892 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
895 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
896 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
897 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
898 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
900 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
901 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
902 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
903 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
906 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
908 int i, a0, b0, a1, b1;\
915 for(i=0; i<h; i+=2){\
921 block[0]= (a1+a0)>>2; /* FIXME non put */\
922 block[1]= (b1+b0)>>2;\
932 block[0]= (a1+a0)>>2;\
933 block[1]= (b1+b0)>>2;\
939 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
942 const uint32_t a= LD32(pixels );\
943 const uint32_t b= LD32(pixels+1);\
944 uint32_t l0= (a&0x03030303UL)\
947 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
948 + ((b&0xFCFCFCFCUL)>>2);\
952 for(i=0; i<h; i+=2){\
953 uint32_t a= LD32(pixels );\
954 uint32_t b= LD32(pixels+1);\
955 l1= (a&0x03030303UL)\
957 h1= ((a&0xFCFCFCFCUL)>>2)\
958 + ((b&0xFCFCFCFCUL)>>2);\
959 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
964 l0= (a&0x03030303UL)\
967 h0= ((a&0xFCFCFCFCUL)>>2)\
968 + ((b&0xFCFCFCFCUL)>>2);\
969 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
975 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
980 const uint32_t a= LD32(pixels );\
981 const uint32_t b= LD32(pixels+1);\
982 uint32_t l0= (a&0x03030303UL)\
985 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
986 + ((b&0xFCFCFCFCUL)>>2);\
990 for(i=0; i<h; i+=2){\
991 uint32_t a= LD32(pixels );\
992 uint32_t b= LD32(pixels+1);\
993 l1= (a&0x03030303UL)\
995 h1= ((a&0xFCFCFCFCUL)>>2)\
996 + ((b&0xFCFCFCFCUL)>>2);\
997 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1002 l0= (a&0x03030303UL)\
1005 h0= ((a&0xFCFCFCFCUL)>>2)\
1006 + ((b&0xFCFCFCFCUL)>>2);\
1007 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1011 pixels+=4-line_size*(h+1);\
1012 block +=4-line_size*h;\
1016 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1019 for(j=0; j<2; j++){\
1021 const uint32_t a= LD32(pixels );\
1022 const uint32_t b= LD32(pixels+1);\
1023 uint32_t l0= (a&0x03030303UL)\
1026 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1027 + ((b&0xFCFCFCFCUL)>>2);\
1031 for(i=0; i<h; i+=2){\
1032 uint32_t a= LD32(pixels );\
1033 uint32_t b= LD32(pixels+1);\
1034 l1= (a&0x03030303UL)\
1035 + (b&0x03030303UL);\
1036 h1= ((a&0xFCFCFCFCUL)>>2)\
1037 + ((b&0xFCFCFCFCUL)>>2);\
1038 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1043 l0= (a&0x03030303UL)\
1046 h0= ((a&0xFCFCFCFCUL)>>2)\
1047 + ((b&0xFCFCFCFCUL)>>2);\
1048 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1052 pixels+=4-line_size*(h+1);\
1053 block +=4-line_size*h;\
1057 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1058 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1059 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1060 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1061 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1062 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1063 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1064 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1066 #define op_avg(a, b) a = rnd_avg32(a, b)
1068 #define op_put(a, b) a = b
/* Rounded 2- and 4-tap averages. Arguments are parenthesized so that
 * low-precedence expression arguments (e.g. ternaries) expand correctly —
 * the previous definitions evaluated `a+b+1` with unparenthesized args. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* 16-wide two-source non-rounded average with a single common stride. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
/* 8-wide two-source non-rounded average with a single common stride. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/**
 * One-warp-point global motion compensation: bilinear interpolation of an
 * 8-wide, h-high block with 1/16-pel fractional offsets (x16, y16 in [0,16)).
 * The four weights sum to 256, so the >>8 normalizes the result.
 * (Truncated excerpt was missing the loop framing; body completed.)
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/* NOTE(review): general global-motion-compensation with an affine-style
 * per-pixel source position (ox,oy advanced by dxx/dxy/dyx/dyy) and edge
 * clipping via clip() (defined elsewhere). Interior lines of this excerpt
 * are missing (loop setup, rounding/shift tails of the interpolation
 * expressions), so the code is left byte-identical rather than rewritten. */
1109 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1110 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1113 const int s= 1<<shift;
1123 for(x=0; x<8; x++){ //XXX FIXME optimize
1124 int src_x, src_y, frac_x, frac_y, index;
1128 frac_x= src_x&(s-1);
1129 frac_y= src_y&(s-1);
1133 if((unsigned)src_x < width){
1134 if((unsigned)src_y < height){
1135 index= src_x + src_y*stride;
1136 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1137 + src[index +1]* frac_x )*(s-frac_y)
1138 + ( src[index+stride ]*(s-frac_x)
1139 + src[index+stride+1]* frac_x )* frac_y
1142 index= src_x + clip(src_y, 0, height)*stride;
1143 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1144 + src[index +1]* frac_x )*s
1148 if((unsigned)src_y < height){
1149 index= clip(src_x, 0, width) + src_y*stride;
1150 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1151 + src[index+stride ]* frac_y )*s
1154 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
1155 dst[y*stride + x]= src[index ];
/* Third-pel MC, no fractional offset: plain copy dispatched on block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/* Third-pel MC, 1/3 horizontal offset: dst = round((2*a + b) / 3),
 * computed as 683*(2a+b+1) >> 11 (683/2048 ~= 1/3).
 * (Truncated excerpt was missing the loop tail; body completed.) */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, 2/3 horizontal offset: dst = round((a + 2*b) / 3).
 * (Truncated excerpt was missing the loop tail; body completed.) */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, 1/3 vertical offset: dst = round((2*a + below) / 3).
 * (Truncated excerpt was missing the loop tail; body completed.) */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (1/3, 1/3) offset: bilinear weights 4/3/3/2 over the
 * 2x2 neighborhood, normalized by 2731/32768 ~= 1/12.
 * (Truncated excerpt was missing the loop tail; body completed.) */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, (1/3, 2/3) offset: bilinear weights 3/2/4/3 over the
 * 2x2 neighborhood, normalized by 2731/32768 ~= 1/12.
 * (Truncated excerpt was missing the loop tail; body completed.) */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Third-pel MC, 2/3 vertical offset: dst = round((a + 2*below) / 3).
 * (Truncated excerpt was missing the loop tail; body completed.) */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* tpel put, (2/3,1/3) position: 2x2 bilinear weights 3,4,2,3 (sum 12),
 * heavier on the right column; normalized by 2731/32768 ~= 1/12.
 * (loop scaffolding not visible in this excerpt) */
1242 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1244 for (i=0; i < height; i++) {
1245 for (j=0; j < width; j++) {
1246 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* tpel put, (2/3,2/3) position: 2x2 bilinear weights 2,3,3,4 (sum 12),
 * heaviest on the bottom-right sample; normalized by 2731/32768 ~= 1/12.
 * (loop scaffolding not visible in this excerpt) */
1253 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1255 for (i=0; i < height; i++) {
1256 for (j=0; j < width; j++) {
1257 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Averaging tpel MC, integer position: width-dispatched average with the
 * existing destination. NOTE(review): the switch(width){ line and closing
 * braces are not visible in this excerpt. */
1264 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1266 case 2: avg_pixels2_c (dst, src, stride, height); break;
1267 case 4: avg_pixels4_c (dst, src, stride, height); break;
1268 case 8: avg_pixels8_c (dst, src, stride, height); break;
1269 case 16:avg_pixels16_c(dst, src, stride, height); break;
/* tpel avg, horizontal 1/3 position: interpolates as in the put variant,
 * then averages with the existing dst value, rounding up.
 * (loop scaffolding not visible in this excerpt) */
1273 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1275 for (i=0; i < height; i++) {
1276 for (j=0; j < width; j++) {
1277 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* tpel avg, horizontal 2/3 position: put-variant interpolation averaged
 * with existing dst (rounded up). (loop scaffolding not visible here) */
1284 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1286 for (i=0; i < height; i++) {
1287 for (j=0; j < width; j++) {
1288 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* tpel avg, vertical 1/3 position: put-variant interpolation averaged with
 * existing dst (rounded up). (loop scaffolding not visible here) */
1295 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1297 for (i=0; i < height; i++) {
1298 for (j=0; j < width; j++) {
1299 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* tpel avg, (1/3,1/3) position: 2x2 weights 4,3,3,2 (sum 12) normalized by
 * 2731/32768 ~= 1/12, then averaged with existing dst (rounded up).
 * (loop scaffolding not visible here) */
1306 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1308 for (i=0; i < height; i++) {
1309 for (j=0; j < width; j++) {
1310 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* tpel avg, (1/3,2/3) position: 2x2 weights 3,2,4,3 (sum 12), averaged with
 * existing dst (rounded up). (loop scaffolding not visible here) */
1317 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1319 for (i=0; i < height; i++) {
1320 for (j=0; j < width; j++) {
1321 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* tpel avg, vertical 2/3 position: put-variant interpolation averaged with
 * existing dst (rounded up). (loop scaffolding not visible here) */
1328 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1330 for (i=0; i < height; i++) {
1331 for (j=0; j < width; j++) {
1332 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* tpel avg, (2/3,1/3) position: 2x2 weights 3,4,2,3 (sum 12), averaged with
 * existing dst (rounded up). (loop scaffolding not visible here) */
1339 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1341 for (i=0; i < height; i++) {
1342 for (j=0; j < width; j++) {
1343 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* tpel avg, (2/3,2/3) position: 2x2 weights 2,3,3,4 (sum 12), averaged with
 * existing dst (rounded up). (loop scaffolding not visible here) */
1350 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1352 for (i=0; i < height; i++) {
1353 for (j=0; j < width; j++) {
1354 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* TPEL_WIDTH(width): generates fixed-width wrappers (one per fractional MC
 * position) around the width-parameterized tpel helpers above, so they can
 * be used where a (dst, src, stride, height) signature is expected.
 * Review fix: the original wrapper bodies read
 *     void put_tpel_pixels_mcXX_c(dst, src, stride, width, height);
 * The leading `void` turns each intended call into an ill-formed block-scope
 * function declaration with an identifier list (C99 6.7.5.3 only permits
 * identifier lists in definitions), so the macro could not compile if it
 * were ever instantiated.  The stray `void` is removed so each wrapper
 * actually forwards to its helper. */
1361 #define TPEL_WIDTH(width)\
1362 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1363 put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1364 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1365 put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1366 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1367 put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1368 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1369 put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1370 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1371 put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1372 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1373 put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1374 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1375 put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1376 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1377 put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1378 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1379 put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H264_CHROMA_MC(OPNAME, OP): generates the 2/4/8-wide H.264 chroma MC
 * functions for one store op.  Each computes the standard 2x2 bilinear
 * interpolation with eighth-pel weights A=(8-x)(8-y), B=x(8-y), C=(8-x)y,
 * D=xy (A+B+C+D == 64); OP performs the +32 >>6 normalization (see the
 * op_put/op_avg definitions following the macro).
 * NOTE(review): the h-loop lines, pointer advances and closing braces are
 * not visible in this excerpt. */
1382 #define H264_CHROMA_MC(OPNAME, OP)\
1383 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1384 const int A=(8-x)*(8-y);\
1385 const int B=( x)*(8-y);\
1386 const int C=(8-x)*( y);\
1387 const int D=( x)*( y);\
1390 assert(x<8 && y<8 && x>=0 && y>=0);\
1394 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1395 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1401 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1402 const int A=(8-x)*(8-y);\
1403 const int B=( x)*(8-y);\
1404 const int C=(8-x)*( y);\
1405 const int D=( x)*( y);\
1408 assert(x<8 && y<8 && x>=0 && y>=0);\
1412 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1413 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1414 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1415 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1421 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1422 const int A=(8-x)*(8-y);\
1423 const int B=( x)*(8-y);\
1424 const int C=(8-x)*( y);\
1425 const int D=( x)*( y);\
1428 assert(x<8 && y<8 && x>=0 && y>=0);\
1432 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1433 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1434 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1435 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1436 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1437 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1438 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1439 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* Store ops for H264_CHROMA_MC: the bilinear sum carries a factor of 64
 * (A+B+C+D == 64), hence the +32 rounding bias and >>6 normalization.
 * op_avg additionally averages with the existing dst value, rounding up. */
1445 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1446 #define op_put(a, b) a = (((b) + 32)>>6)
1448 H264_CHROMA_MC(put_ , op_put)
1449 H264_CHROMA_MC(avg_ , op_avg)
/* Copies a 4-byte-wide, h-row block using one 32-bit load/store per row.
 * (per-row loop and pointer-advance lines not visible in this excerpt) */
1453 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1458 ST32(dst , LD32(src ));
/* Copies an 8-byte-wide, h-row block using two 32-bit load/stores per row.
 * (per-row loop and pointer-advance lines not visible in this excerpt) */
1464 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1469 ST32(dst , LD32(src ));
1470 ST32(dst+4 , LD32(src+4 ));
/* Copies a 16-byte-wide, h-row block using four 32-bit load/stores per row.
 * (per-row loop and pointer-advance lines not visible in this excerpt) */
1476 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1481 ST32(dst , LD32(src ));
1482 ST32(dst+4 , LD32(src+4 ));
1483 ST32(dst+8 , LD32(src+8 ));
1484 ST32(dst+12, LD32(src+12));
/* Copies a 17-byte-wide block (16x16 qpel source plus one edge column).
 * The visible stores cover 16 bytes per row; the 17th-byte store and the
 * per-row loop/advance lines are not visible in this excerpt. */
1490 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1495 ST32(dst , LD32(src ));
1496 ST32(dst+4 , LD32(src+4 ));
1497 ST32(dst+8 , LD32(src+8 ));
1498 ST32(dst+12, LD32(src+12));
/* Copies a 9-byte-wide block (8x8 qpel source plus one edge column).
 * The visible stores cover 8 bytes per row; the 9th-byte store and the
 * per-row loop/advance lines are not visible in this excerpt. */
1505 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1510 ST32(dst , LD32(src ));
1511 ST32(dst+4 , LD32(src+4 ));
/* QPEL_MC(r, OPNAME, RND, OP): generates the full MPEG-4 quarter-pel MC
 * function family for one rounding mode (OPNAME/RND) and one store op (OP):
 *  - 8x8 and 16x16 horizontal and vertical 8-tap lowpass filters with
 *    coefficients (20, -6, 3, -1).  Near the block borders the source
 *    indices are mirrored back inside the block (e.g. src[9] -> src[7]),
 *    which is why the last rows/columns reuse interior samples.
 *  - the mcXY position functions (X,Y in {0,1,2,3} quarter-pel offsets)
 *    that combine the lowpass helpers with copy_block* edge extraction and
 *    pixels*_l2 two-way averaging.
 *  - the ff_*_old_c variants, which compute the same positions via 4-way
 *    averages (pixels*_l4) of full / halfH / halfV / halfHV planes.
 * `full` buffers are (w+1) columns x (h+1) rows of padded source; `halfH`,
 * `halfV`, `halfHV` hold half-pel intermediates; OP normalizes by
 * (x+16)>>5 (or +15 for no_rnd) through the cm crop table.
 * NOTE(review): loop scaffolding lines (for-loops, pointer advances,
 * closing braces, some local declarations) are not visible in this
 * excerpt. */
1519 #define QPEL_MC(r, OPNAME, RND, OP) \
1520 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1521 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1525 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1526 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1527 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1528 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1529 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1530 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1531 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1532 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1538 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1540 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1544 const int src0= src[0*srcStride];\
1545 const int src1= src[1*srcStride];\
1546 const int src2= src[2*srcStride];\
1547 const int src3= src[3*srcStride];\
1548 const int src4= src[4*srcStride];\
1549 const int src5= src[5*srcStride];\
1550 const int src6= src[6*srcStride];\
1551 const int src7= src[7*srcStride];\
1552 const int src8= src[8*srcStride];\
1553 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1554 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1555 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1556 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1557 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1558 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1559 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1560 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1566 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1567 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1572 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1573 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1574 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1575 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1576 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1577 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1578 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1579 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1580 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1581 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1582 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1583 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1584 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1585 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1586 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1587 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1593 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1594 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1599 const int src0= src[0*srcStride];\
1600 const int src1= src[1*srcStride];\
1601 const int src2= src[2*srcStride];\
1602 const int src3= src[3*srcStride];\
1603 const int src4= src[4*srcStride];\
1604 const int src5= src[5*srcStride];\
1605 const int src6= src[6*srcStride];\
1606 const int src7= src[7*srcStride];\
1607 const int src8= src[8*srcStride];\
1608 const int src9= src[9*srcStride];\
1609 const int src10= src[10*srcStride];\
1610 const int src11= src[11*srcStride];\
1611 const int src12= src[12*srcStride];\
1612 const int src13= src[13*srcStride];\
1613 const int src14= src[14*srcStride];\
1614 const int src15= src[15*srcStride];\
1615 const int src16= src[16*srcStride];\
1616 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1617 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1618 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1619 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1620 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1621 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1622 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1623 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1624 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1625 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1626 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1627 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1628 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1629 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1630 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1631 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1637 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1638 OPNAME ## pixels8_c(dst, src, stride, 8);\
1641 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1643 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1644 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1647 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1648 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1651 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1653 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1654 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1657 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1658 uint8_t full[16*9];\
1660 copy_block9(full, src, 16, stride, 9);\
1661 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1662 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1665 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1666 uint8_t full[16*9];\
1667 copy_block9(full, src, 16, stride, 9);\
1668 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1671 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1672 uint8_t full[16*9];\
1674 copy_block9(full, src, 16, stride, 9);\
1675 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1676 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1678 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1679 uint8_t full[16*9];\
1682 uint8_t halfHV[64];\
1683 copy_block9(full, src, 16, stride, 9);\
1684 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1685 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1686 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1687 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1689 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1690 uint8_t full[16*9];\
1692 uint8_t halfHV[64];\
1693 copy_block9(full, src, 16, stride, 9);\
1694 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1695 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1696 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1697 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1699 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1700 uint8_t full[16*9];\
1703 uint8_t halfHV[64];\
1704 copy_block9(full, src, 16, stride, 9);\
1705 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1706 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1707 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1708 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1710 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1711 uint8_t full[16*9];\
1713 uint8_t halfHV[64];\
1714 copy_block9(full, src, 16, stride, 9);\
1715 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1716 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1717 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1718 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1720 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1721 uint8_t full[16*9];\
1724 uint8_t halfHV[64];\
1725 copy_block9(full, src, 16, stride, 9);\
1726 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1727 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1728 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1729 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1731 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1732 uint8_t full[16*9];\
1734 uint8_t halfHV[64];\
1735 copy_block9(full, src, 16, stride, 9);\
1736 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1737 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1738 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1739 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1741 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1742 uint8_t full[16*9];\
1745 uint8_t halfHV[64];\
1746 copy_block9(full, src, 16, stride, 9);\
1747 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1748 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1749 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1750 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1752 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1753 uint8_t full[16*9];\
1755 uint8_t halfHV[64];\
1756 copy_block9(full, src, 16, stride, 9);\
1757 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1758 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1759 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1760 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1762 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1764 uint8_t halfHV[64];\
1765 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1766 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1767 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1769 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1771 uint8_t halfHV[64];\
1772 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1773 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1774 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1776 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1777 uint8_t full[16*9];\
1780 uint8_t halfHV[64];\
1781 copy_block9(full, src, 16, stride, 9);\
1782 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1783 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1784 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1785 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1787 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1788 uint8_t full[16*9];\
1790 copy_block9(full, src, 16, stride, 9);\
1791 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1792 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1793 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1795 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1796 uint8_t full[16*9];\
1799 uint8_t halfHV[64];\
1800 copy_block9(full, src, 16, stride, 9);\
1801 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1802 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1803 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1804 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1806 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1807 uint8_t full[16*9];\
1809 copy_block9(full, src, 16, stride, 9);\
1810 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1811 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1812 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1814 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1816 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1817 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1819 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1820 OPNAME ## pixels16_c(dst, src, stride, 16);\
1823 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1825 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1826 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1829 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1830 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1833 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1835 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1836 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1839 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1840 uint8_t full[24*17];\
1842 copy_block17(full, src, 24, stride, 17);\
1843 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1844 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1847 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1848 uint8_t full[24*17];\
1849 copy_block17(full, src, 24, stride, 17);\
1850 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1853 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1854 uint8_t full[24*17];\
1856 copy_block17(full, src, 24, stride, 17);\
1857 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1858 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1860 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1861 uint8_t full[24*17];\
1862 uint8_t halfH[272];\
1863 uint8_t halfV[256];\
1864 uint8_t halfHV[256];\
1865 copy_block17(full, src, 24, stride, 17);\
1866 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1867 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1868 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1869 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1871 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1872 uint8_t full[24*17];\
1873 uint8_t halfH[272];\
1874 uint8_t halfHV[256];\
1875 copy_block17(full, src, 24, stride, 17);\
1876 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1877 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1878 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1879 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1881 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1882 uint8_t full[24*17];\
1883 uint8_t halfH[272];\
1884 uint8_t halfV[256];\
1885 uint8_t halfHV[256];\
1886 copy_block17(full, src, 24, stride, 17);\
1887 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1888 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1889 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1890 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1892 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1893 uint8_t full[24*17];\
1894 uint8_t halfH[272];\
1895 uint8_t halfHV[256];\
1896 copy_block17(full, src, 24, stride, 17);\
1897 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1898 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1899 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1900 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1902 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1903 uint8_t full[24*17];\
1904 uint8_t halfH[272];\
1905 uint8_t halfV[256];\
1906 uint8_t halfHV[256];\
1907 copy_block17(full, src, 24, stride, 17);\
1908 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1909 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1910 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1911 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1913 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1914 uint8_t full[24*17];\
1915 uint8_t halfH[272];\
1916 uint8_t halfHV[256];\
1917 copy_block17(full, src, 24, stride, 17);\
1918 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1919 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1920 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1921 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1923 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1924 uint8_t full[24*17];\
1925 uint8_t halfH[272];\
1926 uint8_t halfV[256];\
1927 uint8_t halfHV[256];\
1928 copy_block17(full, src, 24, stride, 17);\
1929 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1930 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1931 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1932 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1934 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1935 uint8_t full[24*17];\
1936 uint8_t halfH[272];\
1937 uint8_t halfHV[256];\
1938 copy_block17(full, src, 24, stride, 17);\
1939 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1940 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1941 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1942 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1944 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1945 uint8_t halfH[272];\
1946 uint8_t halfHV[256];\
1947 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1948 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1949 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1951 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1952 uint8_t halfH[272];\
1953 uint8_t halfHV[256];\
1954 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1955 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1956 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1958 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1959 uint8_t full[24*17];\
1960 uint8_t halfH[272];\
1961 uint8_t halfV[256];\
1962 uint8_t halfHV[256];\
1963 copy_block17(full, src, 24, stride, 17);\
1964 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1965 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1966 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1967 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1969 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1970 uint8_t full[24*17];\
1971 uint8_t halfH[272];\
1972 copy_block17(full, src, 24, stride, 17);\
1973 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1974 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1975 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1977 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1978 uint8_t full[24*17];\
1979 uint8_t halfH[272];\
1980 uint8_t halfV[256];\
1981 uint8_t halfHV[256];\
1982 copy_block17(full, src, 24, stride, 17);\
1983 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1984 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1985 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1986 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1988 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1989 uint8_t full[24*17];\
1990 uint8_t halfH[272];\
1991 copy_block17(full, src, 24, stride, 17);\
1992 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1993 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1994 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1996 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1997 uint8_t halfH[272];\
1998 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1999 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store ops for QPEL_MC: the 8-tap sum is normalized by (x+16)>>5 (or +15
 * for the no-rounding variants) and clipped through the cm crop table;
 * op_avg additionally averages with the existing dst (rounded up).
 * Three instantiations generate the put_/put_no_rnd_/avg_ qpel families;
 * avg_no_rnd is commented out (not generated). */
2002 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2003 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2004 #define op_put(a, b) a = cm[((b) + 16)>>5]
2005 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2007 QPEL_MC(0, put_ , _ , op_put)
2008 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2009 QPEL_MC(0, avg_ , _ , op_avg)
2010 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2012 #undef op_avg_no_rnd
2014 #undef op_put_no_rnd
/* Generates the h264 qpel 4/8/16 horizontal, vertical and 2D (hv)
 * 6-tap lowpass filters (taps 1,-5,20,20,-5,1) for one store op:
 *   OP  - store op for the 1D filters (sum scaled by 32)
 *   OP2 - store op for the hv filter  (sum scaled by 1024)
 * The hv variants first filter horizontally into the int16_t tmp[]
 * buffer (h+5 rows), then filter that buffer vertically. */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}
/* Generates the 16 h264 quarter-pel motion compensation functions
 * (_mcXY_c, X/Y = quarter-pel offset 0..3) for one block SIZE, built
 * on the _h/_v/_hv lowpass filters above plus put/avg pixel helpers.
 * Quarter-pel positions are obtained by averaging the nearest
 * half-pel results with pixelsN_l2(). */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
2356 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2357 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2358 #define op_put(a, b) a = cm[((b) + 16)>>5]
2359 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2360 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2362 H264_LOWPASS(put_ , op_put, op2_put)
2363 H264_LOWPASS(avg_ , op_avg, op2_avg)
2377 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2378 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2382 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2383 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2384 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2385 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2386 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2387 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2388 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2389 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2395 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2396 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2400 const int src_1= src[ -srcStride];
2401 const int src0 = src[0 ];
2402 const int src1 = src[ srcStride];
2403 const int src2 = src[2*srcStride];
2404 const int src3 = src[3*srcStride];
2405 const int src4 = src[4*srcStride];
2406 const int src5 = src[5*srcStride];
2407 const int src6 = src[6*srcStride];
2408 const int src7 = src[7*srcStride];
2409 const int src8 = src[8*srcStride];
2410 const int src9 = src[9*srcStride];
2411 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2412 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2413 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2414 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2415 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2416 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2417 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2418 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
2424 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2425 put_pixels8_c(dst, src, stride, 8);
2428 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2430 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2431 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2434 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2435 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2438 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2440 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2441 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2444 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2445 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2448 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2452 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2453 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2454 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2455 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2457 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2461 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2462 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2463 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2464 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2466 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2468 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2469 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
2472 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2474 const int strength= ff_h263_loop_filter_strength[qscale];
2478 int p0= src[x-2*stride];
2479 int p1= src[x-1*stride];
2480 int p2= src[x+0*stride];
2481 int p3= src[x+1*stride];
2482 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2484 if (d<-2*strength) d1= 0;
2485 else if(d<- strength) d1=-2*strength - d;
2486 else if(d< strength) d1= d;
2487 else if(d< 2*strength) d1= 2*strength - d;
2492 if(p1&256) p1= ~(p1>>31);
2493 if(p2&256) p2= ~(p2>>31);
2495 src[x-1*stride] = p1;
2496 src[x+0*stride] = p2;
2500 d2= clip((p0-p3)/4, -ad1, ad1);
2502 src[x-2*stride] = p0 - d2;
2503 src[x+ stride] = p3 + d2;
2507 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2509 const int strength= ff_h263_loop_filter_strength[qscale];
2513 int p0= src[y*stride-2];
2514 int p1= src[y*stride-1];
2515 int p2= src[y*stride+0];
2516 int p3= src[y*stride+1];
2517 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2519 if (d<-2*strength) d1= 0;
2520 else if(d<- strength) d1=-2*strength - d;
2521 else if(d< strength) d1= d;
2522 else if(d< 2*strength) d1= 2*strength - d;
2527 if(p1&256) p1= ~(p1>>31);
2528 if(p2&256) p2= ~(p2>>31);
2530 src[y*stride-1] = p1;
2531 src[y*stride+0] = p2;
2535 d2= clip((p0-p3)/4, -ad1, ad1);
2537 src[y*stride-2] = p0 - d2;
2538 src[y*stride+1] = p3 + d2;
2542 static void h261_loop_filter_c(uint8_t *src, int stride){
2547 temp[x ] = 4*src[x ];
2548 temp[x + 7*8] = 4*src[x + 7*stride];
2552 xy = y * stride + x;
2554 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2559 src[ y*stride] = (temp[ y*8] + 2)>>2;
2560 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2562 xy = y * stride + x;
2564 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
2569 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2575 s += abs(pix1[0] - pix2[0]);
2576 s += abs(pix1[1] - pix2[1]);
2577 s += abs(pix1[2] - pix2[2]);
2578 s += abs(pix1[3] - pix2[3]);
2579 s += abs(pix1[4] - pix2[4]);
2580 s += abs(pix1[5] - pix2[5]);
2581 s += abs(pix1[6] - pix2[6]);
2582 s += abs(pix1[7] - pix2[7]);
2583 s += abs(pix1[8] - pix2[8]);
2584 s += abs(pix1[9] - pix2[9]);
2585 s += abs(pix1[10] - pix2[10]);
2586 s += abs(pix1[11] - pix2[11]);
2587 s += abs(pix1[12] - pix2[12]);
2588 s += abs(pix1[13] - pix2[13]);
2589 s += abs(pix1[14] - pix2[14]);
2590 s += abs(pix1[15] - pix2[15]);
2597 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2603 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2604 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2605 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2606 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2607 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2608 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2609 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2610 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2611 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2612 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2613 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2614 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2615 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2616 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2617 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2618 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
2625 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2628 uint8_t *pix3 = pix2 + line_size;
2632 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2633 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2634 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2635 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2636 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2637 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2638 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2639 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2640 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2641 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2642 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2643 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2644 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2645 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2646 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2647 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
2655 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2658 uint8_t *pix3 = pix2 + line_size;
2662 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2663 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2664 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2665 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2666 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2667 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2668 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2669 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2670 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2671 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2672 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2673 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2674 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2675 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2676 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2677 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
2685 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2691 s += abs(pix1[0] - pix2[0]);
2692 s += abs(pix1[1] - pix2[1]);
2693 s += abs(pix1[2] - pix2[2]);
2694 s += abs(pix1[3] - pix2[3]);
2695 s += abs(pix1[4] - pix2[4]);
2696 s += abs(pix1[5] - pix2[5]);
2697 s += abs(pix1[6] - pix2[6]);
2698 s += abs(pix1[7] - pix2[7]);
2705 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2711 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2712 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2713 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2714 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2715 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2716 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2717 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2718 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2725 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2728 uint8_t *pix3 = pix2 + line_size;
2732 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2733 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2734 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2735 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2736 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2737 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2738 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2739 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2747 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2750 uint8_t *pix3 = pix2 + line_size;
2754 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2755 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2756 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2757 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2758 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2759 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2760 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2761 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2769 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2775 for(x=0; x<16; x++){
2776 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2779 for(x=0; x<15; x++){
2780 score2+= ABS( s1[x ] - s1[x +stride]
2781 - s1[x+1] + s1[x+1+stride])
2782 -ABS( s2[x ] - s2[x +stride]
2783 - s2[x+1] + s2[x+1+stride]);
2790 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2791 else return score1 + ABS(score2)*8;
2794 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2801 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2805 score2+= ABS( s1[x ] - s1[x +stride]
2806 - s1[x+1] + s1[x+1+stride])
2807 -ABS( s2[x ] - s2[x +stride]
2808 - s2[x+1] + s2[x+1+stride]);
2815 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2816 else return score1 + ABS(score2)*8;
2819 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2823 for(i=0; i<8*8; i++){
2824 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2827 assert(-512<b && b<512);
2829 sum += (w*b)*(w*b)>>4;
2834 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
2837 for(i=0; i<8*8; i++){
2838 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
2843 * permutes an 8x8 block.
2844 * @param block the block which will be permuted according to the given permutation vector
2845 * @param permutation the permutation vector
2846 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
2847 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
2848 * (inverse) permutated to scantable order!
2850 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
2856 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
2858 for(i=0; i<=last; i++){
2859 const int j= scantable[i];
2864 for(i=0; i<=last; i++){
2865 const int j= scantable[i];
2866 const int perm_j= permutation[j];
2867 block[perm_j]= temp[j];
2871 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fill cmp[0..4] with the comparison functions from 'c' selected by
 * 'type' (one slot per block size class, selected in a loop/switch).
 * NOTE(review): most of this function -- the loop over the five slots
 * and the switch over type with its remaining cases (sad, sse, satd,
 * bit, rd, vsad, nsse, ...) -- was lost in extraction; only the
 * fragments below survive. Do not treat this span as complete. */
2875 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
/* clear all five slots before installing the selected functions */
2878 memset(cmp, 0, sizeof(void*)*5);
2886 cmp[i]= c->hadamard8_diff[i];
2892 cmp[i]= c->dct_sad[i];
2895 cmp[i]= c->dct_max[i];
2898 cmp[i]= c->quant_psnr[i];
/* reached when 'type' matches no known comparison function */
2925 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
2931 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
2933 static void clear_blocks_c(DCTELEM *blocks)
2935 memset(blocks, 0, sizeof(DCTELEM)*6*64);
2938 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
2940 for(i=0; i+7<w; i+=8){
2941 dst[i+0] += src[i+0];
2942 dst[i+1] += src[i+1];
2943 dst[i+2] += src[i+2];
2944 dst[i+3] += src[i+3];
2945 dst[i+4] += src[i+4];
2946 dst[i+5] += src[i+5];
2947 dst[i+6] += src[i+6];
2948 dst[i+7] += src[i+7];
2951 dst[i+0] += src[i+0];
2954 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
2956 for(i=0; i+7<w; i+=8){
2957 dst[i+0] = src1[i+0]-src2[i+0];
2958 dst[i+1] = src1[i+1]-src2[i+1];
2959 dst[i+2] = src1[i+2]-src2[i+2];
2960 dst[i+3] = src1[i+3]-src2[i+3];
2961 dst[i+4] = src1[i+4]-src2[i+4];
2962 dst[i+5] = src1[i+5]-src2[i+5];
2963 dst[i+6] = src1[i+6]-src2[i+6];
2964 dst[i+7] = src1[i+7]-src2[i+7];
2967 dst[i+0] = src1[i+0]-src2[i+0];
2970 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
2978 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Hadamard butterfly helpers:
 *   BUTTERFLY2 - out-of-place sum/difference of two inputs
 *   BUTTERFLY1 - in-place sum/difference of two lvalues
 *   BUTTERFLYA - |x+y| + |x-y| (final accumulation stage) */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
3003 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3011 //FIXME try pointer walks
3012 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3013 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3014 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3015 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3017 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3018 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3019 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3020 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3022 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3023 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3024 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3025 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3029 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3030 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3031 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3032 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3034 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3035 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3036 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3037 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3040 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3041 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3042 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3043 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3049 printf("MAX:%d\n", maxi);
3055 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3063 //FIXME try pointer walks
3064 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3065 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3066 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3067 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3069 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3070 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3071 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3072 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3074 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3075 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3076 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3077 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3081 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3082 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3083 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3084 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3086 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3087 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3088 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3089 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3092 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3093 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3094 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3095 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3098 sum -= ABS(temp[8*0] + temp[8*4]); // -mean
3103 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3104 MpegEncContext * const s= (MpegEncContext *)c;
3105 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3106 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3111 s->dsp.diff_pixels(temp, src1, src2, stride);
3120 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3121 MpegEncContext * const s= (MpegEncContext *)c;
3122 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3123 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3128 s->dsp.diff_pixels(temp, src1, src2, stride);
3132 sum= FFMAX(sum, ABS(temp[i]));
3137 void simple_idct(DCTELEM *block); //FIXME
3139 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
/* Comparison metric: squared error introduced by the quantize ->
   dequantize -> IDCT round trip on the difference block.  A copy of the
   pre-quantization coefficients (bak) is compared against the
   reconstructed ones.
   NOTE(review): the fdct call, loop headers and return are elided in this
   listing. */
3140 MpegEncContext * const s= (MpegEncContext *)c;
3141 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8]; /* room for temp[64] + bak[64] */
3142 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3143 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3149 s->dsp.diff_pixels(temp, src1, src2, stride);
/* keep an unquantized reference copy of the coefficients */
3151 memcpy(bak, temp, 64*sizeof(DCTELEM));
3153 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3154 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3155 simple_idct(temp); //FIXME
/* accumulate squared coefficient error vs. the saved copy */
3158 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3163 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
/* Rate-distortion comparison metric: DCT+quantize the difference block,
   count the bits its coefficients would cost using the codec's VLC length
   tables, then dequantize+IDCT back onto a saved copy of the prediction
   and measure the SSE against the source.  Returns
   distortion + (bits * qscale^2 * 109 + 64) >> 7, i.e. distortion plus a
   lambda-scaled bit cost.
   NOTE(review): many original lines (loop headers, intra/inter branch,
   escape-coding path, closing braces) are elided in this listing. */
3164 MpegEncContext * const s= (MpegEncContext *)c;
3165 const uint8_t *scantable= s->intra_scantable.permutated;
3166 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3167 uint64_t __align8 aligned_bak[stride];                  /* VLA sized by stride — backup of 8 rows of src2; assumes stride is small/sane (TODO confirm) */
3168 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3169 uint8_t * const bak= (uint8_t*)aligned_bak;
3170 int i, last, run, bits, level, distoration, start_i;    /* "distoration" = distortion (typo kept; local name only) */
3171 const int esc_length= s->ac_esc_length;
3173 uint8_t * last_length;
/* save the 8 bytes of each prediction row so we can reconstruct onto it */
3178 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3179 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3182 s->dsp.diff_pixels(temp, src1, src2, stride);
3184 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra branch: AC tables for intra blocks plus the luma DC cost */
3190 length = s->intra_ac_vlc_length;
3191 last_length= s->intra_ac_vlc_last_length;
3192 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter branch: AC tables for inter blocks */
3195 length = s->inter_ac_vlc_length;
3196 last_length= s->inter_ac_vlc_last_length;
/* walk coefficients in scan order, summing VLC code lengths */
3201 for(i=start_i; i<last; i++){
3202 int j= scantable[i];
/* level in [0,127] after biasing -> table lookup; otherwise the (elided)
   path presumably charges esc_length — TODO confirm */
3207 if((level&(~127)) == 0){
3208 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* cost of the final (last) coefficient uses the "last" tables */
3217 level= temp[i] + 64;
3221 if((level&(~127)) == 0){
3222 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* reconstruct: dequantize (intra or inter) then IDCT-add onto the backup */
3230 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3232 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3235 s->dsp.idct_add(bak, stride, temp);
3237 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3239 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3242 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
/* Rate-only comparison metric: DCT+quantize the difference block and
   return the number of bits its coefficients would cost with the codec's
   VLC length tables (no distortion term, unlike rd8x8_c).
   NOTE(review): loop headers, the escape path and the return are elided
   in this listing. */
3243 MpegEncContext * const s= (MpegEncContext *)c;
3244 const uint8_t *scantable= s->intra_scantable.permutated;
3245 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3246 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3247 int i, last, run, bits, level, start_i;
3248 const int esc_length= s->ac_esc_length;
3250 uint8_t * last_length;
3254 s->dsp.diff_pixels(temp, src1, src2, stride);
3256 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra: AC tables for intra blocks plus luma DC cost */
3262 length = s->intra_ac_vlc_length;
3263 last_length= s->intra_ac_vlc_last_length;
3264 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter: AC tables for inter blocks */
3267 length = s->inter_ac_vlc_length;
3268 last_length= s->inter_ac_vlc_last_length;
/* sum VLC code lengths over coefficients in scan order */
3273 for(i=start_i; i<last; i++){
3274 int j= scantable[i];
3279 if((level&(~127)) == 0){
3280 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the last coefficient is charged via the "last" tables */
3289 level= temp[i] + 64;
3293 if((level&(~127)) == 0){
3294 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3302 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
/* Vertical SAD, intra variant: sums |pixel - pixel_below| over a
   16-pixel-wide area, i.e. vertical gradient energy of the source itself.
   NOTE(review): the outer row loop and return are elided in this listing. */
3307 for(x=0; x<16; x+=4){
3308 score+= ABS(s[x ] - s[x +stride]) + ABS(s[x+1] - s[x+1+stride])
3309 +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
3317 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
/* Vertical SAD of the difference signal: sums the absolute vertical
   gradient of (s1 - s2) over 16 columns.
   NOTE(review): outer row loop and return are elided in this listing. */
3322 for(x=0; x<16; x++){
3323 score+= ABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3332 #define SQ(a) ((a)*(a))
3333 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
/* Squared-error analogue of vsad_intra16_c: sums the squared vertical
   gradient of the source over 16 columns.
   NOTE(review): outer row loop and return are elided in this listing. */
3338 for(x=0; x<16; x+=4){
3339 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3340 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
3348 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
/* Squared-error analogue of vsad16_c: sums the squared vertical gradient
   of the difference signal (s1 - s2) over 16 columns.
   NOTE(review): outer row loop and return are elided in this listing. */
3353 for(x=0; x<16; x++){
3354 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Generate 16x16 versions of each 8x8 comparison function; the macro
   (defined earlier in the file, not visible here) presumably sums the 8x8
   metric over the four quadrants — TODO confirm against the macro body. */
3363 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3364 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3365 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3366 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3367 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3368 WARPER8_16_SQ(rd8x8_c, rd16_c)
3369 WARPER8_16_SQ(bit8x8_c, bit16_c)
3371 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* IDCT wrappers combining an inverse transform with put/add of the
   clamped result.  The 4/2/1 variants serve the lowres decoding paths
   (see dsputil_init below).  NOTE(review): the j_rev_dct* calls between
   each signature and its put/add line are elided in this listing. */
3373 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3376 put_pixels_clamped_c(block, dest, line_size);
3378 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3381 add_pixels_clamped_c(block, dest, line_size);
3384 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3387 put_pixels_clamped4_c(block, dest, line_size);
3389 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3392 add_pixels_clamped4_c(block, dest, line_size);
3395 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3398 put_pixels_clamped2_c(block, dest, line_size);
3400 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3403 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 "IDCT": only the DC term survives; (DC+4)>>3 is the scaled,
   rounded pixel value, clamped via the crop table */
3406 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3408 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3410 dest[0] = cm[(block[0] + 4)>>3];
3412 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3414 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3416 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3419 /* init static data */
3420 void dsputil_static_init(void)
/* One-time initialization of the file-scope lookup tables:
   - cropTbl: clamps an int in [-MAX_NEG_CROP, 255+MAX_NEG_CROP] to [0,255]
   - squareTbl: (i-256)^2 for fast squared-difference lookups
   - inv_zigzag_direct16: inverse zigzag order + 1, for the MMX quantizer
   NOTE(review): braces and some loop bodies (e.g. the low-side crop fill)
   are elided in this listing. */
3424 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3425 for(i=0;i<MAX_NEG_CROP;i++) {
3427 cropTbl[i + MAX_NEG_CROP + 256] = 255;   /* saturate values above 255 */
3430 for(i=0;i<512;i++) {
3431 squareTbl[i] = (i - 256) * (i - 256);
3434 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3438 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
/* Populate the DSPContext function-pointer table with the C reference
   implementations (selected per avctx options), then hand off to the
   per-architecture init functions which may override entries, and finally
   build the IDCT coefficient permutation table.
   NOTE(review): this listing elides many lines — braces, #ifdef/#endif
   guards around the arch inits, loop headers and some assignments —
   consult the full source before editing. */
3442 #ifdef CONFIG_ENCODERS
/* forward DCT selection (encoder only) */
3443 if(avctx->dct_algo==FF_DCT_FASTINT) {
3444 c->fdct = fdct_ifast;
3445 c->fdct248 = fdct_ifast248;
3447 else if(avctx->dct_algo==FF_DCT_FAAN) {
3448 c->fdct = ff_faandct;
3449 c->fdct248 = ff_faandct248;
3452 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3453 c->fdct248 = ff_fdct248_islow;
3455 #endif //CONFIG_ENCODERS
/* IDCT selection; lowres 1/2/3 decode at reduced resolution using the
   4x4/2x2/1x1 jref wrappers above */
3457 if(avctx->lowres==1){
3458 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3459 c->idct_put= ff_jref_idct4_put;
3460 c->idct_add= ff_jref_idct4_add;
3462 c->idct_put= ff_h264_lowres_idct_put_c;
3463 c->idct_add= ff_h264_lowres_idct_add_c;
3465 c->idct = j_rev_dct4;
3466 c->idct_permutation_type= FF_NO_IDCT_PERM;
3467 }else if(avctx->lowres==2){
3468 c->idct_put= ff_jref_idct2_put;
3469 c->idct_add= ff_jref_idct2_add;
3470 c->idct = j_rev_dct2;
3471 c->idct_permutation_type= FF_NO_IDCT_PERM;
3472 }else if(avctx->lowres==3){
3473 c->idct_put= ff_jref_idct1_put;
3474 c->idct_add= ff_jref_idct1_add;
3475 c->idct = j_rev_dct1;
3476 c->idct_permutation_type= FF_NO_IDCT_PERM;
3478 if(avctx->idct_algo==FF_IDCT_INT){
3479 c->idct_put= ff_jref_idct_put;
3480 c->idct_add= ff_jref_idct_add;
3481 c->idct = j_rev_dct;
3482 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3483 }else{ //accurate/default
3484 c->idct_put= simple_idct_put;
3485 c->idct_add= simple_idct_add;
3486 c->idct = simple_idct;
3487 c->idct_permutation_type= FF_NO_IDCT_PERM;
3491 c->h264_idct_add= ff_h264_idct_add_c;
3493 /* VP3 DSP support */
3494 c->vp3_dsp_init = vp3_dsp_init_c;
3495 c->vp3_idct = vp3_idct_c;
/* basic pixel block access helpers */
3497 c->get_pixels = get_pixels_c;
3498 c->diff_pixels = diff_pixels_c;
3499 c->put_pixels_clamped = put_pixels_clamped_c;
3500 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3501 c->add_pixels_clamped = add_pixels_clamped_c;
3504 c->clear_blocks = clear_blocks_c;
3505 c->pix_sum = pix_sum_c;
3506 c->pix_norm1 = pix_norm1_c;
/* SAD for full-pel and half-pel (x2/y2/xy2) positions, 16x16 and 8x8 */
3508 /* TODO [0] 16 [1] 8 */
3509 c->pix_abs[0][0] = pix_abs16_c;
3510 c->pix_abs[0][1] = pix_abs16_x2_c;
3511 c->pix_abs[0][2] = pix_abs16_y2_c;
3512 c->pix_abs[0][3] = pix_abs16_xy2_c;
3513 c->pix_abs[1][0] = pix_abs8_c;
3514 c->pix_abs[1][1] = pix_abs8_x2_c;
3515 c->pix_abs[1][2] = pix_abs8_y2_c;
3516 c->pix_abs[1][3] = pix_abs8_xy2_c;
/* half-pel put/avg tables, with and without rounding */
3518 #define dspfunc(PFX, IDX, NUM) \
3519 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3520 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3521 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3522 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3524 dspfunc(put, 0, 16);
3525 dspfunc(put_no_rnd, 0, 16);
3527 dspfunc(put_no_rnd, 1, 8);
3531 dspfunc(avg, 0, 16);
3532 dspfunc(avg_no_rnd, 0, 16);
3534 dspfunc(avg_no_rnd, 1, 8);
3539 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3540 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
/* third-pel (tpel) MC; index appears to encode 4*dy+dx — TODO confirm */
3542 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3543 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3544 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3545 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3546 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3547 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3548 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3549 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3550 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3552 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3553 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3554 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3555 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3556 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3557 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3558 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3559 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3560 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* quarter-pel MC: 16 sub-positions per table (mcXY, X=dx, Y=dy) */
3562 #define dspfunc(PFX, IDX, NUM) \
3563 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3564 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3565 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3566 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3567 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3568 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3569 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3570 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3571 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3572 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3573 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3574 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3575 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3576 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3577 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3578 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3580 dspfunc(put_qpel, 0, 16);
3581 dspfunc(put_no_rnd_qpel, 0, 16);
3583 dspfunc(avg_qpel, 0, 16);
3584 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3586 dspfunc(put_qpel, 1, 8);
3587 dspfunc(put_no_rnd_qpel, 1, 8);
3589 dspfunc(avg_qpel, 1, 8);
3590 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* H.264 quarter-pel luma (16/8/4) and chroma (8/4/2) MC */
3592 dspfunc(put_h264_qpel, 0, 16);
3593 dspfunc(put_h264_qpel, 1, 8);
3594 dspfunc(put_h264_qpel, 2, 4);
3595 dspfunc(avg_h264_qpel, 0, 16);
3596 dspfunc(avg_h264_qpel, 1, 8);
3597 dspfunc(avg_h264_qpel, 2, 4);
3600 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3601 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3602 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3603 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3604 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3605 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
/* WMV2/mspel MC */
3607 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3608 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3609 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3610 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3611 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3612 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3613 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3614 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* comparison metrics defined earlier in this file: [0]=16x16, [1]=8x8 */
3616 #define SET_CMP_FUNC(name) \
3617 c->name[0]= name ## 16_c;\
3618 c->name[1]= name ## 8x8_c;
3620 SET_CMP_FUNC(hadamard8_diff)
3621 c->hadamard8_diff[4]= hadamard8_intra16_c;
3622 SET_CMP_FUNC(dct_sad)
3623 SET_CMP_FUNC(dct_max)
3624 c->sad[0]= pix_abs16_c;
3625 c->sad[1]= pix_abs8_c;
3629 SET_CMP_FUNC(quant_psnr)
3632 c->vsad[0]= vsad16_c;
3633 c->vsad[4]= vsad_intra16_c;
3634 c->vsse[0]= vsse16_c;
3635 c->vsse[4]= vsse_intra16_c;
3636 c->nsse[0]= nsse16_c;
3637 c->nsse[1]= nsse8_c;
3638 c->w53[0]= w53_16_c;
3640 c->w97[0]= w97_16_c;
/* misc helpers (HuffYUV, loop filters, basis transforms) */
3643 c->add_bytes= add_bytes_c;
3644 c->diff_bytes= diff_bytes_c;
3645 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3646 c->bswap_buf= bswap_buf;
3648 c->h263_h_loop_filter= h263_h_loop_filter_c;
3649 c->h263_v_loop_filter= h263_v_loop_filter_c;
3651 c->h261_loop_filter= h261_loop_filter_c;
3653 c->try_8x8basis= try_8x8basis_c;
3654 c->add_8x8basis= add_8x8basis_c;
/* per-architecture overrides; each call presumably guarded by an #ifdef
   elided from this listing */
3657 dsputil_init_mmx(c, avctx);
3660 dsputil_init_armv4l(c, avctx);
3663 dsputil_init_mlib(c, avctx);
3666 dsputil_init_vis(c,avctx);
3669 dsputil_init_alpha(c, avctx);
3672 dsputil_init_ppc(c, avctx);
3675 dsputil_init_mmi(c, avctx);
3678 dsputil_init_sh4(c,avctx);
/* build the 64-entry coefficient permutation matching the chosen IDCT's
   internal scan order */
3681 switch(c->idct_permutation_type){
3682 case FF_NO_IDCT_PERM:
3684 c->idct_permutation[i]= i;          /* identity */
3686 case FF_LIBMPEG2_IDCT_PERM:
3688 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3690 case FF_SIMPLE_IDCT_PERM:
3692 c->idct_permutation[i]= simple_mmx_permutation[i];
3694 case FF_TRANSPOSE_IDCT_PERM:
3696 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);   /* swap row/column */
3699 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");