3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
/* Forward declaration of the spatial discrete wavelet transform used by the
   wavelet comparison metrics (w_c and friends) below; presumably implemented
   in snow.c (the CONFIG_SNOW_ENCODER comment further down suggests so) —
   TODO confirm. */
35 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
/* Pixel clipping table with MAX_NEG_CROP guard entries on each side.
   Zero-initialized here; presumably filled at DSP init time — confirm. */
37 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Table of squares. Code below indexes it as squareTbl + 256, so entry
   256+x presumably holds x*x for x in [-256,255]; filled at init — confirm. */
38 uint32_t squareTbl[512] = {0, };
/* Classic 8x8 zigzag scan order: entry i is the raster (row-major) index of
   the i-th coefficient in scan order.
   NOTE(review): the terminating "};" does not appear in this listing. */
40 const uint8_t ff_zigzag_direct[64] = {
41 0, 1, 8, 16, 9, 2, 3, 10,
42 17, 24, 32, 25, 18, 11, 4, 5,
43 12, 19, 26, 33, 40, 48, 41, 34,
44 27, 20, 13, 6, 7, 14, 21, 28,
45 35, 42, 49, 56, 57, 50, 43, 36,
46 29, 22, 15, 23, 30, 37, 44, 51,
47 58, 59, 52, 45, 38, 31, 39, 46,
48 53, 60, 61, 54, 47, 55, 62, 63
51 /* Specific zigzag scan for 248 idct. NOTE that unlike the
52 specification, we interleave the fields */
53 const uint8_t ff_zigzag248_direct[64] = {
54 0, 8, 1, 9, 16, 24, 2, 10,
55 17, 25, 32, 40, 48, 56, 33, 41,
56 18, 26, 3, 11, 4, 12, 19, 27,
57 34, 42, 49, 57, 50, 58, 35, 43,
58 20, 28, 5, 13, 6, 14, 21, 29,
59 36, 44, 51, 59, 52, 60, 37, 45,
60 22, 30, 7, 15, 23, 31, 38, 46,
61 53, 61, 54, 62, 39, 47, 55, 63,
/* NOTE(review): the terminating "};" does not appear in this listing. */
64 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* 8-byte aligned; zero-initialized here, presumably filled at init from
   ff_zigzag_direct — TODO confirm against the init code. */
65 uint16_t __align8 inv_zigzag_direct16[64] = {0, };
/* Alternate horizontal scan order (same format as ff_zigzag_direct:
   entry i is the raster index of the i-th scanned coefficient).
   NOTE(review): the terminating "};" does not appear in this listing. */
67 const uint8_t ff_alternate_horizontal_scan[64] = {
68 0, 1, 2, 3, 8, 9, 16, 17,
69 10, 11, 4, 5, 6, 7, 15, 14,
70 13, 12, 19, 18, 24, 25, 32, 33,
71 26, 27, 20, 21, 22, 23, 28, 29,
72 30, 31, 34, 35, 40, 41, 48, 49,
73 42, 43, 36, 37, 38, 39, 44, 45,
74 46, 47, 50, 51, 56, 57, 58, 59,
75 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order (same format as ff_zigzag_direct).
   NOTE(review): the terminating "};" does not appear in this listing. */
78 const uint8_t ff_alternate_vertical_scan[64] = {
79 0, 8, 16, 24, 1, 9, 2, 10,
80 17, 25, 32, 40, 48, 56, 57, 49,
81 41, 33, 26, 18, 3, 11, 4, 12,
82 19, 27, 34, 42, 50, 58, 35, 43,
83 51, 59, 20, 28, 5, 13, 6, 14,
84 21, 29, 36, 44, 52, 60, 37, 45,
85 53, 61, 22, 30, 7, 15, 23, 31,
86 38, 46, 54, 62, 39, 47, 55, 63,
89 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Fixed-point reciprocal table: entry b is round-to-nearest of 2^32/b.
   Entry 0 is unused (set to 0); entry 1 is saturated to 2^32-1.
   NOTE(review): the terminating "};" does not appear in this listing. */
90 const uint32_t inverse[256]={
91 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
92 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
93 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
94 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
95 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
96 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
97 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
98 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
99 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
100 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
101 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
102 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
103 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
104 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
105 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
106 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
107 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
108 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
109 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
110 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
111 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
112 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
113 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
114 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
115 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
116 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
117 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
118 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
119 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
120 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
121 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
122 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
125 /* Input permutation for the simple_idct_mmx */
/* Coefficient reordering table; values are byte offsets 0x00..0x3F into the
   8x8 block — the exact layout is dictated by simple_idct_mmx, confirm there.
   NOTE(review): the terminating "};" does not appear in this listing. */
126 static const uint8_t simple_mmx_permutation[64]={
127 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
128 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
129 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
130 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
131 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
132 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
133 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
134 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Sums the pixels of a 16x16 block (row stride = line_size).
   NOTE(review): this listing is missing the local declarations, the
   accumulation statements inside the inner loop, the closing braces and the
   return — only the iteration skeleton is visible. */
137 static int pix_sum_c(uint8_t * pix, int line_size)
142 for (i = 0; i < 16; i++) {
143 for (j = 0; j < 16; j += 8) {
154 pix += line_size - 16;
/* Sum of squared pixel values over a 16x16 block, using the squareTbl
   lookup (sq points at the x==0 entry). A 64-bit path is selected when
   long is wider than 32 bits, processing 8 bytes per load; otherwise two
   32-bit loads cover the same 8 bytes.
   NOTE(review): the type-punned *(uint64_t*)pix / *(uint32_t*)pix loads
   assume suitable alignment and violate strict aliasing — presumably relied
   upon throughout this file, confirm the project's compile flags.
   NOTE(review): declarations, #else/#endif, closing braces and return are
   missing from this listing. */
159 static int pix_norm1_c(uint8_t * pix, int line_size)
162 uint32_t *sq = squareTbl + 256;
165 for (i = 0; i < 16; i++) {
166 for (j = 0; j < 16; j += 8) {
177 #if LONG_MAX > 2147483647
178 register uint64_t x=*(uint64_t*)pix;
180 s += sq[(x>>8)&0xff];
181 s += sq[(x>>16)&0xff];
182 s += sq[(x>>24)&0xff];
183 s += sq[(x>>32)&0xff];
184 s += sq[(x>>40)&0xff];
185 s += sq[(x>>48)&0xff];
186 s += sq[(x>>56)&0xff];
188 register uint32_t x=*(uint32_t*)pix;
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 x=*(uint32_t*)(pix+4);
195 s += sq[(x>>8)&0xff];
196 s += sq[(x>>16)&0xff];
197 s += sq[(x>>24)&0xff];
202 pix += line_size - 16;
/* Byte-swaps w 32-bit words from src into dst, unrolled 8 words per
   iteration; the trailing cleanup loop header is missing from this listing
   (only its body line, dst[i+0]=..., is visible). */
207 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
210 for(i=0; i+8<=w; i+=8){
211 dst[i+0]= bswap_32(src[i+0]);
212 dst[i+1]= bswap_32(src[i+1]);
213 dst[i+2]= bswap_32(src[i+2]);
214 dst[i+3]= bswap_32(src[i+3]);
215 dst[i+4]= bswap_32(src[i+4]);
216 dst[i+5]= bswap_32(src[i+5]);
217 dst[i+6]= bswap_32(src[i+6]);
218 dst[i+7]= bswap_32(src[i+7]);
221 dst[i+0]= bswap_32(src[i+0]);
/* Sum of squared differences over a 4-pixel-wide column strip, h rows.
   sq = squareTbl + 256 so negative byte differences index correctly.
   NOTE(review): declarations, pointer advance, closing brace and return are
   missing from this listing. */
225 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
228 uint32_t *sq = squareTbl + 256;
231 for (i = 0; i < h; i++) {
232 s += sq[pix1[0] - pix2[0]];
233 s += sq[pix1[1] - pix2[1]];
234 s += sq[pix1[2] - pix2[2]];
235 s += sq[pix1[3] - pix2[3]];
/* Sum of squared differences over an 8-pixel-wide strip, h rows
   (same squareTbl+256 trick as sse4_c).
   NOTE(review): declarations, pointer advance, closing brace and return are
   missing from this listing. */
242 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
245 uint32_t *sq = squareTbl + 256;
248 for (i = 0; i < h; i++) {
249 s += sq[pix1[0] - pix2[0]];
250 s += sq[pix1[1] - pix2[1]];
251 s += sq[pix1[2] - pix2[2]];
252 s += sq[pix1[3] - pix2[3]];
253 s += sq[pix1[4] - pix2[4]];
254 s += sq[pix1[5] - pix2[5]];
255 s += sq[pix1[6] - pix2[6]];
256 s += sq[pix1[7] - pix2[7]];
/* Sum of squared differences over a 16-pixel-wide strip, h rows
   (same squareTbl+256 trick as sse4_c/sse8_c).
   NOTE(review): declarations, pointer advance, closing brace and return are
   missing from this listing. */
263 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
266 uint32_t *sq = squareTbl + 256;
269 for (i = 0; i < h; i++) {
270 s += sq[pix1[ 0] - pix2[ 0]];
271 s += sq[pix1[ 1] - pix2[ 1]];
272 s += sq[pix1[ 2] - pix2[ 2]];
273 s += sq[pix1[ 3] - pix2[ 3]];
274 s += sq[pix1[ 4] - pix2[ 4]];
275 s += sq[pix1[ 5] - pix2[ 5]];
276 s += sq[pix1[ 6] - pix2[ 6]];
277 s += sq[pix1[ 7] - pix2[ 7]];
278 s += sq[pix1[ 8] - pix2[ 8]];
279 s += sq[pix1[ 9] - pix2[ 9]];
280 s += sq[pix1[10] - pix2[10]];
281 s += sq[pix1[11] - pix2[11]];
282 s += sq[pix1[12] - pix2[12]];
283 s += sq[pix1[13] - pix2[13]];
284 s += sq[pix1[14] - pix2[14]];
285 s += sq[pix1[15] - pix2[15]];
/* Wavelet-domain comparison metric: builds the (pix1 - pix2) difference
   (scaled by 16) into a 16-wide tmp buffer, forward-transforms it with
   ff_spatial_dwt, then accumulates per-subband scaled magnitudes.
   type selects the wavelet (callers pass 1 for w53, 0 for w97); dec_count is
   3 for w==8 and 4 for w==16, which also selects the scale[][dec_count-3]
   weight row.
   NOTE(review): the tmp declaration, most scale[] rows, the subband
   accumulation body and the function's tail are missing from this listing. */
294 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
295 #ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
297 const int dec_count= w==8 ? 3 : 4;
301 static const int scale[2][2][4][4]={
305 {268, 239, 239, 213},
310 {344, 310, 310, 280},
318 {275, 245, 245, 218},
323 {352, 317, 317, 286},
332 for (i = 0; i < h; i++) {
333 for (j = 0; j < w; j+=4) {
334 tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
335 tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
336 tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
337 tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
343 ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
347 for(level=0; level<dec_count; level++){
348 for(ori= level ? 1 : 0; ori<4; ori++){
349 int sx= (ori&1) ? 1<<level: 0;
350 int stride= 16<<(dec_count-level);
351 int sy= (ori&2) ? stride>>1 : 0;
354 for(i=0; i<size; i++){
355 for(j=0; j<size; j++){
356 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
363 for (i = 0; i < h; i++) {
364 for (j = 0; j < w; j+=4) {
365 s+= ABS(tmp[16*i+j+0]);
366 s+= ABS(tmp[16*i+j+1]);
367 s+= ABS(tmp[16*i+j+2]);
368 s+= ABS(tmp[16*i+j+3]);
/* Thin wrappers binding w_c to fixed width/type: w53_* pass type 1,
   w97_* pass type 0; *_8 use w==8, *_16 use w==16.
   NOTE(review): each wrapper's closing brace is missing from this listing. */
377 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
378 return w_c(v, pix1, pix2, line_size, 8, h, 1);
381 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
382 return w_c(v, pix1, pix2, line_size, 8, h, 0);
385 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
386 return w_c(v, pix1, pix2, line_size, 16, h, 1);
389 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
390 return w_c(v, pix1, pix2, line_size, 16, h, 0);
/* Widens 8 pixels per row from uint8_t into the DCTELEM block.
   NOTE(review): the row loop, pointer advances and closing braces are
   missing from this listing. */
393 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
397 /* read the pixels */
399 block[0] = pixels[0];
400 block[1] = pixels[1];
401 block[2] = pixels[2];
402 block[3] = pixels[3];
403 block[4] = pixels[4];
404 block[5] = pixels[5];
405 block[6] = pixels[6];
406 block[7] = pixels[7];
/* Stores the per-pixel difference s1 - s2 (8 pixels per row) into block.
   NOTE(review): the row loop, pointer advances and closing braces are
   missing from this listing. */
412 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
413 const uint8_t *s2, int stride){
416 /* read the pixels */
418 block[0] = s1[0] - s2[0];
419 block[1] = s1[1] - s2[1];
420 block[2] = s1[2] - s2[2];
421 block[3] = s1[3] - s2[3];
422 block[4] = s1[4] - s2[4];
423 block[5] = s1[5] - s2[5];
424 block[6] = s1[6] - s2[6];
425 block[7] = s1[7] - s2[7];
/* Writes 8 block coefficients per row to pixels, clamped to [0,255] via the
   cropTbl lookup (cm points at the 0 entry so negative values resolve).
   NOTE(review): the row loop, pointer advances and closing braces are
   missing from this listing. */
433 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
437 uint8_t *cm = cropTbl + MAX_NEG_CROP;
439 /* read the pixels */
441 pixels[0] = cm[block[0]];
442 pixels[1] = cm[block[1]];
443 pixels[2] = cm[block[2]];
444 pixels[3] = cm[block[3]];
445 pixels[4] = cm[block[4]];
446 pixels[5] = cm[block[5]];
447 pixels[6] = cm[block[6]];
448 pixels[7] = cm[block[7]];
/* 4-pixel-wide variant of put_pixels_clamped_c.
   NOTE(review): loop/advance/closing lines are missing from this listing. */
455 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
459 uint8_t *cm = cropTbl + MAX_NEG_CROP;
461 /* read the pixels */
463 pixels[0] = cm[block[0]];
464 pixels[1] = cm[block[1]];
465 pixels[2] = cm[block[2]];
466 pixels[3] = cm[block[3]];
/* 2-pixel-wide variant of put_pixels_clamped_c.
   NOTE(review): loop/advance/closing lines are missing from this listing. */
473 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
477 uint8_t *cm = cropTbl + MAX_NEG_CROP;
479 /* read the pixels */
481 pixels[0] = cm[block[0]];
482 pixels[1] = cm[block[1]];
/* Writes block + 128 per pixel over an 8x8 area, clamping explicitly
   (the visible branch handles *block > 127; the symmetric < -128 branch is
   presumably on the lines missing from this listing — confirm). */
489 static void put_signed_pixels_clamped_c(const DCTELEM *block,
490 uint8_t *restrict pixels,
495 for (i = 0; i < 8; i++) {
496 for (j = 0; j < 8; j++) {
499 else if (*block > 127)
502 *pixels = (uint8_t)(*block + 128);
506 pixels += (line_size - 8);
/* Adds 8 block coefficients per row onto pixels, clamping the sum to
   [0,255] via the cropTbl lookup.
   NOTE(review): the row loop, pointer advances and closing braces are
   missing from this listing. */
510 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
514 uint8_t *cm = cropTbl + MAX_NEG_CROP;
516 /* read the pixels */
518 pixels[0] = cm[pixels[0] + block[0]];
519 pixels[1] = cm[pixels[1] + block[1]];
520 pixels[2] = cm[pixels[2] + block[2]];
521 pixels[3] = cm[pixels[3] + block[3]];
522 pixels[4] = cm[pixels[4] + block[4]];
523 pixels[5] = cm[pixels[5] + block[5]];
524 pixels[6] = cm[pixels[6] + block[6]];
525 pixels[7] = cm[pixels[7] + block[7]];
/* 4-pixel-wide variant of add_pixels_clamped_c.
   NOTE(review): loop/advance/closing lines are missing from this listing. */
531 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
535 uint8_t *cm = cropTbl + MAX_NEG_CROP;
537 /* read the pixels */
539 pixels[0] = cm[pixels[0] + block[0]];
540 pixels[1] = cm[pixels[1] + block[1]];
541 pixels[2] = cm[pixels[2] + block[2]];
542 pixels[3] = cm[pixels[3] + block[3]];
/* 2-pixel-wide variant of add_pixels_clamped_c.
   NOTE(review): loop/advance/closing lines are missing from this listing. */
548 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
552 uint8_t *cm = cropTbl + MAX_NEG_CROP;
554 /* read the pixels */
556 pixels[0] = cm[pixels[0] + block[0]];
557 pixels[1] = cm[pixels[1] + block[1]];
/* PIXOP2, 64-bit variant: generates put/avg copy and half-pel interpolation
   routines that process 8 bytes per uint64 load (LD64). The 0xFEFE... mask
   implements SIMD-within-a-register per-byte averaging; the 0x0303.../0xFCFC...
   masks split each byte into low/high bit groups for the xy2 (center)
   interpolation, with 0x0202... (rounding) vs 0x0101... (no-rounding) bias.
   NOTE(review): many continuation lines (loop headers, pointer advances,
   closing braces) are missing from this listing, and the #else below pairs
   with an #if that is not visible here. */
564 #define PIXOP2(OPNAME, OP) \
565 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
569 OP(*((uint64_t*)block), LD64(pixels));\
575 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
579 const uint64_t a= LD64(pixels );\
580 const uint64_t b= LD64(pixels+1);\
581 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
587 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
591 const uint64_t a= LD64(pixels );\
592 const uint64_t b= LD64(pixels+1);\
593 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
599 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
603 const uint64_t a= LD64(pixels );\
604 const uint64_t b= LD64(pixels+line_size);\
605 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
611 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
615 const uint64_t a= LD64(pixels );\
616 const uint64_t b= LD64(pixels+line_size);\
617 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
623 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
626 const uint64_t a= LD64(pixels );\
627 const uint64_t b= LD64(pixels+1);\
628 uint64_t l0= (a&0x0303030303030303ULL)\
629 + (b&0x0303030303030303ULL)\
630 + 0x0202020202020202ULL;\
631 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
632 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
636 for(i=0; i<h; i+=2){\
637 uint64_t a= LD64(pixels );\
638 uint64_t b= LD64(pixels+1);\
639 l1= (a&0x0303030303030303ULL)\
640 + (b&0x0303030303030303ULL);\
641 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
642 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
643 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
648 l0= (a&0x0303030303030303ULL)\
649 + (b&0x0303030303030303ULL)\
650 + 0x0202020202020202ULL;\
651 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
652 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
653 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
659 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
662 const uint64_t a= LD64(pixels );\
663 const uint64_t b= LD64(pixels+1);\
664 uint64_t l0= (a&0x0303030303030303ULL)\
665 + (b&0x0303030303030303ULL)\
666 + 0x0101010101010101ULL;\
667 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
668 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
672 for(i=0; i<h; i+=2){\
673 uint64_t a= LD64(pixels );\
674 uint64_t b= LD64(pixels+1);\
675 l1= (a&0x0303030303030303ULL)\
676 + (b&0x0303030303030303ULL);\
677 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
678 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
679 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
684 l0= (a&0x0303030303030303ULL)\
685 + (b&0x0303030303030303ULL)\
686 + 0x0101010101010101ULL;\
687 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
688 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
689 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
695 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
696 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
697 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
698 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
699 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
700 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
701 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* 64-bit rounding average: (a|b) - (((a^b)&0xFE..)>>1) per byte. */
703 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
704 #else // 64 bit variant
/* PIXOP2, 32-bit variant: same routine family as the 64-bit version above,
   built from 16/32-bit loads (LD16/LD32). Adds the _l2 (average of two
   sources) and _l4 (average of four sources, used for quarter-pel) helper
   generators, plus x2/y2/xy2 half-pel wrappers and the 16-wide CALL_2X
   forwarders. Masks 0x03030303/0xFCFCFCFC split bytes into low/high bit
   groups; 0x02020202 biases for rounding, 0x01010101 for no-rounding.
   NOTE(review): many continuation lines (loop headers, some mask addends,
   pointer advances, closing braces) are missing from this listing. */
706 #define PIXOP2(OPNAME, OP) \
707 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
710 OP(*((uint16_t*)(block )), LD16(pixels ));\
715 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
718 OP(*((uint32_t*)(block )), LD32(pixels ));\
723 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
726 OP(*((uint32_t*)(block )), LD32(pixels ));\
727 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
732 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
733 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
736 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
737 int src_stride1, int src_stride2, int h){\
741 a= LD32(&src1[i*src_stride1 ]);\
742 b= LD32(&src2[i*src_stride2 ]);\
743 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
744 a= LD32(&src1[i*src_stride1+4]);\
745 b= LD32(&src2[i*src_stride2+4]);\
746 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
750 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
751 int src_stride1, int src_stride2, int h){\
755 a= LD32(&src1[i*src_stride1 ]);\
756 b= LD32(&src2[i*src_stride2 ]);\
757 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
758 a= LD32(&src1[i*src_stride1+4]);\
759 b= LD32(&src2[i*src_stride2+4]);\
760 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
764 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
765 int src_stride1, int src_stride2, int h){\
769 a= LD32(&src1[i*src_stride1 ]);\
770 b= LD32(&src2[i*src_stride2 ]);\
771 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
775 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
776 int src_stride1, int src_stride2, int h){\
780 a= LD16(&src1[i*src_stride1 ]);\
781 b= LD16(&src2[i*src_stride2 ]);\
782 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
786 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
787 int src_stride1, int src_stride2, int h){\
788 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
789 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
792 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
793 int src_stride1, int src_stride2, int h){\
794 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
795 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
798 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
799 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
802 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
803 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
806 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
807 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
810 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
811 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
814 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
815 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
818 uint32_t a, b, c, d, l0, l1, h0, h1;\
819 a= LD32(&src1[i*src_stride1]);\
820 b= LD32(&src2[i*src_stride2]);\
821 c= LD32(&src3[i*src_stride3]);\
822 d= LD32(&src4[i*src_stride4]);\
823 l0= (a&0x03030303UL)\
826 h0= ((a&0xFCFCFCFCUL)>>2)\
827 + ((b&0xFCFCFCFCUL)>>2);\
828 l1= (c&0x03030303UL)\
830 h1= ((c&0xFCFCFCFCUL)>>2)\
831 + ((d&0xFCFCFCFCUL)>>2);\
832 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
833 a= LD32(&src1[i*src_stride1+4]);\
834 b= LD32(&src2[i*src_stride2+4]);\
835 c= LD32(&src3[i*src_stride3+4]);\
836 d= LD32(&src4[i*src_stride4+4]);\
837 l0= (a&0x03030303UL)\
840 h0= ((a&0xFCFCFCFCUL)>>2)\
841 + ((b&0xFCFCFCFCUL)>>2);\
842 l1= (c&0x03030303UL)\
844 h1= ((c&0xFCFCFCFCUL)>>2)\
845 + ((d&0xFCFCFCFCUL)>>2);\
846 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
850 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
851 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
854 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
855 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
858 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
859 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
862 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
863 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
866 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
867 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
870 uint32_t a, b, c, d, l0, l1, h0, h1;\
871 a= LD32(&src1[i*src_stride1]);\
872 b= LD32(&src2[i*src_stride2]);\
873 c= LD32(&src3[i*src_stride3]);\
874 d= LD32(&src4[i*src_stride4]);\
875 l0= (a&0x03030303UL)\
878 h0= ((a&0xFCFCFCFCUL)>>2)\
879 + ((b&0xFCFCFCFCUL)>>2);\
880 l1= (c&0x03030303UL)\
882 h1= ((c&0xFCFCFCFCUL)>>2)\
883 + ((d&0xFCFCFCFCUL)>>2);\
884 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
885 a= LD32(&src1[i*src_stride1+4]);\
886 b= LD32(&src2[i*src_stride2+4]);\
887 c= LD32(&src3[i*src_stride3+4]);\
888 d= LD32(&src4[i*src_stride4+4]);\
889 l0= (a&0x03030303UL)\
892 h0= ((a&0xFCFCFCFCUL)>>2)\
893 + ((b&0xFCFCFCFCUL)>>2);\
894 l1= (c&0x03030303UL)\
896 h1= ((c&0xFCFCFCFCUL)>>2)\
897 + ((d&0xFCFCFCFCUL)>>2);\
898 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
901 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
902 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
903 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
904 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
906 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
907 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
908 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
909 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
912 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
914 int i, a0, b0, a1, b1;\
921 for(i=0; i<h; i+=2){\
927 block[0]= (a1+a0)>>2; /* FIXME non put */\
928 block[1]= (b1+b0)>>2;\
938 block[0]= (a1+a0)>>2;\
939 block[1]= (b1+b0)>>2;\
945 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
948 const uint32_t a= LD32(pixels );\
949 const uint32_t b= LD32(pixels+1);\
950 uint32_t l0= (a&0x03030303UL)\
953 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
954 + ((b&0xFCFCFCFCUL)>>2);\
958 for(i=0; i<h; i+=2){\
959 uint32_t a= LD32(pixels );\
960 uint32_t b= LD32(pixels+1);\
961 l1= (a&0x03030303UL)\
963 h1= ((a&0xFCFCFCFCUL)>>2)\
964 + ((b&0xFCFCFCFCUL)>>2);\
965 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
970 l0= (a&0x03030303UL)\
973 h0= ((a&0xFCFCFCFCUL)>>2)\
974 + ((b&0xFCFCFCFCUL)>>2);\
975 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
981 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
986 const uint32_t a= LD32(pixels );\
987 const uint32_t b= LD32(pixels+1);\
988 uint32_t l0= (a&0x03030303UL)\
991 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
992 + ((b&0xFCFCFCFCUL)>>2);\
996 for(i=0; i<h; i+=2){\
997 uint32_t a= LD32(pixels );\
998 uint32_t b= LD32(pixels+1);\
999 l1= (a&0x03030303UL)\
1000 + (b&0x03030303UL);\
1001 h1= ((a&0xFCFCFCFCUL)>>2)\
1002 + ((b&0xFCFCFCFCUL)>>2);\
1003 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1008 l0= (a&0x03030303UL)\
1011 h0= ((a&0xFCFCFCFCUL)>>2)\
1012 + ((b&0xFCFCFCFCUL)>>2);\
1013 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1017 pixels+=4-line_size*(h+1);\
1018 block +=4-line_size*h;\
1022 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1025 for(j=0; j<2; j++){\
1027 const uint32_t a= LD32(pixels );\
1028 const uint32_t b= LD32(pixels+1);\
1029 uint32_t l0= (a&0x03030303UL)\
1032 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1033 + ((b&0xFCFCFCFCUL)>>2);\
1037 for(i=0; i<h; i+=2){\
1038 uint32_t a= LD32(pixels );\
1039 uint32_t b= LD32(pixels+1);\
1040 l1= (a&0x03030303UL)\
1041 + (b&0x03030303UL);\
1042 h1= ((a&0xFCFCFCFCUL)>>2)\
1043 + ((b&0xFCFCFCFCUL)>>2);\
1044 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1049 l0= (a&0x03030303UL)\
1052 h0= ((a&0xFCFCFCFCUL)>>2)\
1053 + ((b&0xFCFCFCFCUL)>>2);\
1054 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1058 pixels+=4-line_size*(h+1);\
1059 block +=4-line_size*h;\
1063 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1064 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1065 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1066 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1067 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1068 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1069 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1070 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* OP plugged into PIXOP2: avg = rounded average with existing dst. */
1072 #define op_avg(a, b) a = rnd_avg32(a, b)
/* OP plugged into PIXOP2: put = plain store. */
1074 #define op_put(a, b) a = b
/* Scalar rounded averages of 2 and 4 values. */
1081 #define avg2(a,b) ((a+b+1)>>1)
1082 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Adapters: bind the 6-argument _l2 averaging helpers (three independent
   strides) to the common 5-argument single-stride signature.
   NOTE(review): each wrapper's closing brace is missing from this listing. */
1084 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1085 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1088 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1089 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/* Global motion compensation, single 1/16-pel vector: bilinear interpolation
   with weights A..D derived from the fractional position (x16, y16); the
   weights sum to 256, hence the >>8 after adding rounder.
   NOTE(review): the row loop, pointer advances and tail are missing from
   this listing. */
1092 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1094 const int A=(16-x16)*(16-y16);
1095 const int B=( x16)*(16-y16);
1096 const int C=(16-x16)*( y16);
1097 const int D=( x16)*( y16);
1102 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1103 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1104 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1105 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1106 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1107 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1108 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1109 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General (affine) global motion compensation: per destination pixel the
   source position is derived from (ox,oy) and the dxx/dxy/dyx/dyy transform
   (per-pixel accumulation lines are missing from this listing), split into
   integer part and s=1<<shift fraction, then bilinearly interpolated.
   Positions outside [0,width)x[0,height) are clipped to the border, with
   degenerate 1-D or nearest-pixel fetches on the edges.
   NOTE(review): loop setup, the rounding term after the *s products, and
   closing braces are missing from this listing. */
1115 static void gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1116 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1119 const int s= 1<<shift;
1129 for(x=0; x<8; x++){ //XXX FIXME optimize
1130 int src_x, src_y, frac_x, frac_y, index;
1134 frac_x= src_x&(s-1);
1135 frac_y= src_y&(s-1);
1139 if((unsigned)src_x < width){
1140 if((unsigned)src_y < height){
1141 index= src_x + src_y*stride;
1142 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1143 + src[index +1]* frac_x )*(s-frac_y)
1144 + ( src[index+stride ]*(s-frac_x)
1145 + src[index+stride+1]* frac_x )* frac_y
1148 index= src_x + clip(src_y, 0, height)*stride;
1149 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1150 + src[index +1]* frac_x )*s
1154 if((unsigned)src_y < height){
1155 index= clip(src_x, 0, width) + src_y*stride;
1156 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1157 + src[index+stride ]* frac_y )*s
1160 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
1161 dst[y*stride + x]= src[index ];
/* Third-pel MC, zero-offset case: plain copy, dispatched on block width.
   NOTE(review): the switch statement line and closing braces are missing
   from this listing. */
1173 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1175 case 2: put_pixels2_c (dst, src, stride, height); break;
1176 case 4: put_pixels4_c (dst, src, stride, height); break;
1177 case 8: put_pixels8_c (dst, src, stride, height); break;
1178 case 16:put_pixels16_c(dst, src, stride, height); break;
1182 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1184 for (i=0; i < height; i++) {
1185 for (j=0; j < width; j++) {
1186 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1193 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1195 for (i=0; i < height; i++) {
1196 for (j=0; j < width; j++) {
1197 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
1204 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1206 for (i=0; i < height; i++) {
1207 for (j=0; j < width; j++) {
1208 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
1215 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1217 for (i=0; i < height; i++) {
1218 for (j=0; j < width; j++) {
1219 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1226 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1228 for (i=0; i < height; i++) {
1229 for (j=0; j < width; j++) {
1230 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, vertical offset 2/3: weights 1:2 toward the row below. */
1237 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1239 for (i=0; i < height; i++) {
1240 for (j=0; j < width; j++) {
1241 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Thirdpel MC, offset (2/3,1/3): 2x2 blend, weights 3:4:2:3 (sum 12). */
1248 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1250 for (i=0; i < height; i++) {
1251 for (j=0; j < width; j++) {
1252 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, offset (2/3,2/3): 2x2 blend, weights 2:3:3:4 (sum 12). */
1259 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1261 for (i=0; i < height; i++) {
1262 for (j=0; j < width; j++) {
1263 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Averaging thirdpel MC (0,0): averages src into dst via the fixed-size
 * avg helpers. Switch header/braces elided from this excerpt. */
1270 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1272 case 2: avg_pixels2_c (dst, src, stride, height); break;
1273 case 4: avg_pixels4_c (dst, src, stride, height); break;
1274 case 8: avg_pixels8_c (dst, src, stride, height); break;
1275 case 16:avg_pixels16_c(dst, src, stride, height); break;
/* Averaging thirdpel MC (1/3,0): computes the same prediction as
 * put_tpel_pixels_mc10_c, then averages with existing dst (round up). */
1279 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1281 for (i=0; i < height; i++) {
1282 for (j=0; j < width; j++) {
1283 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc20_c (2/3 horizontal). */
1290 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1292 for (i=0; i < height; i++) {
1293 for (j=0; j < width; j++) {
1294 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc01_c (1/3 vertical). */
1301 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1303 for (i=0; i < height; i++) {
1304 for (j=0; j < width; j++) {
1305 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc11_c (1/3,1/3). */
1312 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1314 for (i=0; i < height; i++) {
1315 for (j=0; j < width; j++) {
1316 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc12_c (1/3,2/3). */
1323 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1325 for (i=0; i < height; i++) {
1326 for (j=0; j < width; j++) {
1327 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc02_c (2/3 vertical). */
1334 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1336 for (i=0; i < height; i++) {
1337 for (j=0; j < width; j++) {
1338 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride]) + 1) >> 11) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc21_c (2/3,1/3). */
1345 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1347 for (i=0; i < height; i++) {
1348 for (j=0; j < width; j++) {
1349 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of put_tpel_pixels_mc22_c (2/3,2/3). */
1356 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1358 for (i=0; i < height; i++) {
1359 for (j=0; j < width; j++) {
1360 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/*
 * TPEL_WIDTH(width): generates fixed-width thirdpel MC wrappers that
 * forward to the generic put_tpel_pixels_mcXX_c() routines above with
 * the width baked in.
 *
 * Fix: each wrapper body originally read
 *     void put_tpel_pixels_mcXX_c(dst, src, stride, width, height);
 * The stray "void" keyword turns the intended call into an old-style
 * (K&R identifier-list) function declarator, which is invalid at block
 * scope outside a definition -- so the wrappers would fail to compile
 * (and certainly never forward) once the macro is instantiated. The
 * "void" is removed so each wrapper actually performs the call.
 */
1367 #define TPEL_WIDTH(width)\
1368 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1369 put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1370 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1371 put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1372 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1373 put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1374 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1375 put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1376 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1377 put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1378 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1379 put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1380 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1381 put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1382 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1383 put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1384 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1385 put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/*
 * H264_CHROMA_MC(OPNAME, OP): generates H.264 chroma MC functions for
 * 2-, 4- and 8-pixel-wide blocks. Each pixel is a bilinear blend of a
 * 2x2 source neighborhood with eighth-pel weights A..D, which sum to
 * 64 = 8*8; OP performs the final scaling/store (see op_put/op_avg).
 * NOTE(review): the per-row loop headers, pointer advances and closing
 * braces are elided from this excerpt; code lines are kept byte-identical.
 */
1388 #define H264_CHROMA_MC(OPNAME, OP)\
1389 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1390 const int A=(8-x)*(8-y);/* weight of top-left sample */\
1391 const int B=( x)*(8-y);/* top-right */\
1392 const int C=(8-x)*( y);/* bottom-left */\
1393 const int D=( x)*( y);/* bottom-right; A+B+C+D == 64 */\
1396 assert(x<8 && y<8 && x>=0 && y>=0);\
1400 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1401 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1407 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1408 const int A=(8-x)*(8-y);\
1409 const int B=( x)*(8-y);\
1410 const int C=(8-x)*( y);\
1411 const int D=( x)*( y);\
1414 assert(x<8 && y<8 && x>=0 && y>=0);\
1418 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1419 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1420 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1421 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1427 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1428 const int A=(8-x)*(8-y);\
1429 const int B=( x)*(8-y);\
1430 const int C=(8-x)*( y);\
1431 const int D=( x)*( y);\
1434 assert(x<8 && y<8 && x>=0 && y>=0);\
1438 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1439 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1440 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1441 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1442 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1443 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1444 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1445 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* Store ops for H264_CHROMA_MC: the 64-weighted sum is divided by 64
 * with rounding ((b)+32)>>6; op_avg then averages with existing dst. */
1451 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1452 #define op_put(a, b) a = (((b) + 32)>>6)
/* Instantiate put_/avg_ h264_chroma_mc{2,4,8}_c. */
1454 H264_CHROMA_MC(put_ , op_put)
1455 H264_CHROMA_MC(avg_ , op_avg)
/* Copy a 4-wide, h-tall block one 32-bit word per row via LD32/ST32
 * (unaligned-safe load/store macros). Loop header and per-row stride
 * advances are elided from this excerpt. */
1459 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1464 ST32(dst , LD32(src ));
/* Copy an 8-wide, h-tall block as two 32-bit words per row. */
1470 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1475 ST32(dst , LD32(src ));
1476 ST32(dst+4 , LD32(src+4 ));
/* Copy a 16-wide, h-tall block as four 32-bit words per row. */
1482 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1487 ST32(dst , LD32(src ));
1488 ST32(dst+4 , LD32(src+4 ));
1489 ST32(dst+8 , LD32(src+8 ));
1490 ST32(dst+12, LD32(src+12));
/* Copy a 17-wide block (16-pel area + one edge column for the qpel
 * filters). The trailing 17th-byte copy per row is elided from this
 * excerpt along with the loop header. */
1496 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1501 ST32(dst , LD32(src ));
1502 ST32(dst+4 , LD32(src+4 ));
1503 ST32(dst+8 , LD32(src+8 ));
1504 ST32(dst+12, LD32(src+12));
/* Copy a 9-wide block (8-pel area + one edge column for the qpel
 * filters). The trailing 9th-byte copy per row is elided from this
 * excerpt along with the loop header. */
1511 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1516 ST32(dst , LD32(src ));
1517 ST32(dst+4 , LD32(src+4 ));
/*
 * QPEL_MC(r, OPNAME, RND, OP): generates the complete set of MPEG-4
 * quarter-pel motion-compensation functions for 8x8 and 16x16 blocks.
 *
 * The h/v lowpass kernels apply the MPEG-4 half-sample 8-tap filter
 * (20, -6, 3, -1)/32 with mirrored edge handling (note the src[8]/src[16]
 * reuse near the block edges). The *_mcXY_c functions then combine the
 * filtered planes: mc00 is a plain copy; mc10/mc30 and mc01/mc03 average
 * the source with a half-pel plane; the diagonal cases build intermediate
 * halfH/halfV/halfHV planes and average them. The ff_*_old_c variants keep
 * the older 4-way-average formulation for reference.
 *
 * OP performs the final round/clip/store (see op_put/op_avg and the
 * *_no_rnd variants below); RND selects rounding vs no-rounding helper
 * names; cm points into cropTbl for saturation to 0..255.
 *
 * NOTE(review): many original lines (loop headers, temporary decls,
 * pointer advances, closing braces) are elided from this excerpt; all
 * code lines below are kept byte-identical.
 */
1525 #define QPEL_MC(r, OPNAME, RND, OP) \
1526 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1527 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1531 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1532 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1533 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1534 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1535 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1536 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1537 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1538 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1544 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1546 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1550 const int src0= src[0*srcStride];\
1551 const int src1= src[1*srcStride];\
1552 const int src2= src[2*srcStride];\
1553 const int src3= src[3*srcStride];\
1554 const int src4= src[4*srcStride];\
1555 const int src5= src[5*srcStride];\
1556 const int src6= src[6*srcStride];\
1557 const int src7= src[7*srcStride];\
1558 const int src8= src[8*srcStride];\
1559 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1560 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1561 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1562 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1563 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1564 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1565 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1566 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1572 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1573 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1578 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1579 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1580 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1581 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1582 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1583 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1584 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1585 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1586 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1587 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1588 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1589 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1590 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1591 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1592 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1593 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1599 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1600 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1605 const int src0= src[0*srcStride];\
1606 const int src1= src[1*srcStride];\
1607 const int src2= src[2*srcStride];\
1608 const int src3= src[3*srcStride];\
1609 const int src4= src[4*srcStride];\
1610 const int src5= src[5*srcStride];\
1611 const int src6= src[6*srcStride];\
1612 const int src7= src[7*srcStride];\
1613 const int src8= src[8*srcStride];\
1614 const int src9= src[9*srcStride];\
1615 const int src10= src[10*srcStride];\
1616 const int src11= src[11*srcStride];\
1617 const int src12= src[12*srcStride];\
1618 const int src13= src[13*srcStride];\
1619 const int src14= src[14*srcStride];\
1620 const int src15= src[15*srcStride];\
1621 const int src16= src[16*srcStride];\
1622 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1623 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1624 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1625 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1626 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1627 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1628 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1629 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1630 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1631 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1632 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1633 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1634 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1635 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1636 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1637 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1643 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1644 OPNAME ## pixels8_c(dst, src, stride, 8);\
1647 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1649 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1650 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1653 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1654 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1657 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1659 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1660 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1663 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1664 uint8_t full[16*9];\
1666 copy_block9(full, src, 16, stride, 9);\
1667 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1668 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1671 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1672 uint8_t full[16*9];\
1673 copy_block9(full, src, 16, stride, 9);\
1674 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1677 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1678 uint8_t full[16*9];\
1680 copy_block9(full, src, 16, stride, 9);\
1681 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1682 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1684 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1685 uint8_t full[16*9];\
1688 uint8_t halfHV[64];\
1689 copy_block9(full, src, 16, stride, 9);\
1690 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1691 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1692 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1693 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1695 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1696 uint8_t full[16*9];\
1698 uint8_t halfHV[64];\
1699 copy_block9(full, src, 16, stride, 9);\
1700 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1701 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1702 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1703 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1705 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1706 uint8_t full[16*9];\
1709 uint8_t halfHV[64];\
1710 copy_block9(full, src, 16, stride, 9);\
1711 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1712 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1713 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1714 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1716 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1717 uint8_t full[16*9];\
1719 uint8_t halfHV[64];\
1720 copy_block9(full, src, 16, stride, 9);\
1721 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1722 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1723 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1724 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1726 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1727 uint8_t full[16*9];\
1730 uint8_t halfHV[64];\
1731 copy_block9(full, src, 16, stride, 9);\
1732 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1733 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1734 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1735 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1737 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1738 uint8_t full[16*9];\
1740 uint8_t halfHV[64];\
1741 copy_block9(full, src, 16, stride, 9);\
1742 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1744 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1747 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1748 uint8_t full[16*9];\
1751 uint8_t halfHV[64];\
1752 copy_block9(full, src, 16, stride, 9);\
1753 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1754 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1755 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1756 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1758 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1759 uint8_t full[16*9];\
1761 uint8_t halfHV[64];\
1762 copy_block9(full, src, 16, stride, 9);\
1763 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1765 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1768 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1770 uint8_t halfHV[64];\
1771 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1772 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1773 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1775 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1777 uint8_t halfHV[64];\
1778 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1779 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1780 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1782 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1783 uint8_t full[16*9];\
1786 uint8_t halfHV[64];\
1787 copy_block9(full, src, 16, stride, 9);\
1788 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1789 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1790 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1791 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1793 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1794 uint8_t full[16*9];\
1796 copy_block9(full, src, 16, stride, 9);\
1797 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1798 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1799 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1801 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1802 uint8_t full[16*9];\
1805 uint8_t halfHV[64];\
1806 copy_block9(full, src, 16, stride, 9);\
1807 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1809 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1810 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1812 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1813 uint8_t full[16*9];\
1815 copy_block9(full, src, 16, stride, 9);\
1816 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1817 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1818 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1820 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1822 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1823 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1825 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1826 OPNAME ## pixels16_c(dst, src, stride, 16);\
1829 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1831 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1832 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1835 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1836 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1839 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1841 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1842 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1845 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1846 uint8_t full[24*17];\
1848 copy_block17(full, src, 24, stride, 17);\
1849 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1850 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1853 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1854 uint8_t full[24*17];\
1855 copy_block17(full, src, 24, stride, 17);\
1856 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1859 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1860 uint8_t full[24*17];\
1862 copy_block17(full, src, 24, stride, 17);\
1863 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1864 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1866 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1867 uint8_t full[24*17];\
1868 uint8_t halfH[272];\
1869 uint8_t halfV[256];\
1870 uint8_t halfHV[256];\
1871 copy_block17(full, src, 24, stride, 17);\
1872 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1873 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1874 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1875 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1877 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1878 uint8_t full[24*17];\
1879 uint8_t halfH[272];\
1880 uint8_t halfHV[256];\
1881 copy_block17(full, src, 24, stride, 17);\
1882 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1883 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1884 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1885 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1887 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1888 uint8_t full[24*17];\
1889 uint8_t halfH[272];\
1890 uint8_t halfV[256];\
1891 uint8_t halfHV[256];\
1892 copy_block17(full, src, 24, stride, 17);\
1893 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1894 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1895 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1896 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1898 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1899 uint8_t full[24*17];\
1900 uint8_t halfH[272];\
1901 uint8_t halfHV[256];\
1902 copy_block17(full, src, 24, stride, 17);\
1903 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1904 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1905 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1906 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1908 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1909 uint8_t full[24*17];\
1910 uint8_t halfH[272];\
1911 uint8_t halfV[256];\
1912 uint8_t halfHV[256];\
1913 copy_block17(full, src, 24, stride, 17);\
1914 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1915 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1916 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1917 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1919 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1920 uint8_t full[24*17];\
1921 uint8_t halfH[272];\
1922 uint8_t halfHV[256];\
1923 copy_block17(full, src, 24, stride, 17);\
1924 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1926 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1929 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1930 uint8_t full[24*17];\
1931 uint8_t halfH[272];\
1932 uint8_t halfV[256];\
1933 uint8_t halfHV[256];\
1934 copy_block17(full, src, 24, stride, 17);\
1935 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1936 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1937 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1938 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1940 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1941 uint8_t full[24*17];\
1942 uint8_t halfH[272];\
1943 uint8_t halfHV[256];\
1944 copy_block17(full, src, 24, stride, 17);\
1945 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1947 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1950 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1951 uint8_t halfH[272];\
1952 uint8_t halfHV[256];\
1953 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1954 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1955 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1957 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1958 uint8_t halfH[272];\
1959 uint8_t halfHV[256];\
1960 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1961 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1962 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1964 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1965 uint8_t full[24*17];\
1966 uint8_t halfH[272];\
1967 uint8_t halfV[256];\
1968 uint8_t halfHV[256];\
1969 copy_block17(full, src, 24, stride, 17);\
1970 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1971 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1972 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1973 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1975 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1976 uint8_t full[24*17];\
1977 uint8_t halfH[272];\
1978 copy_block17(full, src, 24, stride, 17);\
1979 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1980 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1981 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1983 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1984 uint8_t full[24*17];\
1985 uint8_t halfH[272];\
1986 uint8_t halfV[256];\
1987 uint8_t halfHV[256];\
1988 copy_block17(full, src, 24, stride, 17);\
1989 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1991 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1992 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1994 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1995 uint8_t full[24*17];\
1996 uint8_t halfH[272];\
1997 copy_block17(full, src, 24, stride, 17);\
1998 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1999 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2000 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2002 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2003 uint8_t halfH[272];\
2004 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2005 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store ops for QPEL_MC: the 32-weighted filter sum is scaled by >>5
 * with rounding (+16) or no-rounding (+15) and clipped through cm
 * (cropTbl); op_avg additionally averages with the existing dst.
 * NOTE(review): the matching #undef op_avg / #undef op_put lines are
 * elided from this excerpt. */
2008 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2009 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2010 #define op_put(a, b) a = cm[((b) + 16)>>5]
2011 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the put_, put_no_rnd_ and avg_ qpel function families. */
2013 QPEL_MC(0, put_       , _       , op_put)
2014 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2015 QPEL_MC(0, avg_       , _       , op_avg)
2016 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2018 #undef op_avg_no_rnd
2020 #undef op_put_no_rnd
/*
 * H264_LOWPASS(OPNAME, OP, OP2) expands to the H.264 quarter-pel
 * interpolation primitives for 4-, 8- and 16-wide blocks:
 *   *_h_lowpass  - horizontal 6-tap (1,-5,20,20,-5,1) filter,
 *   *_v_lowpass  - the same filter applied vertically,
 *   *_hv_lowpass - horizontal pass into the int16_t 'tmp' buffer, then a
 *                  vertical pass over tmp (OP2 writes with the wider >>10
 *                  normalisation since tmp holds un-clamped sums).
 * OP/OP2 are the put/avg write macros; cm is the clamping table.
 * NOTE(review): several macro lines (loop headers, closing braces) appear
 * elided in this listing — the visible arithmetic is kept byte-identical.
 */
2023 #define H264_LOWPASS(OPNAME, OP, OP2) \
/* 4-wide horizontal pass: taps centred between src[0]/src[1] etc. */\
2024 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2026 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2030 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2031 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2032 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2033 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
/* 4-wide vertical pass: reads two rows above and three below each output */\
2039 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2041 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2045 const int srcB= src[-2*srcStride];\
2046 const int srcA= src[-1*srcStride];\
2047 const int src0= src[0 *srcStride];\
2048 const int src1= src[1 *srcStride];\
2049 const int src2= src[2 *srcStride];\
2050 const int src3= src[3 *srcStride];\
2051 const int src4= src[4 *srcStride];\
2052 const int src5= src[5 *srcStride];\
2053 const int src6= src[6 *srcStride];\
2054 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2055 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2056 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2057 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
/* 4-wide 2-D pass: horizontal into tmp (h+5 rows for vertical support), */\
/* then vertical over tmp using OP2 (>>10 write). */\
2063 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2066 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2068 src -= 2*srcStride;\
2069 for(i=0; i<h+5; i++)\
2071 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2072 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2073 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2074 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2078 tmp -= tmpStride*(h+5-2);\
2081 const int tmpB= tmp[-2*tmpStride];\
2082 const int tmpA= tmp[-1*tmpStride];\
2083 const int tmp0= tmp[0 *tmpStride];\
2084 const int tmp1= tmp[1 *tmpStride];\
2085 const int tmp2= tmp[2 *tmpStride];\
2086 const int tmp3= tmp[3 *tmpStride];\
2087 const int tmp4= tmp[4 *tmpStride];\
2088 const int tmp5= tmp[5 *tmpStride];\
2089 const int tmp6= tmp[6 *tmpStride];\
2090 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2091 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2092 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2093 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
/* 8-wide horizontal pass (same taps, fully unrolled across the row) */\
2099 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2101 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2105 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2106 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2107 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2108 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2109 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2110 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2111 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2112 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
/* 8-wide vertical pass */\
2118 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2120 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2124 const int srcB= src[-2*srcStride];\
2125 const int srcA= src[-1*srcStride];\
2126 const int src0= src[0 *srcStride];\
2127 const int src1= src[1 *srcStride];\
2128 const int src2= src[2 *srcStride];\
2129 const int src3= src[3 *srcStride];\
2130 const int src4= src[4 *srcStride];\
2131 const int src5= src[5 *srcStride];\
2132 const int src6= src[6 *srcStride];\
2133 const int src7= src[7 *srcStride];\
2134 const int src8= src[8 *srcStride];\
2135 const int src9= src[9 *srcStride];\
2136 const int src10=src[10*srcStride];\
2137 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2138 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2139 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2140 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2141 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2142 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2143 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2144 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
/* 8-wide 2-D pass: horizontal into tmp, vertical over tmp with OP2 */\
2150 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2153 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2155 src -= 2*srcStride;\
2156 for(i=0; i<h+5; i++)\
2158 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2159 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2160 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2161 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2162 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2163 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2164 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2165 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2169 tmp -= tmpStride*(h+5-2);\
2172 const int tmpB= tmp[-2*tmpStride];\
2173 const int tmpA= tmp[-1*tmpStride];\
2174 const int tmp0= tmp[0 *tmpStride];\
2175 const int tmp1= tmp[1 *tmpStride];\
2176 const int tmp2= tmp[2 *tmpStride];\
2177 const int tmp3= tmp[3 *tmpStride];\
2178 const int tmp4= tmp[4 *tmpStride];\
2179 const int tmp5= tmp[5 *tmpStride];\
2180 const int tmp6= tmp[6 *tmpStride];\
2181 const int tmp7= tmp[7 *tmpStride];\
2182 const int tmp8= tmp[8 *tmpStride];\
2183 const int tmp9= tmp[9 *tmpStride];\
2184 const int tmp10=tmp[10*tmpStride];\
2185 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2186 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2187 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2188 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2189 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2190 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2191 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2192 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16-wide variants: compose four 8-wide calls over the 2x2 sub-blocks */\
2198 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2199 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2200 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2201 src += 8*srcStride;\
2202 dst += 8*dstStride;\
2203 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2204 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2207 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2208 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2209 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2210 src += 8*srcStride;\
2211 dst += 8*dstStride;\
2212 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2213 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2216 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2217 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2218 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2219 src += 8*srcStride;\
2220 dst += 8*dstStride;\
2221 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2222 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/*
 * H264_MC(OPNAME, SIZE) expands to the 16 quarter-pel motion-compensation
 * entry points (_mcXY_c, X/Y in 0..3 = quarter-pel offset in x/y) for one
 * block size.  Fractional positions are built from the *_lowpass half-pel
 * filters, averaged together with pixels*_l2 where the standard requires it.
 * 'full' holds a copy of the source with 2 rows of top context (full_mid
 * points past them) because the vertical filter reads src[-2*stride].
 * NOTE(review): some lines (closing braces) appear elided in this listing.
 */
2225 #define H264_MC(OPNAME, SIZE) \
2226 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2227 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
/* x=1/4: average of source and horizontal half-pel */\
2230 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2231 uint8_t half[SIZE*SIZE];\
2232 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2233 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2236 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2237 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
/* x=3/4: average of src+1 and horizontal half-pel */\
2240 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2241 uint8_t half[SIZE*SIZE];\
2242 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2243 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2246 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2247 uint8_t full[SIZE*(SIZE+5)];\
2248 uint8_t * const full_mid= full + SIZE*2;\
2249 uint8_t half[SIZE*SIZE];\
2250 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2251 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2252 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2255 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2256 uint8_t full[SIZE*(SIZE+5)];\
2257 uint8_t * const full_mid= full + SIZE*2;\
2258 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2259 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2262 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2263 uint8_t full[SIZE*(SIZE+5)];\
2264 uint8_t * const full_mid= full + SIZE*2;\
2265 uint8_t half[SIZE*SIZE];\
2266 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2267 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2268 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
/* diagonal quarter-pel positions: average of H and V half-pel planes */\
2271 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2272 uint8_t full[SIZE*(SIZE+5)];\
2273 uint8_t * const full_mid= full + SIZE*2;\
2274 uint8_t halfH[SIZE*SIZE];\
2275 uint8_t halfV[SIZE*SIZE];\
2276 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2277 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2278 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2279 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2282 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2283 uint8_t full[SIZE*(SIZE+5)];\
2284 uint8_t * const full_mid= full + SIZE*2;\
2285 uint8_t halfH[SIZE*SIZE];\
2286 uint8_t halfV[SIZE*SIZE];\
2287 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2288 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2289 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2290 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2293 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2294 uint8_t full[SIZE*(SIZE+5)];\
2295 uint8_t * const full_mid= full + SIZE*2;\
2296 uint8_t halfH[SIZE*SIZE];\
2297 uint8_t halfV[SIZE*SIZE];\
2298 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2299 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2300 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2301 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2304 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2305 uint8_t full[SIZE*(SIZE+5)];\
2306 uint8_t * const full_mid= full + SIZE*2;\
2307 uint8_t halfH[SIZE*SIZE];\
2308 uint8_t halfV[SIZE*SIZE];\
2309 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2310 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2311 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2312 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* centre position: full 2-D (hv) filter */\
2315 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2316 int16_t tmp[SIZE*(SIZE+5)];\
2317 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2320 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2321 int16_t tmp[SIZE*(SIZE+5)];\
2322 uint8_t halfH[SIZE*SIZE];\
2323 uint8_t halfHV[SIZE*SIZE];\
2324 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2325 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2326 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2329 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2330 int16_t tmp[SIZE*(SIZE+5)];\
2331 uint8_t halfH[SIZE*SIZE];\
2332 uint8_t halfHV[SIZE*SIZE];\
2333 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2334 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2335 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2338 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2339 uint8_t full[SIZE*(SIZE+5)];\
2340 uint8_t * const full_mid= full + SIZE*2;\
2341 int16_t tmp[SIZE*(SIZE+5)];\
2342 uint8_t halfV[SIZE*SIZE];\
2343 uint8_t halfHV[SIZE*SIZE];\
2344 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2345 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2346 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2347 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2350 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2351 uint8_t full[SIZE*(SIZE+5)];\
2352 uint8_t * const full_mid= full + SIZE*2;\
2353 int16_t tmp[SIZE*(SIZE+5)];\
2354 uint8_t halfV[SIZE*SIZE];\
2355 uint8_t halfHV[SIZE*SIZE];\
2356 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2357 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2358 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2359 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Write macros for the H.264 filters: op_* normalise single-pass sums
 * (>>5 with rounding), op2_* normalise two-pass sums (>>10), both clamped
 * through cm.  Instantiate the put and avg filter families. */
2362 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2363 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2364 #define op_put(a, b) a = cm[((b) + 16)>>5]
2365 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2366 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2368 H264_LOWPASS(put_ , op_put, op2_put)
2369 H264_LOWPASS(avg_ , op_avg, op2_avg)
/*
 * H264_WEIGHT(W,H) expands to the explicit weighted-prediction functions:
 *   weight_h264_pixels WxH _c   - unidirectional: block = (block*weight
 *                                 + rounding offset) >> log2_denom, clamped;
 *   biweight_h264_pixels WxH _c - bidirectional: combines src and dst with
 *                                 two weights and the averaged offsets.
 * The W==2/4/8 'continue' lines skip the wider unrolled column code (the
 * per-column op_scale lines appear elided in this listing).
 */
2383 #define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
2384 #define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2385 #define H264_WEIGHT(W,H) \
2386 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2387 int attribute_unused x, y; \
/* pre-scale offset and fold in round-to-nearest bias */ \
2388 offset <<= log2_denom; \
2389 if(log2_denom) offset += 1<<(log2_denom-1); \
2390 for(y=0; y<H; y++, block += stride){ \
2393 if(W==2) continue; \
2396 if(W==4) continue; \
2401 if(W==8) continue; \
2412 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
2413 int attribute_unused x, y; \
/* average the two offsets, then bake in the (log2_denom+1) rounding bias */ \
2414 int offset = (offsets + offsetd + 1) >> 1; \
2415 offset = ((offset << 1) + 1) << log2_denom; \
2416 for(y=0; y<H; y++, dst += stride, src += stride){ \
2419 if(W==2) continue; \
2422 if(W==4) continue; \
2427 if(W==8) continue; \
/*
 * WMV2/MSMPEG4 horizontal half-pel filter: 4-tap (-1,9,9,-1)/16 with
 * rounding, applied to an 8-wide row; cm clamps the result to 0..255.
 * 'h' rows are processed (the row loop and pointer advance appear elided
 * in this listing).
 */
2454 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2455 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2459 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2460 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2461 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2462 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2463 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2464 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2465 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2466 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/*
 * WMV2/MSMPEG4 vertical half-pel filter: the same (-1,9,9,-1)/16 taps
 * applied down a column; reads one row above (src_1) and up to 9 rows
 * below, writes 8 output rows.  'w' columns are processed (the column
 * loop appears elided in this listing).
 */
2472 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2473 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2477 const int src_1= src[ -srcStride];
2478 const int src0 = src[0 ];
2479 const int src1 = src[ srcStride];
2480 const int src2 = src[2*srcStride];
2481 const int src3 = src[3*srcStride];
2482 const int src4 = src[4*srcStride];
2483 const int src5 = src[5*srcStride];
2484 const int src6 = src[6*srcStride];
2485 const int src7 = src[7*srcStride];
2486 const int src8 = src[8*srcStride];
2487 const int src9 = src[9*srcStride];
2488 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2489 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2490 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2491 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2492 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2493 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2494 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2495 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/**
 * MSPEL full-pel (0,0) case: plain 8x8 block copy.
 * (Restored well-formed definition from the garbled listing: stray embedded
 * line numbers removed and the elided braces reinstated; logic unchanged.)
 */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
/* x=1/4 position: average of the source and the horizontal half-pel plane.
 * (The declaration of the 'half' buffer appears elided in this listing.) */
2505 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2507 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2508 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
/**
 * Horizontal half-pel (x=2, y=0): apply the (-1,9,9,-1)/16 filter directly
 * into dst.  (Restored well-formed definition from the garbled listing:
 * stray embedded line numbers removed, braces reinstated; logic unchanged.)
 */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/* x=3/4 position: average of src+1 and the horizontal half-pel plane.
 * (The declaration of the 'half' buffer appears elided in this listing.) */
2515 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2517 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2518 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
/**
 * Vertical half-pel (x=0, y=2): apply the (-1,9,9,-1)/16 filter directly
 * into dst.  (Restored well-formed definition from the garbled listing:
 * stray embedded line numbers removed, braces reinstated; logic unchanged.)
 */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/* (x=1,y=2): H filter over 11 rows (one above, two below for the V pass),
 * V filter on both the source and the H plane, then average.  (The halfH/
 * halfV/halfHV buffer declarations appear elided in this listing.) */
2525 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2529 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2530 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2531 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2532 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* (x=3,y=2): like mc12 but the vertical source plane starts at src+1.
 * (Local buffer declarations appear elided in this listing.) */
2534 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2538 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2539 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2540 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2541 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* Centre (x=2,y=2): H filter into halfH (11 rows), then V filter straight
 * into dst.  (The halfH buffer declaration appears elided in this listing.) */
2543 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2545 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2546 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/*
 * H.263 Annex J-style deblocking across a horizontal block edge (samples
 * p0..p3 run vertically).  'd' measures the edge step; d1 is the ramped
 * correction derived from the qscale-dependent strength table, applied to
 * p1/p2; d2 (clipped by ad1, whose computation appears elided here) softens
 * p0/p3.  The x loop header and clamping lines appear elided in this listing.
 */
2549 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2551 const int strength= ff_h263_loop_filter_strength[qscale];
2555 int p0= src[x-2*stride];
2556 int p1= src[x-1*stride];
2557 int p2= src[x+0*stride];
2558 int p3= src[x+1*stride];
2559 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* piecewise-linear ramp: full correction for small |d|, fading to 0 */
2561 if (d<-2*strength) d1= 0;
2562 else if(d<- strength) d1=-2*strength - d;
2563 else if(d< strength) d1= d;
2564 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clamp of p1/p2 to 0..255: works because after the +/-d1
 * update the values fit in 9 bits, so bit 8 flags under/overflow */
2569 if(p1&256) p1= ~(p1>>31);
2570 if(p2&256) p2= ~(p2>>31);
2572 src[x-1*stride] = p1;
2573 src[x+0*stride] = p2;
2577 d2= clip((p0-p3)/4, -ad1, ad1);
2579 src[x-2*stride] = p0 - d2;
2580 src[x+ stride] = p3 + d2;
/*
 * H.263 deblocking across a vertical block edge: identical arithmetic to
 * h263_v_loop_filter_c but p0..p3 are horizontal neighbours (offsets -2..+1
 * within a row).  The y loop header and clamping lines appear elided.
 */
2584 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2586 const int strength= ff_h263_loop_filter_strength[qscale];
2590 int p0= src[y*stride-2];
2591 int p1= src[y*stride-1];
2592 int p2= src[y*stride+0];
2593 int p3= src[y*stride+1];
2594 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* piecewise-linear ramp, as in the vertical variant */
2596 if (d<-2*strength) d1= 0;
2597 else if(d<- strength) d1=-2*strength - d;
2598 else if(d< strength) d1= d;
2599 else if(d< 2*strength) d1= 2*strength - d;
/* branchless 0..255 clamp via the 9th bit */
2604 if(p1&256) p1= ~(p1>>31);
2605 if(p2&256) p2= ~(p2>>31);
2607 src[y*stride-1] = p1;
2608 src[y*stride+0] = p2;
2612 d2= clip((p0-p3)/4, -ad1, ad1);
2614 src[y*stride-2] = p0 - d2;
2615 src[y*stride+1] = p3 + d2;
/*
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block via
 * the 'temp' work buffer.  Border rows are copied scaled by 4 so the final
 * (…+8)>>4 normalisation leaves them effectively unfiltered vertically.
 * Loop headers and several body lines appear elided in this listing.
 */
2619 static void h261_loop_filter_c(uint8_t *src, int stride){
2624 temp[x ] = 4*src[x ];
2625 temp[x + 7*8] = 4*src[x + 7*stride];
2629 xy = y * stride + x;
2631 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
/* first/last column written back with vertical-only filtering */
2636 src[ y*stride] = (temp[ y*8] + 2)>>2;
2637 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2639 xy = y * stride + x;
2641 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/*
 * H.264 normal-strength luma deblocking (spec section 8.7): filters 4
 * groups of 4 samples along one macroblock edge.  xstride steps across the
 * edge, ystride along it; alpha/beta are the edge/gradient thresholds and
 * tc0[i] the per-group clipping value.  Lines handling tc adjustment,
 * bS==4 skip and pointer advance appear elided in this listing.
 */
2646 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int *tc0)
2649 for( i = 0; i < 4; i++ ) {
2654 for( d = 0; d < 4; d++ ) {
2655 const int p0 = pix[-1*xstride];
2656 const int p1 = pix[-2*xstride];
2657 const int p2 = pix[-3*xstride];
2658 const int q0 = pix[0];
2659 const int q1 = pix[1*xstride];
2660 const int q2 = pix[2*xstride];
/* filter only across genuine block edges, not image detail */
2662 if( ABS( p0 - q0 ) < alpha &&
2663 ABS( p1 - p0 ) < beta &&
2664 ABS( q1 - q0 ) < beta ) {
/* optional p1/q1 correction when the second sample is also smooth */
2669 if( ABS( p2 - p0 ) < beta ) {
2670 pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2673 if( ABS( q2 - q0 ) < beta ) {
2674 pix[ xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2678 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2679 pix[-xstride] = clip_uint8( p0 + i_delta ); /* p0' */
2680 pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
/**
 * H.264 luma deblocking of a horizontal edge: samples across the edge are
 * 'stride' apart, neighbours along it are adjacent.  (Restored well-formed
 * wrapper from the garbled listing; logic unchanged.)
 */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/**
 * H.264 luma deblocking of a vertical edge: samples across the edge are
 * adjacent, neighbours along it are 'stride' apart.  (Restored well-formed
 * wrapper from the garbled listing; logic unchanged.)
 */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
/*
 * H.264 normal-strength chroma deblocking: like the luma filter but only 2
 * samples per group and only p0/q0 are corrected.  Lines for the bS==4
 * case and pointer advance appear elided in this listing.
 */
2695 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int *tc0)
2698 for( i = 0; i < 4; i++ ) {
2699 const int tc = tc0[i];
2704 for( d = 0; d < 2; d++ ) {
2705 const int p0 = pix[-1*xstride];
2706 const int p1 = pix[-2*xstride];
2707 const int q0 = pix[0];
2708 const int q1 = pix[1*xstride];
/* edge/gradient thresholds, as for luma */
2710 if( ABS( p0 - q0 ) < alpha &&
2711 ABS( p1 - p0 ) < beta &&
2712 ABS( q1 - q0 ) < beta ) {
2714 int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2716 pix[-xstride] = clip_uint8( p0 + delta ); /* p0' */
2717 pix[0] = clip_uint8( q0 - delta ); /* q0' */
/**
 * H.264 chroma deblocking of a horizontal edge (cross-edge step = stride).
 * (Restored well-formed wrapper from the garbled listing; logic unchanged.)
 */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/**
 * H.264 chroma deblocking of a vertical edge (cross-edge step = 1).
 * (Restored well-formed wrapper from the garbled listing; logic unchanged.)
 */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
/*
 * Sum of absolute differences (SAD) between two 16-wide blocks, one
 * unrolled row shown; the h-row loop, accumulator init, per-row pointer
 * advance and 'return s' appear elided in this listing.
 */
2732 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2738 s += abs(pix1[0] - pix2[0]);
2739 s += abs(pix1[1] - pix2[1]);
2740 s += abs(pix1[2] - pix2[2]);
2741 s += abs(pix1[3] - pix2[3]);
2742 s += abs(pix1[4] - pix2[4]);
2743 s += abs(pix1[5] - pix2[5]);
2744 s += abs(pix1[6] - pix2[6]);
2745 s += abs(pix1[7] - pix2[7]);
2746 s += abs(pix1[8] - pix2[8]);
2747 s += abs(pix1[9] - pix2[9]);
2748 s += abs(pix1[10] - pix2[10]);
2749 s += abs(pix1[11] - pix2[11]);
2750 s += abs(pix1[12] - pix2[12]);
2751 s += abs(pix1[13] - pix2[13]);
2752 s += abs(pix1[14] - pix2[14]);
2753 s += abs(pix1[15] - pix2[15]);
/*
 * SAD against the horizontally half-pel-interpolated reference: pix2 is
 * compared as the rounded average of each pixel and its right neighbour.
 * Row loop / return elided in this listing, as in pix_abs16_c.
 */
2760 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2766 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2767 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2768 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2769 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2770 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2771 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2772 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2773 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2774 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2775 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2776 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2777 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2778 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2779 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2780 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2781 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/*
 * SAD against the vertically half-pel-interpolated reference: pix3 tracks
 * the row below pix2 and each reference sample is avg2 of the two rows.
 * Row loop / return elided in this listing.
 */
2788 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2791 uint8_t *pix3 = pix2 + line_size;
2795 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2796 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2797 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2798 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2799 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2800 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2801 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2802 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2803 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2804 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2805 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2806 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2807 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2808 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2809 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2810 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/*
 * SAD against the diagonally half-pel-interpolated reference: each sample
 * is avg4 of the 2x2 neighbourhood spanning pix2/pix3.  Row loop / return
 * elided in this listing.
 */
2818 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2821 uint8_t *pix3 = pix2 + line_size;
2825 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2826 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2827 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2828 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2829 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2830 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2831 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2832 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2833 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2834 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2835 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2836 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2837 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2838 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2839 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2840 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD; same structure as pix_abs16_c (row loop / return elided). */
2848 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2854 s += abs(pix1[0] - pix2[0]);
2855 s += abs(pix1[1] - pix2[1]);
2856 s += abs(pix1[2] - pix2[2]);
2857 s += abs(pix1[3] - pix2[3]);
2858 s += abs(pix1[4] - pix2[4]);
2859 s += abs(pix1[5] - pix2[5]);
2860 s += abs(pix1[6] - pix2[6]);
2861 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD vs horizontal half-pel reference (row loop / return elided). */
2868 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2874 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2875 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2876 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2877 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2878 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2879 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2880 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2881 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD vs vertical half-pel reference (row loop / return elided). */
2888 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2891 uint8_t *pix3 = pix2 + line_size;
2895 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2896 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2897 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2898 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2899 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2900 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2901 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2902 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD vs diagonal half-pel reference (row loop / return elided). */
2910 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2913 uint8_t *pix3 = pix2 + line_size;
2917 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2918 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2919 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2920 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2921 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2922 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2923 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2924 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/*
 * Noise-preserving SSE, 16-wide: score1 is the plain sum of squared
 * differences, score2 compares the 2x2 gradient structure of the two
 * blocks so pure noise is penalised less than structural differences.
 * The combination weight comes from avctx->nsse_weight (8 when no context
 * is available).  Loop headers / pointer advances elided in this listing.
 */
2932 static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2938 for(x=0; x<16; x++){
2939 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2942 for(x=0; x<15; x++){
2943 score2+= ABS( s1[x ] - s1[x +stride]
2944 - s1[x+1] + s1[x+1+stride])
2945 -ABS( s2[x ] - s2[x +stride]
2946 - s2[x+1] + s2[x+1+stride]);
2953 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2954 else return score1 + ABS(score2)*8;
/* 8-wide variant of nsse16_c; same metric, loop headers elided here. */
2957 static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h){
2964 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
2968 score2+= ABS( s1[x ] - s1[x +stride]
2969 - s1[x+1] + s1[x+1+stride])
2970 -ABS( s2[x ] - s2[x +stride]
2971 - s2[x+1] + s2[x+1+stride]);
2978 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
2979 else return score1 + ABS(score2)*8;
/*
 * Rate-distortion trial for the trellis quantiser: evaluates the weighted
 * squared error that would remain if 'basis' scaled by 'scale' were added
 * to the residual 'rem'.  Sums are accumulated in fixed point (>>4 keeps
 * the weighted square in range).  Return statement elided in this listing.
 */
2982 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
2986 for(i=0; i<8*8; i++){
2987 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
2990 assert(-512<b && b<512);
2992 sum += (w*b)*(w*b)>>4;
/* Commits the trial evaluated by try_8x8basis_c: adds the rounded, scaled
 * basis vector into the residual block in place. */
2997 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3000 for(i=0; i<8*8; i++){
3001 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3006 * permutes an 8x8 block.
3007 * @param block the block which will be permuted according to the given permutation vector
3008 * @param permutation the permutation vector
3009 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3010 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3011 * (inverse) permutated to scantable order!
3013 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3019 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
/* first pass: stash the coefficients up to 'last' in scan order
 * (the 'temp' buffer copy statement appears elided in this listing) */
3021 for(i=0; i<=last; i++){
3022 const int j= scantable[i];
/* second pass: write each saved coefficient to its permuted position */
3027 for(i=0; i<=last; i++){
3028 const int j= scantable[i];
3029 const int perm_j= permutation[j];
3030 block[perm_j]= temp[j];
/**
 * Comparison function that always reports a perfect match (score 0); used
 * by ff_set_cmp() to disable a metric.  (Body restored from the garbled
 * listing, which truncated the function after its signature.)
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
/*
 * Fills the 5-entry 'cmp' function-pointer array from the DSPContext
 * according to the requested comparison 'type' (SAD, SATD/hadamard,
 * DCT-based, PSNR, ...).  Most switch cases appear elided in this listing;
 * unknown types fall through to the error log below.
 */
3038 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3041 memset(cmp, 0, sizeof(void*)*5);
3049 cmp[i]= c->hadamard8_diff[i];
3055 cmp[i]= c->dct_sad[i];
3058 cmp[i]= c->dct_max[i];
3061 cmp[i]= c->quant_psnr[i];
3088 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3094 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3096 static void clear_blocks_c(DCTELEM *blocks)
3098 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/*
 * dst[i] += src[i] for i in [0,w): 8-way unrolled main loop plus a scalar
 * remainder (the remainder loop header appears elided in this listing).
 * Byte addition wraps modulo 256, as HuffYUV prediction requires.
 */
3101 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3103 for(i=0; i+7<w; i+=8){
3104 dst[i+0] += src[i+0];
3105 dst[i+1] += src[i+1];
3106 dst[i+2] += src[i+2];
3107 dst[i+3] += src[i+3];
3108 dst[i+4] += src[i+4];
3109 dst[i+5] += src[i+5];
3110 dst[i+6] += src[i+6];
3111 dst[i+7] += src[i+7];
3114 dst[i+0] += src[i+0];
/*
 * dst[i] = src1[i] - src2[i] for i in [0,w): inverse of add_bytes_c, same
 * 8-way unrolling with an elided scalar-remainder loop header.
 */
3117 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3119 for(i=0; i+7<w; i+=8){
3120 dst[i+0] = src1[i+0]-src2[i+0];
3121 dst[i+1] = src1[i+1]-src2[i+1];
3122 dst[i+2] = src1[i+2]-src2[i+2];
3123 dst[i+3] = src1[i+3]-src2[i+3];
3124 dst[i+4] = src1[i+4]-src2[i+4];
3125 dst[i+5] = src1[i+5]-src2[i+5];
3126 dst[i+6] = src1[i+6]-src2[i+6];
3127 dst[i+7] = src1[i+7]-src2[i+7];
3130 dst[i+0] = src1[i+0]-src2[i+0];
/*
 * HuffYUV median prediction residual: for each sample the predictor is the
 * median of left (l), top (src1[i]) and left+top-topleft (lt), all mod 256.
 * Loop body, dst store and left/left_top updates appear elided here.
 */
3133 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3141 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Hadamard butterfly helpers (bodies of BUTTERFLY2/BUTTERFLY1 appear
 * elided in this listing): BUTTERFLY2 computes sum/difference of two
 * inputs into o1/o2, BUTTERFLY1 does it in place, and BUTTERFLYA returns
 * |x+y| + |x-y| for the final accumulation. */
3151 #define BUTTERFLY2(o1,o2,i1,i2) \
3155 #define BUTTERFLY1(x,y) \
3164 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/*
 * SATD: 8x8 Hadamard transform of the src-dst difference, summing absolute
 * transform coefficients.  First the row transform (three butterfly
 * stages over each row of 'temp'), then the column transform with the last
 * stage folded into the BUTTERFLYA accumulation.  Loop headers, 'sum'
 * handling and the return appear elided in this listing.
 */
3166 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3174 //FIXME try pointer walks
3175 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3176 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3177 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3178 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3180 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3181 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3182 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3183 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3185 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3186 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3187 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3188 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* column transform over the row-transformed coefficients */
3192 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3193 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3194 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3195 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3197 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3198 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3199 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3200 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3203 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3204 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3205 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3206 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* debug-only maximum tracking */
3212 printf("MAX:%d\n", maxi);
/* Intra SATD comparator: identical Hadamard structure to hadamard8_diff8x8_c
   but operating on the source pixels directly (dummy second operand), and
   subtracting the DC-ish term at the end so the score ignores the block mean. */
3218 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3226 //FIXME try pointer walks
/* horizontal transform of row i of the source pixels */
3227 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3228 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3229 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3230 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3232 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3233 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3234 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3235 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3237 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3238 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3239 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3240 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical transform of column i */
3244 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3245 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3246 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3247 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3249 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3250 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3251 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3252 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* final stage folded into the |coefficient| accumulation */
3255 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3256 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3257 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3258 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* remove the mean-dependent term so intra SATD is DC-invariant */
3261 sum -= ABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-SAD comparator: pixel difference -> forward DCT (elided) -> sum of
   absolute DCT coefficients (summation loop and return elided). */
3266 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3267 MpegEncContext * const s= (MpegEncContext *)c;
/* 8-byte aligned scratch for one 8x8 DCTELEM block */
3268 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3269 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3274 s->dsp.diff_pixels(temp, src1, src2, stride);
/* DCT-MAX comparator: pixel difference -> forward DCT (elided) -> score is
   the maximum absolute DCT coefficient (loop header and return elided). */
3283 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3284 MpegEncContext * const s= (MpegEncContext *)c;
/* 8-byte aligned scratch for one 8x8 DCTELEM block */
3285 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3286 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3291 s->dsp.diff_pixels(temp, src1, src2, stride);
3295 sum= FFMAX(sum, ABS(temp[i]));
/* Forward declaration kept local instead of a header — hence the FIXME. */
3300 void simple_idct(DCTELEM *block); //FIXME
/* Quantization-error comparator: transform the pixel difference, keep a copy
   (bak), quantize + dequantize + IDCT the working block, then score the sum
   of squared differences vs. the saved copy.  NOTE(review): the forward DCT
   call between diff_pixels and the memcpy is elided; whether the comparison
   happens in the transform or spatial domain cannot be fully confirmed here. */
3302 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3303 MpegEncContext * const s= (MpegEncContext *)c;
/* aligned scratch holding two 8x8 DCTELEM blocks back to back */
3304 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64*2/8];
3305 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3306 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3312 s->dsp.diff_pixels(temp, src1, src2, stride);
/* keep the pre-quantization coefficients for the error computation */
3314 memcpy(bak, temp, 64*sizeof(DCTELEM));
3316 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3317 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3318 simple_idct(temp); //FIXME
/* sum of squared per-element error (loop header and return elided) */
3321 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion comparator: actually encodes the 8x8 block (quantize +
   VLC bit counting), reconstructs it, and returns
   SSE-distortion + lambda-scaled bit cost.  Several loop headers, the
   intra/inter branch conditions and the escape handling are elided. */
3326 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3327 MpegEncContext * const s= (MpegEncContext *)c;
3328 const uint8_t *scantable= s->intra_scantable.permutated;
3329 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
/* NOTE(review): VLA sized by 'stride' (in uint64_t units) but only 8 rows of
   8 bytes are used below — oversized for large strides; confirm intent. */
3330 uint64_t __align8 aligned_bak[stride];
3331 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3332 uint8_t * const bak= (uint8_t*)aligned_bak;
3333 int i, last, run, bits, level, distoration, start_i;
3334 const int esc_length= s->ac_esc_length;
3336 uint8_t * last_length;
/* copy the 8x8 reference area row by row, two 32-bit words per row
   (NOTE(review): type-punned uint32_t* stores — strict-aliasing hazard) */
3341 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3342 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3345 s->dsp.diff_pixels(temp, src1, src2, stride);
3347 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: intra AC tables + luma DC cost (branch condition elided) */
3353 length = s->intra_ac_vlc_length;
3354 last_length= s->intra_ac_vlc_last_length;
3355 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter path */
3358 length = s->inter_ac_vlc_length;
3359 last_length= s->inter_ac_vlc_last_length;
/* count bits of all AC coefficients except the last nonzero one */
3364 for(i=start_i; i<last; i++){
3365 int j= scantable[i];
/* level is biased by +64; in-table range is 0..127, else escape cost */
3370 if((level&(~127)) == 0){
3371 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* last nonzero coefficient uses the "last" table */
3380 level= temp[i] + 64;
3384 if((level&(~127)) == 0){
3385 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* reconstruct: dequantize (intra/inter per elided condition), IDCT-add */
3393 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3395 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3398 s->dsp.idct_add(bak, stride, temp);
/* 8x8 SSE between reconstruction and original (sic: "distoration") */
3400 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
/* lambda ~ qscale^2 * 109/128 scales the rate term */
3402 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-cost comparator: same quantize + VLC bit-counting as rd8x8_c but
   without reconstruction/distortion — score is the bit count alone
   (the final return is elided).  Branch conditions and escape paths elided. */
3405 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3406 MpegEncContext * const s= (MpegEncContext *)c;
3407 const uint8_t *scantable= s->intra_scantable.permutated;
3408 uint64_t __align8 aligned_temp[sizeof(DCTELEM)*64/8];
3409 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3410 int i, last, run, bits, level, start_i;
3411 const int esc_length= s->ac_esc_length;
3413 uint8_t * last_length;
3417 s->dsp.diff_pixels(temp, src1, src2, stride);
3419 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: intra AC tables + luma DC cost (branch condition elided) */
3425 length = s->intra_ac_vlc_length;
3426 last_length= s->intra_ac_vlc_last_length;
3427 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter path */
3430 length = s->inter_ac_vlc_length;
3431 last_length= s->inter_ac_vlc_last_length;
/* count bits of all AC coefficients except the last nonzero one */
3436 for(i=start_i; i<last; i++){
3437 int j= scantable[i];
/* level biased by +64; in-table range is 0..127, else escape cost */
3442 if((level&(~127)) == 0){
3443 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* last nonzero coefficient uses the "last" table */
3452 level= temp[i] + 64;
3456 if((level&(~127)) == 0){
3457 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-gradient SAD within one picture: sums |s[x] - s[x+stride]| over a
   16-wide block (inner loop 4x unrolled; outer row loop and return elided). */
3465 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3470 for(x=0; x<16; x+=4){
3471 score+= ABS(s[x ] - s[x +stride]) + ABS(s[x+1] - s[x+1+stride])
3472 +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
/* Vertical SAD between two pictures: sums the absolute difference of the
   per-column vertical gradients of s1 and s2 (outer loop/return elided). */
3480 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3485 for(x=0; x<16; x++){
3486 score+= ABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Square helper for the vsse metrics below. */
3495 #define SQ(a) ((a)*(a))
/* Vertical-gradient SSE within one picture: squared version of
   vsad_intra16_c (outer row loop and return elided). */
3496 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3501 for(x=0; x<16; x+=4){
3502 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3503 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* Vertical SSE between two pictures: squared version of vsad16_c
   (outer row loop and return elided). */
3511 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3516 for(x=0; x<16; x++){
3517 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Build 16x16 comparators from the 8x8 ones.  WARPER8_16_SQ is defined
   elsewhere in this file — presumably it sums the 8x8 function over the
   four 8x8 quadrants of a 16x16 block; TODO confirm against the macro. */
3526 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3527 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3528 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3529 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3530 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3531 WARPER8_16_SQ(rd8x8_c, rd16_c)
3532 WARPER8_16_SQ(bit8x8_c, bit16_c)
3534 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* Full 8x8 jrevdct wrapper: run the IDCT (call elided — presumably
   j_rev_dct(block)) and store the clamped result. */
3536 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3539 put_pixels_clamped_c(block, dest, line_size);
/* Full 8x8 jrevdct wrapper: IDCT (call elided) then add-with-clamp into dest. */
3541 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3544 add_pixels_clamped_c(block, dest, line_size);
/* 4x4 lowres (lowres==1) IDCT wrapper: IDCT call elided, clamped store. */
3547 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3550 put_pixels_clamped4_c(block, dest, line_size);
/* 4x4 lowres IDCT wrapper: IDCT call elided, add-with-clamp into dest. */
3552 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3555 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 lowres (lowres==2) IDCT wrapper: IDCT call elided, clamped store. */
3558 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3561 put_pixels_clamped2_c(block, dest, line_size);
/* 2x2 lowres IDCT wrapper: IDCT call elided, add-with-clamp into dest. */
3563 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3566 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 lowres (lowres==3) "IDCT": only the DC coefficient survives — scale by
   1/8 with rounding (+4 >> 3) and clip through the cropTbl LUT. */
3569 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3571 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3573 dest[0] = cm[(block[0] + 4)>>3];
/* 1x1 lowres "IDCT" add variant: add the rounded, scaled DC to the existing
   pixel and clip through the cropTbl LUT. */
3575 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3577 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3579 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3582 /* init static data */
/* One-time initialization of the file-scope lookup tables declared above. */
3583 void dsputil_static_init(void)
/* cropTbl: clipping LUT — identity for 0..255 in the middle, with
   MAX_NEG_CROP guard bands clamped to 0 (below, elided) and 255 (above). */
3587 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3588 for(i=0;i<MAX_NEG_CROP;i++) {
3590 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* squareTbl[i] = (i-256)^2, i.e. squared value indexed by difference+256 */
3593 for(i=0;i<512;i++) {
3594 squareTbl[i] = (i - 256) * (i - 256);
/* inverse zigzag for the MMX quantizer, stored +1 (see declaration above) */
3597 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3601 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3605 #ifdef CONFIG_ENCODERS
3606 if(avctx->dct_algo==FF_DCT_FASTINT) {
3607 c->fdct = fdct_ifast;
3608 c->fdct248 = fdct_ifast248;
3610 else if(avctx->dct_algo==FF_DCT_FAAN) {
3611 c->fdct = ff_faandct;
3612 c->fdct248 = ff_faandct248;
3615 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3616 c->fdct248 = ff_fdct248_islow;
3618 #endif //CONFIG_ENCODERS
3620 if(avctx->lowres==1){
3621 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3622 c->idct_put= ff_jref_idct4_put;
3623 c->idct_add= ff_jref_idct4_add;
3625 c->idct_put= ff_h264_lowres_idct_put_c;
3626 c->idct_add= ff_h264_lowres_idct_add_c;
3628 c->idct = j_rev_dct4;
3629 c->idct_permutation_type= FF_NO_IDCT_PERM;
3630 }else if(avctx->lowres==2){
3631 c->idct_put= ff_jref_idct2_put;
3632 c->idct_add= ff_jref_idct2_add;
3633 c->idct = j_rev_dct2;
3634 c->idct_permutation_type= FF_NO_IDCT_PERM;
3635 }else if(avctx->lowres==3){
3636 c->idct_put= ff_jref_idct1_put;
3637 c->idct_add= ff_jref_idct1_add;
3638 c->idct = j_rev_dct1;
3639 c->idct_permutation_type= FF_NO_IDCT_PERM;
3641 if(avctx->idct_algo==FF_IDCT_INT){
3642 c->idct_put= ff_jref_idct_put;
3643 c->idct_add= ff_jref_idct_add;
3644 c->idct = j_rev_dct;
3645 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3646 }else{ //accurate/default
3647 c->idct_put= simple_idct_put;
3648 c->idct_add= simple_idct_add;
3649 c->idct = simple_idct;
3650 c->idct_permutation_type= FF_NO_IDCT_PERM;
3654 c->h264_idct_add= ff_h264_idct_add_c;
3656 /* VP3 DSP support */
3657 c->vp3_dsp_init = vp3_dsp_init_c;
3658 c->vp3_idct = vp3_idct_c;
3660 c->get_pixels = get_pixels_c;
3661 c->diff_pixels = diff_pixels_c;
3662 c->put_pixels_clamped = put_pixels_clamped_c;
3663 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3664 c->add_pixels_clamped = add_pixels_clamped_c;
3667 c->clear_blocks = clear_blocks_c;
3668 c->pix_sum = pix_sum_c;
3669 c->pix_norm1 = pix_norm1_c;
3671 /* TODO [0] 16 [1] 8 */
3672 c->pix_abs[0][0] = pix_abs16_c;
3673 c->pix_abs[0][1] = pix_abs16_x2_c;
3674 c->pix_abs[0][2] = pix_abs16_y2_c;
3675 c->pix_abs[0][3] = pix_abs16_xy2_c;
3676 c->pix_abs[1][0] = pix_abs8_c;
3677 c->pix_abs[1][1] = pix_abs8_x2_c;
3678 c->pix_abs[1][2] = pix_abs8_y2_c;
3679 c->pix_abs[1][3] = pix_abs8_xy2_c;
3681 #define dspfunc(PFX, IDX, NUM) \
3682 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3683 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3684 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3685 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3687 dspfunc(put, 0, 16);
3688 dspfunc(put_no_rnd, 0, 16);
3690 dspfunc(put_no_rnd, 1, 8);
3694 dspfunc(avg, 0, 16);
3695 dspfunc(avg_no_rnd, 0, 16);
3697 dspfunc(avg_no_rnd, 1, 8);
3702 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3703 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3705 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3706 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3707 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3708 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3709 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3710 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3711 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3712 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3713 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3715 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3716 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3717 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3718 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3719 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3720 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3721 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3722 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3723 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3725 #define dspfunc(PFX, IDX, NUM) \
3726 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3727 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3728 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3729 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3730 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3731 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3732 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3733 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3734 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3735 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3736 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3737 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3738 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3739 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3740 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3741 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3743 dspfunc(put_qpel, 0, 16);
3744 dspfunc(put_no_rnd_qpel, 0, 16);
3746 dspfunc(avg_qpel, 0, 16);
3747 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3749 dspfunc(put_qpel, 1, 8);
3750 dspfunc(put_no_rnd_qpel, 1, 8);
3752 dspfunc(avg_qpel, 1, 8);
3753 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3755 dspfunc(put_h264_qpel, 0, 16);
3756 dspfunc(put_h264_qpel, 1, 8);
3757 dspfunc(put_h264_qpel, 2, 4);
3758 dspfunc(avg_h264_qpel, 0, 16);
3759 dspfunc(avg_h264_qpel, 1, 8);
3760 dspfunc(avg_h264_qpel, 2, 4);
3763 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3764 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3765 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3766 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3767 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3768 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3770 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3771 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3772 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3773 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3774 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3775 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3776 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3777 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3778 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3779 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3780 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3781 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3782 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3783 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3784 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3785 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3786 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3787 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3788 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3789 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3791 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3792 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3793 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3794 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3795 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3796 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3797 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3798 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3800 #define SET_CMP_FUNC(name) \
3801 c->name[0]= name ## 16_c;\
3802 c->name[1]= name ## 8x8_c;
3804 SET_CMP_FUNC(hadamard8_diff)
3805 c->hadamard8_diff[4]= hadamard8_intra16_c;
3806 SET_CMP_FUNC(dct_sad)
3807 SET_CMP_FUNC(dct_max)
3808 c->sad[0]= pix_abs16_c;
3809 c->sad[1]= pix_abs8_c;
3813 SET_CMP_FUNC(quant_psnr)
3816 c->vsad[0]= vsad16_c;
3817 c->vsad[4]= vsad_intra16_c;
3818 c->vsse[0]= vsse16_c;
3819 c->vsse[4]= vsse_intra16_c;
3820 c->nsse[0]= nsse16_c;
3821 c->nsse[1]= nsse8_c;
3822 c->w53[0]= w53_16_c;
3824 c->w97[0]= w97_16_c;
3827 c->add_bytes= add_bytes_c;
3828 c->diff_bytes= diff_bytes_c;
3829 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3830 c->bswap_buf= bswap_buf;
3832 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
3833 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
3834 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
3835 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
3837 c->h263_h_loop_filter= h263_h_loop_filter_c;
3838 c->h263_v_loop_filter= h263_v_loop_filter_c;
3840 c->h261_loop_filter= h261_loop_filter_c;
3842 c->try_8x8basis= try_8x8basis_c;
3843 c->add_8x8basis= add_8x8basis_c;
3846 dsputil_init_mmx(c, avctx);
3849 dsputil_init_armv4l(c, avctx);
3852 dsputil_init_mlib(c, avctx);
3855 dsputil_init_vis(c,avctx);
3858 dsputil_init_alpha(c, avctx);
3861 dsputil_init_ppc(c, avctx);
3864 dsputil_init_mmi(c, avctx);
3867 dsputil_init_sh4(c,avctx);
3870 switch(c->idct_permutation_type){
3871 case FF_NO_IDCT_PERM:
3873 c->idct_permutation[i]= i;
3875 case FF_LIBMPEG2_IDCT_PERM:
3877 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3879 case FF_SIMPLE_IDCT_PERM:
3881 c->idct_permutation[i]= simple_mmx_permutation[i];
3883 case FF_TRANSPOSE_IDCT_PERM:
3885 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3888 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");