3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
/* forward spatial wavelet transform, implemented in snow.c (used by the w53/w97 cmp functions below) */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* clip-to-byte lookup table, indexed with an offset of MAX_NEG_CROP;
 * zero here — presumably filled at init time, TODO confirm */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* squareTbl[256+x] == x*x; zero here — presumably filled at init time, TODO confirm */
uint32_t squareTbl[512] = {0, };
/* default (progressive) zigzag scan order: maps scan position -> raster index
 * within an 8x8 block. Restored missing "};" terminator lost in extraction. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
52 /* Specific zigzag scan for 248 idct. NOTE that unlike the
53 specification, we interleave the fields */
/* zigzag scan for the 2-4-8 DCT with interleaved fields (see comment above).
 * Restored missing "};" terminator lost in extraction. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permuted inverse of zigzag_direct, each entry offset by +1, for the MMX
 * quantizer; zero here — presumably filled at init time, TODO confirm */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* alternate horizontal scan order (interlaced material).
 * Restored missing "};" terminator lost in extraction. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* alternate vertical scan order (interlaced material).
 * Restored missing "};" terminator lost in extraction. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
90 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* fixed-point reciprocals: inverse[b] = ceil(2^32 / b), so that
 * (a * inverse[b]) >> 32 == a/b for 0<=a<=65536 and 2<=b<=255.
 * Restored missing "};" terminator lost in extraction. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for the simple_idct_mmx: coefficient index -> index in the
 * layout the MMX IDCT expects. Restored missing "};" terminator lost in
 * extraction. */
static const uint8_t simple_mmx_permutation[64]={
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/**
 * Sum of all 256 pixels of a 16x16 block.
 * @param pix       top-left of the block
 * @param line_size stride between rows, in bytes
 * Restored body lines (declarations, inner sums, closers) lost in extraction.
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
160 static int pix_norm1_c(uint8_t * pix, int line_size)
163 uint32_t *sq = squareTbl + 256;
166 for (i = 0; i < 16; i++) {
167 for (j = 0; j < 16; j += 8) {
178 #if LONG_MAX > 2147483647
179 register uint64_t x=*(uint64_t*)pix;
181 s += sq[(x>>8)&0xff];
182 s += sq[(x>>16)&0xff];
183 s += sq[(x>>24)&0xff];
184 s += sq[(x>>32)&0xff];
185 s += sq[(x>>40)&0xff];
186 s += sq[(x>>48)&0xff];
187 s += sq[(x>>56)&0xff];
189 register uint32_t x=*(uint32_t*)pix;
191 s += sq[(x>>8)&0xff];
192 s += sq[(x>>16)&0xff];
193 s += sq[(x>>24)&0xff];
194 x=*(uint32_t*)(pix+4);
196 s += sq[(x>>8)&0xff];
197 s += sq[(x>>16)&0xff];
198 s += sq[(x>>24)&0xff];
203 pix += line_size - 16;
/**
 * Byte-swap w 32-bit words from src into dst (may alias), 8 words per
 * iteration with a scalar tail. Restored loop structure lost in extraction.
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
226 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
229 uint32_t *sq = squareTbl + 256;
232 for (i = 0; i < h; i++) {
233 s += sq[pix1[0] - pix2[0]];
234 s += sq[pix1[1] - pix2[1]];
235 s += sq[pix1[2] - pix2[2]];
236 s += sq[pix1[3] - pix2[3]];
243 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
246 uint32_t *sq = squareTbl + 256;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[0] - pix2[0]];
251 s += sq[pix1[1] - pix2[1]];
252 s += sq[pix1[2] - pix2[2]];
253 s += sq[pix1[3] - pix2[3]];
254 s += sq[pix1[4] - pix2[4]];
255 s += sq[pix1[5] - pix2[5]];
256 s += sq[pix1[6] - pix2[6]];
257 s += sq[pix1[7] - pix2[7]];
264 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
267 uint32_t *sq = squareTbl + 256;
270 for (i = 0; i < h; i++) {
271 s += sq[pix1[ 0] - pix2[ 0]];
272 s += sq[pix1[ 1] - pix2[ 1]];
273 s += sq[pix1[ 2] - pix2[ 2]];
274 s += sq[pix1[ 3] - pix2[ 3]];
275 s += sq[pix1[ 4] - pix2[ 4]];
276 s += sq[pix1[ 5] - pix2[ 5]];
277 s += sq[pix1[ 6] - pix2[ 6]];
278 s += sq[pix1[ 7] - pix2[ 7]];
279 s += sq[pix1[ 8] - pix2[ 8]];
280 s += sq[pix1[ 9] - pix2[ 9]];
281 s += sq[pix1[10] - pix2[10]];
282 s += sq[pix1[11] - pix2[11]];
283 s += sq[pix1[12] - pix2[12]];
284 s += sq[pix1[13] - pix2[13]];
285 s += sq[pix1[14] - pix2[14]];
286 s += sq[pix1[15] - pix2[15]];
/* Wavelet-domain comparison (snow): forward-DWT the pix1-pix2 difference and
 * sum weighted absolute coefficients per subband.
 * NOTE(review): this extraction is missing many lines of this function
 * (variable declarations, most rows of the scale[] table, loop/brace closers,
 * the return); restore the complete function from upstream before compiling. */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
#ifdef CONFIG_SNOW_ENCODER //idwt is in snow.c
    const int dec_count= w==8 ? 3 : 4;
    /* weights indexed [type][dec_count-3][level][orientation]; only the first
     * row of each group survived extraction */
    static const int scale[2][2][4][4]={
        {268, 239, 239, 213},
        {344, 310, 310, 280},
        {275, 245, 245, 218},
        {352, 317, 317, 286},
    /* load the difference, scaled up by 16, into the 16-wide tmp[] buffer */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[16*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[16*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[16*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[16*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
    /* in-place forward wavelet transform (implemented in snow.c) */
    ff_spatial_dwt(tmp, w, h, 16, type, dec_count);
    /* weighted sum of |coefficient| per decomposition level / orientation */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int sx= (ori&1) ? 1<<level: 0;
            int stride= 16<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;
            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
    /* plain sum of absolute transformed coefficients */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            s+= ABS(tmp[16*i+j+0]);
            s+= ABS(tmp[16*i+j+1]);
            s+= ABS(tmp[16*i+j+2]);
            s+= ABS(tmp[16*i+j+3]);
/* 5/3 wavelet cmp on an 8-wide block; restored missing closing brace. */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
/* 9/7 wavelet cmp on an 8-wide block; restored missing closing brace. */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
/* 5/3 wavelet cmp on a 16-wide block; restored missing closing brace. */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
/* 9/7 wavelet cmp on a 16-wide block; restored missing closing brace. */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
394 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
398 /* read the pixels */
400 block[0] = pixels[0];
401 block[1] = pixels[1];
402 block[2] = pixels[2];
403 block[3] = pixels[3];
404 block[4] = pixels[4];
405 block[5] = pixels[5];
406 block[6] = pixels[6];
407 block[7] = pixels[7];
413 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
414 const uint8_t *s2, int stride){
417 /* read the pixels */
419 block[0] = s1[0] - s2[0];
420 block[1] = s1[1] - s2[1];
421 block[2] = s1[2] - s2[2];
422 block[3] = s1[3] - s2[3];
423 block[4] = s1[4] - s2[4];
424 block[5] = s1[5] - s2[5];
425 block[6] = s1[6] - s2[6];
426 block[7] = s1[7] - s2[7];
434 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
438 uint8_t *cm = cropTbl + MAX_NEG_CROP;
440 /* read the pixels */
442 pixels[0] = cm[block[0]];
443 pixels[1] = cm[block[1]];
444 pixels[2] = cm[block[2]];
445 pixels[3] = cm[block[3]];
446 pixels[4] = cm[block[4]];
447 pixels[5] = cm[block[5]];
448 pixels[6] = cm[block[6]];
449 pixels[7] = cm[block[7]];
456 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
460 uint8_t *cm = cropTbl + MAX_NEG_CROP;
462 /* read the pixels */
464 pixels[0] = cm[block[0]];
465 pixels[1] = cm[block[1]];
466 pixels[2] = cm[block[2]];
467 pixels[3] = cm[block[3]];
474 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
478 uint8_t *cm = cropTbl + MAX_NEG_CROP;
480 /* read the pixels */
482 pixels[0] = cm[block[0]];
483 pixels[1] = cm[block[1]];
490 static void put_signed_pixels_clamped_c(const DCTELEM *block,
491 uint8_t *restrict pixels,
496 for (i = 0; i < 8; i++) {
497 for (j = 0; j < 8; j++) {
500 else if (*block > 127)
503 *pixels = (uint8_t)(*block + 128);
507 pixels += (line_size - 8);
511 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
515 uint8_t *cm = cropTbl + MAX_NEG_CROP;
517 /* read the pixels */
519 pixels[0] = cm[pixels[0] + block[0]];
520 pixels[1] = cm[pixels[1] + block[1]];
521 pixels[2] = cm[pixels[2] + block[2]];
522 pixels[3] = cm[pixels[3] + block[3]];
523 pixels[4] = cm[pixels[4] + block[4]];
524 pixels[5] = cm[pixels[5] + block[5]];
525 pixels[6] = cm[pixels[6] + block[6]];
526 pixels[7] = cm[pixels[7] + block[7]];
532 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
536 uint8_t *cm = cropTbl + MAX_NEG_CROP;
538 /* read the pixels */
540 pixels[0] = cm[pixels[0] + block[0]];
541 pixels[1] = cm[pixels[1] + block[1]];
542 pixels[2] = cm[pixels[2] + block[2]];
543 pixels[3] = cm[pixels[3] + block[3]];
549 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
553 uint8_t *cm = cropTbl + MAX_NEG_CROP;
555 /* read the pixels */
557 pixels[0] = cm[pixels[0] + block[0]];
558 pixels[1] = cm[pixels[1] + block[1]];
564 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
568 pixels[0] += block[0];
569 pixels[1] += block[1];
570 pixels[2] += block[2];
571 pixels[3] += block[3];
572 pixels[4] += block[4];
573 pixels[5] += block[5];
574 pixels[6] += block[6];
575 pixels[7] += block[7];
581 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
585 pixels[0] += block[0];
586 pixels[1] += block[1];
587 pixels[2] += block[2];
588 pixels[3] += block[3];
/*
 * 64-bit scalar put/avg halfpel primitives (full-, x2-, y2- and xy2-pel),
 * generated by token pasting; OP is bound to op_put or op_avg below.
 * Each variant processes 8 pixels per uint64_t load.
 * NOTE(review): this extraction is missing lines of this macro (function
 * braces, loop headers, pointer advances); restore from upstream before use.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    OP(*((uint64_t*)block), LD64(pixels));\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    const uint64_t a= LD64(pixels );\
    const uint64_t b= LD64(pixels+1);\
    OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    const uint64_t a= LD64(pixels );\
    const uint64_t b= LD64(pixels+1);\
    OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    const uint64_t a= LD64(pixels );\
    const uint64_t b= LD64(pixels+line_size);\
    OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    const uint64_t a= LD64(pixels );\
    const uint64_t b= LD64(pixels+line_size);\
    OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    const uint64_t a= LD64(pixels );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0= (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels );\
        uint64_t b= LD64(pixels+1);\
        l1= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        l0= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL)\
          + 0x0202020202020202ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    const uint64_t a= LD64(pixels );\
    const uint64_t b= LD64(pixels+1);\
    uint64_t l0= (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
    uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
    for(i=0; i<h; i+=2){\
        uint64_t a= LD64(pixels );\
        uint64_t b= LD64(pixels+1);\
        l1= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL);\
        h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        l0= (a&0x0303030303030303ULL)\
          + (b&0x0303030303030303ULL)\
          + 0x0101010101010101ULL;\
        h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
          + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

/* rounding average of 8 packed bytes: (a|b) - ((a^b)&0xFE..)>>1 == (a+b+1)>>1 per byte */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
/*
 * 32-bit scalar put/avg pixel primitives: plain copies/averages plus l2/l4
 * multi-source averagers and xy2 (quarter-sample) interpolators, generated
 * by token pasting; OP is bound to op_put or op_avg below.
 * NOTE(review): this extraction is missing lines of this macro (function
 * braces, loop headers, several "+ (b&0x03030303UL)" terms, pointer
 * advances); restore from upstream before use.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
        OP(*((uint16_t*)(block )), LD16(pixels ));\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
        OP(*((uint32_t*)(block )), LD32(pixels ));\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
        OP(*((uint32_t*)(block )), LD32(pixels ));\
        OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
        a= LD32(&src1[i*src_stride1 ]);\
        b= LD32(&src2[i*src_stride2 ]);\
        OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
        a= LD32(&src1[i*src_stride1 ]);\
        b= LD32(&src2[i*src_stride2 ]);\
        OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
        a= LD32(&src1[i*src_stride1 ]);\
        b= LD32(&src2[i*src_stride2 ]);\
        OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
        a= LD16(&src1[i*src_stride1 ]);\
        b= LD16(&src2[i*src_stride2 ]);\
        OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0= (a&0x03030303UL)\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0= (a&0x03030303UL)\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= LD32(&src1[i*src_stride1]);\
        b= LD32(&src2[i*src_stride2]);\
        c= LD32(&src3[i*src_stride3]);\
        d= LD32(&src4[i*src_stride4]);\
        l0= (a&0x03030303UL)\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= LD32(&src1[i*src_stride1+4]);\
        b= LD32(&src2[i*src_stride2+4]);\
        c= LD32(&src3[i*src_stride3+4]);\
        d= LD32(&src4[i*src_stride4+4]);\
        l0= (a&0x03030303UL)\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    int i, a0, b0, a1, b1;\
    for(i=0; i<h; i+=2){\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    const uint32_t a= LD32(pixels );\
    const uint32_t b= LD32(pixels+1);\
    uint32_t l0= (a&0x03030303UL)\
    uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
               + ((b&0xFCFCFCFCUL)>>2);\
    for(i=0; i<h; i+=2){\
        uint32_t a= LD32(pixels );\
        uint32_t b= LD32(pixels+1);\
        l1= (a&0x03030303UL)\
        h1= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        l0= (a&0x03030303UL)\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    for(j=0; j<2; j++){\
        const uint32_t a= LD32(pixels );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels );\
            uint32_t b= LD32(pixels+1);\
            l1= (a&0x03030303UL)\
              + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            l0= (a&0x03030303UL)\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    for(j=0; j<2; j++){\
        const uint32_t a= LD32(pixels );\
        const uint32_t b= LD32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        for(i=0; i<h; i+=2){\
            uint32_t a= LD32(pixels );\
            uint32_t b= LD32(pixels+1);\
            l1= (a&0x03030303UL)\
              + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            l0= (a&0x03030303UL)\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\

/* per-byte rounding average; rnd_avg32 computes (a+b+1)>>1 on 4 packed bytes */
#define op_avg(a, b) a = rnd_avg32(a, b)
/* plain store, used to instantiate the put_* variants */
#define op_put(a, b) a = b
/* scalar rounding averages used by the tpel (third-pel) interpolators below */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* single-stride wrapper around the macro-generated 16-wide no-rounding
 * averager; restored missing closing brace. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
/* single-stride wrapper around the macro-generated 8-wide no-rounding
 * averager; restored missing closing brace. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/**
 * 1/16-pel bilinear motion compensation for an 8-pixel-wide block:
 * dst[x] = (A*tl + B*tr + C*bl + D*br + rounder) >> 8, with the four
 * weights derived from the fractional position (x16, y16) in [0,16].
 * Restored loop, pointer advances and closers lost in extraction.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/**
 * Global motion compensation for an 8-wide block: (ox,oy) is the start
 * position and dxx/dxy/dyx/dyy the per-pixel motion increments, all in
 * 1/(1<<shift) sample units; r is the rounding term, width/height bound src.
 * NOTE(review): this extraction is missing lines (outer y loop, vx/vy
 * updates, the rounding ">>(shift*2)" tails of the dst[] expressions and
 * the brace closers); restore the complete function from upstream.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
    const int s= 1<<shift;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: bilinear interpolation of 4 samples */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
                                         + src[index +1]* frac_x )*(s-frac_y)
                                       + ( src[index+stride ]*(s-frac_x)
                                         + src[index+stride+1]* frac_x )* frac_y
                    /* y outside: clip the row, interpolate horizontally only */
                    index= src_x + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
                                         + src[index +1]* frac_x )*s
                /* x outside: clip the column, interpolate vertically only */
                if((unsigned)src_y < height){
                    index= clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
                                         + src[index+stride ]* frac_y )*s
                    /* both outside: nearest clipped sample */
                    index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index ];
/* tpel full-pel copy: dispatch on block width to the plain copy primitives.
 * Restored switch skeleton and closers lost in extraction. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/* tpel 1/3-pel horizontal: (2a+b)/3 rounded, via 683 ~= 2048/3.
 * Restored declarations, row advances and closers lost in extraction. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* tpel 2/3-pel horizontal: (a+2b)/3 rounded.
 * Restored declarations, row advances and closers lost in extraction. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* tpel 1/3-pel vertical: (2a+b)/3 rounded, b taken one row below.
 * Restored declarations, row advances and closers lost in extraction. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
// Thirdpel MC, (1/3,1/3) phase: bilinear blend with weights 4/3/3/2 over 12,
// computed as (2731*(...) + rounding 6) >> 15 (2731/32768 ~ 1/12).
1247 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1249 for (i=0; i < height; i++) {
1250 for (j=0; j < width; j++) {
1251 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
// Thirdpel MC, (1/3,2/3) phase: weights 3/2/4/3 over 12.
1258 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1260 for (i=0; i < height; i++) {
1261 for (j=0; j < width; j++) {
1262 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
// Thirdpel MC, vertical 2/3 phase: dst = (top + 2*bottom)/3 with rounding.
1269 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1271 for (i=0; i < height; i++) {
1272 for (j=0; j < width; j++) {
1273 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
// Thirdpel MC, (2/3,1/3) phase: weights 3/4/2/3 over 12.
1280 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1282 for (i=0; i < height; i++) {
1283 for (j=0; j < width; j++) {
1284 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
// Thirdpel MC, (2/3,2/3) phase: weights 2/3/3/4 over 12.
1291 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1293 for (i=0; i < height; i++) {
1294 for (j=0; j < width; j++) {
1295 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
// Averaging variant of mc00: dispatch to the fixed-size avg_pixels helpers
// (result is averaged into the existing dst contents).
1302 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1304 case 2: avg_pixels2_c (dst, src, stride, height); break;
1305 case 4: avg_pixels4_c (dst, src, stride, height); break;
1306 case 8: avg_pixels8_c (dst, src, stride, height); break;
1307 case 16:avg_pixels16_c(dst, src, stride, height); break;
// Averaging variant of mc10: compute the same 1/3-phase value, then average
// with the existing dst pixel, rounding up (+1 >> 1).
1311 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1313 for (i=0; i < height; i++) {
1314 for (j=0; j < width; j++) {
1315 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
// Averaging variant of mc20 (horizontal 2/3 phase).
1322 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1324 for (i=0; i < height; i++) {
1325 for (j=0; j < width; j++) {
1326 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
// Averaging variant of mc01 (vertical 1/3 phase).
1333 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1335 for (i=0; i < height; i++) {
1336 for (j=0; j < width; j++) {
1337 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
// Averaging variant of mc11 (weights 4/3/3/2 over 12).
1344 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1346 for (i=0; i < height; i++) {
1347 for (j=0; j < width; j++) {
1348 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
// Averaging variant of mc12 (weights 3/2/4/3 over 12).
1355 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1357 for (i=0; i < height; i++) {
1358 for (j=0; j < width; j++) {
1359 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
// Averaging variant of mc02 (vertical 2/3 phase).
1366 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1368 for (i=0; i < height; i++) {
1369 for (j=0; j < width; j++) {
1370 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
// Averaging variant of mc21 (weights 3/4/2/3 over 12).
1377 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1379 for (i=0; i < height; i++) {
1380 for (j=0; j < width; j++) {
1381 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
// Averaging variant of mc22 (weights 2/3/3/4 over 12).
1388 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1390 for (i=0; i < height; i++) {
1391 for (j=0; j < width; j++) {
1392 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
// Generates fixed-width thirdpel wrappers (put_tpel_pixelsW_mcXY_c) that
// forward to the generic put_tpel_pixels_mcXY_c routines above with the
// width baked in.
// FIX: the original bodies read "void put_tpel_pixels_mcXY_c(...);" -- the
// stray leading "void" turns each intended call into a function declaration
// with an identifier list, which C90 forbids outside a definition, so the
// wrappers never forwarded anything. Dropping "void" makes them real calls.
1399 #define TPEL_WIDTH(width)\
1400 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1401 put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1402 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1403 put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1404 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1405 put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1406 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1407 put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1408 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1409 put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1410 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1411 put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1412 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1413 put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1414 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1415 put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1416 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1417 put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H.264 chroma MC generator: bilinear interpolation at eighth-pel offsets
 * (x,y) in [0,8), with weights A/B/C/D = (8-x)(8-y), x(8-y), (8-x)y, xy
 * (sum 64); the OP macro supplied by the caller performs the /64 rounding
 * (and, for avg_, the blend with dst). Generates mc2/mc4/mc8 variants.
 * NOTE(review): the per-row loops and dst/src stride advances appear to be
 * elided from this listing -- confirm against the full source. */
1420 #define H264_CHROMA_MC(OPNAME, OP)\
1421 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1422 const int A=(8-x)*(8-y);\
1423 const int B=(   x)*(8-y);\
1424 const int C=(8-x)*(   y);\
1425 const int D=(   x)*(   y);\
1428 assert(x<8 && y<8 && x>=0 && y>=0);\
1432 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1433 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1439 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1440 const int A=(8-x)*(8-y);\
1441 const int B=(   x)*(8-y);\
1442 const int C=(8-x)*(   y);\
1443 const int D=(   x)*(   y);\
1446 assert(x<8 && y<8 && x>=0 && y>=0);\
1450 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1451 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1452 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1453 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1459 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1460 const int A=(8-x)*(8-y);\
1461 const int B=(   x)*(8-y);\
1462 const int C=(8-x)*(   y);\
1463 const int D=(   x)*(   y);\
1466 assert(x<8 && y<8 && x>=0 && y>=0);\
1470 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1471 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1472 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1473 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1474 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1475 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1476 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1477 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* Rounding ops for the chroma MC generator: op_put = (b+32)>>6 (divide the
 * 64-weighted sum by 64, round to nearest); op_avg additionally averages
 * the result with the existing dst value, rounding up. */
1483 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1484 #define op_put(a, b) a = (((b) + 32)>>6)
/* Instantiate put_/avg_ h264_chroma_mc{2,4,8}_c. */
1486 H264_CHROMA_MC(put_ , op_put)
1487 H264_CHROMA_MC(avg_ , op_avg)
// Copy a 2-pixel-wide column of h rows using 16-bit loads/stores.
// NOTE(review): the loop and dst/src stride advances are elided from this
// listing -- confirm against the full source.
1491 static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1496 ST16(dst   , LD16(src   ));
// Copy a 4-pixel-wide column of h rows using 32-bit loads/stores.
1502 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1507 ST32(dst   , LD32(src   ));
// Copy an 8-pixel-wide column of h rows as two 32-bit transfers per row.
1513 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1518 ST32(dst   , LD32(src   ));
1519 ST32(dst+4 , LD32(src+4 ));
// Copy a 16-pixel-wide column of h rows as four 32-bit transfers per row.
1525 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1530 ST32(dst   , LD32(src   ));
1531 ST32(dst+4 , LD32(src+4 ));
1532 ST32(dst+8 , LD32(src+8 ));
1533 ST32(dst+12, LD32(src+12));
// Copy a 17-pixel-wide column of h rows (16+1 for qpel16 edge extension);
// the trailing 17th byte copy is elided from this listing -- confirm.
1539 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1544 ST32(dst   , LD32(src   ));
1545 ST32(dst+4 , LD32(src+4 ));
1546 ST32(dst+8 , LD32(src+8 ));
1547 ST32(dst+12, LD32(src+12));
// Copy a 9-pixel-wide column of h rows (8+1 for qpel8 edge extension);
// the trailing 9th byte copy is elided from this listing -- confirm.
1554 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1559 ST32(dst   , LD32(src   ));
1560 ST32(dst+4 , LD32(src+4 ));
/* QPEL_MC generator: MPEG-4 quarter-pel half-sample filters and the 16 MC
 * position wrappers, parameterized by rounding behavior (RND) and the
 * store/clip operation (OP, defined below via cropTbl).
 * 8-wide horizontal half-pel filter, taps (20,-6,3,-1)/32; indices past the
 * right edge reuse src[8]/inner samples (mirrored edge handling). */
1568 #define QPEL_MC(r, OPNAME, RND, OP) \
1569 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1570     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1574 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1575 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1576 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1577 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1578 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1579 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1580 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1581 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* 8-wide vertical half-pel filter: same (20,-6,3,-1) taps applied down a
 * column of 9 samples, with mirrored handling at the bottom edge. */\
1587 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1589     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1593 const int src0= src[0*srcStride];\
1594 const int src1= src[1*srcStride];\
1595 const int src2= src[2*srcStride];\
1596 const int src3= src[3*srcStride];\
1597 const int src4= src[4*srcStride];\
1598 const int src5= src[5*srcStride];\
1599 const int src6= src[6*srcStride];\
1600 const int src7= src[7*srcStride];\
1601 const int src8= src[8*srcStride];\
1602 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1603 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1604 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1605 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1606 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1607 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1608 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1609 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* 16-wide horizontal half-pel filter: (20,-6,3,-1) taps over 17 samples,
 * mirrored at the right edge (src[16]/inner samples reused). */\
1615 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1616     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1621 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1622 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1623 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1624 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1625 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1626 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1627 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1628 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1629 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1630 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1631 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1632 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1633 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1634 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1635 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1636 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* 16-wide vertical half-pel filter: (20,-6,3,-1) taps down a column of 17
 * samples, mirrored at the bottom edge. */\
1642 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1643     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1648 const int src0= src[0*srcStride];\
1649 const int src1= src[1*srcStride];\
1650 const int src2= src[2*srcStride];\
1651 const int src3= src[3*srcStride];\
1652 const int src4= src[4*srcStride];\
1653 const int src5= src[5*srcStride];\
1654 const int src6= src[6*srcStride];\
1655 const int src7= src[7*srcStride];\
1656 const int src8= src[8*srcStride];\
1657 const int src9= src[9*srcStride];\
1658 const int src10= src[10*srcStride];\
1659 const int src11= src[11*srcStride];\
1660 const int src12= src[12*srcStride];\
1661 const int src13= src[13*srcStride];\
1662 const int src14= src[14*srcStride];\
1663 const int src15= src[15*srcStride];\
1664 const int src16= src[16*srcStride];\
1665 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1666 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1667 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1668 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1669 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1670 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1671 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1672 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1673 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1674 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1675 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1676 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1677 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1678 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1679 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1680 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* qpel8 position wrappers, pure-horizontal / pure-vertical phases:
 * mc00 = copy; mc10/mc30 = average of source and h-filtered half (quarter
 * phases); mc20 = h-filtered half-pel; mc01/mc03/mc02 analogous vertically,
 * working on a 9-row edge-extended copy in full[16*9]. */\
1686 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1687     OPNAME ## pixels8_c(dst, src, stride, 8);\
1690 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1692     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1693     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1696 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1697     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1700 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1702     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1703     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1706 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1707     uint8_t full[16*9];\
1709     copy_block9(full, src, 16, stride, 9);\
1710     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1711     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1714 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1715     uint8_t full[16*9];\
1716     copy_block9(full, src, 16, stride, 9);\
1717     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1720 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1721     uint8_t full[16*9];\
1723     copy_block9(full, src, 16, stride, 9);\
1724     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1725     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
/* qpel8 diagonal quarter-pel positions (mc11/mc31/mc13/mc33). The ff_*_old_c
 * variants use the original 4-way average (l4) of source, h-half, v-half and
 * hv-half; the current variants fold the v component in via an l2 averaging
 * of the h-filtered plane with the source column first. Buffer roles:
 * full = 9-row edge-extended source, halfH = h-filtered (9 rows),
 * halfHV = h-then-v filtered. */\
1727 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1728     uint8_t full[16*9];\
1731     uint8_t halfHV[64];\
1732     copy_block9(full, src, 16, stride, 9);\
1733     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1734     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1735     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1736     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1738 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1739     uint8_t full[16*9];\
1741     uint8_t halfHV[64];\
1742     copy_block9(full, src, 16, stride, 9);\
1743     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1744     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1745     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1746     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1748 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1749     uint8_t full[16*9];\
1752     uint8_t halfHV[64];\
1753     copy_block9(full, src, 16, stride, 9);\
1754     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1755     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1756     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1757     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1759 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1760     uint8_t full[16*9];\
1762     uint8_t halfHV[64];\
1763     copy_block9(full, src, 16, stride, 9);\
1764     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1765     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1766     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1767     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1769 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1770     uint8_t full[16*9];\
1773     uint8_t halfHV[64];\
1774     copy_block9(full, src, 16, stride, 9);\
1775     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1776     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1777     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1778     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1780 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1781     uint8_t full[16*9];\
1783     uint8_t halfHV[64];\
1784     copy_block9(full, src, 16, stride, 9);\
1785     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1786     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1787     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1788     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1790 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1791     uint8_t full[16*9];\
1794     uint8_t halfHV[64];\
1795     copy_block9(full, src, 16, stride, 9);\
1796     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1797     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1798     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1799     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1801 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1802     uint8_t full[16*9];\
1804     uint8_t halfHV[64];\
1805     copy_block9(full, src, 16, stride, 9);\
1806     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1807     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1808     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1809     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
/* qpel8 half/quarter mixed positions: mc21/mc23 blend the h-half plane with
 * the hv plane; mc12/mc32 (and their _old variants) produce the vertical
 * half of an h-quarter column; mc22 is the pure hv half-pel. */\
1811 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1813     uint8_t halfHV[64];\
1814     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1815     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1816     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1818 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1820     uint8_t halfHV[64];\
1821     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1822     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1823     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1825 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1826     uint8_t full[16*9];\
1829     uint8_t halfHV[64];\
1830     copy_block9(full, src, 16, stride, 9);\
1831     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1832     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1833     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1834     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1836 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1837     uint8_t full[16*9];\
1839     copy_block9(full, src, 16, stride, 9);\
1840     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1841     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1842     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1844 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1845     uint8_t full[16*9];\
1848     uint8_t halfHV[64];\
1849     copy_block9(full, src, 16, stride, 9);\
1850     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1851     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1852     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1853     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1855 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1856     uint8_t full[16*9];\
1858     copy_block9(full, src, 16, stride, 9);\
1859     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1860     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1861     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1863 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1865     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1866     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* qpel16 position wrappers, pure-horizontal / pure-vertical phases: same
 * scheme as the qpel8 set, but 16x16 with a 17-row edge-extended copy in
 * full[24*17] (row stride 24). */\
1868 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1869     OPNAME ## pixels16_c(dst, src, stride, 16);\
1872 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1874     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1875     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1878 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1879     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1882 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1884     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1885     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1888 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1889     uint8_t full[24*17];\
1891     copy_block17(full, src, 24, stride, 17);\
1892     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1893     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1896 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1897     uint8_t full[24*17];\
1898     copy_block17(full, src, 24, stride, 17);\
1899     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1902 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1903     uint8_t full[24*17];\
1905     copy_block17(full, src, 24, stride, 17);\
1906     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1907     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
/* qpel16 diagonal quarter-pel positions (mc11/mc31/mc13/mc33), mirroring the
 * qpel8 set: _old variants use 4-way l4 averaging, current variants fold the
 * vertical component via an l2 of halfH with the source before v-filtering.
 * halfH is 17 rows (272 bytes), halfV/halfHV are 16 rows (256 bytes). */\
1909 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1910     uint8_t full[24*17];\
1911     uint8_t halfH[272];\
1912     uint8_t halfV[256];\
1913     uint8_t halfHV[256];\
1914     copy_block17(full, src, 24, stride, 17);\
1915     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1916     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1917     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1918     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1920 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1921     uint8_t full[24*17];\
1922     uint8_t halfH[272];\
1923     uint8_t halfHV[256];\
1924     copy_block17(full, src, 24, stride, 17);\
1925     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1926     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1927     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1928     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1930 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1931     uint8_t full[24*17];\
1932     uint8_t halfH[272];\
1933     uint8_t halfV[256];\
1934     uint8_t halfHV[256];\
1935     copy_block17(full, src, 24, stride, 17);\
1936     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1937     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1938     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1939     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1941 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1942     uint8_t full[24*17];\
1943     uint8_t halfH[272];\
1944     uint8_t halfHV[256];\
1945     copy_block17(full, src, 24, stride, 17);\
1946     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1947     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1948     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1949     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1951 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1952     uint8_t full[24*17];\
1953     uint8_t halfH[272];\
1954     uint8_t halfV[256];\
1955     uint8_t halfHV[256];\
1956     copy_block17(full, src, 24, stride, 17);\
1957     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1958     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1959     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1960     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1962 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1963     uint8_t full[24*17];\
1964     uint8_t halfH[272];\
1965     uint8_t halfHV[256];\
1966     copy_block17(full, src, 24, stride, 17);\
1967     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1968     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1969     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1970     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1972 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1973     uint8_t full[24*17];\
1974     uint8_t halfH[272];\
1975     uint8_t halfV[256];\
1976     uint8_t halfHV[256];\
1977     copy_block17(full, src, 24, stride, 17);\
1978     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1979     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1980     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1981     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1983 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1984     uint8_t full[24*17];\
1985     uint8_t halfH[272];\
1986     uint8_t halfHV[256];\
1987     copy_block17(full, src, 24, stride, 17);\
1988     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1989     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1990     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1991     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* qpel16 half/quarter mixed positions, mirroring the qpel8 set: mc21/mc23
 * blend halfH with halfHV; mc12/mc32 (and _old variants) v-filter an
 * h-quarter column; mc22 is the pure hv half-pel. */\
1993 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1994     uint8_t halfH[272];\
1995     uint8_t halfHV[256];\
1996     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1997     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1998     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2000 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2001     uint8_t halfH[272];\
2002     uint8_t halfHV[256];\
2003     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2004     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2005     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2007 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2008     uint8_t full[24*17];\
2009     uint8_t halfH[272];\
2010     uint8_t halfV[256];\
2011     uint8_t halfHV[256];\
2012     copy_block17(full, src, 24, stride, 17);\
2013     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2014     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2015     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2016     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2018 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2019     uint8_t full[24*17];\
2020     uint8_t halfH[272];\
2021     copy_block17(full, src, 24, stride, 17);\
2022     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2023     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2024     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2026 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2027     uint8_t full[24*17];\
2028     uint8_t halfH[272];\
2029     uint8_t halfV[256];\
2030     uint8_t halfHV[256];\
2031     copy_block17(full, src, 24, stride, 17);\
2032     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2033     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2034     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2035     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2037 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2038     uint8_t full[24*17];\
2039     uint8_t halfH[272];\
2040     copy_block17(full, src, 24, stride, 17);\
2041     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2042     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2043     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2045 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2046     uint8_t halfH[272];\
2047     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2048     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store ops for QPEL_MC: the 32-weighted filter sum is divided by 32 with
 * rounding (+16) or no-rounding (+15) and clipped to [0,255] through the
 * cm (cropTbl) lookup; op_avg additionally averages with the existing dst.
 * Instantiates put_, put_no_rnd_ and avg_ variant sets (avg_no_rnd was
 * intentionally left disabled). */
2051 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2052 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2053 #define op_put(a, b) a = cm[((b) + 16)>>5]
2054 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2056 QPEL_MC(0, put_       , _       , op_put)
2057 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2058 QPEL_MC(0, avg_       , _       , op_avg)
2059 //QPEL_MC(1, avg_no_rnd , _       , op_avg)
2061 #undef op_avg_no_rnd
2063 #undef op_put_no_rnd
/* H264_LOWPASS: generates the H.264 6-tap (1,-5,20,20,-5,1) half-sample
 * interpolation primitives for block widths 2, 4, 8 and 16.
 *   OP  - writes one pixel from a single-pass (h or v) filter sum, which is
 *         scaled by 32.
 *   OP2 - writes one pixel on the two-pass hv path, where the int16_t 'tmp'
 *         plane already holds filter sums, so the final value is scaled by
 *         32*32 = 1024.
 * NOTE(review): this listing elides some interior lines of the macro (loop
 * headers, local declarations such as 'int i;', pointer advances and closing
 * braces) that are present in the full file. */
2066 #define H264_LOWPASS(OPNAME, OP, OP2) \
/* 2-wide horizontal filter, one row per (elided) loop iteration */\
2067 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2069 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2073 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2074 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
/* 2-wide vertical filter: taps read 2 rows above and 3 below the output */\
2080 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2082 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2086 const int srcB= src[-2*srcStride];\
2087 const int srcA= src[-1*srcStride];\
2088 const int src0= src[0 *srcStride];\
2089 const int src1= src[1 *srcStride];\
2090 const int src2= src[2 *srcStride];\
2091 const int src3= src[3 *srcStride];\
2092 const int src4= src[4 *srcStride];\
2093 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2094 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
/* 2-wide hv filter: horizontal pass into 'tmp' (h+5 rows, to cover the
 * vertical taps), then vertical pass with OP2 scaling */\
2100 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2103 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2105 src -= 2*srcStride;\
2106 for(i=0; i<h+5; i++)\
2108 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2109 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
/* rewind tmp to the first output row (2 rows into the filtered area) */\
2113 tmp -= tmpStride*(h+5-2);\
2116 const int tmpB= tmp[-2*tmpStride];\
2117 const int tmpA= tmp[-1*tmpStride];\
2118 const int tmp0= tmp[0 *tmpStride];\
2119 const int tmp1= tmp[1 *tmpStride];\
2120 const int tmp2= tmp[2 *tmpStride];\
2121 const int tmp3= tmp[3 *tmpStride];\
2122 const int tmp4= tmp[4 *tmpStride];\
2123 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2124 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
/* 4-wide variants: same filters, four output columns/rows */\
2129 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2131 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2135 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2136 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2137 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2138 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2144 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2146 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2150 const int srcB= src[-2*srcStride];\
2151 const int srcA= src[-1*srcStride];\
2152 const int src0= src[0 *srcStride];\
2153 const int src1= src[1 *srcStride];\
2154 const int src2= src[2 *srcStride];\
2155 const int src3= src[3 *srcStride];\
2156 const int src4= src[4 *srcStride];\
2157 const int src5= src[5 *srcStride];\
2158 const int src6= src[6 *srcStride];\
2159 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2160 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2161 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2162 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2168 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2171 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2173 src -= 2*srcStride;\
2174 for(i=0; i<h+5; i++)\
2176 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2177 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2178 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2179 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2183 tmp -= tmpStride*(h+5-2);\
2186 const int tmpB= tmp[-2*tmpStride];\
2187 const int tmpA= tmp[-1*tmpStride];\
2188 const int tmp0= tmp[0 *tmpStride];\
2189 const int tmp1= tmp[1 *tmpStride];\
2190 const int tmp2= tmp[2 *tmpStride];\
2191 const int tmp3= tmp[3 *tmpStride];\
2192 const int tmp4= tmp[4 *tmpStride];\
2193 const int tmp5= tmp[5 *tmpStride];\
2194 const int tmp6= tmp[6 *tmpStride];\
2195 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2196 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2197 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2198 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
/* 8-wide variants: fully unrolled across the 8 columns/rows */\
2204 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2206 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2210 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2211 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2212 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2213 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2214 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2215 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2216 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2217 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2223 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2225 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2229 const int srcB= src[-2*srcStride];\
2230 const int srcA= src[-1*srcStride];\
2231 const int src0= src[0 *srcStride];\
2232 const int src1= src[1 *srcStride];\
2233 const int src2= src[2 *srcStride];\
2234 const int src3= src[3 *srcStride];\
2235 const int src4= src[4 *srcStride];\
2236 const int src5= src[5 *srcStride];\
2237 const int src6= src[6 *srcStride];\
2238 const int src7= src[7 *srcStride];\
2239 const int src8= src[8 *srcStride];\
2240 const int src9= src[9 *srcStride];\
2241 const int src10=src[10*srcStride];\
2242 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2243 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2244 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2245 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2246 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2247 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2248 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2249 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2255 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2258 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2260 src -= 2*srcStride;\
2261 for(i=0; i<h+5; i++)\
2263 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2264 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2265 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2266 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2267 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2268 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2269 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2270 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2274 tmp -= tmpStride*(h+5-2);\
2277 const int tmpB= tmp[-2*tmpStride];\
2278 const int tmpA= tmp[-1*tmpStride];\
2279 const int tmp0= tmp[0 *tmpStride];\
2280 const int tmp1= tmp[1 *tmpStride];\
2281 const int tmp2= tmp[2 *tmpStride];\
2282 const int tmp3= tmp[3 *tmpStride];\
2283 const int tmp4= tmp[4 *tmpStride];\
2284 const int tmp5= tmp[5 *tmpStride];\
2285 const int tmp6= tmp[6 *tmpStride];\
2286 const int tmp7= tmp[7 *tmpStride];\
2287 const int tmp8= tmp[8 *tmpStride];\
2288 const int tmp9= tmp[9 *tmpStride];\
2289 const int tmp10=tmp[10*tmpStride];\
2290 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2291 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2292 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2293 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2294 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2295 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2296 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2297 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16-wide variants: compose four 8x8 calls (two side by side, then the
 * pointers advance 8 rows and the pair repeats) */\
2303 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2304 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2305 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2306 src += 8*srcStride;\
2307 dst += 8*dstStride;\
2308 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2309 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2312 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2313 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2314 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2315 src += 8*srcStride;\
2316 dst += 8*dstStride;\
2317 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2318 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2321 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2322 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2323 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2324 src += 8*srcStride;\
2325 dst += 8*dstStride;\
2326 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2327 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC: generates the 16 quarter-sample motion-compensation entry points
 * (_mcXY_c, X/Y = quarter-pel fraction 0..3 in x/y) for one block SIZE by
 * combining the H264_LOWPASS half-sample primitives:
 *   - integer position (mc00) is a plain pixel copy;
 *   - half positions use the h/v/hv lowpass filters directly;
 *   - quarter positions average (pixelsN_l2) two neighbouring half/integer
 *     planes, per the H.264 interpolation rules.
 * 'full' buffers hold SIZE+5 rows copied from 'src' (starting 2 rows above)
 * so the 6-tap vertical filter has its margin; full_mid points at the
 * block-aligned row inside it.
 * NOTE(review): closing braces of the generated functions are elided in this
 * listing. */
2330 #define H264_MC(OPNAME, SIZE) \
2331 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2332 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2335 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2336 uint8_t half[SIZE*SIZE];\
2337 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2338 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2341 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2342 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2345 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2346 uint8_t half[SIZE*SIZE];\
2347 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2348 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2351 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2352 uint8_t full[SIZE*(SIZE+5)];\
2353 uint8_t * const full_mid= full + SIZE*2;\
2354 uint8_t half[SIZE*SIZE];\
2355 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2356 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2357 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2360 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2361 uint8_t full[SIZE*(SIZE+5)];\
2362 uint8_t * const full_mid= full + SIZE*2;\
2363 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2364 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2367 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2368 uint8_t full[SIZE*(SIZE+5)];\
2369 uint8_t * const full_mid= full + SIZE*2;\
2370 uint8_t half[SIZE*SIZE];\
2371 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2372 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2373 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
/* diagonal quarter positions: average the horizontal and vertical
 * half-sample planes */\
2376 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2377 uint8_t full[SIZE*(SIZE+5)];\
2378 uint8_t * const full_mid= full + SIZE*2;\
2379 uint8_t halfH[SIZE*SIZE];\
2380 uint8_t halfV[SIZE*SIZE];\
2381 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2382 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2383 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2384 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2387 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2388 uint8_t full[SIZE*(SIZE+5)];\
2389 uint8_t * const full_mid= full + SIZE*2;\
2390 uint8_t halfH[SIZE*SIZE];\
2391 uint8_t halfV[SIZE*SIZE];\
2392 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2393 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2394 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2395 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2398 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2399 uint8_t full[SIZE*(SIZE+5)];\
2400 uint8_t * const full_mid= full + SIZE*2;\
2401 uint8_t halfH[SIZE*SIZE];\
2402 uint8_t halfV[SIZE*SIZE];\
2403 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2404 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2405 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2406 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2409 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2410 uint8_t full[SIZE*(SIZE+5)];\
2411 uint8_t * const full_mid= full + SIZE*2;\
2412 uint8_t halfH[SIZE*SIZE];\
2413 uint8_t halfV[SIZE*SIZE];\
2414 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2415 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2416 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2417 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* centre half position: two-pass hv filter through an int16_t tmp plane */\
2420 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2421 int16_t tmp[SIZE*(SIZE+5)];\
2422 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2425 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2426 int16_t tmp[SIZE*(SIZE+5)];\
2427 uint8_t halfH[SIZE*SIZE];\
2428 uint8_t halfHV[SIZE*SIZE];\
2429 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2430 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2431 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2434 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2435 int16_t tmp[SIZE*(SIZE+5)];\
2436 uint8_t halfH[SIZE*SIZE];\
2437 uint8_t halfHV[SIZE*SIZE];\
2438 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2439 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2440 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2443 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2444 uint8_t full[SIZE*(SIZE+5)];\
2445 uint8_t * const full_mid= full + SIZE*2;\
2446 int16_t tmp[SIZE*(SIZE+5)];\
2447 uint8_t halfV[SIZE*SIZE];\
2448 uint8_t halfHV[SIZE*SIZE];\
2449 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2450 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2451 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2452 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2455 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2456 uint8_t full[SIZE*(SIZE+5)];\
2457 uint8_t * const full_mid= full + SIZE*2;\
2458 int16_t tmp[SIZE*(SIZE+5)];\
2459 uint8_t halfV[SIZE*SIZE];\
2460 uint8_t halfHV[SIZE*SIZE];\
2461 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2462 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2463 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2464 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Output operators for the H.264 lowpass filters:
 *   op_*  - single-pass result, scaled by 32  -> "+16 >> 5" rounds;
 *   op2_* - two-pass hv result, scaled by 1024 -> "+512 >> 10" rounds.
 * cm[] clamps to 0..255; the avg forms average with the destination pixel,
 * rounding up. */
2467 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2468 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2469 #define op_put(a, b) a = cm[((b) + 16)>>5]
2470 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2471 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put_ and avg_ H.264 lowpass primitive families. */
2473 H264_LOWPASS(put_ , op_put, op2_put)
2474 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* H.264 (explicit/implicit) weighted prediction.
 *   op_scale1: uni-directional  - pix = clip((pix*weight + offset) >> log2_denom)
 *   op_scale2: bi-directional   - dst = clip((src*weights + dst*weightd + offset)
 *                                            >> (log2_denom+1))
 * weight_...:   scales one block in place, pre-biasing 'offset' for
 *               round-to-nearest when log2_denom > 0.
 * biweight_...: blends 'src' into 'dst'; the "((offset+1)|1)" bias forces an
 *               odd rounding offset as the spec's biprediction rounding wants.
 * The W==2/4/8 'continue' lines skip the wider (elided) column groups for
 * narrow blocks.
 * NOTE(review): the unrolled per-column op_scale lines, closing braces and
 * the H264_WEIGHT(W,H) instantiations are elided from this listing. */
2489 #define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
2490 #define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2491 #define H264_WEIGHT(W,H) \
2492 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2494 offset <<= log2_denom; \
2495 if(log2_denom) offset += 1<<(log2_denom-1); \
2496 for(y=0; y<H; y++, block += stride){ \
2499 if(W==2) continue; \
2502 if(W==4) continue; \
2507 if(W==8) continue; \
2518 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2520 offset = ((offset + 1) | 1) << log2_denom; \
2521 for(y=0; y<H; y++, dst += stride, src += stride){ \
2524 if(W==2) continue; \
2527 if(W==4) continue; \
2532 if(W==8) continue; \
/* WMV2 mspel horizontal lowpass: 4-tap (-1,9,9,-1)/16 half-sample filter,
 * one unrolled 8-pixel row per (elided) loop iteration over 'h' rows.
 * "+8 >> 4" rounds to nearest; cm[] clamps to 0..255. */
2559 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2560 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2564 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2565 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2566 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2567 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2568 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2569 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2570 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2571 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* WMV2 mspel vertical lowpass: same (-1,9,9,-1)/16 filter applied down each
 * of 'w' columns, 8 output rows unrolled per (elided) column loop. */
2577 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2578 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2582 const int src_1= src[ -srcStride];
2583 const int src0 = src[0 ];
2584 const int src1 = src[ srcStride];
2585 const int src2 = src[2*srcStride];
2586 const int src3 = src[3*srcStride];
2587 const int src4 = src[4*srcStride];
2588 const int src5 = src[5*srcStride];
2589 const int src6 = src[6*srcStride];
2590 const int src7 = src[7*srcStride];
2591 const int src8 = src[8*srcStride];
2592 const int src9 = src[9*srcStride];
2593 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2594 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2595 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2596 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2597 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2598 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2599 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2600 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel 8x8 motion-compensation entry points (_mcXY: X/Y = sub-pel
 * fraction in x/y).  Integer position copies, half positions filter, and
 * mixed positions average two planes via put_pixels8_l2.  The hv cases run
 * the horizontal filter over 11 rows (one above, two below the block) into
 * 'halfH' so the vertical pass has its margin; halfH+8 skips the extra top
 * row.  NOTE(review): local buffer declarations (half/halfH/halfV/halfHV)
 * and closing braces are elided from this listing. */
2606 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2607 put_pixels8_c(dst, src, stride, 8);
2610 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2612 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2613 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2616 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2617 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2620 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2622 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2623 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2626 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2627 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2630 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2634 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2635 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2636 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2637 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2639 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2643 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2644 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2645 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2646 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2648 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2650 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2651 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 Annex-J style deblocking across a horizontal block edge (filters
 * vertically; p1/p2 straddle the edge).  'd' measures the discontinuity;
 * 'd1' is the ramp-shaped correction (rises to d, then falls back to 0 as
 * |d| approaches 2*strength).  NOTE(review): the x loop header, d1/ad1/d2
 * declarations and closing braces are elided from this listing. */
2654 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2656 const int strength= ff_h263_loop_filter_strength[qscale];
2660 int p0= src[x-2*stride];
2661 int p1= src[x-1*stride];
2662 int p2= src[x+0*stride];
2663 int p3= src[x+1*stride];
2664 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2666 if (d<-2*strength) d1= 0;
2667 else if(d<- strength) d1=-2*strength - d;
2668 else if(d< strength) d1= d;
2669 else if(d< 2*strength) d1= 2*strength - d;
/* clamp p1/p2 to 0..255 after applying d1: if bit 8 is set the value
 * over/underflowed; ~(x>>31) yields 255 for positive overflow, 0 for
 * negative */
2674 if(p1&256) p1= ~(p1>>31);
2675 if(p2&256) p2= ~(p2>>31);
2677 src[x-1*stride] = p1;
2678 src[x+0*stride] = p2;
/* secondary, weaker correction of the outer pixels p0/p3 */
2682 d2= clip((p0-p3)/4, -ad1, ad1);
2684 src[x-2*stride] = p0 - d2;
2685 src[x+ stride] = p3 + d2;
/* H.263 deblocking across a vertical block edge (filters horizontally);
 * same algorithm as h263_v_loop_filter_c with x/y roles swapped.
 * NOTE(review): the y loop header, d1/ad1/d2 declarations and closing
 * braces are elided from this listing. */
2689 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2691 const int strength= ff_h263_loop_filter_strength[qscale];
2695 int p0= src[y*stride-2];
2696 int p1= src[y*stride-1];
2697 int p2= src[y*stride+0];
2698 int p3= src[y*stride+1];
2699 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2701 if (d<-2*strength) d1= 0;
2702 else if(d<- strength) d1=-2*strength - d;
2703 else if(d< strength) d1= d;
2704 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clamp of p1/p2 to 0..255 (see vertical variant) */
2709 if(p1&256) p1= ~(p1>>31);
2710 if(p2&256) p2= ~(p2>>31);
2712 src[y*stride-1] = p1;
2713 src[y*stride+0] = p2;
2717 d2= clip((p0-p3)/4, -ad1, ad1);
2719 src[y*stride-2] = p0 - d2;
2720 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block via a
 * scaled temp[64] plane.  Border rows/columns are copied through unfiltered
 * (pre-scaled by 4 so the final ">>2" / ">>4" normalisation matches).
 * NOTE(review): temp[] declaration, the x/y loop headers, 'yz' setup and
 * closing braces are elided from this listing. */
2724 static void h261_loop_filter_c(uint8_t *src, int stride){
2729 temp[x ] = 4*src[x ];
2730 temp[x + 7*8] = 4*src[x + 7*stride];
2734 xy = y * stride + x;
/* vertical (1,2,1) pass into temp */
2736 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2741 src[ y*stride] = (temp[ y*8] + 2)>>2;
2742 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2744 xy = y * stride + x;
/* horizontal (1,2,1) pass + final normalisation (total scale 16) */
2746 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 normal (bS<4) luma deblocking of one 4-sample edge segment per
 * tc0[i]; xstride steps across the edge, ystride along it.  Per sample the
 * p0/q0 pair is corrected by a clipped delta when the alpha/beta activity
 * thresholds pass, and p1/q1 get a secondary correction when the inner
 * gradient is flat enough.  NOTE(review): the tc/d declarations, the
 * "skip segment when tc0[i] < 0" handling, pix advances and closing braces
 * are elided from this listing. */
2751 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2754 for( i = 0; i < 4; i++ ) {
2759 for( d = 0; d < 4; d++ ) {
2760 const int p0 = pix[-1*xstride];
2761 const int p1 = pix[-2*xstride];
2762 const int p2 = pix[-3*xstride];
2763 const int q0 = pix[0];
2764 const int q1 = pix[1*xstride];
2765 const int q2 = pix[2*xstride];
2767 if( ABS( p0 - q0 ) < alpha &&
2768 ABS( p1 - p0 ) < beta &&
2769 ABS( q1 - q0 ) < beta ) {
/* secondary filtering of p1/q1, clipped to +-tc0[i] */
2774 if( ABS( p2 - p0 ) < beta ) {
2775 pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2778 if( ABS( q2 - q0 ) < beta ) {
2779 pix[ xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
/* main p0/q0 correction, clipped to +-tc */
2783 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2784 pix[-xstride] = clip_uint8( p0 + i_delta ); /* p0' */
2785 pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
/* Direction wrappers: vertical filtering crosses a horizontal edge
 * (xstride = row stride), horizontal filtering crosses a vertical edge
 * (xstride = 1). */
2791 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2793 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2795 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2797 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 normal (bS<4) chroma deblocking: 2 samples per tc0[i] segment, only
 * p0/q0 are corrected (chroma never touches p1/q1).  NOTE(review): the
 * "tc0[i] < 0 skips the segment" handling, the tc adjustment, pix advances
 * and closing braces are elided from this listing. */
2800 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2803 for( i = 0; i < 4; i++ ) {
2804 const int tc = tc0[i];
2809 for( d = 0; d < 2; d++ ) {
2810 const int p0 = pix[-1*xstride];
2811 const int p1 = pix[-2*xstride];
2812 const int q0 = pix[0];
2813 const int q1 = pix[1*xstride];
2815 if( ABS( p0 - q0 ) < alpha &&
2816 ABS( p1 - p0 ) < beta &&
2817 ABS( q1 - q0 ) < beta ) {
2819 int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2821 pix[-xstride] = clip_uint8( p0 + delta ); /* p0' */
2822 pix[0] = clip_uint8( q0 - delta ); /* q0' */
/* Direction wrappers for the chroma deblocker (see luma wrappers above). */
2828 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2830 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2832 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2834 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 strong (bS==4, intra) chroma deblocking: 8 samples along the edge,
 * p0/q0 replaced by fixed (2,1,1)/4 averages, no tc clipping.
 * NOTE(review): pix advance and closing braces are elided from this
 * listing. */
2837 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2840 for( d = 0; d < 8; d++ ) {
2841 const int p0 = pix[-1*xstride];
2842 const int p1 = pix[-2*xstride];
2843 const int q0 = pix[0];
2844 const int q1 = pix[1*xstride];
2846 if( ABS( p0 - q0 ) < alpha &&
2847 ABS( p1 - p0 ) < beta &&
2848 ABS( q1 - q0 ) < beta ) {
2850 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2851 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
/* Direction wrappers for the intra chroma deblocker. */
2856 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2858 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2860 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2862 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* 16-wide SAD (sum of absolute differences) metrics over 'h' rows, used by
 * motion estimation; the unused void* first argument matches the me_cmp_func
 * signature.  The _x2/_y2/_xy2 variants compare against the half-pel
 * interpolated reference (avg2 = rounded 2-tap average, avg4 = rounded
 * 4-tap average).  NOTE(review): the 's' accumulator declaration, the row
 * loop with pointer advances, and the final 'return s;' are elided from
 * this listing. */
2865 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2871 s += abs(pix1[0] - pix2[0]);
2872 s += abs(pix1[1] - pix2[1]);
2873 s += abs(pix1[2] - pix2[2]);
2874 s += abs(pix1[3] - pix2[3]);
2875 s += abs(pix1[4] - pix2[4]);
2876 s += abs(pix1[5] - pix2[5]);
2877 s += abs(pix1[6] - pix2[6]);
2878 s += abs(pix1[7] - pix2[7]);
2879 s += abs(pix1[8] - pix2[8]);
2880 s += abs(pix1[9] - pix2[9]);
2881 s += abs(pix1[10] - pix2[10]);
2882 s += abs(pix1[11] - pix2[11]);
2883 s += abs(pix1[12] - pix2[12]);
2884 s += abs(pix1[13] - pix2[13]);
2885 s += abs(pix1[14] - pix2[14]);
2886 s += abs(pix1[15] - pix2[15]);
/* SAD vs horizontal half-pel reference */
2893 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2899 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2900 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2901 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2902 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2903 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2904 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2905 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2906 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2907 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2908 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2909 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2910 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2911 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2912 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2913 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2914 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD vs vertical half-pel reference (pix3 = next row of pix2) */
2921 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2924 uint8_t *pix3 = pix2 + line_size;
2928 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2929 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2930 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2931 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2932 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2933 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2934 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2935 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2936 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2937 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2938 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2939 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2940 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2941 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2942 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2943 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD vs diagonal half-pel reference (2x2 average) */
2951 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2954 uint8_t *pix3 = pix2 + line_size;
2958 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2959 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2960 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2961 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2962 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2963 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2964 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2965 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2966 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2967 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2968 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2969 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2970 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2971 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2972 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2973 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD metrics: same structure as the pix_abs16_* family above but
 * over 8 columns.  NOTE(review): accumulator declarations, row loops with
 * pointer advances and 'return s;' are elided from this listing. */
2981 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2987 s += abs(pix1[0] - pix2[0]);
2988 s += abs(pix1[1] - pix2[1]);
2989 s += abs(pix1[2] - pix2[2]);
2990 s += abs(pix1[3] - pix2[3]);
2991 s += abs(pix1[4] - pix2[4]);
2992 s += abs(pix1[5] - pix2[5]);
2993 s += abs(pix1[6] - pix2[6]);
2994 s += abs(pix1[7] - pix2[7]);
/* SAD vs horizontal half-pel reference */
3001 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3007 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3008 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3009 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3010 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3011 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3012 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3013 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3014 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* SAD vs vertical half-pel reference */
3021 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3024 uint8_t *pix3 = pix2 + line_size;
3028 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3029 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3030 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3031 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3032 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3033 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3034 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3035 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* SAD vs diagonal half-pel reference */
3043 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3046 uint8_t *pix3 = pix2 + line_size;
3050 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3051 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3052 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3053 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3054 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3055 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3056 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3057 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE comparison, 16 and 8 pixels wide: score1 is plain
 * SSE; score2 accumulates the difference in local 2x2 gradient energy
 * between source and reference, weighting "lost texture" by
 * avctx->nsse_weight (8 when no context is given).  NOTE(review): score
 * declarations, the row loops and pointer advances are elided from this
 * listing. */
3065 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3066 MpegEncContext *c = v;
3072 for(x=0; x<16; x++){
3073 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3076 for(x=0; x<15; x++){
3077 score2+= ABS( s1[x ] - s1[x +stride]
3078 - s1[x+1] + s1[x+1+stride])
3079 -ABS( s2[x ] - s2[x +stride]
3080 - s2[x+1] + s2[x+1+stride]);
3087 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3088 else return score1 + ABS(score2)*8;
/* 8-wide variant; identical algorithm on a narrower block */
3091 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3092 MpegEncContext *c = v;
3099 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3103 score2+= ABS( s1[x ] - s1[x +stride]
3104 - s1[x+1] + s1[x+1+stride])
3105 -ABS( s2[x ] - s2[x +stride]
3106 - s2[x+1] + s2[x+1+stride]);
3113 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3114 else return score1 + ABS(score2)*8;
/* Evaluates the weighted squared error that would result from adding
 * scale*basis (BASIS_SHIFT fixed-point) to the residual rem, without
 * modifying rem.  Companion of add_8x8basis_c below. */
3117 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3121 for(i=0; i<8*8; i++){
/* rounded shift from BASIS_SHIFT down to RECON_SHIFT precision */
3122 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3125 assert(-512<b && b<512); /* guards against overflow in (w*b)^2 */
3127 sum += (w*b)*(w*b)>>4; /* weighted squared term, scaled down by 16 */
/* Adds scale*basis (BASIS_SHIFT fixed-point, rounded down to
 * RECON_SHIFT precision) into the residual rem in place.  Uses the same
 * rounding as try_8x8basis_c so the two stay consistent. */
3132 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3135 for(i=0; i<8*8; i++){
3136 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3141 * permutes an 8x8 block.
3142 * @param block the block which will be permuted according to the given permutation vector
3143 * @param permutation the permutation vector
3144 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3145 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3146 * (inverse) permutated to scantable order!
3148 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3154 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
/* first pass over the nonzero coefficients in scantable order —
 * presumably saving block[] into a temp buffer; body partially elided here */
3156 for(i=0; i<=last; i++){
3157 const int j= scantable[i];
/* second pass: write each saved coefficient back at its permuted slot */
3162 for(i=0; i<=last; i++){
3163 const int j= scantable[i];
3164 const int perm_j= permutation[j];
3165 block[perm_j]= temp[j];
3169 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fills the 5-entry cmp[] table with the comparison functions matching
 * the requested type, taking them from the corresponding tables in the
 * DSPContext; logs an error for an unrecognized type. */
3173 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3176 memset(cmp, 0, sizeof(void*)*5); /* clear all 5 slots first */
3184 cmp[i]= c->hadamard8_diff[i];
3190 cmp[i]= c->dct_sad[i];
3193 cmp[i]= c->dct264_sad[i];
3196 cmp[i]= c->dct_max[i];
3199 cmp[i]= c->quant_psnr[i];
3226 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3232 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* Zeroes the six 64-coefficient DCT blocks of one macroblock. */
3234 static void clear_blocks_c(DCTELEM *blocks)
3236 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for i in [0,w); main loop is manually unrolled by 8,
 * remaining 0-7 bytes are handled by the scalar tail.  Addition wraps
 * modulo 256 (uint8_t). */
3239 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3241 for(i=0; i+7<w; i+=8){
3242 dst[i+0] += src[i+0];
3243 dst[i+1] += src[i+1];
3244 dst[i+2] += src[i+2];
3245 dst[i+3] += src[i+3];
3246 dst[i+4] += src[i+4];
3247 dst[i+5] += src[i+5];
3248 dst[i+6] += src[i+6];
3249 dst[i+7] += src[i+7];
/* scalar tail for the last w%8 bytes */
3252 dst[i+0] += src[i+0];
/* dst[i] = src1[i] - src2[i] for i in [0,w); unrolled by 8 with a
 * scalar tail.  Subtraction wraps modulo 256 (uint8_t). */
3255 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3257 for(i=0; i+7<w; i+=8){
3258 dst[i+0] = src1[i+0]-src2[i+0];
3259 dst[i+1] = src1[i+1]-src2[i+1];
3260 dst[i+2] = src1[i+2]-src2[i+2];
3261 dst[i+3] = src1[i+3]-src2[i+3];
3262 dst[i+4] = src1[i+4]-src2[i+4];
3263 dst[i+5] = src1[i+5]-src2[i+5];
3264 dst[i+6] = src1[i+6]-src2[i+6];
3265 dst[i+7] = src1[i+7]-src2[i+7];
/* scalar tail for the last w%8 bytes */
3268 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median-prediction residual: subtracts from each sample the
 * median (mid_pred) of the left neighbour, a reference sample and the
 * gradient left+ref-lefttop, masked to 8 bits.  left/left_top carry the
 * predictor state across calls. */
3271 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3279 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF); /* median predictor, mod 256 */
/* Hadamard-transform building blocks: BUTTERFLY2 writes sum/difference
 * of two inputs to two outputs, BUTTERFLY1 does the same in place, and
 * BUTTERFLYA yields |x+y| + |x-y| (final stage fused with the abs-sum).
 * Bodies of the first two continue on elided lines — do not insert
 * anything between the backslash-continued lines. */
3289 #define BUTTERFLY2(o1,o2,i1,i2) \
3293 #define BUTTERFLY1(x,y) \
3302 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the residual (src - dst), returning
 * the sum of absolute transformed coefficients.  Rows are transformed
 * first, then columns; the last column stage is folded into BUTTERFLYA
 * which also takes the absolute values. */
3304 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3312 //FIXME try pointer walks
/* row transform: first butterfly stage on the pixel differences */
3313 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3314 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3315 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3316 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3318 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3319 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3320 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3321 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3323 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3324 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3325 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3326 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* column transform */
3330 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3331 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3332 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3333 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3335 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3336 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3337 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3338 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* last stage fused with the absolute-value accumulation */
3341 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3342 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3343 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3344 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3350 printf("MAX:%d\n", maxi); /* debug output — presumably compiled out; verify */
/* Intra SATD: 8x8 Hadamard transform of the source block itself (no
 * reference), sum of absolute coefficients, with the DC-dominated term
 * subtracted at the end (see "-mean" below) so flat blocks score low. */
3356 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3364 //FIXME try pointer walks
/* row transform on the raw source pixels */
3365 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3366 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3367 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3368 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3370 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3371 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3372 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3373 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3375 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3376 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3377 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3378 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* column transform */
3382 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3383 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3384 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3385 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3387 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3388 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3389 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3390 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* last stage fused with the absolute-value accumulation */
3393 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3394 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3395 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3396 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3399 sum -= ABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: computes the residual src1-src2 into an 8-byte
 * aligned DCTELEM[64] buffer; the forward DCT and coefficient
 * summation follow on lines elided from this excerpt. */
3404 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3405 MpegEncContext * const s= (MpegEncContext *)c;
3406 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]); /* uint64_t array forces 8-byte alignment */
3407 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3412 s->dsp.diff_pixels(temp, src1, src2, stride);
3423 const int s07 = SRC(0) + SRC(7);\
3424 const int s16 = SRC(1) + SRC(6);\
3425 const int s25 = SRC(2) + SRC(5);\
3426 const int s34 = SRC(3) + SRC(4);\
3427 const int a0 = s07 + s34;\
3428 const int a1 = s16 + s25;\
3429 const int a2 = s07 - s34;\
3430 const int a3 = s16 - s25;\
3431 const int d07 = SRC(0) - SRC(7);\
3432 const int d16 = SRC(1) - SRC(6);\
3433 const int d25 = SRC(2) - SRC(5);\
3434 const int d34 = SRC(3) - SRC(4);\
3435 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3436 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3437 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3438 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3440 DST(1, a4 + (a7>>2)) ;\
3441 DST(2, a2 + (a3>>1)) ;\
3442 DST(3, a5 + (a6>>2)) ;\
3444 DST(5, a6 - (a5>>2)) ;\
3445 DST(6, (a2>>1) - a3 ) ;\
3446 DST(7, (a4>>2) - a7 ) ;\
/* SATD using the H.264-style 8x8 transform: the 1-D transform macro
 * defined above is applied to the rows of the residual in place (DST
 * writes back), then to the columns with DST redefined so that each
 * output coefficient's absolute value is accumulated into sum. */
3449 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3450 MpegEncContext * const s= (MpegEncContext *)c;
3455 s->dsp.diff_pixels(dct, src1, src2, stride);
/* pass 1: transform each row i in place */
3457 #define SRC(x) dct[i][x]
3458 #define DST(x,v) dct[i][x]= v
3459 for( i = 0; i < 8; i++ )
/* pass 2: transform each column i, summing |coefficient| */
3464 #define SRC(x) dct[x][i]
3465 #define DST(x,v) sum += ABS(v)
3466 for( i = 0; i < 8; i++ )
/* Returns the maximum absolute DCT coefficient of the residual
 * src1-src2 (the transform call itself is on lines elided here). */
3474 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3475 MpegEncContext * const s= (MpegEncContext *)c;
3476 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]); /* 8-byte aligned scratch */
3477 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3482 s->dsp.diff_pixels(temp, src1, src2, stride);
3486 sum= FFMAX(sum, ABS(temp[i])); /* track the largest |coefficient| */
3491 void simple_idct(DCTELEM *block); //FIXME
/* Quantization-noise metric: transforms/quantizes the residual, then
 * dequantizes and runs an IDCT, and accumulates the squared difference
 * against the saved pre-quantization copy (bak). */
3493 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3494 MpegEncContext * const s= (MpegEncContext *)c;
3495 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]); /* room for two 64-coef blocks */
3496 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3497 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64; /* second half holds the reference copy */
3503 s->dsp.diff_pixels(temp, src1, src2, stride);
3505 memcpy(bak, temp, 64*sizeof(DCTELEM)); /* keep the unquantized data for comparison */
3507 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3508 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3509 simple_idct(temp); //FIXME
3512 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]); /* squared quantization error */
/* Rate-distortion cost of coding one 8x8 block: quantizes the residual,
 * counts the VLC bits of the resulting run/level pairs (escape cost for
 * out-of-range levels), then dequantizes, reconstructs via idct_add into
 * a saved copy of src2 and measures the SSE against src1.  Returns
 * distortion + lambda*bits with lambda derived from qscale.
 * (Renamed the misspelled local "distoration" -> "distortion".) */
3517 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3518 MpegEncContext * const s= (MpegEncContext *)c;
3519 const uint8_t *scantable= s->intra_scantable.permutated;
3520 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]); /* 8-byte aligned DCT scratch */
3521 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]); /* backup of the 8 reference rows */
3522 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3523 uint8_t * const bak= (uint8_t*)aligned_bak;
3524 int i, last, run, bits, level, distortion, start_i;
3525 const int esc_length= s->ac_esc_length;
3527 uint8_t * last_length;
/* save the 8 bytes per row of src2 so reconstruction can add onto them */
3532 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3533 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3536 s->dsp.diff_pixels(temp, src1, src2, stride);
3538 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* pick intra/inter VLC tables; intra also pays for the DC coefficient */
3544 length = s->intra_ac_vlc_length;
3545 last_length= s->intra_ac_vlc_last_length;
3546 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3549 length = s->inter_ac_vlc_length;
3550 last_length= s->inter_ac_vlc_last_length;
/* count VLC bits of the run/level pairs in scan order */
3555 for(i=start_i; i<last; i++){
3556 int j= scantable[i];
3561 if((level&(~127)) == 0){ /* level fits the table (0..127 after bias) */
3562 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* last coefficient uses the "last" table */
3571 level= temp[i] + 64;
3575 if((level&(~127)) == 0){
3576 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3584 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3586 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3589 s->dsp.idct_add(bak, stride, temp); /* reconstruct into the saved copy */
3591 distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3593 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7); /* 109/128 ~ lambda scale */
/* Bit-count metric: quantizes the residual and returns only the number
 * of VLC bits needed to code the run/level pairs (no distortion term —
 * compare rd8x8_c above which adds reconstruction error). */
3596 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3597 MpegEncContext * const s= (MpegEncContext *)c;
3598 const uint8_t *scantable= s->intra_scantable.permutated;
3599 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]); /* 8-byte aligned DCT scratch */
3600 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3601 int i, last, run, bits, level, start_i;
3602 const int esc_length= s->ac_esc_length;
3604 uint8_t * last_length;
3608 s->dsp.diff_pixels(temp, src1, src2, stride);
3610 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* pick intra/inter VLC tables; intra also pays for the DC coefficient */
3616 length = s->intra_ac_vlc_length;
3617 last_length= s->intra_ac_vlc_last_length;
3618 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3621 length = s->inter_ac_vlc_length;
3622 last_length= s->inter_ac_vlc_last_length;
/* count VLC bits of the run/level pairs in scan order */
3627 for(i=start_i; i<last; i++){
3628 int j= scantable[i];
3633 if((level&(~127)) == 0){ /* level fits the table (0..127 after bias) */
3634 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* last coefficient uses the "last" table */
3643 level= temp[i] + 64;
3647 if((level&(~127)) == 0){
3648 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical SAD of the source alone: sums |s[x] - s[x+stride]| over a
 * 16-wide block (inner loop unrolled by 4), measuring vertical
 * activity; the reference argument is unused (dummy). */
3656 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3661 for(x=0; x<16; x+=4){
3662 score+= ABS(s[x ] - s[x +stride]) + ABS(s[x+1] - s[x+1+stride])
3663 +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
/* Vertical SAD of the residual: sums the absolute vertical gradient
 * difference between s1 and s2 over a 16-wide block. */
3671 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3676 for(x=0; x<16; x++){
3677 score+= ABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
3686 #define SQ(a) ((a)*(a))
/* Squared-error variant of vsad_intra16_c: sums the squared vertical
 * gradient of the source over a 16-wide block (dummy is unused). */
3687 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3692 for(x=0; x<16; x+=4){
3693 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3694 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* Squared-error variant of vsad16_c: sums the squared vertical gradient
 * difference between s1 and s2 over a 16-wide block. */
3702 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3707 for(x=0; x<16; x++){
3708 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Instantiate 16x16 wrappers for each 8x8 comparison function — the
 * WARPER8_16_SQ macro (defined elsewhere) presumably sums four 8x8
 * calls over the quadrants of a 16x16 block. */
3717 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3718 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3719 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3721 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3723 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3724 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3725 WARPER8_16_SQ(rd8x8_c, rd16_c)
3726 WARPER8_16_SQ(bit8x8_c, bit16_c)
3728 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* 8x8 jref IDCT + clamped store of the result into dest. */
3730 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3733 put_pixels_clamped_c(block, dest, line_size);
/* 8x8 jref IDCT + clamped add of the result onto dest. */
3735 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3738 add_pixels_clamped_c(block, dest, line_size);
/* 4x4 (lowres 1) variant: IDCT + clamped store. */
3741 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3744 put_pixels_clamped4_c(block, dest, line_size);
/* 4x4 (lowres 1) variant: IDCT + clamped add. */
3746 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3749 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 (lowres 2) variant: IDCT + clamped store. */
3752 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3755 put_pixels_clamped2_c(block, dest, line_size);
/* 2x2 (lowres 2) variant: IDCT + clamped add. */
3757 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3760 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 (lowres 3) "IDCT": only the DC coefficient survives — scale it by
 * 1/8 with rounding and clamp to 0..255 via cropTbl. */
3763 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3765 uint8_t *cm = cropTbl + MAX_NEG_CROP; /* cm[x] clamps x to 0..255 */
3767 dest[0] = cm[(block[0] + 4)>>3];
/* 1x1 (lowres 3) variant of the above, adding onto the existing pixel
 * with clamping. */
3769 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3771 uint8_t *cm = cropTbl + MAX_NEG_CROP; /* cm[x] clamps x to 0..255 */
3773 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3776 static void just_return() { return; }
3778 /* init static data */
/* Builds the global lookup tables: cropTbl (saturating 8-bit clamp),
 * squareTbl (squares of -256..255, biased by 256) and the +1-biased
 * inverse zigzag used by the MMX quantizer.  Must run before any code
 * that indexes these tables. */
3779 void dsputil_static_init(void)
3783 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i; /* identity for in-range values */
3784 for(i=0;i<MAX_NEG_CROP;i++) {
3786 cropTbl[i + MAX_NEG_CROP + 256] = 255; /* overflow region saturates to 255 */
3789 for(i=0;i<512;i++) {
3790 squareTbl[i] = (i - 256) * (i - 256); /* squareTbl[x+256] == x*x */
3793 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1; /* +1 bias per the header comment */
3797 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3801 #ifdef CONFIG_ENCODERS
3802 if(avctx->dct_algo==FF_DCT_FASTINT) {
3803 c->fdct = fdct_ifast;
3804 c->fdct248 = fdct_ifast248;
3806 else if(avctx->dct_algo==FF_DCT_FAAN) {
3807 c->fdct = ff_faandct;
3808 c->fdct248 = ff_faandct248;
3811 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3812 c->fdct248 = ff_fdct248_islow;
3814 #endif //CONFIG_ENCODERS
3816 if(avctx->lowres==1){
3817 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3818 c->idct_put= ff_jref_idct4_put;
3819 c->idct_add= ff_jref_idct4_add;
3821 c->idct_put= ff_h264_lowres_idct_put_c;
3822 c->idct_add= ff_h264_lowres_idct_add_c;
3824 c->idct = j_rev_dct4;
3825 c->idct_permutation_type= FF_NO_IDCT_PERM;
3826 }else if(avctx->lowres==2){
3827 c->idct_put= ff_jref_idct2_put;
3828 c->idct_add= ff_jref_idct2_add;
3829 c->idct = j_rev_dct2;
3830 c->idct_permutation_type= FF_NO_IDCT_PERM;
3831 }else if(avctx->lowres==3){
3832 c->idct_put= ff_jref_idct1_put;
3833 c->idct_add= ff_jref_idct1_add;
3834 c->idct = j_rev_dct1;
3835 c->idct_permutation_type= FF_NO_IDCT_PERM;
3837 if(avctx->idct_algo==FF_IDCT_INT){
3838 c->idct_put= ff_jref_idct_put;
3839 c->idct_add= ff_jref_idct_add;
3840 c->idct = j_rev_dct;
3841 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3842 }else if(avctx->idct_algo==FF_IDCT_VP3){
3843 c->idct_put= ff_vp3_idct_put_c;
3844 c->idct_add= ff_vp3_idct_add_c;
3845 c->idct = ff_vp3_idct_c;
3846 c->idct_permutation_type= FF_NO_IDCT_PERM;
3847 }else{ //accurate/default
3848 c->idct_put= simple_idct_put;
3849 c->idct_add= simple_idct_add;
3850 c->idct = simple_idct;
3851 c->idct_permutation_type= FF_NO_IDCT_PERM;
3855 c->h264_idct_add= ff_h264_idct_add_c;
3856 c->h264_idct8_add= ff_h264_idct8_add_c;
3857 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3858 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3860 c->get_pixels = get_pixels_c;
3861 c->diff_pixels = diff_pixels_c;
3862 c->put_pixels_clamped = put_pixels_clamped_c;
3863 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3864 c->add_pixels_clamped = add_pixels_clamped_c;
3865 c->add_pixels8 = add_pixels8_c;
3866 c->add_pixels4 = add_pixels4_c;
3869 c->clear_blocks = clear_blocks_c;
3870 c->pix_sum = pix_sum_c;
3871 c->pix_norm1 = pix_norm1_c;
3873 /* TODO [0] 16 [1] 8 */
3874 c->pix_abs[0][0] = pix_abs16_c;
3875 c->pix_abs[0][1] = pix_abs16_x2_c;
3876 c->pix_abs[0][2] = pix_abs16_y2_c;
3877 c->pix_abs[0][3] = pix_abs16_xy2_c;
3878 c->pix_abs[1][0] = pix_abs8_c;
3879 c->pix_abs[1][1] = pix_abs8_x2_c;
3880 c->pix_abs[1][2] = pix_abs8_y2_c;
3881 c->pix_abs[1][3] = pix_abs8_xy2_c;
3883 #define dspfunc(PFX, IDX, NUM) \
3884 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3885 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3886 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3887 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3889 dspfunc(put, 0, 16);
3890 dspfunc(put_no_rnd, 0, 16);
3892 dspfunc(put_no_rnd, 1, 8);
3896 dspfunc(avg, 0, 16);
3897 dspfunc(avg_no_rnd, 0, 16);
3899 dspfunc(avg_no_rnd, 1, 8);
3904 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3905 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3907 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3908 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3909 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3910 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3911 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3912 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3913 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3914 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3915 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3917 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3918 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3919 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3920 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3921 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3922 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3923 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3924 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3925 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3927 #define dspfunc(PFX, IDX, NUM) \
3928 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3929 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3930 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3931 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3932 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3933 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3934 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3935 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3936 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3937 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3938 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3939 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3940 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3941 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3942 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3943 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3945 dspfunc(put_qpel, 0, 16);
3946 dspfunc(put_no_rnd_qpel, 0, 16);
3948 dspfunc(avg_qpel, 0, 16);
3949 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3951 dspfunc(put_qpel, 1, 8);
3952 dspfunc(put_no_rnd_qpel, 1, 8);
3954 dspfunc(avg_qpel, 1, 8);
3955 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3957 dspfunc(put_h264_qpel, 0, 16);
3958 dspfunc(put_h264_qpel, 1, 8);
3959 dspfunc(put_h264_qpel, 2, 4);
3960 dspfunc(put_h264_qpel, 3, 2);
3961 dspfunc(avg_h264_qpel, 0, 16);
3962 dspfunc(avg_h264_qpel, 1, 8);
3963 dspfunc(avg_h264_qpel, 2, 4);
3966 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3967 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3968 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3969 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3970 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3971 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3973 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3974 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3975 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3976 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3977 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3978 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3979 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3980 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3981 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3982 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3983 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3984 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3985 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
3986 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
3987 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
3988 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
3989 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
3990 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
3991 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
3992 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
3994 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
3995 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3996 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3997 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3998 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3999 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4000 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4001 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4003 #define SET_CMP_FUNC(name) \
4004 c->name[0]= name ## 16_c;\
4005 c->name[1]= name ## 8x8_c;
4007 SET_CMP_FUNC(hadamard8_diff)
4008 c->hadamard8_diff[4]= hadamard8_intra16_c;
4009 SET_CMP_FUNC(dct_sad)
4010 SET_CMP_FUNC(dct_max)
4012 SET_CMP_FUNC(dct264_sad)
4014 c->sad[0]= pix_abs16_c;
4015 c->sad[1]= pix_abs8_c;
4019 SET_CMP_FUNC(quant_psnr)
4022 c->vsad[0]= vsad16_c;
4023 c->vsad[4]= vsad_intra16_c;
4024 c->vsse[0]= vsse16_c;
4025 c->vsse[4]= vsse_intra16_c;
4026 c->nsse[0]= nsse16_c;
4027 c->nsse[1]= nsse8_c;
4028 c->w53[0]= w53_16_c;
4030 c->w97[0]= w97_16_c;
4033 c->add_bytes= add_bytes_c;
4034 c->diff_bytes= diff_bytes_c;
4035 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4036 c->bswap_buf= bswap_buf;
4038 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4039 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4040 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4041 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4042 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4043 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4045 c->h263_h_loop_filter= h263_h_loop_filter_c;
4046 c->h263_v_loop_filter= h263_v_loop_filter_c;
4048 c->h261_loop_filter= h261_loop_filter_c;
4050 c->try_8x8basis= try_8x8basis_c;
4051 c->add_8x8basis= add_8x8basis_c;
4053 #ifdef CONFIG_SNOW_ENCODER
4054 c->vertical_compose97i = ff_snow_vertical_compose97i;
4055 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4056 c->inner_add_yblock = ff_snow_inner_add_yblock;
4059 c->shrink[0]= ff_img_copy_plane;
4060 c->shrink[1]= ff_shrink22;
4061 c->shrink[2]= ff_shrink44;
4062 c->shrink[3]= ff_shrink88;
4064 c->prefetch= just_return;
4067 dsputil_init_mmx(c, avctx);
4070 dsputil_init_armv4l(c, avctx);
4073 dsputil_init_mlib(c, avctx);
4076 dsputil_init_vis(c,avctx);
4079 dsputil_init_alpha(c, avctx);
4082 dsputil_init_ppc(c, avctx);
4085 dsputil_init_mmi(c, avctx);
4088 dsputil_init_sh4(c,avctx);
4091 switch(c->idct_permutation_type){
4092 case FF_NO_IDCT_PERM:
4094 c->idct_permutation[i]= i;
4096 case FF_LIBMPEG2_IDCT_PERM:
4098 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4100 case FF_SIMPLE_IDCT_PERM:
4102 c->idct_permutation[i]= simple_mmx_permutation[i];
4104 case FF_TRANSPOSE_IDCT_PERM:
4106 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4108 case FF_PARTTRANS_IDCT_PERM:
4110 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4113 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");