3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
36 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
/* Runtime-filled lookup tables (zero here, populated during DSP init —
 * the fill code is not visible in this chunk):
 * - cropTbl: used below as cm = cropTbl + MAX_NEG_CROP to clamp signed
 *   sums into the 0..255 pixel range.
 * - squareTbl: used below as sq = squareTbl + 256 and indexed by pixel
 *   differences; presumably sq[d] == d*d — TODO confirm at the init site. */
38 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
39 uint32_t squareTbl[512] = {0, };
/* Classic 8x8 zigzag scan order (frame DCT): maps scan position to the
 * raster index of the coefficient. Fix: the array terminator was missing. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63,
};
/* Specific zigzag scan for 2-4-8 IDCT. NOTE that unlike the
   specification, we interleave the fields.
   Fix: the array terminator was missing. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
65 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Zeroed here; presumably filled during DSP init from ff_zigzag_direct —
 * TODO confirm, the fill code is not visible in this chunk. */
66 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate horizontal scan order for interlaced material.
 * Fix: the array terminator was missing. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order for interlaced material.
 * Fix: the array terminator was missing. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * (fixed-point reciprocals used to replace division by a table lookup
 * plus a 64-bit multiply). Fix: the array terminator was missing. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input coefficient permutation for simple_idct_mmx: the MMX IDCT expects
 * its coefficients in this shuffled order, so the (de)quantizer writes
 * through this map. A permutation of 0..63.
 * Fix: the array terminator was missing. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Sum of the 256 pixels of a 16x16 block, walking it row by row.
 * NOTE(review): this chunk is truncated — declarations, the inner
 * accumulation and the final return are not visible here. */
138 static int pix_sum_c(uint8_t * pix, int line_size)
143     for (i = 0; i < 16; i++) {
144         for (j = 0; j < 16; j += 8) {
155         pix += line_size - 16;
/* Sum of squares of the 256 pixels of a 16x16 block.
 * Indexes squareTbl with an offset of 256; on targets where long is wider
 * than 32 bits (LONG_MAX > 2147483647) eight pixels are consumed per
 * 64-bit load, otherwise two 32-bit loads per 8 pixels are used.
 * NOTE(review): truncated chunk — the #else/#endif pairing, declarations
 * and the return statement are not visible here. The unaligned casts of
 * pix to uint64_t*/uint32_t* would be strict-aliasing/alignment UB in
 * portable C; presumably guarded by platform assumptions elsewhere. */
160 static int pix_norm1_c(uint8_t * pix, int line_size)
163     uint32_t *sq = squareTbl + 256;
166     for (i = 0; i < 16; i++) {
167         for (j = 0; j < 16; j += 8) {
178 #if LONG_MAX > 2147483647
179             register uint64_t x=*(uint64_t*)pix;
181             s += sq[(x>>8)&0xff];
182             s += sq[(x>>16)&0xff];
183             s += sq[(x>>24)&0xff];
184             s += sq[(x>>32)&0xff];
185             s += sq[(x>>40)&0xff];
186             s += sq[(x>>48)&0xff];
187             s += sq[(x>>56)&0xff];
189             register uint32_t x=*(uint32_t*)pix;
191             s += sq[(x>>8)&0xff];
192             s += sq[(x>>16)&0xff];
193             s += sq[(x>>24)&0xff];
194             x=*(uint32_t*)(pix+4);
196             s += sq[(x>>8)&0xff];
197             s += sq[(x>>16)&0xff];
198             s += sq[(x>>24)&0xff];
203         pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst (may alias), unrolled by 8;
 * the final line handles the remaining tail words one at a time.
 * NOTE(review): truncated chunk — the tail-loop header and closing braces
 * are not visible here. */
208 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
211     for(i=0; i+8<=w; i+=8){
212         dst[i+0]= bswap_32(src[i+0]);
213         dst[i+1]= bswap_32(src[i+1]);
214         dst[i+2]= bswap_32(src[i+2]);
215         dst[i+3]= bswap_32(src[i+3]);
216         dst[i+4]= bswap_32(src[i+4]);
217         dst[i+5]= bswap_32(src[i+5]);
218         dst[i+6]= bswap_32(src[i+6]);
219         dst[i+7]= bswap_32(src[i+7]);
222         dst[i+0]= bswap_32(src[i+0]);
226 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
229 uint32_t *sq = squareTbl + 256;
232 for (i = 0; i < h; i++) {
233 s += sq[pix1[0] - pix2[0]];
234 s += sq[pix1[1] - pix2[1]];
235 s += sq[pix1[2] - pix2[2]];
236 s += sq[pix1[3] - pix2[3]];
243 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
246 uint32_t *sq = squareTbl + 256;
249 for (i = 0; i < h; i++) {
250 s += sq[pix1[0] - pix2[0]];
251 s += sq[pix1[1] - pix2[1]];
252 s += sq[pix1[2] - pix2[2]];
253 s += sq[pix1[3] - pix2[3]];
254 s += sq[pix1[4] - pix2[4]];
255 s += sq[pix1[5] - pix2[5]];
256 s += sq[pix1[6] - pix2[6]];
257 s += sq[pix1[7] - pix2[7]];
264 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
267 uint32_t *sq = squareTbl + 256;
270 for (i = 0; i < h; i++) {
271 s += sq[pix1[ 0] - pix2[ 0]];
272 s += sq[pix1[ 1] - pix2[ 1]];
273 s += sq[pix1[ 2] - pix2[ 2]];
274 s += sq[pix1[ 3] - pix2[ 3]];
275 s += sq[pix1[ 4] - pix2[ 4]];
276 s += sq[pix1[ 5] - pix2[ 5]];
277 s += sq[pix1[ 6] - pix2[ 6]];
278 s += sq[pix1[ 7] - pix2[ 7]];
279 s += sq[pix1[ 8] - pix2[ 8]];
280 s += sq[pix1[ 9] - pix2[ 9]];
281 s += sq[pix1[10] - pix2[10]];
282 s += sq[pix1[11] - pix2[11]];
283 s += sq[pix1[12] - pix2[12]];
284 s += sq[pix1[13] - pix2[13]];
285 s += sq[pix1[14] - pix2[14]];
286 s += sq[pix1[15] - pix2[15]];
/* Wavelet-domain distortion metric for the Snow encoder: the pixel
 * differences (scaled by 16) are transformed with ff_spatial_dwt into a
 * 32-stride temp buffer, then each subband (level/orientation) is
 * accumulated with a per-band weight from the scale table.
 * type selects 9/7 (0) vs 5/3 (1) wavelet; 8-wide blocks use 3
 * decomposition levels, larger ones 4.
 * NOTE(review): truncated chunk — most of the scale table, the tmp
 * declaration and the final accumulation/return are not visible here. */
295 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
296 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
298     const int dec_count= w==8 ? 3 : 4;
301     static const int scale[2][2][4][4]={
305             {268, 239, 239, 213},
309             // 9/7 16x16 or 32x32 dec=4
310             {344, 310, 310, 280},
318             {275, 245, 245, 218},
322             // 5/3 16x16 or 32x32 dec=4
323             {352, 317, 317, 286},
331     for (i = 0; i < h; i++) {
332         for (j = 0; j < w; j+=4) {
333             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
334             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
335             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
336             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
342     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
346     for(level=0; level<dec_count; level++){
347         for(ori= level ? 1 : 0; ori<4; ori++){
348             int size= w>>(dec_count-level);
349             int sx= (ori&1) ? size : 0;
350             int stride= 32<<(dec_count-level);
351             int sy= (ori&2) ? stride>>1 : 0;
353             for(i=0; i<size; i++){
354                 for(j=0; j<size; j++){
355                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* 5/3-wavelet distortion on an 8-wide block (see w_c). Fix: missing brace. */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
/* 9/7-wavelet distortion on an 8-wide block (see w_c). Fix: missing brace. */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
/* 5/3-wavelet distortion on a 16-wide block (see w_c). Fix: missing brace. */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
/* 9/7-wavelet distortion on a 16-wide block (see w_c). Fix: missing brace. */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
/* 5/3-wavelet distortion on a 32-wide block (see w_c); exported (non-static).
 * Fix: missing brace. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
/* 9/7-wavelet distortion on a 32-wide block (see w_c); exported (non-static).
 * Fix: missing brace. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
/* Widen an 8-wide row of pixels into DCTELEM coefficients (one row shown;
 * the enclosing 8-row loop and pointer advances are not visible in this
 * truncated chunk). */
390 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
394     /* read the pixels */
396         block[0] = pixels[0];
397         block[1] = pixels[1];
398         block[2] = pixels[2];
399         block[3] = pixels[3];
400         block[4] = pixels[4];
401         block[5] = pixels[5];
402         block[6] = pixels[6];
403         block[7] = pixels[7];
/* Per-pixel difference s1-s2 of an 8-wide row into DCTELEM coefficients
 * (enclosing row loop not visible in this truncated chunk). */
409 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
410                           const uint8_t *s2, int stride){
413     /* read the pixels */
415         block[0] = s1[0] - s2[0];
416         block[1] = s1[1] - s2[1];
417         block[2] = s1[2] - s2[2];
418         block[3] = s1[3] - s2[3];
419         block[4] = s1[4] - s2[4];
420         block[5] = s1[5] - s2[5];
421         block[6] = s1[6] - s2[6];
422         block[7] = s1[7] - s2[7];
/* Store an 8-wide row of DCTELEM coefficients as pixels, clamped to 0..255
 * via the cropTbl lookup (cm is biased by MAX_NEG_CROP so negative
 * coefficients index validly). Enclosing row loop not visible here. */
430 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
434     uint8_t *cm = cropTbl + MAX_NEG_CROP;
436     /* read the pixels */
438         pixels[0] = cm[block[0]];
439         pixels[1] = cm[block[1]];
440         pixels[2] = cm[block[2]];
441         pixels[3] = cm[block[3]];
442         pixels[4] = cm[block[4]];
443         pixels[5] = cm[block[5]];
444         pixels[6] = cm[block[6]];
445         pixels[7] = cm[block[7]];
/* 4-wide variant of put_pixels_clamped_c (truncated chunk: row loop and
 * pointer advances not visible). */
452 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
456     uint8_t *cm = cropTbl + MAX_NEG_CROP;
458     /* read the pixels */
460         pixels[0] = cm[block[0]];
461         pixels[1] = cm[block[1]];
462         pixels[2] = cm[block[2]];
463         pixels[3] = cm[block[3]];
/* 2-wide variant of put_pixels_clamped_c (truncated chunk: row loop and
 * pointer advances not visible). */
470 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
474     uint8_t *cm = cropTbl + MAX_NEG_CROP;
476     /* read the pixels */
478         pixels[0] = cm[block[0]];
479         pixels[1] = cm[block[1]];
/* Store an 8x8 block of signed DCTELEM values as pixels: each value is
 * offset by +128 and clamped (the visible branch handles *block > 127;
 * the symmetric < -128 branch is not visible in this truncated chunk). */
486 static void put_signed_pixels_clamped_c(const DCTELEM *block,
487                                         uint8_t *restrict pixels,
492     for (i = 0; i < 8; i++) {
493         for (j = 0; j < 8; j++) {
496             else if (*block > 127)
499                 *pixels = (uint8_t)(*block + 128);
503         pixels += (line_size - 8);
/* Add an 8-wide row of DCTELEM residuals onto existing pixels, clamping
 * the sums to 0..255 via cropTbl (IDCT add step). Enclosing row loop not
 * visible in this truncated chunk. */
507 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
511     uint8_t *cm = cropTbl + MAX_NEG_CROP;
513     /* read the pixels */
515         pixels[0] = cm[pixels[0] + block[0]];
516         pixels[1] = cm[pixels[1] + block[1]];
517         pixels[2] = cm[pixels[2] + block[2]];
518         pixels[3] = cm[pixels[3] + block[3]];
519         pixels[4] = cm[pixels[4] + block[4]];
520         pixels[5] = cm[pixels[5] + block[5]];
521         pixels[6] = cm[pixels[6] + block[6]];
522         pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of add_pixels_clamped_c (truncated chunk: row loop and
 * pointer advances not visible). */
528 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
532     uint8_t *cm = cropTbl + MAX_NEG_CROP;
534     /* read the pixels */
536         pixels[0] = cm[pixels[0] + block[0]];
537         pixels[1] = cm[pixels[1] + block[1]];
538         pixels[2] = cm[pixels[2] + block[2]];
539         pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of add_pixels_clamped_c (truncated chunk: row loop and
 * pointer advances not visible). */
545 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
549     uint8_t *cm = cropTbl + MAX_NEG_CROP;
551     /* read the pixels */
553         pixels[0] = cm[pixels[0] + block[0]];
554         pixels[1] = cm[pixels[1] + block[1]];
/* Add an 8-wide row of residuals onto pixels WITHOUT clamping (relies on
 * the caller keeping sums in range). Enclosing row loop not visible in
 * this truncated chunk. */
560 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
564         pixels[0] += block[0];
565         pixels[1] += block[1];
566         pixels[2] += block[2];
567         pixels[3] += block[3];
568         pixels[4] += block[4];
569         pixels[5] += block[5];
570         pixels[6] += block[6];
571         pixels[7] += block[7];
/* 4-wide variant of add_pixels8_c, no clamping (truncated chunk: row loop
 * not visible). */
577 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
581         pixels[0] += block[0];
582         pixels[1] += block[1];
583         pixels[2] += block[2];
584         pixels[3] += block[3];
/* PIXOP2, 64-bit variant: generates put/avg pixel-copy and half-pel
 * interpolation primitives (x2 = horizontal half-pel, y2 = vertical,
 * xy2 = diagonal) operating on whole 8-byte rows via LD64 and SWAR bit
 * tricks. (a|b)-(((a^b)&0xFE..)>>1) is a per-byte rounding average;
 * (a&b)+(((a^b)&0xFE..)>>1) the non-rounding one; the xy2 forms split
 * each byte into low 2 bits (l0/l1) and high 6 bits (h0/h1) to average
 * four neighbours. CALL_2X_PIXELS widens the 8-wide ops to 16.
 * NOTE(review): truncated chunk — loop headers, pointer advances and
 * several closing lines of each generated function are not visible. */
592 #define PIXOP2(OPNAME, OP) \
593 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
597         OP(*((uint64_t*)block), LD64(pixels));\
603 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
607         const uint64_t a= LD64(pixels  );\
608         const uint64_t b= LD64(pixels+1);\
609         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
615 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
619         const uint64_t a= LD64(pixels  );\
620         const uint64_t b= LD64(pixels+1);\
621         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
627 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
631         const uint64_t a= LD64(pixels          );\
632         const uint64_t b= LD64(pixels+line_size);\
633         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
639 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
643         const uint64_t a= LD64(pixels          );\
644         const uint64_t b= LD64(pixels+line_size);\
645         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
651 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
654         const uint64_t a= LD64(pixels  );\
655         const uint64_t b= LD64(pixels+1);\
656         uint64_t l0=  (a&0x0303030303030303ULL)\
657                     + (b&0x0303030303030303ULL)\
658                     + 0x0202020202020202ULL;\
659         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
660                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
664         for(i=0; i<h; i+=2){\
665             uint64_t a= LD64(pixels  );\
666             uint64_t b= LD64(pixels+1);\
667             l1=  (a&0x0303030303030303ULL)\
668                + (b&0x0303030303030303ULL);\
669             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
670               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
671             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
676             l0=  (a&0x0303030303030303ULL)\
677                + (b&0x0303030303030303ULL)\
678                + 0x0202020202020202ULL;\
679             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
680               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
681             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
687 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
690         const uint64_t a= LD64(pixels  );\
691         const uint64_t b= LD64(pixels+1);\
692         uint64_t l0=  (a&0x0303030303030303ULL)\
693                     + (b&0x0303030303030303ULL)\
694                     + 0x0101010101010101ULL;\
695         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
696                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
700         for(i=0; i<h; i+=2){\
701             uint64_t a= LD64(pixels  );\
702             uint64_t b= LD64(pixels+1);\
703             l1=  (a&0x0303030303030303ULL)\
704                + (b&0x0303030303030303ULL);\
705             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
706               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
707             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
712             l0=  (a&0x0303030303030303ULL)\
713                + (b&0x0303030303030303ULL)\
714                + 0x0101010101010101ULL;\
715             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
716               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
717             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
723 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
724 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
725 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
726 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
727 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
728 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
729 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* op_avg: per-byte rounding average of dst and src in one 64-bit word. */
731 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
732 #else // 64 bit variant
/* PIXOP2, 32-bit variant: same family of put/avg copy and half-pel
 * interpolators as the 64-bit branch above, but built from 32-bit loads
 * (LD32/LD16) and the rnd_avg32/no_rnd_avg32 helpers. The _l2 forms
 * average two sources with independent strides; the _l4 forms average
 * four (used for quarter-pel); the xy2 forms use the same 2-bit/6-bit
 * SWAR split per byte. CALL_2X_PIXELS widens 8-wide ops to 16.
 * NOTE(review): truncated chunk — many loop headers, declarations,
 * pointer advances and closing lines of the generated functions are not
 * visible here; do not restyle without the full text. */
734 #define PIXOP2(OPNAME, OP) \
735 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
738         OP(*((uint16_t*)(block  )), LD16(pixels  ));\
743 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
746         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
751 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
754         OP(*((uint32_t*)(block  )), LD32(pixels  ));\
755         OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
760 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
761     OPNAME ## _pixels8_c(block, pixels, line_size, h);\
764 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
765                                                 int src_stride1, int src_stride2, int h){\
769         a= LD32(&src1[i*src_stride1  ]);\
770         b= LD32(&src2[i*src_stride2  ]);\
771         OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
772         a= LD32(&src1[i*src_stride1+4]);\
773         b= LD32(&src2[i*src_stride2+4]);\
774         OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
778 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
779                                                 int src_stride1, int src_stride2, int h){\
783         a= LD32(&src1[i*src_stride1  ]);\
784         b= LD32(&src2[i*src_stride2  ]);\
785         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
786         a= LD32(&src1[i*src_stride1+4]);\
787         b= LD32(&src2[i*src_stride2+4]);\
788         OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
792 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
793                                                 int src_stride1, int src_stride2, int h){\
797         a= LD32(&src1[i*src_stride1  ]);\
798         b= LD32(&src2[i*src_stride2  ]);\
799         OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
803 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
804                                                 int src_stride1, int src_stride2, int h){\
808         a= LD16(&src1[i*src_stride1  ]);\
809         b= LD16(&src2[i*src_stride2  ]);\
810         OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
814 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
815                                                 int src_stride1, int src_stride2, int h){\
816     OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
817     OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
820 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
821                                                 int src_stride1, int src_stride2, int h){\
822     OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
823     OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
826 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
827     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
830 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
831     OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
834 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
835     OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
838 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
839     OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
842 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
843                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
846         uint32_t a, b, c, d, l0, l1, h0, h1;\
847         a= LD32(&src1[i*src_stride1]);\
848         b= LD32(&src2[i*src_stride2]);\
849         c= LD32(&src3[i*src_stride3]);\
850         d= LD32(&src4[i*src_stride4]);\
851         l0=  (a&0x03030303UL)\
854         h0= ((a&0xFCFCFCFCUL)>>2)\
855           + ((b&0xFCFCFCFCUL)>>2);\
856         l1=  (c&0x03030303UL)\
858         h1= ((c&0xFCFCFCFCUL)>>2)\
859           + ((d&0xFCFCFCFCUL)>>2);\
860         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
861         a= LD32(&src1[i*src_stride1+4]);\
862         b= LD32(&src2[i*src_stride2+4]);\
863         c= LD32(&src3[i*src_stride3+4]);\
864         d= LD32(&src4[i*src_stride4+4]);\
865         l0=  (a&0x03030303UL)\
868         h0= ((a&0xFCFCFCFCUL)>>2)\
869           + ((b&0xFCFCFCFCUL)>>2);\
870         l1=  (c&0x03030303UL)\
872         h1= ((c&0xFCFCFCFCUL)>>2)\
873           + ((d&0xFCFCFCFCUL)>>2);\
874         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
878 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
879     OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
882 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
883     OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
886 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
887     OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
890 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
891     OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
894 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
895                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
898         uint32_t a, b, c, d, l0, l1, h0, h1;\
899         a= LD32(&src1[i*src_stride1]);\
900         b= LD32(&src2[i*src_stride2]);\
901         c= LD32(&src3[i*src_stride3]);\
902         d= LD32(&src4[i*src_stride4]);\
903         l0=  (a&0x03030303UL)\
906         h0= ((a&0xFCFCFCFCUL)>>2)\
907           + ((b&0xFCFCFCFCUL)>>2);\
908         l1=  (c&0x03030303UL)\
910         h1= ((c&0xFCFCFCFCUL)>>2)\
911           + ((d&0xFCFCFCFCUL)>>2);\
912         OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
913         a= LD32(&src1[i*src_stride1+4]);\
914         b= LD32(&src2[i*src_stride2+4]);\
915         c= LD32(&src3[i*src_stride3+4]);\
916         d= LD32(&src4[i*src_stride4+4]);\
917         l0=  (a&0x03030303UL)\
920         h0= ((a&0xFCFCFCFCUL)>>2)\
921           + ((b&0xFCFCFCFCUL)>>2);\
922         l1=  (c&0x03030303UL)\
924         h1= ((c&0xFCFCFCFCUL)>>2)\
925           + ((d&0xFCFCFCFCUL)>>2);\
926         OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
929 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
930                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
931     OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
932     OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
934 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
935                  int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
936     OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937     OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
940 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
942     int i, a0, b0, a1, b1;\
949     for(i=0; i<h; i+=2){\
955         block[0]= (a1+a0)>>2; /* FIXME non put */\
956         block[1]= (b1+b0)>>2;\
966         block[0]= (a1+a0)>>2;\
967         block[1]= (b1+b0)>>2;\
973 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
976     const uint32_t a= LD32(pixels  );\
977     const uint32_t b= LD32(pixels+1);\
978     uint32_t l0=  (a&0x03030303UL)\
981     uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
982                + ((b&0xFCFCFCFCUL)>>2);\
986     for(i=0; i<h; i+=2){\
987         uint32_t a= LD32(pixels  );\
988         uint32_t b= LD32(pixels+1);\
989         l1=  (a&0x03030303UL)\
991         h1= ((a&0xFCFCFCFCUL)>>2)\
992           + ((b&0xFCFCFCFCUL)>>2);\
993         OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
998         l0=  (a&0x03030303UL)\
1001         h0= ((a&0xFCFCFCFCUL)>>2)\
1002            + ((b&0xFCFCFCFCUL)>>2);\
1003         OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1009 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1012     for(j=0; j<2; j++){\
1014         const uint32_t a= LD32(pixels  );\
1015         const uint32_t b= LD32(pixels+1);\
1016         uint32_t l0=  (a&0x03030303UL)\
1019         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1020                    + ((b&0xFCFCFCFCUL)>>2);\
1024         for(i=0; i<h; i+=2){\
1025             uint32_t a= LD32(pixels  );\
1026             uint32_t b= LD32(pixels+1);\
1027             l1=  (a&0x03030303UL)\
1028                + (b&0x03030303UL);\
1029             h1= ((a&0xFCFCFCFCUL)>>2)\
1030               + ((b&0xFCFCFCFCUL)>>2);\
1031             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1036             l0=  (a&0x03030303UL)\
1039             h0= ((a&0xFCFCFCFCUL)>>2)\
1040               + ((b&0xFCFCFCFCUL)>>2);\
1041             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1045         pixels+=4-line_size*(h+1);\
1046         block +=4-line_size*h;\
1050 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1053     for(j=0; j<2; j++){\
1055         const uint32_t a= LD32(pixels  );\
1056         const uint32_t b= LD32(pixels+1);\
1057         uint32_t l0=  (a&0x03030303UL)\
1060         uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1061                    + ((b&0xFCFCFCFCUL)>>2);\
1065         for(i=0; i<h; i+=2){\
1066             uint32_t a= LD32(pixels  );\
1067             uint32_t b= LD32(pixels+1);\
1068             l1=  (a&0x03030303UL)\
1069                + (b&0x03030303UL);\
1070             h1= ((a&0xFCFCFCFCUL)>>2)\
1071               + ((b&0xFCFCFCFCUL)>>2);\
1072             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1077             l0=  (a&0x03030303UL)\
1080             h0= ((a&0xFCFCFCFCUL)>>2)\
1081               + ((b&0xFCFCFCFCUL)>>2);\
1082             OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086         pixels+=4-line_size*(h+1);\
1087         block +=4-line_size*h;\
1091 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1092 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1093 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1094 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1095 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1096 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1098 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* Operation plugged into PIXOP2: op_avg blends the new value into the
 * destination with per-byte rounding average, op_put overwrites.
 * avg2/avg4 are scalar rounding averages of 2 and 4 pixel values. */
1100 #define op_avg(a, b) a = rnd_avg32(a, b)
1102 #define op_put(a, b) a = b
1109 #define avg2(a,b) ((a+b+1)>>1)
1110 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Non-rounding average of two 16-wide sources with equal strides
 * (thin equal-stride wrapper over the macro-generated _l2 primitive).
 * Fix: missing closing brace. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
/* Non-rounding average of two 8-wide sources with equal strides
 * (thin equal-stride wrapper over the macro-generated _l2 primitive).
 * Fix: missing closing brace. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/* One-point GMC: bilinear interpolation at a fixed 1/16-pel offset
 * (x16, y16 in 0..16). The four weights A..D sum to 256, so >>8 after
 * adding the rounder normalizes. One 8-wide row is visible; the row loop,
 * pointer advances and any special cases are not in this truncated chunk. */
1120 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1122     const int A=(16-x16)*(16-y16);
1123     const int B=(   x16)*(16-y16);
1124     const int C=(16-x16)*(   y16);
1125     const int D=(   x16)*(   y16);
1130         dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1131         dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1132         dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1133         dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1134         dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1135         dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1136         dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1137         dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General (affine) global motion compensation: for each destination pixel
 * the sub-pel source position is derived from (ox,oy) plus the dxx/dxy/
 * dyx/dyy gradients; frac_x/frac_y are the fractional parts (s = 1<<shift
 * sub-pel units). Four cases by whether src_x/src_y fall inside the
 * picture: full bilinear, horizontal-only, vertical-only, or a single
 * clipped sample at the border. NOTE(review): truncated chunk — the outer
 * y loop, the src_x/src_y derivation lines and several closing braces are
 * not visible here. */
1143 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1144               int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1147     const int s= 1<<shift;
1157         for(x=0; x<8; x++){ //XXX FIXME optimize
1158             int src_x, src_y, frac_x, frac_y, index;
1162             frac_x= src_x&(s-1);
1163             frac_y= src_y&(s-1);
1167             if((unsigned)src_x < width){
1168                 if((unsigned)src_y < height){
1169                     index= src_x + src_y*stride;
1170                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_x)
1171                                            + src[index       +1]*   frac_x )*(s-frac_y)
1172                                         + (  src[index+stride  ]*(s-frac_x)
1173                                            + src[index+stride+1]*   frac_x )*   frac_y
1176                     index= src_x + clip(src_y, 0, height)*stride;
1177                     dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
1178                                           + src[index       +1]*   frac_x )*s
1182                 if((unsigned)src_y < height){
1183                     index= clip(src_x, 0, width) + src_y*stride;
1184                     dst[y*stride + x]= (  (  src[index         ]*(s-frac_y)
1185                                            + src[index+stride  ]*   frac_y )*s
1188                     index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
1189                     dst[y*stride + x]=    src[index         ];
/* Third-pel MC, no sub-pel offset: plain copy, dispatched on block width.
 * Fix: restored the truncated switch scaffolding and closing braces. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/* Third-pel MC, horizontal offset 1/3: weighted average of src[j] (2/3)
 * and src[j+1] (1/3); 683/2048 approximates 1/3 with rounding.
 * Fix: restored the truncated declarations, stride advance and braces. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, horizontal offset 2/3: weights 1/3 on src[j], 2/3 on
 * src[j+1]. Fix: restored the truncated declarations, stride advance
 * and braces. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, vertical offset 1/3: weights 2/3 on src[j], 1/3 on the
 * row below. Fix: restored the truncated declarations, stride advance
 * and braces. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
1243 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1245 for (i=0; i < height; i++) {
1246 for (j=0; j < width; j++) {
1247 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
1254 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1256 for (i=0; i < height; i++) {
1257 for (j=0; j < width; j++) {
1258 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1265 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1267 for (i=0; i < height; i++) {
1268 for (j=0; j < width; j++) {
1269 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
1276 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1278 for (i=0; i < height; i++) {
1279 for (j=0; j < width; j++) {
1280 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
1287 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1289 for (i=0; i < height; i++) {
1290 for (j=0; j < width; j++) {
1291 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
1298 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1300 case 2: avg_pixels2_c (dst, src, stride, height); break;
1301 case 4: avg_pixels4_c (dst, src, stride, height); break;
1302 case 8: avg_pixels8_c (dst, src, stride, height); break;
1303 case 16:avg_pixels16_c(dst, src, stride, height); break;
1307 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1309 for (i=0; i < height; i++) {
1310 for (j=0; j < width; j++) {
1311 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
1318 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1320 for (i=0; i < height; i++) {
1321 for (j=0; j < width; j++) {
1322 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
1329 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1331 for (i=0; i < height; i++) {
1332 for (j=0; j < width; j++) {
1333 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
1340 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1342 for (i=0; i < height; i++) {
1343 for (j=0; j < width; j++) {
1344 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1351 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1353 for (i=0; i < height; i++) {
1354 for (j=0; j < width; j++) {
1355 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1362 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1364 for (i=0; i < height; i++) {
1365 for (j=0; j < width; j++) {
1366 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
1373 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1375 for (i=0; i < height; i++) {
1376 for (j=0; j < width; j++) {
1377 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1384 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1386 for (i=0; i < height; i++) {
1387 for (j=0; j < width; j++) {
1388 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
1395 #define TPEL_WIDTH(width)\
1396 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1397 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1398 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1399 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1400 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1401 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1402 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1403 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1404 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1405 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1406 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1407 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1408 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1409 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1410 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1411 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1412 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1413 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
1416 #define H264_CHROMA_MC(OPNAME, OP)\
1417 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1418 const int A=(8-x)*(8-y);\
1419 const int B=( x)*(8-y);\
1420 const int C=(8-x)*( y);\
1421 const int D=( x)*( y);\
1424 assert(x<8 && y<8 && x>=0 && y>=0);\
1428 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1429 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1435 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1436 const int A=(8-x)*(8-y);\
1437 const int B=( x)*(8-y);\
1438 const int C=(8-x)*( y);\
1439 const int D=( x)*( y);\
1442 assert(x<8 && y<8 && x>=0 && y>=0);\
1446 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1447 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1448 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1449 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1455 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1456 const int A=(8-x)*(8-y);\
1457 const int B=( x)*(8-y);\
1458 const int C=(8-x)*( y);\
1459 const int D=( x)*( y);\
1462 assert(x<8 && y<8 && x>=0 && y>=0);\
1466 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1467 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1468 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1469 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1470 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1471 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1472 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1473 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1479 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1480 #define op_put(a, b) a = (((b) + 32)>>6)
1482 H264_CHROMA_MC(put_ , op_put)
1483 H264_CHROMA_MC(avg_ , op_avg)
1487 static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1492 ST16(dst , LD16(src ));
1498 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1503 ST32(dst , LD32(src ));
1509 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1514 ST32(dst , LD32(src ));
1515 ST32(dst+4 , LD32(src+4 ));
1521 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1526 ST32(dst , LD32(src ));
1527 ST32(dst+4 , LD32(src+4 ));
1528 ST32(dst+8 , LD32(src+8 ));
1529 ST32(dst+12, LD32(src+12));
1535 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1540 ST32(dst , LD32(src ));
1541 ST32(dst+4 , LD32(src+4 ));
1542 ST32(dst+8 , LD32(src+8 ));
1543 ST32(dst+12, LD32(src+12));
1550 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1555 ST32(dst , LD32(src ));
1556 ST32(dst+4 , LD32(src+4 ));
1564 #define QPEL_MC(r, OPNAME, RND, OP) \
1565 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1566 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1570 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1571 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1572 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1573 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1574 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1575 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1576 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1577 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1583 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1585 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1589 const int src0= src[0*srcStride];\
1590 const int src1= src[1*srcStride];\
1591 const int src2= src[2*srcStride];\
1592 const int src3= src[3*srcStride];\
1593 const int src4= src[4*srcStride];\
1594 const int src5= src[5*srcStride];\
1595 const int src6= src[6*srcStride];\
1596 const int src7= src[7*srcStride];\
1597 const int src8= src[8*srcStride];\
1598 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1599 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1600 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1601 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1602 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1603 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1604 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1605 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1611 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1612 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1617 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1618 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1619 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1620 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1621 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1622 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1623 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1624 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1625 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1626 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1627 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1628 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1629 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1630 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1631 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1632 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1638 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1639 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1644 const int src0= src[0*srcStride];\
1645 const int src1= src[1*srcStride];\
1646 const int src2= src[2*srcStride];\
1647 const int src3= src[3*srcStride];\
1648 const int src4= src[4*srcStride];\
1649 const int src5= src[5*srcStride];\
1650 const int src6= src[6*srcStride];\
1651 const int src7= src[7*srcStride];\
1652 const int src8= src[8*srcStride];\
1653 const int src9= src[9*srcStride];\
1654 const int src10= src[10*srcStride];\
1655 const int src11= src[11*srcStride];\
1656 const int src12= src[12*srcStride];\
1657 const int src13= src[13*srcStride];\
1658 const int src14= src[14*srcStride];\
1659 const int src15= src[15*srcStride];\
1660 const int src16= src[16*srcStride];\
1661 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1662 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1663 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1664 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1665 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1666 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1667 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1668 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1669 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1670 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1671 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1672 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1673 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1674 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1675 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1676 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1682 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1683 OPNAME ## pixels8_c(dst, src, stride, 8);\
1686 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1688 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1689 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1692 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1693 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1696 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1698 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1699 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1702 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1703 uint8_t full[16*9];\
1705 copy_block9(full, src, 16, stride, 9);\
1706 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1707 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1710 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1711 uint8_t full[16*9];\
1712 copy_block9(full, src, 16, stride, 9);\
1713 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1716 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1717 uint8_t full[16*9];\
1719 copy_block9(full, src, 16, stride, 9);\
1720 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1721 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1723 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1724 uint8_t full[16*9];\
1727 uint8_t halfHV[64];\
1728 copy_block9(full, src, 16, stride, 9);\
1729 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1730 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1731 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1732 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1734 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1735 uint8_t full[16*9];\
1737 uint8_t halfHV[64];\
1738 copy_block9(full, src, 16, stride, 9);\
1739 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1740 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1741 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1742 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1744 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1745 uint8_t full[16*9];\
1748 uint8_t halfHV[64];\
1749 copy_block9(full, src, 16, stride, 9);\
1750 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1751 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1752 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1753 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1755 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1756 uint8_t full[16*9];\
1758 uint8_t halfHV[64];\
1759 copy_block9(full, src, 16, stride, 9);\
1760 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1761 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1762 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1763 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1765 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1766 uint8_t full[16*9];\
1769 uint8_t halfHV[64];\
1770 copy_block9(full, src, 16, stride, 9);\
1771 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1772 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1773 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1774 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1776 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1777 uint8_t full[16*9];\
1779 uint8_t halfHV[64];\
1780 copy_block9(full, src, 16, stride, 9);\
1781 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1782 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1783 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1784 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1786 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1787 uint8_t full[16*9];\
1790 uint8_t halfHV[64];\
1791 copy_block9(full, src, 16, stride, 9);\
1792 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1793 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1794 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1795 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1797 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1798 uint8_t full[16*9];\
1800 uint8_t halfHV[64];\
1801 copy_block9(full, src, 16, stride, 9);\
1802 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1803 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1804 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1805 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1807 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1809 uint8_t halfHV[64];\
1810 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1811 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1812 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1814 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1816 uint8_t halfHV[64];\
1817 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1818 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1819 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1821 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1822 uint8_t full[16*9];\
1825 uint8_t halfHV[64];\
1826 copy_block9(full, src, 16, stride, 9);\
1827 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1828 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1829 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1830 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1832 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1833 uint8_t full[16*9];\
1835 copy_block9(full, src, 16, stride, 9);\
1836 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1837 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1838 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1840 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1841 uint8_t full[16*9];\
1844 uint8_t halfHV[64];\
1845 copy_block9(full, src, 16, stride, 9);\
1846 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1847 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1848 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1849 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1851 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1852 uint8_t full[16*9];\
1854 copy_block9(full, src, 16, stride, 9);\
1855 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1856 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1857 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1859 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1861 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1862 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1864 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1865 OPNAME ## pixels16_c(dst, src, stride, 16);\
1868 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1870 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1871 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1874 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1875 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1878 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1880 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1881 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1884 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1885 uint8_t full[24*17];\
1887 copy_block17(full, src, 24, stride, 17);\
1888 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1889 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1892 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1893 uint8_t full[24*17];\
1894 copy_block17(full, src, 24, stride, 17);\
1895 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1898 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1899 uint8_t full[24*17];\
1901 copy_block17(full, src, 24, stride, 17);\
1902 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1903 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1905 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1906 uint8_t full[24*17];\
1907 uint8_t halfH[272];\
1908 uint8_t halfV[256];\
1909 uint8_t halfHV[256];\
1910 copy_block17(full, src, 24, stride, 17);\
1911 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1912 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1913 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1914 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1916 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1917 uint8_t full[24*17];\
1918 uint8_t halfH[272];\
1919 uint8_t halfHV[256];\
1920 copy_block17(full, src, 24, stride, 17);\
1921 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1922 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1923 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1924 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1926 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1927 uint8_t full[24*17];\
1928 uint8_t halfH[272];\
1929 uint8_t halfV[256];\
1930 uint8_t halfHV[256];\
1931 copy_block17(full, src, 24, stride, 17);\
1932 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1933 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1934 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1935 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1937 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1938 uint8_t full[24*17];\
1939 uint8_t halfH[272];\
1940 uint8_t halfHV[256];\
1941 copy_block17(full, src, 24, stride, 17);\
1942 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1943 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1944 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1945 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1947 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1948 uint8_t full[24*17];\
1949 uint8_t halfH[272];\
1950 uint8_t halfV[256];\
1951 uint8_t halfHV[256];\
1952 copy_block17(full, src, 24, stride, 17);\
1953 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1954 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1955 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1956 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1958 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1959 uint8_t full[24*17];\
1960 uint8_t halfH[272];\
1961 uint8_t halfHV[256];\
1962 copy_block17(full, src, 24, stride, 17);\
1963 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1964 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1965 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1966 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1968 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1969 uint8_t full[24*17];\
1970 uint8_t halfH[272];\
1971 uint8_t halfV[256];\
1972 uint8_t halfHV[256];\
1973 copy_block17(full, src, 24, stride, 17);\
1974 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1975 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1976 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1977 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1979 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1980 uint8_t full[24*17];\
1981 uint8_t halfH[272];\
1982 uint8_t halfHV[256];\
1983 copy_block17(full, src, 24, stride, 17);\
1984 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1985 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1986 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1987 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1989 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1990 uint8_t halfH[272];\
1991 uint8_t halfHV[256];\
1992 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1993 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1994 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1996 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1997 uint8_t halfH[272];\
1998 uint8_t halfHV[256];\
1999 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2000 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2001 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2003 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2004 uint8_t full[24*17];\
2005 uint8_t halfH[272];\
2006 uint8_t halfV[256];\
2007 uint8_t halfHV[256];\
2008 copy_block17(full, src, 24, stride, 17);\
2009 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2010 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2011 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2012 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2014 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2015 uint8_t full[24*17];\
2016 uint8_t halfH[272];\
2017 copy_block17(full, src, 24, stride, 17);\
2018 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2019 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2020 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2022 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2023 uint8_t full[24*17];\
2024 uint8_t halfH[272];\
2025 uint8_t halfV[256];\
2026 uint8_t halfHV[256];\
2027 copy_block17(full, src, 24, stride, 17);\
2028 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2029 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2030 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2031 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2033 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2034 uint8_t full[24*17];\
2035 uint8_t halfH[272];\
2036 copy_block17(full, src, 24, stride, 17);\
2037 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2038 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2039 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2041 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2042 uint8_t halfH[272];\
2043 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2044 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2047 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2048 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2049 #define op_put(a, b) a = cm[((b) + 16)>>5]
2050 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2052 QPEL_MC(0, put_ , _ , op_put)
2053 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2054 QPEL_MC(0, avg_ , _ , op_avg)
2055 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2057 #undef op_avg_no_rnd
2059 #undef op_put_no_rnd
/*
 * H.264 half-sample interpolation: the 6-tap (1,-5,20,20,-5,1) lowpass
 * filter, expanded for block widths/heights 2, 4, 8 and 16.
 *   - OP  writes one clipped pixel from a raw 1-D filter result
 *     (normalized by >>5 at the expansion site, see op_put/op_avg below).
 *   - OP2 writes one pixel of the 2-D (hv) pass, whose int16 intermediate
 *     values carry the filter scale twice and are normalized by >>10
 *     (see op2_put/op2_avg below).
 * The 16-wide/-tall variants are composed from four 8x8 calls.
 */
2062 #define H264_LOWPASS(OPNAME, OP, OP2) \
/* Horizontal 6-tap filter, 2 pixels per row. */\
2063 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2065     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2069     OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2070     OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
/* Vertical 6-tap filter, 2 output rows per column; the srcB/srcA taps
 * read two rows above the block, src3/src4 read below it. */\
2076 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2078     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2082         const int srcB= src[-2*srcStride];\
2083         const int srcA= src[-1*srcStride];\
2084         const int src0= src[0 *srcStride];\
2085         const int src1= src[1 *srcStride];\
2086         const int src2= src[2 *srcStride];\
2087         const int src3= src[3 *srcStride];\
2088         const int src4= src[4 *srcStride];\
2089         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2090         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
/* 2-D filter: horizontal pass into the int16 "tmp" buffer (h+5 rows, to
 * cover the vertical taps), then vertical pass through OP2. */\
2096 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2099     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2101     src -= 2*srcStride;\
2102     for(i=0; i<h+5; i++)\
2104         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2105         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
/* Rewind tmp to two rows above the output block for the vertical taps. */\
2109     tmp -= tmpStride*(h+5-2);\
2112         const int tmpB= tmp[-2*tmpStride];\
2113         const int tmpA= tmp[-1*tmpStride];\
2114         const int tmp0= tmp[0 *tmpStride];\
2115         const int tmp1= tmp[1 *tmpStride];\
2116         const int tmp2= tmp[2 *tmpStride];\
2117         const int tmp3= tmp[3 *tmpStride];\
2118         const int tmp4= tmp[4 *tmpStride];\
2119         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2120         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
/* Same three filters, 4 pixels wide/tall. */\
2125 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2127     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2131     OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2132     OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2133     OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2134     OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2140 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2142     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2146         const int srcB= src[-2*srcStride];\
2147         const int srcA= src[-1*srcStride];\
2148         const int src0= src[0 *srcStride];\
2149         const int src1= src[1 *srcStride];\
2150         const int src2= src[2 *srcStride];\
2151         const int src3= src[3 *srcStride];\
2152         const int src4= src[4 *srcStride];\
2153         const int src5= src[5 *srcStride];\
2154         const int src6= src[6 *srcStride];\
2155         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2156         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2157         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2158         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2164 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2167     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2169     src -= 2*srcStride;\
2170     for(i=0; i<h+5; i++)\
2172         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2173         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2174         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2175         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2179     tmp -= tmpStride*(h+5-2);\
2182         const int tmpB= tmp[-2*tmpStride];\
2183         const int tmpA= tmp[-1*tmpStride];\
2184         const int tmp0= tmp[0 *tmpStride];\
2185         const int tmp1= tmp[1 *tmpStride];\
2186         const int tmp2= tmp[2 *tmpStride];\
2187         const int tmp3= tmp[3 *tmpStride];\
2188         const int tmp4= tmp[4 *tmpStride];\
2189         const int tmp5= tmp[5 *tmpStride];\
2190         const int tmp6= tmp[6 *tmpStride];\
2191         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2192         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2193         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2194         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
/* Same three filters, fully unrolled for 8 pixels wide/tall. */\
2200 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2202     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2206     OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2207     OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2208     OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2209     OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2210     OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2211     OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2212     OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2213     OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2219 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2221     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2225         const int srcB= src[-2*srcStride];\
2226         const int srcA= src[-1*srcStride];\
2227         const int src0= src[0 *srcStride];\
2228         const int src1= src[1 *srcStride];\
2229         const int src2= src[2 *srcStride];\
2230         const int src3= src[3 *srcStride];\
2231         const int src4= src[4 *srcStride];\
2232         const int src5= src[5 *srcStride];\
2233         const int src6= src[6 *srcStride];\
2234         const int src7= src[7 *srcStride];\
2235         const int src8= src[8 *srcStride];\
2236         const int src9= src[9 *srcStride];\
2237         const int src10=src[10*srcStride];\
2238         OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2239         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2240         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2241         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2242         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2243         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2244         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2245         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2251 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2254     uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2256     src -= 2*srcStride;\
2257     for(i=0; i<h+5; i++)\
2259         tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2260         tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2261         tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2262         tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2263         tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2264         tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2265         tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2266         tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2270     tmp -= tmpStride*(h+5-2);\
2273         const int tmpB= tmp[-2*tmpStride];\
2274         const int tmpA= tmp[-1*tmpStride];\
2275         const int tmp0= tmp[0 *tmpStride];\
2276         const int tmp1= tmp[1 *tmpStride];\
2277         const int tmp2= tmp[2 *tmpStride];\
2278         const int tmp3= tmp[3 *tmpStride];\
2279         const int tmp4= tmp[4 *tmpStride];\
2280         const int tmp5= tmp[5 *tmpStride];\
2281         const int tmp6= tmp[6 *tmpStride];\
2282         const int tmp7= tmp[7 *tmpStride];\
2283         const int tmp8= tmp[8 *tmpStride];\
2284         const int tmp9= tmp[9 *tmpStride];\
2285         const int tmp10=tmp[10*tmpStride];\
2286         OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2287         OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2288         OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2289         OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2290         OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2291         OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2292         OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2293         OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16-wide/-tall variants: two 8x8 halves side by side, then the block is
 * advanced 8 rows and the lower two halves are filtered. */\
2299 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2300     OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2301     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2302     src += 8*srcStride;\
2303     dst += 8*dstStride;\
2304     OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2305     OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2308 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2309     OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2310     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2311     src += 8*srcStride;\
2312     dst += 8*dstStride;\
2313     OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2314     OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2317 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2318     OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2319     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2320     src += 8*srcStride;\
2321     dst += 8*dstStride;\
2322     OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2323     OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/*
 * H.264 quarter-sample motion compensation: generates the 16 _mcXY_
 * functions for a given OPNAME (put_/avg_) and block SIZE, where X and Y
 * are the horizontal and vertical quarter-pel phases (0..3).  Half-pel
 * values come from the 6-tap lowpass helpers above; quarter-pel values
 * are the average (pixels_l2) of the two nearest half/full-pel planes.
 * "full" buffers hold a copy of the source extended by 5 extra rows so
 * the vertical filter taps stay inside the buffer.
 */
2326 #define H264_MC(OPNAME, SIZE) \
/* (0,0): plain full-pel copy/average. */\
2327 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2328     OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
/* (1,0)/(3,0): horizontal half-pel averaged with src / src+1. */\
2331 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2332     uint8_t half[SIZE*SIZE];\
2333     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2334     OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2337 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2338     OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2341 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2342     uint8_t half[SIZE*SIZE];\
2343     put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2344     OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
/* (0,1)/(0,2)/(0,3): vertical half-pel, optionally averaged with the
 * row above (full_mid) or below (full_mid+SIZE). */\
2347 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2348     uint8_t full[SIZE*(SIZE+5)];\
2349     uint8_t * const full_mid= full + SIZE*2;\
2350     uint8_t half[SIZE*SIZE];\
2351     copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
2352     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2353     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2356 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2357     uint8_t full[SIZE*(SIZE+5)];\
2358     uint8_t * const full_mid= full + SIZE*2;\
2359     copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2360     OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2363 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2364     uint8_t full[SIZE*(SIZE+5)];\
2365     uint8_t * const full_mid= full + SIZE*2;\
2366     uint8_t half[SIZE*SIZE];\
2367     copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2368     put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2369     OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
/* Diagonal quarter-pel positions: average of a horizontal and a vertical
 * half-pel plane; the +1 / +stride offsets select which neighbours. */\
2372 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2373     uint8_t full[SIZE*(SIZE+5)];\
2374     uint8_t * const full_mid= full + SIZE*2;\
2375     uint8_t halfH[SIZE*SIZE];\
2376     uint8_t halfV[SIZE*SIZE];\
2377     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2378     copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2379     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2380     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2383 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2384     uint8_t full[SIZE*(SIZE+5)];\
2385     uint8_t * const full_mid= full + SIZE*2;\
2386     uint8_t halfH[SIZE*SIZE];\
2387     uint8_t halfV[SIZE*SIZE];\
2388     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2389     copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2390     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2391     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2394 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2395     uint8_t full[SIZE*(SIZE+5)];\
2396     uint8_t * const full_mid= full + SIZE*2;\
2397     uint8_t halfH[SIZE*SIZE];\
2398     uint8_t halfV[SIZE*SIZE];\
2399     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2400     copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2401     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2402     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2405 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2406     uint8_t full[SIZE*(SIZE+5)];\
2407     uint8_t * const full_mid= full + SIZE*2;\
2408     uint8_t halfH[SIZE*SIZE];\
2409     uint8_t halfV[SIZE*SIZE];\
2410     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2411     copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2412     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2413     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* Centre positions: the 2-D hv filter, alone (mc22) or averaged with a
 * horizontal (mc21/mc23) or vertical (mc12/mc32) half-pel plane. */\
2416 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2417     int16_t tmp[SIZE*(SIZE+5)];\
2418     OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2421 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2422     int16_t tmp[SIZE*(SIZE+5)];\
2423     uint8_t halfH[SIZE*SIZE];\
2424     uint8_t halfHV[SIZE*SIZE];\
2425     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2426     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2427     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2430 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2431     int16_t tmp[SIZE*(SIZE+5)];\
2432     uint8_t halfH[SIZE*SIZE];\
2433     uint8_t halfHV[SIZE*SIZE];\
2434     put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2435     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2436     OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2439 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2440     uint8_t full[SIZE*(SIZE+5)];\
2441     uint8_t * const full_mid= full + SIZE*2;\
2442     int16_t tmp[SIZE*(SIZE+5)];\
2443     uint8_t halfV[SIZE*SIZE];\
2444     uint8_t halfHV[SIZE*SIZE];\
2445     copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2446     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2447     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2448     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2451 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2452     uint8_t full[SIZE*(SIZE+5)];\
2453     uint8_t * const full_mid= full + SIZE*2;\
2454     int16_t tmp[SIZE*(SIZE+5)];\
2455     uint8_t halfV[SIZE*SIZE];\
2456     uint8_t halfHV[SIZE*SIZE];\
2457     copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2458     put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2459     put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2460     OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Write-back ops for H264_LOWPASS: op_* normalize a 1-D 6-tap result
 * (coefficient sum 32) by (b+16)>>5; op2_* normalize the 2-D result,
 * which carries the scale twice (32*32=1024), by (b+512)>>10.  "cm" is
 * the clip table in scope at the expansion site. */
2463 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2464 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2465 #define op_put(a, b) a = cm[((b) + 16)>>5]
2466 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2467 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put_ and avg_ h264 lowpass helper families. */
2469 H264_LOWPASS(put_       , op_put, op2_put)
2470 H264_LOWPASS(avg_       , op_avg, op2_avg)
/* H.264 weighted prediction.  op_scale1 applies explicit weighting to a
 * single block in place; op_scale2 blends src into dst with two weights
 * (bi-directional weighted prediction).  Both round via the offset and
 * shift by log2_denom, clipping to 0..255. */
2485 #define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
2486 #define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/* Generates weight/biweight functions for a WxH block.  One body serves
 * all widths: the "if(W==n) continue;" tests cut the per-row unrolling
 * short, and the compiler removes the dead tails for each instantiation. */
2487 #define H264_WEIGHT(W,H) \
2488 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2490     offset <<= log2_denom; \
/* Pre-bias so the >>log2_denom below rounds to nearest. */ \
2491     if(log2_denom) offset += 1<<(log2_denom-1); \
2492     for(y=0; y<H; y++, block += stride){ \
2495         if(W==2) continue; \
2498         if(W==4) continue; \
2503         if(W==8) continue; \
2514 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
/* Force the rounding offset odd before scaling, per the spec's
 * (offset+1)|1 bi-prediction rounding. */ \
2516     offset = ((offset + 1) | 1) << log2_denom; \
2517     for(y=0; y<H; y++, dst += stride, src += stride){ \
2520         if(W==2) continue; \
2523         if(W==4) continue; \
2528         if(W==8) continue; \
/* WMV2 mspel horizontal interpolation: 4-tap (-1,9,9,-1)/16 half-pel
 * filter over an 8-pixel-wide block, h rows.  Results are rounded (+8)
 * and clipped to 0..255 through the cm table. */
2555 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2556     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2560         dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2561         dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2562         dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2563         dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2564         dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2565         dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2566         dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2567         dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* CAVS decoder support: full-pel (0,0) motion compensation entry points,
 * thin wrappers around the generic put/avg pixel copy routines. */
2574 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2576 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2577     put_pixels8_c(dst, src, stride, 8);
2579 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2580     avg_pixels8_c(dst, src, stride, 8);
2582 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2583     put_pixels16_c(dst, src, stride, 16);
2585 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2586     avg_pixels16_c(dst, src, stride, 16);
/* WMV2 mspel vertical interpolation: the same 4-tap (-1,9,9,-1)/16
 * half-pel filter applied down each of w columns, producing 8 output
 * rows per column; src_1 reads one row above the block, src9 the row
 * below the last tap. */
2589 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2590     uint8_t *cm = cropTbl + MAX_NEG_CROP;
2594         const int src_1= src[ -srcStride];
2595         const int src0 = src[0 ];
2596         const int src1 = src[ srcStride];
2597         const int src2 = src[2*srcStride];
2598         const int src3 = src[3*srcStride];
2599         const int src4 = src[4*srcStride];
2600         const int src5 = src[5*srcStride];
2601         const int src6 = src[6*srcStride];
2602         const int src7 = src[7*srcStride];
2603         const int src8 = src[8*srcStride];
2604         const int src9 = src[9*srcStride];
2605         dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2606         dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2607         dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2608         dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2609         dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2610         dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2611         dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2612         dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
/* WMV2 mspel 8x8 motion-compensation positions, built from the two
 * lowpass helpers above.  Naming: mcXY = horizontal phase X, vertical
 * phase Y; sub-pel positions average (pixels8_l2) the two nearest
 * filtered/unfiltered planes. */
2618 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2619     put_pixels8_c(dst, src, stride, 8);
/* Horizontal quarter position: half-pel plane averaged with src. */
2622 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2624     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2625     put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2628 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2629     wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* Like mc10 but averaged with the pixel to the right (src+1). */
2632 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2634     wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2635     put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2638 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2639     wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* Diagonal positions: horizontal filter over 11 rows (starting one row
 * above the block, src-stride) feeds the vertical filter; the result is
 * averaged with a purely vertical half-pel plane. */
2642 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2646     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2647     wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2648     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2649     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2651 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2655     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2656     wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2657     wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2658     put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* Centre position: horizontal then vertical filter, no averaging. */
2660 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2662     wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2663     wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 Annex J deblocking, vertical edge filter (operates across a
 * horizontal block boundary): p1/p2 straddle the edge, p0/p3 are their
 * outer neighbours.  d is the edge gradient; d1 is the strength-limited
 * correction ramp (full correction for small |d|, tapering to zero by
 * 2*strength); d2 makes a smaller secondary adjustment to p0/p3. */
2666 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2668     const int strength= ff_h263_loop_filter_strength[qscale];
2672         int p0= src[x-2*stride];
2673         int p1= src[x-1*stride];
2674         int p2= src[x+0*stride];
2675         int p3= src[x+1*stride];
2676         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2678         if     (d<-2*strength) d1= 0;
2679         else if(d<-  strength) d1=-2*strength - d;
2680         else if(d<   strength) d1= d;
2681         else if(d< 2*strength) d1= 2*strength - d;
/* Branch-light clip of the corrected values to 0..255: if any bit above
 * bit 7 is set, replace with 0 (negative) or 255 (overflow). */
2686         if(p1&256) p1= ~(p1>>31);
2687         if(p2&256) p2= ~(p2>>31);
2689         src[x-1*stride] = p1;
2690         src[x+0*stride] = p2;
2694         d2= clip((p0-p3)/4, -ad1, ad1);
2696         src[x-2*stride] = p0 - d2;
2697         src[x+  stride] = p3 + d2;
/* H.263 Annex J deblocking, horizontal edge filter (across a vertical
 * block boundary).  Identical arithmetic to h263_v_loop_filter_c with
 * row/column roles swapped: the four taps lie left/right of the edge. */
2701 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2703     const int strength= ff_h263_loop_filter_strength[qscale];
2707         int p0= src[y*stride-2];
2708         int p1= src[y*stride-1];
2709         int p2= src[y*stride+0];
2710         int p3= src[y*stride+1];
2711         int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2713         if     (d<-2*strength) d1= 0;
2714         else if(d<-  strength) d1=-2*strength - d;
2715         else if(d<   strength) d1= d;
2716         else if(d< 2*strength) d1= 2*strength - d;
/* Branch-light clip to 0..255, as in the vertical variant. */
2721         if(p1&256) p1= ~(p1>>31);
2722         if(p2&256) p2= ~(p2>>31);
2724         src[y*stride-1] = p1;
2725         src[y*stride+0] = p2;
2729         d2= clip((p0-p3)/4, -ad1, ad1);
2731         src[y*stride-2] = p0 - d2;
2732         src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing over an 8x8 block.
 * Pass 1 writes 4x the edge rows and the vertical 1-2-1 sum of interior
 * rows into temp (scaled by 4 overall); pass 2 applies the horizontal
 * 1-2-1 and normalizes by >>4, leaving the left/right edge columns only
 * renormalized (>>2). */
2736 static void h261_loop_filter_c(uint8_t *src, int stride){
2741         temp[x      ] = 4*src[x           ];
2742         temp[x + 7*8] = 4*src[x + 7*stride];
2746             xy = y * stride + x;
2748             temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2753         src[  y*stride] = (temp[  y*8] + 2)>>2;
2754         src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2756             xy = y * stride + x;
2758             src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 normal (bS<4) luma deblocking along one edge.  xstride steps
 * across the edge, ystride along it; tc0[] holds the per-4-pixel-group
 * clipping thresholds from the standard's table.  alpha gates the
 * across-edge difference |p0-q0|, beta the within-side smoothness. */
2763 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2766     for( i = 0; i < 4; i++ ) {
2771         for( d = 0; d < 4; d++ ) {
2772             const int p0 = pix[-1*xstride];
2773             const int p1 = pix[-2*xstride];
2774             const int p2 = pix[-3*xstride];
2775             const int q0 = pix[0];
2776             const int q1 = pix[1*xstride];
2777             const int q2 = pix[2*xstride];
2779             if( ABS( p0 - q0 ) < alpha &&
2780                 ABS( p1 - p0 ) < beta &&
2781                 ABS( q1 - q0 ) < beta ) {
/* Secondary threshold: adjust p1/q1 (clipped to +-tc0[i]) only where the
 * second-row sample is close enough to the edge sample. */
2786                 if( ABS( p2 - p0 ) < beta ) {
2787                     pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2790                 if( ABS( q2 - q0 ) < beta ) {
2791                     pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
/* Main edge correction, clipped to +-tc (tc derives from tc0[i]; its
 * computation is not visible in this excerpt — confirm against spec). */
2795                 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2796                 pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
2797                 pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
/* Direction wrappers: a vertical filter steps across the edge by one
 * full row (stride), a horizontal one by a single pixel. */
2803 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2805     h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2807 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2809     h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 normal (bS<4) chroma deblocking: like the luma filter but only
 * two samples per side are examined and only p0/q0 are corrected,
 * clipped to +-tc per 2-pixel group. */
2812 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2815     for( i = 0; i < 4; i++ ) {
2816         const int tc = tc0[i];
2821         for( d = 0; d < 2; d++ ) {
2822             const int p0 = pix[-1*xstride];
2823             const int p1 = pix[-2*xstride];
2824             const int q0 = pix[0];
2825             const int q1 = pix[1*xstride];
2827             if( ABS( p0 - q0 ) < alpha &&
2828                 ABS( p1 - p0 ) < beta &&
2829                 ABS( q1 - q0 ) < beta ) {
2831                 int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2833                 pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
2834                 pix[0]        = clip_uint8( q0 - delta );    /* q0' */
/* Direction wrappers for the chroma deblocking filter. */
2840 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2842     h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2844 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2846     h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 strong (bS==4, intra) chroma deblocking: no tc clipping; p0/q0
 * are replaced by fixed (2,1,1)/4 weighted averages when the alpha/beta
 * thresholds pass. */
2849 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2852     for( d = 0; d < 8; d++ ) {
2853         const int p0 = pix[-1*xstride];
2854         const int p1 = pix[-2*xstride];
2855         const int q0 = pix[0];
2856         const int q1 = pix[1*xstride];
2858         if( ABS( p0 - q0 ) < alpha &&
2859             ABS( p1 - p0 ) < beta &&
2860             ABS( q1 - q0 ) < beta ) {
2862             pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
2863             pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
/* Direction wrappers for the intra (strong) chroma deblocking filter. */
2868 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2870     h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2872 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2874     h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* Sum of absolute differences (SAD) between two 16-wide blocks, h rows.
 * The unused void* first argument matches the me_cmp function-pointer
 * signature. */
2877 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2883         s += abs(pix1[0] - pix2[0]);
2884         s += abs(pix1[1] - pix2[1]);
2885         s += abs(pix1[2] - pix2[2]);
2886         s += abs(pix1[3] - pix2[3]);
2887         s += abs(pix1[4] - pix2[4]);
2888         s += abs(pix1[5] - pix2[5]);
2889         s += abs(pix1[6] - pix2[6]);
2890         s += abs(pix1[7] - pix2[7]);
2891         s += abs(pix1[8] - pix2[8]);
2892         s += abs(pix1[9] - pix2[9]);
2893         s += abs(pix1[10] - pix2[10]);
2894         s += abs(pix1[11] - pix2[11]);
2895         s += abs(pix1[12] - pix2[12]);
2896         s += abs(pix1[13] - pix2[13]);
2897         s += abs(pix1[14] - pix2[14]);
2898         s += abs(pix1[15] - pix2[15]);
/* 16-wide SAD against a horizontally half-pel-shifted reference: each
 * pix2 sample is the rounded average of itself and its right neighbour. */
2905 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2911         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2912         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2913         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2914         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2915         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2916         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2917         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2918         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2919         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2920         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2921         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2922         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2923         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2924         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2925         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2926         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* 16-wide SAD against a vertically half-pel-shifted reference: pix3
 * tracks the row below pix2 and each sample is their average. */
2933 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2936     uint8_t *pix3 = pix2 + line_size;
2940         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2941         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2942         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2943         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2944         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2945         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2946         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2947         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2948         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2949         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2950         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2951         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2952         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2953         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2954         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2955         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* 16-wide SAD against a diagonally half-pel-shifted reference: each
 * sample is the 4-tap average of a 2x2 neighbourhood (avg4). */
2963 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2966     uint8_t *pix3 = pix2 + line_size;
2970         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2971         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2972         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2973         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2974         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2975         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2976         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2977         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2978         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2979         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2980         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2981         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2982         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2983         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2984         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2985         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD between two blocks, h rows. */
2993 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2999         s += abs(pix1[0] - pix2[0]);
3000         s += abs(pix1[1] - pix2[1]);
3001         s += abs(pix1[2] - pix2[2]);
3002         s += abs(pix1[3] - pix2[3]);
3003         s += abs(pix1[4] - pix2[4]);
3004         s += abs(pix1[5] - pix2[5]);
3005         s += abs(pix1[6] - pix2[6]);
3006         s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD against a horizontally half-pel-shifted reference. */
3013 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3019         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3020         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3021         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3022         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3023         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3024         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3025         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3026         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD against a vertically half-pel-shifted reference. */
3033 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3036     uint8_t *pix3 = pix2 + line_size;
3040         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3041         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3042         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3043         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3044         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3045         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3046         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3047         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD against a diagonally half-pel-shifted reference (avg4 of a
 * 2x2 neighbourhood per sample). */
3055 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3058     uint8_t *pix3 = pix2 + line_size;
3062         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3063         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3064         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3065         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3066         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3067         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3068         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3069         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE metric, 16-wide: score1 is the plain squared
 * error; score2 accumulates the difference between the two blocks'
 * local 2x2 gradients, so blocks that keep the source's texture are
 * penalized less.  The gradient term is weighted by the user-settable
 * nsse_weight (default 8 when no context is available). */
3077 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3078     MpegEncContext *c = v;
3084         for(x=0; x<16; x++){
3085             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3088         for(x=0; x<15; x++){
3089             score2+= ABS(  s1[x  ] - s1[x  +stride]
3090                          - s1[x+1] + s1[x+1+stride])
3091                     -ABS(  s2[x  ] - s2[x  +stride]
3092                          - s2[x+1] + s2[x+1+stride]);
3099     if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3100     else  return score1 + ABS(score2)*8;
/*
 * 8-wide variant of nsse16_c: SSE plus a weighted penalty on the
 * difference of second-order gradients.  Same weighting rule:
 * avctx->nsse_weight when a context is given, otherwise 8.
 * NOTE(review): partial extraction — loop headers/braces missing.
 */
3103 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3104 MpegEncContext *c = v;
3111 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3115 score2+= ABS( s1[x ] - s1[x +stride]
3116 - s1[x+1] + s1[x+1+stride])
3117 -ABS( s2[x ] - s2[x +stride]
3118 - s2[x+1] + s2[x+1+stride]);
3125 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3126 else return score1 + ABS(score2)*8;
/*
 * Evaluates the weighted squared error that would remain if 'basis'
 * (scaled by 'scale', rescaled from BASIS_SHIFT to RECON_SHIFT precision
 * with rounding) were added to the residual 'rem'.  Each term is weighted
 * by 'weight' and the product is >>4 to keep the sum in range.
 * Used by the encoder's trellis/basis refinement; see add_8x8basis_c.
 */
3129 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3133 for(i=0; i<8*8; i++){
3134 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
     /* documented operating range of the candidate residual value */
3137 assert(-512<b && b<512);
3139 sum += (w*b)*(w*b)>>4;
/*
 * Commits the change evaluated by try_8x8basis_c: adds the scaled basis
 * function (rounded down from BASIS_SHIFT to RECON_SHIFT precision)
 * into the residual block in place.
 */
3144 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3147 for(i=0; i<8*8; i++){
3148 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3153 * permutes an 8x8 block.
3154 * @param block the block which will be permuted according to the given permutation vector
3155 * @param permutation the permutation vector
3156 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3157 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3158 * (inverse) permuted to scantable order!
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
/* NOTE(review): partial extraction — the temp[] copy loop between the two
 * passes below is missing; first pass presumably saves block[] into temp[]
 * at the scanned positions, second pass writes them back permuted. */
3166 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3168 for(i=0; i<=last; i++){
3169 const int j= scantable[i];
3174 for(i=0; i<=last; i++){
3175 const int j= scantable[i];
3176 const int perm_j= permutation[j];
3177 block[perm_j]= temp[j];
/* Dummy comparison function used to disable a metric slot.
 * NOTE(review): body dropped by the extraction — in the full source it
 * simply returns 0; confirm against upstream. */
3181 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/*
 * Fills the 5-entry cmp[] function table from the DSPContext according to
 * the requested comparison 'type' (SAD/SSE/HADAMARD/DCT/...).  Visible
 * cases assign hadamard8_diff, dct_sad, dct264_sad, dct_max and quant_psnr;
 * the selecting switch itself is missing from this extraction fragment.
 */
3185 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
     /* clear all 5 slots before selecting */
3188 memset(cmp, 0, sizeof(void*)*5);
3196 cmp[i]= c->hadamard8_diff[i];
3202 cmp[i]= c->dct_sad[i];
3205 cmp[i]= c->dct264_sad[i];
3208 cmp[i]= c->dct_max[i];
3211 cmp[i]= c->quant_psnr[i];
3231 #ifdef CONFIG_SNOW_ENCODER
     /* unknown type: report instead of silently leaving cmp[] zeroed */
3240 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3246 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* Zeroes the 6 64-coefficient DCT blocks of one macroblock. */
3248 static void clear_blocks_c(DCTELEM *blocks)
3250 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/*
 * dst[i] += src[i] for i in [0, w): main loop manually unrolled by 8,
 * trailing tail handled one byte at a time (the tail loop header is
 * missing from this extraction fragment).  Used by lossless codecs
 * (e.g. HuffYUV) for prediction reconstruction.
 */
3253 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3255 for(i=0; i+7<w; i+=8){
3256 dst[i+0] += src[i+0];
3257 dst[i+1] += src[i+1];
3258 dst[i+2] += src[i+2];
3259 dst[i+3] += src[i+3];
3260 dst[i+4] += src[i+4];
3261 dst[i+5] += src[i+5];
3262 dst[i+6] += src[i+6];
3263 dst[i+7] += src[i+7];
     /* tail: remaining w%8 bytes */
3266 dst[i+0] += src[i+0];
/*
 * dst[i] = src1[i] - src2[i] for i in [0, w): the inverse of add_bytes_c,
 * unrolled by 8 with a per-byte tail.  Used on the encoder side to form
 * prediction residuals.
 */
3269 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3271 for(i=0; i+7<w; i+=8){
3272 dst[i+0] = src1[i+0]-src2[i+0];
3273 dst[i+1] = src1[i+1]-src2[i+1];
3274 dst[i+2] = src1[i+2]-src2[i+2];
3275 dst[i+3] = src1[i+3]-src2[i+3];
3276 dst[i+4] = src1[i+4]-src2[i+4];
3277 dst[i+5] = src1[i+5]-src2[i+5];
3278 dst[i+6] = src1[i+6]-src2[i+6];
3279 dst[i+7] = src1[i+7]-src2[i+7];
     /* tail: remaining w%8 bytes */
3282 dst[i+0] = src1[i+0]-src2[i+0];
/*
 * HuffYUV median-prediction subtraction: the visible line computes the
 * predictor as median(left, top, left + top - topleft) with byte wrap
 * (&0xFF); presumably dst[i] = src2[i] - pred with left/left_top updated
 * for the caller — most of the body is missing from this extraction,
 * confirm against upstream.
 */
3285 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3293 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Butterfly helpers for the 8x8 Hadamard transforms below.
 * BUTTERFLY2/BUTTERFLY1 compute sum/difference pairs (bodies are
 * backslash-continued and partially missing from this extraction);
 * BUTTERFLYA folds a pair directly into |x+y| + |x-y|. */
3303 #define BUTTERFLY2(o1,o2,i1,i2) \
3307 #define BUTTERFLY1(x,y) \
3316 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/*
 * SATD metric: 8x8 Hadamard transform of the difference src - dst,
 * returning the sum of absolute transform coefficients.  Horizontal
 * butterfly passes run per row (loop header missing from this
 * extraction), then vertical passes per column, with BUTTERFLYA folding
 * the final stage into the accumulated sum.
 */
3318 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3326 //FIXME try pointer walks
     /* stage 1 (per row): distance-1 butterflies on the pixel differences */
3327 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3328 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3329 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3330 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
     /* stage 2 (per row): distance-2 butterflies */
3332 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3333 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3334 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3335 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
     /* stage 3 (per row): distance-4 butterflies */
3337 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3338 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3339 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3340 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
     /* vertical passes (per column i) */
3344 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3345 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3346 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3347 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3349 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3350 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3351 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3352 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
     /* final vertical stage fused with the |.| accumulation */
3355 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3356 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3357 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3358 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
     /* debug path (guarded by dropped #if in the full source) */
3364 printf("MAX:%d\n", maxi);
/*
 * Intra SATD: same 8x8 Hadamard transform as hadamard8_diff8x8_c but on
 * the raw source pixels (no reference subtraction).  The magnitude of the
 * DC term is subtracted at the end so the score ignores the block mean.
 */
3370 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3378 //FIXME try pointer walks
     /* stage 1 (per row): distance-1 butterflies on the source pixels */
3379 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3380 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3381 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3382 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
     /* stage 2 (per row) */
3384 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3385 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3386 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3387 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
     /* stage 3 (per row) */
3389 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3390 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3391 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3392 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
     /* vertical passes (per column i) */
3396 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3397 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3398 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3399 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3401 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3402 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3403 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3404 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
     /* final vertical stage fused with the |.| accumulation */
3407 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3408 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3409 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3410 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
     /* remove the DC contribution so the metric is mean-independent */
3413 sum -= ABS(temp[8*0] + temp[8*4]); // -mean
/*
 * DCT-domain SAD: forms the pixel difference into an aligned DCTELEM[64]
 * buffer, then (in lines dropped by this extraction) runs the forward DCT
 * and sums the absolute coefficients.
 */
3418 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3419 MpegEncContext * const s= (MpegEncContext *)c;
     /* 8-byte alignment required by the SIMD diff/fdct implementations */
3420 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3421 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3426 s->dsp.diff_pixels(temp, src1, src2, stride);
/* Body fragment of the DCT8_1D macro (its #define line is missing from
 * this extraction): one 8-point 1-D pass of the H.264-style integer 8x8
 * transform, reading through SRC() and writing through DST().  Even
 * outputs come from the sum terms (s07..s34), odd outputs from the
 * difference terms (d07..d34) with the characteristic >>1 / >>2 scaled
 * butterflies.  Every line below is a backslash continuation — do not
 * insert anything between them. */
3437 const int s07 = SRC(0) + SRC(7);\
3438 const int s16 = SRC(1) + SRC(6);\
3439 const int s25 = SRC(2) + SRC(5);\
3440 const int s34 = SRC(3) + SRC(4);\
3441 const int a0 = s07 + s34;\
3442 const int a1 = s16 + s25;\
3443 const int a2 = s07 - s34;\
3444 const int a3 = s16 - s25;\
3445 const int d07 = SRC(0) - SRC(7);\
3446 const int d16 = SRC(1) - SRC(6);\
3447 const int d25 = SRC(2) - SRC(5);\
3448 const int d34 = SRC(3) - SRC(4);\
3449 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3450 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3451 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3452 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3454 DST(1, a4 + (a7>>2)) ;\
3455 DST(2, a2 + (a3>>1)) ;\
3456 DST(3, a5 + (a6>>2)) ;\
3458 DST(5, a6 - (a5>>2)) ;\
3459 DST(6, (a2>>1) - a3 ) ;\
3460 DST(7, (a4>>2) - a7 ) ;\
/*
 * H.264-transform SAD: difference block -> DCT8_1D over rows (SRC/DST
 * redefined to address dct[i][x]), then DCT8_1D over columns with DST
 * redefined to accumulate |coefficient| into sum.  The #undef lines and
 * the return are missing from this extraction fragment.
 */
3463 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3464 MpegEncContext * const s= (MpegEncContext *)c;
3469 s->dsp.diff_pixels(dct, src1, src2, stride);
     /* pass 1: transform each row in place */
3471 #define SRC(x) dct[i][x]
3472 #define DST(x,v) dct[i][x]= v
3473 for( i = 0; i < 8; i++ )
     /* pass 2: transform each column, folding |v| straight into the score */
3478 #define SRC(x) dct[x][i]
3479 #define DST(x,v) sum += ABS(v)
3480 for( i = 0; i < 8; i++ )
/*
 * DCT-domain maximum metric: like dct_sad8x8_c, but the score is the
 * largest absolute DCT coefficient rather than the sum (the fdct call
 * and loop header are missing from this extraction fragment).
 */
3488 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3489 MpegEncContext * const s= (MpegEncContext *)c;
3490 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3491 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3496 s->dsp.diff_pixels(temp, src1, src2, stride);
3500 sum= FFMAX(sum, ABS(temp[i]));
/* forward declaration kept here because simple_idct.h is not included */
3505 void simple_idct(DCTELEM *block); //FIXME
/*
 * Quantization-PSNR metric: transforms the difference block, saves a copy
 * (bak), quantizes + dequantizes + inverse-transforms it, and returns the
 * squared error between the round-tripped block and the saved original —
 * i.e. the distortion introduced by quantization at s->qscale.
 * NOTE(review): the fdct call, loop headers and return are missing from
 * this extraction fragment.
 */
3507 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3508 MpegEncContext * const s= (MpegEncContext *)c;
     /* one aligned buffer holds both the working block and its backup */
3509 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3510 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3511 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3517 s->dsp.diff_pixels(temp, src1, src2, stride);
3519 memcpy(bak, temp, 64*sizeof(DCTELEM));
3521 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3522 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3523 simple_idct(temp); //FIXME
     /* squared error vs. the pre-quantization coefficients */
3526 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/*
 * Rate-distortion metric for one 8x8 block: quantizes the difference,
 * counts the bits needed to code the coefficients with the codec's VLC
 * length tables, dequantizes + IDCTs onto a saved copy of src2, measures
 * the resulting SSE against src1, and returns
 *   distortion + lambda-ish bit cost ((bits*qscale^2*109 + 64) >> 7).
 * NOTE(review): partial extraction — the intra/inter branches, run/level
 * scan loop bodies and escape handling are only partially visible.
 */
3531 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3532 MpegEncContext * const s= (MpegEncContext *)c;
3533 const uint8_t *scantable= s->intra_scantable.permutated;
3534 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3535 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3536 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3537 uint8_t * const bak= (uint8_t*)aligned_bak;
     /* "distoration" is a long-standing typo for "distortion" (local only) */
3538 int i, last, run, bits, level, distoration, start_i;
3539 const int esc_length= s->ac_esc_length;
3541 uint8_t * last_length;
     /* back up src2 8 bytes/row (two 32-bit stores) so idct_add below
        reconstructs onto a scratch copy */
3546 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3547 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3550 s->dsp.diff_pixels(temp, src1, src2, stride);
3552 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
     /* intra branch: AC tables plus the luma DC cost (table indexed by DC+256) */
3558 length = s->intra_ac_vlc_length;
3559 last_length= s->intra_ac_vlc_last_length;
3560 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
     /* inter branch: inter AC tables, no separate DC cost */
3563 length = s->inter_ac_vlc_length;
3564 last_length= s->inter_ac_vlc_last_length;
     /* accumulate VLC bit cost over the scanned run/level pairs */
3569 for(i=start_i; i<last; i++){
3570 int j= scantable[i];
3575 if((level&(~127)) == 0){
3576 bits+= length[UNI_AC_ENC_INDEX(run, level)];
     /* last coefficient uses the "last" table; levels biased by +64 */
3585 level= temp[i] + 64;
3589 if((level&(~127)) == 0){
3590 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3598 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3600 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3603 s->dsp.idct_add(bak, stride, temp);
     /* distortion: 8x8 SSE between reconstruction and the source */
3605 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3607 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/*
 * Bit-cost metric: same quantize-and-count-VLC-bits pipeline as rd8x8_c,
 * but without the reconstruction/distortion part — the score is the bit
 * count alone (the return statement is missing from this extraction
 * fragment).
 */
3610 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3611 MpegEncContext * const s= (MpegEncContext *)c;
3612 const uint8_t *scantable= s->intra_scantable.permutated;
3613 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3614 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3615 int i, last, run, bits, level, start_i;
3616 const int esc_length= s->ac_esc_length;
3618 uint8_t * last_length;
3622 s->dsp.diff_pixels(temp, src1, src2, stride);
3624 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
     /* intra branch: AC tables plus luma DC cost */
3630 length = s->intra_ac_vlc_length;
3631 last_length= s->intra_ac_vlc_last_length;
3632 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
     /* inter branch */
3635 length = s->inter_ac_vlc_length;
3636 last_length= s->inter_ac_vlc_last_length;
     /* accumulate VLC bit cost over scanned run/level pairs */
3641 for(i=start_i; i<last; i++){
3642 int j= scantable[i];
3647 if((level&(~127)) == 0){
3648 bits+= length[UNI_AC_ENC_INDEX(run, level)];
     /* last coefficient: "last" table, level biased by +64 */
3657 level= temp[i] + 64;
3661 if((level&(~127)) == 0){
3662 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/*
 * Vertical SAD, intra: sum of |s[x] - s[x+stride]| over a 16-wide block,
 * i.e. total absolute vertical gradient of the source itself (outer row
 * loop missing from this extraction fragment).
 */
3670 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3675 for(x=0; x<16; x+=4){
3676 score+= ABS(s[x ] - s[x +stride]) + ABS(s[x+1] - s[x+1+stride])
3677 +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
/*
 * Vertical SAD between two blocks: sum of the absolute differences of
 * the vertical gradients of s1 and s2 over a 16-wide block.
 */
3685 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3690 for(x=0; x<16; x++){
3691 score+= ABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* square helper for the vertical-SSE metrics below */
3700 #define SQ(a) ((a)*(a))
/*
 * Vertical SSE, intra: like vsad_intra16_c but squaring each vertical
 * gradient instead of taking its absolute value.
 */
3701 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3706 for(x=0; x<16; x+=4){
3707 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3708 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/*
 * Vertical SSE between two blocks: squared difference of the vertical
 * gradients of s1 and s2 over a 16-wide block.
 */
3716 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3721 for(x=0; x<16; x++){
3722 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* WARPER8_16_SQ generates a 16x16 comparison function from an 8x8 one by
 * summing the metric over the four 8x8 quadrants (the macro definition
 * itself is outside this fragment).  Some instantiations are guarded by
 * #ifdefs dropped from this extraction (e.g. dct264_sad). */
3731 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3732 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3733 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3735 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3737 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3738 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3739 WARPER8_16_SQ(rd8x8_c, rd16_c)
3740 WARPER8_16_SQ(bit8x8_c, bit16_c)
3742 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* Full 8x8 IDCT (j_rev_dct, call dropped from this fragment) followed by
 * clamped store of the block into dest. */
3744 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3747 put_pixels_clamped_c(block, dest, line_size);
/* Full 8x8 IDCT followed by clamped addition of the block onto dest. */
3749 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3752 add_pixels_clamped_c(block, dest, line_size);
/* 4x4 reduced IDCT + clamped store — used for lowres==1 decoding. */
3755 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3758 put_pixels_clamped4_c(block, dest, line_size);
/* 4x4 reduced IDCT + clamped add — used for lowres==1 decoding. */
3760 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3763 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 reduced IDCT + clamped store — used for lowres==2 decoding. */
3766 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3769 put_pixels_clamped2_c(block, dest, line_size);
/* 2x2 reduced IDCT + clamped add — used for lowres==2 decoding. */
3771 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3774 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 "IDCT": output pixel is the clamped, rounded DC, (DC+4)>>3 —
 * used for lowres==3 decoding. */
3777 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3779 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3781 dest[0] = cm[(block[0] + 4)>>3];
/* 1x1 "IDCT" add: clamped dest[0] += (DC+4)>>3 — lowres==3 decoding. */
3783 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3785 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3787 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
/* Deliberate no-op; installed as the default c->prefetch below.  The
 * old-style empty () parameter list is intentional so the pointer can be
 * assigned across differing prefetch signatures. */
3790 static void just_return() { return; }
3792 /* init static data */
/* One-time initialization of the file-scope lookup tables:
 *  - cropTbl: byte-clamp table (identity on [0,255], 0/255 saturation in
 *    the MAX_NEG_CROP margins on either side; the low-margin store line
 *    is missing from this extraction fragment);
 *  - squareTbl: (i-256)^2 for signed differences in [-256, 255];
 *  - inv_zigzag_direct16: inverse zigzag + 1, for the MMX quantizer. */
3793 void dsputil_static_init(void)
3797 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3798 for(i=0;i<MAX_NEG_CROP;i++) {
3800 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3803 for(i=0;i<512;i++) {
3804 squareTbl[i] = (i - 256) * (i - 256);
3807 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3811 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3815 #ifdef CONFIG_ENCODERS
3816 if(avctx->dct_algo==FF_DCT_FASTINT) {
3817 c->fdct = fdct_ifast;
3818 c->fdct248 = fdct_ifast248;
3820 else if(avctx->dct_algo==FF_DCT_FAAN) {
3821 c->fdct = ff_faandct;
3822 c->fdct248 = ff_faandct248;
3825 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3826 c->fdct248 = ff_fdct248_islow;
3828 #endif //CONFIG_ENCODERS
3830 if(avctx->lowres==1){
3831 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3832 c->idct_put= ff_jref_idct4_put;
3833 c->idct_add= ff_jref_idct4_add;
3835 c->idct_put= ff_h264_lowres_idct_put_c;
3836 c->idct_add= ff_h264_lowres_idct_add_c;
3838 c->idct = j_rev_dct4;
3839 c->idct_permutation_type= FF_NO_IDCT_PERM;
3840 }else if(avctx->lowres==2){
3841 c->idct_put= ff_jref_idct2_put;
3842 c->idct_add= ff_jref_idct2_add;
3843 c->idct = j_rev_dct2;
3844 c->idct_permutation_type= FF_NO_IDCT_PERM;
3845 }else if(avctx->lowres==3){
3846 c->idct_put= ff_jref_idct1_put;
3847 c->idct_add= ff_jref_idct1_add;
3848 c->idct = j_rev_dct1;
3849 c->idct_permutation_type= FF_NO_IDCT_PERM;
3851 if(avctx->idct_algo==FF_IDCT_INT){
3852 c->idct_put= ff_jref_idct_put;
3853 c->idct_add= ff_jref_idct_add;
3854 c->idct = j_rev_dct;
3855 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3856 }else if(avctx->idct_algo==FF_IDCT_VP3){
3857 c->idct_put= ff_vp3_idct_put_c;
3858 c->idct_add= ff_vp3_idct_add_c;
3859 c->idct = ff_vp3_idct_c;
3860 c->idct_permutation_type= FF_NO_IDCT_PERM;
3861 }else{ //accurate/default
3862 c->idct_put= simple_idct_put;
3863 c->idct_add= simple_idct_add;
3864 c->idct = simple_idct;
3865 c->idct_permutation_type= FF_NO_IDCT_PERM;
3869 c->h264_idct_add= ff_h264_idct_add_c;
3870 c->h264_idct8_add= ff_h264_idct8_add_c;
3871 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3872 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3874 c->get_pixels = get_pixels_c;
3875 c->diff_pixels = diff_pixels_c;
3876 c->put_pixels_clamped = put_pixels_clamped_c;
3877 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3878 c->add_pixels_clamped = add_pixels_clamped_c;
3879 c->add_pixels8 = add_pixels8_c;
3880 c->add_pixels4 = add_pixels4_c;
3883 c->clear_blocks = clear_blocks_c;
3884 c->pix_sum = pix_sum_c;
3885 c->pix_norm1 = pix_norm1_c;
3887 /* TODO [0] 16 [1] 8 */
3888 c->pix_abs[0][0] = pix_abs16_c;
3889 c->pix_abs[0][1] = pix_abs16_x2_c;
3890 c->pix_abs[0][2] = pix_abs16_y2_c;
3891 c->pix_abs[0][3] = pix_abs16_xy2_c;
3892 c->pix_abs[1][0] = pix_abs8_c;
3893 c->pix_abs[1][1] = pix_abs8_x2_c;
3894 c->pix_abs[1][2] = pix_abs8_y2_c;
3895 c->pix_abs[1][3] = pix_abs8_xy2_c;
3897 #define dspfunc(PFX, IDX, NUM) \
3898 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3899 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3900 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3901 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3903 dspfunc(put, 0, 16);
3904 dspfunc(put_no_rnd, 0, 16);
3906 dspfunc(put_no_rnd, 1, 8);
3910 dspfunc(avg, 0, 16);
3911 dspfunc(avg_no_rnd, 0, 16);
3913 dspfunc(avg_no_rnd, 1, 8);
3918 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3919 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3921 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3922 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3923 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3924 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3925 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3926 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3927 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3928 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3929 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3931 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3932 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3933 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3934 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3935 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3936 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3937 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3938 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3939 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3941 #define dspfunc(PFX, IDX, NUM) \
3942 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3943 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3944 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3945 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3946 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3947 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3948 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3949 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3950 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3951 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3952 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3953 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3954 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3955 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3956 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3957 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3959 dspfunc(put_qpel, 0, 16);
3960 dspfunc(put_no_rnd_qpel, 0, 16);
3962 dspfunc(avg_qpel, 0, 16);
3963 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3965 dspfunc(put_qpel, 1, 8);
3966 dspfunc(put_no_rnd_qpel, 1, 8);
3968 dspfunc(avg_qpel, 1, 8);
3969 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3971 dspfunc(put_h264_qpel, 0, 16);
3972 dspfunc(put_h264_qpel, 1, 8);
3973 dspfunc(put_h264_qpel, 2, 4);
3974 dspfunc(put_h264_qpel, 3, 2);
3975 dspfunc(avg_h264_qpel, 0, 16);
3976 dspfunc(avg_h264_qpel, 1, 8);
3977 dspfunc(avg_h264_qpel, 2, 4);
3980 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3981 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3982 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3983 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3984 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3985 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
3987 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
3988 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
3989 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
3990 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
3991 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
3992 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
3993 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
3994 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
3995 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
3996 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
3997 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
3998 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
3999 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4000 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4001 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4002 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4003 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4004 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4005 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4006 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4008 ff_cavsdsp_init(c,avctx);
4010 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4011 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4012 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4013 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4014 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4015 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4016 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4017 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4019 #define SET_CMP_FUNC(name) \
4020 c->name[0]= name ## 16_c;\
4021 c->name[1]= name ## 8x8_c;
4023 SET_CMP_FUNC(hadamard8_diff)
4024 c->hadamard8_diff[4]= hadamard8_intra16_c;
4025 SET_CMP_FUNC(dct_sad)
4026 SET_CMP_FUNC(dct_max)
4028 SET_CMP_FUNC(dct264_sad)
4030 c->sad[0]= pix_abs16_c;
4031 c->sad[1]= pix_abs8_c;
4035 SET_CMP_FUNC(quant_psnr)
4038 c->vsad[0]= vsad16_c;
4039 c->vsad[4]= vsad_intra16_c;
4040 c->vsse[0]= vsse16_c;
4041 c->vsse[4]= vsse_intra16_c;
4042 c->nsse[0]= nsse16_c;
4043 c->nsse[1]= nsse8_c;
4044 #ifdef CONFIG_SNOW_ENCODER
4045 c->w53[0]= w53_16_c;
4047 c->w97[0]= w97_16_c;
4051 c->add_bytes= add_bytes_c;
4052 c->diff_bytes= diff_bytes_c;
4053 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4054 c->bswap_buf= bswap_buf;
4056 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4057 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4058 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4059 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4060 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4061 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4063 c->h263_h_loop_filter= h263_h_loop_filter_c;
4064 c->h263_v_loop_filter= h263_v_loop_filter_c;
4066 c->h261_loop_filter= h261_loop_filter_c;
4068 c->try_8x8basis= try_8x8basis_c;
4069 c->add_8x8basis= add_8x8basis_c;
4071 #ifdef CONFIG_SNOW_ENCODER
4072 c->vertical_compose97i = ff_snow_vertical_compose97i;
4073 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4074 c->inner_add_yblock = ff_snow_inner_add_yblock;
4077 c->shrink[0]= ff_img_copy_plane;
4078 c->shrink[1]= ff_shrink22;
4079 c->shrink[2]= ff_shrink44;
4080 c->shrink[3]= ff_shrink88;
4082 c->prefetch= just_return;
4085 dsputil_init_mmx(c, avctx);
4088 dsputil_init_armv4l(c, avctx);
4091 dsputil_init_mlib(c, avctx);
4094 dsputil_init_vis(c,avctx);
4097 dsputil_init_alpha(c, avctx);
4100 dsputil_init_ppc(c, avctx);
4103 dsputil_init_mmi(c, avctx);
4106 dsputil_init_sh4(c,avctx);
4109 switch(c->idct_permutation_type){
4110 case FF_NO_IDCT_PERM:
4112 c->idct_permutation[i]= i;
4114 case FF_LIBMPEG2_IDCT_PERM:
4116 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4118 case FF_SIMPLE_IDCT_PERM:
4120 c->idct_permutation[i]= simple_mmx_permutation[i];
4122 case FF_TRANSPOSE_IDCT_PERM:
4124 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4126 case FF_PARTTRANS_IDCT_PERM:
4128 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4131 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");