3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
/* Forward spatial discrete wavelet transform; implemented in snow.c
 * (used below by the w_c wavelet-domain comparison metric). */
36 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
/* Vorbis magnitude/angle inverse channel coupling; implemented in vorbis.c. */
39 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
/* Clipping table: cropTbl[x + MAX_NEG_CROP] presumably holds clip(x, 0, 255);
 * zero here, filled at runtime by the dsputil init code — verify against the
 * full source (this chunk does not show the initializer). */
41 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square table: used below as sq = squareTbl + 256, sq[d] for d in -256..255,
 * so squareTbl[256 + d] is presumably d*d after init — TODO confirm. */
42 uint32_t squareTbl[512] = {0, };
/* Classic 8x8 zigzag scan order (MPEG-1/2, JPEG): maps scan position to
 * raster index within the block.
 * Fix: the terminating "};" was missing from this span of the file. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields.
   Fix: the terminating "};" was missing from this span of the file. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
68 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Zeroed here; presumably filled at init time from ff_zigzag_direct —
 * DECLARE_ALIGNED_8 is a project macro (8-byte alignment for MMX loads). */
69 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate horizontal scan order (used e.g. by MPEG-4 AC prediction).
 * Fix: the terminating "};" was missing from this span of the file. */
const uint8_t ff_alternate_horizontal_scan[64] = {
    0,   1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (interlaced-friendly; MPEG-2/MPEG-4).
 * Fix: the terminating "};" was missing from this span of the file. */
const uint8_t ff_alternate_vertical_scan[64] = {
    0,   8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * (fixed-point reciprocals: inverse[b] = ceil(2^32 / b)).
 * Fix: the terminating "};" was missing from this span of the file. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for the simple_idct_mmx: coefficient index -> position
 * expected by the MMX IDCT.
 * Fix: the terminating "};" was missing from this span of the file. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Sum of all 256 pixel values of a 16x16 block.
 * pix: top-left of the block; line_size: bytes per row.
 * Fix: declarations, inner stores, pointer advance and return were missing
 * from this span of the file; the reference implementation is restored. */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
/* Sum of squared pixel values of a 16x16 block.
 * Fixes: the original read pixels through *(uint64_t*)/*(uint32_t*) casts,
 * which violates strict aliasing and alignment rules (undefined behavior),
 * and relied on the runtime-initialized squareTbl lookup table; this version
 * computes the squares directly, is portable, and needs no init. Result is
 * identical (max value 255*255*256 fits easily in int). */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j] * pix[j];
        pix += line_size;
    }
    return s;
}
/* Byte-swap w 32-bit words from src into dst (dst may equal src).
 * Main loop is unrolled 8x; the tail loop handles w not a multiple of 8.
 * Fix: the tail-loop header and closing braces were missing from this span
 * of the file. bswap_32 is a project macro/inline. */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(; i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
/* Sum of squared errors between two 4-wide columns of h rows.
 * v is an unused context pointer (kept for the function-pointer interface).
 * Fixes: pointer advance and return were missing from this span of the file;
 * also dropped the dependency on the runtime-initialized squareTbl table —
 * a direct multiply gives the identical result without needing init. */
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 4; j++) {
            const int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* Sum of squared errors between two 8-wide columns of h rows.
 * v is an unused context pointer (function-pointer interface).
 * Fixes: pointer advance and return were missing from this span of the file;
 * direct multiply replaces the runtime-initialized squareTbl lookup
 * (identical result, no init required). */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++) {
            const int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* Sum of squared errors between two 16-wide columns of h rows.
 * v is an unused context pointer (function-pointer interface).
 * Fixes: pointer advance and return were missing from this span of the file;
 * direct multiply replaces the runtime-initialized squareTbl lookup
 * (identical result; 255^2*16*16 still fits in int). */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++) {
            const int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
298 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/* Wavelet-domain distortion metric: forward-DWTs the pix1-pix2 difference
 * (via ff_spatial_dwt from snow.c) and accumulates subband coefficients
 * weighted by the perceptual scale[][][][] tables.
 * NOTE(review): this span of the file is heavily truncated — most rows of
 * the scale tables, the temporary buffer declaration, the accumulation of v
 * into the score and all closing braces are missing. Restore from the full
 * source before building; the surviving lines are kept verbatim below. */
299 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
301 const int dec_count= w==8 ? 3 : 4;
304 static const int scale[2][2][4][4]={
308 {268, 239, 239, 213},
312 // 9/7 16x16 or 32x32 dec=4
313 {344, 310, 310, 280},
321 {275, 245, 245, 218},
325 // 5/3 16x16 or 32x32 dec=4
326 {352, 317, 317, 286},
/* stage 1: signed difference, scaled by 16, into a 32-wide tmp buffer */
334 for (i = 0; i < h; i++) {
335 for (j = 0; j < w; j+=4) {
336 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
337 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
338 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
339 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
/* stage 2: in-place spatial DWT (type selects 9/7 vs 5/3 filter) */
345 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
/* stage 3: walk each decomposition level/orientation subband */
349 for(level=0; level<dec_count; level++){
350 for(ori= level ? 1 : 0; ori<4; ori++){
351 int size= w>>(dec_count-level);
352 int sx= (ori&1) ? size : 0;
353 int stride= 32<<(dec_count-level);
354 int sy= (ori&2) ? stride>>1 : 0;
356 for(i=0; i<size; i++){
357 for(j=0; j<size; j++){
358 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* Thin wrappers binding w_c to fixed block width (8/16/32) and wavelet type
 * (1 = 5/3 filter, 0 = 9/7 filter) for the comparison-function tables.
 * Fix: the closing braces were missing from this span of the file.
 * The 32-wide pair is intentionally non-static (referenced from snow). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
393 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
397 /* read the pixels */
399 block[0] = pixels[0];
400 block[1] = pixels[1];
401 block[2] = pixels[2];
402 block[3] = pixels[3];
403 block[4] = pixels[4];
404 block[5] = pixels[5];
405 block[6] = pixels[6];
406 block[7] = pixels[7];
412 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
413 const uint8_t *s2, int stride){
416 /* read the pixels */
418 block[0] = s1[0] - s2[0];
419 block[1] = s1[1] - s2[1];
420 block[2] = s1[2] - s2[2];
421 block[3] = s1[3] - s2[3];
422 block[4] = s1[4] - s2[4];
423 block[5] = s1[5] - s2[5];
424 block[6] = s1[6] - s2[6];
425 block[7] = s1[7] - s2[7];
433 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
437 uint8_t *cm = cropTbl + MAX_NEG_CROP;
439 /* read the pixels */
441 pixels[0] = cm[block[0]];
442 pixels[1] = cm[block[1]];
443 pixels[2] = cm[block[2]];
444 pixels[3] = cm[block[3]];
445 pixels[4] = cm[block[4]];
446 pixels[5] = cm[block[5]];
447 pixels[6] = cm[block[6]];
448 pixels[7] = cm[block[7]];
/* 4-wide variant of put_pixels_clamped_c: clamp coefficients to 0..255 and
 * store one row of 4 pixels.
 * NOTE(review): the loop header, the per-row pointer advances (and in
 * particular the coefficient-row stride) and the closing braces are missing
 * from this span of the file — restore from the full source; the stride
 * cannot be confirmed from what is visible here. */
455 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
459 uint8_t *cm = cropTbl + MAX_NEG_CROP;
461 /* read the pixels */
463 pixels[0] = cm[block[0]];
464 pixels[1] = cm[block[1]];
465 pixels[2] = cm[block[2]];
466 pixels[3] = cm[block[3]];
/* 2-wide variant of put_pixels_clamped_c: clamp coefficients to 0..255 and
 * store one row of 2 pixels.
 * NOTE(review): loop header, pointer advances and closing braces are missing
 * from this span of the file — restore from the full source. */
473 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
477 uint8_t *cm = cropTbl + MAX_NEG_CROP;
479 /* read the pixels */
481 pixels[0] = cm[block[0]];
482 pixels[1] = cm[block[1]];
489 static void put_signed_pixels_clamped_c(const DCTELEM *block,
490 uint8_t *restrict pixels,
495 for (i = 0; i < 8; i++) {
496 for (j = 0; j < 8; j++) {
499 else if (*block > 127)
502 *pixels = (uint8_t)(*block + 128);
506 pixels += (line_size - 8);
510 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
514 uint8_t *cm = cropTbl + MAX_NEG_CROP;
516 /* read the pixels */
518 pixels[0] = cm[pixels[0] + block[0]];
519 pixels[1] = cm[pixels[1] + block[1]];
520 pixels[2] = cm[pixels[2] + block[2]];
521 pixels[3] = cm[pixels[3] + block[3]];
522 pixels[4] = cm[pixels[4] + block[4]];
523 pixels[5] = cm[pixels[5] + block[5]];
524 pixels[6] = cm[pixels[6] + block[6]];
525 pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of add_pixels_clamped_c: add one row of 4 coefficients to
 * the pixels and clamp to 0..255.
 * NOTE(review): loop header, pointer advances (coefficient-row stride
 * unconfirmable from this view) and closing braces are missing from this
 * span of the file — restore from the full source. */
531 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
535 uint8_t *cm = cropTbl + MAX_NEG_CROP;
537 /* read the pixels */
539 pixels[0] = cm[pixels[0] + block[0]];
540 pixels[1] = cm[pixels[1] + block[1]];
541 pixels[2] = cm[pixels[2] + block[2]];
542 pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of add_pixels_clamped_c: add one row of 2 coefficients to
 * the pixels and clamp to 0..255.
 * NOTE(review): loop header, pointer advances and closing braces are missing
 * from this span of the file — restore from the full source. */
548 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
552 uint8_t *cm = cropTbl + MAX_NEG_CROP;
554 /* read the pixels */
556 pixels[0] = cm[pixels[0] + block[0]];
557 pixels[1] = cm[pixels[1] + block[1]];
563 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
567 pixels[0] += block[0];
568 pixels[1] += block[1];
569 pixels[2] += block[2];
570 pixels[3] += block[3];
571 pixels[4] += block[4];
572 pixels[5] += block[5];
573 pixels[6] += block[6];
574 pixels[7] += block[7];
/* 4-wide variant of add_pixels8_c: add a row of 4 coefficients, no clamp.
 * NOTE(review): loop header, pointer advances (coefficient-row stride
 * unconfirmable from this view) and closing braces are missing from this
 * span of the file — restore from the full source. */
580 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
584 pixels[0] += block[0];
585 pixels[1] += block[1];
586 pixels[2] += block[2];
587 pixels[3] += block[3];
/* 64-bit-word variant of the PIXOP2 pixel-copy/average kernel generator:
 * OPNAME is the function-name prefix (put/avg), OP is the store-or-average
 * operation. _x2/_y2/_xy2 are half-pel interpolations done with the classic
 * SWAR bit tricks on packed bytes ((a&b)+(((a^b)&0xFE..)>>1) etc.); the
 * "no_rnd" variants round down. LD64 is the project's unaligned 64-bit load.
 * NOTE(review): this span of the file is truncated — loop headers, pointer
 * advances and closing braces of the generated functions are missing, so the
 * macro text below is NOT complete; restore from the full source. Kept
 * verbatim. */
595 #define PIXOP2(OPNAME, OP) \
596 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
600 OP(*((uint64_t*)block), LD64(pixels));\
606 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
610 const uint64_t a= LD64(pixels );\
611 const uint64_t b= LD64(pixels+1);\
612 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
618 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
622 const uint64_t a= LD64(pixels );\
623 const uint64_t b= LD64(pixels+1);\
624 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
630 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
634 const uint64_t a= LD64(pixels );\
635 const uint64_t b= LD64(pixels+line_size);\
636 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
642 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
646 const uint64_t a= LD64(pixels );\
647 const uint64_t b= LD64(pixels+line_size);\
648 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
654 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
657 const uint64_t a= LD64(pixels );\
658 const uint64_t b= LD64(pixels+1);\
659 uint64_t l0= (a&0x0303030303030303ULL)\
660 + (b&0x0303030303030303ULL)\
661 + 0x0202020202020202ULL;\
662 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
663 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
667 for(i=0; i<h; i+=2){\
668 uint64_t a= LD64(pixels );\
669 uint64_t b= LD64(pixels+1);\
670 l1= (a&0x0303030303030303ULL)\
671 + (b&0x0303030303030303ULL);\
672 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
673 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
674 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
679 l0= (a&0x0303030303030303ULL)\
680 + (b&0x0303030303030303ULL)\
681 + 0x0202020202020202ULL;\
682 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
683 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
684 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
690 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
693 const uint64_t a= LD64(pixels );\
694 const uint64_t b= LD64(pixels+1);\
695 uint64_t l0= (a&0x0303030303030303ULL)\
696 + (b&0x0303030303030303ULL)\
697 + 0x0101010101010101ULL;\
698 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
699 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
703 for(i=0; i<h; i+=2){\
704 uint64_t a= LD64(pixels );\
705 uint64_t b= LD64(pixels+1);\
706 l1= (a&0x0303030303030303ULL)\
707 + (b&0x0303030303030303ULL);\
708 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
709 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
710 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
715 l0= (a&0x0303030303030303ULL)\
716 + (b&0x0303030303030303ULL)\
717 + 0x0101010101010101ULL;\
718 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
719 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
720 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
726 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
727 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
728 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
729 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
730 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
731 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
732 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* op_avg for the 64-bit path: rounded byte-wise average done in one word */
734 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
735 #else // 64 bit variant
/* 32-bit-word variant of the PIXOP2 generator (compiled when LONG_MAX does
 * not indicate a 64-bit long). Generates put/avg pixel copy, 2-source (_l2)
 * and 4-source (_l4) averaging helpers, and half-pel _x2/_y2/_xy2 kernels in
 * widths 2/4/8/16 using 32-bit SWAR arithmetic. LD16/LD32 are the project's
 * unaligned loads; rnd_avg32/no_rnd_avg32 the packed-byte averages.
 * NOTE(review): this span of the file is truncated — many loop headers,
 * declarations, pointer advances and closing braces of the generated
 * functions are missing, so the macro text below is NOT complete; restore
 * from the full source. Kept verbatim. */
737 #define PIXOP2(OPNAME, OP) \
738 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
741 OP(*((uint16_t*)(block )), LD16(pixels ));\
746 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
749 OP(*((uint32_t*)(block )), LD32(pixels ));\
754 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
757 OP(*((uint32_t*)(block )), LD32(pixels ));\
758 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
763 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
764 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
767 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
768 int src_stride1, int src_stride2, int h){\
772 a= LD32(&src1[i*src_stride1 ]);\
773 b= LD32(&src2[i*src_stride2 ]);\
774 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
775 a= LD32(&src1[i*src_stride1+4]);\
776 b= LD32(&src2[i*src_stride2+4]);\
777 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
781 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
782 int src_stride1, int src_stride2, int h){\
786 a= LD32(&src1[i*src_stride1 ]);\
787 b= LD32(&src2[i*src_stride2 ]);\
788 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
789 a= LD32(&src1[i*src_stride1+4]);\
790 b= LD32(&src2[i*src_stride2+4]);\
791 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
795 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
796 int src_stride1, int src_stride2, int h){\
800 a= LD32(&src1[i*src_stride1 ]);\
801 b= LD32(&src2[i*src_stride2 ]);\
802 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
806 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
807 int src_stride1, int src_stride2, int h){\
811 a= LD16(&src1[i*src_stride1 ]);\
812 b= LD16(&src2[i*src_stride2 ]);\
813 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
817 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
818 int src_stride1, int src_stride2, int h){\
819 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
820 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
823 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
824 int src_stride1, int src_stride2, int h){\
825 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
826 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
829 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
830 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
833 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
834 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
837 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
838 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
841 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
842 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
845 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
846 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
849 uint32_t a, b, c, d, l0, l1, h0, h1;\
850 a= LD32(&src1[i*src_stride1]);\
851 b= LD32(&src2[i*src_stride2]);\
852 c= LD32(&src3[i*src_stride3]);\
853 d= LD32(&src4[i*src_stride4]);\
854 l0= (a&0x03030303UL)\
857 h0= ((a&0xFCFCFCFCUL)>>2)\
858 + ((b&0xFCFCFCFCUL)>>2);\
859 l1= (c&0x03030303UL)\
861 h1= ((c&0xFCFCFCFCUL)>>2)\
862 + ((d&0xFCFCFCFCUL)>>2);\
863 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
864 a= LD32(&src1[i*src_stride1+4]);\
865 b= LD32(&src2[i*src_stride2+4]);\
866 c= LD32(&src3[i*src_stride3+4]);\
867 d= LD32(&src4[i*src_stride4+4]);\
868 l0= (a&0x03030303UL)\
871 h0= ((a&0xFCFCFCFCUL)>>2)\
872 + ((b&0xFCFCFCFCUL)>>2);\
873 l1= (c&0x03030303UL)\
875 h1= ((c&0xFCFCFCFCUL)>>2)\
876 + ((d&0xFCFCFCFCUL)>>2);\
877 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
881 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
882 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
885 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
886 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
889 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
890 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
893 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
894 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
897 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
898 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
901 uint32_t a, b, c, d, l0, l1, h0, h1;\
902 a= LD32(&src1[i*src_stride1]);\
903 b= LD32(&src2[i*src_stride2]);\
904 c= LD32(&src3[i*src_stride3]);\
905 d= LD32(&src4[i*src_stride4]);\
906 l0= (a&0x03030303UL)\
909 h0= ((a&0xFCFCFCFCUL)>>2)\
910 + ((b&0xFCFCFCFCUL)>>2);\
911 l1= (c&0x03030303UL)\
913 h1= ((c&0xFCFCFCFCUL)>>2)\
914 + ((d&0xFCFCFCFCUL)>>2);\
915 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
916 a= LD32(&src1[i*src_stride1+4]);\
917 b= LD32(&src2[i*src_stride2+4]);\
918 c= LD32(&src3[i*src_stride3+4]);\
919 d= LD32(&src4[i*src_stride4+4]);\
920 l0= (a&0x03030303UL)\
923 h0= ((a&0xFCFCFCFCUL)>>2)\
924 + ((b&0xFCFCFCFCUL)>>2);\
925 l1= (c&0x03030303UL)\
927 h1= ((c&0xFCFCFCFCUL)>>2)\
928 + ((d&0xFCFCFCFCUL)>>2);\
929 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
932 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
933 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
934 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
935 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
938 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
939 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
940 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
943 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
945 int i, a0, b0, a1, b1;\
952 for(i=0; i<h; i+=2){\
958 block[0]= (a1+a0)>>2; /* FIXME non put */\
959 block[1]= (b1+b0)>>2;\
969 block[0]= (a1+a0)>>2;\
970 block[1]= (b1+b0)>>2;\
976 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
979 const uint32_t a= LD32(pixels );\
980 const uint32_t b= LD32(pixels+1);\
981 uint32_t l0= (a&0x03030303UL)\
984 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
985 + ((b&0xFCFCFCFCUL)>>2);\
989 for(i=0; i<h; i+=2){\
990 uint32_t a= LD32(pixels );\
991 uint32_t b= LD32(pixels+1);\
992 l1= (a&0x03030303UL)\
994 h1= ((a&0xFCFCFCFCUL)>>2)\
995 + ((b&0xFCFCFCFCUL)>>2);\
996 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1001 l0= (a&0x03030303UL)\
1004 h0= ((a&0xFCFCFCFCUL)>>2)\
1005 + ((b&0xFCFCFCFCUL)>>2);\
1006 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1012 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1015 for(j=0; j<2; j++){\
1017 const uint32_t a= LD32(pixels );\
1018 const uint32_t b= LD32(pixels+1);\
1019 uint32_t l0= (a&0x03030303UL)\
1022 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1023 + ((b&0xFCFCFCFCUL)>>2);\
1027 for(i=0; i<h; i+=2){\
1028 uint32_t a= LD32(pixels );\
1029 uint32_t b= LD32(pixels+1);\
1030 l1= (a&0x03030303UL)\
1031 + (b&0x03030303UL);\
1032 h1= ((a&0xFCFCFCFCUL)>>2)\
1033 + ((b&0xFCFCFCFCUL)>>2);\
1034 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1039 l0= (a&0x03030303UL)\
1042 h0= ((a&0xFCFCFCFCUL)>>2)\
1043 + ((b&0xFCFCFCFCUL)>>2);\
1044 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1048 pixels+=4-line_size*(h+1);\
1049 block +=4-line_size*h;\
1053 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1056 for(j=0; j<2; j++){\
1058 const uint32_t a= LD32(pixels );\
1059 const uint32_t b= LD32(pixels+1);\
1060 uint32_t l0= (a&0x03030303UL)\
1063 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1064 + ((b&0xFCFCFCFCUL)>>2);\
1068 for(i=0; i<h; i+=2){\
1069 uint32_t a= LD32(pixels );\
1070 uint32_t b= LD32(pixels+1);\
1071 l1= (a&0x03030303UL)\
1072 + (b&0x03030303UL);\
1073 h1= ((a&0xFCFCFCFCUL)>>2)\
1074 + ((b&0xFCFCFCFCUL)>>2);\
1075 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1080 l0= (a&0x03030303UL)\
1083 h0= ((a&0xFCFCFCFCUL)>>2)\
1084 + ((b&0xFCFCFCFCUL)>>2);\
1085 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1089 pixels+=4-line_size*(h+1);\
1090 block +=4-line_size*h;\
1094 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1095 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1096 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1098 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1099 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1100 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1101 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* op_avg/op_put plug into the OP slot of PIXOP2 above */
1103 #define op_avg(a, b) a = rnd_avg32(a, b)
1105 #define op_put(a, b) a = b
/* Rounded 2- and 4-tap byte averages used by the interpolation code below.
 * Fix: macro arguments are now fully parenthesized so expressions such as
 * avg2(x|y, z) expand correctly (classic macro-hygiene bug). */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* Fixed-stride adapters over the PIXOP2-generated two-source averagers.
 * Fix: the closing braces were missing from this span of the file. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/* One-warp-point GMC: bilinear interpolation of an 8xh block at a 1/16-pel
 * fractional position (x16, y16 in 0..15). The four corner weights A..D sum
 * to 256, so adding rounder and shifting by 8 renormalizes.
 * Fix: the row loop and per-row pointer advances were missing from this span
 * of the file; the standard implementation is restored. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/* General GMC (affine warp) motion compensation of an 8-wide column:
 * (ox,oy) is the 16.16-style start position scaled by shift; dxx/dxy/dyx/dyy
 * are the per-pixel/per-row position increments; r is the rounding constant;
 * out-of-picture samples are edge-clamped via clip().
 * NOTE(review): this span of the file is heavily truncated — the y loop,
 * the src_x/src_y derivation from the accumulated position, the rounding
 * ">>(shift*2)" tails of the interpolation expressions, the else branches
 * and all closing braces are missing. Restore from the full source; the
 * surviving lines are kept verbatim below. */
1146 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1147 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1150 const int s= 1<<shift;
1160 for(x=0; x<8; x++){ //XXX FIXME optimize
1161 int src_x, src_y, frac_x, frac_y, index;
1165 frac_x= src_x&(s-1);
1166 frac_y= src_y&(s-1);
1170 if((unsigned)src_x < width){
1171 if((unsigned)src_y < height){
/* fully inside the picture: plain bilinear interpolation */
1172 index= src_x + src_y*stride;
1173 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1174 + src[index +1]* frac_x )*(s-frac_y)
1175 + ( src[index+stride ]*(s-frac_x)
1176 + src[index+stride+1]* frac_x )* frac_y
/* y outside: clamp the row, interpolate horizontally only */
1179 index= src_x + clip(src_y, 0, height)*stride;
1180 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1181 + src[index +1]* frac_x )*s
1185 if((unsigned)src_y < height){
/* x outside: clamp the column, interpolate vertically only */
1186 index= clip(src_x, 0, width) + src_y*stride;
1187 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1188 + src[index+stride ]* frac_y )*s
/* both outside: nearest edge sample */
1191 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
1192 dst[y*stride + x]= src[index ];
/* Third-pel MC, integer position (0,0): plain block copy dispatched on the
 * block width via the PIXOP2-generated put_pixelsN_c helpers.
 * Fix: the switch statement and braces were missing from this span of the
 * file; a default case is added so unexpected widths are ignored safely. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    default: break;
    }
}
/* Third-pel MC (1/3, 0): horizontal interpolation weighted 2:1 toward the
 * left sample; 683/2^11 approximates 1/3 with rounding.
 * Fix: per-row pointer advances and closing braces were missing from this
 * span of the file. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC (2/3, 0): horizontal interpolation weighted 2:1 toward the
 * right sample.
 * Fix: per-row pointer advances and closing braces were missing from this
 * span of the file. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, vertical offset 1/3: (2*top + bottom)/3. */
1235 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1237 for (i=0; i < height; i++) {
1238 for (j=0; j < width; j++) {
1239 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Third-pel MC, offset (1/3,1/3): 2D blend with weights 4:3:3:2 (sum 12);
 * 2731/32768 approximates 1/12 (2731*12 = 32772). */
1246 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1248 for (i=0; i < height; i++) {
1249 for (j=0; j < width; j++) {
1250 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, offset (1/3,2/3): weights 3:2:4:3 (sum 12), /12 via 2731>>15. */
1257 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1259 for (i=0; i < height; i++) {
1260 for (j=0; j < width; j++) {
1261 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, vertical offset 2/3: (top + 2*bottom)/3. */
1268 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1270 for (i=0; i < height; i++) {
1271 for (j=0; j < width; j++) {
1272 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Third-pel MC, offset (2/3,1/3): weights 3:4:2:3 (sum 12), /12 via 2731>>15. */
1279 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1281 for (i=0; i < height; i++) {
1282 for (j=0; j < width; j++) {
1283 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, offset (2/3,2/3): weights 2:3:3:4 (sum 12), /12 via 2731>>15. */
1290 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1292 for (i=0; i < height; i++) {
1293 for (j=0; j < width; j++) {
1294 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Averaging variant of mc00: dispatch to the width-specific avg copy.
 * NOTE(review): the switch(...) line and closing braces are missing from
 * this excerpt. */
1301 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1303 case 2: avg_pixels2_c (dst, src, stride, height); break;
1304 case 4: avg_pixels4_c (dst, src, stride, height); break;
1305 case 8: avg_pixels8_c (dst, src, stride, height); break;
1306 case 16:avg_pixels16_c(dst, src, stride, height); break;
/* Averaging variant of mc10: interpolate (2*a+b)/3, then average with the
 * existing dst value, rounding up (+1 >> 1). */
1310 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1312 for (i=0; i < height; i++) {
1313 for (j=0; j < width; j++) {
1314 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc20: (a+2*b)/3 blended with dst, rounded average. */
1321 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1323 for (i=0; i < height; i++) {
1324 for (j=0; j < width; j++) {
1325 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc01: vertical (2*top+bottom)/3, rounded average with dst. */
1332 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1334 for (i=0; i < height; i++) {
1335 for (j=0; j < width; j++) {
1336 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc11: 2D 4:3:3:2/12 blend, rounded average with dst. */
1343 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1345 for (i=0; i < height; i++) {
1346 for (j=0; j < width; j++) {
1347 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc12: 3:2:4:3/12 blend, rounded average with dst. */
1354 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1356 for (i=0; i < height; i++) {
1357 for (j=0; j < width; j++) {
1358 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc02: vertical (top+2*bottom)/3, rounded average with dst. */
1365 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1367 for (i=0; i < height; i++) {
1368 for (j=0; j < width; j++) {
1369 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride]) + 1) >> 11) + 1) >> 1;
/* Averaging variant of mc21: 3:4:2:3/12 blend, rounded average with dst. */
1376 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1378 for (i=0; i < height; i++) {
1379 for (j=0; j < width; j++) {
1380 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc22: 2:3:3:4/12 blend, rounded average with dst. */
1387 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1389 for (i=0; i < height; i++) {
1390 for (j=0; j < width; j++) {
1391 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Generates fixed-width wrappers around the generic put_tpel_* helpers,
 * one per third-pel position (mc00..mc22).
 * NOTE(review): each wrapper body reads `void put_tpel_pixels_mcXX_c(...)` —
 * the leading `void` turns what looks like a call into a (mal-formed)
 * declaration, so these wrappers as written do nothing. Looks like a bug,
 * or the macro is never instantiated — TODO confirm against callers. */
1398 #define TPEL_WIDTH(width)\
1399 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1400 void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1401 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1402 void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1403 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1404 void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1405 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1406 void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1407 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1408 void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1409 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1410 void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1411 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1412 void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1413 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1414 void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1415 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1416 void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* Generates chroma MC functions for block widths 2/4/8. Bilinear blend at
 * eighth-pel precision: weights A=(8-x)(8-y), B=x(8-y), C=(8-x)y, D=xy
 * always sum to 64; OP is expected to divide the weighted sum back down
 * (see op_put/op_avg below the macro).
 * NOTE(review): the per-row loop headers and dst/src stride advances are
 * missing from this excerpt. */
1419 #define H264_CHROMA_MC(OPNAME, OP)\
1420 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1421 const int A=(8-x)*(8-y);\
1422 const int B=( x)*(8-y);\
1423 const int C=(8-x)*( y);\
1424 const int D=( x)*( y);\
1427 assert(x<8 && y<8 && x>=0 && y>=0);\
1431 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1432 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
/* 4-wide variant: same weights, four output pixels per row */\
1438 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1439 const int A=(8-x)*(8-y);\
1440 const int B=( x)*(8-y);\
1441 const int C=(8-x)*( y);\
1442 const int D=( x)*( y);\
1445 assert(x<8 && y<8 && x>=0 && y>=0);\
1449 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1450 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1451 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1452 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
/* 8-wide variant: eight output pixels per row */\
1458 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1459 const int A=(8-x)*(8-y);\
1460 const int B=( x)*(8-y);\
1461 const int C=(8-x)*( y);\
1462 const int D=( x)*( y);\
1465 assert(x<8 && y<8 && x>=0 && y>=0);\
1469 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1470 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1471 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1472 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1473 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1474 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1475 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1476 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* Pixel-store ops for the chroma MC macro: the weighted sum b is scaled by
 * 64, so ((b)+32)>>6 is a rounded divide by 64. op_avg additionally averages
 * with the existing dst value (rounding up). */
1482 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1483 #define op_put(a, b) a = (((b) + 32)>>6)
/* instantiate put_/avg_ h264_chroma_mc{2,4,8}_c */
1485 H264_CHROMA_MC(put_ , op_put)
1486 H264_CHROMA_MC(avg_ , op_avg)
/* Copy a 2-pixel-wide block, one 16-bit load/store per row.
 * NOTE(review): the row loop and pointer advances are missing from this excerpt. */
1490 static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1495 ST16(dst , LD16(src ));
/* Copy a 4-pixel-wide block, one 32-bit load/store per row.
 * NOTE(review): the row loop and pointer advances are missing from this excerpt. */
1501 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1506 ST32(dst , LD32(src ));
/* Copy an 8-pixel-wide block, two 32-bit load/stores per row.
 * NOTE(review): the row loop and pointer advances are missing from this excerpt. */
1512 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1517 ST32(dst , LD32(src ));
1518 ST32(dst+4 , LD32(src+4 ));
/* Copy a 16-pixel-wide block, four 32-bit load/stores per row.
 * NOTE(review): the row loop and pointer advances are missing from this excerpt. */
1524 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1529 ST32(dst , LD32(src ));
1530 ST32(dst+4 , LD32(src+4 ));
1531 ST32(dst+8 , LD32(src+8 ));
1532 ST32(dst+12, LD32(src+12));
/* Copy a 17-pixel-wide block (16+1 for qpel edge rows): four 32-bit
 * load/stores per row plus a trailing byte.
 * NOTE(review): the 17th-byte copy, the row loop and pointer advances are
 * missing from this excerpt. */
1538 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1543 ST32(dst , LD32(src ));
1544 ST32(dst+4 , LD32(src+4 ));
1545 ST32(dst+8 , LD32(src+8 ));
1546 ST32(dst+12, LD32(src+12));
/* Copy a 9-pixel-wide block (8+1 for qpel edge rows): two 32-bit
 * load/stores per row plus a trailing byte.
 * NOTE(review): the 9th-byte copy, the row loop and pointer advances are
 * missing from this excerpt. */
1553 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1558 ST32(dst , LD32(src ));
1559 ST32(dst+4 , LD32(src+4 ));
/* Generates the full set of MPEG-4 quarter-pel motion-compensation
 * functions for one OPNAME (put_, put_no_rnd_, avg_) and one pixel-store
 * op OP. The lowpass helpers apply the half-pel FIR with coefficients
 * (20,-6,3,-1) whose taps are mirrored at the block borders (tap sum 32;
 * OP divides by 32, see op_put/op_avg below). The mcXY functions then build
 * all 16 quarter-pel positions by averaging the lowpass results with the
 * integer/half-pel samples via the pixels*_l2/_l4 helpers. The ff_*_old_c
 * variants are the older 4-way-average formulation kept for reference.
 * NOTE(review): this excerpt is sampled — loop headers, stride advances,
 * some local buffer declarations and closing braces are missing; comments
 * describe only the visible lines. */
1567 #define QPEL_MC(r, OPNAME, RND, OP) \
/* 8-wide horizontal lowpass; cm points into cropTbl for clamping to 0..255 */\
1568 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1569 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1573 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1574 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1575 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1576 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1577 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1578 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1579 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1580 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* 8-wide vertical lowpass: same FIR applied down each column */\
1586 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1588 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1592 const int src0= src[0*srcStride];\
1593 const int src1= src[1*srcStride];\
1594 const int src2= src[2*srcStride];\
1595 const int src3= src[3*srcStride];\
1596 const int src4= src[4*srcStride];\
1597 const int src5= src[5*srcStride];\
1598 const int src6= src[6*srcStride];\
1599 const int src7= src[7*srcStride];\
1600 const int src8= src[8*srcStride];\
1601 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1602 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1603 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1604 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1605 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1606 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1607 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1608 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* 16-wide horizontal lowpass */\
1614 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1615 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1620 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1621 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1622 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1623 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1624 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1625 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1626 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1627 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1628 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1629 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1630 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1631 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1632 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1633 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1634 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1635 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* 16-wide vertical lowpass */\
1641 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1642 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1647 const int src0= src[0*srcStride];\
1648 const int src1= src[1*srcStride];\
1649 const int src2= src[2*srcStride];\
1650 const int src3= src[3*srcStride];\
1651 const int src4= src[4*srcStride];\
1652 const int src5= src[5*srcStride];\
1653 const int src6= src[6*srcStride];\
1654 const int src7= src[7*srcStride];\
1655 const int src8= src[8*srcStride];\
1656 const int src9= src[9*srcStride];\
1657 const int src10= src[10*srcStride];\
1658 const int src11= src[11*srcStride];\
1659 const int src12= src[12*srcStride];\
1660 const int src13= src[13*srcStride];\
1661 const int src14= src[14*srcStride];\
1662 const int src15= src[15*srcStride];\
1663 const int src16= src[16*srcStride];\
1664 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1665 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1666 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1667 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1668 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1669 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1670 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1671 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1672 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1673 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1674 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1675 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1676 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1677 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1678 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1679 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* 8x8 quarter-pel position functions: mcXY = x,y quarter-pel offsets.
 * full[] holds a 9x9 (stride 16) copy of the source incl. the extra
 * right/bottom row needed by the filters. */\
1685 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1686 OPNAME ## pixels8_c(dst, src, stride, 8);\
1689 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1691 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1692 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1695 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1696 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1699 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1701 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1702 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1705 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1706 uint8_t full[16*9];\
1708 copy_block9(full, src, 16, stride, 9);\
1709 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1710 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1713 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1714 uint8_t full[16*9];\
1715 copy_block9(full, src, 16, stride, 9);\
1716 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1719 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1720 uint8_t full[16*9];\
1722 copy_block9(full, src, 16, stride, 9);\
1723 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1724 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
/* _old_c diagonal variants: 4-way average of source + H + V + HV planes */\
1726 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1727 uint8_t full[16*9];\
1730 uint8_t halfHV[64];\
1731 copy_block9(full, src, 16, stride, 9);\
1732 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1733 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1734 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1735 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1737 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1738 uint8_t full[16*9];\
1740 uint8_t halfHV[64];\
1741 copy_block9(full, src, 16, stride, 9);\
1742 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1743 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1744 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1745 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1747 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1748 uint8_t full[16*9];\
1751 uint8_t halfHV[64];\
1752 copy_block9(full, src, 16, stride, 9);\
1753 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1754 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1755 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1756 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1758 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1759 uint8_t full[16*9];\
1761 uint8_t halfHV[64];\
1762 copy_block9(full, src, 16, stride, 9);\
1763 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1764 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1765 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1766 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1768 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1769 uint8_t full[16*9];\
1772 uint8_t halfHV[64];\
1773 copy_block9(full, src, 16, stride, 9);\
1774 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1775 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1776 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1777 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1779 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1780 uint8_t full[16*9];\
1782 uint8_t halfHV[64];\
1783 copy_block9(full, src, 16, stride, 9);\
1784 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1785 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1786 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1787 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1789 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1790 uint8_t full[16*9];\
1793 uint8_t halfHV[64];\
1794 copy_block9(full, src, 16, stride, 9);\
1795 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1796 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1797 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1798 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1800 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1801 uint8_t full[16*9];\
1803 uint8_t halfHV[64];\
1804 copy_block9(full, src, 16, stride, 9);\
1805 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1806 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1807 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1808 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1810 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1812 uint8_t halfHV[64];\
1813 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1814 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1815 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1817 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1819 uint8_t halfHV[64];\
1820 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1821 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1822 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1824 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1825 uint8_t full[16*9];\
1828 uint8_t halfHV[64];\
1829 copy_block9(full, src, 16, stride, 9);\
1830 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1831 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1832 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1833 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1835 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1836 uint8_t full[16*9];\
1838 copy_block9(full, src, 16, stride, 9);\
1839 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1840 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1841 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1843 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1844 uint8_t full[16*9];\
1847 uint8_t halfHV[64];\
1848 copy_block9(full, src, 16, stride, 9);\
1849 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1850 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1851 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1852 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1854 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1855 uint8_t full[16*9];\
1857 copy_block9(full, src, 16, stride, 9);\
1858 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1859 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1860 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1862 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1864 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1865 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* 16x16 quarter-pel position functions: same scheme, full[] is 17x17
 * (stride 24), halfH is 16x17, halfV/halfHV are 16x16 */\
1867 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1868 OPNAME ## pixels16_c(dst, src, stride, 16);\
1871 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1873 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1874 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1877 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1878 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1881 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1883 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1884 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1887 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1888 uint8_t full[24*17];\
1890 copy_block17(full, src, 24, stride, 17);\
1891 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1892 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1895 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1896 uint8_t full[24*17];\
1897 copy_block17(full, src, 24, stride, 17);\
1898 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1901 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1902 uint8_t full[24*17];\
1904 copy_block17(full, src, 24, stride, 17);\
1905 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1906 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1908 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1909 uint8_t full[24*17];\
1910 uint8_t halfH[272];\
1911 uint8_t halfV[256];\
1912 uint8_t halfHV[256];\
1913 copy_block17(full, src, 24, stride, 17);\
1914 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1915 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1916 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1917 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1919 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1920 uint8_t full[24*17];\
1921 uint8_t halfH[272];\
1922 uint8_t halfHV[256];\
1923 copy_block17(full, src, 24, stride, 17);\
1924 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1925 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1926 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1927 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1929 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1930 uint8_t full[24*17];\
1931 uint8_t halfH[272];\
1932 uint8_t halfV[256];\
1933 uint8_t halfHV[256];\
1934 copy_block17(full, src, 24, stride, 17);\
1935 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1936 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1937 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1938 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1940 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1941 uint8_t full[24*17];\
1942 uint8_t halfH[272];\
1943 uint8_t halfHV[256];\
1944 copy_block17(full, src, 24, stride, 17);\
1945 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1946 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1947 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1948 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1950 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1951 uint8_t full[24*17];\
1952 uint8_t halfH[272];\
1953 uint8_t halfV[256];\
1954 uint8_t halfHV[256];\
1955 copy_block17(full, src, 24, stride, 17);\
1956 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1957 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1958 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1959 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1961 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1962 uint8_t full[24*17];\
1963 uint8_t halfH[272];\
1964 uint8_t halfHV[256];\
1965 copy_block17(full, src, 24, stride, 17);\
1966 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1967 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1968 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1969 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1971 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1972 uint8_t full[24*17];\
1973 uint8_t halfH[272];\
1974 uint8_t halfV[256];\
1975 uint8_t halfHV[256];\
1976 copy_block17(full, src, 24, stride, 17);\
1977 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1978 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1979 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1980 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1982 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1983 uint8_t full[24*17];\
1984 uint8_t halfH[272];\
1985 uint8_t halfHV[256];\
1986 copy_block17(full, src, 24, stride, 17);\
1987 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1988 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1989 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1990 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1992 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1993 uint8_t halfH[272];\
1994 uint8_t halfHV[256];\
1995 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1996 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1997 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1999 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2000 uint8_t halfH[272];\
2001 uint8_t halfHV[256];\
2002 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2003 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2004 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2006 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2007 uint8_t full[24*17];\
2008 uint8_t halfH[272];\
2009 uint8_t halfV[256];\
2010 uint8_t halfHV[256];\
2011 copy_block17(full, src, 24, stride, 17);\
2012 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2013 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2014 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2015 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2017 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2018 uint8_t full[24*17];\
2019 uint8_t halfH[272];\
2020 copy_block17(full, src, 24, stride, 17);\
2021 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2022 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2023 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2025 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2026 uint8_t full[24*17];\
2027 uint8_t halfH[272];\
2028 uint8_t halfV[256];\
2029 uint8_t halfHV[256];\
2030 copy_block17(full, src, 24, stride, 17);\
2031 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2032 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2033 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2034 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2036 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2037 uint8_t full[24*17];\
2038 uint8_t halfH[272];\
2039 copy_block17(full, src, 24, stride, 17);\
2040 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2041 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2042 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2044 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2045 uint8_t halfH[272];\
2046 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2047 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel-store ops for QPEL_MC: the FIR sum b is scaled by 32, so
 * ((b)+16)>>5 is a rounded divide (and ((b)+15)>>5 the no-round variant);
 * cm (cropTbl + MAX_NEG_CROP) clamps to 0..255. op_avg additionally
 * averages with the existing dst value, rounding up. */
2050 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2051 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2052 #define op_put(a, b) a = cm[((b) + 16)>>5]
2053 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* instantiate rounded put/avg and the no-rounding put variant */
2055 QPEL_MC(0, put_ , _ , op_put)
2056 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2057 QPEL_MC(0, avg_ , _ , op_avg)
2058 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* NOTE(review): matching #undef op_avg / #undef op_put lines are not
 * visible in this excerpt — confirm they exist in the full file. */
2060 #undef op_avg_no_rnd
2062 #undef op_put_no_rnd
/* H264_LOWPASS(OPNAME, OP, OP2) instantiates the H.264 6-tap
 * (1,-5,20,20,-5,1) half-pel interpolation filters for 2, 4, 8 and 16 wide
 * blocks: _h_lowpass (horizontal), _v_lowpass (vertical) and _hv_lowpass
 * (horizontal pass into a 16-bit tmp buffer, then vertical pass stored via
 * OP2, which normalizes the doubly-filtered sum by >>10).
 * OPNAME selects put_/avg_; OP/OP2 are the single/double-pass store ops. */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
/* 16-wide variants are composed of four 8x8 quadrants. */\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}
/* H264_MC(OPNAME, SIZE) instantiates the 16 quarter-pel motion-compensation
 * positions mcXY (X = horizontal quarter offset, Y = vertical) for one block
 * size, built from the _h/_v/_hv lowpass filters above plus pixel averaging
 * (pixels_l2) for the quarter positions.  copy_block pads SIZE+5 rows into
 * "full" so the 6-tap vertical filter can read 2 rows above / 3 below. */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
2466 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2467 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2468 #define op_put(a, b) a = cm[((b) + 16)>>5]
2469 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2470 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2472 H264_LOWPASS(put_ , op_put, op2_put)
2473 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* H.264 explicit / bidirectional weighted prediction (WxH blocks).
 * weight: scales the block and rounds by 2^log2_denom; biweight: combines
 * src/dst with weights and an odd-ified offset as required by the spec. */
#define op_scale1(x)  block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
2558 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2559 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2563 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2564 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2565 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2566 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2567 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2568 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2569 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2570 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
#ifdef CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* CAVS full-pel copy/average wrappers: expose the plain pixel routines
 * under the names the CAVS decoder's function table expects. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1 specific */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* VC-1 full-pel copy wrapper; rnd is part of the mspel interface but is
 * irrelevant for the (0,0) position. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2603 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2604 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2608 const int src_1= src[ -srcStride];
2609 const int src0 = src[0 ];
2610 const int src1 = src[ srcStride];
2611 const int src2 = src[2*srcStride];
2612 const int src3 = src[3*srcStride];
2613 const int src4 = src[4*srcStride];
2614 const int src5 = src[5*srcStride];
2615 const int src6 = src[6*srcStride];
2616 const int src7 = src[7*srcStride];
2617 const int src8 = src[8*srcStride];
2618 const int src9 = src[9*srcStride];
2619 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2620 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2621 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2622 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2623 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2624 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2625 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2626 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 half-pel MC positions (mcXY: X horizontal, Y vertical half offset),
 * built from the wmv2_mspel8 h/v lowpass filters and pixel averaging.
 * halfH is 8x11 (three extra rows) so the vertical pass has context. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2680 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2682 const int strength= ff_h263_loop_filter_strength[qscale];
2686 int p0= src[x-2*stride];
2687 int p1= src[x-1*stride];
2688 int p2= src[x+0*stride];
2689 int p3= src[x+1*stride];
2690 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2692 if (d<-2*strength) d1= 0;
2693 else if(d<- strength) d1=-2*strength - d;
2694 else if(d< strength) d1= d;
2695 else if(d< 2*strength) d1= 2*strength - d;
2700 if(p1&256) p1= ~(p1>>31);
2701 if(p2&256) p2= ~(p2>>31);
2703 src[x-1*stride] = p1;
2704 src[x+0*stride] = p2;
2708 d2= clip((p0-p3)/4, -ad1, ad1);
2710 src[x-2*stride] = p0 - d2;
2711 src[x+ stride] = p3 + d2;
2715 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2717 const int strength= ff_h263_loop_filter_strength[qscale];
2721 int p0= src[y*stride-2];
2722 int p1= src[y*stride-1];
2723 int p2= src[y*stride+0];
2724 int p3= src[y*stride+1];
2725 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2727 if (d<-2*strength) d1= 0;
2728 else if(d<- strength) d1=-2*strength - d;
2729 else if(d< strength) d1= d;
2730 else if(d< 2*strength) d1= 2*strength - d;
2735 if(p1&256) p1= ~(p1>>31);
2736 if(p2&256) p2= ~(p2>>31);
2738 src[y*stride-1] = p1;
2739 src[y*stride+0] = p2;
2743 d2= clip((p0-p3)/4, -ad1, ad1);
2745 src[y*stride-2] = p0 - d2;
2746 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing over an 8x8 block.
 * Vertical pass into temp[] (border rows copied scaled by 4 so every entry
 * carries the same gain), then horizontal pass with final >>4 normalize;
 * border columns only need the vertical normalization (>>2). */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* vertical (1,2,1) pass; rows 0 and 7 are pass-through (x4). */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal (1,2,1) pass; columns 0 and 7 are pass-through. */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/* H.264 normal (bS<4) luma deblocking of one 16-sample edge, 4 samples per
 * tc0 entry.  xstride steps across the edge, ystride along it; tc0[i] < 0
 * means "no filtering" for that group.  tc is widened by one for each side
 * whose p2/q2 passes the beta test (that side also filters p1/q1). */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( ABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( ABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Direction wrappers: vertical edge filtering walks across rows (xstride =
 * stride), horizontal walks across columns (xstride = 1). */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
/* H.264 normal (bS<4) chroma deblocking of one 8-sample edge, 2 samples per
 * tc0 entry; chroma only modifies p0/q0 and uses tc0[i]+1 as the clip. */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( ABS( p0 - q0 ) < alpha &&
                ABS( p1 - p0 ) < beta &&
                ABS( q1 - q0 ) < beta ) {

                int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Direction wrappers for the chroma deblocking filter. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
/* H.264 strong (bS==4, intra) chroma deblocking of one 8-sample edge:
 * p0/q0 are replaced by fixed (2,1,1)/4 averages when the edge passes the
 * alpha/beta activity tests. */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( ABS( p0 - q0 ) < alpha &&
            ABS( p1 - p0 ) < beta &&
            ABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* Direction wrappers for the intra (strong) chroma deblocking filter. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
/**
 * Sum of absolute differences of a 16-wide, h-tall block.
 * @param v unused context pointer (matches the me_cmp function signature)
 * @return SAD of pix1 vs pix2 over 16*h pixels
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* SAD of pix1 vs the horizontal half-pel interpolation of pix2
 * (avg2 of each pixel and its right neighbor), 16 wide, h tall. */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* SAD of pix1 vs the vertical half-pel interpolation of pix2
 * (avg2 of each pixel and the one below it), 16 wide, h tall. */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* SAD of pix1 vs the diagonal half-pel interpolation of pix2
 * (avg4 of the 2x2 neighborhood), 16 wide, h tall. */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * Sum of absolute differences of an 8-wide, h-tall block.
 * @param v unused context pointer (matches the me_cmp function signature)
 * @return SAD of pix1 vs pix2 over 8*h pixels
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* SAD of pix1 vs horizontal half-pel interpolation of pix2, 8 wide. */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* SAD of pix1 vs vertical half-pel interpolation of pix2, 8 wide. */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* SAD of pix1 vs diagonal half-pel interpolation of pix2, 8 wide. */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* Noise-preserving SSE for a 16-pixel-wide block: plain SSE plus a weighted
 * penalty on the difference of the 2x2 gradients of the two blocks.
 * NOTE(review): elided listing -- the declarations of score1/score2/x/y and
 * the outer y loops are missing from this excerpt; do not edit blindly. */
3091 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3092 MpegEncContext *c = v;
/* term 1: sum of squared pixel differences */
3098 for(x=0; x<16; x++){
3099 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
/* term 2: difference of local gradients between the two blocks */
3102 for(x=0; x<15; x++){
3103 score2+= ABS( s1[x ] - s1[x +stride]
3104 - s1[x+1] + s1[x+1+stride])
3105 -ABS( s2[x ] - s2[x +stride]
3106 - s2[x+1] + s2[x+1+stride]);
/* nsse_weight scales the gradient term; 8 is the default when no context */
3113 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3114 else return score1 + ABS(score2)*8;
/* 8-pixel-wide variant of nsse16_c above; same metric, narrower block.
 * NOTE(review): elided listing -- declarations and the x/y loop headers are
 * missing from this excerpt. */
3117 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3118 MpegEncContext *c = v;
3125 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3129 score2+= ABS( s1[x ] - s1[x +stride]
3130 - s1[x+1] + s1[x+1+stride])
3131 -ABS( s2[x ] - s2[x +stride]
3132 - s2[x+1] + s2[x+1+stride]);
3139 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3140 else return score1 + ABS(score2)*8;
/* Rate-distortion helper: evaluates the residual after adding basis*scale
 * (rounded back from BASIS_SHIFT to RECON_SHIFT precision), returning a
 * weighted squared sum. NOTE(review): elided listing -- the declarations,
 * the b >>= RECON_SHIFT step and the final return are missing from this
 * excerpt; presumably w comes from weight[i] -- confirm against the full
 * source before editing. */
3143 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3147 for(i=0; i<8*8; i++){
3148 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3151 assert(-512<b && b<512);
3153 sum += (w*b)*(w*b)>>4;
3158 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3161 for(i=0; i<8*8; i++){
3162 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
/**
3167 * permutes an 8x8 block.
3168 * @param block the block which will be permuted according to the given permutation vector
3169 * @param permutation the permutation vector
3170 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3171 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3172 * (inverse) permutated to scantable order!
 */
3174 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3180 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
/* NOTE(review): elided listing -- the body of this first pass (saving
 * block[j] into temp[] and clearing block[j]) is missing from this excerpt. */
3182 for(i=0; i<=last; i++){
3183 const int j= scantable[i];
/* second pass: scatter the saved coefficients to their permuted positions */
3188 for(i=0; i<=last; i++){
3189 const int j= scantable[i];
3190 const int perm_j= permutation[j];
3191 block[perm_j]= temp[j];
/** Dummy me_cmp_func that always reports a score of 0 (comparison disabled). */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
/* Fills cmp[0..4] with the comparison functions selected by 'type'.
 * NOTE(review): elided listing -- the switch(type) skeleton, most case
 * labels and the loop over i are missing from this excerpt; only a few
 * representative assignments survive below. */
3199 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3202 memset(cmp, 0, sizeof(void*)*5);
3210 cmp[i]= c->hadamard8_diff[i];
3216 cmp[i]= c->dct_sad[i];
3219 cmp[i]= c->dct264_sad[i];
3222 cmp[i]= c->dct_max[i];
3225 cmp[i]= c->quant_psnr[i];
3245 #ifdef CONFIG_SNOW_ENCODER
/* reached when 'type' matches no known comparison function */
3254 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3260 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3262 static void clear_blocks_c(DCTELEM *blocks)
3264 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/**
 * dst[i] += src[i] for i in [0, w): byte-wise addition with modulo-256
 * wrap (unsigned char arithmetic). Unrolled by 8 with a scalar tail loop.
 *
 * NOTE(review): the sampled listing had dropped the tail-loop header and
 * braces; reconstructed here to the canonical form.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
/**
 * dst[i] = src1[i] - src2[i] for i in [0, w): byte-wise difference with
 * modulo-256 wrap. Unrolled by 8 with a scalar tail loop.
 *
 * NOTE(review): the sampled listing had dropped the tail-loop header and
 * braces; reconstructed here to the canonical form.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
/* HuffYUV median-prediction subtraction: dst = src2 - median-predictor.
 * NOTE(review): elided listing -- the initialisation of l/lt from
 * *left/*left_top, the loop over i, the dst[] store and the write-back of
 * the running left/left_top state are all missing from this excerpt;
 * mid_pred() is presumably the median of its three arguments -- confirm. */
3299 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3307 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Butterfly helpers for the 8x8 Hadamard transforms below:
 * BUTTERFLY2 writes sum/difference of two inputs into two outputs,
 * BUTTERFLY1 does the same in place, BUTTERFLYA returns |x+y| + |x-y|.
 * NOTE(review): elided listing -- the backslash-continued bodies of
 * BUTTERFLY2 and BUTTERFLY1 are missing from this excerpt. */
3317 #define BUTTERFLY2(o1,o2,i1,i2) \
3321 #define BUTTERFLY1(x,y) \
3330 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the difference src-dst, summing the
 * absolute transform coefficients.
 * NOTE(review): elided listing -- temp[64]/sum/i declarations, the row and
 * column loop headers and the final return are missing from this excerpt. */
3332 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3340 //FIXME try pointer walks
/* horizontal pass: 1D butterfly network on each row of the difference */
3341 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3342 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3343 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3344 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3346 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3347 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3348 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3349 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3351 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3352 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3353 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3354 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical pass: same butterfly network down each column */
3358 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3359 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3360 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3361 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3363 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3364 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3365 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3366 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* last butterfly stage folded into the absolute-value accumulation */
3369 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3370 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3371 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3372 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* debug output -- normally compiled out */
3378 printf("MAX:%d\n", maxi);
/* Intra SATD: 8x8 Hadamard transform of src itself (no reference block),
 * summing |coefficients| and subtracting the DC term at the end.
 * NOTE(review): elided listing -- declarations, loop headers and the final
 * return are missing from this excerpt. */
3384 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3392 //FIXME try pointer walks
/* horizontal pass on the raw source rows */
3393 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3394 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3395 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3396 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3398 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3399 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3400 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3401 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3403 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3404 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3405 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3406 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical pass */
3410 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3411 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3412 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3413 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3415 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3416 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3417 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3418 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3421 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3422 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3423 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3424 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* remove the DC contribution so the metric ignores the block mean */
3427 sum -= ABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: fdct of the pixel difference, then sum of |coefficients|.
 * NOTE(review): elided listing -- the fdct call, the summation loop and the
 * return are missing from this excerpt. */
3432 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3433 MpegEncContext * const s= (MpegEncContext *)c;
/* 8-byte aligned scratch buffer for the 64 difference coefficients */
3434 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3435 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3440 s->dsp.diff_pixels(temp, src1, src2, stride);
/* Body of the backslash-continued 8-point 1D H.264-style integer DCT macro
 * used by dct264_sad8x8_c below. NOTE(review): elided listing -- the
 * '#define DCT8_1D ...' opener line and the DST(0,...) / DST(4,...) output
 * lines are missing from this excerpt; no comments can be inserted between
 * the continued lines without breaking the macro. */
3451 const int s07 = SRC(0) + SRC(7);\
3452 const int s16 = SRC(1) + SRC(6);\
3453 const int s25 = SRC(2) + SRC(5);\
3454 const int s34 = SRC(3) + SRC(4);\
3455 const int a0 = s07 + s34;\
3456 const int a1 = s16 + s25;\
3457 const int a2 = s07 - s34;\
3458 const int a3 = s16 - s25;\
3459 const int d07 = SRC(0) - SRC(7);\
3460 const int d16 = SRC(1) - SRC(6);\
3461 const int d25 = SRC(2) - SRC(5);\
3462 const int d34 = SRC(3) - SRC(4);\
3463 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3464 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3465 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3466 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3468 DST(1, a4 + (a7>>2)) ;\
3469 DST(2, a2 + (a3>>1)) ;\
3470 DST(3, a5 + (a6>>2)) ;\
3472 DST(5, a6 - (a5>>2)) ;\
3473 DST(6, (a2>>1) - a3 ) ;\
3474 DST(7, (a4>>2) - a7 ) ;\
/* SAD in the H.264 8x8 transform domain: row transform in place, then a
 * column transform whose DST macro accumulates absolute values.
 * NOTE(review): elided listing -- the dct[][] declaration, the DCT8_1D
 * invocations, the #undef SRC/DST lines and the return are missing. */
3477 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3478 MpegEncContext * const s= (MpegEncContext *)c;
3483 s->dsp.diff_pixels(dct, src1, src2, stride);
/* pass 1: transform each row, writing back into dct */
3485 #define SRC(x) dct[i][x]
3486 #define DST(x,v) dct[i][x]= v
3487 for( i = 0; i < 8; i++ )
/* pass 2: transform each column; DST now sums |v| instead of storing */
3492 #define SRC(x) dct[x][i]
3493 #define DST(x,v) sum += ABS(v)
3494 for( i = 0; i < 8; i++ )
/* Maximum absolute DCT coefficient of the pixel difference.
 * NOTE(review): elided listing -- the fdct call, the loop over i and the
 * return are missing from this excerpt. */
3502 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3503 MpegEncContext * const s= (MpegEncContext *)c;
3504 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3505 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3510 s->dsp.diff_pixels(temp, src1, src2, stride);
/* running maximum over the 64 coefficients */
3514 sum= FFMAX(sum, ABS(temp[i]));
3519 void simple_idct(DCTELEM *block); //FIXME
/* Distortion of a quantize -> dequantize -> idct round trip, measured
 * against the unquantized transform coefficients ("quant PSNR" metric).
 * NOTE(review): elided listing -- the fdct call, loop headers and the
 * return are missing from this excerpt. */
3521 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3522 MpegEncContext * const s= (MpegEncContext *)c;
/* one aligned buffer holding both the working block and its backup */
3523 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3524 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3525 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3531 s->dsp.diff_pixels(temp, src1, src2, stride);
/* keep the pre-quantization coefficients for comparison */
3533 memcpy(bak, temp, 64*sizeof(DCTELEM));
3535 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3536 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3537 simple_idct(temp); //FIXME
/* squared error between round-tripped and original coefficients */
3540 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion score of coding one 8x8 block: estimates the bit cost of
 * the quantized coefficients via the AC VLC length tables, reconstructs the
 * block, and returns SSE + lambda-weighted bits.
 * NOTE(review): elided listing -- loop headers, the run/level bookkeeping
 * and the escape-cost branches are missing from this excerpt.
 * NOTE(review): 'distoration' [sic] is the original (misspelled) local name;
 * aligned_bak[stride] is a VLA sized by the runtime stride. */
3545 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3546 MpegEncContext * const s= (MpegEncContext *)c;
3547 const uint8_t *scantable= s->intra_scantable.permutated;
3548 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3549 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3550 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3551 uint8_t * const bak= (uint8_t*)aligned_bak;
3552 int i, last, run, bits, level, distoration, start_i;
3553 const int esc_length= s->ac_esc_length;
3555 uint8_t * last_length;
/* save the prediction (src2) so idct_add can reconstruct on top of it */
3560 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3561 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3564 s->dsp.diff_pixels(temp, src1, src2, stride);
/* forward DCT + quantization; 'last' is the last nonzero coefficient index */
3566 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra blocks use the intra AC tables and pay for the DC separately */
3572 length = s->intra_ac_vlc_length;
3573 last_length= s->intra_ac_vlc_last_length;
3574 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3577 length = s->inter_ac_vlc_length;
3578 last_length= s->inter_ac_vlc_last_length;
/* accumulate VLC lengths for every (run, level) pair in scan order */
3583 for(i=start_i; i<last; i++){
3584 int j= scantable[i];
3589 if((level&(~127)) == 0){
3590 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3599 level= temp[i] + 64;
3603 if((level&(~127)) == 0){
3604 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* dequantize + idct to measure the true reconstruction error */
3612 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3614 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3617 s->dsp.idct_add(bak, stride, temp);
3619 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
/* lambda ~= qscale^2 * 109/128, applied to the estimated bit count */
3621 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-cost-only variant of rd8x8_c: estimates the VLC bit count of the
 * quantized 8x8 block without reconstructing or measuring distortion.
 * NOTE(review): elided listing -- loop headers, run/level bookkeeping,
 * escape-cost branches and the return are missing from this excerpt. */
3624 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3625 MpegEncContext * const s= (MpegEncContext *)c;
3626 const uint8_t *scantable= s->intra_scantable.permutated;
3627 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3628 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3629 int i, last, run, bits, level, start_i;
3630 const int esc_length= s->ac_esc_length;
3632 uint8_t * last_length;
3636 s->dsp.diff_pixels(temp, src1, src2, stride);
3638 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra blocks use the intra AC tables and pay for the DC separately */
3644 length = s->intra_ac_vlc_length;
3645 last_length= s->intra_ac_vlc_last_length;
3646 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3649 length = s->inter_ac_vlc_length;
3650 last_length= s->inter_ac_vlc_last_length;
/* accumulate VLC lengths for every (run, level) pair in scan order */
3655 for(i=start_i; i<last; i++){
3656 int j= scantable[i];
3661 if((level&(~127)) == 0){
3662 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3671 level= temp[i] + 64;
3675 if((level&(~127)) == 0){
3676 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical SAD of a 16-wide block against itself one row down (intra:
 * no reference block; measures vertical activity).
 * NOTE(review): elided listing -- declarations, the y loop and the return
 * are missing from this excerpt. */
3684 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3689 for(x=0; x<16; x+=4){
3690 score+= ABS(s[x ] - s[x +stride]) + ABS(s[x+1] - s[x+1+stride])
3691 +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
/* Vertical SAD of the difference of two 16-wide blocks.
 * NOTE(review): elided listing -- declarations, the y loop and the return
 * are missing from this excerpt. */
3699 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3704 for(x=0; x<16; x++){
3705 score+= ABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ: square helper for the vertical-SSE metrics below. */
3714 #define SQ(a) ((a)*(a))
/* Squared-difference variant of vsad_intra16_c.
 * NOTE(review): elided listing -- declarations, the y loop and the return
 * are missing from this excerpt. */
3715 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3720 for(x=0; x<16; x+=4){
3721 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3722 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* Squared-difference variant of vsad16_c.
 * NOTE(review): elided listing -- declarations, the y loop and the return
 * are missing from this excerpt. */
3730 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3735 for(x=0; x<16; x++){
3736 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* WARPER8_16_SQ (macro defined earlier in the file) builds a 16x16
 * comparison function from each 8x8 one by summing the scores of the four
 * 8x8 quadrants. */
3745 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3746 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3747 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3749 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3751 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3752 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3753 WARPER8_16_SQ(rd8x8_c, rd16_c)
3754 WARPER8_16_SQ(bit8x8_c, bit16_c)
3756 /* XXX: those functions should be suppressed ASAP when all IDCTs are converted */
/* Wrappers binding the reference IDCT (j_rev_dct family) to put/add pixel
 * output at full, 4-point, 2-point and 1-point (lowres) resolutions.
 * NOTE(review): elided listing -- the j_rev_dct*() calls inside each
 * wrapper body are missing from this excerpt. */
3758 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3761 put_pixels_clamped_c(block, dest, line_size);
3763 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3766 add_pixels_clamped_c(block, dest, line_size);
3769 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3772 put_pixels_clamped4_c(block, dest, line_size);
3774 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3777 add_pixels_clamped4_c(block, dest, line_size);
3780 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3783 put_pixels_clamped2_c(block, dest, line_size);
3785 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3788 add_pixels_clamped2_c(block, dest, line_size);
/* 1-point IDCT degenerates to scaling the DC by 1/8 with rounding and
 * clamping through cropTbl */
3791 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3793 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3795 dest[0] = cm[(block[0] + 4)>>3];
3797 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3799 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3801 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
/* No-op used as the default DSPContext.prefetch implementation; the empty
 * (unspecified-args, pre-C23) parameter list is deliberate so it can be
 * assigned to function-pointer types that take arguments. */
3804 static void just_return() { return; }
3806 /* init static data */
/* Builds the pixel clipping table (cropTbl: identity over [0,255], clamped
 * outside), the square table used by SSE routines, and the inverse zigzag
 * used by the MMX quantizer.
 * NOTE(review): elided listing -- the low-pad cropTbl[i]=0 assignment and
 * the closing braces are missing from this excerpt. */
3807 void dsputil_static_init(void)
3811 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3812 for(i=0;i<MAX_NEG_CROP;i++) {
3814 cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* squareTbl[i] = (i-256)^2, indexed by difference + 256 */
3817 for(i=0;i<512;i++) {
3818 squareTbl[i] = (i - 256) * (i - 256);
/* inverse zigzag + 1, as required by the MMX quantizer */
3821 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3825 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3829 #ifdef CONFIG_ENCODERS
3830 if(avctx->dct_algo==FF_DCT_FASTINT) {
3831 c->fdct = fdct_ifast;
3832 c->fdct248 = fdct_ifast248;
3834 else if(avctx->dct_algo==FF_DCT_FAAN) {
3835 c->fdct = ff_faandct;
3836 c->fdct248 = ff_faandct248;
3839 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3840 c->fdct248 = ff_fdct248_islow;
3842 #endif //CONFIG_ENCODERS
3844 if(avctx->lowres==1){
3845 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3846 c->idct_put= ff_jref_idct4_put;
3847 c->idct_add= ff_jref_idct4_add;
3849 c->idct_put= ff_h264_lowres_idct_put_c;
3850 c->idct_add= ff_h264_lowres_idct_add_c;
3852 c->idct = j_rev_dct4;
3853 c->idct_permutation_type= FF_NO_IDCT_PERM;
3854 }else if(avctx->lowres==2){
3855 c->idct_put= ff_jref_idct2_put;
3856 c->idct_add= ff_jref_idct2_add;
3857 c->idct = j_rev_dct2;
3858 c->idct_permutation_type= FF_NO_IDCT_PERM;
3859 }else if(avctx->lowres==3){
3860 c->idct_put= ff_jref_idct1_put;
3861 c->idct_add= ff_jref_idct1_add;
3862 c->idct = j_rev_dct1;
3863 c->idct_permutation_type= FF_NO_IDCT_PERM;
3865 if(avctx->idct_algo==FF_IDCT_INT){
3866 c->idct_put= ff_jref_idct_put;
3867 c->idct_add= ff_jref_idct_add;
3868 c->idct = j_rev_dct;
3869 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3870 }else if(avctx->idct_algo==FF_IDCT_VP3){
3871 c->idct_put= ff_vp3_idct_put_c;
3872 c->idct_add= ff_vp3_idct_add_c;
3873 c->idct = ff_vp3_idct_c;
3874 c->idct_permutation_type= FF_NO_IDCT_PERM;
3875 }else{ //accurate/default
3876 c->idct_put= simple_idct_put;
3877 c->idct_add= simple_idct_add;
3878 c->idct = simple_idct;
3879 c->idct_permutation_type= FF_NO_IDCT_PERM;
3883 c->h264_idct_add= ff_h264_idct_add_c;
3884 c->h264_idct8_add= ff_h264_idct8_add_c;
3885 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3886 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3888 c->get_pixels = get_pixels_c;
3889 c->diff_pixels = diff_pixels_c;
3890 c->put_pixels_clamped = put_pixels_clamped_c;
3891 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3892 c->add_pixels_clamped = add_pixels_clamped_c;
3893 c->add_pixels8 = add_pixels8_c;
3894 c->add_pixels4 = add_pixels4_c;
3897 c->clear_blocks = clear_blocks_c;
3898 c->pix_sum = pix_sum_c;
3899 c->pix_norm1 = pix_norm1_c;
3901 /* TODO [0] 16 [1] 8 */
3902 c->pix_abs[0][0] = pix_abs16_c;
3903 c->pix_abs[0][1] = pix_abs16_x2_c;
3904 c->pix_abs[0][2] = pix_abs16_y2_c;
3905 c->pix_abs[0][3] = pix_abs16_xy2_c;
3906 c->pix_abs[1][0] = pix_abs8_c;
3907 c->pix_abs[1][1] = pix_abs8_x2_c;
3908 c->pix_abs[1][2] = pix_abs8_y2_c;
3909 c->pix_abs[1][3] = pix_abs8_xy2_c;
3911 #define dspfunc(PFX, IDX, NUM) \
3912 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3913 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3914 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3915 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3917 dspfunc(put, 0, 16);
3918 dspfunc(put_no_rnd, 0, 16);
3920 dspfunc(put_no_rnd, 1, 8);
3924 dspfunc(avg, 0, 16);
3925 dspfunc(avg_no_rnd, 0, 16);
3927 dspfunc(avg_no_rnd, 1, 8);
3932 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3933 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3935 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3936 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3937 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3938 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3939 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3940 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3941 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3942 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3943 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3945 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3946 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3947 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3948 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3949 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3950 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3951 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3952 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3953 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3955 #define dspfunc(PFX, IDX, NUM) \
3956 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3957 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3958 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3959 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3960 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3961 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3962 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3963 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3964 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3965 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3966 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3967 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3968 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3969 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3970 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3971 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3973 dspfunc(put_qpel, 0, 16);
3974 dspfunc(put_no_rnd_qpel, 0, 16);
3976 dspfunc(avg_qpel, 0, 16);
3977 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3979 dspfunc(put_qpel, 1, 8);
3980 dspfunc(put_no_rnd_qpel, 1, 8);
3982 dspfunc(avg_qpel, 1, 8);
3983 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3985 dspfunc(put_h264_qpel, 0, 16);
3986 dspfunc(put_h264_qpel, 1, 8);
3987 dspfunc(put_h264_qpel, 2, 4);
3988 dspfunc(put_h264_qpel, 3, 2);
3989 dspfunc(avg_h264_qpel, 0, 16);
3990 dspfunc(avg_h264_qpel, 1, 8);
3991 dspfunc(avg_h264_qpel, 2, 4);
3994 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
3995 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
3996 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
3997 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
3998 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
3999 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4001 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4002 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4003 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4004 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4005 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4006 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4007 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4008 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4009 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4010 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4011 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4012 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4013 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4014 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4015 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4016 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4017 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4018 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4019 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4020 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4022 #ifdef CONFIG_CAVS_DECODER
4023 ff_cavsdsp_init(c,avctx);
4025 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4026 ff_vc1dsp_init(c,avctx);
4029 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4030 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4031 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4032 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4033 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4034 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4035 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4036 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4038 #define SET_CMP_FUNC(name) \
4039 c->name[0]= name ## 16_c;\
4040 c->name[1]= name ## 8x8_c;
4042 SET_CMP_FUNC(hadamard8_diff)
4043 c->hadamard8_diff[4]= hadamard8_intra16_c;
4044 SET_CMP_FUNC(dct_sad)
4045 SET_CMP_FUNC(dct_max)
4047 SET_CMP_FUNC(dct264_sad)
4049 c->sad[0]= pix_abs16_c;
4050 c->sad[1]= pix_abs8_c;
4054 SET_CMP_FUNC(quant_psnr)
4057 c->vsad[0]= vsad16_c;
4058 c->vsad[4]= vsad_intra16_c;
4059 c->vsse[0]= vsse16_c;
4060 c->vsse[4]= vsse_intra16_c;
4061 c->nsse[0]= nsse16_c;
4062 c->nsse[1]= nsse8_c;
4063 #ifdef CONFIG_SNOW_ENCODER
4064 c->w53[0]= w53_16_c;
4066 c->w97[0]= w97_16_c;
4070 c->add_bytes= add_bytes_c;
4071 c->diff_bytes= diff_bytes_c;
4072 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4073 c->bswap_buf= bswap_buf;
4075 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4076 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4077 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4078 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4079 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4080 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4082 c->h263_h_loop_filter= h263_h_loop_filter_c;
4083 c->h263_v_loop_filter= h263_v_loop_filter_c;
4085 c->h261_loop_filter= h261_loop_filter_c;
4087 c->try_8x8basis= try_8x8basis_c;
4088 c->add_8x8basis= add_8x8basis_c;
4090 #ifdef CONFIG_SNOW_ENCODER
4091 c->vertical_compose97i = ff_snow_vertical_compose97i;
4092 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4093 c->inner_add_yblock = ff_snow_inner_add_yblock;
4096 #ifdef CONFIG_VORBIS_DECODER
4097 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4100 c->shrink[0]= ff_img_copy_plane;
4101 c->shrink[1]= ff_shrink22;
4102 c->shrink[2]= ff_shrink44;
4103 c->shrink[3]= ff_shrink88;
4105 c->prefetch= just_return;
4108 dsputil_init_mmx(c, avctx);
4111 dsputil_init_armv4l(c, avctx);
4114 dsputil_init_mlib(c, avctx);
4117 dsputil_init_vis(c,avctx);
4120 dsputil_init_alpha(c, avctx);
4123 dsputil_init_ppc(c, avctx);
4126 dsputil_init_mmi(c, avctx);
4129 dsputil_init_sh4(c,avctx);
4132 switch(c->idct_permutation_type){
4133 case FF_NO_IDCT_PERM:
4135 c->idct_permutation[i]= i;
4137 case FF_LIBMPEG2_IDCT_PERM:
4139 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4141 case FF_SIMPLE_IDCT_PERM:
4143 c->idct_permutation[i]= simple_mmx_permutation[i];
4145 case FF_TRANSPOSE_IDCT_PERM:
4147 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4149 case FF_PARTTRANS_IDCT_PERM:
4151 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4154 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");