3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
30 #include "mpegvideo.h"
31 #include "simple_idct.h"
36 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
39 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
/* NOTE(review): this listing has elided lines (embedded original line numbers
 * are non-contiguous); code is left byte-identical throughout this file. */
/* Clamping LUT: indexed via cropTbl + MAX_NEG_CROP so out-of-range signed
 * values clamp to 0..255. Zero-initialized here; presumably filled at init
 * time elsewhere — TODO confirm against the init code (not visible here). */
41 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Squares LUT, used via squareTbl + 256 so signed differences in [-256,255]
 * can be squared by lookup (see sse*_c below). Filled elsewhere at init. */
42 uint32_t squareTbl[512] = {0, };
/* Standard (MPEG/JPEG) zigzag scan order: maps scan position -> raster index
 * in an 8x8 block. NOTE(review): the closing "};" line is elided in this
 * listing. */
44 const uint8_t ff_zigzag_direct[64] = {
45 0, 1, 8, 16, 9, 2, 3, 10,
46 17, 24, 32, 25, 18, 11, 4, 5,
47 12, 19, 26, 33, 40, 48, 41, 34,
48 27, 20, 13, 6, 7, 14, 21, 28,
49 35, 42, 49, 56, 57, 50, 43, 36,
50 29, 22, 15, 23, 30, 37, 44, 51,
51 58, 59, 52, 45, 38, 31, 39, 46,
52 53, 60, 61, 54, 47, 55, 62, 63
55 /* Specific zigzag scan for 248 idct. NOTE that unlike the
56 specification, we interleave the fields */
57 const uint8_t ff_zigzag248_direct[64] = {
58 0, 8, 1, 9, 16, 24, 2, 10,
59 17, 25, 32, 40, 48, 56, 33, 41,
60 18, 26, 3, 11, 4, 12, 19, 27,
61 34, 42, 49, 57, 50, 58, 35, 43,
62 20, 28, 5, 13, 6, 14, 21, 29,
63 36, 44, 51, 59, 52, 60, 37, 45,
64 22, 30, 7, 15, 23, 31, 38, 46,
65 53, 61, 54, 62, 39, 47, 55, 63,
68 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Zero-initialized here; presumably filled at runtime — TODO confirm where. */
69 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate horizontal scan order for an 8x8 block (scan pos -> raster idx).
 * NOTE(review): closing "};" elided in this listing. */
71 const uint8_t ff_alternate_horizontal_scan[64] = {
72 0, 1, 2, 3, 8, 9, 16, 17,
73 10, 11, 4, 5, 6, 7, 15, 14,
74 13, 12, 19, 18, 24, 25, 32, 33,
75 26, 27, 20, 21, 22, 23, 28, 29,
76 30, 31, 34, 35, 40, 41, 48, 49,
77 42, 43, 36, 37, 38, 39, 44, 45,
78 46, 47, 50, 51, 56, 57, 58, 59,
79 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order for an 8x8 block (scan pos -> raster idx).
 * NOTE(review): closing "};" elided in this listing. */
82 const uint8_t ff_alternate_vertical_scan[64] = {
83 0, 8, 16, 24, 1, 9, 2, 10,
84 17, 25, 32, 40, 48, 56, 57, 49,
85 41, 33, 26, 18, 3, 11, 4, 12,
86 19, 27, 34, 42, 50, 58, 35, 43,
87 51, 59, 20, 28, 5, 13, 6, 14,
88 21, 29, 36, 44, 52, 60, 37, 45,
89 53, 61, 22, 30, 7, 15, 23, 31,
90 38, 46, 54, 62, 39, 47, 55, 63,
/* Fixed-point reciprocal table: entry b holds ceil(2^32/b) (entry 0 unused,
 * entry 1 saturated), enabling division by multiply + >>32 per the comment
 * below. */
93 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
94 const uint32_t inverse[256]={
95 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
96 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
97 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
98 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
99 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
100 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
101 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
102 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
103 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
104 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
105 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
106 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
107 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
108 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
109 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
110 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
111 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
112 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
113 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
114 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
115 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
116 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
117 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
118 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
119 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
120 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
121 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
122 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
123 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
124 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
125 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
126 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
129 /* Input permutation for the simple_idct_mmx */
/* Values are hex raster indices; NOTE(review): closing "};" elided. */
130 static const uint8_t simple_mmx_permutation[64]={
131 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
132 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
133 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
134 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
135 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
136 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
137 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
138 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Sums the pixels of a 16x16 block (rows separated by line_size).
 * NOTE(review): the accumulator declaration, inner-loop body, closing braces
 * and return are elided in this listing; the j+=8 step suggests an 8-pixel
 * unrolled inner loop — confirm against the full source. */
141 static int pix_sum_c(uint8_t * pix, int line_size)
146 for (i = 0; i < 16; i++) {
147 for (j = 0; j < 16; j += 8) {
/* advance to the next row after consuming 16 pixels of this one */
158 pix += line_size - 16;
/* Sum of squared pixel values over a 16x16 block, using the squareTbl LUT
 * (offset by 256 so signed indices are valid). Two word-at-a-time variants:
 * 64-bit loads when long is wider than 32 bits, else two 32-bit loads.
 * NOTE(review): declarations, #else/#endif, loop closers and the return are
 * elided in this listing. */
163 static int pix_norm1_c(uint8_t * pix, int line_size)
166 uint32_t *sq = squareTbl + 256;
169 for (i = 0; i < 16; i++) {
170 for (j = 0; j < 16; j += 8) {
181 #if LONG_MAX > 2147483647
/* 64-bit path: load 8 pixels at once and square each byte via the LUT */
182 register uint64_t x=*(uint64_t*)pix;
184 s += sq[(x>>8)&0xff];
185 s += sq[(x>>16)&0xff];
186 s += sq[(x>>24)&0xff];
187 s += sq[(x>>32)&0xff];
188 s += sq[(x>>40)&0xff];
189 s += sq[(x>>48)&0xff];
190 s += sq[(x>>56)&0xff];
/* 32-bit path (in the elided #else): two 4-byte loads cover 8 pixels */
192 register uint32_t x=*(uint32_t*)pix;
194 s += sq[(x>>8)&0xff];
195 s += sq[(x>>16)&0xff];
196 s += sq[(x>>24)&0xff];
197 x=*(uint32_t*)(pix+4);
199 s += sq[(x>>8)&0xff];
200 s += sq[(x>>16)&0xff];
201 s += sq[(x>>24)&0xff];
206 pix += line_size - 16;
/* Byte-swaps w 32-bit words from src into dst: an 8-way unrolled main loop,
 * then (in the elided tail loop header) one word at a time for the remainder
 * — L141 is that tail-loop body. */
211 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
214 for(i=0; i+8<=w; i+=8){
215 dst[i+0]= bswap_32(src[i+0]);
216 dst[i+1]= bswap_32(src[i+1]);
217 dst[i+2]= bswap_32(src[i+2]);
218 dst[i+3]= bswap_32(src[i+3]);
219 dst[i+4]= bswap_32(src[i+4]);
220 dst[i+5]= bswap_32(src[i+5]);
221 dst[i+6]= bswap_32(src[i+6]);
222 dst[i+7]= bswap_32(src[i+7]);
225 dst[i+0]= bswap_32(src[i+0]);
/* Sum of squared errors between two 4-pixel-wide blocks over h rows.
 * squareTbl+256 allows direct lookup of (pix1-pix2)^2 for differences in
 * [-255,255]. NOTE(review): pointer advances, closing braces and returns of
 * all three sse*_c functions are elided in this listing. */
229 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
232 uint32_t *sq = squareTbl + 256;
235 for (i = 0; i < h; i++) {
236 s += sq[pix1[0] - pix2[0]];
237 s += sq[pix1[1] - pix2[1]];
238 s += sq[pix1[2] - pix2[2]];
239 s += sq[pix1[3] - pix2[3]];
/* 8-pixel-wide variant of the same SSE computation. */
246 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
249 uint32_t *sq = squareTbl + 256;
252 for (i = 0; i < h; i++) {
253 s += sq[pix1[0] - pix2[0]];
254 s += sq[pix1[1] - pix2[1]];
255 s += sq[pix1[2] - pix2[2]];
256 s += sq[pix1[3] - pix2[3]];
257 s += sq[pix1[4] - pix2[4]];
258 s += sq[pix1[5] - pix2[5]];
259 s += sq[pix1[6] - pix2[6]];
260 s += sq[pix1[7] - pix2[7]];
/* 16-pixel-wide variant of the same SSE computation. */
267 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
270 uint32_t *sq = squareTbl + 256;
273 for (i = 0; i < h; i++) {
274 s += sq[pix1[ 0] - pix2[ 0]];
275 s += sq[pix1[ 1] - pix2[ 1]];
276 s += sq[pix1[ 2] - pix2[ 2]];
277 s += sq[pix1[ 3] - pix2[ 3]];
278 s += sq[pix1[ 4] - pix2[ 4]];
279 s += sq[pix1[ 5] - pix2[ 5]];
280 s += sq[pix1[ 6] - pix2[ 6]];
281 s += sq[pix1[ 7] - pix2[ 7]];
282 s += sq[pix1[ 8] - pix2[ 8]];
283 s += sq[pix1[ 9] - pix2[ 9]];
284 s += sq[pix1[10] - pix2[10]];
285 s += sq[pix1[11] - pix2[11]];
286 s += sq[pix1[12] - pix2[12]];
287 s += sq[pix1[13] - pix2[13]];
288 s += sq[pix1[14] - pix2[14]];
289 s += sq[pix1[15] - pix2[15]];
/* Wavelet-domain distortion metric: takes the pixel difference of a wxh
 * block (scaled <<4), runs ff_spatial_dwt over it, then accumulates the
 * coefficients weighted per subband by the scale[][][][] tables (type selects
 * 9/7 vs 5/3 wavelet; dec_count is 3 for w==8, else 4). NOTE(review): most
 * declarations, the scale-table interior, the accumulation tail and return
 * are elided in this listing. */
298 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
299 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
301 const int dec_count= w==8 ? 3 : 4;
304 static const int scale[2][2][4][4]={
308 {268, 239, 239, 213},
312 // 9/7 16x16 or 32x32 dec=4
313 {344, 310, 310, 280},
321 {275, 245, 245, 218},
325 // 5/3 16x16 or 32x32 dec=4
326 {352, 317, 317, 286},
/* build the scaled difference block into tmp (stride 32) */
334 for (i = 0; i < h; i++) {
335 for (j = 0; j < w; j+=4) {
336 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
337 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
338 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
339 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
345 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
/* walk each decomposition level/orientation subband and weight its coeffs */
349 for(level=0; level<dec_count; level++){
350 for(ori= level ? 1 : 0; ori<4; ori++){
351 int size= w>>(dec_count-level);
352 int sx= (ori&1) ? size : 0;
353 int stride= 32<<(dec_count-level);
354 int sy= (ori&2) ? stride>>1 : 0;
356 for(i=0; i<size; i++){
357 for(j=0; j<size; j++){
358 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* thin wrappers binding width (8/16/32) and wavelet type (1=5/3, 0=9/7) */
368 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
369 return w_c(v, pix1, pix2, line_size, 8, h, 1);
372 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
373 return w_c(v, pix1, pix2, line_size, 8, h, 0);
376 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
377 return w_c(v, pix1, pix2, line_size, 16, h, 1);
380 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
381 return w_c(v, pix1, pix2, line_size, 16, h, 0);
384 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
385 return w_c(v, pix1, pix2, line_size, 32, h, 1);
388 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
389 return w_c(v, pix1, pix2, line_size, 32, h, 0);
/* Copies an 8-wide row of pixels into DCTELEMs; the elided loop presumably
 * iterates 8 rows advancing pixels by line_size and block by 8 — confirm. */
393 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
397 /* read the pixels */
399 block[0] = pixels[0];
400 block[1] = pixels[1];
401 block[2] = pixels[2];
402 block[3] = pixels[3];
403 block[4] = pixels[4];
404 block[5] = pixels[5];
405 block[6] = pixels[6];
406 block[7] = pixels[7];
/* Stores the per-pixel difference s1-s2 of an 8-wide row into DCTELEMs. */
412 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
413 const uint8_t *s2, int stride){
416 /* read the pixels */
418 block[0] = s1[0] - s2[0];
419 block[1] = s1[1] - s2[1];
420 block[2] = s1[2] - s2[2];
421 block[3] = s1[3] - s2[3];
422 block[4] = s1[4] - s2[4];
423 block[5] = s1[5] - s2[5];
424 block[6] = s1[6] - s2[6];
425 block[7] = s1[7] - s2[7];
/* Writes DCTELEMs to pixels clamped to 0..255 via the cropTbl LUT (8 wide). */
433 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
437 uint8_t *cm = cropTbl + MAX_NEG_CROP;
439 /* read the pixels */
441 pixels[0] = cm[block[0]];
442 pixels[1] = cm[block[1]];
443 pixels[2] = cm[block[2]];
444 pixels[3] = cm[block[3]];
445 pixels[4] = cm[block[4]];
446 pixels[5] = cm[block[5]];
447 pixels[6] = cm[block[6]];
448 pixels[7] = cm[block[7]];
/* 4-wide variant of put_pixels_clamped_c. */
455 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
459 uint8_t *cm = cropTbl + MAX_NEG_CROP;
461 /* read the pixels */
463 pixels[0] = cm[block[0]];
464 pixels[1] = cm[block[1]];
465 pixels[2] = cm[block[2]];
466 pixels[3] = cm[block[3]];
/* 2-wide variant of put_pixels_clamped_c. */
473 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
477 uint8_t *cm = cropTbl + MAX_NEG_CROP;
479 /* read the pixels */
481 pixels[0] = cm[block[0]];
482 pixels[1] = cm[block[1]];
/* Writes signed DCTELEMs as unsigned pixels: clamp to [-128,127] (the elided
 * branches), then bias by +128. Walks an 8x8 block. */
489 static void put_signed_pixels_clamped_c(const DCTELEM *block,
490 uint8_t *restrict pixels,
495 for (i = 0; i < 8; i++) {
496 for (j = 0; j < 8; j++) {
499 else if (*block > 127)
502 *pixels = (uint8_t)(*block + 128);
506 pixels += (line_size - 8);
/* Adds DCTELEMs to existing pixels with clamping (IDCT add path, 8 wide). */
510 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
514 uint8_t *cm = cropTbl + MAX_NEG_CROP;
516 /* read the pixels */
518 pixels[0] = cm[pixels[0] + block[0]];
519 pixels[1] = cm[pixels[1] + block[1]];
520 pixels[2] = cm[pixels[2] + block[2]];
521 pixels[3] = cm[pixels[3] + block[3]];
522 pixels[4] = cm[pixels[4] + block[4]];
523 pixels[5] = cm[pixels[5] + block[5]];
524 pixels[6] = cm[pixels[6] + block[6]];
525 pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of add_pixels_clamped_c. */
531 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
535 uint8_t *cm = cropTbl + MAX_NEG_CROP;
537 /* read the pixels */
539 pixels[0] = cm[pixels[0] + block[0]];
540 pixels[1] = cm[pixels[1] + block[1]];
541 pixels[2] = cm[pixels[2] + block[2]];
542 pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of add_pixels_clamped_c. */
548 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
552 uint8_t *cm = cropTbl + MAX_NEG_CROP;
554 /* read the pixels */
556 pixels[0] = cm[pixels[0] + block[0]];
557 pixels[1] = cm[pixels[1] + block[1]];
/* Adds DCTELEMs to pixels WITHOUT clamping (8 wide). */
563 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
567 pixels[0] += block[0];
568 pixels[1] += block[1];
569 pixels[2] += block[2];
570 pixels[3] += block[3];
571 pixels[4] += block[4];
572 pixels[5] += block[5];
573 pixels[6] += block[6];
574 pixels[7] += block[7];
/* 4-wide variant of add_pixels8_c. */
580 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
584 pixels[0] += block[0];
585 pixels[1] += block[1];
586 pixels[2] += block[2];
587 pixels[3] += block[3];
/* 64-bit word-at-a-time PIXOP2: generates put/avg pixel-copy and half-pel
 * interpolation primitives for OPNAME using 8-byte loads (LD64). The x2/y2
 * variants average two loads SWAR-style: rounding uses (a|b)-(((a^b)&~LSB)>>1),
 * no-rnd uses (a&b)+(((a^b)&~LSB)>>1). The xy2 variants average four
 * neighbors with 2-bit-per-byte accumulators (0x03.. low bits, 0xFC.. high
 * bits), +0x02.. bias for rounding vs +0x01.. for no-rnd.
 * NOTE(review): many continuation lines (loop headers, pointer advances,
 * closing braces) are elided in this listing; comments are kept outside the
 * macro to avoid altering the backslash-continued definition. */
595 #define PIXOP2(OPNAME, OP) \
596 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
600 OP(*((uint64_t*)block), LD64(pixels));\
606 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
610 const uint64_t a= LD64(pixels );\
611 const uint64_t b= LD64(pixels+1);\
612 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
618 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
622 const uint64_t a= LD64(pixels );\
623 const uint64_t b= LD64(pixels+1);\
624 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
630 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
634 const uint64_t a= LD64(pixels );\
635 const uint64_t b= LD64(pixels+line_size);\
636 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
642 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
646 const uint64_t a= LD64(pixels );\
647 const uint64_t b= LD64(pixels+line_size);\
648 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
654 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
657 const uint64_t a= LD64(pixels );\
658 const uint64_t b= LD64(pixels+1);\
659 uint64_t l0= (a&0x0303030303030303ULL)\
660 + (b&0x0303030303030303ULL)\
661 + 0x0202020202020202ULL;\
662 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
663 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
667 for(i=0; i<h; i+=2){\
668 uint64_t a= LD64(pixels );\
669 uint64_t b= LD64(pixels+1);\
670 l1= (a&0x0303030303030303ULL)\
671 + (b&0x0303030303030303ULL);\
672 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
673 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
674 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
679 l0= (a&0x0303030303030303ULL)\
680 + (b&0x0303030303030303ULL)\
681 + 0x0202020202020202ULL;\
682 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
683 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
684 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
690 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
693 const uint64_t a= LD64(pixels );\
694 const uint64_t b= LD64(pixels+1);\
695 uint64_t l0= (a&0x0303030303030303ULL)\
696 + (b&0x0303030303030303ULL)\
697 + 0x0101010101010101ULL;\
698 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
699 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
703 for(i=0; i<h; i+=2){\
704 uint64_t a= LD64(pixels );\
705 uint64_t b= LD64(pixels+1);\
706 l1= (a&0x0303030303030303ULL)\
707 + (b&0x0303030303030303ULL);\
708 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
709 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
710 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
715 l0= (a&0x0303030303030303ULL)\
716 + (b&0x0303030303030303ULL)\
717 + 0x0101010101010101ULL;\
718 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
719 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
720 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
726 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
727 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
728 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
729 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
730 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
731 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
732 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* 64-bit rounding average: identical SWAR trick to the x2/y2 bodies above */
734 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
735 #else // 64 bit variant
/* 32-bit PIXOP2 variant: same family of put/avg + half-pel primitives as the
 * 64-bit branch above, built from 2/4-byte loads (LD16/LD32). The _l2 helpers
 * average two source rows via rnd_avg32/no_rnd_avg32; the _l4 helpers average
 * four sources with 2-bit-per-byte SWAR accumulators (0x03../0xFC.. masks);
 * x2/y2/xy2 are expressed as _l2 calls with shifted source pointers.
 * NOTE(review): many continuation lines are elided in this listing; comments
 * kept outside the macro to avoid altering the backslash-continued text. */
737 #define PIXOP2(OPNAME, OP) \
738 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
741 OP(*((uint16_t*)(block )), LD16(pixels ));\
746 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
749 OP(*((uint32_t*)(block )), LD32(pixels ));\
754 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
757 OP(*((uint32_t*)(block )), LD32(pixels ));\
758 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
763 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
764 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
767 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
768 int src_stride1, int src_stride2, int h){\
772 a= LD32(&src1[i*src_stride1 ]);\
773 b= LD32(&src2[i*src_stride2 ]);\
774 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
775 a= LD32(&src1[i*src_stride1+4]);\
776 b= LD32(&src2[i*src_stride2+4]);\
777 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
781 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
782 int src_stride1, int src_stride2, int h){\
786 a= LD32(&src1[i*src_stride1 ]);\
787 b= LD32(&src2[i*src_stride2 ]);\
788 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
789 a= LD32(&src1[i*src_stride1+4]);\
790 b= LD32(&src2[i*src_stride2+4]);\
791 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
795 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
796 int src_stride1, int src_stride2, int h){\
800 a= LD32(&src1[i*src_stride1 ]);\
801 b= LD32(&src2[i*src_stride2 ]);\
802 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
806 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
807 int src_stride1, int src_stride2, int h){\
811 a= LD16(&src1[i*src_stride1 ]);\
812 b= LD16(&src2[i*src_stride2 ]);\
813 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
817 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
818 int src_stride1, int src_stride2, int h){\
819 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
820 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
823 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
824 int src_stride1, int src_stride2, int h){\
825 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
826 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
829 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
830 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
833 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
834 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
837 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
838 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
841 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
842 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
845 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
846 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
849 uint32_t a, b, c, d, l0, l1, h0, h1;\
850 a= LD32(&src1[i*src_stride1]);\
851 b= LD32(&src2[i*src_stride2]);\
852 c= LD32(&src3[i*src_stride3]);\
853 d= LD32(&src4[i*src_stride4]);\
854 l0= (a&0x03030303UL)\
857 h0= ((a&0xFCFCFCFCUL)>>2)\
858 + ((b&0xFCFCFCFCUL)>>2);\
859 l1= (c&0x03030303UL)\
861 h1= ((c&0xFCFCFCFCUL)>>2)\
862 + ((d&0xFCFCFCFCUL)>>2);\
863 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
864 a= LD32(&src1[i*src_stride1+4]);\
865 b= LD32(&src2[i*src_stride2+4]);\
866 c= LD32(&src3[i*src_stride3+4]);\
867 d= LD32(&src4[i*src_stride4+4]);\
868 l0= (a&0x03030303UL)\
871 h0= ((a&0xFCFCFCFCUL)>>2)\
872 + ((b&0xFCFCFCFCUL)>>2);\
873 l1= (c&0x03030303UL)\
875 h1= ((c&0xFCFCFCFCUL)>>2)\
876 + ((d&0xFCFCFCFCUL)>>2);\
877 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
881 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
882 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
885 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
886 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
889 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
890 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
893 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
894 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
897 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
898 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
901 uint32_t a, b, c, d, l0, l1, h0, h1;\
902 a= LD32(&src1[i*src_stride1]);\
903 b= LD32(&src2[i*src_stride2]);\
904 c= LD32(&src3[i*src_stride3]);\
905 d= LD32(&src4[i*src_stride4]);\
906 l0= (a&0x03030303UL)\
909 h0= ((a&0xFCFCFCFCUL)>>2)\
910 + ((b&0xFCFCFCFCUL)>>2);\
911 l1= (c&0x03030303UL)\
913 h1= ((c&0xFCFCFCFCUL)>>2)\
914 + ((d&0xFCFCFCFCUL)>>2);\
915 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
916 a= LD32(&src1[i*src_stride1+4]);\
917 b= LD32(&src2[i*src_stride2+4]);\
918 c= LD32(&src3[i*src_stride3+4]);\
919 d= LD32(&src4[i*src_stride4+4]);\
920 l0= (a&0x03030303UL)\
923 h0= ((a&0xFCFCFCFCUL)>>2)\
924 + ((b&0xFCFCFCFCUL)>>2);\
925 l1= (c&0x03030303UL)\
927 h1= ((c&0xFCFCFCFCUL)>>2)\
928 + ((d&0xFCFCFCFCUL)>>2);\
929 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
932 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
933 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
934 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
935 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
938 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
939 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
940 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
943 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
945 int i, a0, b0, a1, b1;\
952 for(i=0; i<h; i+=2){\
958 block[0]= (a1+a0)>>2; /* FIXME non put */\
959 block[1]= (b1+b0)>>2;\
969 block[0]= (a1+a0)>>2;\
970 block[1]= (b1+b0)>>2;\
976 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
979 const uint32_t a= LD32(pixels );\
980 const uint32_t b= LD32(pixels+1);\
981 uint32_t l0= (a&0x03030303UL)\
984 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
985 + ((b&0xFCFCFCFCUL)>>2);\
989 for(i=0; i<h; i+=2){\
990 uint32_t a= LD32(pixels );\
991 uint32_t b= LD32(pixels+1);\
992 l1= (a&0x03030303UL)\
994 h1= ((a&0xFCFCFCFCUL)>>2)\
995 + ((b&0xFCFCFCFCUL)>>2);\
996 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1001 l0= (a&0x03030303UL)\
1004 h0= ((a&0xFCFCFCFCUL)>>2)\
1005 + ((b&0xFCFCFCFCUL)>>2);\
1006 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1012 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1015 for(j=0; j<2; j++){\
1017 const uint32_t a= LD32(pixels );\
1018 const uint32_t b= LD32(pixels+1);\
1019 uint32_t l0= (a&0x03030303UL)\
1022 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1023 + ((b&0xFCFCFCFCUL)>>2);\
1027 for(i=0; i<h; i+=2){\
1028 uint32_t a= LD32(pixels );\
1029 uint32_t b= LD32(pixels+1);\
1030 l1= (a&0x03030303UL)\
1031 + (b&0x03030303UL);\
1032 h1= ((a&0xFCFCFCFCUL)>>2)\
1033 + ((b&0xFCFCFCFCUL)>>2);\
1034 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1039 l0= (a&0x03030303UL)\
1042 h0= ((a&0xFCFCFCFCUL)>>2)\
1043 + ((b&0xFCFCFCFCUL)>>2);\
1044 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1048 pixels+=4-line_size*(h+1);\
1049 block +=4-line_size*h;\
1053 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1056 for(j=0; j<2; j++){\
1058 const uint32_t a= LD32(pixels );\
1059 const uint32_t b= LD32(pixels+1);\
1060 uint32_t l0= (a&0x03030303UL)\
1063 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1064 + ((b&0xFCFCFCFCUL)>>2);\
1068 for(i=0; i<h; i+=2){\
1069 uint32_t a= LD32(pixels );\
1070 uint32_t b= LD32(pixels+1);\
1071 l1= (a&0x03030303UL)\
1072 + (b&0x03030303UL);\
1073 h1= ((a&0xFCFCFCFCUL)>>2)\
1074 + ((b&0xFCFCFCFCUL)>>2);\
1075 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1080 l0= (a&0x03030303UL)\
1083 h0= ((a&0xFCFCFCFCUL)>>2)\
1084 + ((b&0xFCFCFCFCUL)>>2);\
1085 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1089 pixels+=4-line_size*(h+1);\
1090 block +=4-line_size*h;\
1094 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1095 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1096 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1098 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1099 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1100 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1101 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* OP plug-ins for PIXOP2: avg blends into the destination, put overwrites. */
1103 #define op_avg(a, b) a = rnd_avg32(a, b)
1105 #define op_put(a, b) a = b
/* Scalar rounding averages of 2 and 4 values. */
1112 #define avg2(a,b) ((a+b+1)>>1)
1113 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Equal-stride convenience wrappers over the generated _l2 averagers.
 * NOTE(review): closing braces elided in this listing. */
1115 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1116 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1119 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1120 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/* Bilinear interpolation for GMC with 1/16-pel fractional offsets (x16,y16):
 * weights A..D sum to 256, so each output is a weighted 2x2 average
 * normalized by >>8 after adding rounder. NOTE(review): the per-row loop and
 * pointer advances are elided in this listing. */
1123 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1125 const int A=(16-x16)*(16-y16);
1126 const int B=( x16)*(16-y16);
1127 const int C=(16-x16)*( y16);
1128 const int D=( x16)*( y16);
1133 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1134 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1135 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1136 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1137 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1138 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1139 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1140 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General global motion compensation: for each destination pixel, computes
 * an affine-mapped source position (the elided lines presumably derive
 * src_x/src_y from ox,oy,dxx..dyy — TODO confirm), splits it into integer
 * and fractional parts at 'shift' precision, and bilinearly interpolates.
 * Out-of-bounds coordinates are clipped to the picture edge, degrading to
 * 1-D interpolation or plain replication at the borders. */
1146 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1147 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1150 const int s= 1<<shift;
1160 for(x=0; x<8; x++){ //XXX FIXME optimize
1161 int src_x, src_y, frac_x, frac_y, index;
1165 frac_x= src_x&(s-1);
1166 frac_y= src_y&(s-1);
/* fully inside: 2-D bilinear interpolation */
1170 if((unsigned)src_x < width){
1171 if((unsigned)src_y < height){
1172 index= src_x + src_y*stride;
1173 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1174 + src[index +1]* frac_x )*(s-frac_y)
1175 + ( src[index+stride ]*(s-frac_x)
1176 + src[index+stride+1]* frac_x )* frac_y
/* y out of range: clip the row, interpolate horizontally only */
1179 index= src_x + clip(src_y, 0, height)*stride;
1180 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1181 + src[index +1]* frac_x )*s
/* x out of range: clip the column, interpolate vertically only */
1185 if((unsigned)src_y < height){
1186 index= clip(src_x, 0, width) + src_y*stride;
1187 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1188 + src[index+stride ]* frac_y )*s
/* both out of range: replicate the nearest clipped pixel */
1191 index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
1192 dst[y*stride + x]= src[index ];
1204 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1206 case 2: put_pixels2_c (dst, src, stride, height); break;
1207 case 4: put_pixels4_c (dst, src, stride, height); break;
1208 case 8: put_pixels8_c (dst, src, stride, height); break;
1209 case 16:put_pixels16_c(dst, src, stride, height); break;
1213 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1215 for (i=0; i < height; i++) {
1216 for (j=0; j < width; j++) {
1217 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
1224 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1226 for (i=0; i < height; i++) {
1227 for (j=0; j < width; j++) {
1228 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Thirdpel MC, dx=0, dy=1/3: vertical counterpart of mc10 — blends the
 * current row with the row below, weights 2/3 and 1/3. */
1235 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1237 for (i=0; i < height; i++) {
1238 for (j=0; j < width; j++) {
1239 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Thirdpel MC, dx=1/3, dy=1/3: 2-D blend of the 2x2 neighbourhood with
 * weights 4:3:3:2 (sum 12); 2731/32768 ~= 1/12, +6 rounds. */
1246 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1248 for (i=0; i < height; i++) {
1249 for (j=0; j < width; j++) {
1250 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, dx=1/3, dy=2/3: 2x2 blend with weights 3:2:4:3 (sum 12). */
1257 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1259 for (i=0; i < height; i++) {
1260 for (j=0; j < width; j++) {
1261 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, dx=0, dy=2/3: vertical blend, weights 1/3 and 2/3. */
1268 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1270 for (i=0; i < height; i++) {
1271 for (j=0; j < width; j++) {
1272 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Thirdpel MC, dx=2/3, dy=1/3: 2x2 blend with weights 3:4:2:3 (sum 12). */
1279 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1281 for (i=0; i < height; i++) {
1282 for (j=0; j < width; j++) {
1283 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Thirdpel MC, dx=2/3, dy=2/3: 2x2 blend with weights 2:3:3:4 (sum 12). */
1290 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1292 for (i=0; i < height; i++) {
1293 for (j=0; j < width; j++) {
1294 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Thirdpel mc00, averaging variant: rounds the copy into the existing
 * prediction in dst. (The switch(width) line is missing from this extract.) */
1301 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1303 case 2: avg_pixels2_c (dst, src, stride, height); break;
1304 case 4: avg_pixels4_c (dst, src, stride, height); break;
1305 case 8: avg_pixels8_c (dst, src, stride, height); break;
1306 case 16:avg_pixels16_c(dst, src, stride, height); break;
/**
 * Thirdpel dx=1/3 interpolation, averaging variant: computes the same
 * 2/3:1/3 horizontal blend as put_tpel_pixels_mc10_c and rounds it into
 * the prediction already in dst ((dst + pred + 1) >> 1).
 * Fix(review): restored locals, per-row pointer advancement and closing
 * braces that were dropped by the extraction.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/* Averaging variant of mc20 (dx=2/3): blend filter output into dst. */
1321 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1323 for (i=0; i < height; i++) {
1324 for (j=0; j < width; j++) {
1325 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc01 (dy=1/3): blend filter output into dst. */
1332 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1334 for (i=0; i < height; i++) {
1335 for (j=0; j < width; j++) {
1336 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc11 (dx=dy=1/3), 2x2 weights 4:3:3:2. */
1343 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1345 for (i=0; i < height; i++) {
1346 for (j=0; j < width; j++) {
1347 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc12 (dx=1/3, dy=2/3), 2x2 weights 3:2:4:3. */
1354 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1356 for (i=0; i < height; i++) {
1357 for (j=0; j < width; j++) {
1358 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc02 (dy=2/3): blend filter output into dst. */
1365 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1367 for (i=0; i < height; i++) {
1368 for (j=0; j < width; j++) {
1369 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging variant of mc21 (dx=2/3, dy=1/3), 2x2 weights 3:4:2:3. */
1376 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1378 for (i=0; i < height; i++) {
1379 for (j=0; j < width; j++) {
1380 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging variant of mc22 (dx=dy=2/3), 2x2 weights 2:3:3:4. */
1387 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1389 for (i=0; i < height; i++) {
1390 for (j=0; j < width; j++) {
1391 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/**
 * Generate fixed-width thirdpel wrappers with the standard qpel-style
 * signature (dst, src, stride, height) by delegating to the
 * variable-width put_tpel_pixels_mcXY_c helpers defined above.
 * Fix: the original expansions prefixed each delegating call with
 * "void", which turns the statement into a (useless) old-style function
 * declaration instead of a call — the generated wrappers therefore did
 * no work at all.  The spurious "void" is removed so each wrapper
 * actually forwards to its helper.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/**
 * Generate 2/4/8-wide H.264 chroma MC functions.  For an eighth-pel
 * offset (x,y) in [0,8) the 2x2 bilinear weights are A=(8-x)(8-y),
 * B=x(8-y), C=(8-x)y, D=xy (sum 64); OP performs the final >>6
 * rounding and either stores (put) or averages (avg).
 * NOTE(review): the extract is missing the row loop, the pointer
 * advancement and the optimized D==0 fast path between the visible
 * lines — annotations cover only what is shown.
 */
1419 #define H264_CHROMA_MC(OPNAME, OP)\
1420 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1421 const int A=(8-x)*(8-y);\
1422 const int B=( x)*(8-y);\
1423 const int C=(8-x)*( y);\
1424 const int D=( x)*( y);\
1427 assert(x<8 && y<8 && x>=0 && y>=0);\
1431 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1432 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1438 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1439 const int A=(8-x)*(8-y);\
1440 const int B=( x)*(8-y);\
1441 const int C=(8-x)*( y);\
1442 const int D=( x)*( y);\
1445 assert(x<8 && y<8 && x>=0 && y>=0);\
1449 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1450 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1451 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1452 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1458 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1459 const int A=(8-x)*(8-y);\
1460 const int B=( x)*(8-y);\
1461 const int C=(8-x)*( y);\
1462 const int D=( x)*( y);\
1465 assert(x<8 && y<8 && x>=0 && y>=0);\
1469 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1470 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1471 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1472 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1473 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1474 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1475 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1476 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* Rounding ops for the 6-bit chroma filter above: op_put rounds to
 * nearest ((b+32)>>6); op_avg additionally averages with the existing
 * prediction in a. */
1482 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1483 #define op_put(a, b) a = (((b) + 32)>>6)
1485 H264_CHROMA_MC(put_ , op_put)
1486 H264_CHROMA_MC(avg_ , op_avg)
/**
 * 8-wide chroma MC with "no rounding" bias (used by VC-1/WMV3): same
 * bilinear weights A..D (sum 64) as the H.264 chroma MC, but the final
 * >>6 uses bias 32-4 = 28, i.e. it rounds slightly downward.
 * NOTE(review): bias meaning inferred from the 32-4 expression — confirm
 * against the VC-1 spec.
 * Fix(review): restored the row loop, the local counter and the per-row
 * pointer advancement that were dropped by the extraction.
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
/* Copy an h-row block, 2 bytes per row, via the unaligned 16-bit
 * load/store macros. (Row loop and pointer advancement lost in extraction.) */
1514 static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1519 ST16(dst , LD16(src ));
/* Copy an h-row block, 4 bytes per row, via 32-bit load/store macros. */
1525 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1530 ST32(dst , LD32(src ));
/* Copy an h-row block, 8 bytes per row, as two 32-bit load/stores. */
1536 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1541 ST32(dst , LD32(src ));
1542 ST32(dst+4 , LD32(src+4 ));
/* Copy an h-row block, 16 bytes per row, as four 32-bit load/stores. */
1548 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1553 ST32(dst , LD32(src ));
1554 ST32(dst+4 , LD32(src+4 ));
1555 ST32(dst+8 , LD32(src+8 ));
1556 ST32(dst+12, LD32(src+12));
/* Copy an h-row block, 17 bytes per row (16+1 for the qpel16 edge
 * column); the visible stores cover the first 16 bytes — the extra
 * trailing byte store is missing from this extract. */
1562 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1567 ST32(dst , LD32(src ));
1568 ST32(dst+4 , LD32(src+4 ));
1569 ST32(dst+8 , LD32(src+8 ));
1570 ST32(dst+12, LD32(src+12));
/* Copy an h-row block, 9 bytes per row (8+1 for the qpel8 edge column);
 * the trailing single-byte store is missing from this extract. */
1577 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1582 ST32(dst , LD32(src ));
1583 ST32(dst+4 , LD32(src+4 ));
/**
 * Generate the full set of MPEG-4 quarter-pel MC functions for 8x8 and
 * 16x16 blocks.  The h/v lowpass kernels are the 8-tap MPEG-4 filter
 * (taps 20,-6,3,-1 visible in the OP lines, with mirrored indices at the
 * block edges); the mcXY entry points combine them with copies and l2/l4
 * averaging to realize every quarter-pel position.  The *_old_c variants
 * are the pre-2002 (spec-exact l4) versions kept for reference.  OPNAME/
 * RND/OP select put vs avg and the rounding mode; r is the no-rounding
 * flag.  NOTE(review): this extract is missing interior lines throughout
 * (loop headers, some locals, closing braces and the final mc22 closing
 * brace), so only the visible lines are reproduced below.
 */
1591 #define QPEL_MC(r, OPNAME, RND, OP) \
1592 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1593 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1597 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1598 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1599 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1600 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1601 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1602 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1603 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1604 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1610 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1612 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1616 const int src0= src[0*srcStride];\
1617 const int src1= src[1*srcStride];\
1618 const int src2= src[2*srcStride];\
1619 const int src3= src[3*srcStride];\
1620 const int src4= src[4*srcStride];\
1621 const int src5= src[5*srcStride];\
1622 const int src6= src[6*srcStride];\
1623 const int src7= src[7*srcStride];\
1624 const int src8= src[8*srcStride];\
1625 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1626 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1627 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1628 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1629 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1630 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1631 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1632 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1638 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1639 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1644 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1645 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1646 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1647 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1648 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1649 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1650 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1651 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1652 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1653 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1654 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1655 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1656 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1657 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1658 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1659 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1665 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1666 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1671 const int src0= src[0*srcStride];\
1672 const int src1= src[1*srcStride];\
1673 const int src2= src[2*srcStride];\
1674 const int src3= src[3*srcStride];\
1675 const int src4= src[4*srcStride];\
1676 const int src5= src[5*srcStride];\
1677 const int src6= src[6*srcStride];\
1678 const int src7= src[7*srcStride];\
1679 const int src8= src[8*srcStride];\
1680 const int src9= src[9*srcStride];\
1681 const int src10= src[10*srcStride];\
1682 const int src11= src[11*srcStride];\
1683 const int src12= src[12*srcStride];\
1684 const int src13= src[13*srcStride];\
1685 const int src14= src[14*srcStride];\
1686 const int src15= src[15*srcStride];\
1687 const int src16= src[16*srcStride];\
1688 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1689 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1690 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1691 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1692 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1693 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1694 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1695 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1696 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1697 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1698 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1699 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1700 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1701 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1702 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1703 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1709 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1710 OPNAME ## pixels8_c(dst, src, stride, 8);\
1713 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1715 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1716 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1719 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1720 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1723 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1725 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1726 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1729 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1730 uint8_t full[16*9];\
1732 copy_block9(full, src, 16, stride, 9);\
1733 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1734 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1737 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1738 uint8_t full[16*9];\
1739 copy_block9(full, src, 16, stride, 9);\
1740 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1743 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1744 uint8_t full[16*9];\
1746 copy_block9(full, src, 16, stride, 9);\
1747 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1748 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1750 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1751 uint8_t full[16*9];\
1754 uint8_t halfHV[64];\
1755 copy_block9(full, src, 16, stride, 9);\
1756 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1757 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1758 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1759 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1761 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1762 uint8_t full[16*9];\
1764 uint8_t halfHV[64];\
1765 copy_block9(full, src, 16, stride, 9);\
1766 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1767 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1768 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1769 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1771 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1772 uint8_t full[16*9];\
1775 uint8_t halfHV[64];\
1776 copy_block9(full, src, 16, stride, 9);\
1777 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1778 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1779 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1780 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1782 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1783 uint8_t full[16*9];\
1785 uint8_t halfHV[64];\
1786 copy_block9(full, src, 16, stride, 9);\
1787 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1788 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1789 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1790 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1792 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1793 uint8_t full[16*9];\
1796 uint8_t halfHV[64];\
1797 copy_block9(full, src, 16, stride, 9);\
1798 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1799 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1800 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1801 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1803 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1804 uint8_t full[16*9];\
1806 uint8_t halfHV[64];\
1807 copy_block9(full, src, 16, stride, 9);\
1808 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1809 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1810 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1811 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1813 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1814 uint8_t full[16*9];\
1817 uint8_t halfHV[64];\
1818 copy_block9(full, src, 16, stride, 9);\
1819 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1820 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1821 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1822 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1824 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1825 uint8_t full[16*9];\
1827 uint8_t halfHV[64];\
1828 copy_block9(full, src, 16, stride, 9);\
1829 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1830 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1831 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1832 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1834 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1836 uint8_t halfHV[64];\
1837 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1838 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1839 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1841 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1843 uint8_t halfHV[64];\
1844 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1845 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1846 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1848 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1849 uint8_t full[16*9];\
1852 uint8_t halfHV[64];\
1853 copy_block9(full, src, 16, stride, 9);\
1854 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1855 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1856 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1857 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1859 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1860 uint8_t full[16*9];\
1862 copy_block9(full, src, 16, stride, 9);\
1863 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1864 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1865 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1867 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1868 uint8_t full[16*9];\
1871 uint8_t halfHV[64];\
1872 copy_block9(full, src, 16, stride, 9);\
1873 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1874 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1875 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1876 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1878 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1879 uint8_t full[16*9];\
1881 copy_block9(full, src, 16, stride, 9);\
1882 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1883 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1884 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1886 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1888 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1889 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1891 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1892 OPNAME ## pixels16_c(dst, src, stride, 16);\
1895 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1897 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1898 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1901 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1902 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1905 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1907 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1908 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1911 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1912 uint8_t full[24*17];\
1914 copy_block17(full, src, 24, stride, 17);\
1915 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1916 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1919 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1920 uint8_t full[24*17];\
1921 copy_block17(full, src, 24, stride, 17);\
1922 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1925 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1926 uint8_t full[24*17];\
1928 copy_block17(full, src, 24, stride, 17);\
1929 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1930 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1932 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1933 uint8_t full[24*17];\
1934 uint8_t halfH[272];\
1935 uint8_t halfV[256];\
1936 uint8_t halfHV[256];\
1937 copy_block17(full, src, 24, stride, 17);\
1938 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1939 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1940 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1941 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1943 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1944 uint8_t full[24*17];\
1945 uint8_t halfH[272];\
1946 uint8_t halfHV[256];\
1947 copy_block17(full, src, 24, stride, 17);\
1948 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1949 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1950 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1951 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1953 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1954 uint8_t full[24*17];\
1955 uint8_t halfH[272];\
1956 uint8_t halfV[256];\
1957 uint8_t halfHV[256];\
1958 copy_block17(full, src, 24, stride, 17);\
1959 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1960 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1961 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1962 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1964 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1965 uint8_t full[24*17];\
1966 uint8_t halfH[272];\
1967 uint8_t halfHV[256];\
1968 copy_block17(full, src, 24, stride, 17);\
1969 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1970 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1971 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1972 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1974 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1975 uint8_t full[24*17];\
1976 uint8_t halfH[272];\
1977 uint8_t halfV[256];\
1978 uint8_t halfHV[256];\
1979 copy_block17(full, src, 24, stride, 17);\
1980 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1981 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1982 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1983 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1985 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1986 uint8_t full[24*17];\
1987 uint8_t halfH[272];\
1988 uint8_t halfHV[256];\
1989 copy_block17(full, src, 24, stride, 17);\
1990 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1991 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1992 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1993 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1995 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1996 uint8_t full[24*17];\
1997 uint8_t halfH[272];\
1998 uint8_t halfV[256];\
1999 uint8_t halfHV[256];\
2000 copy_block17(full, src, 24, stride, 17);\
2001 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2002 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2003 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2004 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2006 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2007 uint8_t full[24*17];\
2008 uint8_t halfH[272];\
2009 uint8_t halfHV[256];\
2010 copy_block17(full, src, 24, stride, 17);\
2011 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2012 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2013 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2014 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2016 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2017 uint8_t halfH[272];\
2018 uint8_t halfHV[256];\
2019 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2020 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2021 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2023 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2024 uint8_t halfH[272];\
2025 uint8_t halfHV[256];\
2026 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2027 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2028 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2030 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2031 uint8_t full[24*17];\
2032 uint8_t halfH[272];\
2033 uint8_t halfV[256];\
2034 uint8_t halfHV[256];\
2035 copy_block17(full, src, 24, stride, 17);\
2036 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2037 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2038 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2039 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2041 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2042 uint8_t full[24*17];\
2043 uint8_t halfH[272];\
2044 copy_block17(full, src, 24, stride, 17);\
2045 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2046 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2047 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2049 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2050 uint8_t full[24*17];\
2051 uint8_t halfH[272];\
2052 uint8_t halfV[256];\
2053 uint8_t halfHV[256];\
2054 copy_block17(full, src, 24, stride, 17);\
2055 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2056 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2057 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2058 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2060 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2061 uint8_t full[24*17];\
2062 uint8_t halfH[272];\
2063 copy_block17(full, src, 24, stride, 17);\
2064 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2065 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2066 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2068 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2069 uint8_t halfH[272];\
2070 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2071 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/*
 * Rounding primitives used inside QPEL_MC: 'b' is the 5-tap filter sum
 * scaled by 32; cm (cropTbl + MAX_NEG_CROP) clips to 0..255.
 *   +16 >> 5 : normal rounding        +15 >> 5 : "no rounding" variant
 *   op_put stores; op_avg averages with the existing destination pixel.
 */
2074 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2075 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2076 #define op_put(a, b) a = cm[((b) + 16)>>5]
2077 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the qpel MC function families for each rounding mode. */
2079 QPEL_MC(0, put_ , _ , op_put)
2080 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2081 QPEL_MC(0, avg_ , _ , op_avg)
2082 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* NOTE(review): the matching '#undef op_avg' / '#undef op_put' lines
 * appear to have been elided from this listing — confirm in full source. */
2084 #undef op_avg_no_rnd
2086 #undef op_put_no_rnd
2089 #define H264_LOWPASS(OPNAME, OP, OP2) \
2090 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2092 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2096 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2097 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2103 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2105 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2109 const int srcB= src[-2*srcStride];\
2110 const int srcA= src[-1*srcStride];\
2111 const int src0= src[0 *srcStride];\
2112 const int src1= src[1 *srcStride];\
2113 const int src2= src[2 *srcStride];\
2114 const int src3= src[3 *srcStride];\
2115 const int src4= src[4 *srcStride];\
2116 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2117 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2123 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2126 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2128 src -= 2*srcStride;\
2129 for(i=0; i<h+5; i++)\
2131 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2132 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2136 tmp -= tmpStride*(h+5-2);\
2139 const int tmpB= tmp[-2*tmpStride];\
2140 const int tmpA= tmp[-1*tmpStride];\
2141 const int tmp0= tmp[0 *tmpStride];\
2142 const int tmp1= tmp[1 *tmpStride];\
2143 const int tmp2= tmp[2 *tmpStride];\
2144 const int tmp3= tmp[3 *tmpStride];\
2145 const int tmp4= tmp[4 *tmpStride];\
2146 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2147 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2152 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2154 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2158 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2159 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2160 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2161 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2167 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2169 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2173 const int srcB= src[-2*srcStride];\
2174 const int srcA= src[-1*srcStride];\
2175 const int src0= src[0 *srcStride];\
2176 const int src1= src[1 *srcStride];\
2177 const int src2= src[2 *srcStride];\
2178 const int src3= src[3 *srcStride];\
2179 const int src4= src[4 *srcStride];\
2180 const int src5= src[5 *srcStride];\
2181 const int src6= src[6 *srcStride];\
2182 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2183 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2184 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2185 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2191 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2194 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2196 src -= 2*srcStride;\
2197 for(i=0; i<h+5; i++)\
2199 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2200 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2201 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2202 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2206 tmp -= tmpStride*(h+5-2);\
2209 const int tmpB= tmp[-2*tmpStride];\
2210 const int tmpA= tmp[-1*tmpStride];\
2211 const int tmp0= tmp[0 *tmpStride];\
2212 const int tmp1= tmp[1 *tmpStride];\
2213 const int tmp2= tmp[2 *tmpStride];\
2214 const int tmp3= tmp[3 *tmpStride];\
2215 const int tmp4= tmp[4 *tmpStride];\
2216 const int tmp5= tmp[5 *tmpStride];\
2217 const int tmp6= tmp[6 *tmpStride];\
2218 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2219 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2220 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2221 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2227 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2229 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2233 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2234 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2235 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2236 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2237 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2238 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2239 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2240 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2246 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2248 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2252 const int srcB= src[-2*srcStride];\
2253 const int srcA= src[-1*srcStride];\
2254 const int src0= src[0 *srcStride];\
2255 const int src1= src[1 *srcStride];\
2256 const int src2= src[2 *srcStride];\
2257 const int src3= src[3 *srcStride];\
2258 const int src4= src[4 *srcStride];\
2259 const int src5= src[5 *srcStride];\
2260 const int src6= src[6 *srcStride];\
2261 const int src7= src[7 *srcStride];\
2262 const int src8= src[8 *srcStride];\
2263 const int src9= src[9 *srcStride];\
2264 const int src10=src[10*srcStride];\
2265 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2266 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2267 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2268 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2269 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2270 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2271 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2272 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2278 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2281 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2283 src -= 2*srcStride;\
2284 for(i=0; i<h+5; i++)\
2286 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2287 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2288 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2289 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2290 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2291 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2292 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2293 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2297 tmp -= tmpStride*(h+5-2);\
2300 const int tmpB= tmp[-2*tmpStride];\
2301 const int tmpA= tmp[-1*tmpStride];\
2302 const int tmp0= tmp[0 *tmpStride];\
2303 const int tmp1= tmp[1 *tmpStride];\
2304 const int tmp2= tmp[2 *tmpStride];\
2305 const int tmp3= tmp[3 *tmpStride];\
2306 const int tmp4= tmp[4 *tmpStride];\
2307 const int tmp5= tmp[5 *tmpStride];\
2308 const int tmp6= tmp[6 *tmpStride];\
2309 const int tmp7= tmp[7 *tmpStride];\
2310 const int tmp8= tmp[8 *tmpStride];\
2311 const int tmp9= tmp[9 *tmpStride];\
2312 const int tmp10=tmp[10*tmpStride];\
2313 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2314 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2315 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2316 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2317 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2318 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2319 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2320 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2326 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2327 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2328 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2329 src += 8*srcStride;\
2330 dst += 8*dstStride;\
2331 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2332 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2335 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2336 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2337 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2338 src += 8*srcStride;\
2339 dst += 8*dstStride;\
2340 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2341 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2344 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2345 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2346 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2347 src += 8*srcStride;\
2348 dst += 8*dstStride;\
2349 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2350 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2353 #define H264_MC(OPNAME, SIZE) \
2354 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2355 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2358 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2359 uint8_t half[SIZE*SIZE];\
2360 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2361 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2364 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2365 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2368 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2369 uint8_t half[SIZE*SIZE];\
2370 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2371 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2374 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2375 uint8_t full[SIZE*(SIZE+5)];\
2376 uint8_t * const full_mid= full + SIZE*2;\
2377 uint8_t half[SIZE*SIZE];\
2378 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2379 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2380 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2383 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2384 uint8_t full[SIZE*(SIZE+5)];\
2385 uint8_t * const full_mid= full + SIZE*2;\
2386 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2387 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2390 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2391 uint8_t full[SIZE*(SIZE+5)];\
2392 uint8_t * const full_mid= full + SIZE*2;\
2393 uint8_t half[SIZE*SIZE];\
2394 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2395 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2396 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2399 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2400 uint8_t full[SIZE*(SIZE+5)];\
2401 uint8_t * const full_mid= full + SIZE*2;\
2402 uint8_t halfH[SIZE*SIZE];\
2403 uint8_t halfV[SIZE*SIZE];\
2404 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2405 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2406 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2407 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2410 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2411 uint8_t full[SIZE*(SIZE+5)];\
2412 uint8_t * const full_mid= full + SIZE*2;\
2413 uint8_t halfH[SIZE*SIZE];\
2414 uint8_t halfV[SIZE*SIZE];\
2415 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2416 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2417 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2418 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2421 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2422 uint8_t full[SIZE*(SIZE+5)];\
2423 uint8_t * const full_mid= full + SIZE*2;\
2424 uint8_t halfH[SIZE*SIZE];\
2425 uint8_t halfV[SIZE*SIZE];\
2426 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2427 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2428 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2429 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2432 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2433 uint8_t full[SIZE*(SIZE+5)];\
2434 uint8_t * const full_mid= full + SIZE*2;\
2435 uint8_t halfH[SIZE*SIZE];\
2436 uint8_t halfV[SIZE*SIZE];\
2437 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2438 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2439 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2440 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2443 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2444 int16_t tmp[SIZE*(SIZE+5)];\
2445 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2448 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2449 int16_t tmp[SIZE*(SIZE+5)];\
2450 uint8_t halfH[SIZE*SIZE];\
2451 uint8_t halfHV[SIZE*SIZE];\
2452 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2453 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2454 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2457 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2458 int16_t tmp[SIZE*(SIZE+5)];\
2459 uint8_t halfH[SIZE*SIZE];\
2460 uint8_t halfHV[SIZE*SIZE];\
2461 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2462 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2463 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2466 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2467 uint8_t full[SIZE*(SIZE+5)];\
2468 uint8_t * const full_mid= full + SIZE*2;\
2469 int16_t tmp[SIZE*(SIZE+5)];\
2470 uint8_t halfV[SIZE*SIZE];\
2471 uint8_t halfHV[SIZE*SIZE];\
2472 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2473 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2474 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2475 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2478 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2479 uint8_t full[SIZE*(SIZE+5)];\
2480 uint8_t * const full_mid= full + SIZE*2;\
2481 int16_t tmp[SIZE*(SIZE+5)];\
2482 uint8_t halfV[SIZE*SIZE];\
2483 uint8_t halfHV[SIZE*SIZE];\
2484 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2485 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2486 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2487 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/*
 * Rounding primitives for H264_LOWPASS:
 *   OP  (op_put/op_avg):   single-pass result, sum scaled by 32 -> (+16)>>5
 *   OP2 (op2_put/op2_avg): two-pass hv result, sum scaled by 1024 -> (+512)>>10
 * cm clips to 0..255; the avg forms round-average with the destination.
 */
2490 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2491 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2492 #define op_put(a, b) a = cm[((b) + 16)>>5]
2493 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2494 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put_ and avg_ h264 qpel lowpass families. */
2496 H264_LOWPASS(put_ , op_put, op2_put)
2497 H264_LOWPASS(avg_ , op_avg, op2_avg)
/*
 * H.264 weighted prediction primitives:
 *   op_scale1: uni-directional  block[x] = clip((block[x]*weight + offset) >> log2_denom)
 *   op_scale2: bi-directional   dst[x]   = clip((src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
 */
2512 #define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
2513 #define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/*
 * H264_WEIGHT(W,H) instantiates weight/biweight functions for one WxH
 * block size; the W==2/4/8 'continue's cut the per-row column expansion
 * short for narrow blocks.
 * NOTE(review): several macro lines (local declarations, the
 * op_scale1/op_scale2 column expansions, closing braces) appear elided in
 * this listing — confirm against the full source.
 */
2514 #define H264_WEIGHT(W,H) \
2515 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2517 offset <<= log2_denom; \
2518 if(log2_denom) offset += 1<<(log2_denom-1); \
2519 for(y=0; y<H; y++, block += stride){ \
2522 if(W==2) continue; \
2525 if(W==4) continue; \
2530 if(W==8) continue; \
2541 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2543 offset = ((offset + 1) | 1) << log2_denom; \
2544 for(y=0; y<H; y++, dst += stride, src += stride){ \
2547 if(W==2) continue; \
2550 if(W==4) continue; \
2555 if(W==8) continue; \
/*
 * WMV2 horizontal half-pel lowpass: 4-tap (-1, 9, 9, -1)/16 filter across
 * each row of an 8-pixel-wide block, with +8 for rounding; cm (cropTbl)
 * clips the result to 0..255. Reads src[-1]..src[9] per row.
 * NOTE(review): the per-row loop over h, the dst/src stride advance and
 * the closing brace appear elided in this listing — confirm in full source.
 */
2582 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2583 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2587 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2588 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2589 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2590 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2591 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2592 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2593 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2594 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2600 #ifdef CONFIG_CAVS_DECODER
/* Non-static wrappers exposing the file-local put/avg pixel copies to the
 * CAVS decoder (mc00 = full-pel position, i.e. plain block copy/average).
 * NOTE(review): the closing braces of these one-line wrappers appear
 * elided in this listing. */
2602 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2604 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2605 put_pixels8_c(dst, src, stride, 8);
2607 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2608 avg_pixels8_c(dst, src, stride, 8);
2610 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2611 put_pixels16_c(dst, src, stride, 16);
2613 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2614 avg_pixels16_c(dst, src, stride, 16);
2616 #endif /* CONFIG_CAVS_DECODER */
2618 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* Non-static wrapper exposing the file-local 8x8 block copy to the VC-1 /
 * WMV3 decoder; 'rnd' is accepted for signature compatibility but a plain
 * copy needs no rounding. */
2620 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2622 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2623 put_pixels8_c(dst, src, stride, 8);
2625 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
/*
 * WMV2 vertical half-pel lowpass: same (-1, 9, 9, -1)/16 kernel as the
 * horizontal version, applied down one column (reads src[-srcStride] up
 * to src[9*srcStride]); cm clips to 0..255.
 * NOTE(review): the loop over the w columns and the closing brace appear
 * elided in this listing — confirm in full source.
 */
2627 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2628 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2632 const int src_1= src[ -srcStride];
2633 const int src0 = src[0 ];
2634 const int src1 = src[ srcStride];
2635 const int src2 = src[2*srcStride];
2636 const int src3 = src[3*srcStride];
2637 const int src4 = src[4*srcStride];
2638 const int src5 = src[5*srcStride];
2639 const int src6 = src[6*srcStride];
2640 const int src7 = src[7*srcStride];
2641 const int src8 = src[8*srcStride];
2642 const int src9 = src[9*srcStride];
2643 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2644 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2645 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2646 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2647 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2648 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2649 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2650 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/*
 * WMV2 mspel motion compensation, 8x8 blocks. The mcXY suffix encodes the
 * (x,y) sub-pel position: mc00 = plain copy, mc20/mc02 = half-pel H/V
 * lowpass, mc10/mc30 = quarter positions built by averaging src (or src+1)
 * with the H lowpass, mc12/mc32/mc22 = combined H+V via intermediate
 * halfH/halfV/halfHV buffers.
 * NOTE(review): the local buffer declarations (half[], halfH[], halfV[],
 * halfHV[]) and the closing braces appear elided in this listing — confirm
 * against the full source.
 */
2656 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2657 put_pixels8_c(dst, src, stride, 8);
2660 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2662 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2663 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2666 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2667 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2670 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2672 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2673 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2676 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2677 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2680 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2684 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2685 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2686 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2687 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2689 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2693 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2694 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2695 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2696 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2698 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2700 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2701 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/*
 * H.263 (Annex J style) deblocking across a horizontal block edge.
 * For each column x, four pixels p0..p3 perpendicular to the edge are
 * filtered: d measures the edge step, d1 is a strength-dependent ramp
 * (dead zone outside +/-2*strength), and the inner pixels p1/p2 are
 * adjusted by +/-d1 while the outer pixels get a smaller +/-d2 correction.
 * 'strength' is looked up from the quantizer via ff_h263_loop_filter_strength.
 * NOTE(review): the 'for(x=...)' column loop, the d1/d2/ad1 declarations
 * and the p1 += d1 / p2 -= d1 update lines appear elided in this listing —
 * confirm against the full source.
 */
2704 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2706 const int strength= ff_h263_loop_filter_strength[qscale];
2710 int p0= src[x-2*stride];
2711 int p1= src[x-1*stride];
2712 int p2= src[x+0*stride];
2713 int p3= src[x+1*stride];
2714 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2716 if (d<-2*strength) d1= 0;
2717 else if(d<- strength) d1=-2*strength - d;
2718 else if(d< strength) d1= d;
2719 else if(d< 2*strength) d1= 2*strength - d;
/* Branchless 8-bit clamp: if the adjusted value left 0..255, the sign bit
 * of p>>31 selects 0 (negative) or -1 (which truncates to 255 in uint8). */
2724 if(p1&256) p1= ~(p1>>31);
2725 if(p2&256) p2= ~(p2>>31);
2727 src[x-1*stride] = p1;
2728 src[x+0*stride] = p2;
2732 d2= clip((p0-p3)/4, -ad1, ad1);
2734 src[x-2*stride] = p0 - d2;
2735 src[x+ stride] = p3 + d2;
/*
 * H.263 deblocking across a vertical block edge — transpose of
 * h263_v_loop_filter_c: p0..p3 are the four pixels of row y straddling
 * the edge, with the same d/d1/d2 ramp and branchless 0..255 clamp.
 * NOTE(review): the 'for(y=...)' row loop, the d1/d2/ad1 declarations and
 * the p1/p2 adjustment lines appear elided in this listing — confirm
 * against the full source.
 */
2739 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2741 const int strength= ff_h263_loop_filter_strength[qscale];
2745 int p0= src[y*stride-2];
2746 int p1= src[y*stride-1];
2747 int p2= src[y*stride+0];
2748 int p3= src[y*stride+1];
2749 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2751 if (d<-2*strength) d1= 0;
2752 else if(d<- strength) d1=-2*strength - d;
2753 else if(d< strength) d1= d;
2754 else if(d< 2*strength) d1= 2*strength - d;
2759 if(p1&256) p1= ~(p1>>31);
2760 if(p2&256) p2= ~(p2>>31);
2762 src[y*stride-1] = p1;
2763 src[y*stride+0] = p2;
2767 d2= clip((p0-p3)/4, -ad1, ad1);
2769 src[y*stride-2] = p0 - d2;
2770 src[y*stride+1] = p3 + d2;
/*
 * H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block,
 * done in fixed point via temp[64]. Vertical pass writes src*4-scaled
 * border rows and (1,2,1)-filtered interior into temp; horizontal pass
 * writes back with the remaining rounding shifts (>>2 on the first/last
 * column, >>4 with +8 rounding on the interior).
 * NOTE(review): the temp[] declaration, the x/y/yz loop headers and
 * closing braces appear elided in this listing — confirm in full source.
 */
2774 static void h261_loop_filter_c(uint8_t *src, int stride){
2779 temp[x ] = 4*src[x ];
2780 temp[x + 7*8] = 4*src[x + 7*stride];
2784 xy = y * stride + x;
2786 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2791 src[ y*stride] = (temp[ y*8] + 2)>>2;
2792 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2794 xy = y * stride + x;
2796 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/*
 * H.264 normal-strength luma deblocking of one 4-sample-wide edge segment
 * per tc0[i] (4 groups x 4 samples). xstride steps across the edge,
 * ystride along it, so the same code serves vertical and horizontal edges
 * via the wrappers below. A sample pair is filtered only when the
 * |p0-q0| < alpha and |p1-p0|, |q1-q0| < beta activity tests pass; the
 * secondary taps on p1/q1 are applied only when |p2-p0| / |q2-q0| < beta,
 * and all corrections are clipped to +/-tc.
 * NOTE(review): the tc initialisation from tc0[i] (and its increments for
 * the secondary taps), the pix advance by ystride, and closing braces
 * appear elided in this listing — confirm against the full source.
 */
2801 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2804 for( i = 0; i < 4; i++ ) {
2809 for( d = 0; d < 4; d++ ) {
2810 const int p0 = pix[-1*xstride];
2811 const int p1 = pix[-2*xstride];
2812 const int p2 = pix[-3*xstride];
2813 const int q0 = pix[0];
2814 const int q1 = pix[1*xstride];
2815 const int q2 = pix[2*xstride];
2817 if( ABS( p0 - q0 ) < alpha &&
2818 ABS( p1 - p0 ) < beta &&
2819 ABS( q1 - q0 ) < beta ) {
2824 if( ABS( p2 - p0 ) < beta ) {
2825 pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2828 if( ABS( q2 - q0 ) < beta ) {
2829 pix[ xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2833 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2834 pix[-xstride] = clip_uint8( p0 + i_delta ); /* p0' */
2835 pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
/* Thin wrappers selecting edge orientation via the stride arguments. */
2841 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2843 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2845 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2847 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/*
 * H.264 normal-strength chroma deblocking: like the luma filter but only
 * two samples per tc0 group, only p0/q0 are modified, and there are no
 * secondary p1/q1 taps. The delta is the same (4*(q0-p0)+(p1-q1)+4)>>3
 * clipped to +/-tc.
 * NOTE(review): the pix advance by ystride and closing braces appear
 * elided in this listing — confirm against the full source.
 */
2850 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2853 for( i = 0; i < 4; i++ ) {
2854 const int tc = tc0[i];
2859 for( d = 0; d < 2; d++ ) {
2860 const int p0 = pix[-1*xstride];
2861 const int p1 = pix[-2*xstride];
2862 const int q0 = pix[0];
2863 const int q1 = pix[1*xstride];
2865 if( ABS( p0 - q0 ) < alpha &&
2866 ABS( p1 - p0 ) < beta &&
2867 ABS( q1 - q0 ) < beta ) {
2869 int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2871 pix[-xstride] = clip_uint8( p0 + delta ); /* p0' */
2872 pix[0] = clip_uint8( q0 - delta ); /* q0' */
/* Thin wrappers selecting edge orientation via the stride arguments. */
2878 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2880 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2882 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2884 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/*
 * H.264 strong (intra, bS=4) chroma deblocking: no tc0 clipping table;
 * when the alpha/beta activity tests pass, p0 and q0 are replaced by the
 * fixed (2,1,1)/4 combinations of their neighbours across all 8 samples
 * of the edge.
 * NOTE(review): the pix advance by ystride and closing braces appear
 * elided in this listing — confirm against the full source.
 */
2887 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2890 for( d = 0; d < 8; d++ ) {
2891 const int p0 = pix[-1*xstride];
2892 const int p1 = pix[-2*xstride];
2893 const int q0 = pix[0];
2894 const int q1 = pix[1*xstride];
2896 if( ABS( p0 - q0 ) < alpha &&
2897 ABS( p1 - p0 ) < beta &&
2898 ABS( q1 - q0 ) < beta ) {
2900 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2901 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
/* Thin wrappers selecting edge orientation via the stride arguments. */
2906 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2908 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2910 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2912 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/**
 * Sum of absolute differences (SAD) between two 16-pixel-wide blocks.
 *
 * @param v         unused context pointer; present so the function matches
 *                  the me_cmp function-pointer signature
 * @param pix1      top-left of the first block
 * @param pix2      top-left of the second block
 * @param line_size byte stride between successive rows of both blocks
 * @param h         number of rows to compare
 * @return the SAD over the 16 x h region
 *
 * Fix: the listed version was missing the accumulator initialisation, the
 * per-row loop with the line_size pointer advance, and the return, so it
 * could not compile; the canonical row loop is restored. The unrolled
 * 16-column inner body is kept as-is for speed.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0; i<h; i++){
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/*
 * SAD between pix1 and the horizontal half-pel interpolation of pix2
 * (avg2 of each pixel and its right neighbour; reads pix2[0..16]).
 * NOTE(review): the accumulator/loop declarations, per-row loop with the
 * line_size advance, and the return appear elided in this listing —
 * confirm against the full source.
 */
2943 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2949 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2950 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2951 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2952 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2953 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2954 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2955 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2956 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2957 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2958 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2959 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2960 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2961 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2962 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2963 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2964 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/*
 * SAD between pix1 and the vertical half-pel interpolation of pix2
 * (avg2 of each pixel and the one below it, via pix3 = pix2 + line_size).
 * NOTE(review): the accumulator/loop declarations, per-row loop with the
 * pix1/pix2/pix3 advance, and the return appear elided in this listing —
 * confirm against the full source.
 */
2971 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2974 uint8_t *pix3 = pix2 + line_size;
2978 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2979 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2980 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2981 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2982 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2983 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2984 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2985 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2986 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2987 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2988 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2989 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2990 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2991 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2992 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2993 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/*
 * SAD between pix1 and the diagonal half-pel interpolation of pix2
 * (avg4 of the 2x2 neighbourhood spanning the current and next row).
 * NOTE(review): the accumulator/loop declarations, per-row loop with the
 * pointer advances, and the return appear elided in this listing —
 * confirm against the full source.
 */
3001 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3004 uint8_t *pix3 = pix2 + line_size;
3008 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3009 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3010 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3011 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3012 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3013 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3014 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3015 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3016 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3017 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3018 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3019 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3020 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3021 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3022 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3023 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/**
 * Sum of absolute differences (SAD) between two 8-pixel-wide blocks.
 *
 * @param v         unused context pointer; present so the function matches
 *                  the me_cmp function-pointer signature
 * @param pix1      top-left of the first block
 * @param pix2      top-left of the second block
 * @param line_size byte stride between successive rows of both blocks
 * @param h         number of rows to compare
 * @return the SAD over the 8 x h region
 *
 * Fix: the listed version was missing the accumulator initialisation, the
 * per-row loop with the line_size pointer advance, and the return, so it
 * could not compile; the canonical row loop is restored. The unrolled
 * 8-column inner body is kept as-is for speed.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0; i<h; i++){
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/*
 * 8-wide SAD against the horizontal half-pel interpolation of pix2
 * (avg2 of each pixel and its right neighbour; reads pix2[0..8]).
 * NOTE(review): the accumulator/loop declarations, per-row loop and the
 * return appear elided in this listing — confirm against the full source.
 */
3051 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3057 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3058 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3059 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3060 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3061 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3062 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3063 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3064 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/*
 * 8-wide SAD against the vertical half-pel average avg2(pix2[i], pix3[i]),
 * pix3 being the line below pix2.
 * NOTE(review): fragment — the loop over h and the return are elided.
 */
3071 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3074 uint8_t *pix3 = pix2 + line_size;
3078 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3079 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3080 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3081 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3082 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3083 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3084 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3085 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/*
 * 8-wide SAD against the half-pel (x+y) interpolation: avg4() over the
 * 2x2 neighbourhood spanning pix2 and the following line pix3.
 * NOTE(review): fragment — the loop over h and the return are elided.
 */
3093 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3096 uint8_t *pix3 = pix2 + line_size;
3100 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3101 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3102 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3103 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3104 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3105 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3106 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3107 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/*
 * Noise-shaped SSE, 16 wide: score1 is the plain squared error, score2 is
 * the difference in 2x2 gradient energy between source and reference
 * (penalises texture that (dis)appears).  The weight comes from
 * avctx->nsse_weight, falling back to 8 when no context is given.
 * NOTE(review): fragment — the y-loops, accumulator setup and the
 * per-line pointer advances are on elided lines.
 */
3115 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3116 MpegEncContext *c = v;
/* score1: plain sum of squared differences over the row */
3122 for(x=0; x<16; x++){
3123 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
/* score2: signed difference of the absolute 2x2 gradients of s1 and s2 */
3126 for(x=0; x<15; x++){
3127 score2+= ABS( s1[x ] - s1[x +stride]
3128 - s1[x+1] + s1[x+1+stride])
3129 -ABS( s2[x ] - s2[x +stride]
3130 - s2[x+1] + s2[x+1+stride]);
3137 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3138 else return score1 + ABS(score2)*8;
/*
 * 8-wide variant of nsse16_c: squared error plus a gradient-energy
 * difference term weighted by avctx->nsse_weight (default 8).
 * NOTE(review): fragment — loop headers and pointer advances are elided.
 */
3141 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3142 MpegEncContext *c = v;
3149 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3153 score2+= ABS( s1[x ] - s1[x +stride]
3154 - s1[x+1] + s1[x+1+stride])
3155 -ABS( s2[x ] - s2[x +stride]
3156 - s2[x+1] + s2[x+1+stride]);
3163 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
3164 else return score1 + ABS(score2)*8;
/*
 * Evaluates the weighted squared error that would result from adding
 * scale*basis (fixed point, BASIS_SHIFT -> RECON_SHIFT) to the residual.
 * NOTE(review): fragment — 'w' is presumably weight[i] read on an elided
 * line, and the final return of 'sum' is not visible; TODO confirm.
 */
3167 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3171 for(i=0; i<8*8; i++){
3172 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
/* sanity bound on the reconstructed residual */
3175 assert(-512<b && b<512);
3177 sum += (w*b)*(w*b)>>4;
/*
 * Adds scale*basis (rounded, BASIS_SHIFT -> RECON_SHIFT fixed point)
 * into the 8x8 residual block in place.
 */
3182 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3185 for(i=0; i<8*8; i++){
3186 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3191 * permutes an 8x8 block.
3192 * @param block the block which will be permuted according to the given permutation vector
3193 * @param permutation the permutation vector
3194 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3195 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3196 * (inverse) permuted to scantable order!
3198 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3204 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
/* first pass: presumably copies block[j] into a temp buffer (copy line
 * elided from this view — TODO confirm) */
3206 for(i=0; i<=last; i++){
3207 const int j= scantable[i];
/* second pass: write each coefficient back at its permuted position */
3212 for(i=0; i<=last; i++){
3213 const int j= scantable[i];
3214 const int perm_j= permutation[j];
3215 block[perm_j]= temp[j];
3219 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/*
 * Fills the 5-entry cmp[] function table for the given comparison 'type'
 * from the corresponding DSPContext function arrays.
 * NOTE(review): fragment — the switch/case structure selecting between
 * the assignments below is elided; only representative assignments remain.
 */
3223 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3226 memset(cmp, 0, sizeof(void*)*5);
3234 cmp[i]= c->hadamard8_diff[i];
3240 cmp[i]= c->dct_sad[i];
3243 cmp[i]= c->dct264_sad[i];
3246 cmp[i]= c->dct_max[i];
3249 cmp[i]= c->quant_psnr[i];
3269 #ifdef CONFIG_SNOW_ENCODER
/* reached when 'type' does not match any supported comparison */
3278 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3284 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* Zeroes all six 64-coefficient DCT blocks of a macroblock. */
3286 static void clear_blocks_c(DCTELEM *blocks)
3288 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/*
 * dst[i] += src[i] for w bytes: unrolled by 8 with a scalar tail loop
 * (tail loop header elided from this view).
 */
3291 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3293 for(i=0; i+7<w; i+=8){
3294 dst[i+0] += src[i+0];
3295 dst[i+1] += src[i+1];
3296 dst[i+2] += src[i+2];
3297 dst[i+3] += src[i+3];
3298 dst[i+4] += src[i+4];
3299 dst[i+5] += src[i+5];
3300 dst[i+6] += src[i+6];
3301 dst[i+7] += src[i+7];
/* scalar tail for the remaining w%8 bytes */
3304 dst[i+0] += src[i+0];
/*
 * dst[i] = src1[i] - src2[i] for w bytes: unrolled by 8 with a scalar
 * tail loop (tail loop header elided from this view).
 */
3307 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3309 for(i=0; i+7<w; i+=8){
3310 dst[i+0] = src1[i+0]-src2[i+0];
3311 dst[i+1] = src1[i+1]-src2[i+1];
3312 dst[i+2] = src1[i+2]-src2[i+2];
3313 dst[i+3] = src1[i+3]-src2[i+3];
3314 dst[i+4] = src1[i+4]-src2[i+4];
3315 dst[i+5] = src1[i+5]-src2[i+5];
3316 dst[i+6] = src1[i+6]-src2[i+6];
3317 dst[i+7] = src1[i+7]-src2[i+7];
/* scalar tail for the remaining w%8 bytes */
3320 dst[i+0] = src1[i+0]-src2[i+0];
/*
 * HuffYUV median prediction subtraction: pred is the median of left (l),
 * above (src1[i]) and left+above-lefttop (lt), each kept in 0..255.
 * NOTE(review): fragment — l/lt updates, the loop and the write of the
 * residual into dst are on elided lines.
 */
3323 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3331 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3341 #define BUTTERFLY2(o1,o2,i1,i2) \
3345 #define BUTTERFLY1(x,y) \
3354 #define BUTTERFLYA(x,y) (ABS((x)+(y)) + ABS((x)-(y)))
/*
 * SATD: 8x8 Hadamard transform of the src-dst difference, then sum of
 * absolute transform coefficients.  BUTTERFLY2/BUTTERFLY1/BUTTERFLYA are
 * defined above (definitions not visible in this fragment).
 * NOTE(review): fragment — the temp[] declaration, the row/column loop
 * headers and the return of 'sum' are on elided lines.
 */
3356 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3364 //FIXME try pointer walks
/* horizontal pass: 8-point butterfly network applied to each row of
 * the difference src-dst */
3365 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3366 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3367 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3368 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3370 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3371 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3372 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3373 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3375 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3376 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3377 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3378 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical pass: same butterfly network down each column, with the last
 * stage folded into the absolute-value accumulation (BUTTERFLYA) */
3382 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3383 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3384 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3385 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3387 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3388 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3389 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3390 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3393 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3394 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3395 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3396 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* debug print of the running maximum; presumably inside a disabled
 * preprocessor block on elided lines — TODO confirm */
3402 printf("MAX:%d\n", maxi);
/*
 * Intra SATD: 8x8 Hadamard transform of src itself (no reference), sum of
 * absolute coefficients minus the DC magnitude (see the '-mean' line).
 * NOTE(review): fragment — temp[] declaration, loop headers and the
 * return of 'sum' are on elided lines.
 */
3408 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3416 //FIXME try pointer walks
/* horizontal butterfly passes per row */
3417 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3418 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3419 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3420 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3422 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3423 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3424 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3425 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3427 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3428 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3429 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3430 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical butterfly passes per column, last stage folded into the
 * absolute-value accumulation */
3434 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3435 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3436 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3437 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3439 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3440 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3441 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3442 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3445 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3446 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3447 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3448 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3451 sum -= ABS(temp[8*0] + temp[8*4]); // -mean
/*
 * DCT-domain SAD: takes the pixel difference of the two 8x8 blocks;
 * the forward DCT and the sum of absolute coefficients happen on lines
 * elided from this view.
 */
3456 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3457 MpegEncContext * const s= (MpegEncContext *)c;
/* 8-byte aligned scratch block for the DCT coefficients */
3458 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3459 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3464 s->dsp.diff_pixels(temp, src1, src2, stride);
3475 const int s07 = SRC(0) + SRC(7);\
3476 const int s16 = SRC(1) + SRC(6);\
3477 const int s25 = SRC(2) + SRC(5);\
3478 const int s34 = SRC(3) + SRC(4);\
3479 const int a0 = s07 + s34;\
3480 const int a1 = s16 + s25;\
3481 const int a2 = s07 - s34;\
3482 const int a3 = s16 - s25;\
3483 const int d07 = SRC(0) - SRC(7);\
3484 const int d16 = SRC(1) - SRC(6);\
3485 const int d25 = SRC(2) - SRC(5);\
3486 const int d34 = SRC(3) - SRC(4);\
3487 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3488 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3489 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3490 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3492 DST(1, a4 + (a7>>2)) ;\
3493 DST(2, a2 + (a3>>1)) ;\
3494 DST(3, a5 + (a6>>2)) ;\
3496 DST(5, a6 - (a5>>2)) ;\
3497 DST(6, (a2>>1) - a3 ) ;\
3498 DST(7, (a4>>2) - a7 ) ;\
/*
 * H.264-style 8x8 transform SAD: applies the DCT8_1D macro (defined
 * above, not visible here) first to the rows in place, then to the
 * columns while accumulating ABS() of each output into 'sum'.
 * NOTE(review): fragment — dct[][] declaration, #undef lines and the
 * return are elided.
 */
3501 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3502 MpegEncContext * const s= (MpegEncContext *)c;
3507 s->dsp.diff_pixels(dct, src1, src2, stride);
/* row pass: transform each row of dct in place */
3509 #define SRC(x) dct[i][x]
3510 #define DST(x,v) dct[i][x]= v
3511 for( i = 0; i < 8; i++ )
/* column pass: DST redefined to accumulate |v| instead of storing */
3516 #define SRC(x) dct[x][i]
3517 #define DST(x,v) sum += ABS(v)
3518 for( i = 0; i < 8; i++ )
/*
 * Maximum absolute DCT coefficient of the 8x8 difference block.
 * NOTE(review): fragment — the forward DCT call and the loop header
 * around the FFMAX accumulation are elided.
 */
3526 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3527 MpegEncContext * const s= (MpegEncContext *)c;
3528 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3529 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3534 s->dsp.diff_pixels(temp, src1, src2, stride);
3538 sum= FFMAX(sum, ABS(temp[i]));
3543 void simple_idct(DCTELEM *block); //FIXME
/*
 * Quantisation-noise metric: DCT+quantise+dequantise+IDCT the difference
 * block and return the squared error against the unprocessed coefficients.
 * NOTE(review): fragment — the forward DCT call, loop headers and the
 * return of 'sum' are elided.
 */
3545 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3546 MpegEncContext * const s= (MpegEncContext *)c;
/* one aligned buffer holding both the working block and its backup */
3547 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3548 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3549 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3555 s->dsp.diff_pixels(temp, src1, src2, stride);
3557 memcpy(bak, temp, 64*sizeof(DCTELEM));
3559 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3560 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3561 simple_idct(temp); //FIXME
3564 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/*
 * Rate-distortion compare: quantises the 8x8 difference, counts the bits
 * of the resulting run/level codes via the AC VLC length tables, then
 * reconstructs and measures SSE against src1.  Returns
 * distortion + lambda-scaled bit cost.
 * NOTE(review): 'distoration' is a long-standing typo of 'distortion' in
 * this code base; renaming it would be a code change, so it is kept.
 * Fragment — intra/inter branching, run/level extraction, the row loop
 * headers and several control-flow lines are elided from this view.
 */
3569 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3570 MpegEncContext * const s= (MpegEncContext *)c;
3571 const uint8_t *scantable= s->intra_scantable.permutated;
3572 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3573 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3574 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3575 uint8_t * const bak= (uint8_t*)aligned_bak;
3576 int i, last, run, bits, level, distoration, start_i;
3577 const int esc_length= s->ac_esc_length;
3579 uint8_t * last_length;
/* back up the 8 reference rows (two 32-bit copies per row) so the
 * reconstruction can be added onto them later */
3584 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3585 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3588 s->dsp.diff_pixels(temp, src1, src2, stride);
3590 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: DC coded separately with the luma DC table */
3596 length = s->intra_ac_vlc_length;
3597 last_length= s->intra_ac_vlc_last_length;
3598 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter path: all coefficients use the inter AC tables */
3601 length = s->inter_ac_vlc_length;
3602 last_length= s->inter_ac_vlc_last_length;
/* count the bits of each run/level pair along the scan order */
3607 for(i=start_i; i<last; i++){
3608 int j= scantable[i];
3613 if((level&(~127)) == 0){
3614 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* final (last) coefficient uses the 'last' VLC table */
3623 level= temp[i] + 64;
3627 if((level&(~127)) == 0){
3628 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3636 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3638 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3641 s->dsp.idct_add(bak, stride, temp);
3643 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
/* 109/128 ~= 0.85: empirical lambda factor scaling qscale^2 */
3645 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/*
 * Rate-only compare: same quantise + run/level VLC bit count as rd8x8_c,
 * but without reconstruction/distortion — returns the bit cost only.
 * NOTE(review): fragment — intra/inter branching, run/level extraction
 * and the return of 'bits' are on elided lines.
 */
3648 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3649 MpegEncContext * const s= (MpegEncContext *)c;
3650 const uint8_t *scantable= s->intra_scantable.permutated;
3651 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3652 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3653 int i, last, run, bits, level, start_i;
3654 const int esc_length= s->ac_esc_length;
3656 uint8_t * last_length;
3660 s->dsp.diff_pixels(temp, src1, src2, stride);
3662 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: DC coded separately with the luma DC table */
3668 length = s->intra_ac_vlc_length;
3669 last_length= s->intra_ac_vlc_last_length;
3670 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter path */
3673 length = s->inter_ac_vlc_length;
3674 last_length= s->inter_ac_vlc_last_length;
/* bit count of each run/level pair along the scan order */
3679 for(i=start_i; i<last; i++){
3680 int j= scantable[i];
3685 if((level&(~127)) == 0){
3686 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* final (last) coefficient uses the 'last' VLC table */
3695 level= temp[i] + 64;
3699 if((level&(~127)) == 0){
3700 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/*
 * Vertical-gradient SAD of a single 16-wide block: sums |s[x] - s[x+stride]|
 * (unrolled by 4).  NOTE(review): fragment — y-loop and return elided.
 */
3708 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3713 for(x=0; x<16; x+=4){
3714 score+= ABS(s[x ] - s[x +stride]) + ABS(s[x+1] - s[x+1+stride])
3715 +ABS(s[x+2] - s[x+2+stride]) + ABS(s[x+3] - s[x+3+stride]);
/*
 * SAD of the vertical gradients of the difference s1-s2, 16 wide.
 * NOTE(review): fragment — y-loop and return elided.
 */
3723 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3728 for(x=0; x<16; x++){
3729 score+= ABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ: square of its argument, used by the vsse metrics below. */
3738 #define SQ(a) ((a)*(a))
/*
 * Squared-error variant of vsad_intra16_c: sum of squared vertical
 * gradients.  NOTE(review): fragment — y-loop and return elided.
 */
3739 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3744 for(x=0; x<16; x+=4){
3745 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3746 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/*
 * Squared-error variant of vsad16_c on the difference s1-s2.
 * NOTE(review): fragment — y-loop and return elided.
 */
3754 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3759 for(x=0; x<16; x++){
3760 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/*
 * WARPER8_16_SQ ('warper' is a historical misspelling of 'wrapper')
 * generates a 16x16 compare function from an 8x8 one; its definition is
 * not visible in this fragment — presumably it sums the four 8x8
 * quadrant scores, TODO confirm.
 */
3769 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3770 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3771 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3773 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3775 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3776 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3777 WARPER8_16_SQ(rd8x8_c, rd16_c)
3778 WARPER8_16_SQ(bit8x8_c, bit16_c)
/*
 * Element-wise float multiply of src into dst.
 * NOTE(review): fragment — the loop body (presumably dst[i] *= src[i])
 * is on an elided line; TODO confirm.
 */
3780 static void vector_fmul_c(float *dst, const float *src, int len){
3782 for(i=0; i<len; i++)
/*
 * dst[i] = src0[i] * src1 read backwards.  The src1[-i] indexing implies
 * src1 is advanced to its last element on an elided line
 * (presumably src1 += len-1) — TODO confirm.
 */
3786 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3789 for(i=0; i<len; i++)
3790 dst[i] = src0[i] * src1[-i];
/*
 * dst[i*step] = src0[i]*src1[i] + src2[i] + src3, where src3 is an
 * integer scalar bias (note the unusual int type in the signature).
 */
3793 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3795 for(i=0; i<len; i++)
3796 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
/*
 * Float -> signed 16-bit conversion using a bit-level trick on the
 * IEEE-754 representation (reads the float's raw bits through an
 * int32_t pointer — a strict-aliasing violation tolerated here for
 * speed on the compilers of the era).
 * NOTE(review): fragment — the clamping logic between the magic-constant
 * test and the final store is on elided lines; TODO confirm exact
 * saturation behaviour.
 */
3799 void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
3801 for(i=0; i<len; i++) {
3802 int_fast32_t tmp = ((int32_t*)src)[i];
3804 tmp = (0x43c0ffff - tmp)>>31;
3805 // is this faster on some gcc/cpu combinations?
3806 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3809 dst[i] = tmp - 0x8000;
3813 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/*
 * Thin wrappers pairing the jpeg-reference IDCT with clamped put/add
 * pixel stores, at full size and at the 4/2/1-point lowres sizes.
 * NOTE(review): fragments — each wrapper's braces and the j_rev_dct*
 * call preceding the store are on elided lines.
 */
3815 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3818 put_pixels_clamped_c(block, dest, line_size);
3820 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3823 add_pixels_clamped_c(block, dest, line_size);
3826 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3829 put_pixels_clamped4_c(block, dest, line_size);
3831 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3834 add_pixels_clamped4_c(block, dest, line_size);
3837 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3840 put_pixels_clamped2_c(block, dest, line_size);
3842 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3845 add_pixels_clamped2_c(block, dest, line_size);
/* 1-point case: the "IDCT" degenerates to scaling the DC by 1/8 with
 * rounding, clamped through cropTbl */
3848 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3850 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3852 dest[0] = cm[(block[0] + 4)>>3];
3854 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3856 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3858 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3861 static void just_return() { return; }
3863 /* init static data */
/*
 * One-time initialisation of the static lookup tables: cropTbl (clamp to
 * 0..255 with MAX_NEG_CROP guard bands), squareTbl ((i-256)^2) and the
 * inverse zigzag table used by the MMX quantizer.
 * NOTE(review): fragment — the low-side cropTbl fill and loop braces are
 * on elided lines.
 */
3864 void dsputil_static_init(void)
3868 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3869 for(i=0;i<MAX_NEG_CROP;i++) {
3871 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3874 for(i=0;i<512;i++) {
3875 squareTbl[i] = (i - 256) * (i - 256);
/* +1 so that 0 can mean 'not coded' in the MMX quantizer */
3878 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3882 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3886 #ifdef CONFIG_ENCODERS
3887 if(avctx->dct_algo==FF_DCT_FASTINT) {
3888 c->fdct = fdct_ifast;
3889 c->fdct248 = fdct_ifast248;
3891 else if(avctx->dct_algo==FF_DCT_FAAN) {
3892 c->fdct = ff_faandct;
3893 c->fdct248 = ff_faandct248;
3896 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3897 c->fdct248 = ff_fdct248_islow;
3899 #endif //CONFIG_ENCODERS
3901 if(avctx->lowres==1){
3902 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3903 c->idct_put= ff_jref_idct4_put;
3904 c->idct_add= ff_jref_idct4_add;
3906 c->idct_put= ff_h264_lowres_idct_put_c;
3907 c->idct_add= ff_h264_lowres_idct_add_c;
3909 c->idct = j_rev_dct4;
3910 c->idct_permutation_type= FF_NO_IDCT_PERM;
3911 }else if(avctx->lowres==2){
3912 c->idct_put= ff_jref_idct2_put;
3913 c->idct_add= ff_jref_idct2_add;
3914 c->idct = j_rev_dct2;
3915 c->idct_permutation_type= FF_NO_IDCT_PERM;
3916 }else if(avctx->lowres==3){
3917 c->idct_put= ff_jref_idct1_put;
3918 c->idct_add= ff_jref_idct1_add;
3919 c->idct = j_rev_dct1;
3920 c->idct_permutation_type= FF_NO_IDCT_PERM;
3922 if(avctx->idct_algo==FF_IDCT_INT){
3923 c->idct_put= ff_jref_idct_put;
3924 c->idct_add= ff_jref_idct_add;
3925 c->idct = j_rev_dct;
3926 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3927 }else if(avctx->idct_algo==FF_IDCT_VP3){
3928 c->idct_put= ff_vp3_idct_put_c;
3929 c->idct_add= ff_vp3_idct_add_c;
3930 c->idct = ff_vp3_idct_c;
3931 c->idct_permutation_type= FF_NO_IDCT_PERM;
3932 }else{ //accurate/default
3933 c->idct_put= simple_idct_put;
3934 c->idct_add= simple_idct_add;
3935 c->idct = simple_idct;
3936 c->idct_permutation_type= FF_NO_IDCT_PERM;
3940 c->h264_idct_add= ff_h264_idct_add_c;
3941 c->h264_idct8_add= ff_h264_idct8_add_c;
3942 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3943 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3945 c->get_pixels = get_pixels_c;
3946 c->diff_pixels = diff_pixels_c;
3947 c->put_pixels_clamped = put_pixels_clamped_c;
3948 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3949 c->add_pixels_clamped = add_pixels_clamped_c;
3950 c->add_pixels8 = add_pixels8_c;
3951 c->add_pixels4 = add_pixels4_c;
3954 c->clear_blocks = clear_blocks_c;
3955 c->pix_sum = pix_sum_c;
3956 c->pix_norm1 = pix_norm1_c;
3958 /* TODO [0] 16 [1] 8 */
3959 c->pix_abs[0][0] = pix_abs16_c;
3960 c->pix_abs[0][1] = pix_abs16_x2_c;
3961 c->pix_abs[0][2] = pix_abs16_y2_c;
3962 c->pix_abs[0][3] = pix_abs16_xy2_c;
3963 c->pix_abs[1][0] = pix_abs8_c;
3964 c->pix_abs[1][1] = pix_abs8_x2_c;
3965 c->pix_abs[1][2] = pix_abs8_y2_c;
3966 c->pix_abs[1][3] = pix_abs8_xy2_c;
3968 #define dspfunc(PFX, IDX, NUM) \
3969 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3970 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3971 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3972 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3974 dspfunc(put, 0, 16);
3975 dspfunc(put_no_rnd, 0, 16);
3977 dspfunc(put_no_rnd, 1, 8);
3981 dspfunc(avg, 0, 16);
3982 dspfunc(avg_no_rnd, 0, 16);
3984 dspfunc(avg_no_rnd, 1, 8);
3989 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3990 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3992 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3993 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3994 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3995 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3996 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3997 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3998 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3999 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4000 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4002 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4003 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4004 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4005 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4006 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4007 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4008 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4009 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4010 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4012 #define dspfunc(PFX, IDX, NUM) \
4013 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4014 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4015 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4016 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4017 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4018 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4019 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4020 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4021 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4022 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4023 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4024 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4025 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4026 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4027 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4028 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4030 dspfunc(put_qpel, 0, 16);
4031 dspfunc(put_no_rnd_qpel, 0, 16);
4033 dspfunc(avg_qpel, 0, 16);
4034 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4036 dspfunc(put_qpel, 1, 8);
4037 dspfunc(put_no_rnd_qpel, 1, 8);
4039 dspfunc(avg_qpel, 1, 8);
4040 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4042 dspfunc(put_h264_qpel, 0, 16);
4043 dspfunc(put_h264_qpel, 1, 8);
4044 dspfunc(put_h264_qpel, 2, 4);
4045 dspfunc(put_h264_qpel, 3, 2);
4046 dspfunc(avg_h264_qpel, 0, 16);
4047 dspfunc(avg_h264_qpel, 1, 8);
4048 dspfunc(avg_h264_qpel, 2, 4);
4051 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4052 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4053 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4054 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4055 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4056 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4057 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4059 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4060 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4061 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4062 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4063 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4064 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4065 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4066 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4067 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4068 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4069 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4070 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4071 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4072 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4073 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4074 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4075 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4076 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4077 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4078 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4080 #ifdef CONFIG_CAVS_DECODER
4081 ff_cavsdsp_init(c,avctx);
4083 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4084 ff_vc1dsp_init(c,avctx);
4087 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4088 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4089 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4090 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4091 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4092 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4093 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4094 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4096 #define SET_CMP_FUNC(name) \
4097 c->name[0]= name ## 16_c;\
4098 c->name[1]= name ## 8x8_c;
4100 SET_CMP_FUNC(hadamard8_diff)
4101 c->hadamard8_diff[4]= hadamard8_intra16_c;
4102 SET_CMP_FUNC(dct_sad)
4103 SET_CMP_FUNC(dct_max)
4105 SET_CMP_FUNC(dct264_sad)
4107 c->sad[0]= pix_abs16_c;
4108 c->sad[1]= pix_abs8_c;
4112 SET_CMP_FUNC(quant_psnr)
4115 c->vsad[0]= vsad16_c;
4116 c->vsad[4]= vsad_intra16_c;
4117 c->vsse[0]= vsse16_c;
4118 c->vsse[4]= vsse_intra16_c;
4119 c->nsse[0]= nsse16_c;
4120 c->nsse[1]= nsse8_c;
4121 #ifdef CONFIG_SNOW_ENCODER
4122 c->w53[0]= w53_16_c;
4124 c->w97[0]= w97_16_c;
4128 c->add_bytes= add_bytes_c;
4129 c->diff_bytes= diff_bytes_c;
4130 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4131 c->bswap_buf= bswap_buf;
4133 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4134 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4135 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4136 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4137 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4138 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4139 c->h264_loop_filter_strength= NULL;
4141 c->h263_h_loop_filter= h263_h_loop_filter_c;
4142 c->h263_v_loop_filter= h263_v_loop_filter_c;
4144 c->h261_loop_filter= h261_loop_filter_c;
4146 c->try_8x8basis= try_8x8basis_c;
4147 c->add_8x8basis= add_8x8basis_c;
4149 #ifdef CONFIG_SNOW_ENCODER
4150 c->vertical_compose97i = ff_snow_vertical_compose97i;
4151 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4152 c->inner_add_yblock = ff_snow_inner_add_yblock;
4155 #ifdef CONFIG_VORBIS_DECODER
4156 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4158 c->vector_fmul = vector_fmul_c;
4159 c->vector_fmul_reverse = vector_fmul_reverse_c;
4160 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4161 c->float_to_int16 = ff_float_to_int16_c;
4163 c->shrink[0]= ff_img_copy_plane;
4164 c->shrink[1]= ff_shrink22;
4165 c->shrink[2]= ff_shrink44;
4166 c->shrink[3]= ff_shrink88;
4168 c->prefetch= just_return;
4171 dsputil_init_mmx(c, avctx);
4174 dsputil_init_armv4l(c, avctx);
4177 dsputil_init_mlib(c, avctx);
4180 dsputil_init_vis(c,avctx);
4183 dsputil_init_alpha(c, avctx);
4186 dsputil_init_ppc(c, avctx);
4189 dsputil_init_mmi(c, avctx);
4192 dsputil_init_sh4(c,avctx);
4195 switch(c->idct_permutation_type){
4196 case FF_NO_IDCT_PERM:
4198 c->idct_permutation[i]= i;
4200 case FF_LIBMPEG2_IDCT_PERM:
4202 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4204 case FF_SIMPLE_IDCT_PERM:
4206 c->idct_permutation[i]= simple_mmx_permutation[i];
4208 case FF_TRANSPOSE_IDCT_PERM:
4210 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4212 case FF_PARTTRANS_IDCT_PERM:
4214 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4217 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");