3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
32 #include "mpegvideo.h"
33 #include "simple_idct.h"
/* Prototypes of routines that are called or referenced from this file but
 * whose definitions are not part of it. ff_spatial_dwt is used by w_c()
 * below (the CONFIG_SNOW_ENCODER section notes that the dwt is in snow.c). */
38 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
41 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
/* Shared lookup tables, zero-initialised at this point.
 * - ff_cropTbl is accessed through "cm = ff_cropTbl + MAX_NEG_CROP" by the
 *   *_clamped routines in this file, i.e. it has MAX_NEG_CROP entries of
 *   headroom on each side of a 256-entry centre.
 * - ff_squareTbl is accessed through "sq = ff_squareTbl + 256" by the
 *   sse and pix_norm1 routines, so sq[] may be indexed from -256 to 255.
 * NOTE(review): the code that fills both tables is not in this section. */
43 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
44 uint32_t ff_squareTbl[512] = {0, };
/* Zigzag scan order: 64 block indices in the range 0..63. The opening
 * entries (0, 1, 8, 16, 9, 2, ...) walk the anti-diagonals of an 8-wide,
 * row-major block (index = row*8 + column). */
46 const uint8_t ff_zigzag_direct[64] = {
47 0, 1, 8, 16, 9, 2, 3, 10,
48 17, 24, 32, 25, 18, 11, 4, 5,
49 12, 19, 26, 33, 40, 48, 41, 34,
50 27, 20, 13, 6, 7, 14, 21, 28,
51 35, 42, 49, 56, 57, 50, 43, 36,
52 29, 22, 15, 23, 30, 37, 44, 51,
53 58, 59, 52, 45, 38, 31, 39, 46,
54 53, 60, 61, 54, 47, 55, 62, 63
/* 64 block indices in the range 0..63. The table starts 0, 8, 1, 9, i.e.
 * it alternates between two adjacent rows, which matches the "interleave
 * the fields" remark in the comment below. */
57 /* Specific zigzag scan for 248 idct. NOTE that unlike the
58 specification, we interleave the fields */
59 const uint8_t ff_zigzag248_direct[64] = {
60 0, 8, 1, 9, 16, 24, 2, 10,
61 17, 25, 32, 40, 48, 56, 33, 41,
62 18, 26, 3, 11, 4, 12, 19, 27,
63 34, 42, 49, 57, 50, 58, 35, 43,
64 20, 28, 5, 13, 6, 14, 21, 29,
65 36, 44, 51, 59, 52, 60, 37, 45,
66 22, 30, 7, 15, 23, 31, 38, 46,
67 53, 61, 54, 62, 39, 47, 55, 63,
/* 8-byte aligned table of 64 uint16_t entries, zero-initialised here.
 * NOTE(review): the code that fills it is not in this section. */
70 /* inverse of zigzag_direct (not permuted), plus 1, for the MMX quantizer */
71 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate scan order: 64 block indices in the range 0..63. The first
 * entries (0, 1, 2, 3, 8, 9, ...) advance along a row before moving to the
 * next one, hence "horizontal". */
73 const uint8_t ff_alternate_horizontal_scan[64] = {
74 0, 1, 2, 3, 8, 9, 16, 17,
75 10, 11, 4, 5, 6, 7, 15, 14,
76 13, 12, 19, 18, 24, 25, 32, 33,
77 26, 27, 20, 21, 22, 23, 28, 29,
78 30, 31, 34, 35, 40, 41, 48, 49,
79 42, 43, 36, 37, 38, 39, 44, 45,
80 46, 47, 50, 51, 56, 57, 58, 59,
81 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate scan order: 64 block indices in the range 0..63. The first
 * entries (0, 8, 16, 24, 1, 9, ...) advance down a column (step 8) before
 * moving to the next one, hence "vertical". */
84 const uint8_t ff_alternate_vertical_scan[64] = {
85 0, 8, 16, 24, 1, 9, 2, 10,
86 17, 25, 32, 40, 48, 56, 57, 49,
87 41, 33, 26, 18, 3, 11, 4, 12,
88 19, 27, 34, 42, 50, 58, 35, 43,
89 51, 59, 20, 28, 5, 13, 6, 14,
90 21, 29, 36, 44, 52, 60, 37, 45,
91 53, 61, 22, 30, 7, 15, 23, 31,
92 38, 46, 54, 62, 39, 47, 55, 63,
/* Fixed-point reciprocal table used to replace a division by a multiply and
 * a 32-bit shift (see the identity below; the product therefore has to be
 * formed in 64 bits). Entries 0 and 1 are special: ff_inverse[0] is 0 and
 * ff_inverse[1] is 2^32-1, and the identity is only stated for b >= 2.
 * The remaining entries are consistent with 2^32/b rounded up, e.g.
 * ff_inverse[2] = 2^31, ff_inverse[3] = 1431655766, ff_inverse[4] = 2^30. */
95 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
96 const uint32_t ff_inverse[256]={
97 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
98 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
99 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
100 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
101 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
102 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
103 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
104 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
105 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
106 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
107 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
108 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
109 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
110 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
111 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
112 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
113 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
114 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
115 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
116 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
117 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
118 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
119 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
120 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
121 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
122 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
123 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
124 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
125 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
126 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
127 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
128 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
/* File-local table of 64 coefficient positions (0x00..0x3F).
 * NOTE(review): the code that applies this permutation is not in this
 * section. */
131 /* Input permutation for the simple_idct_mmx */
132 static const uint8_t simple_mmx_permutation[64]={
133 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
134 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
135 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
136 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
137 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
138 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
139 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
140 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Walks a 16-row block starting at pix, with rows line_size bytes apart.
 * Each row is handled in two groups of 8 (j advances by 8 up to 16).
 * NOTE(review): presumably returns the sum of the 256 pixel values, as the
 * name suggests - confirm against the accumulation statements. */
143 static int pix_sum_c(uint8_t * pix, int line_size)
148 for (i = 0; i < 16; i++) {
149 for (j = 0; j < 16; j += 8) {
/* step to the next row; the "- 16" compensates for pix having been
   advanced across the 16 pixels of the current row */
160 pix += line_size - 16;
/* Accumulates sq[] lookups of the pixels of a 16-row block starting at pix
 * (rows line_size bytes apart) into s. sq is ff_squareTbl offset by 256, so
 * it is indexed directly with the 0..255 byte values extracted below.
 * NOTE(review): presumably sq[v] == v*v, making the result the sum of
 * squared pixel values - the table is filled elsewhere, confirm. */
165 static int pix_norm1_c(uint8_t * pix, int line_size)
168 uint32_t *sq = ff_squareTbl + 256;
171 for (i = 0; i < 16; i++) {
172 for (j = 0; j < 16; j += 8) {
/* where long is wider than 32 bits, fetch 8 pixels with one 64-bit load
   and pick the bytes apart with shifts */
183 #if LONG_MAX > 2147483647
184 register uint64_t x=*(uint64_t*)pix;
186 s += sq[(x>>8)&0xff];
187 s += sq[(x>>16)&0xff];
188 s += sq[(x>>24)&0xff];
189 s += sq[(x>>32)&0xff];
190 s += sq[(x>>40)&0xff];
191 s += sq[(x>>48)&0xff];
192 s += sq[(x>>56)&0xff];
/* otherwise fetch the same 8 pixels with two 32-bit loads */
194 register uint32_t x=*(uint32_t*)pix;
196 s += sq[(x>>8)&0xff];
197 s += sq[(x>>16)&0xff];
198 s += sq[(x>>24)&0xff];
199 x=*(uint32_t*)(pix+4);
201 s += sq[(x>>8)&0xff];
202 s += sq[(x>>16)&0xff];
203 s += sq[(x>>24)&0xff];
/* step to the next row; the "- 16" compensates for pix having been
   advanced across the 16 pixels of the current row */
208 pix += line_size - 16;
/* Writes bswap_32() of each of the w 32-bit words of src to the same index
 * of dst. The main loop is unrolled to 8 words per iteration and runs while
 * a full group of 8 remains (i+8 <= w); the final statement handles one
 * word at a time for the remainder. */
213 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
216 for(i=0; i+8<=w; i+=8){
217 dst[i+0]= bswap_32(src[i+0]);
218 dst[i+1]= bswap_32(src[i+1]);
219 dst[i+2]= bswap_32(src[i+2]);
220 dst[i+3]= bswap_32(src[i+3]);
221 dst[i+4]= bswap_32(src[i+4]);
222 dst[i+5]= bswap_32(src[i+5]);
223 dst[i+6]= bswap_32(src[i+6]);
224 dst[i+7]= bswap_32(src[i+7]);
227 dst[i+0]= bswap_32(src[i+0]);
/* Accumulates, over h rows, sq[pix1[k] - pix2[k]] for the 4 pixels of each
 * row. sq points 256 entries into the 512-entry ff_squareTbl so that it can
 * be indexed with differences in the range -255..255. The parameter v is
 * not used in the visible statements.
 * NOTE(review): presumably sq[d] == d*d (sum of squared errors) - the table
 * is filled elsewhere, confirm. */
231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
234 uint32_t *sq = ff_squareTbl + 256;
237 for (i = 0; i < h; i++) {
238 s += sq[pix1[0] - pix2[0]];
239 s += sq[pix1[1] - pix2[1]];
240 s += sq[pix1[2] - pix2[2]];
241 s += sq[pix1[3] - pix2[3]];
/* Same accumulation as sse4_c, for rows of 8 pixels: over h rows, adds
 * sq[pix1[k] - pix2[k]] for k = 0..7. sq is offset by 256 into ff_squareTbl
 * so that negative differences index valid entries. */
248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
251 uint32_t *sq = ff_squareTbl + 256;
254 for (i = 0; i < h; i++) {
255 s += sq[pix1[0] - pix2[0]];
256 s += sq[pix1[1] - pix2[1]];
257 s += sq[pix1[2] - pix2[2]];
258 s += sq[pix1[3] - pix2[3]];
259 s += sq[pix1[4] - pix2[4]];
260 s += sq[pix1[5] - pix2[5]];
261 s += sq[pix1[6] - pix2[6]];
262 s += sq[pix1[7] - pix2[7]];
/* Same accumulation as sse4_c / sse8_c, for rows of 16 pixels: over h rows,
 * adds sq[pix1[k] - pix2[k]] for k = 0..15. sq is offset by 256 into
 * ff_squareTbl so that negative differences index valid entries. */
269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
272 uint32_t *sq = ff_squareTbl + 256;
275 for (i = 0; i < h; i++) {
276 s += sq[pix1[ 0] - pix2[ 0]];
277 s += sq[pix1[ 1] - pix2[ 1]];
278 s += sq[pix1[ 2] - pix2[ 2]];
279 s += sq[pix1[ 3] - pix2[ 3]];
280 s += sq[pix1[ 4] - pix2[ 4]];
281 s += sq[pix1[ 5] - pix2[ 5]];
282 s += sq[pix1[ 6] - pix2[ 6]];
283 s += sq[pix1[ 7] - pix2[ 7]];
284 s += sq[pix1[ 8] - pix2[ 8]];
285 s += sq[pix1[ 9] - pix2[ 9]];
286 s += sq[pix1[10] - pix2[10]];
287 s += sq[pix1[11] - pix2[11]];
288 s += sq[pix1[12] - pix2[12]];
289 s += sq[pix1[13] - pix2[13]];
290 s += sq[pix1[14] - pix2[14]];
291 s += sq[pix1[15] - pix2[15]];
300 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/* Wavelet-domain comparison of two w x h pixel blocks.
 *   v          - not used in the visible statements
 *   pix1, pix2 - the two blocks being compared
 *   w, h       - block width and height; the wrappers below pass w = 8, 16
 *                or 32
 *   type       - first index into scale[]; the wrappers pass 0 for the
 *                "w97" variants and 1 for the "w53" variants, matching the
 *                9/7 and 5/3 labels inside the table
 * Steps visible here: the pixel differences, multiplied by 16 (<<4), are
 * stored in tmp with a row pitch of 32 ints; ff_spatial_dwt() transforms tmp
 * in place with dec_count decomposition levels; every subband coefficient is
 * then weighted by scale[type][dec_count-3][level][ori]. */
301 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
/* 3 decomposition levels for 8-wide blocks, 4 for wider ones */
303 const int dec_count= w==8 ? 3 : 4;
/* per-subband weights, indexed [type][dec_count-3][level][orientation] */
306 static const int scale[2][2][4][4]={
310 {268, 239, 239, 213},
314 // 9/7 16x16 or 32x32 dec=4
315 {344, 310, 310, 280},
323 {275, 245, 245, 218},
327 // 5/3 16x16 or 32x32 dec=4
328 {352, 317, 317, 286},
/* fill tmp with the scaled differences, 4 pixels per inner iteration */
336 for (i = 0; i < h; i++) {
337 for (j = 0; j < w; j+=4) {
338 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
339 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
340 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
341 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
347 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
/* visit every subband: orientation 0 is only visited at level 0; bit 0 of
   ori selects the horizontal offset sx, bit 1 the vertical offset sy */
351 for(level=0; level<dec_count; level++){
352 for(ori= level ? 1 : 0; ori<4; ori++){
353 int size= w>>(dec_count-level);
354 int sx= (ori&1) ? size : 0;
355 int stride= 32<<(dec_count-level);
356 int sy= (ori&2) ? stride>>1 : 0;
358 for(i=0; i<size; i++){
359 for(j=0; j<size; j++){
360 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* w_c() for 8-pixel-wide blocks with type 1 (the 5/3 tables). */
370 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
371 return w_c(v, pix1, pix2, line_size, 8, h, 1);
/* w_c() for 8-pixel-wide blocks with type 0 (the 9/7 tables). */
374 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
375 return w_c(v, pix1, pix2, line_size, 8, h, 0);
/* w_c() for 16-pixel-wide blocks with type 1 (the 5/3 tables). */
378 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
379 return w_c(v, pix1, pix2, line_size, 16, h, 1);
/* w_c() for 16-pixel-wide blocks with type 0 (the 9/7 tables). */
382 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
383 return w_c(v, pix1, pix2, line_size, 16, h, 0);
/* w_c() for 32-pixel-wide blocks with type 1 (the 5/3 tables). Unlike the
 * 8- and 16-wide wrappers this one has external linkage. */
386 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
387 return w_c(v, pix1, pix2, line_size, 32, h, 1);
/* w_c() for 32-pixel-wide blocks with type 0 (the 9/7 tables). Unlike the
 * 8- and 16-wide wrappers this one has external linkage. */
390 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
391 return w_c(v, pix1, pix2, line_size, 32, h, 0);
/* Copies 8 consecutive pixels into block[0..7], widening each byte to
 * DCTELEM. block and pixels are declared not to alias (restrict on block).
 * NOTE(review): presumably repeated for 8 rows line_size bytes apart -
 * confirm against the enclosing loop. */
395 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
399 /* read the pixels */
401 block[0] = pixels[0];
402 block[1] = pixels[1];
403 block[2] = pixels[2];
404 block[3] = pixels[3];
405 block[4] = pixels[4];
406 block[5] = pixels[5];
407 block[6] = pixels[6];
408 block[7] = pixels[7];
/* Stores the signed per-pixel difference s1[k] - s2[k] of 8 consecutive
 * pixels into block[0..7].
 * NOTE(review): presumably repeated for 8 rows stride bytes apart - confirm
 * against the enclosing loop. */
414 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
415 const uint8_t *s2, int stride){
418 /* store the differences of the two pixel rows */
420 block[0] = s1[0] - s2[0];
421 block[1] = s1[1] - s2[1];
422 block[2] = s1[2] - s2[2];
423 block[3] = s1[3] - s2[3];
424 block[4] = s1[4] - s2[4];
425 block[5] = s1[5] - s2[5];
426 block[6] = s1[6] - s2[6];
427 block[7] = s1[7] - s2[7];
/* Writes 8 pixels per row: each coefficient block[k] is translated through
 * the cm lookup table and stored in pixels[k]. cm points MAX_NEG_CROP
 * entries into ff_cropTbl, so negative coefficients down to -MAX_NEG_CROP
 * still index inside the table.
 * NOTE(review): presumably cm[] clamps to 0..255 - the table is filled
 * elsewhere, confirm. */
435 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
439 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
441 /* look each coefficient up in cm and store it as a pixel */
443 pixels[0] = cm[block[0]];
444 pixels[1] = cm[block[1]];
445 pixels[2] = cm[block[2]];
446 pixels[3] = cm[block[3]];
447 pixels[4] = cm[block[4]];
448 pixels[5] = cm[block[5]];
449 pixels[6] = cm[block[6]];
450 pixels[7] = cm[block[7]];
/* 4-pixel-wide counterpart of put_pixels_clamped_c: each coefficient
 * block[k] (k = 0..3) is translated through cm (ff_cropTbl offset by
 * MAX_NEG_CROP) and stored in pixels[k]. */
457 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
461 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
463 /* look each coefficient up in cm and store it as a pixel */
465 pixels[0] = cm[block[0]];
466 pixels[1] = cm[block[1]];
467 pixels[2] = cm[block[2]];
468 pixels[3] = cm[block[3]];
/* 2-pixel-wide counterpart of put_pixels_clamped_c: each coefficient
 * block[k] (k = 0..1) is translated through cm (ff_cropTbl offset by
 * MAX_NEG_CROP) and stored in pixels[k]. */
475 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
479 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
481 /* look each coefficient up in cm and store it as a pixel */
483 pixels[0] = cm[block[0]];
484 pixels[1] = cm[block[1]];
/* Converts an 8x8 block of signed coefficients to unsigned pixels by adding
 * a bias of 128. Values above 127 take a separate branch, as does an
 * earlier range test.
 * NOTE(review): presumably the two out-of-range branches saturate to 0 and
 * 255 - confirm against the branch bodies. */
491 static void put_signed_pixels_clamped_c(const DCTELEM *block,
492 uint8_t *restrict pixels,
497 for (i = 0; i < 8; i++) {
498 for (j = 0; j < 8; j++) {
501 else if (*block > 127)
/* in-range value: shift the signed range up to the unsigned one */
504 *pixels = (uint8_t)(*block + 128);
/* next row; the "- 8" compensates for the 8 pixels already stepped over */
508 pixels += (line_size - 8);
/* Adds 8 coefficients per row to the existing pixels: the sum
 * pixels[k] + block[k] is translated through the cm lookup table and stored
 * back into pixels[k]. cm points MAX_NEG_CROP entries into ff_cropTbl so
 * that negative sums still index inside the table.
 * NOTE(review): presumably cm[] clamps to 0..255 - the table is filled
 * elsewhere, confirm. */
512 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
516 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
518 /* add each coefficient to its pixel and store the cm lookup of the sum */
520 pixels[0] = cm[pixels[0] + block[0]];
521 pixels[1] = cm[pixels[1] + block[1]];
522 pixels[2] = cm[pixels[2] + block[2]];
523 pixels[3] = cm[pixels[3] + block[3]];
524 pixels[4] = cm[pixels[4] + block[4]];
525 pixels[5] = cm[pixels[5] + block[5]];
526 pixels[6] = cm[pixels[6] + block[6]];
527 pixels[7] = cm[pixels[7] + block[7]];
/* 4-pixel-wide counterpart of add_pixels_clamped_c: the sum
 * pixels[k] + block[k] (k = 0..3) is translated through cm (ff_cropTbl
 * offset by MAX_NEG_CROP) and stored back into pixels[k]. */
533 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
537 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
539 /* add each coefficient to its pixel and store the cm lookup of the sum */
541 pixels[0] = cm[pixels[0] + block[0]];
542 pixels[1] = cm[pixels[1] + block[1]];
543 pixels[2] = cm[pixels[2] + block[2]];
544 pixels[3] = cm[pixels[3] + block[3]];
/* 2-pixel-wide counterpart of add_pixels_clamped_c: the sum
 * pixels[k] + block[k] (k = 0..1) is translated through cm (ff_cropTbl
 * offset by MAX_NEG_CROP) and stored back into pixels[k]. */
550 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
554 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
556 /* add each coefficient to its pixel and store the cm lookup of the sum */
558 pixels[0] = cm[pixels[0] + block[0]];
559 pixels[1] = cm[pixels[1] + block[1]];
/* Adds 8 coefficients per row to the existing pixels in place. Unlike the
 * add_pixels_clamped* routines there is no table lookup: the sum is simply
 * truncated to uint8_t by the compound assignment. */
565 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
569 pixels[0] += block[0];
570 pixels[1] += block[1];
571 pixels[2] += block[2];
572 pixels[3] += block[3];
573 pixels[4] += block[4];
574 pixels[5] += block[5];
575 pixels[6] += block[6];
576 pixels[7] += block[7];
/* 4-pixel-wide counterpart of add_pixels8_c: adds block[k] to pixels[k]
 * in place for k = 0..3, with the sum truncated to uint8_t (no clamping
 * table is involved). */
582 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
586 pixels[0] += block[0];
587 pixels[1] += block[1];
588 pixels[2] += block[2];
589 pixels[3] += block[3];
/* PIXOP2, 64-bit variant: generates a family of OPNAME-prefixed functions
 * that process 8 pixels at a time with one uint64_t load (LD64) and combine
 * the result with the destination through OP(dst, value).
 *
 * Per-byte averaging of two 8-pixel words a and b, without unpacking:
 *   (a&b) + (((a^b)&0xFE..FE)>>1)   average rounded down  (no_rnd variants)
 *   (a|b) - (((a^b)&0xFE..FE)>>1)   average rounded up    (plain variants)
 * The 0xFE mask clears the low bit of every byte before the shift so that no
 * bit crosses into the neighbouring byte.
 *
 * The xy2 variants average four neighbours. Every byte is split into its low
 * 2 bits (mask 0x03) and high 6 bits (mask 0xFC, pre-shifted by 2); the low
 * parts are summed together with a per-byte rounding constant (0x02 in the
 * plain variant, 0x01 in the no_rnd variant), and the result is
 * h0+h1+(((l0+l1)>>2)&0x0F..0F). l0/h0 and l1/h1 hold the partial sums of
 * two consecutive source rows, so each source row is loaded only once.
 *
 * The trailing CALL_2X_PIXELS lines derive the 16-pixel-wide functions from
 * the 8-pixel ones.
 * NOTE(review): the first function is generated as OPNAME_pixels, but the
 * first CALL_2X_PIXELS line refers to OPNAME_pixels_c - confirm whether this
 * variant is compiled at all. */
597 #define PIXOP2(OPNAME, OP) \
598 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
602 OP(*((uint64_t*)block), LD64(pixels));\
608 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
612 const uint64_t a= LD64(pixels );\
613 const uint64_t b= LD64(pixels+1);\
614 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
620 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
624 const uint64_t a= LD64(pixels );\
625 const uint64_t b= LD64(pixels+1);\
626 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
632 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
636 const uint64_t a= LD64(pixels );\
637 const uint64_t b= LD64(pixels+line_size);\
638 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
644 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
648 const uint64_t a= LD64(pixels );\
649 const uint64_t b= LD64(pixels+line_size);\
650 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
656 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
659 const uint64_t a= LD64(pixels );\
660 const uint64_t b= LD64(pixels+1);\
661 uint64_t l0= (a&0x0303030303030303ULL)\
662 + (b&0x0303030303030303ULL)\
663 + 0x0202020202020202ULL;\
664 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
665 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
669 for(i=0; i<h; i+=2){\
670 uint64_t a= LD64(pixels );\
671 uint64_t b= LD64(pixels+1);\
672 l1= (a&0x0303030303030303ULL)\
673 + (b&0x0303030303030303ULL);\
674 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
675 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
676 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
681 l0= (a&0x0303030303030303ULL)\
682 + (b&0x0303030303030303ULL)\
683 + 0x0202020202020202ULL;\
684 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
685 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
686 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
692 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
695 const uint64_t a= LD64(pixels );\
696 const uint64_t b= LD64(pixels+1);\
697 uint64_t l0= (a&0x0303030303030303ULL)\
698 + (b&0x0303030303030303ULL)\
699 + 0x0101010101010101ULL;\
700 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
701 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
705 for(i=0; i<h; i+=2){\
706 uint64_t a= LD64(pixels );\
707 uint64_t b= LD64(pixels+1);\
708 l1= (a&0x0303030303030303ULL)\
709 + (b&0x0303030303030303ULL);\
710 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
711 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
712 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
717 l0= (a&0x0303030303030303ULL)\
718 + (b&0x0303030303030303ULL)\
719 + 0x0101010101010101ULL;\
720 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
721 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
722 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
728 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
729 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
730 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
731 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
732 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
733 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
734 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* OP for the "avg" family of the 64-bit variant: replaces a with the
 * per-byte average of a and b, rounded up, computed on eight packed bytes
 * at once (the 0xFE mask keeps the shifted bits inside their own byte).
 * Expands to an assignment, so a must be an lvalue. */
736 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
737 #else // end of the 64 bit variant; the 32 bit variant follows
/* PIXOP2, 32-bit variant: generates a family of OPNAME-prefixed functions
 * built on 16- and 32-bit loads (LD16 / LD32); every store goes through
 * OP(dst, value).
 *
 *   _pixels{2,4,8}_c       copy-style operation on rows of 2, 4 or 8 pixels
 *   _no_rnd_pixels8_c      forwards to _pixels8_c (nothing to round)
 *   _pixels{2,4,8,16}_l2   per-byte rnd_avg32() of two sources, each with
 *                          its own stride
 *   _no_rnd_pixels{8,16}_l2  same with no_rnd_avg32()
 *   _x2_c / _y2_c          the _l2 function applied to (pixels, pixels+1)
 *                          or (pixels, pixels+line_size)
 *   _pixels{8,16}_l4 and _no_rnd_pixels{8,16}_l4
 *                          average of four sources
 *   _xy2_c                 average of a pixel with its right, lower and
 *                          lower-right neighbours
 *
 * The four-input averages split each byte into its low 2 bits (mask 0x03)
 * and its high 6 bits (mask 0xFC, pre-shifted by 2) and recombine them as
 * h0+h1+(((l0+l1)>>2)&0x0F0F0F0F), so four bytes are averaged per 32-bit
 * operation without carries leaking between bytes. In the xy2 functions
 * l0/h0 and l1/h1 carry the partial sums of two consecutive source rows, so
 * each source row is loaded only once.
 *
 * The 8-wide xy2 functions make two passes of 4 columns (j loop) and rewind
 * pixels/block to the top of the next 4-column strip after each pass.
 *
 * The trailing CALL_2X_PIXELS lines derive the 16-pixel-wide functions from
 * the 8-pixel ones. */
739 #define PIXOP2(OPNAME, OP) \
740 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
743 OP(*((uint16_t*)(block )), LD16(pixels ));\
748 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
751 OP(*((uint32_t*)(block )), LD32(pixels ));\
756 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
759 OP(*((uint32_t*)(block )), LD32(pixels ));\
760 OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
765 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
766 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
769 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
770 int src_stride1, int src_stride2, int h){\
774 a= LD32(&src1[i*src_stride1 ]);\
775 b= LD32(&src2[i*src_stride2 ]);\
776 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
777 a= LD32(&src1[i*src_stride1+4]);\
778 b= LD32(&src2[i*src_stride2+4]);\
779 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
783 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
784 int src_stride1, int src_stride2, int h){\
788 a= LD32(&src1[i*src_stride1 ]);\
789 b= LD32(&src2[i*src_stride2 ]);\
790 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
791 a= LD32(&src1[i*src_stride1+4]);\
792 b= LD32(&src2[i*src_stride2+4]);\
793 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
797 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
798 int src_stride1, int src_stride2, int h){\
802 a= LD32(&src1[i*src_stride1 ]);\
803 b= LD32(&src2[i*src_stride2 ]);\
804 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
808 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
809 int src_stride1, int src_stride2, int h){\
813 a= LD16(&src1[i*src_stride1 ]);\
814 b= LD16(&src2[i*src_stride2 ]);\
815 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
819 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
820 int src_stride1, int src_stride2, int h){\
821 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
822 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
825 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
826 int src_stride1, int src_stride2, int h){\
827 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
828 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
831 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
832 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
835 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
836 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
839 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
840 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
843 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
844 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
847 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
848 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
851 uint32_t a, b, c, d, l0, l1, h0, h1;\
852 a= LD32(&src1[i*src_stride1]);\
853 b= LD32(&src2[i*src_stride2]);\
854 c= LD32(&src3[i*src_stride3]);\
855 d= LD32(&src4[i*src_stride4]);\
856 l0= (a&0x03030303UL)\
859 h0= ((a&0xFCFCFCFCUL)>>2)\
860 + ((b&0xFCFCFCFCUL)>>2);\
861 l1= (c&0x03030303UL)\
863 h1= ((c&0xFCFCFCFCUL)>>2)\
864 + ((d&0xFCFCFCFCUL)>>2);\
865 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
866 a= LD32(&src1[i*src_stride1+4]);\
867 b= LD32(&src2[i*src_stride2+4]);\
868 c= LD32(&src3[i*src_stride3+4]);\
869 d= LD32(&src4[i*src_stride4+4]);\
870 l0= (a&0x03030303UL)\
873 h0= ((a&0xFCFCFCFCUL)>>2)\
874 + ((b&0xFCFCFCFCUL)>>2);\
875 l1= (c&0x03030303UL)\
877 h1= ((c&0xFCFCFCFCUL)>>2)\
878 + ((d&0xFCFCFCFCUL)>>2);\
879 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
883 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
884 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
887 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
888 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
891 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
892 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
895 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
899 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
900 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
903 uint32_t a, b, c, d, l0, l1, h0, h1;\
904 a= LD32(&src1[i*src_stride1]);\
905 b= LD32(&src2[i*src_stride2]);\
906 c= LD32(&src3[i*src_stride3]);\
907 d= LD32(&src4[i*src_stride4]);\
908 l0= (a&0x03030303UL)\
911 h0= ((a&0xFCFCFCFCUL)>>2)\
912 + ((b&0xFCFCFCFCUL)>>2);\
913 l1= (c&0x03030303UL)\
915 h1= ((c&0xFCFCFCFCUL)>>2)\
916 + ((d&0xFCFCFCFCUL)>>2);\
917 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
918 a= LD32(&src1[i*src_stride1+4]);\
919 b= LD32(&src2[i*src_stride2+4]);\
920 c= LD32(&src3[i*src_stride3+4]);\
921 d= LD32(&src4[i*src_stride4+4]);\
922 l0= (a&0x03030303UL)\
925 h0= ((a&0xFCFCFCFCUL)>>2)\
926 + ((b&0xFCFCFCFCUL)>>2);\
927 l1= (c&0x03030303UL)\
929 h1= ((c&0xFCFCFCFCUL)>>2)\
930 + ((d&0xFCFCFCFCUL)>>2);\
931 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
934 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
935 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
936 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
937 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
939 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
940 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
941 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
942 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
945 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
947 int i, a0, b0, a1, b1;\
954 for(i=0; i<h; i+=2){\
960 block[0]= (a1+a0)>>2; /* FIXME non put */\
961 block[1]= (b1+b0)>>2;\
971 block[0]= (a1+a0)>>2;\
972 block[1]= (b1+b0)>>2;\
978 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
981 const uint32_t a= LD32(pixels );\
982 const uint32_t b= LD32(pixels+1);\
983 uint32_t l0= (a&0x03030303UL)\
986 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
987 + ((b&0xFCFCFCFCUL)>>2);\
991 for(i=0; i<h; i+=2){\
992 uint32_t a= LD32(pixels );\
993 uint32_t b= LD32(pixels+1);\
994 l1= (a&0x03030303UL)\
996 h1= ((a&0xFCFCFCFCUL)>>2)\
997 + ((b&0xFCFCFCFCUL)>>2);\
998 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1003 l0= (a&0x03030303UL)\
1006 h0= ((a&0xFCFCFCFCUL)>>2)\
1007 + ((b&0xFCFCFCFCUL)>>2);\
1008 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1014 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1017 for(j=0; j<2; j++){\
1019 const uint32_t a= LD32(pixels );\
1020 const uint32_t b= LD32(pixels+1);\
1021 uint32_t l0= (a&0x03030303UL)\
1024 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1025 + ((b&0xFCFCFCFCUL)>>2);\
1029 for(i=0; i<h; i+=2){\
1030 uint32_t a= LD32(pixels );\
1031 uint32_t b= LD32(pixels+1);\
1032 l1= (a&0x03030303UL)\
1033 + (b&0x03030303UL);\
1034 h1= ((a&0xFCFCFCFCUL)>>2)\
1035 + ((b&0xFCFCFCFCUL)>>2);\
1036 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1041 l0= (a&0x03030303UL)\
1044 h0= ((a&0xFCFCFCFCUL)>>2)\
1045 + ((b&0xFCFCFCFCUL)>>2);\
1046 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1050 pixels+=4-line_size*(h+1);\
1051 block +=4-line_size*h;\
1055 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1058 for(j=0; j<2; j++){\
1060 const uint32_t a= LD32(pixels );\
1061 const uint32_t b= LD32(pixels+1);\
1062 uint32_t l0= (a&0x03030303UL)\
1065 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1066 + ((b&0xFCFCFCFCUL)>>2);\
1070 for(i=0; i<h; i+=2){\
1071 uint32_t a= LD32(pixels );\
1072 uint32_t b= LD32(pixels+1);\
1073 l1= (a&0x03030303UL)\
1074 + (b&0x03030303UL);\
1075 h1= ((a&0xFCFCFCFCUL)>>2)\
1076 + ((b&0xFCFCFCFCUL)>>2);\
1077 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1082 l0= (a&0x03030303UL)\
1085 h0= ((a&0xFCFCFCFCUL)>>2)\
1086 + ((b&0xFCFCFCFCUL)>>2);\
1087 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1091 pixels+=4-line_size*(h+1);\
1092 block +=4-line_size*h;\
1096 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1097 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1098 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1099 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1100 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1101 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1102 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1103 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* OP for the "avg" family of the 32-bit variant: replaces a with
 * rnd_avg32(a, b). Expands to an assignment, so a must be an lvalue. */
1105 #define op_avg(a, b) a = rnd_avg32(a, b)
/* OP for the "put" family: plain store of b into a. Expands to an
 * assignment, so a must be an lvalue. */
1107 #define op_put(a, b) a = b
/* Rounded integer averages of two and of four values: the +1 / +2 term
 * rounds halves upwards before the shift.
 * Every argument is parenthesised so that an argument containing an operator
 * of lower precedence than '+' (for example |, & or ?:) is evaluated as a
 * unit instead of being regrouped by the expansion. Each argument is
 * evaluated exactly once. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* Single-stride front end for put_no_rnd_pixels16_l2(): the destination and
 * both sources a and b all use the same stride. */
1117 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1118 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
/* Single-stride front end for put_no_rnd_pixels8_l2(): the destination and
 * both sources a and b all use the same stride. */
1121 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1122 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/* Bilinear interpolation of an 8-pixel-wide block at a fractional offset
 * given in 1/16 pixel units.
 *   x16, y16 - horizontal / vertical fraction
 *   rounder  - constant added before the final shift
 * A, B, C and D are the weights of the top-left, top-right, bottom-left and
 * bottom-right source pixels. They always sum to 16*16 = 256, which is why
 * the weighted sum is normalised with >>8. Each output pixel reads source
 * columns k and k+1 of the current row and of the row below it
 * (src[stride+...]), so 9 columns of 2 rows are read per output row. */
1125 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1127 const int A=(16-x16)*(16-y16);
1128 const int B=( x16)*(16-y16);
1129 const int C=(16-x16)*( y16);
1130 const int D=( x16)*( y16);
1135 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1136 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1137 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1138 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1139 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1140 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1141 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1142 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General motion compensation for an 8-pixel-wide block: every destination
 * pixel is fetched from its own source position with bilinear
 * interpolation.
 *   s = 1<<shift is the fixed-point scale; the low "shift" bits of a source
 *   coordinate (src & (s-1)) are the interpolation fraction.
 * Four cases are distinguished per pixel, depending on whether src_x and
 * src_y lie inside the picture:
 *   both inside  - full bilinear interpolation of the 2x2 neighbourhood
 *   only x inside- horizontal interpolation on the row selected by clipping
 *                  src_y
 *   only y inside- vertical interpolation on the column selected by clipping
 *                  src_x
 *   neither      - the single pixel at the clipped position is copied
 * NOTE(review): presumably ox/oy are the start coordinates, dxx/dxy/dyx/dyy
 * the per-pixel increments and r the rounding term - confirm against the
 * statements that compute src_x/src_y and the final shifts. */
1148 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1149 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1152 const int s= 1<<shift;
1162 for(x=0; x<8; x++){ //XXX FIXME optimize
1163 int src_x, src_y, frac_x, frac_y, index;
1167 frac_x= src_x&(s-1);
1168 frac_y= src_y&(s-1);
/* the unsigned comparison rejects negative coordinates as well as
   coordinates that are too large */
1172 if((unsigned)src_x < width){
1173 if((unsigned)src_y < height){
1174 index= src_x + src_y*stride;
1175 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1176 + src[index +1]* frac_x )*(s-frac_y)
1177 + ( src[index+stride ]*(s-frac_x)
1178 + src[index+stride+1]* frac_x )* frac_y
1181 index= src_x + av_clip(src_y, 0, height)*stride;
1182 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1183 + src[index +1]* frac_x )*s
1187 if((unsigned)src_y < height){
1188 index= av_clip(src_x, 0, width) + src_y*stride;
1189 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1190 + src[index+stride ]* frac_y )*s
1193 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1194 dst[y*stride + x]= src[index ];
/* Third-pel motion compensation, position (0,0): no interpolation is needed,
 * so the call is forwarded to the plain put_pixels routine matching the
 * block width (2, 4, 8 or 16). */
1206 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1208 case 2: put_pixels2_c (dst, src, stride, height); break;
1209 case 4: put_pixels4_c (dst, src, stride, height); break;
1210 case 8: put_pixels8_c (dst, src, stride, height); break;
1211 case 16:put_pixels16_c(dst, src, stride, height); break;
/* Third-pel interpolation, horizontal only: weights 2:1 on the pixel and its
 * right neighbour. The division by 3 is done as a multiplication by 683
 * followed by >>11 (683/2048 is approximately 1/3); the +1 rounds. */
1215 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1217 for (i=0; i < height; i++) {
1218 for (j=0; j < width; j++) {
1219 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Third-pel interpolation, horizontal only: weights 1:2 on the pixel and its
 * right neighbour. The division by 3 is done as a multiplication by 683
 * followed by >>11 (683/2048 is approximately 1/3); the +1 rounds. */
1226 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1228 for (i=0; i < height; i++) {
1229 for (j=0; j < width; j++) {
1230 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/**
 * Third-pel "put": each output pixel is (2*src[j] + src[j+stride]) / 3,
 * rounded to nearest (multiply by 683, shift by 11).
 *
 * @param dst    top-left of the destination block
 * @param src    top-left of the source block; one extra row below is read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels written per row
 * @param height rows written
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel "put", two-dimensional: each output pixel is the rounded value of
 * (4*a + 3*b + 3*c + 2*d) / 12, where a = src[j], b = src[j+1],
 * c = src[j+stride], d = src[j+stride+1]. The division is a multiply by 2731
 * and a shift by 15 (2731/32768 is approximately 1/12).
 *
 * @param dst    top-left of the destination block
 * @param src    top-left of the source block; one extra column and row are read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels written per row
 * @param height rows written
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel "put", two-dimensional: each output pixel is the rounded value of
 * (3*a + 2*b + 4*c + 3*d) / 12, where a = src[j], b = src[j+1],
 * c = src[j+stride], d = src[j+stride+1] (multiply by 2731, shift by 15).
 *
 * @param dst    top-left of the destination block
 * @param src    top-left of the source block; one extra column and row are read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels written per row
 * @param height rows written
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel "put": each output pixel is (src[j] + 2*src[j+stride]) / 3,
 * rounded to nearest (multiply by 683, shift by 11).
 *
 * @param dst    top-left of the destination block
 * @param src    top-left of the source block; one extra row below is read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels written per row
 * @param height rows written
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel "put", two-dimensional: each output pixel is the rounded value of
 * (3*a + 4*b + 2*c + 3*d) / 12, where a = src[j], b = src[j+1],
 * c = src[j+stride], d = src[j+stride+1] (multiply by 2731, shift by 15).
 *
 * @param dst    top-left of the destination block
 * @param src    top-left of the source block; one extra column and row are read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels written per row
 * @param height rows written
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel "put", two-dimensional: each output pixel is the rounded value of
 * (2*a + 3*b + 3*c + 4*d) / 12, where a = src[j], b = src[j+1],
 * c = src[j+stride], d = src[j+stride+1] (multiply by 2731, shift by 15).
 *
 * @param dst    top-left of the destination block
 * @param src    top-left of the source block; one extra column and row are read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels written per row
 * @param height rows written
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel "avg" for the zero-offset position: delegated to the fixed-width
 * avg_pixelsN_c routine that matches width.
 *
 * @param dst    top-left of the destination block
 * @param src    top-left of the source block
 * @param stride row distance, passed through to the averaging routine
 * @param width  block width; only 2, 4, 8 and 16 are dispatched
 * @param height number of rows, passed through to the averaging routine
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    default: break; /* any other width is left untouched */
    }
}
/**
 * Third-pel "avg": computes (2*src[j] + src[j+1]) / 3 rounded to nearest
 * (multiply by 683, shift by 11) and averages it, rounding up, with the pixel
 * already stored in dst.
 *
 * @param dst    top-left of the destination block (read and written)
 * @param src    top-left of the source block; one extra column to the right is read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels updated per row
 * @param height rows updated
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel "avg": computes (src[j] + 2*src[j+1]) / 3 rounded to nearest
 * (multiply by 683, shift by 11) and averages it, rounding up, with the pixel
 * already stored in dst.
 *
 * @param dst    top-left of the destination block (read and written)
 * @param src    top-left of the source block; one extra column to the right is read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels updated per row
 * @param height rows updated
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel "avg": computes (2*src[j] + src[j+stride]) / 3 rounded to nearest
 * (multiply by 683, shift by 11) and averages it, rounding up, with the pixel
 * already stored in dst.
 *
 * @param dst    top-left of the destination block (read and written)
 * @param src    top-left of the source block; one extra row below is read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels updated per row
 * @param height rows updated
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel "avg", two-dimensional: computes the rounded value of
 * (4*a + 3*b + 3*c + 2*d) / 12 with a = src[j], b = src[j+1],
 * c = src[j+stride], d = src[j+stride+1] (multiply by 2731, shift by 15) and
 * averages it, rounding up, with the pixel already stored in dst.
 *
 * @param dst    top-left of the destination block (read and written)
 * @param src    top-left of the source block; one extra column and row are read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels updated per row
 * @param height rows updated
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel "avg", two-dimensional: computes the rounded value of
 * (3*a + 2*b + 4*c + 3*d) / 12 with a = src[j], b = src[j+1],
 * c = src[j+stride], d = src[j+stride+1] (multiply by 2731, shift by 15) and
 * averages it, rounding up, with the pixel already stored in dst.
 *
 * @param dst    top-left of the destination block (read and written)
 * @param src    top-left of the source block; one extra column and row are read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels updated per row
 * @param height rows updated
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel "avg": computes (src[j] + 2*src[j+stride]) / 3 rounded to nearest
 * (multiply by 683, shift by 11) and averages it, rounding up, with the pixel
 * already stored in dst.
 *
 * @param dst    top-left of the destination block (read and written)
 * @param src    top-left of the source block; one extra row below is read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels updated per row
 * @param height rows updated
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel "avg", two-dimensional: computes the rounded value of
 * (3*a + 4*b + 2*c + 3*d) / 12 with a = src[j], b = src[j+1],
 * c = src[j+stride], d = src[j+stride+1] (multiply by 2731, shift by 15) and
 * averages it, rounding up, with the pixel already stored in dst.
 *
 * @param dst    top-left of the destination block (read and written)
 * @param src    top-left of the source block; one extra column and row are read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels updated per row
 * @param height rows updated
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/**
 * Third-pel "avg", two-dimensional: computes the rounded value of
 * (2*a + 3*b + 3*c + 4*d) / 12 with a = src[j], b = src[j+1],
 * c = src[j+stride], d = src[j+stride+1] (multiply by 2731, shift by 15) and
 * averages it, rounding up, with the pixel already stored in dst.
 *
 * @param dst    top-left of the destination block (read and written)
 * @param src    top-left of the source block; one extra column and row are read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param width  pixels updated per row
 * @param height rows updated
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i, j;

    for (i = 0; i < height; i++) {
        for (j = 0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        /* step both pointers to the next row */
        src += stride;
        dst += stride;
    }
}
/*
 * TPEL_WIDTH(width) defines nine fixed-width wrappers,
 * put_tpel_pixels<width>_mcXY_c(dst, src, stride, height), each forwarding to
 * the generic put_tpel_pixels_mcXY_c() with the block width filled in.
 *
 * The forwarding statements must be plain calls: a leading "void" would turn
 * each one into a (malformed) function declaration, so nothing would be
 * forwarded.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/**
 * H264_CHROMA_MC(OPNAME, OP) defines OPNAME##h264_chroma_mc2_c,
 * OPNAME##h264_chroma_mc4_c and OPNAME##h264_chroma_mc8_c, which produce
 * 2, 4 and 8 output pixels per row respectively.
 *
 * x and y are asserted to lie in 0..7. They yield the bilinear weights
 * A, B, C, D applied to a pixel, its right neighbour, the pixel one stride
 * further on and that pixel's right neighbour; the four weights always add up
 * to 64. OP(dst, sum) receives the un-normalised weighted sum and is
 * responsible for scaling and storing it.
 *
 * NOTE(review): the code that iterates over the h rows and advances dst/src
 * is not visible in this excerpt -- confirm against the full file.
 */
1421 #define H264_CHROMA_MC(OPNAME, OP)\
1422 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1423 const int A=(8-x)*(8-y);\
1424 const int B=( x)*(8-y);\
1425 const int C=(8-x)*( y);\
1426 const int D=( x)*( y);\
1429 assert(x<8 && y<8 && x>=0 && y>=0);\
1433 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1434 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1440 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1441 const int A=(8-x)*(8-y);\
1442 const int B=( x)*(8-y);\
1443 const int C=(8-x)*( y);\
1444 const int D=( x)*( y);\
1447 assert(x<8 && y<8 && x>=0 && y>=0);\
1451 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1452 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1453 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1454 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1460 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1461 const int A=(8-x)*(8-y);\
1462 const int B=( x)*(8-y);\
1463 const int C=(8-x)*( y);\
1464 const int D=( x)*( y);\
1467 assert(x<8 && y<8 && x>=0 && y>=0);\
1471 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1472 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1473 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1474 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1475 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1476 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1477 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1478 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* Store operations handed to H264_CHROMA_MC. b is a weighted sum whose
   weights total 64, so (b + 32)>>6 is the rounded normalised value.
   op_put stores it; op_avg stores the rounded-up mean of it and the value
   already in a. */
1484 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1485 #define op_put(a, b) a = (((b) + 32)>>6)
/* Instantiate the put_ and avg_ families of chroma functions. */
1487 H264_CHROMA_MC(put_ , op_put)
1488 H264_CHROMA_MC(avg_ , op_avg)
/**
 * 8-pixel-wide bilinear chroma interpolation with a downward rounding bias.
 *
 * Each output pixel is (A*p + B*right + C*below + D*below_right + 28) >> 6,
 * where the four weights are derived from x and y and always total 64. The
 * rounding constant is written as 32 - 4, i.e. slightly less than one half, so
 * exact halves round down instead of up.
 *
 * @param dst    top-left of the destination block, 8 pixels per row
 * @param src    top-left of the source block; 9 columns and h+1 rows are read
 * @param stride distance between consecutive rows, used for both dst and src
 * @param h      number of rows to produce
 * @param x      horizontal weight in eighths, must be in 0..7
 * @param y      vertical weight in eighths, must be in 0..7
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i, k;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        for(k=0; k<8; k++){
            dst[k] = (A*src[k] + B*src[k+1] + C*src[stride+k] + D*src[stride+k+1] + 32 - 4) >> 6;
        }
        /* step both pointers to the next row */
        dst+= stride;
        src+= stride;
    }
}
/**
 * QPEL_MC(r, OPNAME, RND, OP) generates one family of quarter-pel functions.
 *
 *  - OPNAME is the name prefix of every generated function and of the
 *    pixelsN_c / pixelsN_l2 / pixelsN_l4 helpers used for the final store.
 *  - RND is pasted into "put ## RND ## ..." to select the helper variant used
 *    for intermediate planes.
 *  - OP(dst, sum) stores one filtered sample; it is handed the un-normalised
 *    filter sum.
 *  - r is not referenced in the lines visible here.
 *
 * Generated functions:
 *  - OPNAME##mpeg4_qpel{8,16}_{h,v}_lowpass: symmetric 8-tap filter with
 *    coefficients (-1, 3, -6, 20, 20, -6, 3, -1); near the block edges the
 *    taps are taken from mirrored positions inside the 9- or 17-sample window.
 *  - OPNAME##qpel{8,16}_mcXY_c: judging by which filters each one applies,
 *    X selects the horizontal and Y the vertical sub-pel position.
 *  - ff_##OPNAME##qpel{8,16}_mcXY_old_c: non-static variants that blend the
 *    full, halfH, halfV and halfHV planes with pixelsN_l4 (or halfV and halfHV
 *    with pixelsN_l2).
 *
 * NOTE(review): several scratch-buffer declarations and the loop scaffolding
 * of the lowpass filters are not visible in this excerpt -- confirm against
 * the full file.
 */
1516 #define QPEL_MC(r, OPNAME, RND, OP) \
/* Horizontal lowpass, 8 outputs per row, reads src[0..8]. */\
1517 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1518 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1522 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1523 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1524 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1525 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1526 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1527 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1528 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1529 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* Vertical lowpass, 8 outputs per column, reads rows 0..8 of src. */\
1535 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1537 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1541 const int src0= src[0*srcStride];\
1542 const int src1= src[1*srcStride];\
1543 const int src2= src[2*srcStride];\
1544 const int src3= src[3*srcStride];\
1545 const int src4= src[4*srcStride];\
1546 const int src5= src[5*srcStride];\
1547 const int src6= src[6*srcStride];\
1548 const int src7= src[7*srcStride];\
1549 const int src8= src[8*srcStride];\
1550 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1551 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1552 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1553 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1554 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1555 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1556 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1557 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* Horizontal lowpass, 16 outputs per row, reads src[0..16]. */\
1563 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1564 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1569 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1570 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1571 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1572 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1573 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1574 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1575 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1576 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1577 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1578 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1579 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1580 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1581 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1582 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1583 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1584 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* Vertical lowpass, 16 outputs per column, reads rows 0..16 of src. */\
1590 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1591 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1596 const int src0= src[0*srcStride];\
1597 const int src1= src[1*srcStride];\
1598 const int src2= src[2*srcStride];\
1599 const int src3= src[3*srcStride];\
1600 const int src4= src[4*srcStride];\
1601 const int src5= src[5*srcStride];\
1602 const int src6= src[6*srcStride];\
1603 const int src7= src[7*srcStride];\
1604 const int src8= src[8*srcStride];\
1605 const int src9= src[9*srcStride];\
1606 const int src10= src[10*srcStride];\
1607 const int src11= src[11*srcStride];\
1608 const int src12= src[12*srcStride];\
1609 const int src13= src[13*srcStride];\
1610 const int src14= src[14*srcStride];\
1611 const int src15= src[15*srcStride];\
1612 const int src16= src[16*srcStride];\
1613 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1614 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1615 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1616 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1617 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1618 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1619 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1620 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1621 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1622 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1623 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1624 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1625 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1626 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1627 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1628 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* 8x8 entry points. mc00 is a plain copy/average of the source block. */\
1634 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1635 OPNAME ## pixels8_c(dst, src, stride, 8);\
1638 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1640 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1641 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1644 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1645 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1648 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1650 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1651 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1654 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1655 uint8_t full[16*9];\
1657 copy_block9(full, src, 16, stride, 9);\
1658 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1659 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1662 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1663 uint8_t full[16*9];\
1664 copy_block9(full, src, 16, stride, 9);\
1665 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1668 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1669 uint8_t full[16*9];\
1671 copy_block9(full, src, 16, stride, 9);\
1672 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1673 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
/* The _old_c variants blend four planes; the static ones blend two. */\
1675 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1676 uint8_t full[16*9];\
1679 uint8_t halfHV[64];\
1680 copy_block9(full, src, 16, stride, 9);\
1681 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1682 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1683 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1684 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1686 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1687 uint8_t full[16*9];\
1689 uint8_t halfHV[64];\
1690 copy_block9(full, src, 16, stride, 9);\
1691 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1692 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1693 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1694 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1696 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1697 uint8_t full[16*9];\
1700 uint8_t halfHV[64];\
1701 copy_block9(full, src, 16, stride, 9);\
1702 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1703 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1704 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1705 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1707 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1708 uint8_t full[16*9];\
1710 uint8_t halfHV[64];\
1711 copy_block9(full, src, 16, stride, 9);\
1712 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1713 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1714 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1715 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1717 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1718 uint8_t full[16*9];\
1721 uint8_t halfHV[64];\
1722 copy_block9(full, src, 16, stride, 9);\
1723 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1724 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1725 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1726 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1728 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1729 uint8_t full[16*9];\
1731 uint8_t halfHV[64];\
1732 copy_block9(full, src, 16, stride, 9);\
1733 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1734 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1735 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1736 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1738 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1739 uint8_t full[16*9];\
1742 uint8_t halfHV[64];\
1743 copy_block9(full, src, 16, stride, 9);\
1744 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1745 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1746 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1747 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1749 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1750 uint8_t full[16*9];\
1752 uint8_t halfHV[64];\
1753 copy_block9(full, src, 16, stride, 9);\
1754 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1755 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1756 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1757 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1759 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1761 uint8_t halfHV[64];\
1762 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1763 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1764 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1766 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1768 uint8_t halfHV[64];\
1769 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1770 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1771 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1773 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1774 uint8_t full[16*9];\
1777 uint8_t halfHV[64];\
1778 copy_block9(full, src, 16, stride, 9);\
1779 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1780 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1781 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1782 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1784 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1785 uint8_t full[16*9];\
1787 copy_block9(full, src, 16, stride, 9);\
1788 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1789 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1790 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1792 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1793 uint8_t full[16*9];\
1796 uint8_t halfHV[64];\
1797 copy_block9(full, src, 16, stride, 9);\
1798 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1799 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1800 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1801 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1803 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1804 uint8_t full[16*9];\
1806 copy_block9(full, src, 16, stride, 9);\
1807 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1808 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1809 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1811 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1813 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1814 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* 16x16 entry points, same structure as the 8x8 ones with 24-byte-wide "full" rows. */\
1816 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1817 OPNAME ## pixels16_c(dst, src, stride, 16);\
1820 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1822 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1823 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1826 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1827 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1830 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1832 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1833 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1836 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1837 uint8_t full[24*17];\
1839 copy_block17(full, src, 24, stride, 17);\
1840 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1841 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1844 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1845 uint8_t full[24*17];\
1846 copy_block17(full, src, 24, stride, 17);\
1847 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1850 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1851 uint8_t full[24*17];\
1853 copy_block17(full, src, 24, stride, 17);\
1854 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1855 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1857 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1858 uint8_t full[24*17];\
1859 uint8_t halfH[272];\
1860 uint8_t halfV[256];\
1861 uint8_t halfHV[256];\
1862 copy_block17(full, src, 24, stride, 17);\
1863 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1864 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1865 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1866 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1868 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1869 uint8_t full[24*17];\
1870 uint8_t halfH[272];\
1871 uint8_t halfHV[256];\
1872 copy_block17(full, src, 24, stride, 17);\
1873 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1874 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1875 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1876 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1878 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1879 uint8_t full[24*17];\
1880 uint8_t halfH[272];\
1881 uint8_t halfV[256];\
1882 uint8_t halfHV[256];\
1883 copy_block17(full, src, 24, stride, 17);\
1884 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1885 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1886 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1887 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1889 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1890 uint8_t full[24*17];\
1891 uint8_t halfH[272];\
1892 uint8_t halfHV[256];\
1893 copy_block17(full, src, 24, stride, 17);\
1894 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1895 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1896 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1897 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1899 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1900 uint8_t full[24*17];\
1901 uint8_t halfH[272];\
1902 uint8_t halfV[256];\
1903 uint8_t halfHV[256];\
1904 copy_block17(full, src, 24, stride, 17);\
1905 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1906 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1907 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1908 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1910 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1911 uint8_t full[24*17];\
1912 uint8_t halfH[272];\
1913 uint8_t halfHV[256];\
1914 copy_block17(full, src, 24, stride, 17);\
1915 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1916 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1917 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1918 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1920 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1921 uint8_t full[24*17];\
1922 uint8_t halfH[272];\
1923 uint8_t halfV[256];\
1924 uint8_t halfHV[256];\
1925 copy_block17(full, src, 24, stride, 17);\
1926 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1927 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1928 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1929 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1931 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1932 uint8_t full[24*17];\
1933 uint8_t halfH[272];\
1934 uint8_t halfHV[256];\
1935 copy_block17(full, src, 24, stride, 17);\
1936 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1937 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1938 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1939 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1941 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1942 uint8_t halfH[272];\
1943 uint8_t halfHV[256];\
1944 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1945 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1946 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1948 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1949 uint8_t halfH[272];\
1950 uint8_t halfHV[256];\
1951 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1952 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1953 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1955 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1956 uint8_t full[24*17];\
1957 uint8_t halfH[272];\
1958 uint8_t halfV[256];\
1959 uint8_t halfHV[256];\
1960 copy_block17(full, src, 24, stride, 17);\
1961 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1962 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1963 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1964 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1966 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1967 uint8_t full[24*17];\
1968 uint8_t halfH[272];\
1969 copy_block17(full, src, 24, stride, 17);\
1970 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1971 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1972 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1974 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1975 uint8_t full[24*17];\
1976 uint8_t halfH[272];\
1977 uint8_t halfV[256];\
1978 uint8_t halfHV[256];\
1979 copy_block17(full, src, 24, stride, 17);\
1980 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1981 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1982 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1983 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
1985 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1986 uint8_t full[24*17];\
1987 uint8_t halfH[272];\
1988 copy_block17(full, src, 24, stride, 17);\
1989 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1990 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1991 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1993 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1994 uint8_t halfH[272];\
1995 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1996 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store operations handed to QPEL_MC. b is a filter sum whose coefficients
   total 32, so it is scaled with >>5 and then clamped through cm, the
   crop-table pointer each generated function declares locally. The regular
   forms add 16 before the shift; the _no_rnd forms add 15 and, for averaging,
   also drop the +1 from the mean. */
1999 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2000 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2001 #define op_put(a, b) a = cm[((b) + 16)>>5]
2002 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the put_, put_no_rnd_ and avg_ families; an avg_no_rnd family
   is present only as a commented-out line. */
2004 QPEL_MC(0, put_ , _ , op_put)
2005 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2006 QPEL_MC(0, avg_ , _ , op_avg)
2007 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* Drop the helper macros again once the families have been generated. */
2009 #undef op_avg_no_rnd
2011 #undef op_put_no_rnd
/*
 * H264_LOWPASS(OPNAME, OP, OP2): generates the static lowpass helpers
 * OPNAME##h264_qpel{2,4,8,16}_{h,v,hv}_lowpass used for H.264 quarter-pel
 * interpolation.
 *
 * - _h_ / _v_ helpers apply the 6-tap kernel (1, -5, 20, 20, -5, 1) along a
 *   row / a column and pass the unscaled sum to OP(dst[..], sum). The kernel
 *   reads two samples before and three samples after the first output.
 * - _hv_ helpers first move src two rows up and run the horizontal kernel
 *   for h+5 rows into the int16_t buffer tmp, then step tmp back by
 *   tmpStride*(h+5-2) and run the same kernel vertically over tmp, passing
 *   the result to OP2(dst[..], sum).
 * - qpel16 helpers are four calls to the matching qpel8 helper, one per
 *   8x8 quadrant; the hv form passes tmp and tmp+8 for the left and right
 *   halves and passes the same two pointers again for the lower quadrants.
 *
 * OP and OP2 are expected to be macros of the form op(a, b); every generated
 * qpel2/4/8 helper defines cm = ff_cropTbl + MAX_NEG_CROP for them to use.
 *
 * NOTE(review): in this excerpt the declarations of h, w and i, the loop
 * headers, the per-row pointer increments and the closing braces are not
 * visible, so the iteration over rows/columns is not documented here.
 */
2014 #define H264_LOWPASS(OPNAME, OP, OP2) \
2015 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2017 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2021 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2022 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2028 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2030 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2034 const int srcB= src[-2*srcStride];\
2035 const int srcA= src[-1*srcStride];\
2036 const int src0= src[0 *srcStride];\
2037 const int src1= src[1 *srcStride];\
2038 const int src2= src[2 *srcStride];\
2039 const int src3= src[3 *srcStride];\
2040 const int src4= src[4 *srcStride];\
2041 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2042 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2048 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2051 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2053 src -= 2*srcStride;\
2054 for(i=0; i<h+5; i++)\
2056 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2057 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2061 tmp -= tmpStride*(h+5-2);\
2064 const int tmpB= tmp[-2*tmpStride];\
2065 const int tmpA= tmp[-1*tmpStride];\
2066 const int tmp0= tmp[0 *tmpStride];\
2067 const int tmp1= tmp[1 *tmpStride];\
2068 const int tmp2= tmp[2 *tmpStride];\
2069 const int tmp3= tmp[3 *tmpStride];\
2070 const int tmp4= tmp[4 *tmpStride];\
2071 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2072 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2077 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2079 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2083 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2084 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2085 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2086 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2092 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2094 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2098 const int srcB= src[-2*srcStride];\
2099 const int srcA= src[-1*srcStride];\
2100 const int src0= src[0 *srcStride];\
2101 const int src1= src[1 *srcStride];\
2102 const int src2= src[2 *srcStride];\
2103 const int src3= src[3 *srcStride];\
2104 const int src4= src[4 *srcStride];\
2105 const int src5= src[5 *srcStride];\
2106 const int src6= src[6 *srcStride];\
2107 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2108 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2109 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2110 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2116 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2119 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2121 src -= 2*srcStride;\
2122 for(i=0; i<h+5; i++)\
2124 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2125 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2126 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2127 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2131 tmp -= tmpStride*(h+5-2);\
2134 const int tmpB= tmp[-2*tmpStride];\
2135 const int tmpA= tmp[-1*tmpStride];\
2136 const int tmp0= tmp[0 *tmpStride];\
2137 const int tmp1= tmp[1 *tmpStride];\
2138 const int tmp2= tmp[2 *tmpStride];\
2139 const int tmp3= tmp[3 *tmpStride];\
2140 const int tmp4= tmp[4 *tmpStride];\
2141 const int tmp5= tmp[5 *tmpStride];\
2142 const int tmp6= tmp[6 *tmpStride];\
2143 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2144 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2145 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2146 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2152 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2154 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2158 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2159 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2160 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2161 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2162 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2163 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2164 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2165 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2171 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2173 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2177 const int srcB= src[-2*srcStride];\
2178 const int srcA= src[-1*srcStride];\
2179 const int src0= src[0 *srcStride];\
2180 const int src1= src[1 *srcStride];\
2181 const int src2= src[2 *srcStride];\
2182 const int src3= src[3 *srcStride];\
2183 const int src4= src[4 *srcStride];\
2184 const int src5= src[5 *srcStride];\
2185 const int src6= src[6 *srcStride];\
2186 const int src7= src[7 *srcStride];\
2187 const int src8= src[8 *srcStride];\
2188 const int src9= src[9 *srcStride];\
2189 const int src10=src[10*srcStride];\
2190 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2191 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2192 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2193 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2194 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2195 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2196 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2197 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2203 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2206 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2208 src -= 2*srcStride;\
2209 for(i=0; i<h+5; i++)\
2211 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2212 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2213 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2214 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2215 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2216 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2217 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2218 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2222 tmp -= tmpStride*(h+5-2);\
2225 const int tmpB= tmp[-2*tmpStride];\
2226 const int tmpA= tmp[-1*tmpStride];\
2227 const int tmp0= tmp[0 *tmpStride];\
2228 const int tmp1= tmp[1 *tmpStride];\
2229 const int tmp2= tmp[2 *tmpStride];\
2230 const int tmp3= tmp[3 *tmpStride];\
2231 const int tmp4= tmp[4 *tmpStride];\
2232 const int tmp5= tmp[5 *tmpStride];\
2233 const int tmp6= tmp[6 *tmpStride];\
2234 const int tmp7= tmp[7 *tmpStride];\
2235 const int tmp8= tmp[8 *tmpStride];\
2236 const int tmp9= tmp[9 *tmpStride];\
2237 const int tmp10=tmp[10*tmpStride];\
2238 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2239 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2240 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2241 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2242 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2243 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2244 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2245 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2251 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2252 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2253 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2254 src += 8*srcStride;\
2255 dst += 8*dstStride;\
2256 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2257 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2260 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2261 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2262 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2263 src += 8*srcStride;\
2264 dst += 8*dstStride;\
2265 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2266 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2269 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2270 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2271 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2272 src += 8*srcStride;\
2273 dst += 8*dstStride;\
2274 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2275 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/*
 * H264_MC(OPNAME, SIZE): generates the sixteen static entry points
 * OPNAME##h264_qpel##SIZE##_mcXY_c(dst, src, stride) with X and Y in 0..3.
 *
 * How each entry point is built from the lowpass helpers:
 * - mc00: forwards to OPNAME##pixels##SIZE##_c with SIZE rows.
 * - mc20 / mc02 / mc22: write the h / v / hv lowpass straight into dst using
 *   the OPNAME variant of the helper (mc02 filters a local copy, see below).
 * - mc10 / mc30: put_ h-lowpass into half, then OPNAME pixels_l2 of half
 *   with src (mc10) or src+1 (mc30).
 * - mc01 / mc03: put_ v-lowpass into half, then OPNAME pixels_l2 of half
 *   with full_mid (mc01) or full_mid+SIZE, i.e. one row lower (mc03).
 * - mc11 / mc31 / mc13 / mc33: OPNAME pixels_l2 of halfH and halfV, where
 *   halfH is the h-lowpass of src (y1) or src+stride (y3) and halfV is the
 *   v-lowpass of a copy taken at src (x1) or src+1 (x3).
 * - mc21 / mc23: OPNAME pixels_l2 of halfH (src or src+stride) and halfHV.
 * - mc12 / mc32: OPNAME pixels_l2 of halfV (copy at src or src+1) and halfHV.
 *
 * Vertical filtering always runs on full, a SIZE-wide copy of SIZE+5 rows
 * starting two rows above src; full_mid = full + SIZE*2 skips those two
 * leading rows. Intermediate planes are always produced with the put_
 * helpers, so OPNAME only affects the final store into dst.
 *
 * NOTE(review): pixels_l2 and copy_block are defined outside this excerpt;
 * pixels_l2 presumably averages its two sources -- confirm. The closing
 * braces of the generated functions are not visible here.
 */
2278 #define H264_MC(OPNAME, SIZE) \
2279 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2280 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2283 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2284 uint8_t half[SIZE*SIZE];\
2285 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2286 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2289 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2290 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2293 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2294 uint8_t half[SIZE*SIZE];\
2295 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2296 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2299 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2300 uint8_t full[SIZE*(SIZE+5)];\
2301 uint8_t * const full_mid= full + SIZE*2;\
2302 uint8_t half[SIZE*SIZE];\
2303 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2304 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2305 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2308 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2309 uint8_t full[SIZE*(SIZE+5)];\
2310 uint8_t * const full_mid= full + SIZE*2;\
2311 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2312 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2315 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2316 uint8_t full[SIZE*(SIZE+5)];\
2317 uint8_t * const full_mid= full + SIZE*2;\
2318 uint8_t half[SIZE*SIZE];\
2319 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2320 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2321 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2324 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2325 uint8_t full[SIZE*(SIZE+5)];\
2326 uint8_t * const full_mid= full + SIZE*2;\
2327 uint8_t halfH[SIZE*SIZE];\
2328 uint8_t halfV[SIZE*SIZE];\
2329 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2330 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2331 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2332 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2335 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2336 uint8_t full[SIZE*(SIZE+5)];\
2337 uint8_t * const full_mid= full + SIZE*2;\
2338 uint8_t halfH[SIZE*SIZE];\
2339 uint8_t halfV[SIZE*SIZE];\
2340 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2341 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2342 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2343 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2346 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2347 uint8_t full[SIZE*(SIZE+5)];\
2348 uint8_t * const full_mid= full + SIZE*2;\
2349 uint8_t halfH[SIZE*SIZE];\
2350 uint8_t halfV[SIZE*SIZE];\
2351 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2352 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2353 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2354 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2357 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2358 uint8_t full[SIZE*(SIZE+5)];\
2359 uint8_t * const full_mid= full + SIZE*2;\
2360 uint8_t halfH[SIZE*SIZE];\
2361 uint8_t halfV[SIZE*SIZE];\
2362 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2363 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2364 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2365 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2368 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2369 int16_t tmp[SIZE*(SIZE+5)];\
2370 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2373 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2374 int16_t tmp[SIZE*(SIZE+5)];\
2375 uint8_t halfH[SIZE*SIZE];\
2376 uint8_t halfHV[SIZE*SIZE];\
2377 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2378 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2379 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2382 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2383 int16_t tmp[SIZE*(SIZE+5)];\
2384 uint8_t halfH[SIZE*SIZE];\
2385 uint8_t halfHV[SIZE*SIZE];\
2386 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2387 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2388 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2391 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2392 uint8_t full[SIZE*(SIZE+5)];\
2393 uint8_t * const full_mid= full + SIZE*2;\
2394 int16_t tmp[SIZE*(SIZE+5)];\
2395 uint8_t halfV[SIZE*SIZE];\
2396 uint8_t halfHV[SIZE*SIZE];\
2397 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2398 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2399 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2400 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2403 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2404 uint8_t full[SIZE*(SIZE+5)];\
2405 uint8_t * const full_mid= full + SIZE*2;\
2406 int16_t tmp[SIZE*(SIZE+5)];\
2407 uint8_t halfV[SIZE*SIZE];\
2408 uint8_t halfHV[SIZE*SIZE];\
2409 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2410 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2411 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2412 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/*
 * Store operators for the H.264 lowpass helpers; each one indexes a local
 * named cm that must exist where the macro expands.
 * op_put / op_avg are passed as OP (single filter pass): (b + 16) >> 5
 * rounds and divides the sum by 32.
 * op2_put / op2_avg are passed as OP2 (horizontal then vertical pass):
 * (b + 512) >> 10 rounds and divides the sum by 1024.
 * The avg forms then average the result with the current value of a,
 * rounding up.
 */
2415 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2416 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2417 #define op_put(a, b) a = cm[((b) + 16)>>5]
2418 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2419 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Generate the put_ and avg_ families of lowpass helpers. */
2421 H264_LOWPASS(put_ , op_put, op2_put)
2422 H264_LOWPASS(avg_ , op_avg, op2_avg)
/*
 * Per-sample helpers for H264_WEIGHT. They are plain text macros and rely on
 * block / dst / src, the weights, offset and log2_denom being in scope.
 * op_scale1: scales block[x] by weight, adds offset, shifts right by
 *            log2_denom and stores the av_clip_uint8() result back.
 * op_scale2: combines src[x]*weights and dst[x]*weightd plus offset, shifts
 *            right by log2_denom+1 and stores the av_clip_uint8() result
 *            in dst[x].
 */
2437 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2438 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/*
 * H264_WEIGHT(W,H): generates weight_h264_pixels<W>x<H>_c and
 * biweight_h264_pixels<W>x<H>_c.
 * weight:   offset is shifted left by log2_denom and, when log2_denom is
 *           non-zero, increased by 1 << (log2_denom-1) as a rounding term.
 * biweight: offset is replaced by ((offset + 1) | 1) << log2_denom.
 * Both loop over H rows, advancing the pointers by stride; the
 * "if(W==n) continue;" lines end a row once the width W has been reached,
 * so one macro body serves every width.
 * NOTE(review): the declaration of y and the op_scale1 / op_scale2
 * invocations between the width checks are not visible in this excerpt.
 */
2439 #define H264_WEIGHT(W,H) \
2440 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2442 offset <<= log2_denom; \
2443 if(log2_denom) offset += 1<<(log2_denom-1); \
2444 for(y=0; y<H; y++, block += stride){ \
2447 if(W==2) continue; \
2450 if(W==4) continue; \
2455 if(W==8) continue; \
2466 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2468 offset = ((offset + 1) | 1) << log2_denom; \
2469 for(y=0; y<H; y++, dst += stride, src += stride){ \
2472 if(W==2) continue; \
2475 if(W==4) continue; \
2480 if(W==8) continue; \
/*
 * Horizontal 4-tap lowpass, 8 outputs wide, for the wmv2 "mspel" functions.
 * Output i is cm[(9*(src[i] + src[i+1]) - (src[i-1] + src[i+2]) + 8) >> 4],
 * i.e. taps (-1, 9, 9, -1) with rounding and a divide by 16. The filter
 * therefore reads from src[-1] up to src[9].
 * NOTE(review): the loop over h rows and the dst/src stride advance are not
 * visible in this excerpt.
 */
2507 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
/* Offset into ff_cropTbl so that negative filter results index inside the table. */
2508 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2512 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2513 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2514 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2515 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2516 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2517 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2518 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2519 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* CAVS glue, compiled only when CONFIG_CAVS_DECODER is defined. */
2525 #ifdef CONFIG_CAVS_DECODER
/* Prototype only; the definition is not part of this excerpt. */
2527 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
/*
 * The four mc00 entry points below forward to the generic 8- or 16-wide
 * put/avg pixel routines, passing 8 or 16 as the last argument
 * (presumably the number of rows -- confirm against put_pixels8_c).
 */
2529 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2530 put_pixels8_c(dst, src, stride, 8);
2532 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2533 avg_pixels8_c(dst, src, stride, 8);
2535 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2536 put_pixels16_c(dst, src, stride, 16);
2538 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2539 avg_pixels16_c(dst, src, stride, 16);
2541 #endif /* CONFIG_CAVS_DECODER */
/* VC-1 / WMV3 glue, compiled when either decoder is configured. */
2543 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* Prototype only; the definition is not part of this excerpt. */
2545 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
/*
 * mc00 entry point: forwards to put_pixels8_c with 8 as the last argument.
 * The rnd parameter is not referenced in the visible body.
 */
2547 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2548 put_pixels8_c(dst, src, stride, 8);
2550 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
/* Declared only when the H.264 encoder is configured. */
2552 #if defined(CONFIG_H264_ENCODER)
/* Prototype only; the definition is not part of this excerpt. */
2554 void ff_h264dsp_init(DSPContext* c, AVCodecContext *avctx);
2555 #endif /* CONFIG_H264_ENCODER */
/*
 * Vertical counterpart of wmv2_mspel8_h_lowpass: produces 8 output rows for
 * one column with taps (-1, 9, 9, -1), rounding (+8) and a divide by 16.
 * It reads 11 input rows, from one row above src (src[-srcStride]) down to
 * src[9*srcStride].
 * NOTE(review): the loop over w columns and the per-column src/dst advance
 * are not visible in this excerpt.
 */
2557 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
/* Offset into ff_cropTbl so that negative filter results index inside the table. */
2558 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* Load the whole column once so each sample is read a single time. */
2562 const int src_1= src[ -srcStride];
2563 const int src0 = src[0 ];
2564 const int src1 = src[ srcStride];
2565 const int src2 = src[2*srcStride];
2566 const int src3 = src[3*srcStride];
2567 const int src4 = src[4*srcStride];
2568 const int src5 = src[5*srcStride];
2569 const int src6 = src[6*srcStride];
2570 const int src7 = src[7*srcStride];
2571 const int src8 = src[8*srcStride];
2572 const int src9 = src[9*srcStride];
2573 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2574 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2575 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2576 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2577 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2578 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2579 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2580 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* mspel 8x8, no filtering: forwards to put_pixels8_c with 8 as the last argument. */
2586 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2587 put_pixels8_c(dst, src, stride, 8);
/*
 * mspel 8x8: horizontal lowpass of src into half (stride 8, 8 rows), then
 * put_pixels8_l2 combines src with half into dst.
 * NOTE(review): the declaration of half is not visible in this excerpt, and
 * put_pixels8_l2 presumably averages its two sources -- confirm.
 */
2590 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2592 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2593 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
/* mspel 8x8: horizontal lowpass of src written directly into dst, 8 rows. */
2596 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2597 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/*
 * mspel 8x8: like put_mspel8_mc10_c, but put_pixels8_l2 combines half with
 * src+1 (the pixels one column to the right) instead of src.
 * NOTE(review): the declaration of half is not visible in this excerpt.
 */
2600 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2602 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2603 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
/* mspel 8x8: vertical lowpass of src written directly into dst, 8 columns. */
2606 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2607 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/*
 * mspel 8x8: combines a vertically filtered plane (halfV) with a plane that
 * is filtered horizontally and then vertically (halfHV).
 * NOTE(review): the declarations of halfH, halfV and halfHV are not visible
 * in this excerpt.
 */
2610 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
/* 11 rows starting one row above src: the rows the vertical filter reads. */
2614 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2615 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
/* halfH has stride 8, so halfH+8 skips its first row (the one above src). */
2616 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2617 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/*
 * mspel 8x8: same structure as put_mspel8_mc12_c, except that halfV is the
 * vertical lowpass of src+1 (one column to the right).
 * NOTE(review): the declarations of halfH, halfV and halfHV are not visible
 * in this excerpt.
 */
2619 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
/* 11 rows starting one row above src: the rows the vertical filter reads. */
2623 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2624 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
/* halfH has stride 8, so halfH+8 skips its first row (the one above src). */
2625 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2626 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/*
 * mspel 8x8: horizontal lowpass over 11 rows (starting one row above src)
 * into halfH, then vertical lowpass of halfH+8 (skipping that first row,
 * stride 8) written directly into dst.
 * NOTE(review): the declaration of halfH is not visible in this excerpt.
 */
2628 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2630 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2631 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/*
 * H.263 loop filter working on the four vertically adjacent pixels
 * p0..p3 at rows -2, -1, 0, +1 relative to src, for column x.
 * The filter strength is looked up from ff_h263_loop_filter_strength[qscale].
 * NOTE(review): the loop over x, the declarations of d1/d2/ad1, the final
 * else of the d1 ramp and the lines applying d1 to p1/p2 are not visible in
 * this excerpt.
 */
2634 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2636 const int strength= ff_h263_loop_filter_strength[qscale];
2640 int p0= src[x-2*stride];
2641 int p1= src[x-1*stride];
2642 int p2= src[x+0*stride];
2643 int p3= src[x+1*stride];
2644 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* Piecewise ramp: d1 follows d for |d| < strength and falls back towards 0 between strength and 2*strength. */
2646 if (d<-2*strength) d1= 0;
2647 else if(d<- strength) d1=-2*strength - d;
2648 else if(d< strength) d1= d;
2649 else if(d< 2*strength) d1= 2*strength - d;
/*
 * Clamp to 0..255 when bit 8 is set: ~(p>>31) is 0 for a negative value and
 * -1 (stored as 255 in the uint8_t) otherwise; assumes a 32-bit int with
 * arithmetic right shift.
 */
2654 if(p1&256) p1= ~(p1>>31);
2655 if(p2&256) p2= ~(p2>>31);
2657 src[x-1*stride] = p1;
2658 src[x+0*stride] = p2;
/* Smaller correction for the outer pixels, limited to +-ad1. */
2662 d2= av_clip((p0-p3)/4, -ad1, ad1);
2664 src[x-2*stride] = p0 - d2;
2665 src[x+ stride] = p3 + d2;
/*
 * H.263 loop filter working on the four horizontally adjacent pixels
 * p0..p3 at columns -2, -1, 0, +1 relative to src, for row y.
 * Same arithmetic as h263_v_loop_filter_c with the roles of row and column
 * swapped.
 * NOTE(review): the loop over y, the declarations of d1/d2/ad1, the final
 * else of the d1 ramp and the lines applying d1 to p1/p2 are not visible in
 * this excerpt.
 */
2669 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2671 const int strength= ff_h263_loop_filter_strength[qscale];
2675 int p0= src[y*stride-2];
2676 int p1= src[y*stride-1];
2677 int p2= src[y*stride+0];
2678 int p3= src[y*stride+1];
2679 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* Piecewise ramp: d1 follows d for |d| < strength and falls back towards 0 between strength and 2*strength. */
2681 if (d<-2*strength) d1= 0;
2682 else if(d<- strength) d1=-2*strength - d;
2683 else if(d< strength) d1= d;
2684 else if(d< 2*strength) d1= 2*strength - d;
/*
 * Clamp to 0..255 when bit 8 is set: ~(p>>31) is 0 for a negative value and
 * -1 (stored as 255 in the uint8_t) otherwise; assumes a 32-bit int with
 * arithmetic right shift.
 */
2689 if(p1&256) p1= ~(p1>>31);
2690 if(p2&256) p2= ~(p2>>31);
2692 src[y*stride-1] = p1;
2693 src[y*stride+0] = p2;
/* Smaller correction for the outer pixels, limited to +-ad1. */
2697 d2= av_clip((p0-p3)/4, -ad1, ad1);
2699 src[y*stride-2] = p0 - d2;
2700 src[y*stride+1] = p3 + d2;
/*
 * H.261 loop filter on an 8x8 block, done in place via the scratch array
 * temp (row stride 8): a vertical (1, 2, 1) pass into temp followed by a
 * horizontal (1, 2, 1) pass back into src.
 * NOTE(review): the declarations of temp/x/y/xy/yz, the loop headers and the
 * computation of yz are not visible in this excerpt.
 */
2704 static void h261_loop_filter_c(uint8_t *src, int stride){
/* Rows 0 and 7 are not filtered vertically; 4*src matches the gain of the (1,2,1) rows. */
2709 temp[x ] = 4*src[x ];
2710 temp[x + 7*8] = 4*src[x + 7*stride];
2714 xy = y * stride + x;
2716 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
/* Columns 0 and 7 only undo the vertical gain of 4 (round, then >>2). */
2721 src[ y*stride] = (temp[ y*8] + 2)>>2;
2722 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2724 xy = y * stride + x;
/* Horizontal (1,2,1) over temp; the combined gain of 16 is removed with rounding. */
2726 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/*
 * H.264 luma deblocking across one edge. pix points at q0; p0..p2 lie at
 * -1..-3 xstride steps and q1, q2 at +1, +2 xstride steps from it.
 * The outer loop runs over 4 groups (one tc0[i] each), the inner loop over
 * 4 positions per group.
 * NOTE(review): the declarations of i/d/tc/i_delta, any early-out on tc0[i],
 * the tc adjustments and the pix advance by ystride are not visible in this
 * excerpt.
 */
2731 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2734 for( i = 0; i < 4; i++ ) {
2739 for( d = 0; d < 4; d++ ) {
2740 const int p0 = pix[-1*xstride];
2741 const int p1 = pix[-2*xstride];
2742 const int p2 = pix[-3*xstride];
2743 const int q0 = pix[0];
2744 const int q1 = pix[1*xstride];
2745 const int q2 = pix[2*xstride];
/* Filter only where the step across the edge and the gradients on both sides are below the thresholds. */
2747 if( FFABS( p0 - q0 ) < alpha &&
2748 FFABS( p1 - p0 ) < beta &&
2749 FFABS( q1 - q0 ) < beta ) {
/* p1 is adjusted only when the p side is smooth; the change is limited to +-tc0[i]. */
2754 if( FFABS( p2 - p0 ) < beta ) {
2755 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
/* Same for q1 on the q side. */
2758 if( FFABS( q2 - q0 ) < beta ) {
2759 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
/* Correction for the two edge pixels, limited to +-tc and applied with opposite signs. */
2763 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2764 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2765 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
/* Luma filter with xstride = stride and ystride = 1: the p/q samples are taken along a column. */
2771 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2773 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
/* Luma filter with xstride = 1 and ystride = stride: the p/q samples are taken along a row. */
2775 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2777 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/*
 * H.264 chroma deblocking across one edge. pix points at q0; p0, p1 lie at
 * -1, -2 xstride steps and q1 at +1 xstride step from it. Only p0 and q0
 * are modified.
 * The outer loop runs over 4 groups (tc = tc0[i]), the inner loop over
 * 2 positions per group.
 * NOTE(review): the declarations of i/d, any early-out on tc and the pix
 * advance by ystride are not visible in this excerpt.
 */
2780 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2783 for( i = 0; i < 4; i++ ) {
2784 const int tc = tc0[i];
2789 for( d = 0; d < 2; d++ ) {
2790 const int p0 = pix[-1*xstride];
2791 const int p1 = pix[-2*xstride];
2792 const int q0 = pix[0];
2793 const int q1 = pix[1*xstride];
/* Filter only where the step across the edge and the gradients on both sides are below the thresholds. */
2795 if( FFABS( p0 - q0 ) < alpha &&
2796 FFABS( p1 - p0 ) < beta &&
2797 FFABS( q1 - q0 ) < beta ) {
/* Correction limited to +-tc and applied with opposite signs to p0 and q0. */
2799 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2801 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2802 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
/* Chroma filter with xstride = stride and ystride = 1: the p/q samples are taken along a column. */
2808 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2810 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
/* Chroma filter with xstride = 1 and ystride = stride: the p/q samples are taken along a row. */
2812 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2814 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/*
 * H.264 intra chroma deblocking across one edge, 8 positions per call.
 * No tc0 table is used: where the thresholds pass, p0 and q0 are replaced
 * by fixed weighted averages of their neighbours.
 * NOTE(review): the declaration of d and the pix advance by ystride are not
 * visible in this excerpt.
 */
2817 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2820 for( d = 0; d < 8; d++ ) {
2821 const int p0 = pix[-1*xstride];
2822 const int p1 = pix[-2*xstride];
2823 const int q0 = pix[0];
2824 const int q1 = pix[1*xstride];
/* Filter only where the step across the edge and the gradients on both sides are below the thresholds. */
2826 if( FFABS( p0 - q0 ) < alpha &&
2827 FFABS( p1 - p0 ) < beta &&
2828 FFABS( q1 - q0 ) < beta ) {
/* Weights (2, 1, 1) / 4 with rounding; each result uses the original p/q values read above. */
2830 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2831 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
/* Intra chroma filter with xstride = stride and ystride = 1: the p/q samples are taken along a column. */
2836 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2838 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
/* Intra chroma filter with xstride = 1 and ystride = stride: the p/q samples are taken along a row. */
2840 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2842 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/*
 * Sum of absolute differences between pix1 and pix2 over 16 horizontally
 * adjacent pixels, accumulated into s (unrolled below).
 * v is not referenced in the visible lines.
 * NOTE(review): the declaration of s, the loop over h rows, the line_size
 * pointer advance and the return are not visible in this excerpt.
 */
2845 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2851 s += abs(pix1[0] - pix2[0]);
2852 s += abs(pix1[1] - pix2[1]);
2853 s += abs(pix1[2] - pix2[2]);
2854 s += abs(pix1[3] - pix2[3]);
2855 s += abs(pix1[4] - pix2[4]);
2856 s += abs(pix1[5] - pix2[5]);
2857 s += abs(pix1[6] - pix2[6]);
2858 s += abs(pix1[7] - pix2[7]);
2859 s += abs(pix1[8] - pix2[8]);
2860 s += abs(pix1[9] - pix2[9]);
2861 s += abs(pix1[10] - pix2[10]);
2862 s += abs(pix1[11] - pix2[11]);
2863 s += abs(pix1[12] - pix2[12]);
2864 s += abs(pix1[13] - pix2[13]);
2865 s += abs(pix1[14] - pix2[14]);
2866 s += abs(pix1[15] - pix2[15]);
/*
 * Like pix_abs16_c, but each pix1 sample is compared against
 * avg2(pix2[i], pix2[i+1]), i.e. a value derived from two horizontally
 * adjacent reference pixels. Reads pix2[0..16].
 * NOTE(review): avg2 is defined outside this excerpt; the declaration of s,
 * the loop over h rows, the pointer advance and the return are not visible.
 */
2873 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2879 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2880 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2881 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2882 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2883 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2884 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2885 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2886 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2887 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2888 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2889 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2890 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2891 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2892 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2893 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2894 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/*
 * Like pix_abs16_c, but each pix1 sample is compared against
 * avg2(pix2[i], pix3[i]), where pix3 is the reference row below pix2.
 * NOTE(review): avg2 is defined outside this excerpt; the declaration of s,
 * the loop over h rows, the pointer advance and the return are not visible.
 */
2901 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
/* pix3 addresses the row directly below pix2. */
2904 uint8_t *pix3 = pix2 + line_size;
2908 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2909 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2910 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2911 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2912 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2913 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2914 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2915 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2916 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2917 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2918 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2919 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2920 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2921 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2922 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2923 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/*
 * Like pix_abs16_c, but each pix1 sample is compared against avg4 of the
 * 2x2 reference neighbourhood pix2[i], pix2[i+1], pix3[i], pix3[i+1], where
 * pix3 is the row below pix2. Reads columns 0..16 of both reference rows.
 * NOTE(review): avg4 is defined outside this excerpt; the declaration of s,
 * the loop over h rows, the pointer advance and the return are not visible.
 */
2931 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
/* pix3 addresses the row directly below pix2. */
2934 uint8_t *pix3 = pix2 + line_size;
2938 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2939 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2940 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2941 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2942 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2943 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2944 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2945 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2946 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2947 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2948 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2949 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2950 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2951 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2952 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2953 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/*
 * Sum of absolute differences between pix1 and pix2 over 8 horizontally
 * adjacent pixels, accumulated into s (unrolled below).
 * v is not referenced in the visible lines.
 * NOTE(review): the declaration of s, the loop over h rows, the line_size
 * pointer advance and the return are not visible in this excerpt.
 */
2961 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2967 s += abs(pix1[0] - pix2[0]);
2968 s += abs(pix1[1] - pix2[1]);
2969 s += abs(pix1[2] - pix2[2]);
2970 s += abs(pix1[3] - pix2[3]);
2971 s += abs(pix1[4] - pix2[4]);
2972 s += abs(pix1[5] - pix2[5]);
2973 s += abs(pix1[6] - pix2[6]);
2974 s += abs(pix1[7] - pix2[7]);
/*
 * Like pix_abs8_c, but each pix1 sample is compared against
 * avg2(pix2[i], pix2[i+1]), i.e. a value derived from two horizontally
 * adjacent reference pixels. Reads pix2[0..8].
 * NOTE(review): avg2 is defined outside this excerpt; the declaration of s,
 * the loop over h rows, the pointer advance and the return are not visible.
 */
2981 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2987 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2988 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2989 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2990 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2991 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2992 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2993 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2994 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/*
 * Like pix_abs8_c, but each pix1 sample is compared against
 * avg2(pix2[i], pix3[i]), where pix3 is the reference row below pix2.
 * NOTE(review): avg2 is defined outside this excerpt; the declaration of s,
 * the loop over h rows, the pointer advance and the return are not visible.
 */
3001 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
/* pix3 addresses the row directly below pix2. */
3004 uint8_t *pix3 = pix2 + line_size;
3008 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3009 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3010 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3011 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3012 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3013 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3014 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3015 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* Absolute-difference sum of an 8-wide block against a reference
 * interpolated in both directions: column x of pix1 is compared with avg4()
 * of the 2x2 group pix2[x], pix2[x+1], pix3[x], pix3[x+1], where pix3 starts
 * line_size after pix2. Index 8 of both reference rows is read. */
3023 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3026 uint8_t *pix3 = pix2 + line_size;
3030 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3031 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3032 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3033 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3034 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3035 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3036 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3037 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Squared-error score with an extra term for 16-wide blocks.
 * score1 accumulates (s1[x]-s2[x])^2 over 16 columns.
 * score2 accumulates, over 15 columns, the difference between the absolute
 * 2x2 gradient of s1 and that of s2 (pixels x, x+1 on this row and the row
 * one stride further).
 * Returns score1 + |score2| * weight; weight is c->avctx->nsse_weight when a
 * context is supplied and 8 when v is NULL. */
3045 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3046 MpegEncContext *c = v;
3052 for(x=0; x<16; x++){
3053 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3056 for(x=0; x<15; x++){
3057 score2+= FFABS( s1[x ] - s1[x +stride]
3058 - s1[x+1] + s1[x+1+stride])
3059 -FFABS( s2[x ] - s2[x +stride]
3060 - s2[x+1] + s2[x+1+stride]);
3067 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3068 else return score1 + FFABS(score2)*8;
/* Same score as the 16-wide variant: score1 is a sum of squared differences,
 * score2 a sum of differences between the absolute 2x2 gradients of s1 and s2.
 * Returns score1 + |score2| * weight; weight is c->avctx->nsse_weight when a
 * context is supplied and 8 when v is NULL.
 * NOTE(review): the column loop bounds are not visible in this excerpt;
 * presumably 8 and 7 - confirm. */
3071 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3072 MpegEncContext *c = v;
3079 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3083 score2+= FFABS( s1[x ] - s1[x +stride]
3084 - s1[x+1] + s1[x+1+stride])
3085 -FFABS( s2[x ] - s2[x +stride]
3086 - s2[x+1] + s2[x+1+stride]);
3093 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3094 else return score1 + FFABS(score2)*8;
/* Evaluates the weighted error that would result from adding a scaled basis
 * function to the remainder, without modifying rem.
 * For each of the 64 coefficients: b = rem[i] + basis[i]*scale, rounded and
 * shifted right by (BASIS_SHIFT - RECON_SHIFT); b is asserted to lie strictly
 * between -512 and 512; (w*b)^2 >> 4 is added to sum.
 * NOTE(review): the definition of w is not visible in this excerpt;
 * presumably weight[i] - confirm. */
3097 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3101 for(i=0; i<8*8; i++){
3102 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3105 assert(-512<b && b<512);
3107 sum += (w*b)*(w*b)>>4;
/* Adds a scaled basis function to rem in place: each of the 64 entries gets
 * basis[i]*scale, rounded and shifted right by (BASIS_SHIFT - RECON_SHIFT). */
3112 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3115 for(i=0; i<8*8; i++){
3116 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3121 * Permutes an 8x8 block.
3122 * @param block the block which will be permuted according to the given permutation vector
3123 * @param permutation the permutation vector
3124 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
3125 * @param scantable the scantable in use; it is only used to speed the permutation up, the block is not
3126 * (inverse) permuted to scantable order!
/* Two passes over scantable positions 0..last. The second pass stores
 * temp[j] at block[permutation[j]] for every j = scantable[i].
 * NOTE(review): the body of the first pass is not visible in this excerpt;
 * presumably it saves block[j] into temp[j] - confirm. */
3128 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3134 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3136 for(i=0; i<=last; i++){
3137 const int j= scantable[i];
3142 for(i=0; i<=last; i++){
3143 const int j= scantable[i];
3144 const int perm_j= permutation[j];
3145 block[perm_j]= temp[j];
/* NOTE(review): only the signature is visible in this excerpt; judging by the
 * name this comparison always yields 0 - confirm against the body. */
3149 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fills the cmp[] table from the DSPContext table that corresponds to type.
 * The table is first cleared (5 entries), then each cmp[i] is copied from
 * c->hadamard8_diff, c->dct_sad, c->dct264_sad, c->dct_max, c->quant_psnr or
 * another table chosen on lines not visible in this excerpt. An "internal
 * error" message is logged, presumably for an unrecognised type - confirm.
 * NOTE(review): the memset size uses sizeof(void*) for an array of function
 * pointers; this assumes both pointer kinds have the same size. */
3153 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3156 memset(cmp, 0, sizeof(void*)*5);
3164 cmp[i]= c->hadamard8_diff[i];
3170 cmp[i]= c->dct_sad[i];
3173 cmp[i]= c->dct264_sad[i];
3176 cmp[i]= c->dct_max[i];
3179 cmp[i]= c->quant_psnr[i];
3199 #ifdef CONFIG_SNOW_ENCODER
3208 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3214 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* Zeroes 6*64 consecutive DCTELEM coefficients starting at blocks. */
3216 static void clear_blocks_c(DCTELEM *blocks)
3218 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/**
 * Adds src to dst byte-wise, in place, with uint8_t wrap-around.
 *
 * @param dst destination buffer, updated in place (dst[i] += src[i])
 * @param src source buffer
 * @param w   number of bytes to process; nothing is touched when w <= 0
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    /* main loop: eight bytes per iteration */
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    /* tail: the remaining w % 8 bytes */
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
/**
 * Stores the byte-wise difference src1 - src2 in dst, with uint8_t wrap-around.
 *
 * @param dst  destination buffer (dst[i] = src1[i] - src2[i])
 * @param src1 minuend buffer
 * @param src2 subtrahend buffer
 * @param w    number of bytes to process; nothing is written when w <= 0
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    /* main loop: eight bytes per iteration */
    for(i=0; i+7<w; i+=8){
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    /* tail: the remaining w % 8 bytes */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
/* Median prediction step: for each position the predictor is mid_pred() of
 * l, src1[i] and the gradient (l + src1[i] - lt) reduced to 8 bits.
 * NOTE(review): how l and lt are initialised and updated, and what is stored
 * in dst, *left and *left_top, is on lines not visible in this excerpt. */
3253 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3261 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3271 #define BUTTERFLY2(o1,o2,i1,i2) \
3275 #define BUTTERFLY1(x,y) \
3284 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* Cost of the 8x8 difference block src - dst after a butterfly transform.
 * Row pass (per row i): BUTTERFLY2 loads pairs of pixel differences into
 * temp[8*i..8*i+7], then two BUTTERFLY1 stages combine entries 2 and 4 apart.
 * Column pass (per column i): two BUTTERFLY1 stages combine rows 1 and 2
 * apart; the last stage (rows 4 apart) is folded into BUTTERFLYA, which
 * contributes |x+y| + |x-y| for each pair to the result.
 * The printf of the maximum is debugging output. */
3286 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3294 //FIXME try pointer walks
3295 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3296 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3297 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3298 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3300 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3301 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3302 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3303 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3305 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3306 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3307 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3308 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3312 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3313 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3314 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3315 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3317 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3318 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3319 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3320 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3323 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3324 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3325 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3326 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3332 printf("MAX:%d\n", maxi);
/* Intra variant of the butterfly-transform cost: the same row and column
 * butterfly stages are applied to the pixels of src alone (no reference is
 * read in the visible lines). After summing the BUTTERFLYA terms,
 * |temp[0] + temp[32]| is subtracted to remove the mean contribution. */
3338 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3346 //FIXME try pointer walks
3347 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3348 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3349 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3350 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3352 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3353 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3354 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3355 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3357 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3358 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3359 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3360 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3364 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3365 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3366 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3367 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3369 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3370 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3371 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3372 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3375 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3376 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3377 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3378 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3381 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* Sum of absolute coefficient values of the difference block.
 * c is used as an MpegEncContext; the pixel difference src1 - src2 is written
 * to an 8-byte-aligned 64-coefficient buffer via s->dsp.diff_pixels, and the
 * absolute values of its entries are summed.
 * NOTE(review): the transform applied between those two steps is on a line
 * not visible in this excerpt (presumably a forward DCT - confirm). */
3386 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3387 MpegEncContext * const s= (MpegEncContext *)c;
3388 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3389 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3394 s->dsp.diff_pixels(temp, src1, src2, stride);
3398 sum+= FFABS(temp[i]);
3405 const int s07 = SRC(0) + SRC(7);\
3406 const int s16 = SRC(1) + SRC(6);\
3407 const int s25 = SRC(2) + SRC(5);\
3408 const int s34 = SRC(3) + SRC(4);\
3409 const int a0 = s07 + s34;\
3410 const int a1 = s16 + s25;\
3411 const int a2 = s07 - s34;\
3412 const int a3 = s16 - s25;\
3413 const int d07 = SRC(0) - SRC(7);\
3414 const int d16 = SRC(1) - SRC(6);\
3415 const int d25 = SRC(2) - SRC(5);\
3416 const int d34 = SRC(3) - SRC(4);\
3417 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3418 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3419 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3420 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3422 DST(1, a4 + (a7>>2)) ;\
3423 DST(2, a2 + (a3>>1)) ;\
3424 DST(3, a5 + (a6>>2)) ;\
3426 DST(5, a6 - (a5>>2)) ;\
3427 DST(6, (a2>>1) - a3 ) ;\
3428 DST(7, (a4>>2) - a7 ) ;\
/* Sum of absolute coefficients after a separable 8-point integer transform of
 * the difference block src1 - src2.
 * First pass: SRC/DST address dct[i][x], so each row is transformed in place.
 * Second pass: SRC reads dct[x][i] (columns) and DST is redefined to add
 * |v| to sum instead of storing, so the result is accumulated directly.
 * NOTE(review): the statements inside both loops are not visible in this
 * excerpt; presumably the 1-D transform macro - confirm. */
3431 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3432 MpegEncContext * const s= (MpegEncContext *)c;
3437 s->dsp.diff_pixels(dct, src1, src2, stride);
3439 #define SRC(x) dct[i][x]
3440 #define DST(x,v) dct[i][x]= v
3441 for( i = 0; i < 8; i++ )
3446 #define SRC(x) dct[x][i]
3447 #define DST(x,v) sum += FFABS(v)
3448 for( i = 0; i < 8; i++ )
/* Largest absolute coefficient of the difference block.
 * The pixel difference src1 - src2 is written to an aligned 64-coefficient
 * buffer via s->dsp.diff_pixels; the result is the running maximum of
 * |temp[i]|.
 * NOTE(review): the transform applied in between is on a line not visible in
 * this excerpt (presumably a forward DCT - confirm). */
3456 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3457 MpegEncContext * const s= (MpegEncContext *)c;
3458 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3459 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3464 s->dsp.diff_pixels(temp, src1, src2, stride);
3468 sum= FFMAX(sum, FFABS(temp[i]));
/* Squared error introduced by quantising the difference block.
 * The aligned buffer holds two 64-coefficient halves: temp (working copy) and
 * bak (untouched copy of the pixel difference src1 - src2). temp is run
 * through fast_dct_quantize, dct_unquantize_inter and simple_idct, and the
 * squared differences temp[i] - bak[i] are summed.
 * Side effect: s->block_last_index[0] is overwritten. */
3473 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3474 MpegEncContext * const s= (MpegEncContext *)c;
3475 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3476 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3477 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3483 s->dsp.diff_pixels(temp, src1, src2, stride);
3485 memcpy(bak, temp, 64*sizeof(DCTELEM));
3487 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3488 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3489 simple_idct(temp); //FIXME
3492 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion cost of coding the 8x8 difference src1 - src2.
 * Visible steps:
 *  - 8 bytes per row of src2 are copied into bak (two 32-bit stores per row);
 *  - the pixel difference is quantised with fast_dct_quantize, which also
 *    yields the index of the last coefficient;
 *  - bits are accumulated from the intra or inter VLC length tables (plus the
 *    luma DC length in the intra case); a table entry is used when the
 *    biased level fits in 0..127, i.e. (level & ~127) == 0;
 *  - the block is unquantised (intra or inter), added onto bak with idct_add,
 *    and the squared error between bak and src1 is measured;
 *  - the result is distortion + ((bits * qscale^2 * 109 + 64) >> 7).
 * Side effect: s->block_last_index[0] is overwritten.
 * NOTE(review): the branch taken when the level does not fit the table is not
 * visible in this excerpt (presumably adds esc_length - confirm).
 * NOTE(review): aligned_bak is sized by the runtime value of stride, and the
 * row copy accesses uint8_t data through uint32_t pointers. */
3497 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3498 MpegEncContext * const s= (MpegEncContext *)c;
3499 const uint8_t *scantable= s->intra_scantable.permutated;
3500 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3501 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3502 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3503 uint8_t * const bak= (uint8_t*)aligned_bak;
3504 int i, last, run, bits, level, distoration, start_i;
3505 const int esc_length= s->ac_esc_length;
3507 uint8_t * last_length;
3512 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3513 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3516 s->dsp.diff_pixels(temp, src1, src2, stride);
3518 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3524 length = s->intra_ac_vlc_length;
3525 last_length= s->intra_ac_vlc_last_length;
3526 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3529 length = s->inter_ac_vlc_length;
3530 last_length= s->inter_ac_vlc_last_length;
3535 for(i=start_i; i<last; i++){
3536 int j= scantable[i];
3541 if((level&(~127)) == 0){
3542 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3551 level= temp[i] + 64;
3555 if((level&(~127)) == 0){
3556 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3564 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3566 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3569 s->dsp.idct_add(bak, stride, temp);
3571 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3573 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-count estimate for coding the 8x8 difference src1 - src2.
 * The pixel difference is quantised with fast_dct_quantize; bits are then
 * accumulated from the intra or inter VLC length tables (plus the luma DC
 * length in the intra case), using a table entry whenever the biased level
 * fits in 0..127. The coefficient at index last is costed with last_length.
 * Side effect: s->block_last_index[0] is overwritten.
 * NOTE(review): the escape branch and the return statement are not visible in
 * this excerpt; presumably esc_length is added and bits is returned - confirm. */
3576 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3577 MpegEncContext * const s= (MpegEncContext *)c;
3578 const uint8_t *scantable= s->intra_scantable.permutated;
3579 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3580 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3581 int i, last, run, bits, level, start_i;
3582 const int esc_length= s->ac_esc_length;
3584 uint8_t * last_length;
3588 s->dsp.diff_pixels(temp, src1, src2, stride);
3590 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3596 length = s->intra_ac_vlc_length;
3597 last_length= s->intra_ac_vlc_last_length;
3598 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3601 length = s->inter_ac_vlc_length;
3602 last_length= s->inter_ac_vlc_last_length;
3607 for(i=start_i; i<last; i++){
3608 int j= scantable[i];
3613 if((level&(~127)) == 0){
3614 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3623 level= temp[i] + 64;
3627 if((level&(~127)) == 0){
3628 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical absolute-difference score of a single 16-wide block: for each of
 * the 16 columns (processed four at a time) |s[x] - s[x+stride]| is added to
 * score, i.e. each pixel is compared with the one a stride further on. */
3636 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3641 for(x=0; x<16; x+=4){
3642 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride])
3643 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
/* Vertical absolute-difference score of the residual s1 - s2: for each of the
 * 16 columns, the residual at x minus the residual one stride further on is
 * taken in absolute value and added to score. */
3651 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3656 for(x=0; x<16; x++){
3657 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ(a): square of a; the argument is evaluated twice. */
3666 #define SQ(a) ((a)*(a))
/* Vertical squared-difference score of a single 16-wide block: for each of
 * the 16 columns (processed four at a time) (s[x] - s[x+stride])^2 is added
 * to score. */
3667 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3672 for(x=0; x<16; x+=4){
3673 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3674 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* Vertical squared-difference score of the residual s1 - s2: for each of the
 * 16 columns, the residual at x minus the residual one stride further on is
 * squared and added to score. */
3682 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3687 for(x=0; x<16; x++){
3688 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Instantiate the *16_c functions from their 8x8 counterparts.
 * NOTE(review): WARPER8_16_SQ is defined outside this excerpt; presumably it
 * generates a 16-wide wrapper that calls the 8x8 function on sub-blocks -
 * confirm against the macro definition. */
3697 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3698 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3699 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3701 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3703 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3704 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3705 WARPER8_16_SQ(rd8x8_c, rd16_c)
3706 WARPER8_16_SQ(bit8x8_c, bit16_c)
/* Element-wise operation over len floats of dst and src.
 * NOTE(review): the loop body is not visible in this excerpt; judging by the
 * name and signature it multiplies dst[i] by src[i] in place - confirm. */
3708 static void vector_fmul_c(float *dst, const float *src, int len){
3710 for(i=0; i<len; i++)
/* Multiplies src0 by src1 read backwards: dst[i] = src0[i] * src1[-i] for
 * i = 0..len-1.
 * NOTE(review): src1 is indexed with negative offsets here; whether it is
 * first advanced to its last element happens (if at all) on a line not
 * visible in this excerpt - confirm what callers must pass. */
3714 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3717 for(i=0; i<len; i++)
3718 dst[i] = src0[i] * src1[-i];
/**
 * Multiply two vectors element-wise, add a third vector and a constant, and
 * store the result with a stride.
 *
 * @param dst  output; element i is written to dst[i*step]
 * @param src0 first factor
 * @param src1 second factor
 * @param src2 vector added to the product
 * @param src3 integer constant added to every element
 * @param len  number of elements to process; nothing is written when len <= 0
 * @param step distance, in floats, between consecutive outputs in dst
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}
/* Converts len floats to int16 by working on their raw 32-bit patterns: the
 * bit pattern of src[i] is loaded as an integer and dst[i] is that value
 * minus 0x8000 (truncated to 16 bits by the store). On the visible
 * saturation line, tmp is replaced by the sign (0 or -1) of 0x43c0ffff - tmp.
 * NOTE(review): the condition guarding the saturation line is not visible in
 * this excerpt - confirm.
 * NOTE(review): the float buffer is read through an int32_t pointer, which
 * relies on the compiler tolerating this type punning. */
3727 void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
3729 for(i=0; i<len; i++) {
3730 int_fast32_t tmp = ((int32_t*)src)[i];
3732 tmp = (0x43c0ffff - tmp)>>31;
3733 // is this faster on some gcc/cpu combinations?
3734 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3737 dst[i] = tmp - 0x8000;
3741 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* Stores block into dest through put_pixels_clamped_c.
 * NOTE(review): the statement preceding the store is not visible in this
 * excerpt (presumably the inverse transform of block - confirm). */
3743 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3746 put_pixels_clamped_c(block, dest, line_size);
/* Adds block onto dest through add_pixels_clamped_c.
 * NOTE(review): the statement preceding the add is not visible in this
 * excerpt (presumably the inverse transform of block - confirm). */
3748 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3751 add_pixels_clamped_c(block, dest, line_size);
/* Stores block into dest through put_pixels_clamped4_c.
 * NOTE(review): the statement preceding the store is not visible in this
 * excerpt (presumably the reduced-size inverse transform - confirm). */
3754 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3757 put_pixels_clamped4_c(block, dest, line_size);
/* Adds block onto dest through add_pixels_clamped4_c.
 * NOTE(review): the statement preceding the add is not visible in this
 * excerpt (presumably the reduced-size inverse transform - confirm). */
3759 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3762 add_pixels_clamped4_c(block, dest, line_size);
/* Stores block into dest through put_pixels_clamped2_c.
 * NOTE(review): the statement preceding the store is not visible in this
 * excerpt (presumably the reduced-size inverse transform - confirm). */
3765 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3768 put_pixels_clamped2_c(block, dest, line_size);
/* Adds block onto dest through add_pixels_clamped2_c.
 * NOTE(review): the statement preceding the add is not visible in this
 * excerpt (presumably the reduced-size inverse transform - confirm). */
3770 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3773 add_pixels_clamped2_c(block, dest, line_size);
/* Single-pixel output: dest[0] is set to (block[0] + 4) >> 3, looked up
 * through the crop table (offset by MAX_NEG_CROP so negative indices are
 * valid). line_size is not used in the visible lines. */
3776 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3778 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3780 dest[0] = cm[(block[0] + 4)>>3];
/* Single-pixel accumulate: (block[0] + 4) >> 3 is added to dest[0] and the
 * sum is looked up through the crop table (offset by MAX_NEG_CROP so negative
 * indices are valid). line_size is not used in the visible lines. */
3782 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3784 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3786 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
/* No-op placeholder: returns immediately. Declared with an empty parameter
 * list, so it does not constrain the arguments a caller passes. */
3789 static void just_return() { return; }
3791 /* init static data */
/* Fills the file's global lookup tables:
 *  - ff_cropTbl: identity for indices 0..255 (stored at offset MAX_NEG_CROP)
 *    and 255 for the MAX_NEG_CROP entries above that range;
 *  - ff_squareTbl[i] = (i - 256)^2 for i = 0..511;
 *  - inv_zigzag_direct16: position + 1 of each coefficient in
 *    ff_zigzag_direct.
 * NOTE(review): the initialisation of the ff_cropTbl entries below
 * MAX_NEG_CROP is on a line not visible in this excerpt - confirm. */
3792 void dsputil_static_init(void)
3796 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3797 for(i=0;i<MAX_NEG_CROP;i++) {
3799 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3802 for(i=0;i<512;i++) {
3803 ff_squareTbl[i] = (i - 256) * (i - 256);
3806 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Checks that a stack variable declared with DECLARE_ALIGNED_16 really is
 * 16-byte aligned by testing the low four bits of its address. When it is not
 * and MMX or AltiVec support is compiled in, an error is logged; did_fail is
 * a static flag, presumably used to log only once - confirm.
 * NOTE(review): the return statements are not visible in this excerpt.
 * NOTE(review): the address is cast to int before masking; on platforms where
 * pointers are wider than int this truncates the pointer (the low bits that
 * are tested survive, but compilers warn about the cast). */
3809 int ff_check_alignment(void){
3810 static int did_fail=0;
3811 DECLARE_ALIGNED_16(int, aligned);
3813 if((int)&aligned & 15){
3815 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3816 av_log(NULL, AV_LOG_ERROR,
3817 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3818 "and may be very slow or crash. This is not a bug in libavcodec,\n"
3819 "but in the compiler. Do not report crashes to FFmpeg developers.\n");
3828 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3832 ff_check_alignment();
3834 #ifdef CONFIG_ENCODERS
3835 if(avctx->dct_algo==FF_DCT_FASTINT) {
3836 c->fdct = fdct_ifast;
3837 c->fdct248 = fdct_ifast248;
3839 else if(avctx->dct_algo==FF_DCT_FAAN) {
3840 c->fdct = ff_faandct;
3841 c->fdct248 = ff_faandct248;
3844 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3845 c->fdct248 = ff_fdct248_islow;
3847 #endif //CONFIG_ENCODERS
3849 if(avctx->lowres==1){
3850 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3851 c->idct_put= ff_jref_idct4_put;
3852 c->idct_add= ff_jref_idct4_add;
3854 c->idct_put= ff_h264_lowres_idct_put_c;
3855 c->idct_add= ff_h264_lowres_idct_add_c;
3857 c->idct = j_rev_dct4;
3858 c->idct_permutation_type= FF_NO_IDCT_PERM;
3859 }else if(avctx->lowres==2){
3860 c->idct_put= ff_jref_idct2_put;
3861 c->idct_add= ff_jref_idct2_add;
3862 c->idct = j_rev_dct2;
3863 c->idct_permutation_type= FF_NO_IDCT_PERM;
3864 }else if(avctx->lowres==3){
3865 c->idct_put= ff_jref_idct1_put;
3866 c->idct_add= ff_jref_idct1_add;
3867 c->idct = j_rev_dct1;
3868 c->idct_permutation_type= FF_NO_IDCT_PERM;
3870 if(avctx->idct_algo==FF_IDCT_INT){
3871 c->idct_put= ff_jref_idct_put;
3872 c->idct_add= ff_jref_idct_add;
3873 c->idct = j_rev_dct;
3874 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3875 }else if(avctx->idct_algo==FF_IDCT_VP3){
3876 c->idct_put= ff_vp3_idct_put_c;
3877 c->idct_add= ff_vp3_idct_add_c;
3878 c->idct = ff_vp3_idct_c;
3879 c->idct_permutation_type= FF_NO_IDCT_PERM;
3880 }else{ //accurate/default
3881 c->idct_put= simple_idct_put;
3882 c->idct_add= simple_idct_add;
3883 c->idct = simple_idct;
3884 c->idct_permutation_type= FF_NO_IDCT_PERM;
3888 c->h264_idct_add= ff_h264_idct_add_c;
3889 c->h264_idct8_add= ff_h264_idct8_add_c;
3890 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3891 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3893 c->get_pixels = get_pixels_c;
3894 c->diff_pixels = diff_pixels_c;
3895 c->put_pixels_clamped = put_pixels_clamped_c;
3896 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3897 c->add_pixels_clamped = add_pixels_clamped_c;
3898 c->add_pixels8 = add_pixels8_c;
3899 c->add_pixels4 = add_pixels4_c;
3902 c->clear_blocks = clear_blocks_c;
3903 c->pix_sum = pix_sum_c;
3904 c->pix_norm1 = pix_norm1_c;
3906 /* TODO [0] 16 [1] 8 */
3907 c->pix_abs[0][0] = pix_abs16_c;
3908 c->pix_abs[0][1] = pix_abs16_x2_c;
3909 c->pix_abs[0][2] = pix_abs16_y2_c;
3910 c->pix_abs[0][3] = pix_abs16_xy2_c;
3911 c->pix_abs[1][0] = pix_abs8_c;
3912 c->pix_abs[1][1] = pix_abs8_x2_c;
3913 c->pix_abs[1][2] = pix_abs8_y2_c;
3914 c->pix_abs[1][3] = pix_abs8_xy2_c;
3916 #define dspfunc(PFX, IDX, NUM) \
3917 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3918 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3919 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3920 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3922 dspfunc(put, 0, 16);
3923 dspfunc(put_no_rnd, 0, 16);
3925 dspfunc(put_no_rnd, 1, 8);
3929 dspfunc(avg, 0, 16);
3930 dspfunc(avg_no_rnd, 0, 16);
3932 dspfunc(avg_no_rnd, 1, 8);
3937 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3938 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3940 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3941 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3942 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3943 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3944 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3945 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3946 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3947 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
3948 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
3950 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
3951 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
3952 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
3953 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
3954 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
3955 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
3956 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
3957 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
3958 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
3960 #define dspfunc(PFX, IDX, NUM) \
3961 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
3962 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
3963 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
3964 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
3965 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
3966 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
3967 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
3968 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
3969 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
3970 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
3971 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
3972 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
3973 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
3974 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
3975 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
3976 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
3978 dspfunc(put_qpel, 0, 16);
3979 dspfunc(put_no_rnd_qpel, 0, 16);
3981 dspfunc(avg_qpel, 0, 16);
3982 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3984 dspfunc(put_qpel, 1, 8);
3985 dspfunc(put_no_rnd_qpel, 1, 8);
3987 dspfunc(avg_qpel, 1, 8);
3988 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3990 dspfunc(put_h264_qpel, 0, 16);
3991 dspfunc(put_h264_qpel, 1, 8);
3992 dspfunc(put_h264_qpel, 2, 4);
3993 dspfunc(put_h264_qpel, 3, 2);
3994 dspfunc(avg_h264_qpel, 0, 16);
3995 dspfunc(avg_h264_qpel, 1, 8);
3996 dspfunc(avg_h264_qpel, 2, 4);
3999 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4000 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4001 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4002 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4003 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4004 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4005 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4007 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4008 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4009 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4010 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4011 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4012 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4013 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4014 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4015 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4016 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4017 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4018 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4019 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4020 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4021 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4022 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4023 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4024 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4025 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4026 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4028 #ifdef CONFIG_CAVS_DECODER
4029 ff_cavsdsp_init(c,avctx);
4031 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4032 ff_vc1dsp_init(c,avctx);
4034 #if defined(CONFIG_H264_ENCODER)
4035 ff_h264dsp_init(c,avctx);
4038 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4039 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4040 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4041 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4042 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4043 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4044 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4045 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4047 #define SET_CMP_FUNC(name) \
4048 c->name[0]= name ## 16_c;\
4049 c->name[1]= name ## 8x8_c;
4051 SET_CMP_FUNC(hadamard8_diff)
4052 c->hadamard8_diff[4]= hadamard8_intra16_c;
4053 SET_CMP_FUNC(dct_sad)
4054 SET_CMP_FUNC(dct_max)
4056 SET_CMP_FUNC(dct264_sad)
4058 c->sad[0]= pix_abs16_c;
4059 c->sad[1]= pix_abs8_c;
4063 SET_CMP_FUNC(quant_psnr)
4066 c->vsad[0]= vsad16_c;
4067 c->vsad[4]= vsad_intra16_c;
4068 c->vsse[0]= vsse16_c;
4069 c->vsse[4]= vsse_intra16_c;
4070 c->nsse[0]= nsse16_c;
4071 c->nsse[1]= nsse8_c;
4072 #ifdef CONFIG_SNOW_ENCODER
4073 c->w53[0]= w53_16_c;
4075 c->w97[0]= w97_16_c;
4079 c->add_bytes= add_bytes_c;
4080 c->diff_bytes= diff_bytes_c;
4081 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4082 c->bswap_buf= bswap_buf;
4084 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4085 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4086 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4087 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4088 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4089 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4090 c->h264_loop_filter_strength= NULL;
4092 c->h263_h_loop_filter= h263_h_loop_filter_c;
4093 c->h263_v_loop_filter= h263_v_loop_filter_c;
4095 c->h261_loop_filter= h261_loop_filter_c;
4097 c->try_8x8basis= try_8x8basis_c;
4098 c->add_8x8basis= add_8x8basis_c;
4100 #ifdef CONFIG_SNOW_DECODER
4101 c->vertical_compose97i = ff_snow_vertical_compose97i;
4102 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4103 c->inner_add_yblock = ff_snow_inner_add_yblock;
4106 #ifdef CONFIG_VORBIS_DECODER
4107 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4109 c->vector_fmul = vector_fmul_c;
4110 c->vector_fmul_reverse = vector_fmul_reverse_c;
4111 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4112 c->float_to_int16 = ff_float_to_int16_c;
4114 c->shrink[0]= ff_img_copy_plane;
4115 c->shrink[1]= ff_shrink22;
4116 c->shrink[2]= ff_shrink44;
4117 c->shrink[3]= ff_shrink88;
4119 c->prefetch= just_return;
4121 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4122 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4125 dsputil_init_mmx(c, avctx);
4128 dsputil_init_armv4l(c, avctx);
4131 dsputil_init_mlib(c, avctx);
4134 dsputil_init_vis(c,avctx);
4137 dsputil_init_alpha(c, avctx);
4140 dsputil_init_ppc(c, avctx);
4143 dsputil_init_mmi(c, avctx);
4146 dsputil_init_sh4(c,avctx);
4149 dsputil_init_bfin(c,avctx);
4152 for(i=0; i<64; i++){
4153 if(!c->put_2tap_qpel_pixels_tab[0][i])
4154 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4155 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4156 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4159 switch(c->idct_permutation_type){
4160 case FF_NO_IDCT_PERM:
4162 c->idct_permutation[i]= i;
4164 case FF_LIBMPEG2_IDCT_PERM:
4166 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4168 case FF_SIMPLE_IDCT_PERM:
4170 c->idct_permutation[i]= simple_mmx_permutation[i];
4172 case FF_TRANSPOSE_IDCT_PERM:
4174 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4176 case FF_PARTTRANS_IDCT_PERM:
4178 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4181 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");