3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
32 #include "mpegvideo.h"
33 #include "simple_idct.h"
/* Forward spatial discrete wavelet transform; defined in snow.c (see the
 * CONFIG_SNOW_ENCODER section below which notes "dwt is in snow.c"). */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
/* NOTE(review): defined elsewhere (presumably vorbis.c) — declared here so the
 * DSP context can reference it; confirm the prototype matches the definition. */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
/* Clipping table: cropTbl + MAX_NEG_CROP maps a possibly out-of-range value to
 * the 0..255 range (used via `cm` pointers below). Zeroed here; presumably
 * filled by an init routine outside this view — TODO confirm. */
uint8_t cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square table used via `sq = squareTbl + 256` so signed differences in
 * -256..255 index it directly; zeroed here, presumably filled at init. */
uint32_t squareTbl[512] = {0, };
/* Standard JPEG/MPEG zigzag scan order mapping scan position -> raster index.
 * (Restored missing array terminator.) */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* NOTE(review): zero-initialized here; presumably filled during DSP init
 * (outside this view) — confirm before relying on its contents. */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate horizontal scan order (interlaced coding). Restored terminator. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate vertical scan order (interlaced coding). Restored terminator. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * (reciprocal table: inverse[b] = ceil(2^32 / b)). Restored terminator. */
const uint32_t inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for the simple_idct_mmx (maps natural coefficient order
 * to the layout that IDCT expects). Restored terminator. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/**
 * Sum of all pixels in a 16x16 block.
 * @param pix       top-left pixel of the block
 * @param line_size stride between rows in bytes
 * @return sum of the 256 pixel values
 * (Body restored: the extracted source was truncated mid-function.)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            /* manually unrolled over 8 pixels, matching the file's style */
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;  /* advance to the next row */
    }
    return s;
}
165 static int pix_norm1_c(uint8_t * pix, int line_size)
168 uint32_t *sq = squareTbl + 256;
171 for (i = 0; i < 16; i++) {
172 for (j = 0; j < 16; j += 8) {
183 #if LONG_MAX > 2147483647
184 register uint64_t x=*(uint64_t*)pix;
186 s += sq[(x>>8)&0xff];
187 s += sq[(x>>16)&0xff];
188 s += sq[(x>>24)&0xff];
189 s += sq[(x>>32)&0xff];
190 s += sq[(x>>40)&0xff];
191 s += sq[(x>>48)&0xff];
192 s += sq[(x>>56)&0xff];
194 register uint32_t x=*(uint32_t*)pix;
196 s += sq[(x>>8)&0xff];
197 s += sq[(x>>16)&0xff];
198 s += sq[(x>>24)&0xff];
199 x=*(uint32_t*)(pix+4);
201 s += sq[(x>>8)&0xff];
202 s += sq[(x>>16)&0xff];
203 s += sq[(x>>24)&0xff];
208 pix += line_size - 16;
/**
 * Byte-swap a buffer of w 32-bit words from src into dst (may alias).
 * Main loop is unrolled by 8; the tail loop handles the remainder.
 * (Body restored: the extracted source was truncated mid-function.)
 */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
231 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
234 uint32_t *sq = squareTbl + 256;
237 for (i = 0; i < h; i++) {
238 s += sq[pix1[0] - pix2[0]];
239 s += sq[pix1[1] - pix2[1]];
240 s += sq[pix1[2] - pix2[2]];
241 s += sq[pix1[3] - pix2[3]];
248 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
251 uint32_t *sq = squareTbl + 256;
254 for (i = 0; i < h; i++) {
255 s += sq[pix1[0] - pix2[0]];
256 s += sq[pix1[1] - pix2[1]];
257 s += sq[pix1[2] - pix2[2]];
258 s += sq[pix1[3] - pix2[3]];
259 s += sq[pix1[4] - pix2[4]];
260 s += sq[pix1[5] - pix2[5]];
261 s += sq[pix1[6] - pix2[6]];
262 s += sq[pix1[7] - pix2[7]];
269 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
272 uint32_t *sq = squareTbl + 256;
275 for (i = 0; i < h; i++) {
276 s += sq[pix1[ 0] - pix2[ 0]];
277 s += sq[pix1[ 1] - pix2[ 1]];
278 s += sq[pix1[ 2] - pix2[ 2]];
279 s += sq[pix1[ 3] - pix2[ 3]];
280 s += sq[pix1[ 4] - pix2[ 4]];
281 s += sq[pix1[ 5] - pix2[ 5]];
282 s += sq[pix1[ 6] - pix2[ 6]];
283 s += sq[pix1[ 7] - pix2[ 7]];
284 s += sq[pix1[ 8] - pix2[ 8]];
285 s += sq[pix1[ 9] - pix2[ 9]];
286 s += sq[pix1[10] - pix2[10]];
287 s += sq[pix1[11] - pix2[11]];
288 s += sq[pix1[12] - pix2[12]];
289 s += sq[pix1[13] - pix2[13]];
290 s += sq[pix1[14] - pix2[14]];
291 s += sq[pix1[15] - pix2[15]];
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/*
 * Wavelet-domain distortion metric: DWT-transforms the pix1-pix2 difference
 * (scaled by <<4) and accumulates per-subband weighted magnitudes using the
 * scale[] table (indexed by wavelet type, block size class, level, orientation).
 * NOTE(review): this region is fragmentary as extracted — loop bodies, several
 * scale[] rows and the accumulation/return are missing; do not modify the
 * logic without consulting the complete upstream source.
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
/* 8-wide blocks use 3 decomposition levels, larger blocks use 4 */
const int dec_count= w==8 ? 3 : 4;
static const int scale[2][2][4][4]={
{268, 239, 239, 213},
// 9/7 16x16 or 32x32 dec=4
{344, 310, 310, 280},
{275, 245, 245, 218},
// 5/3 16x16 or 32x32 dec=4
{352, 317, 317, 286},
/* load the scaled difference block into the transform buffer (stride 32) */
for (i = 0; i < h; i++) {
for (j = 0; j < w; j+=4) {
tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
/* in-place forward wavelet transform of the difference */
ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
/* walk every subband: level 0 keeps the LL band (ori starts at 0) */
for(level=0; level<dec_count; level++){
for(ori= level ? 1 : 0; ori<4; ori++){
int size= w>>(dec_count-level);
int sx= (ori&1) ? size : 0;       /* horizontal offset of this subband */
int stride= 32<<(dec_count-level);
int sy= (ori&2) ? stride>>1 : 0;  /* vertical offset of this subband */
for(i=0; i<size; i++){
for(j=0; j<size; j++){
int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* 5/3 wavelet metric on an 8-wide block (restored missing closing brace). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
/* 9/7 wavelet metric on an 8-wide block (restored missing closing brace). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
/* 5/3 wavelet metric on a 16-wide block (restored missing closing brace). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
/* 9/7 wavelet metric on a 16-wide block (restored missing closing brace). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
/* 5/3 wavelet metric on a 32-wide block; non-static: exported for use
 * elsewhere (restored missing closing brace). */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
/* 9/7 wavelet metric on a 32-wide block; non-static: exported for use
 * elsewhere (restored missing closing brace). */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
395 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
399 /* read the pixels */
401 block[0] = pixels[0];
402 block[1] = pixels[1];
403 block[2] = pixels[2];
404 block[3] = pixels[3];
405 block[4] = pixels[4];
406 block[5] = pixels[5];
407 block[6] = pixels[6];
408 block[7] = pixels[7];
414 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
415 const uint8_t *s2, int stride){
418 /* read the pixels */
420 block[0] = s1[0] - s2[0];
421 block[1] = s1[1] - s2[1];
422 block[2] = s1[2] - s2[2];
423 block[3] = s1[3] - s2[3];
424 block[4] = s1[4] - s2[4];
425 block[5] = s1[5] - s2[5];
426 block[6] = s1[6] - s2[6];
427 block[7] = s1[7] - s2[7];
435 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
439 uint8_t *cm = cropTbl + MAX_NEG_CROP;
441 /* read the pixels */
443 pixels[0] = cm[block[0]];
444 pixels[1] = cm[block[1]];
445 pixels[2] = cm[block[2]];
446 pixels[3] = cm[block[3]];
447 pixels[4] = cm[block[4]];
448 pixels[5] = cm[block[5]];
449 pixels[6] = cm[block[6]];
450 pixels[7] = cm[block[7]];
457 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
461 uint8_t *cm = cropTbl + MAX_NEG_CROP;
463 /* read the pixels */
465 pixels[0] = cm[block[0]];
466 pixels[1] = cm[block[1]];
467 pixels[2] = cm[block[2]];
468 pixels[3] = cm[block[3]];
475 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
479 uint8_t *cm = cropTbl + MAX_NEG_CROP;
481 /* read the pixels */
483 pixels[0] = cm[block[0]];
484 pixels[1] = cm[block[1]];
491 static void put_signed_pixels_clamped_c(const DCTELEM *block,
492 uint8_t *restrict pixels,
497 for (i = 0; i < 8; i++) {
498 for (j = 0; j < 8; j++) {
501 else if (*block > 127)
504 *pixels = (uint8_t)(*block + 128);
508 pixels += (line_size - 8);
512 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
516 uint8_t *cm = cropTbl + MAX_NEG_CROP;
518 /* read the pixels */
520 pixels[0] = cm[pixels[0] + block[0]];
521 pixels[1] = cm[pixels[1] + block[1]];
522 pixels[2] = cm[pixels[2] + block[2]];
523 pixels[3] = cm[pixels[3] + block[3]];
524 pixels[4] = cm[pixels[4] + block[4]];
525 pixels[5] = cm[pixels[5] + block[5]];
526 pixels[6] = cm[pixels[6] + block[6]];
527 pixels[7] = cm[pixels[7] + block[7]];
533 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
537 uint8_t *cm = cropTbl + MAX_NEG_CROP;
539 /* read the pixels */
541 pixels[0] = cm[pixels[0] + block[0]];
542 pixels[1] = cm[pixels[1] + block[1]];
543 pixels[2] = cm[pixels[2] + block[2]];
544 pixels[3] = cm[pixels[3] + block[3]];
550 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
554 uint8_t *cm = cropTbl + MAX_NEG_CROP;
556 /* read the pixels */
558 pixels[0] = cm[pixels[0] + block[0]];
559 pixels[1] = cm[pixels[1] + block[1]];
565 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
569 pixels[0] += block[0];
570 pixels[1] += block[1];
571 pixels[2] += block[2];
572 pixels[3] += block[3];
573 pixels[4] += block[4];
574 pixels[5] += block[5];
575 pixels[6] += block[6];
576 pixels[7] += block[7];
582 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
586 pixels[0] += block[0];
587 pixels[1] += block[1];
588 pixels[2] += block[2];
589 pixels[3] += block[3];
/*
 * PIXOP2 (64-bit word variant): generates the put/avg pixel primitives —
 * straight copy, x2/y2/xy2 half-pel averaging and their no_rnd variants —
 * operating on whole 64-bit words loaded with LD64.  The rounding bias is
 * visible in the constants: 0x02.. per byte for rounding xy2, 0x01.. for
 * no_rnd xy2; the 0x03../0xFC.. masks split each byte into its low 2 and
 * high 6 bits so the per-byte averages cannot carry across byte lanes.
 * NOTE(review): this macro body is fragmentary as extracted (loop headers,
 * braces and pointer-stepping lines are missing) — do not edit the logic
 * without the complete upstream source.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
OP(*((uint64_t*)block), LD64(pixels));\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+line_size);\
OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+line_size);\
OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0202020202020202ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
for(i=0; i<h; i+=2){\
uint64_t a= LD64(pixels );\
uint64_t b= LD64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0202020202020202ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint64_t a= LD64(pixels );\
const uint64_t b= LD64(pixels+1);\
uint64_t l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0101010101010101ULL;\
uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
for(i=0; i<h; i+=2){\
uint64_t a= LD64(pixels );\
uint64_t b= LD64(pixels+1);\
l1= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL);\
h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
l0= (a&0x0303030303030303ULL)\
+ (b&0x0303030303030303ULL)\
+ 0x0101010101010101ULL;\
h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
+ ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* 64-bit packed-byte rounding average: (a|b) - floor((a^b)/2) per byte */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
/*
 * PIXOP2 (32-bit word variant): generates the same put/avg pixel family as
 * the 64-bit branch above but works on 16/32-bit words via LD16/LD32, with
 * l2 (two-source average) and l4 (four-source average) helpers used to build
 * the x2/y2/xy2 half-pel variants.  The 0x03../0xFC.. mask split again keeps
 * per-byte averages from carrying between lanes.
 * NOTE(review): fragmentary as extracted — loop headers, braces and
 * pointer-stepping lines are missing; do not edit the logic without the
 * complete upstream source.
 */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint16_t*)(block )), LD16(pixels ));\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint32_t*)(block )), LD32(pixels ));\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OP(*((uint32_t*)(block )), LD32(pixels ));\
OP(*((uint32_t*)(block+4)), LD32(pixels+4));\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_c(block, pixels, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD32(&src1[i*src_stride1 ]);\
b= LD32(&src2[i*src_stride2 ]);\
OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
a= LD16(&src1[i*src_stride1 ]);\
b= LD16(&src2[i*src_stride2 ]);\
OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
int src_stride1, int src_stride2, int h){\
OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
uint32_t a, b, c, d, l0, l1, h0, h1;\
a= LD32(&src1[i*src_stride1]);\
b= LD32(&src2[i*src_stride2]);\
c= LD32(&src3[i*src_stride3]);\
d= LD32(&src4[i*src_stride4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
c= LD32(&src3[i*src_stride3+4]);\
d= LD32(&src4[i*src_stride4+4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
uint32_t a, b, c, d, l0, l1, h0, h1;\
a= LD32(&src1[i*src_stride1]);\
b= LD32(&src2[i*src_stride2]);\
c= LD32(&src3[i*src_stride3]);\
d= LD32(&src4[i*src_stride4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
a= LD32(&src1[i*src_stride1+4]);\
b= LD32(&src2[i*src_stride2+4]);\
c= LD32(&src3[i*src_stride3+4]);\
d= LD32(&src4[i*src_stride4+4]);\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
l1= (c&0x03030303UL)\
h1= ((c&0xFCFCFCFCUL)>>2)\
+ ((d&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
int i, a0, b0, a1, b1;\
for(i=0; i<h; i+=2){\
block[0]= (a1+a0)>>2; /* FIXME non put */\
block[1]= (b1+b0)>>2;\
block[0]= (a1+a0)>>2;\
block[1]= (b1+b0)>>2;\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
for(j=0; j<2; j++){\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
+ (b&0x03030303UL);\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
pixels+=4-line_size*(h+1);\
block +=4-line_size*h;\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
for(j=0; j<2; j++){\
const uint32_t a= LD32(pixels );\
const uint32_t b= LD32(pixels+1);\
uint32_t l0= (a&0x03030303UL)\
uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
for(i=0; i<h; i+=2){\
uint32_t a= LD32(pixels );\
uint32_t b= LD32(pixels+1);\
l1= (a&0x03030303UL)\
+ (b&0x03030303UL);\
h1= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
l0= (a&0x03030303UL)\
h0= ((a&0xFCFCFCFCUL)>>2)\
+ ((b&0xFCFCFCFCUL)>>2);\
OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
pixels+=4-line_size*(h+1);\
block +=4-line_size*h;\
CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* averaging store: combine the existing destination word with the new one */
#define op_avg(a, b) a = rnd_avg32(a, b)
/* plain store */
#define op_put(a, b) a = b
/* Scalar rounded averages of 2 and 4 values (bias +1 / +2 before the shift).
 * Arguments are now fully parenthesized — the original expanded `a+b` bare,
 * which mis-evaluates for operands like `x<<1` or `c ? x : y` due to
 * precedence; this is a behavior-compatible hygiene fix for all existing
 * plain-identifier call sites. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* Uniform-stride wrapper around the macro-generated 16-wide no-round
 * two-source average (restored missing closing brace). */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
/* Uniform-stride wrapper around the macro-generated 8-wide no-round
 * two-source average (restored missing closing brace). */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/**
 * 1/16-pel bilinear interpolation for an 8-wide strip of height h
 * (GMC with a single, constant sub-pel offset).
 * @param x16,y16  sub-pel fraction in 1/16 units (0..15 expected)
 * @param rounder  rounding bias added before the >>8
 * (Body restored: the extracted source was truncated mid-function.)
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    /* bilinear weights for the four neighboring pixels; A+B+C+D == 256 */
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/*
 * Global motion compensation (gmc) for an 8-wide strip: for each output
 * pixel a sub-pel source position is derived from the motion parameters
 * (dxx, dxy, dyx, dyy, shift) and the sample is bilinearly interpolated
 * with rounding bias r, clipping source coordinates at the picture border.
 * NOTE(review): fragmentary as extracted — the per-pixel/per-row coordinate
 * stepping and several braces are missing; consult the complete source
 * before editing.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
const int s= 1<<shift;  /* one full pel in sub-pel units */
for(x=0; x<8; x++){ //XXX FIXME optimize
int src_x, src_y, frac_x, frac_y, index;
frac_x= src_x&(s-1);
frac_y= src_y&(s-1);
/* the unsigned compare folds the x<0 and x>=width checks into one */
if((unsigned)src_x < width){
if((unsigned)src_y < height){
/* fully inside the picture: 2-D bilinear interpolation */
index= src_x + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*(s-frac_y)
+ ( src[index+stride ]*(s-frac_x)
+ src[index+stride+1]* frac_x )* frac_y
/* vertically outside: clamp y, interpolate horizontally only */
index= src_x + clip(src_y, 0, height)*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
+ src[index +1]* frac_x )*s
if((unsigned)src_y < height){
/* horizontally outside: clamp x, interpolate vertically only */
index= clip(src_x, 0, width) + src_y*stride;
dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
+ src[index+stride ]* frac_y )*s
/* outside in both directions: nearest border pixel, no interpolation */
index= clip(src_x, 0, width) + clip(src_y, 0, height)*stride;
dst[y*stride + x]= src[index ];
/* Third-pel MC, no sub-pel offset: dispatch to the plain block copy for the
 * given width (restored missing switch/function braces). */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/* Third-pel MC, horizontal offset 1/3: out = round((2*a + b)/3) computed as
 * (683*(2a+b+1))>>11 (683/2048 ~= 1/3). Body restored from truncation. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/* Third-pel MC, horizontal offset 2/3: out = round((a + 2*b)/3).
 * Body restored from truncation. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/*
 * Third-pel MC, 1/3 vertical phase: dst ~= (2*cur + below)/3
 * (683/2048 fixed-point approximation of 1/3).
 */
1237 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1239 for (i=0; i < height; i++) {
1240 for (j=0; j < width; j++) {
1241 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/*
 * Third-pel MC, (1/3,1/3) phase: 2-D weighted average with corner
 * weights (4,3,3,2)/12, computed as 2731/32768 ~= 1/12 fixed point.
 */
1248 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1250 for (i=0; i < height; i++) {
1251 for (j=0; j < width; j++) {
1252 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/*
 * Third-pel MC, (1/3,2/3) phase: corner weights (3,2,4,3)/12
 * (2731/32768 ~= 1/12 fixed point).
 */
1259 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1261 for (i=0; i < height; i++) {
1262 for (j=0; j < width; j++) {
1263 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/*
 * Third-pel MC, 2/3 vertical phase: dst ~= (cur + 2*below)/3
 * (683/2048 fixed-point approximation of 1/3).
 */
1270 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1272 for (i=0; i < height; i++) {
1273 for (j=0; j < width; j++) {
1274 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/*
 * Third-pel MC, (2/3,1/3) phase: corner weights (3,4,2,3)/12
 * (2731/32768 ~= 1/12 fixed point).
 */
1281 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1283 for (i=0; i < height; i++) {
1284 for (j=0; j < width; j++) {
1285 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/*
 * Third-pel MC, (2/3,2/3) phase: corner weights (2,3,3,4)/12
 * (2731/32768 ~= 1/12 fixed point).
 */
1292 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1294 for (i=0; i < height; i++) {
1295 for (j=0; j < width; j++) {
1296 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/*
 * Third-pel MC, integer position, averaging variant:
 * averages the source block into dst, dispatched on block width.
 */
1303 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1305 case 2: avg_pixels2_c (dst, src, stride, height); break;
1306 case 4: avg_pixels4_c (dst, src, stride, height); break;
1307 case 8: avg_pixels8_c (dst, src, stride, height); break;
1308 case 16:avg_pixels16_c(dst, src, stride, height); break;
/*
 * Averaging counterpart of put_tpel_pixels_mc10_c: the (2,1)/3
 * interpolated value is rounded-up averaged with the existing dst.
 */
1312 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1314 for (i=0; i < height; i++) {
1315 for (j=0; j < width; j++) {
1316 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging counterpart of put_tpel_pixels_mc20_c ((1,2)/3 horizontal). */
1323 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1325 for (i=0; i < height; i++) {
1326 for (j=0; j < width; j++) {
1327 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* Averaging counterpart of put_tpel_pixels_mc01_c ((2,1)/3 vertical). */
1334 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1336 for (i=0; i < height; i++) {
1337 for (j=0; j < width; j++) {
1338 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging counterpart of put_tpel_pixels_mc11_c ((4,3,3,2)/12 2-D). */
1345 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1347 for (i=0; i < height; i++) {
1348 for (j=0; j < width; j++) {
1349 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging counterpart of put_tpel_pixels_mc12_c ((3,2,4,3)/12 2-D). */
1356 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1358 for (i=0; i < height; i++) {
1359 for (j=0; j < width; j++) {
1360 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging counterpart of put_tpel_pixels_mc02_c ((1,2)/3 vertical). */
1367 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1369 for (i=0; i < height; i++) {
1370 for (j=0; j < width; j++) {
1371 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* Averaging counterpart of put_tpel_pixels_mc21_c ((3,4,2,3)/12 2-D). */
1378 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1380 for (i=0; i < height; i++) {
1381 for (j=0; j < width; j++) {
1382 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Averaging counterpart of put_tpel_pixels_mc22_c ((2,3,3,4)/12 2-D). */
1389 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1391 for (i=0; i < height; i++) {
1392 for (j=0; j < width; j++) {
1393 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/*
 * TPEL_WIDTH(width): emits fixed-width wrappers around the generic
 * third-pel MC helpers above.
 * BUG FIX: the original bodies read "void put_tpel_pixels_mcXY_c(dst,
 * src, stride, width, height);" -- with the leading "void" that is a
 * block-scope old-style function *declaration* (identifier list), not a
 * call, so every wrapper compiled to a no-op.  Dropping the stray
 * "void" makes each wrapper actually forward to its generic helper.
 */
1400 #define TPEL_WIDTH(width)\
1401 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1402     put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1403 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1404     put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1405 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1406     put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1407 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1408     put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1409 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1410     put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1411 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1412     put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1413 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1414     put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1415 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1416     put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1417 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1418     put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/*
 * H264_CHROMA_MC(OPNAME, OP): generates the 2/4/8-wide H.264 chroma
 * motion-compensation functions.  x,y in [0,7] are the 1/8-pel phases;
 * A..D are the four bilinear corner weights (A+B+C+D == 64).  OP applies
 * the final normalisation (see op_put/op_avg defined after the macro).
 */
1421 #define H264_CHROMA_MC(OPNAME, OP)\
1422 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1423 const int A=(8-x)*(8-y);\
1424 const int B=( x)*(8-y);\
1425 const int C=(8-x)*( y);\
1426 const int D=( x)*( y);\
1429 assert(x<8 && y<8 && x>=0 && y>=0);\
1433 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1434 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1440 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1441 const int A=(8-x)*(8-y);\
1442 const int B=( x)*(8-y);\
1443 const int C=(8-x)*( y);\
1444 const int D=( x)*( y);\
1447 assert(x<8 && y<8 && x>=0 && y>=0);\
1451 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1452 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1453 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1454 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1460 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1461 const int A=(8-x)*(8-y);\
1462 const int B=( x)*(8-y);\
1463 const int C=(8-x)*( y);\
1464 const int D=( x)*( y);\
1467 assert(x<8 && y<8 && x>=0 && y>=0);\
1471 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1472 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1473 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1474 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1475 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1476 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1477 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1478 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* Final 6-bit normalisation (+32 rounding) for the H264_CHROMA_MC
 * filter; op_avg additionally rounds-up averages with the old dst. */
1484 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1485 #define op_put(a, b) a = (((b) + 32)>>6)
1487 H264_CHROMA_MC(put_ , op_put)
1488 H264_CHROMA_MC(avg_ , op_avg)
/*
 * 8-wide chroma MC, "no rounding" variant: identical bilinear filter
 * to put_h264_chroma_mc8_c but with rounding bias 32-4 = 28 instead of
 * 32, i.e. a downward-biased result.
 * NOTE(review): presumably used by a codec requiring no-rounding chroma
 * interpolation -- caller not visible here, confirm before relying on it.
 */
1492 static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1493 const int A=(8-x)*(8-y);
1494 const int B=( x)*(8-y);
1495 const int C=(8-x)*( y);
1496 const int D=( x)*( y);
1499 assert(x<8 && y<8 && x>=0 && y>=0);
1503 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1504 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1505 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1506 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1507 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1508 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1509 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1510 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
/* Copy a 2-wide, h-high block row by row via the LD16/ST16 access macros. */
1516 static inline void copy_block2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1521 ST16(dst , LD16(src ));
/* Copy a 4-wide, h-high block row by row via the LD32/ST32 access macros. */
1527 static inline void copy_block4(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1532 ST32(dst , LD32(src ));
/* Copy an 8-wide, h-high block: two 32-bit words per row. */
1538 static inline void copy_block8(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1543 ST32(dst , LD32(src ));
1544 ST32(dst+4 , LD32(src+4 ));
/* Copy a 16-wide, h-high block: four 32-bit words per row. */
1550 static inline void copy_block16(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1555 ST32(dst , LD32(src ));
1556 ST32(dst+4 , LD32(src+4 ));
1557 ST32(dst+8 , LD32(src+8 ));
1558 ST32(dst+12, LD32(src+12));
/* Copy a 17-wide block (16+1 for qpel16 filtering overread):
 * four words plus the trailing byte per row (byte copy elided here). */
1564 static inline void copy_block17(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1569 ST32(dst , LD32(src ));
1570 ST32(dst+4 , LD32(src+4 ));
1571 ST32(dst+8 , LD32(src+8 ));
1572 ST32(dst+12, LD32(src+12));
/* Copy a 9-wide block (8+1 for qpel8 filtering overread):
 * two words plus the trailing byte per row (byte copy elided here). */
1579 static inline void copy_block9(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h)
1584 ST32(dst , LD32(src ));
1585 ST32(dst+4 , LD32(src+4 ));
/*
 * QPEL_MC(r, OPNAME, RND, OP): generates the full set of MPEG-4
 * quarter-pel MC functions (half-pel lowpass filters plus the 16 mcXY
 * dispatchers for 8x8 and 16x16 blocks).  The half-pel filter uses
 * taps (20,-6,3,-1); OP performs the final >>5 normalisation and
 * clipping via the cm (cropTbl) lookup.
 */
1593 #define QPEL_MC(r, OPNAME, RND, OP) \
1594 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1595 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1599 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1600 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1601 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1602 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
/* taps beyond src[8] are mirrored back into the 9-sample row */\
1603 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1604 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1605 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1606 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* 8-tall vertical half-pel lowpass: same (20,-6,3,-1) filter applied */\
/* down each column; the bottom rows mirror taps at the edge since */\
/* only 9 input rows exist. */\
1612 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1614 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1618 const int src0= src[0*srcStride];\
1619 const int src1= src[1*srcStride];\
1620 const int src2= src[2*srcStride];\
1621 const int src3= src[3*srcStride];\
1622 const int src4= src[4*srcStride];\
1623 const int src5= src[5*srcStride];\
1624 const int src6= src[6*srcStride];\
1625 const int src7= src[7*srcStride];\
1626 const int src8= src[8*srcStride];\
1627 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1628 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1629 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1630 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1631 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1632 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1633 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1634 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* 16-wide horizontal half-pel lowpass: (20,-6,3,-1) taps, with edge */\
/* mirroring past src[16] on the last three outputs. */\
1640 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1641 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1646 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1647 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1648 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1649 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1650 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1651 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1652 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1653 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1654 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1655 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1656 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1657 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1658 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1659 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1660 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1661 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* 16-tall vertical half-pel lowpass: (20,-6,3,-1) taps per column, */\
/* mirroring at the bottom edge (17 input rows). */\
1667 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1668 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
1673 const int src0= src[0*srcStride];\
1674 const int src1= src[1*srcStride];\
1675 const int src2= src[2*srcStride];\
1676 const int src3= src[3*srcStride];\
1677 const int src4= src[4*srcStride];\
1678 const int src5= src[5*srcStride];\
1679 const int src6= src[6*srcStride];\
1680 const int src7= src[7*srcStride];\
1681 const int src8= src[8*srcStride];\
1682 const int src9= src[9*srcStride];\
1683 const int src10= src[10*srcStride];\
1684 const int src11= src[11*srcStride];\
1685 const int src12= src[12*srcStride];\
1686 const int src13= src[13*srcStride];\
1687 const int src14= src[14*srcStride];\
1688 const int src15= src[15*srcStride];\
1689 const int src16= src[16*srcStride];\
1690 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1691 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1692 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1693 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1694 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1695 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1696 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1697 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1698 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1699 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1700 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1701 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1702 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1703 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1704 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1705 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* qpel8_mcXY dispatchers: X,Y in {0..3} are the quarter-pel phases. */\
/* Quarter-pel positions are built by averaging (pixels8_l2) the */\
/* nearest integer/half-pel planes produced by the lowpass helpers. */\
1711 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1712 OPNAME ## pixels8_c(dst, src, stride, 8);\
1715 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1717 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1718 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1721 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1722 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1725 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1727 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1728 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1731 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1732 uint8_t full[16*9];\
1734 copy_block9(full, src, 16, stride, 9);\
1735 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1736 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1739 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1740 uint8_t full[16*9];\
1741 copy_block9(full, src, 16, stride, 9);\
1742 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1745 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1746 uint8_t full[16*9];\
1748 copy_block9(full, src, 16, stride, 9);\
1749 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1750 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
/* *_old_c variants: non-static reference versions using a 4-plane */\
/* average (pixels8_l4); the static *_c versions below use the */\
/* cheaper 2-plane formulation. */\
1752 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1753 uint8_t full[16*9];\
1756 uint8_t halfHV[64];\
1757 copy_block9(full, src, 16, stride, 9);\
1758 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1759 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1760 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1761 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1763 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1764 uint8_t full[16*9];\
1766 uint8_t halfHV[64];\
1767 copy_block9(full, src, 16, stride, 9);\
1768 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1769 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1770 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1771 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1773 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1774 uint8_t full[16*9];\
1777 uint8_t halfHV[64];\
1778 copy_block9(full, src, 16, stride, 9);\
1779 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1780 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1781 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1782 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1784 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1785 uint8_t full[16*9];\
1787 uint8_t halfHV[64];\
1788 copy_block9(full, src, 16, stride, 9);\
1789 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1790 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1791 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1792 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1794 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1795 uint8_t full[16*9];\
1798 uint8_t halfHV[64];\
1799 copy_block9(full, src, 16, stride, 9);\
1800 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1801 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1802 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1803 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1805 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1806 uint8_t full[16*9];\
1808 uint8_t halfHV[64];\
1809 copy_block9(full, src, 16, stride, 9);\
1810 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1811 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1812 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1813 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1815 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1816 uint8_t full[16*9];\
1819 uint8_t halfHV[64];\
1820 copy_block9(full, src, 16, stride, 9);\
1821 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1822 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1823 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1824 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1826 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1827 uint8_t full[16*9];\
1829 uint8_t halfHV[64];\
1830 copy_block9(full, src, 16, stride, 9);\
1831 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1832 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1833 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1834 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1836 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1838 uint8_t halfHV[64];\
1839 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1840 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1841 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1843 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1845 uint8_t halfHV[64];\
1846 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1847 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1848 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1850 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1851 uint8_t full[16*9];\
1854 uint8_t halfHV[64];\
1855 copy_block9(full, src, 16, stride, 9);\
1856 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1857 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1858 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1859 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1861 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1862 uint8_t full[16*9];\
1864 copy_block9(full, src, 16, stride, 9);\
1865 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1866 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1867 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1869 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1870 uint8_t full[16*9];\
1873 uint8_t halfHV[64];\
1874 copy_block9(full, src, 16, stride, 9);\
1875 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1876 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1877 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1878 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1880 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1881 uint8_t full[16*9];\
1883 copy_block9(full, src, 16, stride, 9);\
1884 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1885 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1886 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1888 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1890 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1891 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* qpel16_mcXY dispatchers: same structure as the qpel8 set above, */\
/* scaled to 16x16 blocks (24-wide edge buffer, 17 input rows). */\
1893 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1894 OPNAME ## pixels16_c(dst, src, stride, 16);\
1897 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1899 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1900 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1903 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1904 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1907 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1909 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1910 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1913 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1914 uint8_t full[24*17];\
1916 copy_block17(full, src, 24, stride, 17);\
1917 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1918 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1921 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1922 uint8_t full[24*17];\
1923 copy_block17(full, src, 24, stride, 17);\
1924 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1927 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1928 uint8_t full[24*17];\
1930 copy_block17(full, src, 24, stride, 17);\
1931 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1932 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
/* *_old_c: non-static 4-plane reference versions, as in the qpel8 set */\
1934 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1935 uint8_t full[24*17];\
1936 uint8_t halfH[272];\
1937 uint8_t halfV[256];\
1938 uint8_t halfHV[256];\
1939 copy_block17(full, src, 24, stride, 17);\
1940 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1941 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1942 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1943 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1945 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1946 uint8_t full[24*17];\
1947 uint8_t halfH[272];\
1948 uint8_t halfHV[256];\
1949 copy_block17(full, src, 24, stride, 17);\
1950 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1951 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1952 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1953 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1955 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1956 uint8_t full[24*17];\
1957 uint8_t halfH[272];\
1958 uint8_t halfV[256];\
1959 uint8_t halfHV[256];\
1960 copy_block17(full, src, 24, stride, 17);\
1961 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1962 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1963 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1964 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1966 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1967 uint8_t full[24*17];\
1968 uint8_t halfH[272];\
1969 uint8_t halfHV[256];\
1970 copy_block17(full, src, 24, stride, 17);\
1971 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1972 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1973 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1974 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1976 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1977 uint8_t full[24*17];\
1978 uint8_t halfH[272];\
1979 uint8_t halfV[256];\
1980 uint8_t halfHV[256];\
1981 copy_block17(full, src, 24, stride, 17);\
1982 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1983 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1984 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1985 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1987 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1988 uint8_t full[24*17];\
1989 uint8_t halfH[272];\
1990 uint8_t halfHV[256];\
1991 copy_block17(full, src, 24, stride, 17);\
1992 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1993 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1994 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1995 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1997 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1998 uint8_t full[24*17];\
1999 uint8_t halfH[272];\
2000 uint8_t halfV[256];\
2001 uint8_t halfHV[256];\
2002 copy_block17(full, src, 24, stride, 17);\
2003 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2004 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2005 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2006 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2008 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2009 uint8_t full[24*17];\
2010 uint8_t halfH[272];\
2011 uint8_t halfHV[256];\
2012 copy_block17(full, src, 24, stride, 17);\
2013 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2014 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2015 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2016 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2018 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2019 uint8_t halfH[272];\
2020 uint8_t halfHV[256];\
2021 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2022 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2023 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2025 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2026 uint8_t halfH[272];\
2027 uint8_t halfHV[256];\
2028 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2029 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2030 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2032 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2033 uint8_t full[24*17];\
2034 uint8_t halfH[272];\
2035 uint8_t halfV[256];\
2036 uint8_t halfHV[256];\
2037 copy_block17(full, src, 24, stride, 17);\
2038 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2039 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2040 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2041 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2043 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2044 uint8_t full[24*17];\
2045 uint8_t halfH[272];\
2046 copy_block17(full, src, 24, stride, 17);\
2047 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2048 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2049 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2051 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2052 uint8_t full[24*17];\
2053 uint8_t halfH[272];\
2054 uint8_t halfV[256];\
2055 uint8_t halfHV[256];\
2056 copy_block17(full, src, 24, stride, 17);\
2057 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2058 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2059 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2060 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2062 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2063 uint8_t full[24*17];\
2064 uint8_t halfH[272];\
2065 copy_block17(full, src, 24, stride, 17);\
2066 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2067 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2068 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2070 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2071 uint8_t halfH[272];\
2072 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2073 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel-store operators for QPEL_MC: 'b' is a raw 6-tap filter sum, scaled
 * back with >>5 (filter gain 32) and clipped through cm[].  The _no_rnd
 * variants add 15 instead of 16, i.e. round toward zero as MPEG-4 requires
 * for no-rounding mode.  op_avg additionally averages with the old pixel. */
2076 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2077 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2078 #define op_put(a, b) a = cm[((b) + 16)>>5]
2079 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the MPEG-4 qpel function families (put / put_no_rnd / avg). */
2081 QPEL_MC(0, put_ , _ , op_put)
2082 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2083 QPEL_MC(0, avg_ , _ , op_avg)
2084 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2086 #undef op_avg_no_rnd
2088 #undef op_put_no_rnd
/*
 * H264_LOWPASS(OPNAME, OP, OP2): generates the H.264 half-pel interpolation
 * primitives for block widths 2/4/8/16: _h_lowpass (horizontal 6-tap),
 * _v_lowpass (vertical 6-tap) and _hv_lowpass (horizontal pass into an
 * int16_t tmp buffer, then vertical pass with OP2 using the wider +512>>10
 * rounding).  The 6-tap kernel is (1,-5,20,20,-5,1) throughout, as visible
 * in every OP(...) line below.
 *
 * NOTE(review): this chunk is extraction-damaged — interior lines (loop
 * headers "for(i=0; i<h; i++)", closing braces, "const int h=...;"
 * declarations) are elided, as the jumps in the embedded original line
 * numbers show.  Only comments were added here; recover the missing lines
 * from the upstream file before compiling.
 */
2091 #define H264_LOWPASS(OPNAME, OP, OP2) \
/* 2-wide horizontal 6-tap filter, one output row per (elided) loop pass. */\
2092 static void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2094 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2098 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2099 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
/* 2-wide vertical 6-tap filter; taps read from srcStride-spaced rows. */\
2105 static void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2107 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2111 const int srcB= src[-2*srcStride];\
2112 const int srcA= src[-1*srcStride];\
2113 const int src0= src[0 *srcStride];\
2114 const int src1= src[1 *srcStride];\
2115 const int src2= src[2 *srcStride];\
2116 const int src3= src[3 *srcStride];\
2117 const int src4= src[4 *srcStride];\
2118 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2119 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
/* 2-wide h-then-v filter: horizontal pass fills int16_t tmp (h+5 rows of */\
/* context), vertical pass applies OP2 (+512 >> 10 rounding). */\
2125 static void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2128 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2130 src -= 2*srcStride;\
2131 for(i=0; i<h+5; i++)\
2133 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2134 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
/* rewind tmp to the first output row (2 rows of top context remain). */\
2138 tmp -= tmpStride*(h+5-2);\
2141 const int tmpB= tmp[-2*tmpStride];\
2142 const int tmpA= tmp[-1*tmpStride];\
2143 const int tmp0= tmp[0 *tmpStride];\
2144 const int tmp1= tmp[1 *tmpStride];\
2145 const int tmp2= tmp[2 *tmpStride];\
2146 const int tmp3= tmp[3 *tmpStride];\
2147 const int tmp4= tmp[4 *tmpStride];\
2148 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2149 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
/* 4-wide variants: same structure as the 2-wide ones, 4 outputs per row. */\
2154 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2156 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2160 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2161 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2162 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2163 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2169 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2171 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2175 const int srcB= src[-2*srcStride];\
2176 const int srcA= src[-1*srcStride];\
2177 const int src0= src[0 *srcStride];\
2178 const int src1= src[1 *srcStride];\
2179 const int src2= src[2 *srcStride];\
2180 const int src3= src[3 *srcStride];\
2181 const int src4= src[4 *srcStride];\
2182 const int src5= src[5 *srcStride];\
2183 const int src6= src[6 *srcStride];\
2184 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2185 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2186 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2187 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2193 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2196 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2198 src -= 2*srcStride;\
2199 for(i=0; i<h+5; i++)\
2201 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2202 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2203 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2204 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2208 tmp -= tmpStride*(h+5-2);\
2211 const int tmpB= tmp[-2*tmpStride];\
2212 const int tmpA= tmp[-1*tmpStride];\
2213 const int tmp0= tmp[0 *tmpStride];\
2214 const int tmp1= tmp[1 *tmpStride];\
2215 const int tmp2= tmp[2 *tmpStride];\
2216 const int tmp3= tmp[3 *tmpStride];\
2217 const int tmp4= tmp[4 *tmpStride];\
2218 const int tmp5= tmp[5 *tmpStride];\
2219 const int tmp6= tmp[6 *tmpStride];\
2220 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2221 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2222 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2223 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
/* 8-wide variants: fully unrolled to 8 outputs per row/column. */\
2229 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2231 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2235 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2236 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2237 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2238 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2239 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2240 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2241 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2242 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2248 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2250 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2254 const int srcB= src[-2*srcStride];\
2255 const int srcA= src[-1*srcStride];\
2256 const int src0= src[0 *srcStride];\
2257 const int src1= src[1 *srcStride];\
2258 const int src2= src[2 *srcStride];\
2259 const int src3= src[3 *srcStride];\
2260 const int src4= src[4 *srcStride];\
2261 const int src5= src[5 *srcStride];\
2262 const int src6= src[6 *srcStride];\
2263 const int src7= src[7 *srcStride];\
2264 const int src8= src[8 *srcStride];\
2265 const int src9= src[9 *srcStride];\
2266 const int src10=src[10*srcStride];\
2267 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2268 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2269 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2270 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2271 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2272 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2273 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2274 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2280 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2283 uint8_t *cm = cropTbl + MAX_NEG_CROP;\
2285 src -= 2*srcStride;\
2286 for(i=0; i<h+5; i++)\
2288 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2289 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2290 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2291 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2292 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2293 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2294 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2295 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2299 tmp -= tmpStride*(h+5-2);\
2302 const int tmpB= tmp[-2*tmpStride];\
2303 const int tmpA= tmp[-1*tmpStride];\
2304 const int tmp0= tmp[0 *tmpStride];\
2305 const int tmp1= tmp[1 *tmpStride];\
2306 const int tmp2= tmp[2 *tmpStride];\
2307 const int tmp3= tmp[3 *tmpStride];\
2308 const int tmp4= tmp[4 *tmpStride];\
2309 const int tmp5= tmp[5 *tmpStride];\
2310 const int tmp6= tmp[6 *tmpStride];\
2311 const int tmp7= tmp[7 *tmpStride];\
2312 const int tmp8= tmp[8 *tmpStride];\
2313 const int tmp9= tmp[9 *tmpStride];\
2314 const int tmp10=tmp[10*tmpStride];\
2315 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2316 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2317 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2318 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2319 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2320 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2321 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2322 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16-wide variants: built from four 8x8 calls (two side by side, then the */\
/* lower half after advancing src/dst by 8 rows). */\
2328 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2329 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2330 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2331 src += 8*srcStride;\
2332 dst += 8*dstStride;\
2333 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2334 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2337 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2338 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2339 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2340 src += 8*srcStride;\
2341 dst += 8*dstStride;\
2342 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2343 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2346 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2347 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2348 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2349 src += 8*srcStride;\
2350 dst += 8*dstStride;\
2351 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2352 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/*
 * H264_MC(OPNAME, SIZE): generates the 16 quarter-pel position functions
 * _mcXY_c (X = horizontal, Y = vertical quarter offset) for an SIZE x SIZE
 * block, built from the H264_LOWPASS primitives above.  mc00 is a plain
 * copy; single-axis quarters average a half-pel result with the nearest
 * full pel via pixels##SIZE##_l2; diagonal quarters average two half-pel
 * planes; mc22 is the pure hv-filtered half-pel center.
 * NOTE(review): interior lines (closing braces) are elided in this chunk —
 * see the gaps in the embedded original line numbers.
 */
2355 #define H264_MC(OPNAME, SIZE) \
2356 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2357 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
/* (1,0): average of full pel and horizontal half pel. */\
2360 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2361 uint8_t half[SIZE*SIZE];\
2362 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2363 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2366 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2367 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2370 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2371 uint8_t half[SIZE*SIZE];\
2372 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2373 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
/* Vertical cases copy SIZE+5 rows into 'full' first (2 rows of context */\
/* above and 3 below); full_mid points at the block's own first row. */\
2376 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2377 uint8_t full[SIZE*(SIZE+5)];\
2378 uint8_t * const full_mid= full + SIZE*2;\
2379 uint8_t half[SIZE*SIZE];\
2380 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2381 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2382 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2385 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2386 uint8_t full[SIZE*(SIZE+5)];\
2387 uint8_t * const full_mid= full + SIZE*2;\
2388 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2389 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2392 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2393 uint8_t full[SIZE*(SIZE+5)];\
2394 uint8_t * const full_mid= full + SIZE*2;\
2395 uint8_t half[SIZE*SIZE];\
2396 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2397 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2398 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
/* Diagonal cases: average of an H half-pel plane and a V half-pel plane; */\
/* the +1 / +stride offsets select which neighbouring half pel is used. */\
2401 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2402 uint8_t full[SIZE*(SIZE+5)];\
2403 uint8_t * const full_mid= full + SIZE*2;\
2404 uint8_t halfH[SIZE*SIZE];\
2405 uint8_t halfV[SIZE*SIZE];\
2406 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2407 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2408 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2409 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2412 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2413 uint8_t full[SIZE*(SIZE+5)];\
2414 uint8_t * const full_mid= full + SIZE*2;\
2415 uint8_t halfH[SIZE*SIZE];\
2416 uint8_t halfV[SIZE*SIZE];\
2417 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2418 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2419 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2420 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2423 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2424 uint8_t full[SIZE*(SIZE+5)];\
2425 uint8_t * const full_mid= full + SIZE*2;\
2426 uint8_t halfH[SIZE*SIZE];\
2427 uint8_t halfV[SIZE*SIZE];\
2428 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2429 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2430 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2431 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2434 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2435 uint8_t full[SIZE*(SIZE+5)];\
2436 uint8_t * const full_mid= full + SIZE*2;\
2437 uint8_t halfH[SIZE*SIZE];\
2438 uint8_t halfV[SIZE*SIZE];\
2439 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2440 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2441 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2442 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* Center / mixed cases use the hv half-pel plane (int16_t tmp scratch). */\
2445 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2446 int16_t tmp[SIZE*(SIZE+5)];\
2447 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2450 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2451 int16_t tmp[SIZE*(SIZE+5)];\
2452 uint8_t halfH[SIZE*SIZE];\
2453 uint8_t halfHV[SIZE*SIZE];\
2454 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2455 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2456 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2459 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2460 int16_t tmp[SIZE*(SIZE+5)];\
2461 uint8_t halfH[SIZE*SIZE];\
2462 uint8_t halfHV[SIZE*SIZE];\
2463 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2464 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2465 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2468 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2469 uint8_t full[SIZE*(SIZE+5)];\
2470 uint8_t * const full_mid= full + SIZE*2;\
2471 int16_t tmp[SIZE*(SIZE+5)];\
2472 uint8_t halfV[SIZE*SIZE];\
2473 uint8_t halfHV[SIZE*SIZE];\
2474 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2475 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2476 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2477 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2480 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2481 uint8_t full[SIZE*(SIZE+5)];\
2482 uint8_t * const full_mid= full + SIZE*2;\
2483 int16_t tmp[SIZE*(SIZE+5)];\
2484 uint8_t halfV[SIZE*SIZE];\
2485 uint8_t halfHV[SIZE*SIZE];\
2486 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2487 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2488 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2489 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store operators for H264_LOWPASS: the single-pass ops scale by >>5 (+16
 * rounding), the op2_* variants scale the two-pass hv sum by >>10 (+512),
 * both clipping through cm[].  op_avg rounds the average upward (+1). */
2492 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2493 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2494 #define op_put(a, b) a = cm[((b) + 16)>>5]
2495 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2496 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put_ and avg_ H.264 lowpass families. */
2498 H264_LOWPASS(put_ , op_put, op2_put)
2499 H264_LOWPASS(avg_ , op_avg, op2_avg)
/*
 * H264_WEIGHT(W,H): generates explicit weighted prediction (weight_*) and
 * bidirectional weighted prediction (biweight_*) for a WxH block.
 * op_scale1 rescales one pixel in place; op_scale2 blends src into dst.
 * NOTE(review): the per-width op_scale invocations and closing braces are
 * elided in this chunk (gaps in the embedded line numbers) — only the loop
 * skeleton and the W-dependent early-continue lines are visible.
 */
2514 #define op_scale1(x) block[x] = clip_uint8( (block[x]*weight + offset) >> log2_denom )
2515 #define op_scale2(x) dst[x] = clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2516 #define H264_WEIGHT(W,H) \
2517 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
/* pre-scale offset and fold in the rounding term 1<<(log2_denom-1) */ \
2519 offset <<= log2_denom; \
2520 if(log2_denom) offset += 1<<(log2_denom-1); \
2521 for(y=0; y<H; y++, block += stride){ \
2524 if(W==2) continue; \
2527 if(W==4) continue; \
2532 if(W==8) continue; \
2543 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
/* bi-pred rounding: ((offset+1)|1) forces an odd offset before shifting */ \
2545 offset = ((offset + 1) | 1) << log2_denom; \
2546 for(y=0; y<H; y++, dst += stride, src += stride){ \
2549 if(W==2) continue; \
2552 if(W==4) continue; \
2557 if(W==8) continue; \
/* WMV2 mspel horizontal 4-tap filter (-1,9,9,-1)/16 over an 8-wide row,
 * repeated for h rows; results clipped through cm[].
 * NOTE(review): the row loop header and per-row pointer advances are elided
 * in this chunk (original lines 2586-2588 and the loop tail are missing). */
2584 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2585 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2589 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2590 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2591 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2592 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2593 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2594 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2595 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2596 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* CAVS decoder glue: full-pel (mc00) motion compensation is just a block
 * copy, so expose the generic pixel copy/average helpers under the names
 * cavsdsp expects.  The real qpel cases live in cavsdsp.c. */
2602 #ifdef CONFIG_CAVS_DECODER
2604 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2606 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2607 put_pixels8_c(dst, src, stride, 8);
2609 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2610 avg_pixels8_c(dst, src, stride, 8);
2612 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2613 put_pixels16_c(dst, src, stride, 16);
2615 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2616 avg_pixels16_c(dst, src, stride, 16);
2618 #endif /* CONFIG_CAVS_DECODER */
/* VC-1/WMV3 glue: full-pel mspel MC is a plain 8x8 copy; the rnd argument
 * is unused here (rounding only matters for sub-pel positions). */
2620 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2622 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2624 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2625 put_pixels8_c(dst, src, stride, 8);
2627 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
/* WMV2 mspel vertical 4-tap filter (-1,9,9,-1)/16 down an 8-tall column,
 * repeated for w columns; taps read from srcStride-spaced rows and results
 * are clipped through cm[].
 * NOTE(review): the column loop header and pointer advances are elided in
 * this chunk (gaps at original lines 2631-2633 and after 2652). */
2629 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2630 uint8_t *cm = cropTbl + MAX_NEG_CROP;
2634 const int src_1= src[ -srcStride];
2635 const int src0 = src[0 ];
2636 const int src1 = src[ srcStride];
2637 const int src2 = src[2*srcStride];
2638 const int src3 = src[3*srcStride];
2639 const int src4 = src[4*srcStride];
2640 const int src5 = src[5*srcStride];
2641 const int src6 = src[6*srcStride];
2642 const int src7 = src[7*srcStride];
2643 const int src8 = src[8*srcStride];
2644 const int src9 = src[9*srcStride];
2645 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2646 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2647 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2648 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2649 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2650 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2651 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2652 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel (0,0): full-pel position, plain 8x8 copy. */
2658 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2659 put_pixels8_c(dst, src, stride, 8);
/* WMV2 mspel (1,0): average of full pel and horizontal half pel.
 * NOTE(review): the local 'half' buffer declaration (orig. line 2663) is
 * elided from this chunk. */
2662 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2664 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2665 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
/* WMV2 mspel (2,0): pure horizontal half pel, filtered straight into dst. */
2668 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2669 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* WMV2 mspel (3,0): average of the right full pel (src+1) and the
 * horizontal half pel.  NOTE(review): 'half' declaration elided. */
2672 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2674 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2675 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
/* WMV2 mspel (0,2): pure vertical half pel, filtered straight into dst. */
2678 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2679 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* WMV2 mspel (1,2): average of the vertical half pel (halfV) and the
 * HV half pel (halfHV, built from an 11-row halfH with one row of top
 * context).  NOTE(review): the halfH/halfV/halfHV buffer declarations
 * (orig. lines 2683-2685) are elided from this chunk. */
2682 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2686 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2687 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2688 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2689 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* WMV2 mspel (3,2): like mc12 but the vertical half pel is taken one
 * pixel to the right (src+1).  NOTE(review): halfH/halfV/halfHV buffer
 * declarations are elided from this chunk. */
2691 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2695 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2696 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2697 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2698 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* WMV2 mspel (2,2): HV half pel — horizontal pass into halfH (11 rows),
 * then vertical pass directly into dst.  NOTE(review): halfH declaration
 * elided from this chunk. */
2700 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2702 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2703 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 Annex J style deblocking across a horizontal edge: for each column
 * x, the two pixels on either side (p0,p1 above, p2,p3 below) are adjusted
 * by d1 (piecewise-linear in d, bounded by the qscale-derived strength) and
 * p0/p3 by a smaller clipped d2.
 * NOTE(review): several interior lines are elided in this chunk (the x
 * loop header, "p1 -= d1; p2 += d1;", ad1/d2 setup, closing braces). */
2706 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2708 const int strength= ff_h263_loop_filter_strength[qscale];
2712 int p0= src[x-2*stride];
2713 int p1= src[x-1*stride];
2714 int p2= src[x+0*stride];
2715 int p3= src[x+1*stride];
2716 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2718 if (d<-2*strength) d1= 0;
2719 else if(d<- strength) d1=-2*strength - d;
2720 else if(d< strength) d1= d;
2721 else if(d< 2*strength) d1= 2*strength - d;
/* branch-free clamp of p1/p2 to 0..255 after the (elided) +/- d1 update:
 * if bit 8 is set the value over/underflowed; ~(v>>31) is 0 or 255. */
2726 if(p1&256) p1= ~(p1>>31);
2727 if(p2&256) p2= ~(p2>>31);
2729 src[x-1*stride] = p1;
2730 src[x+0*stride] = p2;
2734 d2= clip((p0-p3)/4, -ad1, ad1);
2736 src[x-2*stride] = p0 - d2;
2737 src[x+ stride] = p3 + d2;
/* H.263 deblocking across a vertical edge: identical math to
 * h263_v_loop_filter_c but iterating over rows y and addressing the four
 * pixels straddling the edge as src[y*stride-2 .. +1].
 * NOTE(review): interior lines elided as in the vertical variant. */
2741 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2743 const int strength= ff_h263_loop_filter_strength[qscale];
2747 int p0= src[y*stride-2];
2748 int p1= src[y*stride-1];
2749 int p2= src[y*stride+0];
2750 int p3= src[y*stride+1];
2751 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2753 if (d<-2*strength) d1= 0;
2754 else if(d<- strength) d1=-2*strength - d;
2755 else if(d< strength) d1= d;
2756 else if(d< 2*strength) d1= 2*strength - d;
/* branch-free clamp to 0..255 (see h263_v_loop_filter_c). */
2761 if(p1&256) p1= ~(p1>>31);
2762 if(p2&256) p2= ~(p2>>31);
2764 src[y*stride-1] = p1;
2765 src[y*stride+0] = p2;
2769 d2= clip((p0-p3)/4, -ad1, ad1);
2771 src[y*stride-2] = p0 - d2;
2772 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter on an 8x8 block: separable (1,2,1)/4 smoothing.
 * A vertical (1,2,1) pass fills temp[] (top/bottom rows copied as 4*src),
 * then a horizontal (1,2,1) pass writes back with combined >>4 rounding;
 * edge columns are written with only the vertical result (>>2).
 * NOTE(review): loop headers, temp[] declaration and yz bookkeeping are
 * elided in this chunk. */
2776 static void h261_loop_filter_c(uint8_t *src, int stride){
2781 temp[x ] = 4*src[x ];
2782 temp[x + 7*8] = 4*src[x + 7*stride];
2786 xy = y * stride + x;
2788 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2793 src[ y*stride] = (temp[ y*8] + 2)>>2;
2794 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2796 xy = y * stride + x;
2798 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 normal (tc0-driven) luma deblocking of one 4-sample edge segment
 * per tc0[i], i=0..3.  xstride steps across the edge, ystride along it, so
 * the same code serves both the vertical and horizontal wrappers below.
 * Edge samples p2..p0 | q0..q2 are filtered only when the |p0-q0|<alpha
 * and |p1-p0|,|q1-q0|<beta conditions hold; p1/q1 get the extra clipped
 * update when their second neighbour is also flat (|p2-p0|,|q2-q0|<beta).
 * NOTE(review): interior lines are elided in this chunk (the tc/tc0[i]<0
 * handling, the per-d pix advance, closing braces). */
2803 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2806 for( i = 0; i < 4; i++ ) {
2811 for( d = 0; d < 4; d++ ) {
2812 const int p0 = pix[-1*xstride];
2813 const int p1 = pix[-2*xstride];
2814 const int p2 = pix[-3*xstride];
2815 const int q0 = pix[0];
2816 const int q1 = pix[1*xstride];
2817 const int q2 = pix[2*xstride];
2819 if( FFABS( p0 - q0 ) < alpha &&
2820 FFABS( p1 - p0 ) < beta &&
2821 FFABS( q1 - q0 ) < beta ) {
2826 if( FFABS( p2 - p0 ) < beta ) {
2827 pix[-2*xstride] = p1 + clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2830 if( FFABS( q2 - q0 ) < beta ) {
2831 pix[ xstride] = q1 + clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2835 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2836 pix[-xstride] = clip_uint8( p0 + i_delta ); /* p0' */
2837 pix[0] = clip_uint8( q0 - i_delta ); /* q0' */
/* Vertical-edge luma deblock: step across the edge by 'stride', along it
 * by 1. */
2843 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2845 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
/* Horizontal-edge luma deblock: strides swapped relative to the vertical
 * wrapper. */
2847 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2849 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 normal chroma deblocking: 2 samples per tc0[i] entry, only p0/q0
 * are modified (chroma never touches p1/q1), delta clipped to +/- tc.
 * NOTE(review): interior lines (tc0[i]<0 skip, pix advance, braces) are
 * elided in this chunk. */
2852 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2855 for( i = 0; i < 4; i++ ) {
2856 const int tc = tc0[i];
2861 for( d = 0; d < 2; d++ ) {
2862 const int p0 = pix[-1*xstride];
2863 const int p1 = pix[-2*xstride];
2864 const int q0 = pix[0];
2865 const int q1 = pix[1*xstride];
2867 if( FFABS( p0 - q0 ) < alpha &&
2868 FFABS( p1 - p0 ) < beta &&
2869 FFABS( q1 - q0 ) < beta ) {
2871 int delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2873 pix[-xstride] = clip_uint8( p0 + delta ); /* p0' */
2874 pix[0] = clip_uint8( q0 - delta ); /* q0' */
/* Vertical-edge chroma deblock wrapper. */
2880 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2882 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
/* Horizontal-edge chroma deblock wrapper (strides swapped). */
2884 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2886 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 strong (intra, bS=4) chroma deblocking over 8 edge samples:
 * p0/q0 replaced by fixed (2,1,1)/4 averages of their neighbours when the
 * alpha/beta flatness conditions hold.  NOTE(review): the per-d pix
 * advance and closing braces are elided in this chunk. */
2889 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2892 for( d = 0; d < 8; d++ ) {
2893 const int p0 = pix[-1*xstride];
2894 const int p1 = pix[-2*xstride];
2895 const int q0 = pix[0];
2896 const int q1 = pix[1*xstride];
2898 if( FFABS( p0 - q0 ) < alpha &&
2899 FFABS( p1 - p0 ) < beta &&
2900 FFABS( q1 - q0 ) < beta ) {
2902 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2903 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
/* Vertical-edge intra chroma deblock wrapper. */
2908 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2910 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
/* Horizontal-edge intra chroma deblock wrapper (strides swapped). */
2912 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2914 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* SAD of a 16-wide block over h rows (unrolled row body).  'v' is the
 * unused MpegEncContext slot of the me_cmp function signature.
 * NOTE(review): the accumulator declaration, row loop header, pointer
 * advances and "return s;" are elided in this chunk. */
2917 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2923 s += abs(pix1[0] - pix2[0]);
2924 s += abs(pix1[1] - pix2[1]);
2925 s += abs(pix1[2] - pix2[2]);
2926 s += abs(pix1[3] - pix2[3]);
2927 s += abs(pix1[4] - pix2[4]);
2928 s += abs(pix1[5] - pix2[5]);
2929 s += abs(pix1[6] - pix2[6]);
2930 s += abs(pix1[7] - pix2[7]);
2931 s += abs(pix1[8] - pix2[8]);
2932 s += abs(pix1[9] - pix2[9]);
2933 s += abs(pix1[10] - pix2[10]);
2934 s += abs(pix1[11] - pix2[11]);
2935 s += abs(pix1[12] - pix2[12]);
2936 s += abs(pix1[13] - pix2[13]);
2937 s += abs(pix1[14] - pix2[14]);
2938 s += abs(pix1[15] - pix2[15]);
/* SAD of a 16-wide block against the horizontal half-pel interpolation of
 * pix2 (avg2 of each pixel and its right neighbour).
 * NOTE(review): accumulator/loop/return lines elided in this chunk. */
2945 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2951 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2952 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2953 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2954 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2955 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2956 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2957 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2958 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2959 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2960 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2961 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2962 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2963 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2964 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2965 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2966 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of a 16-wide block against the vertical half-pel interpolation of
 * pix2 (avg2 of each pixel and the one a row below, via pix3).
 * NOTE(review): accumulator/loop/pointer-advance/return lines elided. */
2973 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2976 uint8_t *pix3 = pix2 + line_size;
2980 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2981 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2982 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2983 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2984 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2985 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2986 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2987 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2988 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2989 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2990 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2991 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2992 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2993 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2994 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2995 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of a 16-wide block against the diagonal half-pel interpolation of
 * pix2 (avg4 of the 2x2 neighbourhood spanning this row and the next).
 * NOTE(review): accumulator/loop/pointer-advance/return lines elided. */
3003 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3006 uint8_t *pix3 = pix2 + line_size;
3010 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3011 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3012 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3013 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3014 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3015 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3016 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3017 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3018 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3019 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3020 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3021 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3022 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3023 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3024 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3025 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* SAD of an 8-wide block over h rows (8-wide sibling of pix_abs16_c).
 * NOTE(review): accumulator/loop/return lines elided in this chunk. */
3033 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3039 s += abs(pix1[0] - pix2[0]);
3040 s += abs(pix1[1] - pix2[1]);
3041 s += abs(pix1[2] - pix2[2]);
3042 s += abs(pix1[3] - pix2[3]);
3043 s += abs(pix1[4] - pix2[4]);
3044 s += abs(pix1[5] - pix2[5]);
3045 s += abs(pix1[6] - pix2[6]);
3046 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD against the horizontal half-pel interpolation of pix2.
 * NOTE(review): accumulator/loop/return lines elided in this chunk. */
3053 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3059 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3060 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3061 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3062 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3063 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3064 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3065 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3066 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/**
 * 8-wide SAD against the vertically half-pel-averaged reference
 * (avg2 of each pixel and the pixel one row below).
 * The visible fragment was truncated; loop framing restored.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below the reference row */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * 8-wide SAD against the 2x2-averaged (half-pel centre) reference
 * (avg4 of the pixel, its right neighbour and the two below; reads pix2[8]).
 * The visible fragment was truncated; loop framing restored.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size; /* row below the reference row */

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
3117 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3118 MpegEncContext *c = v;
3124 for(x=0; x<16; x++){
3125 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3128 for(x=0; x<15; x++){
3129 score2+= FFABS( s1[x ] - s1[x +stride]
3130 - s1[x+1] + s1[x+1+stride])
3131 -FFABS( s2[x ] - s2[x +stride]
3132 - s2[x+1] + s2[x+1+stride]);
3139 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3140 else return score1 + FFABS(score2)*8;
3143 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3144 MpegEncContext *c = v;
3151 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3155 score2+= FFABS( s1[x ] - s1[x +stride]
3156 - s1[x+1] + s1[x+1+stride])
3157 -FFABS( s2[x ] - s2[x +stride]
3158 - s2[x+1] + s2[x+1+stride]);
3165 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3166 else return score1 + FFABS(score2)*8;
3169 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3173 for(i=0; i<8*8; i++){
3174 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3177 assert(-512<b && b<512);
3179 sum += (w*b)*(w*b)>>4;
3184 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3187 for(i=0; i<8*8; i++){
3188 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3193 * permutes an 8x8 block.
3194 * @param block the block which will be permuted according to the given permutation vector
3195 * @param permutation the permutation vector
3196 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3197 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3198 * (inverse) permutated to scantable order!
3200 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3206 //if(permutation[1]==1) return; //FIXME its ok but not clean and might fail for some perms
3208 for(i=0; i<=last; i++){
3209 const int j= scantable[i];
3214 for(i=0; i<=last; i++){
3215 const int j= scantable[i];
3216 const int perm_j= permutation[j];
3217 block[perm_j]= temp[j];
3221 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3225 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3228 memset(cmp, 0, sizeof(void*)*5);
3236 cmp[i]= c->hadamard8_diff[i];
3242 cmp[i]= c->dct_sad[i];
3245 cmp[i]= c->dct264_sad[i];
3248 cmp[i]= c->dct_max[i];
3251 cmp[i]= c->quant_psnr[i];
3271 #ifdef CONFIG_SNOW_ENCODER
3280 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3286 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3288 static void clear_blocks_c(DCTELEM *blocks)
3290 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/**
 * dst[i] += src[i] for i in [0,w); the main loop is unrolled by 8 and a
 * scalar tail loop handles the remaining 0..7 bytes. Addition wraps mod 256.
 * The visible fragment was truncated; tail-loop framing restored.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
/**
 * dst[i] = src1[i] - src2[i] for i in [0,w); unrolled by 8 with a scalar
 * tail loop. Subtraction wraps mod 256.
 * The visible fragment was truncated; tail-loop framing restored.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] = src1[i+0]-src2[i+0];
        dst[i+1] = src1[i+1]-src2[i+1];
        dst[i+2] = src1[i+2]-src2[i+2];
        dst[i+3] = src1[i+3]-src2[i+3];
        dst[i+4] = src1[i+4]-src2[i+4];
        dst[i+5] = src1[i+5]-src2[i+5];
        dst[i+6] = src1[i+6]-src2[i+6];
        dst[i+7] = src1[i+7]-src2[i+7];
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
/**
 * HuffYUV median-prediction subtraction: dst[i] = src2[i] - median-predictor,
 * where the predictor is mid_pred(left, above, left+above-left_top) computed
 * from src1 (row above) and src2 (current row). *left / *left_top carry the
 * running state across calls and are updated on return.
 * The visible fragment was truncated; framing reconstructed — verify against
 * the canonical huffyuv implementation.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i]; l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
/* 2-point butterfly helpers for the Hadamard transform below.
   BUTTERFLY2 writes sum/difference of two inputs into two outputs;
   BUTTERFLY1 does the same in place; BUTTERFLYA yields |x+y| + |x-y|.
   The macro bodies were truncated in this extract and are restored here. */
#define BUTTERFLY2(o1,o2,i1,i2) \
    o1= (i1)+(i2);\
    o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
    {\
        int a,b;\
        a= x;\
        b= y;\
        x= a+b;\
        y= a-b;\
    }

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/**
 * 8x8 SATD: 2-D Hadamard transform of the src-dst difference, summing the
 * absolute transform coefficients. First pass transforms rows, second pass
 * transforms columns and accumulates via BUTTERFLYA.
 * The visible fragment was truncated; loop framing and return restored.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
/**
 * Intra 8x8 SATD: 2-D Hadamard transform of the source pixels themselves,
 * summing absolute coefficients and subtracting the DC term (mean) so only
 * texture energy is scored. 'dummy' is unused (me_cmp_func signature).
 * The visible fragment was truncated; loop framing and return restored.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3458 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3459 MpegEncContext * const s= (MpegEncContext *)c;
3460 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3461 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3466 s->dsp.diff_pixels(temp, src1, src2, stride);
3470 sum+= FFABS(temp[i]);
/* One-dimensional 8-point H.264-style integer DCT, expressed through the
   SRC(x)/DST(x,v) macros so the same body serves rows and columns.
   The '#define DCT8_1D {' header line was lost in this extract and is
   restored here. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
3503 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3504 MpegEncContext * const s= (MpegEncContext *)c;
3509 s->dsp.diff_pixels(dct, src1, src2, stride);
3511 #define SRC(x) dct[i][x]
3512 #define DST(x,v) dct[i][x]= v
3513 for( i = 0; i < 8; i++ )
3518 #define SRC(x) dct[x][i]
3519 #define DST(x,v) sum += FFABS(v)
3520 for( i = 0; i < 8; i++ )
3528 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3529 MpegEncContext * const s= (MpegEncContext *)c;
3530 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3531 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3536 s->dsp.diff_pixels(temp, src1, src2, stride);
3540 sum= FFMAX(sum, FFABS(temp[i]));
3545 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3546 MpegEncContext * const s= (MpegEncContext *)c;
3547 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3548 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3549 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3555 s->dsp.diff_pixels(temp, src1, src2, stride);
3557 memcpy(bak, temp, 64*sizeof(DCTELEM));
3559 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3560 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3561 simple_idct(temp); //FIXME
3564 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3569 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3570 MpegEncContext * const s= (MpegEncContext *)c;
3571 const uint8_t *scantable= s->intra_scantable.permutated;
3572 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3573 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3574 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3575 uint8_t * const bak= (uint8_t*)aligned_bak;
3576 int i, last, run, bits, level, distoration, start_i;
3577 const int esc_length= s->ac_esc_length;
3579 uint8_t * last_length;
3584 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3585 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3588 s->dsp.diff_pixels(temp, src1, src2, stride);
3590 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3596 length = s->intra_ac_vlc_length;
3597 last_length= s->intra_ac_vlc_last_length;
3598 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3601 length = s->inter_ac_vlc_length;
3602 last_length= s->inter_ac_vlc_last_length;
3607 for(i=start_i; i<last; i++){
3608 int j= scantable[i];
3613 if((level&(~127)) == 0){
3614 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3623 level= temp[i] + 64;
3627 if((level&(~127)) == 0){
3628 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3636 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3638 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3641 s->dsp.idct_add(bak, stride, temp);
3643 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3645 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3648 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3649 MpegEncContext * const s= (MpegEncContext *)c;
3650 const uint8_t *scantable= s->intra_scantable.permutated;
3651 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3652 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3653 int i, last, run, bits, level, start_i;
3654 const int esc_length= s->ac_esc_length;
3656 uint8_t * last_length;
3660 s->dsp.diff_pixels(temp, src1, src2, stride);
3662 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3668 length = s->intra_ac_vlc_length;
3669 last_length= s->intra_ac_vlc_last_length;
3670 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3673 length = s->inter_ac_vlc_length;
3674 last_length= s->inter_ac_vlc_last_length;
3679 for(i=start_i; i<last; i++){
3680 int j= scantable[i];
3685 if((level&(~127)) == 0){
3686 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3695 level= temp[i] + 64;
3699 if((level&(~127)) == 0){
3700 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/**
 * Vertical-gradient SAD of a single 16-wide block: sums |s[y][x]-s[y+1][x]|
 * over rows 0..h-2, measuring vertical activity. 'dummy' is unused
 * (me_cmp_func signature). The visible fragment was truncated; loop
 * framing restored.
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}
/**
 * Vertical-gradient SAD between two 16-wide blocks: compares the vertical
 * differences of s1 and s2 row-pair by row-pair.
 * The visible fragment was truncated; loop framing restored.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/* squared-difference helper for the vsse metrics below */
#define SQ(a) ((a)*(a))

/**
 * Vertical-gradient SSE of a single 16-wide block: sums squared vertical
 * pixel differences over rows 0..h-2. 'dummy' is unused.
 * The visible fragment was truncated; loop framing restored.
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x+=4){
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
        }
        s+= stride;
    }

    return score;
}

/**
 * Vertical-gradient SSE between two 16-wide blocks: squared difference of
 * the two blocks' vertical gradients, row pair by row pair.
 * The visible fragment was truncated; loop framing restored.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
3769 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3770 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3771 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3773 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3775 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3776 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3777 WARPER8_16_SQ(rd8x8_c, rd16_c)
3778 WARPER8_16_SQ(bit8x8_c, bit16_c)
/** Element-wise in-place multiply: dst[i] *= src[i] for i in [0,len).
 *  The visible fragment was truncated; loop body and braces restored. */
static void vector_fmul_c(float *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] *= src[i];
}
/** dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read backwards
 *  (src1 is advanced to its last element so src1[-i] walks it in reverse).
 *  The visible fragment was truncated; pointer setup restored. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}
/** dst[i*step] = src0[i]*src1[i] + src2[i] + src3 for i in [0,len).
 *  'src3' is a scalar int bias despite its name; 'step' strides the output.
 *  The visible fragment was truncated; declaration and braces restored. */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;
    for(i=0; i<len; i++)
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}
/**
 * Converts bias-prepared floats to int16 samples by reinterpreting the IEEE
 * float bits as an integer: in-range inputs have bit pattern 0x43c0XXXX, so
 * (bits & 0xf0000) == 0 and the low 16 bits (minus the 0x8000 bias) are the
 * sample; out-of-range inputs are clamped to -32768/32767 via the sign of
 * (0x43c0ffff - bits). NOTE(review): assumes src was scaled/biased by the
 * caller as in FFmpeg's audio float pipeline — confirm against callers.
 * Fix: the original read the bits via an (int32_t*) cast, which violates
 * strict aliasing; memcpy performs the same reinterpretation without UB.
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
    int i;
    for(i=0; i<len; i++) {
        int32_t tmp;
        memcpy(&tmp, &src[i], sizeof(tmp)); /* reinterpret float bits */
        if(tmp & 0xf0000){ /* exponent nibble differs -> out of range */
            tmp = (0x43c0ffff - tmp)>>31; /* 0 (underflow) or -1 (overflow) */
            // is this faster on some gcc/cpu combinations?
            // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
            // else                 tmp = 0;
        }
        dst[i] = tmp - 0x8000;
    }
}
3813 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3815 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3818 put_pixels_clamped_c(block, dest, line_size);
3820 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3823 add_pixels_clamped_c(block, dest, line_size);
3826 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3829 put_pixels_clamped4_c(block, dest, line_size);
3831 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3834 add_pixels_clamped4_c(block, dest, line_size);
3837 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3840 put_pixels_clamped2_c(block, dest, line_size);
3842 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3845 add_pixels_clamped2_c(block, dest, line_size);
3848 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3850 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3852 dest[0] = cm[(block[0] + 4)>>3];
3854 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3856 uint8_t *cm = cropTbl + MAX_NEG_CROP;
3858 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3861 static void just_return() { return; }
3863 /* init static data */
3864 void dsputil_static_init(void)
3868 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
3869 for(i=0;i<MAX_NEG_CROP;i++) {
3871 cropTbl[i + MAX_NEG_CROP + 256] = 255;
3874 for(i=0;i<512;i++) {
3875 squareTbl[i] = (i - 256) * (i - 256);
3878 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
3882 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3886 #ifdef CONFIG_ENCODERS
3887 if(avctx->dct_algo==FF_DCT_FASTINT) {
3888 c->fdct = fdct_ifast;
3889 c->fdct248 = fdct_ifast248;
3891 else if(avctx->dct_algo==FF_DCT_FAAN) {
3892 c->fdct = ff_faandct;
3893 c->fdct248 = ff_faandct248;
3896 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3897 c->fdct248 = ff_fdct248_islow;
3899 #endif //CONFIG_ENCODERS
3901 if(avctx->lowres==1){
3902 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO){
3903 c->idct_put= ff_jref_idct4_put;
3904 c->idct_add= ff_jref_idct4_add;
3906 c->idct_put= ff_h264_lowres_idct_put_c;
3907 c->idct_add= ff_h264_lowres_idct_add_c;
3909 c->idct = j_rev_dct4;
3910 c->idct_permutation_type= FF_NO_IDCT_PERM;
3911 }else if(avctx->lowres==2){
3912 c->idct_put= ff_jref_idct2_put;
3913 c->idct_add= ff_jref_idct2_add;
3914 c->idct = j_rev_dct2;
3915 c->idct_permutation_type= FF_NO_IDCT_PERM;
3916 }else if(avctx->lowres==3){
3917 c->idct_put= ff_jref_idct1_put;
3918 c->idct_add= ff_jref_idct1_add;
3919 c->idct = j_rev_dct1;
3920 c->idct_permutation_type= FF_NO_IDCT_PERM;
3922 if(avctx->idct_algo==FF_IDCT_INT){
3923 c->idct_put= ff_jref_idct_put;
3924 c->idct_add= ff_jref_idct_add;
3925 c->idct = j_rev_dct;
3926 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3927 }else if(avctx->idct_algo==FF_IDCT_VP3){
3928 c->idct_put= ff_vp3_idct_put_c;
3929 c->idct_add= ff_vp3_idct_add_c;
3930 c->idct = ff_vp3_idct_c;
3931 c->idct_permutation_type= FF_NO_IDCT_PERM;
3932 }else{ //accurate/default
3933 c->idct_put= simple_idct_put;
3934 c->idct_add= simple_idct_add;
3935 c->idct = simple_idct;
3936 c->idct_permutation_type= FF_NO_IDCT_PERM;
3940 c->h264_idct_add= ff_h264_idct_add_c;
3941 c->h264_idct8_add= ff_h264_idct8_add_c;
3942 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
3943 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
3945 c->get_pixels = get_pixels_c;
3946 c->diff_pixels = diff_pixels_c;
3947 c->put_pixels_clamped = put_pixels_clamped_c;
3948 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
3949 c->add_pixels_clamped = add_pixels_clamped_c;
3950 c->add_pixels8 = add_pixels8_c;
3951 c->add_pixels4 = add_pixels4_c;
3954 c->clear_blocks = clear_blocks_c;
3955 c->pix_sum = pix_sum_c;
3956 c->pix_norm1 = pix_norm1_c;
3958 /* TODO [0] 16 [1] 8 */
3959 c->pix_abs[0][0] = pix_abs16_c;
3960 c->pix_abs[0][1] = pix_abs16_x2_c;
3961 c->pix_abs[0][2] = pix_abs16_y2_c;
3962 c->pix_abs[0][3] = pix_abs16_xy2_c;
3963 c->pix_abs[1][0] = pix_abs8_c;
3964 c->pix_abs[1][1] = pix_abs8_x2_c;
3965 c->pix_abs[1][2] = pix_abs8_y2_c;
3966 c->pix_abs[1][3] = pix_abs8_xy2_c;
3968 #define dspfunc(PFX, IDX, NUM) \
3969 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
3970 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
3971 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
3972 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
3974 dspfunc(put, 0, 16);
3975 dspfunc(put_no_rnd, 0, 16);
3977 dspfunc(put_no_rnd, 1, 8);
3981 dspfunc(avg, 0, 16);
3982 dspfunc(avg_no_rnd, 0, 16);
3984 dspfunc(avg_no_rnd, 1, 8);
3989 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
3990 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
3992 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
3993 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
3994 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
3995 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
3996 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
3997 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
3998 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
3999 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4000 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4002 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4003 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4004 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4005 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4006 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4007 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4008 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4009 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4010 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4012 #define dspfunc(PFX, IDX, NUM) \
4013 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4014 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4015 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4016 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4017 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4018 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4019 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4020 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4021 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4022 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4023 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4024 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4025 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4026 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4027 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4028 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4030 dspfunc(put_qpel, 0, 16);
4031 dspfunc(put_no_rnd_qpel, 0, 16);
4033 dspfunc(avg_qpel, 0, 16);
4034 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4036 dspfunc(put_qpel, 1, 8);
4037 dspfunc(put_no_rnd_qpel, 1, 8);
4039 dspfunc(avg_qpel, 1, 8);
4040 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4042 dspfunc(put_h264_qpel, 0, 16);
4043 dspfunc(put_h264_qpel, 1, 8);
4044 dspfunc(put_h264_qpel, 2, 4);
4045 dspfunc(put_h264_qpel, 3, 2);
4046 dspfunc(avg_h264_qpel, 0, 16);
4047 dspfunc(avg_h264_qpel, 1, 8);
4048 dspfunc(avg_h264_qpel, 2, 4);
4051 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4052 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4053 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4054 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4055 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4056 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4057 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4059 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4060 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4061 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4062 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4063 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4064 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4065 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4066 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4067 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4068 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4069 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4070 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4071 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4072 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4073 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4074 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4075 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4076 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4077 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4078 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4080 #ifdef CONFIG_CAVS_DECODER
4081 ff_cavsdsp_init(c,avctx);
4083 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4084 ff_vc1dsp_init(c,avctx);
4087 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4088 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4089 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4090 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4091 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4092 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4093 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4094 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4096 #define SET_CMP_FUNC(name) \
4097 c->name[0]= name ## 16_c;\
4098 c->name[1]= name ## 8x8_c;
4100 SET_CMP_FUNC(hadamard8_diff)
4101 c->hadamard8_diff[4]= hadamard8_intra16_c;
4102 SET_CMP_FUNC(dct_sad)
4103 SET_CMP_FUNC(dct_max)
4105 SET_CMP_FUNC(dct264_sad)
4107 c->sad[0]= pix_abs16_c;
4108 c->sad[1]= pix_abs8_c;
4112 SET_CMP_FUNC(quant_psnr)
4115 c->vsad[0]= vsad16_c;
4116 c->vsad[4]= vsad_intra16_c;
4117 c->vsse[0]= vsse16_c;
4118 c->vsse[4]= vsse_intra16_c;
4119 c->nsse[0]= nsse16_c;
4120 c->nsse[1]= nsse8_c;
4121 #ifdef CONFIG_SNOW_ENCODER
4122 c->w53[0]= w53_16_c;
4124 c->w97[0]= w97_16_c;
4128 c->add_bytes= add_bytes_c;
4129 c->diff_bytes= diff_bytes_c;
4130 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4131 c->bswap_buf= bswap_buf;
4133 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4134 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4135 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4136 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4137 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4138 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4139 c->h264_loop_filter_strength= NULL;
4141 c->h263_h_loop_filter= h263_h_loop_filter_c;
4142 c->h263_v_loop_filter= h263_v_loop_filter_c;
4144 c->h261_loop_filter= h261_loop_filter_c;
4146 c->try_8x8basis= try_8x8basis_c;
4147 c->add_8x8basis= add_8x8basis_c;
4149 #ifdef CONFIG_SNOW_ENCODER
4150 c->vertical_compose97i = ff_snow_vertical_compose97i;
4151 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4152 c->inner_add_yblock = ff_snow_inner_add_yblock;
4155 #ifdef CONFIG_VORBIS_DECODER
4156 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4158 c->vector_fmul = vector_fmul_c;
4159 c->vector_fmul_reverse = vector_fmul_reverse_c;
4160 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4161 c->float_to_int16 = ff_float_to_int16_c;
4163 c->shrink[0]= ff_img_copy_plane;
4164 c->shrink[1]= ff_shrink22;
4165 c->shrink[2]= ff_shrink44;
4166 c->shrink[3]= ff_shrink88;
4168 c->prefetch= just_return;
4170 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4171 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4174 dsputil_init_mmx(c, avctx);
4177 dsputil_init_armv4l(c, avctx);
4180 dsputil_init_mlib(c, avctx);
4183 dsputil_init_vis(c,avctx);
4186 dsputil_init_alpha(c, avctx);
4189 dsputil_init_ppc(c, avctx);
4192 dsputil_init_mmi(c, avctx);
4195 dsputil_init_sh4(c,avctx);
4198 dsputil_init_bfin(c,avctx);
4201 for(i=0; i<64; i++){
4202 if(!c->put_2tap_qpel_pixels_tab[0][i])
4203 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4204 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4205 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4208 switch(c->idct_permutation_type){
4209 case FF_NO_IDCT_PERM:
4211 c->idct_permutation[i]= i;
4213 case FF_LIBMPEG2_IDCT_PERM:
4215 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4217 case FF_SIMPLE_IDCT_PERM:
4219 c->idct_permutation[i]= simple_mmx_permutation[i];
4221 case FF_TRANSPOSE_IDCT_PERM:
4223 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4225 case FF_PARTTRANS_IDCT_PERM:
4227 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4230 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");