/*
 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
32 #include "mpegvideo.h"
33 #include "simple_idct.h"
39 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
42 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
45 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
47 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
48 uint32_t ff_squareTbl[512] = {0, };
/* Default 8x8 zigzag scan order: maps scan position -> raster index. */
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
74 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
75 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate (horizontal) scan order for interlaced material. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
/* Alternate (vertical) scan order for interlaced material. */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255
 * (reciprocal table used to replace small integer divisions
 *  by a multiply and a shift). */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
};
/* Input permutation for the simple_idct_mmx (maps coefficient index to
 * the position the MMX IDCT expects). */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
/* Sum of all 256 pixels of a 16x16 block (stride = line_size). */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s, i, j;

    s = 0;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;   /* advance to the start of the next row */
    }
    return s;
}
169 static int pix_norm1_c(uint8_t * pix, int line_size)
172 uint32_t *sq = ff_squareTbl + 256;
175 for (i = 0; i < 16; i++) {
176 for (j = 0; j < 16; j += 8) {
187 #if LONG_MAX > 2147483647
188 register uint64_t x=*(uint64_t*)pix;
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 s += sq[(x>>32)&0xff];
194 s += sq[(x>>40)&0xff];
195 s += sq[(x>>48)&0xff];
196 s += sq[(x>>56)&0xff];
198 register uint32_t x=*(uint32_t*)pix;
200 s += sq[(x>>8)&0xff];
201 s += sq[(x>>16)&0xff];
202 s += sq[(x>>24)&0xff];
203 x=*(uint32_t*)(pix+4);
205 s += sq[(x>>8)&0xff];
206 s += sq[(x>>16)&0xff];
207 s += sq[(x>>24)&0xff];
212 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst (8-way unrolled main loop,
 * scalar tail for w not a multiple of 8). */
static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(   ; i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
235 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
238 uint32_t *sq = ff_squareTbl + 256;
241 for (i = 0; i < h; i++) {
242 s += sq[pix1[0] - pix2[0]];
243 s += sq[pix1[1] - pix2[1]];
244 s += sq[pix1[2] - pix2[2]];
245 s += sq[pix1[3] - pix2[3]];
252 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
255 uint32_t *sq = ff_squareTbl + 256;
258 for (i = 0; i < h; i++) {
259 s += sq[pix1[0] - pix2[0]];
260 s += sq[pix1[1] - pix2[1]];
261 s += sq[pix1[2] - pix2[2]];
262 s += sq[pix1[3] - pix2[3]];
263 s += sq[pix1[4] - pix2[4]];
264 s += sq[pix1[5] - pix2[5]];
265 s += sq[pix1[6] - pix2[6]];
266 s += sq[pix1[7] - pix2[7]];
273 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
276 uint32_t *sq = ff_squareTbl + 256;
279 for (i = 0; i < h; i++) {
280 s += sq[pix1[ 0] - pix2[ 0]];
281 s += sq[pix1[ 1] - pix2[ 1]];
282 s += sq[pix1[ 2] - pix2[ 2]];
283 s += sq[pix1[ 3] - pix2[ 3]];
284 s += sq[pix1[ 4] - pix2[ 4]];
285 s += sq[pix1[ 5] - pix2[ 5]];
286 s += sq[pix1[ 6] - pix2[ 6]];
287 s += sq[pix1[ 7] - pix2[ 7]];
288 s += sq[pix1[ 8] - pix2[ 8]];
289 s += sq[pix1[ 9] - pix2[ 9]];
290 s += sq[pix1[10] - pix2[10]];
291 s += sq[pix1[11] - pix2[11]];
292 s += sq[pix1[12] - pix2[12]];
293 s += sq[pix1[13] - pix2[13]];
294 s += sq[pix1[14] - pix2[14]];
295 s += sq[pix1[15] - pix2[15]];
/* NOTE(review): Snow wavelet-domain distortion metric w_c().  This region was
 * damaged in extraction: several rows of the scale[] weight table, the local
 * declarations, the psnr/return tail and closing braces are missing, and the
 * original line numbers are embedded in the text.  The surviving bytes are
 * kept identical; do not "fix" the scale[] table from memory -- the dropped
 * constants must be restored from the canonical source. */
304 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
305 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
307 const int dec_count= w==8 ? 3 : 4;
310 static const int scale[2][2][4][4]={
314 {268, 239, 239, 213},
318 // 9/7 16x16 or 32x32 dec=4
319 {344, 310, 310, 280},
327 {275, 245, 245, 218},
331 // 5/3 16x16 or 32x32 dec=4
332 {352, 317, 317, 286},
340 for (i = 0; i < h; i++) {
341 for (j = 0; j < w; j+=4) {
342 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
343 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
344 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
345 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
351 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
355 for(level=0; level<dec_count; level++){
356 for(ori= level ? 1 : 0; ori<4; ori++){
357 int size= w>>(dec_count-level);
358 int sx= (ori&1) ? size : 0;
359 int stride= 32<<(dec_count-level);
360 int sy= (ori&2) ? stride>>1 : 0;
362 for(i=0; i<size; i++){
363 for(j=0; j<size; j++){
364 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* 5/3 wavelet distortion on an 8-wide block (type=1 -> 5/3 filter). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}
/* 9/7 wavelet distortion on an 8-wide block (type=0 -> 9/7 filter). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}
/* 5/3 wavelet distortion on a 16-wide block. */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}
/* 9/7 wavelet distortion on a 16-wide block. */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}
/* 5/3 wavelet distortion on a 32-wide block (non-static: used elsewhere). */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}
394 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
395 return w_c(v, pix1, pix2, line_size, 32, h, 0);
399 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
403 /* read the pixels */
405 block[0] = pixels[0];
406 block[1] = pixels[1];
407 block[2] = pixels[2];
408 block[3] = pixels[3];
409 block[4] = pixels[4];
410 block[5] = pixels[5];
411 block[6] = pixels[6];
412 block[7] = pixels[7];
418 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
419 const uint8_t *s2, int stride){
422 /* read the pixels */
424 block[0] = s1[0] - s2[0];
425 block[1] = s1[1] - s2[1];
426 block[2] = s1[2] - s2[2];
427 block[3] = s1[3] - s2[3];
428 block[4] = s1[4] - s2[4];
429 block[5] = s1[5] - s2[5];
430 block[6] = s1[6] - s2[6];
431 block[7] = s1[7] - s2[7];
439 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
443 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
445 /* read the pixels */
447 pixels[0] = cm[block[0]];
448 pixels[1] = cm[block[1]];
449 pixels[2] = cm[block[2]];
450 pixels[3] = cm[block[3]];
451 pixels[4] = cm[block[4]];
452 pixels[5] = cm[block[5]];
453 pixels[6] = cm[block[6]];
454 pixels[7] = cm[block[7]];
461 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
465 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
467 /* read the pixels */
469 pixels[0] = cm[block[0]];
470 pixels[1] = cm[block[1]];
471 pixels[2] = cm[block[2]];
472 pixels[3] = cm[block[3]];
479 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
483 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
485 /* read the pixels */
487 pixels[0] = cm[block[0]];
488 pixels[1] = cm[block[1]];
495 static void put_signed_pixels_clamped_c(const DCTELEM *block,
496 uint8_t *restrict pixels,
501 for (i = 0; i < 8; i++) {
502 for (j = 0; j < 8; j++) {
505 else if (*block > 127)
508 *pixels = (uint8_t)(*block + 128);
512 pixels += (line_size - 8);
516 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
520 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
522 /* read the pixels */
524 pixels[0] = cm[pixels[0] + block[0]];
525 pixels[1] = cm[pixels[1] + block[1]];
526 pixels[2] = cm[pixels[2] + block[2]];
527 pixels[3] = cm[pixels[3] + block[3]];
528 pixels[4] = cm[pixels[4] + block[4]];
529 pixels[5] = cm[pixels[5] + block[5]];
530 pixels[6] = cm[pixels[6] + block[6]];
531 pixels[7] = cm[pixels[7] + block[7]];
537 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
541 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
543 /* read the pixels */
545 pixels[0] = cm[pixels[0] + block[0]];
546 pixels[1] = cm[pixels[1] + block[1]];
547 pixels[2] = cm[pixels[2] + block[2]];
548 pixels[3] = cm[pixels[3] + block[3]];
554 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
558 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
560 /* read the pixels */
562 pixels[0] = cm[pixels[0] + block[0]];
563 pixels[1] = cm[pixels[1] + block[1]];
569 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
573 pixels[0] += block[0];
574 pixels[1] += block[1];
575 pixels[2] += block[2];
576 pixels[3] += block[3];
577 pixels[4] += block[4];
578 pixels[5] += block[5];
579 pixels[6] += block[6];
580 pixels[7] += block[7];
586 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
590 pixels[0] += block[0];
591 pixels[1] += block[1];
592 pixels[2] += block[2];
593 pixels[3] += block[3];
599 static int sum_abs_dctelem_c(DCTELEM *block)
603 sum+= FFABS(block[i]);
/* NOTE(review): 64-bit-word PIXOP2 pixel-averaging macro family (processes 8
 * packed bytes per uint64_t via AV_RN64 and bit tricks).  This region was
 * damaged in extraction: interior macro continuation lines (loop headers,
 * pointer advances, closing braces, the opening "#if" of the 64/32-bit
 * selection) are missing and stray line numbers are embedded.  Comment lines
 * cannot be inserted inside a backslash-continued macro, so everything below
 * is left byte-identical; restore from the canonical source. */
609 #define PIXOP2(OPNAME, OP) \
610 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
614 OP(*((uint64_t*)block), AV_RN64(pixels));\
620 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
624 const uint64_t a= AV_RN64(pixels );\
625 const uint64_t b= AV_RN64(pixels+1);\
626 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
632 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
636 const uint64_t a= AV_RN64(pixels );\
637 const uint64_t b= AV_RN64(pixels+1);\
638 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
644 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
648 const uint64_t a= AV_RN64(pixels );\
649 const uint64_t b= AV_RN64(pixels+line_size);\
650 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
656 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
660 const uint64_t a= AV_RN64(pixels );\
661 const uint64_t b= AV_RN64(pixels+line_size);\
662 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
668 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
671 const uint64_t a= AV_RN64(pixels );\
672 const uint64_t b= AV_RN64(pixels+1);\
673 uint64_t l0= (a&0x0303030303030303ULL)\
674 + (b&0x0303030303030303ULL)\
675 + 0x0202020202020202ULL;\
676 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
677 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
681 for(i=0; i<h; i+=2){\
682 uint64_t a= AV_RN64(pixels );\
683 uint64_t b= AV_RN64(pixels+1);\
684 l1= (a&0x0303030303030303ULL)\
685 + (b&0x0303030303030303ULL);\
686 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
687 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
688 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
691 a= AV_RN64(pixels );\
692 b= AV_RN64(pixels+1);\
693 l0= (a&0x0303030303030303ULL)\
694 + (b&0x0303030303030303ULL)\
695 + 0x0202020202020202ULL;\
696 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
697 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
698 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
704 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
707 const uint64_t a= AV_RN64(pixels );\
708 const uint64_t b= AV_RN64(pixels+1);\
709 uint64_t l0= (a&0x0303030303030303ULL)\
710 + (b&0x0303030303030303ULL)\
711 + 0x0101010101010101ULL;\
712 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
713 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
717 for(i=0; i<h; i+=2){\
718 uint64_t a= AV_RN64(pixels );\
719 uint64_t b= AV_RN64(pixels+1);\
720 l1= (a&0x0303030303030303ULL)\
721 + (b&0x0303030303030303ULL);\
722 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
723 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
724 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
727 a= AV_RN64(pixels );\
728 b= AV_RN64(pixels+1);\
729 l0= (a&0x0303030303030303ULL)\
730 + (b&0x0303030303030303ULL)\
731 + 0x0101010101010101ULL;\
732 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
733 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
734 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
740 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
741 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
742 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
743 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
744 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
745 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
746 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
748 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
749 #else // 64 bit variant
/* NOTE(review): 32-bit-word PIXOP2 pixel-averaging macro family (4 packed
 * bytes per uint32_t, rnd_avg32/no_rnd_avg32 helpers, plus _l2/_l4 blenders
 * and the halfpel x2/y2/xy2 variants).  This region was damaged in
 * extraction: interior macro continuation lines (loop headers, pointer
 * advances, several l0/l1 addend lines, closing braces) are missing and
 * stray line numbers are embedded.  Comment lines cannot be inserted inside
 * a backslash-continued macro, so everything below is left byte-identical;
 * restore from the canonical source. */
751 #define PIXOP2(OPNAME, OP) \
752 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
755 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
760 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
763 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
768 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
771 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
772 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
777 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
778 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
781 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
782 int src_stride1, int src_stride2, int h){\
786 a= AV_RN32(&src1[i*src_stride1 ]);\
787 b= AV_RN32(&src2[i*src_stride2 ]);\
788 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
789 a= AV_RN32(&src1[i*src_stride1+4]);\
790 b= AV_RN32(&src2[i*src_stride2+4]);\
791 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
795 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
796 int src_stride1, int src_stride2, int h){\
800 a= AV_RN32(&src1[i*src_stride1 ]);\
801 b= AV_RN32(&src2[i*src_stride2 ]);\
802 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
803 a= AV_RN32(&src1[i*src_stride1+4]);\
804 b= AV_RN32(&src2[i*src_stride2+4]);\
805 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
809 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
810 int src_stride1, int src_stride2, int h){\
814 a= AV_RN32(&src1[i*src_stride1 ]);\
815 b= AV_RN32(&src2[i*src_stride2 ]);\
816 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
820 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
821 int src_stride1, int src_stride2, int h){\
825 a= AV_RN16(&src1[i*src_stride1 ]);\
826 b= AV_RN16(&src2[i*src_stride2 ]);\
827 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
831 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
832 int src_stride1, int src_stride2, int h){\
833 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
834 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
837 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
838 int src_stride1, int src_stride2, int h){\
839 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
840 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
843 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
844 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
847 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
848 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
851 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
852 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
855 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
856 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
859 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
860 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
863 uint32_t a, b, c, d, l0, l1, h0, h1;\
864 a= AV_RN32(&src1[i*src_stride1]);\
865 b= AV_RN32(&src2[i*src_stride2]);\
866 c= AV_RN32(&src3[i*src_stride3]);\
867 d= AV_RN32(&src4[i*src_stride4]);\
868 l0= (a&0x03030303UL)\
871 h0= ((a&0xFCFCFCFCUL)>>2)\
872 + ((b&0xFCFCFCFCUL)>>2);\
873 l1= (c&0x03030303UL)\
875 h1= ((c&0xFCFCFCFCUL)>>2)\
876 + ((d&0xFCFCFCFCUL)>>2);\
877 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
878 a= AV_RN32(&src1[i*src_stride1+4]);\
879 b= AV_RN32(&src2[i*src_stride2+4]);\
880 c= AV_RN32(&src3[i*src_stride3+4]);\
881 d= AV_RN32(&src4[i*src_stride4+4]);\
882 l0= (a&0x03030303UL)\
885 h0= ((a&0xFCFCFCFCUL)>>2)\
886 + ((b&0xFCFCFCFCUL)>>2);\
887 l1= (c&0x03030303UL)\
889 h1= ((c&0xFCFCFCFCUL)>>2)\
890 + ((d&0xFCFCFCFCUL)>>2);\
891 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
895 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
899 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
900 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
903 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
904 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
907 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
908 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
911 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
912 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
915 uint32_t a, b, c, d, l0, l1, h0, h1;\
916 a= AV_RN32(&src1[i*src_stride1]);\
917 b= AV_RN32(&src2[i*src_stride2]);\
918 c= AV_RN32(&src3[i*src_stride3]);\
919 d= AV_RN32(&src4[i*src_stride4]);\
920 l0= (a&0x03030303UL)\
923 h0= ((a&0xFCFCFCFCUL)>>2)\
924 + ((b&0xFCFCFCFCUL)>>2);\
925 l1= (c&0x03030303UL)\
927 h1= ((c&0xFCFCFCFCUL)>>2)\
928 + ((d&0xFCFCFCFCUL)>>2);\
929 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
930 a= AV_RN32(&src1[i*src_stride1+4]);\
931 b= AV_RN32(&src2[i*src_stride2+4]);\
932 c= AV_RN32(&src3[i*src_stride3+4]);\
933 d= AV_RN32(&src4[i*src_stride4+4]);\
934 l0= (a&0x03030303UL)\
937 h0= ((a&0xFCFCFCFCUL)>>2)\
938 + ((b&0xFCFCFCFCUL)>>2);\
939 l1= (c&0x03030303UL)\
941 h1= ((c&0xFCFCFCFCUL)>>2)\
942 + ((d&0xFCFCFCFCUL)>>2);\
943 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
946 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
947 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
948 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
949 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
951 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
952 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
953 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
954 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
957 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
959 int i, a0, b0, a1, b1;\
966 for(i=0; i<h; i+=2){\
972 block[0]= (a1+a0)>>2; /* FIXME non put */\
973 block[1]= (b1+b0)>>2;\
983 block[0]= (a1+a0)>>2;\
984 block[1]= (b1+b0)>>2;\
990 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
993 const uint32_t a= AV_RN32(pixels );\
994 const uint32_t b= AV_RN32(pixels+1);\
995 uint32_t l0= (a&0x03030303UL)\
998 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
999 + ((b&0xFCFCFCFCUL)>>2);\
1003 for(i=0; i<h; i+=2){\
1004 uint32_t a= AV_RN32(pixels );\
1005 uint32_t b= AV_RN32(pixels+1);\
1006 l1= (a&0x03030303UL)\
1007 + (b&0x03030303UL);\
1008 h1= ((a&0xFCFCFCFCUL)>>2)\
1009 + ((b&0xFCFCFCFCUL)>>2);\
1010 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1013 a= AV_RN32(pixels );\
1014 b= AV_RN32(pixels+1);\
1015 l0= (a&0x03030303UL)\
1018 h0= ((a&0xFCFCFCFCUL)>>2)\
1019 + ((b&0xFCFCFCFCUL)>>2);\
1020 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1026 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1029 for(j=0; j<2; j++){\
1031 const uint32_t a= AV_RN32(pixels );\
1032 const uint32_t b= AV_RN32(pixels+1);\
1033 uint32_t l0= (a&0x03030303UL)\
1036 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1037 + ((b&0xFCFCFCFCUL)>>2);\
1041 for(i=0; i<h; i+=2){\
1042 uint32_t a= AV_RN32(pixels );\
1043 uint32_t b= AV_RN32(pixels+1);\
1044 l1= (a&0x03030303UL)\
1045 + (b&0x03030303UL);\
1046 h1= ((a&0xFCFCFCFCUL)>>2)\
1047 + ((b&0xFCFCFCFCUL)>>2);\
1048 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1051 a= AV_RN32(pixels );\
1052 b= AV_RN32(pixels+1);\
1053 l0= (a&0x03030303UL)\
1056 h0= ((a&0xFCFCFCFCUL)>>2)\
1057 + ((b&0xFCFCFCFCUL)>>2);\
1058 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1062 pixels+=4-line_size*(h+1);\
1063 block +=4-line_size*h;\
1067 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1070 for(j=0; j<2; j++){\
1072 const uint32_t a= AV_RN32(pixels );\
1073 const uint32_t b= AV_RN32(pixels+1);\
1074 uint32_t l0= (a&0x03030303UL)\
1077 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1078 + ((b&0xFCFCFCFCUL)>>2);\
1082 for(i=0; i<h; i+=2){\
1083 uint32_t a= AV_RN32(pixels );\
1084 uint32_t b= AV_RN32(pixels+1);\
1085 l1= (a&0x03030303UL)\
1086 + (b&0x03030303UL);\
1087 h1= ((a&0xFCFCFCFCUL)>>2)\
1088 + ((b&0xFCFCFCFCUL)>>2);\
1089 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1092 a= AV_RN32(pixels );\
1093 b= AV_RN32(pixels+1);\
1094 l0= (a&0x03030303UL)\
1097 h0= ((a&0xFCFCFCFCUL)>>2)\
1098 + ((b&0xFCFCFCFCUL)>>2);\
1099 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1103 pixels+=4-line_size*(h+1);\
1104 block +=4-line_size*h;\
1108 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1109 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1110 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1111 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1112 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1113 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1114 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1115 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1117 #define op_avg(a, b) a = rnd_avg32(a, b)
1119 #define op_put(a, b) a = b
1126 #define avg2(a,b) ((a+b+1)>>1)
1127 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Non-rounding average of two 16-wide sources into dst (common stride). */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
/* Non-rounding average of two 8-wide sources into dst (common stride). */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/* One-warp-point GMC: bilinear interpolation of an 8-wide block at
 * 1/16-pel position (x16, y16); A..D are the four bilinear weights
 * (summing to 256), rounder is the rounding bias before the >>8. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/* General GMC with an affine motion field: per pixel, the source position
 * is (vx, vy) in 1/(1<<shift)-pel units, advanced by (dxx, dyx) along x
 * and (dxy, dyy) along y.  Positions outside [0,width]x[0,height] are
 * clamped edge-wise with av_clip; r is the rounding bias. */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: interpolate horizontally only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: interpolate vertically only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* fully outside: nearest clamped sample */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* Third-pel MC, integer position: plain block copy dispatched on width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/* Third-pel MC, dx=1/3: dst = (2*a + b)/3 with rounding; the factor
 * 683/2048 approximates 1/3 (683*3 == 2049). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* Third-pel MC, dx=2/3: dst = (a + 2*b)/3 with rounding (683/2048 ~ 1/3). */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* Third-pel MC, dy=1/3 (vertical): dst = (2*a + below)/3 with rounding. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* Third-pel MC, dx=dy=1/3: bilinear weights (4,3,3,2)/12;
 * 2731/32768 approximates 1/12 (2731*12 == 32772). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, dx=1/3, dy=2/3: bilinear weights (3,2,4,3)/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, dy=2/3 (vertical): dst = (a + 2*below)/3 with rounding. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* Third-pel MC, dx=2/3, dy=1/3: bilinear weights (3,4,2,3)/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* Third-pel MC, dx=dy=2/3: bilinear weights (2,3,3,4)/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Averaging third-pel MC, integer position: average-with-dst copy,
 * dispatched on block width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
case 2: avg_pixels2_c (dst, src, stride, height); break;
case 4: avg_pixels4_c (dst, src, stride, height); break;
case 8: avg_pixels8_c (dst, src, stride, height); break;
case 16:avg_pixels16_c(dst, src, stride, height); break;
/* As put_tpel_pixels_mc10_c (dx=1/3), then averaged with dst, rounding up. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* As put_tpel_pixels_mc20_c (dx=2/3), then averaged with dst, rounding up. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* As put_tpel_pixels_mc01_c (dy=1/3), then averaged with dst, rounding up. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* As put_tpel_pixels_mc11_c (dx=dy=1/3), then averaged with dst, rounding up. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* As put_tpel_pixels_mc12_c (dx=1/3, dy=2/3), then averaged with dst. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* As put_tpel_pixels_mc02_c (dy=2/3), then averaged with dst, rounding up. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* As put_tpel_pixels_mc21_c (dx=2/3, dy=1/3), then averaged with dst. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* As put_tpel_pixels_mc22_c (dx=dy=2/3), then averaged with dst. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
for (i=0; i < height; i++) {
for (j=0; j < width; j++) {
dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* TPEL_WIDTH(width): generate fixed-width third-pel wrappers that
 * forward to the generic put_tpel_pixels_mcXY_c() helpers above.
 *
 * Bug fix: each forwarding statement was written as
 *     void put_tpel_pixels_mcXY_c(dst, src, stride, width, height);
 * The leading 'void' turns the intended CALL into a local function
 * declaration with an identifier list — and after macro expansion the
 * "identifier list" contains the numeric width constant, which is a
 * hard syntax error the moment the macro is instantiated.  Dropping
 * the 'void' makes every generated wrapper perform the actual call. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H.264 chroma motion compensation: 1/8-pel bilinear interpolation for
 * 2-, 4- and 8-pixel-wide blocks.  x,y are the fractional offsets in
 * [0,8); A+B+C+D == 64, the >>6 normalization happens inside OP. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
const int A=(8-x)*(8-y);\
const int B=( x)*(8-y);\
const int C=(8-x)*( y);\
const int D=( x)*( y);\
assert(x<8 && y<8 && x>=0 && y>=0);\
OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
/* 4-wide variant: same bilinear taps over 4 output pixels */\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
const int A=(8-x)*(8-y);\
const int B=( x)*(8-y);\
const int C=(8-x)*( y);\
const int D=( x)*( y);\
assert(x<8 && y<8 && x>=0 && y>=0);\
OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
/* 8-wide variant: same bilinear taps over 8 output pixels */\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
const int A=(8-x)*(8-y);\
const int B=( x)*(8-y);\
const int C=(8-x)*( y);\
const int D=( x)*( y);\
assert(x<8 && y<8 && x>=0 && y>=0);\
OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
/* op_put normalizes the 6-bit-scaled bilinear sum; op_avg additionally
 * averages with the existing dst pixel, rounding up.  Instantiate the
 * put_* and avg_* H.264 chroma MC function families. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)
H264_CHROMA_MC(put_ , op_put)
H264_CHROMA_MC(avg_ , op_avg)
/* No-rounding variant of the 8-wide H.264 chroma MC: identical bilinear
 * taps (A+B+C+D == 64) but with a reduced bias of 32-4 == 28 before the
 * >>6, i.e. it rounds slightly downward compared with the rounded put. */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
const int A=(8-x)*(8-y);
const int B=( x)*(8-y);
const int C=(8-x)*( y);
const int D=( x)*( y);
assert(x<8 && y<8 && x>=0 && y>=0);
dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
/* QPEL_MC: generates the complete MPEG-4 quarter-pel MC function family
 * for one (OPNAME, rounding) combination.
 *   r      - rounding selector baked into the generated names' behavior
 *   OPNAME - prefix of the generated functions (put_, put_no_rnd_, avg_)
 *   RND    - infix selecting the rounded/no-round helper variants
 *   OP     - per-pixel store macro (clips via cm[] and applies put/avg)
 * The lowpass filters use the MPEG-4 (20,-6,3,-1)/32-style tap set with
 * mirrored block edges (note src[8]/src[16] reuse near the borders).
 * mcXY functions: X,Y are the quarter-pel offsets; half-pel planes are
 * built with the lowpass filters and blended with pixels*_l2/_l4. */
#define QPEL_MC(r, OPNAME, RND, OP) \
/* 8-wide horizontal lowpass; edge taps mirrored into the block */\
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* 8-wide vertical lowpass: same taps applied down the columns */\
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
const int src0= src[0*srcStride];\
const int src1= src[1*srcStride];\
const int src2= src[2*srcStride];\
const int src3= src[3*srcStride];\
const int src4= src[4*srcStride];\
const int src5= src[5*srcStride];\
const int src6= src[6*srcStride];\
const int src7= src[7*srcStride];\
const int src8= src[8*srcStride];\
OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* 16-wide horizontal lowpass (same filter, wider block) */\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* 16-wide vertical lowpass */\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
const int src0= src[0*srcStride];\
const int src1= src[1*srcStride];\
const int src2= src[2*srcStride];\
const int src3= src[3*srcStride];\
const int src4= src[4*srcStride];\
const int src5= src[5*srcStride];\
const int src6= src[6*srcStride];\
const int src7= src[7*srcStride];\
const int src8= src[8*srcStride];\
const int src9= src[9*srcStride];\
const int src10= src[10*srcStride];\
const int src11= src[11*srcStride];\
const int src12= src[12*srcStride];\
const int src13= src[13*srcStride];\
const int src14= src[14*srcStride];\
const int src15= src[15*srcStride];\
const int src16= src[16*srcStride];\
OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* 8x8 quarter-pel MC entry points (mcXY = x,y quarter-pel offset) */\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels8_c(dst, src, stride, 8);\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
copy_block9(full, src, 16, stride, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfHV[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfHV[64];\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
uint8_t halfHV[64];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[16*9];\
copy_block9(full, src, 16, stride, 9);\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* 16x16 quarter-pel MC entry points: same scheme with 17-row copies */\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## pixels16_c(dst, src, stride, 16);\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
copy_block17(full, src, 24, stride, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfV[256];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfV[256];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfV[256];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfV[256];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[272];\
uint8_t halfHV[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[272];\
uint8_t halfHV[256];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfV[256];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
uint8_t halfV[256];\
uint8_t halfHV[256];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t full[24*17];\
uint8_t halfH[272];\
copy_block17(full, src, 24, stride, 17);\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
uint8_t halfH[272];\
put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel-combine operators plugged into QPEL_MC.  'b' is a raw filter sum;
 * cm[] is the clipping table (ff_cropTbl + MAX_NEG_CROP) and the +16/+15
 * with >>5 implement round-to-nearest vs. no-rounding before the clamp. */
2011 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2012 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2013 #define op_put(a, b) a = cm[((b) + 16)>>5]
2014 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the MPEG-4 quarter-pel MC function families:
 * put_, put_no_rnd_ and avg_ variants (avg_no_rnd left disabled). */
2016 QPEL_MC(0, put_ , _ , op_put)
2017 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2018 QPEL_MC(0, avg_ , _ , op_avg)
2019 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* NOTE(review): extraction gap — the companion #undef op_avg / #undef op_put
 * lines appear to be missing from this extract. */
2021 #undef op_avg_no_rnd
2023 #undef op_put_no_rnd
/* H264_LOWPASS(OPNAME, OP, OP2): generates the H.264 half-pel interpolation
 * primitives for block widths 2/4/8/16 in three flavours each:
 *   *_h_lowpass  — horizontal 6-tap filter (1,-5,20,20,-5,1),
 *   *_v_lowpass  — vertical   6-tap filter (same coefficients),
 *   *_hv_lowpass — horizontal pass into an int16_t tmp[] buffer, then a
 *                  vertical pass over tmp (OP2 uses the wider >>10 rounding).
 * OP/OP2 are the store macros (op_put/op_avg and op2_put/op2_avg below).
 * NOTE(review): this extract has dropped interior lines (loop headers,
 * `int i;`/`const int h=...;` declarations, stride advances and closing
 * braces) — the embedded original line numbers are discontinuous.  The code
 * below is kept byte-identical to the extract. */
2026 #define H264_LOWPASS(OPNAME, OP, OP2) \
/* 2-wide horizontal half-pel filter (edge positions read src[-2..3]). */\
2027 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2029 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2033 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2034 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
/* 2-wide vertical half-pel filter; srcB/srcA are the two rows above. */\
2040 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2042 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2046 const int srcB= src[-2*srcStride];\
2047 const int srcA= src[-1*srcStride];\
2048 const int src0= src[0 *srcStride];\
2049 const int src1= src[1 *srcStride];\
2050 const int src2= src[2 *srcStride];\
2051 const int src3= src[3 *srcStride];\
2052 const int src4= src[4 *srcStride];\
2053 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2054 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
/* 2-wide H-then-V: horizontal pass over h+5 rows into tmp[], then vertical\
 * pass over tmp with OP2 (full 16-bit precision, rounded by >>10). */\
2060 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2063 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2065 src -= 2*srcStride;\
2066 for(i=0; i<h+5; i++)\
2068 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2069 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2073 tmp -= tmpStride*(h+5-2);\
2076 const int tmpB= tmp[-2*tmpStride];\
2077 const int tmpA= tmp[-1*tmpStride];\
2078 const int tmp0= tmp[0 *tmpStride];\
2079 const int tmp1= tmp[1 *tmpStride];\
2080 const int tmp2= tmp[2 *tmpStride];\
2081 const int tmp3= tmp[3 *tmpStride];\
2082 const int tmp4= tmp[4 *tmpStride];\
2083 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2084 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
/* 4-wide variants of the same three filters. */\
2089 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2091 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2095 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2096 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2097 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2098 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2104 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2106 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2110 const int srcB= src[-2*srcStride];\
2111 const int srcA= src[-1*srcStride];\
2112 const int src0= src[0 *srcStride];\
2113 const int src1= src[1 *srcStride];\
2114 const int src2= src[2 *srcStride];\
2115 const int src3= src[3 *srcStride];\
2116 const int src4= src[4 *srcStride];\
2117 const int src5= src[5 *srcStride];\
2118 const int src6= src[6 *srcStride];\
2119 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2120 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2121 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2122 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2128 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2131 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2133 src -= 2*srcStride;\
2134 for(i=0; i<h+5; i++)\
2136 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2137 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2138 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2139 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2143 tmp -= tmpStride*(h+5-2);\
2146 const int tmpB= tmp[-2*tmpStride];\
2147 const int tmpA= tmp[-1*tmpStride];\
2148 const int tmp0= tmp[0 *tmpStride];\
2149 const int tmp1= tmp[1 *tmpStride];\
2150 const int tmp2= tmp[2 *tmpStride];\
2151 const int tmp3= tmp[3 *tmpStride];\
2152 const int tmp4= tmp[4 *tmpStride];\
2153 const int tmp5= tmp[5 *tmpStride];\
2154 const int tmp6= tmp[6 *tmpStride];\
2155 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2156 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2157 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2158 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
/* 8-wide variants. */\
2164 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2166 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2170 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2171 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2172 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2173 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2174 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2175 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2176 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2177 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2183 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2185 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2189 const int srcB= src[-2*srcStride];\
2190 const int srcA= src[-1*srcStride];\
2191 const int src0= src[0 *srcStride];\
2192 const int src1= src[1 *srcStride];\
2193 const int src2= src[2 *srcStride];\
2194 const int src3= src[3 *srcStride];\
2195 const int src4= src[4 *srcStride];\
2196 const int src5= src[5 *srcStride];\
2197 const int src6= src[6 *srcStride];\
2198 const int src7= src[7 *srcStride];\
2199 const int src8= src[8 *srcStride];\
2200 const int src9= src[9 *srcStride];\
2201 const int src10=src[10*srcStride];\
2202 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2203 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2204 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2205 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2206 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2207 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2208 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2209 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2215 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2218 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2220 src -= 2*srcStride;\
2221 for(i=0; i<h+5; i++)\
2223 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2224 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2225 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2226 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2227 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2228 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2229 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2230 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2234 tmp -= tmpStride*(h+5-2);\
2237 const int tmpB= tmp[-2*tmpStride];\
2238 const int tmpA= tmp[-1*tmpStride];\
2239 const int tmp0= tmp[0 *tmpStride];\
2240 const int tmp1= tmp[1 *tmpStride];\
2241 const int tmp2= tmp[2 *tmpStride];\
2242 const int tmp3= tmp[3 *tmpStride];\
2243 const int tmp4= tmp[4 *tmpStride];\
2244 const int tmp5= tmp[5 *tmpStride];\
2245 const int tmp6= tmp[6 *tmpStride];\
2246 const int tmp7= tmp[7 *tmpStride];\
2247 const int tmp8= tmp[8 *tmpStride];\
2248 const int tmp9= tmp[9 *tmpStride];\
2249 const int tmp10=tmp[10*tmpStride];\
2250 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2251 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2252 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2253 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2254 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2255 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2256 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2257 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16-wide variants: composed as four 8x8 calls (two side by side, then\
 * advance both pointers by 8 rows and repeat). */\
2263 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2264 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2265 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2266 src += 8*srcStride;\
2267 dst += 8*dstStride;\
2268 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2269 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2272 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2273 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2274 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2275 src += 8*srcStride;\
2276 dst += 8*dstStride;\
2277 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2278 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2281 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2282 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2283 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2284 src += 8*srcStride;\
2285 dst += 8*dstStride;\
2286 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2287 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC(OPNAME, SIZE): generates the 16 quarter-pel motion-compensation
 * entry points mcXY, where X/Y are the quarter-pel offsets (0..3) in the
 * horizontal/vertical direction.  Diagonal positions are built by averaging
 * (pixels*_l2) the appropriate half-pel intermediates; copy_block* copies
 * SIZE+5 rows into `full` so the 6-tap vertical filter has its 2-above /
 * 3-below context.  NOTE(review): interior lines (closing braces) are
 * missing from this extract; code kept byte-identical. */
2290 #define H264_MC(OPNAME, SIZE) \
/* mc00: integer-pel position — plain copy/average. */\
2291 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2292 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
/* mc10/mc30: average of src (or src+1) with the horizontal half-pel. */\
2295 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2296 uint8_t half[SIZE*SIZE];\
2297 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2298 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2301 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2302 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2305 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2306 uint8_t half[SIZE*SIZE];\
2307 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2308 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
/* mc01/mc02/mc03: vertical half-pel, averaged with the row above/below. */\
2311 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2312 uint8_t full[SIZE*(SIZE+5)];\
2313 uint8_t * const full_mid= full + SIZE*2;\
2314 uint8_t half[SIZE*SIZE];\
2315 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2316 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2317 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2320 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2321 uint8_t full[SIZE*(SIZE+5)];\
2322 uint8_t * const full_mid= full + SIZE*2;\
2323 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2324 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2327 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2328 uint8_t full[SIZE*(SIZE+5)];\
2329 uint8_t * const full_mid= full + SIZE*2;\
2330 uint8_t half[SIZE*SIZE];\
2331 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2332 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2333 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
/* mc11/mc31/mc13/mc33: diagonal quarter-pel = average of the horizontal\
 * and vertical half-pel planes (vertical taken at the nearer column/row). */\
2336 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2337 uint8_t full[SIZE*(SIZE+5)];\
2338 uint8_t * const full_mid= full + SIZE*2;\
2339 uint8_t halfH[SIZE*SIZE];\
2340 uint8_t halfV[SIZE*SIZE];\
2341 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2342 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2343 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2344 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2347 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2348 uint8_t full[SIZE*(SIZE+5)];\
2349 uint8_t * const full_mid= full + SIZE*2;\
2350 uint8_t halfH[SIZE*SIZE];\
2351 uint8_t halfV[SIZE*SIZE];\
2352 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2353 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2354 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2355 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2358 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2359 uint8_t full[SIZE*(SIZE+5)];\
2360 uint8_t * const full_mid= full + SIZE*2;\
2361 uint8_t halfH[SIZE*SIZE];\
2362 uint8_t halfV[SIZE*SIZE];\
2363 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2364 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2365 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2366 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2369 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2370 uint8_t full[SIZE*(SIZE+5)];\
2371 uint8_t * const full_mid= full + SIZE*2;\
2372 uint8_t halfH[SIZE*SIZE];\
2373 uint8_t halfV[SIZE*SIZE];\
2374 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2375 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2376 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2377 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* mc22: pure HV half-pel (6-tap both directions through int16 tmp). */\
2380 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2381 int16_t tmp[SIZE*(SIZE+5)];\
2382 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
/* mc21/mc23: average of horizontal half-pel and HV half-pel. */\
2385 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2386 int16_t tmp[SIZE*(SIZE+5)];\
2387 uint8_t halfH[SIZE*SIZE];\
2388 uint8_t halfHV[SIZE*SIZE];\
2389 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2390 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2391 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2394 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2395 int16_t tmp[SIZE*(SIZE+5)];\
2396 uint8_t halfH[SIZE*SIZE];\
2397 uint8_t halfHV[SIZE*SIZE];\
2398 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2399 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2400 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
/* mc12/mc32: average of vertical half-pel and HV half-pel. */\
2403 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2404 uint8_t full[SIZE*(SIZE+5)];\
2405 uint8_t * const full_mid= full + SIZE*2;\
2406 int16_t tmp[SIZE*(SIZE+5)];\
2407 uint8_t halfV[SIZE*SIZE];\
2408 uint8_t halfHV[SIZE*SIZE];\
2409 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2410 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2411 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2412 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2415 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2416 uint8_t full[SIZE*(SIZE+5)];\
2417 uint8_t * const full_mid= full + SIZE*2;\
2418 int16_t tmp[SIZE*(SIZE+5)];\
2419 uint8_t halfV[SIZE*SIZE];\
2420 uint8_t halfHV[SIZE*SIZE];\
2421 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2422 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2423 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2424 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store operators for the H.264 lowpass filters: op_* clamp a one-pass
 * 6-tap sum (range needs +16 >>5), op2_* clamp a two-pass sum (+512 >>10). */
2427 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2428 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2429 #define op_put(a, b) a = cm[((b) + 16)>>5]
2430 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2431 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Generate put_ and avg_ variants of every h264_qpel*_lowpass primitive. */
2433 H264_LOWPASS(put_ , op_put, op2_put)
2434 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* H264_WEIGHT(W,H): generates explicit/bi-directional weighted-prediction
 * kernels for WxH blocks.  op_scale1 applies (weight, offset, log2_denom)
 * to one block in place; op_scale2 blends src into dst with two weights.
 * The `if(W==n) continue;` lines make one unrolled row body serve all
 * widths.  NOTE(review): this extract is heavily gapped — the op_scaleN(x)
 * call lines and several declarations/braces are missing; kept byte-identical. */
2449 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2450 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2451 #define H264_WEIGHT(W,H) \
2452 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
/* pre-scale offset and add the rounding term 1<<(log2_denom-1) */ \
2454 offset <<= log2_denom; \
2455 if(log2_denom) offset += 1<<(log2_denom-1); \
2456 for(y=0; y<H; y++, block += stride){ \
2459 if(W==2) continue; \
2462 if(W==4) continue; \
2467 if(W==8) continue; \
2478 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
/* force the rounding offset odd before scaling (bipred rounding rule) */ \
2480 offset = ((offset + 1) | 1) << log2_denom; \
2481 for(y=0; y<H; y++, dst += stride, src += stride){ \
2484 if(W==2) continue; \
2487 if(W==4) continue; \
2492 if(W==8) continue; \
/* WMV2 horizontal half-pel filter: 4-tap (-1,9,9,-1)/16 with rounding,
 * clamped through cm[].  Writes 8 pixels per row; h rows expected.
 * NOTE(review): the row loop and stride-advance lines are missing from
 * this extract (gap between original lines 2520 and 2524). */
2519 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2520 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2524 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2525 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2526 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2527 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2528 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2529 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2530 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2531 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* CAVS integer-pel MC entry points: mc00 is a plain copy/average, so the
 * generic pixels8/16 helpers are reused directly.  The real qpel filters
 * live in cavsdsp; only the init prototype is declared here. */
2537 #ifdef CONFIG_CAVS_DECODER
2539 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2541 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2542 put_pixels8_c(dst, src, stride, 8);
2544 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2545 avg_pixels8_c(dst, src, stride, 8);
2547 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2548 put_pixels16_c(dst, src, stride, 16);
2550 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2551 avg_pixels16_c(dst, src, stride, 16);
2553 #endif /* CONFIG_CAVS_DECODER */
/* VC-1/WMV3 integer-pel MC (rnd parameter unused for the copy case). */
2555 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2557 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2559 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2560 put_pixels8_c(dst, src, stride, 8);
2562 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
/* Init prototypes for DSP sub-modules implemented in other files. */
2564 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2567 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
/* WMV2 vertical half-pel filter: same 4-tap (-1,9,9,-1)/16 kernel applied
 * down each column; writes 8 rows per column, w columns expected.
 * NOTE(review): the column loop and pointer-advance lines are missing from
 * this extract (gaps around original lines 2571-2573 / 2593+). */
2569 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2570 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2574 const int src_1= src[ -srcStride];
2575 const int src0 = src[0 ];
2576 const int src1 = src[ srcStride];
2577 const int src2 = src[2*srcStride];
2578 const int src3 = src[3*srcStride];
2579 const int src4 = src[4*srcStride];
2580 const int src5 = src[5*srcStride];
2581 const int src6 = src[6*srcStride];
2582 const int src7 = src[7*srcStride];
2583 const int src8 = src[8*srcStride];
2584 const int src9 = src[9*srcStride];
2585 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2586 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2587 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2588 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2589 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2590 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2591 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2592 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation entry points (8x8, put only).  Naming
 * follows the mcXY convention: X/Y are the sub-pel offsets.  Half-pel
 * planes come from the wmv2_mspel8_{h,v}_lowpass filters above; diagonal
 * positions average two half-pel planes via put_pixels8_l2.
 * NOTE(review): local buffer declarations (half/halfH/halfV/halfHV) and
 * closing braces are missing from this extract; kept byte-identical. */
2598 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2599 put_pixels8_c(dst, src, stride, 8);
2602 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2604 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2605 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2608 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2609 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2612 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2614 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2615 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2618 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2619 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* mc12/mc32: H filter over 11 rows (extra context), then V filter of both
 * the source column and the H plane, averaged together. */
2622 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2626 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2627 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2628 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2629 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2631 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2635 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2636 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2637 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2638 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2640 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2642 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2643 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking filter across a horizontal block edge (vertical
 * filtering): for each column x, p0..p3 are the two pixels above and two
 * below the edge.  d1 is a piecewise-linear function of the gradient d
 * (strength from ff_h263_loop_filter_strength[qscale]); p1/p2 are adjusted
 * by d1 and p0/p3 by the smaller correction d2.  The (p&256) trick clamps
 * to 0/255 using the sign bit after a possible over/underflow.
 * NOTE(review): loop header, d1 defaults and several assignment lines are
 * missing from this extract; kept byte-identical. */
2646 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2647 if(ENABLE_ANY_H263) {
2649 const int strength= ff_h263_loop_filter_strength[qscale];
2653 int p0= src[x-2*stride];
2654 int p1= src[x-1*stride];
2655 int p2= src[x+0*stride];
2656 int p3= src[x+1*stride];
2657 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2659 if (d<-2*strength) d1= 0;
2660 else if(d<- strength) d1=-2*strength - d;
2661 else if(d< strength) d1= d;
2662 else if(d< 2*strength) d1= 2*strength - d;
/* clamp p1/p2 to [0,255] via the sign/overflow bit */
2667 if(p1&256) p1= ~(p1>>31);
2668 if(p2&256) p2= ~(p2>>31);
2670 src[x-1*stride] = p1;
2671 src[x+0*stride] = p2;
2675 d2= av_clip((p0-p3)/4, -ad1, ad1);
2677 src[x-2*stride] = p0 - d2;
2678 src[x+ stride] = p3 + d2;
/* H.263 deblocking filter across a vertical block edge (horizontal
 * filtering): identical maths to h263_v_loop_filter_c but iterating over
 * rows y, with p0..p3 the two pixels left and right of the edge.
 * NOTE(review): loop header and several lines are missing from this
 * extract; kept byte-identical. */
2683 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2684 if(ENABLE_ANY_H263) {
2686 const int strength= ff_h263_loop_filter_strength[qscale];
2690 int p0= src[y*stride-2];
2691 int p1= src[y*stride-1];
2692 int p2= src[y*stride+0];
2693 int p3= src[y*stride+1];
2694 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2696 if (d<-2*strength) d1= 0;
2697 else if(d<- strength) d1=-2*strength - d;
2698 else if(d< strength) d1= d;
2699 else if(d< 2*strength) d1= 2*strength - d;
/* clamp p1/p2 to [0,255] via the sign/overflow bit */
2704 if(p1&256) p1= ~(p1>>31);
2705 if(p2&256) p2= ~(p2>>31);
2707 src[y*stride-1] = p1;
2708 src[y*stride+0] = p2;
2712 d2= av_clip((p0-p3)/4, -ad1, ad1);
2714 src[y*stride-2] = p0 - d2;
2715 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block via
 * the temp[] scratch buffer — vertical (1,2,1) pass into temp (edge rows
 * copied as 4*src), then horizontal (1,2,1) pass back into src with a
 * combined +8 >>4 rounding.  NOTE(review): the loop headers, temp[]
 * declaration and yz computation are missing from this extract. */
2720 static void h261_loop_filter_c(uint8_t *src, int stride){
2725 temp[x ] = 4*src[x ];
2726 temp[x + 7*8] = 4*src[x + 7*stride];
2730 xy = y * stride + x;
2732 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2737 src[ y*stride] = (temp[ y*8] + 2)>>2;
2738 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2740 xy = y * stride + x;
2742 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 normal (non-intra) luma deblocking: for each of 4 edge segments
 * (tc0[i] per segment) and 4 lines per segment, filter p0/q0 by the
 * clipped delta and conditionally filter p1/q1 when the secondary
 * FFABS(p2-p0)/FFABS(q2-q0) < beta tests pass.  xstride steps across the
 * edge, ystride along it, so one implementation serves both the vertical
 * and horizontal wrappers below.  NOTE(review): the tc/tc0[i]<0 handling
 * and per-line pix advance are missing from this extract. */
2747 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2750 for( i = 0; i < 4; i++ ) {
2755 for( d = 0; d < 4; d++ ) {
2756 const int p0 = pix[-1*xstride];
2757 const int p1 = pix[-2*xstride];
2758 const int p2 = pix[-3*xstride];
2759 const int q0 = pix[0];
2760 const int q1 = pix[1*xstride];
2761 const int q2 = pix[2*xstride];
2763 if( FFABS( p0 - q0 ) < alpha &&
2764 FFABS( p1 - p0 ) < beta &&
2765 FFABS( q1 - q0 ) < beta ) {
2770 if( FFABS( p2 - p0 ) < beta ) {
2771 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2774 if( FFABS( q2 - q0 ) < beta ) {
2775 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2779 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2780 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2781 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
/* Vertical-edge and horizontal-edge wrappers: swap the stride roles. */
2787 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2789 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2791 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2793 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 normal chroma deblocking: 2 lines per tc0 segment (chroma edges
 * are half the luma length); only p0/q0 are modified, by the same clipped
 * delta formula as luma.  NOTE(review): the tc adjustment and pix advance
 * lines are missing from this extract; kept byte-identical. */
2796 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2799 for( i = 0; i < 4; i++ ) {
2800 const int tc = tc0[i];
2805 for( d = 0; d < 2; d++ ) {
2806 const int p0 = pix[-1*xstride];
2807 const int p1 = pix[-2*xstride];
2808 const int q0 = pix[0];
2809 const int q1 = pix[1*xstride];
2811 if( FFABS( p0 - q0 ) < alpha &&
2812 FFABS( p1 - p0 ) < beta &&
2813 FFABS( q1 - q0 ) < beta ) {
2815 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2817 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2818 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
/* Vertical-edge and horizontal-edge wrappers: swap the stride roles. */
2824 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2826 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2828 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2830 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 intra (strong) chroma deblocking: no tc0 clipping — when the
 * alpha/beta edge tests pass, p0/q0 are replaced by the (2,1,1)/4
 * weighted averages directly.  8 lines per edge.
 * NOTE(review): the pix advance line is missing from this extract. */
2833 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2836 for( d = 0; d < 8; d++ ) {
2837 const int p0 = pix[-1*xstride];
2838 const int p1 = pix[-2*xstride];
2839 const int q0 = pix[0];
2840 const int q1 = pix[1*xstride];
2842 if( FFABS( p0 - q0 ) < alpha &&
2843 FFABS( p1 - p0 ) < beta &&
2844 FFABS( q1 - q0 ) < beta ) {
2846 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2847 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
/* Vertical-edge and horizontal-edge wrappers: swap the stride roles. */
2852 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2854 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2856 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2858 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* SAD (sum of absolute differences) of a 16-wide block, h rows, at
 * integer-pel alignment.  One unrolled row is shown; the enclosing row
 * loop, accumulator init and `return s;` are missing from this extract
 * (NOTE(review): kept byte-identical). */
2861 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2867 s += abs(pix1[0] - pix2[0]);
2868 s += abs(pix1[1] - pix2[1]);
2869 s += abs(pix1[2] - pix2[2]);
2870 s += abs(pix1[3] - pix2[3]);
2871 s += abs(pix1[4] - pix2[4]);
2872 s += abs(pix1[5] - pix2[5]);
2873 s += abs(pix1[6] - pix2[6]);
2874 s += abs(pix1[7] - pix2[7]);
2875 s += abs(pix1[8] - pix2[8]);
2876 s += abs(pix1[9] - pix2[9]);
2877 s += abs(pix1[10] - pix2[10]);
2878 s += abs(pix1[11] - pix2[11]);
2879 s += abs(pix1[12] - pix2[12]);
2880 s += abs(pix1[13] - pix2[13]);
2881 s += abs(pix1[14] - pix2[14]);
2882 s += abs(pix1[15] - pix2[15]);
/* SAD of a 16-wide block against the horizontal half-pel reference:
 * pix2 is interpolated on the fly with avg2 of adjacent columns.
 * NOTE(review): row loop / return are missing from this extract. */
2889 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2895 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
2896 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
2897 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
2898 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
2899 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
2900 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
2901 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
2902 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
2903 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
2904 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
2905 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
2906 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
2907 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
2908 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
2909 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
2910 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD of a 16-wide block against the vertical half-pel reference:
 * avg2 of the same column in rows pix2 and pix3 (= pix2 + line_size).
 * NOTE(review): row loop / return are missing from this extract. */
2917 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2920 uint8_t *pix3 = pix2 + line_size;
2924 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
2925 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
2926 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
2927 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
2928 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
2929 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
2930 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
2931 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
2932 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
2933 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
2934 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
2935 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
2936 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
2937 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
2938 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
2939 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD of a 16-wide block against the diagonal half-pel reference:
 * avg4 of the 2x2 neighbourhood spanning rows pix2/pix3 and two columns.
 * NOTE(review): row loop / return are missing from this extract. */
2947 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2950 uint8_t *pix3 = pix2 + line_size;
2954 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
2955 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
2956 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
2957 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
2958 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
2959 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
2960 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
2961 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
2962 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
2963 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
2964 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
2965 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
2966 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
2967 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
2968 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
2969 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide integer-pel SAD (row loop / return missing from this extract). */
2977 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
2983 s += abs(pix1[0] - pix2[0]);
2984 s += abs(pix1[1] - pix2[1]);
2985 s += abs(pix1[2] - pix2[2]);
2986 s += abs(pix1[3] - pix2[3]);
2987 s += abs(pix1[4] - pix2[4]);
2988 s += abs(pix1[5] - pix2[5]);
2989 s += abs(pix1[6] - pix2[6]);
2990 s += abs(pix1[7] - pix2[7]);
/* 8-wide horizontal half-pel SAD, avg2 of adjacent columns
 * (row loop / return missing from this extract). */
2997 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3003 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3004 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3005 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3006 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3007 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3008 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3009 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3010 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3017 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3020 uint8_t *pix3 = pix2 + line_size;
3024 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3025 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3026 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3027 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3028 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3029 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3030 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3031 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3039 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3042 uint8_t *pix3 = pix2 + line_size;
3046 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3047 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3048 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3049 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3050 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3051 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3052 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3053 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* "Noise-preserving" SSE, 16-wide: score1 accumulates plain SSE, score2
 * accumulates the difference of second-order (2x2) gradients between the
 * two blocks, so noise/texture that matches is not penalised.  Final cost
 * weights |score2| by avctx->nsse_weight (or 8 when no context is given).
 * NOTE(review): outer row loop, score1/score2 init and loop closes are
 * elided from this excerpt. */
3061 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3062     MpegEncContext *c = v;  /* may be NULL — see the final weighting below */
3068         for(x=0; x<16; x++){
3069             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3072             for(x=0; x<15; x++){
3073                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3074                                - s1[x+1] + s1[x+1+stride])
3075                         -FFABS(  s2[x  ] - s2[x  +stride]
3076                                - s2[x+1] + s2[x+1+stride]);
3083     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3084     else  return score1 + FFABS(score2)*8;
/* 8-wide variant of the above; same metric, half the block width.
 * NOTE(review): the x-loop headers here are elided in this excerpt. */
3087 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3088     MpegEncContext *c = v;
3095             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3099                 score2+= FFABS(  s1[x  ] - s1[x  +stride]
3100                                - s1[x+1] + s1[x+1+stride])
3101                         -FFABS(  s2[x  ] - s2[x  +stride]
3102                                - s2[x+1] + s2[x+1+stride]);
3109     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3110     else  return score1 + FFABS(score2)*8;
/* Estimate the weighted squared error of adding `basis*scale` (rescaled
 * from BASIS_SHIFT to RECON_SHIFT precision, with rounding) to the residual
 * `rem`, without modifying it.  Used by trellis/basis search in the encoder.
 * NOTE(review): sum init, the weight load `w` and the return are elided. */
3113 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3117     for(i=0; i<8*8; i++){
3118         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3121         assert(-512<b && b<512);  /* rescaled residual must stay in 10 bits */
3123         sum += (w*b)*(w*b)>>4;
/* Commit the same rescaled basis contribution into `rem` in place. */
3128 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3131     for(i=0; i<8*8; i++){
3132         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3137  * permutes an 8x8 block.
3138  * @param block the block which will be permuted according to the given permutation vector
3139  * @param permutation the permutation vector
3140  * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3141  * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3142  * (inverse) permutated to scantable order!
3144 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3150 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* First pass: copy the nonzero coefficients (in scan order) aside.
 * NOTE(review): the temp-array declaration, the copy/clear statements and
 * the loop closes are elided from this excerpt. */
3152     for(i=0; i<=last; i++){
3153         const int j= scantable[i];
/* Second pass: write each saved coefficient back at its permuted position. */
3158     for(i=0; i<=last; i++){
3159         const int j= scantable[i];
3160         const int perm_j= permutation[j];
3161         block[perm_j]= temp[j];
/* Dummy comparison function; presumably always returns 0 (body elided) —
 * used so a cmp slot is never a NULL pointer. TODO confirm against full source. */
3165 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fill the cmp[0..4] function-pointer array from DSPContext according to the
 * FF_CMP_* `type` requested by the user.
 * NOTE(review): the switch statement, most case labels and the loop over i
 * are elided; only representative assignments are visible. */
3169 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3172     memset(cmp, 0, sizeof(void*)*5);  /* default all slots to NULL first */
3180             cmp[i]= c->hadamard8_diff[i];
3186             cmp[i]= c->dct_sad[i];
3189             cmp[i]= c->dct264_sad[i];
3192             cmp[i]= c->dct_max[i];
3195             cmp[i]= c->quant_psnr[i];
3215 #ifdef CONFIG_SNOW_ENCODER
3224         av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Zero all six 8x8 coefficient blocks of a macroblock, i.e. */
3230  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3232 static void clear_blocks_c(DCTELEM *blocks)
3234     memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for w bytes; main loop unrolled by 8, with a scalar
 * tail loop for the remaining 0..7 bytes (tail loop header elided here). */
3237 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3239     for(i=0; i+7<w; i+=8){
3240         dst[i+0] += src[i+0];
3241         dst[i+1] += src[i+1];
3242         dst[i+2] += src[i+2];
3243         dst[i+3] += src[i+3];
3244         dst[i+4] += src[i+4];
3245         dst[i+5] += src[i+5];
3246         dst[i+6] += src[i+6];
3247         dst[i+7] += src[i+7];
3250         dst[i+0] += src[i+0];  /* scalar tail */
/* dst[i] = src1[i] - src2[i] for w bytes; same 8x unroll + scalar tail
 * structure as add_bytes_c (tail loop header elided here). */
3253 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3255     for(i=0; i+7<w; i+=8){
3256         dst[i+0] = src1[i+0]-src2[i+0];
3257         dst[i+1] = src1[i+1]-src2[i+1];
3258         dst[i+2] = src1[i+2]-src2[i+2];
3259         dst[i+3] = src1[i+3]-src2[i+3];
3260         dst[i+4] = src1[i+4]-src2[i+4];
3261         dst[i+5] = src1[i+5]-src2[i+5];
3262         dst[i+6] = src1[i+6]-src2[i+6];
3263         dst[i+7] = src1[i+7]-src2[i+7];
3266         dst[i+0] = src1[i+0]-src2[i+0];  /* scalar tail */
/* HuffYUV median predictor subtraction: for each byte, subtract the median
 * of (left, above, left+above-above_left), masked to 8 bits.
 * NOTE(review): loop header, dst store, and the l/lt state updates and
 * write-back through *left / *left_top are elided from this excerpt. */
3269 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3277         const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Hadamard butterfly helpers. NOTE(review): the multi-line bodies of
 * BUTTERFLY2/BUTTERFLY1 are elided; only BUTTERFLYA is fully visible. */
3287 #define BUTTERFLY2(o1,o2,i1,i2) \
3291 #define BUTTERFLY1(x,y) \
3300 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the difference (src - dst), then sum of
 * absolute transformed coefficients.  First the horizontal passes per row,
 * then the vertical passes per column, accumulating |.| via BUTTERFLYA.
 * NOTE(review): temp[] declaration, loop headers, sum accumulation
 * statement and return are elided from this excerpt. */
3302 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3310         //FIXME try pointer walks
3311         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3312         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3313         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3314         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3316         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3317         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3318         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3319         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3321         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3322         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3323         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3324         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Vertical (column) passes follow. */
3328         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3329         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3330         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3331         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3333         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3334         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3335         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3336         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3339             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3340             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3341             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3342             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3348         printf("MAX:%d\n", maxi);  /* debug output in an elided #if 0-style block, presumably */
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but on the source
 * samples directly (no reference block), and with the DC term removed at
 * the end so the score measures AC energy only.
 * NOTE(review): temp[] declaration, loop headers, sum accumulation and
 * return are elided from this excerpt. */
3354 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3362         //FIXME try pointer walks
3363         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3364         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3365         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3366         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3368         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3369         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3370         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3371         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3373         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3374         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3375         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3376         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Vertical (column) passes follow. */
3380         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3381         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3382         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3383         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3385         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3386         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3387         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3388         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3391             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3392             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3393             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3394             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3397     sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-DCT the 8x8 pixel difference, then return the
 * sum of absolute coefficients.  The fdct call between diff_pixels and
 * sum_abs_dctelem is elided from this excerpt. */
3402 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3403     MpegEncContext * const s= (MpegEncContext *)c;
3404     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);  /* 16-byte aligned scratch for SIMD fdct */
3405     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3409     s->dsp.diff_pixels(temp, src1, src2, stride);
3411     return s->dsp.sum_abs_dctelem(temp);
/* Interior of an 8-point 1-D integer transform macro in the H.264 8x8 style
 * (sums/differences of mirrored pairs, then >>1 / >>2 scaled combinations).
 * NOTE(review): the `#define ... \` header line, DST(0,...), DST(4,...) and
 * the closing line are elided; as a macro every line ends in `\`, so no
 * comments are inserted between the continuation lines below. */
3416     const int s07 = SRC(0) + SRC(7);\
3417     const int s16 = SRC(1) + SRC(6);\
3418     const int s25 = SRC(2) + SRC(5);\
3419     const int s34 = SRC(3) + SRC(4);\
3420     const int a0 = s07 + s34;\
3421     const int a1 = s16 + s25;\
3422     const int a2 = s07 - s34;\
3423     const int a3 = s16 - s25;\
3424     const int d07 = SRC(0) - SRC(7);\
3425     const int d16 = SRC(1) - SRC(6);\
3426     const int d25 = SRC(2) - SRC(5);\
3427     const int d34 = SRC(3) - SRC(4);\
3428     const int a4 = d16 + d25 + (d07 + (d07>>1));\
3429     const int a5 = d07 - d34 - (d25 + (d25>>1));\
3430     const int a6 = d07 + d34 - (d16 + (d16>>1));\
3431     const int a7 = d16 - d25 + (d34 + (d34>>1));\
3433     DST(1,  a4 + (a7>>2)) ;\
3434     DST(2,  a2 + (a3>>1)) ;\
3435     DST(3,  a5 + (a6>>2)) ;\
3437     DST(5,  a6 - (a5>>2)) ;\
3438     DST(6, (a2>>1) - a3 ) ;\
3439     DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: apply the 8-point 1-D transform macro to the rows,
 * then to the columns with DST redefined to accumulate |v| into sum.
 * NOTE(review): dct[][] declaration, sum init, the two DCT8_1D invocation
 * lines, #undef lines and the return are elided from this excerpt. */
3442 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3443     MpegEncContext * const s= (MpegEncContext *)c;
3448     s->dsp.diff_pixels(dct[0], src1, src2, stride);
/* Row pass: read and write dct[i][x] in place. */
3450 #define SRC(x) dct[i][x]
3451 #define DST(x,v) dct[i][x]= v
3452     for( i = 0; i < 8; i++ )
/* Column pass: read dct[x][i], fold |v| into the score instead of storing. */
3457 #define SRC(x) dct[x][i]
3458 #define DST(x,v) sum += FFABS(v)
3459     for( i = 0; i < 8; i++ )
/* DCT-domain max metric: forward-DCT the 8x8 difference, return the largest
 * |coefficient|.  The fdct call and the i-loop header are elided here. */
3467 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3468     MpegEncContext * const s= (MpegEncContext *)c;
3469     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);  /* 8-byte aligned DCT scratch */
3470     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3475     s->dsp.diff_pixels(temp, src1, src2, stride);
3479         sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-error metric: DCT the difference, keep a copy, quantize +
 * dequantize + IDCT it, then return the squared error between the
 * round-tripped coefficients and the saved originals.
 * NOTE(review): fdct call, qscale clamping, the sum loop header and return
 * are elided from this excerpt. */
3484 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3485     MpegEncContext * const s= (MpegEncContext *)c;
3486     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);  /* two 64-coeff blocks: temp + bak */
3487     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3488     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3494     s->dsp.diff_pixels(temp, src1, src2, stride);
3496     memcpy(bak, temp, 64*sizeof(DCTELEM));  /* keep pristine coefficients for comparison */
3498     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3499     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3500     simple_idct(temp); //FIXME
3503         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for one 8x8 block: quantize the DCT of the
 * difference, count the VLC bits needed to code the coefficients, then
 * reconstruct into a saved copy of src2 and measure SSE against src1.
 * Returns distortion + lambda-scaled bits.
 * NOTE(review): several lines (sum/bits init, intra/inter branch headers,
 * run/level bookkeeping, esc handling, loop closes) are elided; also note
 * the historical variable-name typo "distoration" (distortion). */
3508 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3509     MpegEncContext * const s= (MpegEncContext *)c;
3510     const uint8_t *scantable= s->intra_scantable.permutated;
3511     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3512     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);  /* VLA scratch: 8 rows of the reference block */
3513     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3514     uint8_t * const bak= (uint8_t*)aligned_bak;
3515     int i, last, run, bits, level, distoration, start_i;
3516     const int esc_length= s->ac_esc_length;
3518     uint8_t * last_length;
/* Save the 8x8 reference area (two 32-bit copies per row) so we can
 * reconstruct into it without touching the caller's buffer. */
3523         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3524         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3527     s->dsp.diff_pixels(temp, src1, src2, stride);
3529     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* Intra blocks: count the luma DC bits separately with its own table. */
3535         length     = s->intra_ac_vlc_length;
3536         last_length= s->intra_ac_vlc_last_length;
3537         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3540         length     = s->inter_ac_vlc_length;
3541         last_length= s->inter_ac_vlc_last_length;
/* Count VLC bits for all coefficients before the last one... */
3546         for(i=start_i; i<last; i++){
3547             int j= scantable[i];
3552                 if((level&(~127)) == 0){
3553                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* ...then the last coefficient with the dedicated "last" table. */
3562         level= temp[i] + 64;
3566         if((level&(~127)) == 0){
3567             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3575             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3577             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3580     s->dsp.idct_add(bak, stride, temp);  /* reconstruct into the saved copy */
3582     distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3584     return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);  /* D + lambda*R, lambda ~ qscale^2*109/128 */
/* Rate-only metric: same quantize-and-count-VLC-bits path as rd8x8_c but
 * without reconstruction/distortion — returns only the estimated bit cost.
 * NOTE(review): bits init, esc handling, loop closes and the final return
 * are elided from this excerpt. */
3587 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3588     MpegEncContext * const s= (MpegEncContext *)c;
3589     const uint8_t *scantable= s->intra_scantable.permutated;
3590     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3591     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3592     int i, last, run, bits, level, start_i;
3593     const int esc_length= s->ac_esc_length;
3595     uint8_t * last_length;
3599     s->dsp.diff_pixels(temp, src1, src2, stride);
3601     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3607         length     = s->intra_ac_vlc_length;
3608         last_length= s->intra_ac_vlc_last_length;
3609         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3612         length     = s->inter_ac_vlc_length;
3613         last_length= s->inter_ac_vlc_last_length;
3618         for(i=start_i; i<last; i++){
3619             int j= scantable[i];
3624                 if((level&(~127)) == 0){
3625                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3634         level= temp[i] + 64;
3638         if((level&(~127)) == 0){
3639             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-gradient metrics: vsad* sum |row - next row|, vsse* sum the
 * squares; the *_intra variants operate on one block, the plain variants
 * on the difference of two blocks.  NOTE(review): each function below is
 * missing its score init, outer y loop, pointer advances and return. */
3647 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3652         for(x=0; x<16; x+=4){
3653             score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])
3654                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
/* SAD of vertical gradients between two blocks. */
3662 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3667         for(x=0; x<16; x++){
3668             score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
3677 #define SQ(a) ((a)*(a))
3678 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3683         for(x=0; x<16; x+=4){
3684             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
3685                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
3693 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3698         for(x=0; x<16; x++){
3699             score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
/* Squared error between an int8 and an int16 vector of `size` elements. */
3708 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3712     for(i=0; i<size; i++)
3713         score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Instantiate 16x16 comparison functions from the 8x8 ones via the
 * WARPER8_16_SQ wrapper macro (defined elsewhere in this file; "WARPER"
 * is a historical misspelling of "wrapper" kept for compatibility). */
3717 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3718 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3719 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3721 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3723 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3724 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3725 WARPER8_16_SQ(rd8x8_c, rd16_c)
3726 WARPER8_16_SQ(bit8x8_c, bit16_c)
/* Elementwise float-vector helpers.  NOTE(review): several body lines are
 * elided in this excerpt (the dst[i]*=src[i] line of vector_fmul_c, the
 * src1 pointer setup of the reverse variant, clamping branches of
 * float_to_int16). */
3728 static void vector_fmul_c(float *dst, const float *src, int len){
3730     for(i=0; i<len; i++)
/* Multiply src0 by src1 read backwards; src1 is presumably advanced to its
 * last element before the loop (setup line elided) — TODO confirm. */
3734 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3737     for(i=0; i<len; i++)
3738         dst[i] = src0[i] * src1[-i];
/* dst[i*step] = src0[i]*src1[i] + src2[i] + src3 (note src3 is a scalar int bias). */
3741 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3743     for(i=0; i<len; i++)
3744         dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
/* Float -> int16 conversion by reinterpreting the IEEE-754 bits and
 * range-checking against the 0x43c0ffff magic (bit-trick clamping).
 * WARNING: the (int32_t*) cast of a float buffer violates strict aliasing. */
3747 void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
3749     for(i=0; i<len; i++) {
3750         int_fast32_t tmp = ((int32_t*)src)[i];
3752             tmp = (0x43c0ffff - tmp)>>31;  /* 0 or -1 depending on overflow */
3753             // is this faster on some gcc/cpu combinations?
3754             // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3757             dst[i] = tmp - 0x8000;
/* Fixed-point IDCT cosine constants: round(2048*sqrt(2)*cos(k*pi/16)).
 * W0 (used as 2048 in the IDCT below) is defined on an elided line. */
3762 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3763 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3764 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3765 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3766 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3767 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3768 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
/* One row of the WMV2 8-point fixed-point IDCT: odd/even butterfly stages
 * (a1..a7 from W constants), the sqrt(1/2) stage via 181/256, then output
 * with +128 rounding and >>8 normalization.
 * NOTE(review): the s1/s2 declaration line is elided from this excerpt. */
3770 static void wmv2_idct_row(short * b)
3773     int a0,a1,a2,a3,a4,a5,a6,a7;
3775     a1 = W1*b[1]+W7*b[7];
3776     a7 = W7*b[1]-W1*b[7];
3777     a5 = W5*b[5]+W3*b[3];
3778     a3 = W3*b[5]-W5*b[3];
3779     a2 = W2*b[2]+W6*b[6];
3780     a6 = W6*b[2]-W2*b[6];
3781     a0 = W0*b[0]+W0*b[4];
3782     a4 = W0*b[0]-W0*b[4];
3784     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3785     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3787     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3788     b[1] = (a4+a6 +s1   + (1<<7))>>8;
3789     b[2] = (a4-a6 +s2   + (1<<7))>>8;
3790     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3791     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3792     b[5] = (a4-a6 -s2   + (1<<7))>>8;
3793     b[6] = (a4+a6 -s1   + (1<<7))>>8;
3794     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One column of the WMV2 IDCT (stride 8): same butterfly structure as the
 * row pass but with +4>>3 pre-rounding for extended intermediate precision
 * and a final +2^13 >> 14 normalization.
 * NOTE(review): the s1/s2 declaration line is elided from this excerpt. */
3796 static void wmv2_idct_col(short * b)
3799     int a0,a1,a2,a3,a4,a5,a6,a7;
3800     /*step 1, with extended precision*/
3801     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3802     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3803     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3804     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3805     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3806     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3807     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
3808     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
3810     s1 = (181*(a1-a5+a7-a3)+128)>>8;
3811     s2 = (181*(a1-a5-a7+a3)+128)>>8;
3813     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3814     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
3815     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
3816     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3818     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3819     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
3820     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
3821     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 8x8 WMV2 IDCT: row pass over all 8 rows, then column pass over all
 * 8 columns.  NOTE(review): the two loop headers are elided here. */
3823 void ff_wmv2_idct_c(short * block){
3827         wmv2_idct_row(block+i);
3830         wmv2_idct_col(block+i);
3833 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* IDCT + store wrappers: _put_ overwrites dest with clamped samples,
 * _add_ adds the clamped residual to dest. */
3835 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
3837     ff_wmv2_idct_c(block);
3838     put_pixels_clamped_c(block, dest, line_size);
3840 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
3842     ff_wmv2_idct_c(block);
3843     add_pixels_clamped_c(block, dest, line_size);
/* Reference (jrevdct) IDCT wrappers; the j_rev_dct*() calls themselves are
 * on elided lines between each header and the clamped store. */
3845 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3848     put_pixels_clamped_c(block, dest, line_size);
3850 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3853     add_pixels_clamped_c(block, dest, line_size);
/* 4x4 / 2x2 / 1x1 lowres variants. */
3856 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3859     put_pixels_clamped4_c(block, dest, line_size);
3861 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3864     add_pixels_clamped4_c(block, dest, line_size);
3867 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3870     put_pixels_clamped2_c(block, dest, line_size);
3872 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3875     add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 "IDCT": only the DC term survives; scale, round and clamp it. */
3878 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3880     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;  /* clamp-to-[0,255] lookup table */
3882     dest[0] = cm[(block[0] + 4)>>3];
3884 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3886     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3888     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3891 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
3893 /* init static data */
/* One-time init of the global lookup tables: ff_cropTbl (clamp-to-byte),
 * ff_squareTbl ((i-256)^2), and inv_zigzag_direct16.
 * NOTE(review): the ff_cropTbl[i]=0 line of the negative-crop loop and the
 * loop closes are elided from this excerpt. */
3894 void dsputil_static_init(void)
3898     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3899     for(i=0;i<MAX_NEG_CROP;i++) {
3901         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;  /* values above 255 clamp to 255 */
3904     for(i=0;i<512;i++) {
3905         ff_squareTbl[i] = (i - 256) * (i - 256);
3908     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;  /* 1-based inverse scan */
/* Verify that the compiler honours 16-byte stack alignment (required by
 * SSE/AltiVec code paths); logs a one-time warning when it does not.
 * NOTE(review): the did_fail guard/set, #endif and return are elided. */
3911 int ff_check_alignment(void){
3912     static int did_fail=0;  /* warn only once */
3913     DECLARE_ALIGNED_16(int, aligned);
3915     if((long)&aligned & 15){  /* address not 16-byte aligned -> miscompiled stack */
3917 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3918         av_log(NULL, AV_LOG_ERROR,
3919             "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3920             "and may be very slow or crash. This is not a bug in libavcodec,\n"
3921             "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
3922             "Do not report crashes to FFmpeg developers.\n");
3931 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3935 ff_check_alignment();
3937 #ifdef CONFIG_ENCODERS
3938 if(avctx->dct_algo==FF_DCT_FASTINT) {
3939 c->fdct = fdct_ifast;
3940 c->fdct248 = fdct_ifast248;
3942 else if(avctx->dct_algo==FF_DCT_FAAN) {
3943 c->fdct = ff_faandct;
3944 c->fdct248 = ff_faandct248;
3947 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3948 c->fdct248 = ff_fdct248_islow;
3950 #endif //CONFIG_ENCODERS
3952 if(avctx->lowres==1){
3953 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
3954 c->idct_put= ff_jref_idct4_put;
3955 c->idct_add= ff_jref_idct4_add;
3957 c->idct_put= ff_h264_lowres_idct_put_c;
3958 c->idct_add= ff_h264_lowres_idct_add_c;
3960 c->idct = j_rev_dct4;
3961 c->idct_permutation_type= FF_NO_IDCT_PERM;
3962 }else if(avctx->lowres==2){
3963 c->idct_put= ff_jref_idct2_put;
3964 c->idct_add= ff_jref_idct2_add;
3965 c->idct = j_rev_dct2;
3966 c->idct_permutation_type= FF_NO_IDCT_PERM;
3967 }else if(avctx->lowres==3){
3968 c->idct_put= ff_jref_idct1_put;
3969 c->idct_add= ff_jref_idct1_add;
3970 c->idct = j_rev_dct1;
3971 c->idct_permutation_type= FF_NO_IDCT_PERM;
3973 if(avctx->idct_algo==FF_IDCT_INT){
3974 c->idct_put= ff_jref_idct_put;
3975 c->idct_add= ff_jref_idct_add;
3976 c->idct = j_rev_dct;
3977 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
3978 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
3979 avctx->idct_algo==FF_IDCT_VP3){
3980 c->idct_put= ff_vp3_idct_put_c;
3981 c->idct_add= ff_vp3_idct_add_c;
3982 c->idct = ff_vp3_idct_c;
3983 c->idct_permutation_type= FF_NO_IDCT_PERM;
3984 }else if(avctx->idct_algo==FF_IDCT_WMV2){
3985 c->idct_put= ff_wmv2_idct_put_c;
3986 c->idct_add= ff_wmv2_idct_add_c;
3987 c->idct = ff_wmv2_idct_c;
3988 c->idct_permutation_type= FF_NO_IDCT_PERM;
3989 }else{ //accurate/default
3990 c->idct_put= simple_idct_put;
3991 c->idct_add= simple_idct_add;
3992 c->idct = simple_idct;
3993 c->idct_permutation_type= FF_NO_IDCT_PERM;
3997 if (ENABLE_H264_DECODER) {
3998 c->h264_idct_add= ff_h264_idct_add_c;
3999 c->h264_idct8_add= ff_h264_idct8_add_c;
4000 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4001 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4004 c->get_pixels = get_pixels_c;
4005 c->diff_pixels = diff_pixels_c;
4006 c->put_pixels_clamped = put_pixels_clamped_c;
4007 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4008 c->add_pixels_clamped = add_pixels_clamped_c;
4009 c->add_pixels8 = add_pixels8_c;
4010 c->add_pixels4 = add_pixels4_c;
4011 c->sum_abs_dctelem = sum_abs_dctelem_c;
4014 c->clear_blocks = clear_blocks_c;
4015 c->pix_sum = pix_sum_c;
4016 c->pix_norm1 = pix_norm1_c;
4018 /* TODO [0] 16 [1] 8 */
4019 c->pix_abs[0][0] = pix_abs16_c;
4020 c->pix_abs[0][1] = pix_abs16_x2_c;
4021 c->pix_abs[0][2] = pix_abs16_y2_c;
4022 c->pix_abs[0][3] = pix_abs16_xy2_c;
4023 c->pix_abs[1][0] = pix_abs8_c;
4024 c->pix_abs[1][1] = pix_abs8_x2_c;
4025 c->pix_abs[1][2] = pix_abs8_y2_c;
4026 c->pix_abs[1][3] = pix_abs8_xy2_c;
4028 #define dspfunc(PFX, IDX, NUM) \
4029 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4030 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4031 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4032 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4034 dspfunc(put, 0, 16);
4035 dspfunc(put_no_rnd, 0, 16);
4037 dspfunc(put_no_rnd, 1, 8);
4041 dspfunc(avg, 0, 16);
4042 dspfunc(avg_no_rnd, 0, 16);
4044 dspfunc(avg_no_rnd, 1, 8);
4049 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4050 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4052 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4053 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4054 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4055 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4056 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4057 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4058 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4059 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4060 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4062 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4063 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4064 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4065 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4066 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4067 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4068 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4069 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4070 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4072 #define dspfunc(PFX, IDX, NUM) \
4073 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4074 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4075 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4076 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4077 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4078 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4079 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4080 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4081 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4082 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4083 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4084 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4085 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4086 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4087 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4088 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4090 dspfunc(put_qpel, 0, 16);
4091 dspfunc(put_no_rnd_qpel, 0, 16);
4093 dspfunc(avg_qpel, 0, 16);
4094 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4096 dspfunc(put_qpel, 1, 8);
4097 dspfunc(put_no_rnd_qpel, 1, 8);
4099 dspfunc(avg_qpel, 1, 8);
4100 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4102 dspfunc(put_h264_qpel, 0, 16);
4103 dspfunc(put_h264_qpel, 1, 8);
4104 dspfunc(put_h264_qpel, 2, 4);
4105 dspfunc(put_h264_qpel, 3, 2);
4106 dspfunc(avg_h264_qpel, 0, 16);
4107 dspfunc(avg_h264_qpel, 1, 8);
4108 dspfunc(avg_h264_qpel, 2, 4);
4111 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4112 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4113 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4114 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4115 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4116 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4117 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4119 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4120 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4121 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4122 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4123 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4124 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4125 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4126 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4127 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4128 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4129 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4130 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4131 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4132 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4133 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4134 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4135 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4136 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4137 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4138 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4140 #ifdef CONFIG_CAVS_DECODER
4141 ff_cavsdsp_init(c,avctx);
4143 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4144 ff_vc1dsp_init(c,avctx);
4146 #if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4147 ff_intrax8dsp_init(c,avctx);
4149 #if defined(CONFIG_H264_ENCODER)
4150 ff_h264dspenc_init(c,avctx);
4153 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4154 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4155 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4156 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4157 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4158 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4159 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4160 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4162 #define SET_CMP_FUNC(name) \
4163 c->name[0]= name ## 16_c;\
4164 c->name[1]= name ## 8x8_c;
4166 SET_CMP_FUNC(hadamard8_diff)
4167 c->hadamard8_diff[4]= hadamard8_intra16_c;
4168 SET_CMP_FUNC(dct_sad)
4169 SET_CMP_FUNC(dct_max)
4171 SET_CMP_FUNC(dct264_sad)
4173 c->sad[0]= pix_abs16_c;
4174 c->sad[1]= pix_abs8_c;
4178 SET_CMP_FUNC(quant_psnr)
4181 c->vsad[0]= vsad16_c;
4182 c->vsad[4]= vsad_intra16_c;
4183 c->vsse[0]= vsse16_c;
4184 c->vsse[4]= vsse_intra16_c;
4185 c->nsse[0]= nsse16_c;
4186 c->nsse[1]= nsse8_c;
4187 #ifdef CONFIG_SNOW_ENCODER
4188 c->w53[0]= w53_16_c;
4190 c->w97[0]= w97_16_c;
4194 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4196 c->add_bytes= add_bytes_c;
4197 c->diff_bytes= diff_bytes_c;
4198 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4199 c->bswap_buf= bswap_buf;
4201 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4202 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4203 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4204 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4205 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4206 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4207 c->h264_loop_filter_strength= NULL;
4209 if (ENABLE_ANY_H263) {
4210 c->h263_h_loop_filter= h263_h_loop_filter_c;
4211 c->h263_v_loop_filter= h263_v_loop_filter_c;
4214 c->h261_loop_filter= h261_loop_filter_c;
4216 c->try_8x8basis= try_8x8basis_c;
4217 c->add_8x8basis= add_8x8basis_c;
4219 #ifdef CONFIG_SNOW_DECODER
4220 c->vertical_compose97i = ff_snow_vertical_compose97i;
4221 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4222 c->inner_add_yblock = ff_snow_inner_add_yblock;
4225 #ifdef CONFIG_VORBIS_DECODER
4226 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4228 #ifdef CONFIG_FLAC_ENCODER
4229 c->flac_compute_autocorr = ff_flac_compute_autocorr;
4231 c->vector_fmul = vector_fmul_c;
4232 c->vector_fmul_reverse = vector_fmul_reverse_c;
4233 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4234 c->float_to_int16 = ff_float_to_int16_c;
4236 c->shrink[0]= ff_img_copy_plane;
4237 c->shrink[1]= ff_shrink22;
4238 c->shrink[2]= ff_shrink44;
4239 c->shrink[3]= ff_shrink88;
4241 c->prefetch= just_return;
4243 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4244 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4246 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4247 if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
4248 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4249 if (ENABLE_VIS) dsputil_init_vis (c, avctx);
4250 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4251 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4252 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4253 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4254 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
4256 for(i=0; i<64; i++){
4257 if(!c->put_2tap_qpel_pixels_tab[0][i])
4258 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4259 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4260 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4263 switch(c->idct_permutation_type){
4264 case FF_NO_IDCT_PERM:
4266 c->idct_permutation[i]= i;
4268 case FF_LIBMPEG2_IDCT_PERM:
4270 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4272 case FF_SIMPLE_IDCT_PERM:
4274 c->idct_permutation[i]= simple_mmx_permutation[i];
4276 case FF_TRANSPOSE_IDCT_PERM:
4278 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4280 case FF_PARTTRANS_IDCT_PERM:
4282 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4285 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");