3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "mpegvideo.h"
33 #include "simple_idct.h"
39 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
42 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
45 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
/* Runtime-initialized lookup tables (filled elsewhere at init time):
 * ff_cropTbl is a clamp table indexed with a +MAX_NEG_CROP bias, and
 * ff_squareTbl is a 512-entry table indexed with a +256 bias so that
 * negative byte differences index correctly (see sse*_c / pix_norm1_c). */
47 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
48 uint32_t ff_squareTbl[512] = {0, };
/* Classic 8x8 zigzag scan: maps scan position -> raster-order coefficient index. */
50 const uint8_t ff_zigzag_direct[64] = {
51 0, 1, 8, 16, 9, 2, 3, 10,
52 17, 24, 32, 25, 18, 11, 4, 5,
53 12, 19, 26, 33, 40, 48, 41, 34,
54 27, 20, 13, 6, 7, 14, 21, 28,
55 35, 42, 49, 56, 57, 50, 43, 36,
56 29, 22, 15, 23, 30, 37, 44, 51,
57 58, 59, 52, 45, 38, 31, 39, 46,
58 53, 60, 61, 54, 47, 55, 62, 63
61 /* Specific zigzag scan for the 2-4-8 (248) IDCT. NOTE that unlike the
62 specification, we interleave the two fields line by line. */
63 const uint8_t ff_zigzag248_direct[64] = {
64 0, 8, 1, 9, 16, 24, 2, 10,
65 17, 25, 32, 40, 48, 56, 33, 41,
66 18, 26, 3, 11, 4, 12, 19, 27,
67 34, 42, 49, 57, 50, 58, 35, 43,
68 20, 28, 5, 13, 6, 14, 21, 29,
69 36, 44, 51, 59, 52, 60, 37, 45,
70 22, 30, 7, 15, 23, 31, 38, 46,
71 53, 61, 54, 62, 39, 47, 55, 63,
74 /* not permutated inverse zigzag_direct + 1 for the MMX quantizer;
   zero-initialized here and filled in at runtime */
75 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate scan pattern with horizontal priority (scan position -> coefficient index). */
77 const uint8_t ff_alternate_horizontal_scan[64] = {
78 0, 1, 2, 3, 8, 9, 16, 17,
79 10, 11, 4, 5, 6, 7, 15, 14,
80 13, 12, 19, 18, 24, 25, 32, 33,
81 26, 27, 20, 21, 22, 23, 28, 29,
82 30, 31, 34, 35, 40, 41, 48, 49,
83 42, 43, 36, 37, 38, 39, 44, 45,
84 46, 47, 50, 51, 56, 57, 58, 59,
85 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate scan pattern with vertical priority (scan position -> coefficient index). */
88 const uint8_t ff_alternate_vertical_scan[64] = {
89 0, 8, 16, 24, 1, 9, 2, 10,
90 17, 25, 32, 40, 48, 56, 57, 49,
91 41, 33, 26, 18, 3, 11, 4, 12,
92 19, 27, 34, 42, 50, 58, 35, 43,
93 51, 59, 20, 28, 5, 13, 6, 14,
94 21, 29, 36, 44, 52, 60, 37, 45,
95 53, 61, 22, 30, 7, 15, 23, 31,
96 38, 46, 54, 62, 39, 47, 55, 63,
99 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Reciprocal table: lets division by a small constant be done as a
 * 32x32->64 multiply followed by a >>32 shift. */
100 const uint32_t ff_inverse[256]={
101 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
102 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
103 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
104 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
105 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
106 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
107 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
108 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
109 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
110 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
111 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
112 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
113 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
114 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
115 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
116 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
117 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
118 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
119 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
120 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
121 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
122 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
123 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
124 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
125 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
126 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
127 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
128 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
129 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
130 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
131 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
132 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
135 /* Input permutation for the simple_idct_mmx */
/* Each entry remaps a coefficient index to the position expected by the
 * MMX simple IDCT input ordering (see simple_idct.h). */
136 static const uint8_t simple_mmx_permutation[64]={
137 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
138 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
139 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
140 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
141 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
142 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
143 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
144 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Sums the pixels of a 16x16 block; the inner loop handles 8 columns per
 * iteration, and pix += line_size - 16 steps to the next row after the
 * 16 columns of the current one. */
147 static int pix_sum_c(uint8_t * pix, int line_size)
152 for (i = 0; i < 16; i++) {
153 for (j = 0; j < 16; j += 8) {
164 pix += line_size - 16;
/* Sum of squared pixel values over a 16x16 block, via the sq lookup
 * (= ff_squareTbl + 256). On hosts with 64-bit long it reads 8 pixels with
 * one 64-bit load, otherwise two 32-bit loads, extracting one byte per
 * table lookup. */
169 static int pix_norm1_c(uint8_t * pix, int line_size)
172 uint32_t *sq = ff_squareTbl + 256;
175 for (i = 0; i < 16; i++) {
176 for (j = 0; j < 16; j += 8) {
187 #if LONG_MAX > 2147483647
/* 64-bit path: one load covers all 8 pixels of this inner step. */
188 register uint64_t x=*(uint64_t*)pix;
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 s += sq[(x>>32)&0xff];
194 s += sq[(x>>40)&0xff];
195 s += sq[(x>>48)&0xff];
196 s += sq[(x>>56)&0xff];
/* 32-bit path: two 4-byte loads per inner step. */
198 register uint32_t x=*(uint32_t*)pix;
200 s += sq[(x>>8)&0xff];
201 s += sq[(x>>16)&0xff];
202 s += sq[(x>>24)&0xff];
203 x=*(uint32_t*)(pix+4);
205 s += sq[(x>>8)&0xff];
206 s += sq[(x>>16)&0xff];
207 s += sq[(x>>24)&0xff];
/* advance to next row after the 16 columns just processed */
212 pix += line_size - 16;
/* Byte-swaps w 32-bit words from src into dst. The main loop is unrolled
 * by 8; the tail handles the remaining (w % 8) words one at a time. */
217 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
220 for(i=0; i+8<=w; i+=8){
221 dst[i+0]= bswap_32(src[i+0]);
222 dst[i+1]= bswap_32(src[i+1]);
223 dst[i+2]= bswap_32(src[i+2]);
224 dst[i+3]= bswap_32(src[i+3]);
225 dst[i+4]= bswap_32(src[i+4]);
226 dst[i+5]= bswap_32(src[i+5]);
227 dst[i+6]= bswap_32(src[i+6]);
228 dst[i+7]= bswap_32(src[i+7]);
/* tail: remaining words */
231 dst[i+0]= bswap_32(src[i+0]);
/* Sum of squared differences between two blocks 4 pixels wide over h rows.
 * sq points at the +256-biased square table so negative byte differences
 * index correctly. */
235 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
238 uint32_t *sq = ff_squareTbl + 256;
241 for (i = 0; i < h; i++) {
242 s += sq[pix1[0] - pix2[0]];
243 s += sq[pix1[1] - pix2[1]];
244 s += sq[pix1[2] - pix2[2]];
245 s += sq[pix1[3] - pix2[3]];
/* Sum of squared differences between two blocks 8 pixels wide over h rows. */
252 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
255 uint32_t *sq = ff_squareTbl + 256;
258 for (i = 0; i < h; i++) {
259 s += sq[pix1[0] - pix2[0]];
260 s += sq[pix1[1] - pix2[1]];
261 s += sq[pix1[2] - pix2[2]];
262 s += sq[pix1[3] - pix2[3]];
263 s += sq[pix1[4] - pix2[4]];
264 s += sq[pix1[5] - pix2[5]];
265 s += sq[pix1[6] - pix2[6]];
266 s += sq[pix1[7] - pix2[7]];
/* Sum of squared differences between two blocks 16 pixels wide over h rows. */
273 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
276 uint32_t *sq = ff_squareTbl + 256;
279 for (i = 0; i < h; i++) {
280 s += sq[pix1[ 0] - pix2[ 0]];
281 s += sq[pix1[ 1] - pix2[ 1]];
282 s += sq[pix1[ 2] - pix2[ 2]];
283 s += sq[pix1[ 3] - pix2[ 3]];
284 s += sq[pix1[ 4] - pix2[ 4]];
285 s += sq[pix1[ 5] - pix2[ 5]];
286 s += sq[pix1[ 6] - pix2[ 6]];
287 s += sq[pix1[ 7] - pix2[ 7]];
288 s += sq[pix1[ 8] - pix2[ 8]];
289 s += sq[pix1[ 9] - pix2[ 9]];
290 s += sq[pix1[10] - pix2[10]];
291 s += sq[pix1[11] - pix2[11]];
292 s += sq[pix1[12] - pix2[12]];
293 s += sq[pix1[13] - pix2[13]];
294 s += sq[pix1[14] - pix2[14]];
295 s += sq[pix1[15] - pix2[15]];
304 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/* Wavelet-domain distortion metric: the (pix1 - pix2) difference block is
 * transformed with ff_spatial_dwt and each sub-band coefficient is weighted
 * by the scale[][][][] tables. type selects the wavelet (labels in the table
 * indicate 9/7 vs 5/3); dec_count (3 for 8x8, 4 for larger) selects the
 * decomposition depth. Interior lines of this function are elided in this
 * excerpt. */
305 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
307 const int dec_count= w==8 ? 3 : 4;
310 static const int scale[2][2][4][4]={
314 {268, 239, 239, 213},
318 // 9/7 16x16 or 32x32 dec=4
319 {344, 310, 310, 280},
327 {275, 245, 245, 218},
331 // 5/3 16x16 or 32x32 dec=4
332 {352, 317, 317, 286},
/* build the difference block, scaled by 16 (<<4), into a 32-wide tmp buffer */
340 for (i = 0; i < h; i++) {
341 for (j = 0; j < w; j+=4) {
342 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
343 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
344 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
345 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
351 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
/* accumulate weighted sub-band coefficients: level 0 includes the LL band
 * (ori starts at 0), deeper levels only the H bands (ori starts at 1) */
355 for(level=0; level<dec_count; level++){
356 for(ori= level ? 1 : 0; ori<4; ori++){
357 int size= w>>(dec_count-level);
358 int sx= (ori&1) ? size : 0;
359 int stride= 32<<(dec_count-level);
360 int sy= (ori&2) ? stride>>1 : 0;
362 for(i=0; i<size; i++){
363 for(j=0; j<size; j++){
364 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* Thin wrappers around w_c binding the block width (8/16/32) and the
 * wavelet type (1 = 5/3, 0 = 9/7 — matching the scale-table labels). */
374 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
375 return w_c(v, pix1, pix2, line_size, 8, h, 1);
378 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
379 return w_c(v, pix1, pix2, line_size, 8, h, 0);
382 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
383 return w_c(v, pix1, pix2, line_size, 16, h, 1);
386 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
387 return w_c(v, pix1, pix2, line_size, 16, h, 0);
390 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
391 return w_c(v, pix1, pix2, line_size, 32, h, 1);
394 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
395 return w_c(v, pix1, pix2, line_size, 32, h, 0);
/* Copies an 8-pixel-wide block from pixels (stride line_size) into the
 * DCTELEM block; 8 pixels per row are widened to DCTELEM. */
399 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
403 /* read the pixels */
405 block[0] = pixels[0];
406 block[1] = pixels[1];
407 block[2] = pixels[2];
408 block[3] = pixels[3];
409 block[4] = pixels[4];
410 block[5] = pixels[5];
411 block[6] = pixels[6];
412 block[7] = pixels[7];
/* Stores the per-pixel difference s1 - s2 of an 8-wide block into the
 * DCTELEM block. */
418 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
419 const uint8_t *s2, int stride){
422 /* read the pixels */
424 block[0] = s1[0] - s2[0];
425 block[1] = s1[1] - s2[1];
426 block[2] = s1[2] - s2[2];
427 block[3] = s1[3] - s2[3];
428 block[4] = s1[4] - s2[4];
429 block[5] = s1[5] - s2[5];
430 block[6] = s1[6] - s2[6];
431 block[7] = s1[7] - s2[7];
/* Writes an 8-wide block of DCTELEM values to pixels, clamping each value
 * to 0..255 through the cm (= ff_cropTbl + MAX_NEG_CROP) table. */
439 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
443 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
445 /* read the pixels */
447 pixels[0] = cm[block[0]];
448 pixels[1] = cm[block[1]];
449 pixels[2] = cm[block[2]];
450 pixels[3] = cm[block[3]];
451 pixels[4] = cm[block[4]];
452 pixels[5] = cm[block[5]];
453 pixels[6] = cm[block[6]];
454 pixels[7] = cm[block[7]];
/* 4-wide variant of put_pixels_clamped_c. */
461 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
465 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
467 /* read the pixels */
469 pixels[0] = cm[block[0]];
470 pixels[1] = cm[block[1]];
471 pixels[2] = cm[block[2]];
472 pixels[3] = cm[block[3]];
/* 2-wide variant of put_pixels_clamped_c. */
479 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
483 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
485 /* read the pixels */
487 pixels[0] = cm[block[0]];
488 pixels[1] = cm[block[1]];
/* Writes an 8x8 block of signed DCTELEM values as unsigned pixels:
 * each value has 128 added and is saturated into 0..255 (the visible
 * comparison against 127 handles the upper clamp). */
495 static void put_signed_pixels_clamped_c(const DCTELEM *block,
496 uint8_t *restrict pixels,
501 for (i = 0; i < 8; i++) {
502 for (j = 0; j < 8; j++) {
505 else if (*block > 127)
508 *pixels = (uint8_t)(*block + 128);
/* step to the next row after 8 columns */
512 pixels += (line_size - 8);
/* Adds an 8-wide block of DCTELEM values onto existing pixels, clamping
 * each sum to 0..255 through the cm table. */
516 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
520 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
522 /* read the pixels */
524 pixels[0] = cm[pixels[0] + block[0]];
525 pixels[1] = cm[pixels[1] + block[1]];
526 pixels[2] = cm[pixels[2] + block[2]];
527 pixels[3] = cm[pixels[3] + block[3]];
528 pixels[4] = cm[pixels[4] + block[4]];
529 pixels[5] = cm[pixels[5] + block[5]];
530 pixels[6] = cm[pixels[6] + block[6]];
531 pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of add_pixels_clamped_c. */
537 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
541 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
543 /* read the pixels */
545 pixels[0] = cm[pixels[0] + block[0]];
546 pixels[1] = cm[pixels[1] + block[1]];
547 pixels[2] = cm[pixels[2] + block[2]];
548 pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of add_pixels_clamped_c. */
554 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
558 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
560 /* read the pixels */
562 pixels[0] = cm[pixels[0] + block[0]];
563 pixels[1] = cm[pixels[1] + block[1]];
/* Adds an 8-wide block of DCTELEM values onto pixels WITHOUT clamping
 * (plain wrap-around byte addition). */
569 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
573 pixels[0] += block[0];
574 pixels[1] += block[1];
575 pixels[2] += block[2];
576 pixels[3] += block[3];
577 pixels[4] += block[4];
578 pixels[5] += block[5];
579 pixels[6] += block[6];
580 pixels[7] += block[7];
/* 4-wide variant of add_pixels8_c (unclamped add). */
586 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
590 pixels[0] += block[0];
591 pixels[1] += block[1];
592 pixels[2] += block[2];
593 pixels[3] += block[3];
/* Accumulates the absolute values of the block's DCT coefficients. */
599 static int sum_abs_dctelem_c(DCTELEM *block)
603 sum+= FFABS(block[i]);
609 #define PIXOP2(OPNAME, OP) \
610 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
614 OP(*((uint64_t*)block), AV_RN64(pixels));\
620 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
624 const uint64_t a= AV_RN64(pixels );\
625 const uint64_t b= AV_RN64(pixels+1);\
626 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
632 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
636 const uint64_t a= AV_RN64(pixels );\
637 const uint64_t b= AV_RN64(pixels+1);\
638 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
644 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
648 const uint64_t a= AV_RN64(pixels );\
649 const uint64_t b= AV_RN64(pixels+line_size);\
650 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
656 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
660 const uint64_t a= AV_RN64(pixels );\
661 const uint64_t b= AV_RN64(pixels+line_size);\
662 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
668 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
671 const uint64_t a= AV_RN64(pixels );\
672 const uint64_t b= AV_RN64(pixels+1);\
673 uint64_t l0= (a&0x0303030303030303ULL)\
674 + (b&0x0303030303030303ULL)\
675 + 0x0202020202020202ULL;\
676 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
677 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
681 for(i=0; i<h; i+=2){\
682 uint64_t a= AV_RN64(pixels );\
683 uint64_t b= AV_RN64(pixels+1);\
684 l1= (a&0x0303030303030303ULL)\
685 + (b&0x0303030303030303ULL);\
686 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
687 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
688 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
691 a= AV_RN64(pixels );\
692 b= AV_RN64(pixels+1);\
693 l0= (a&0x0303030303030303ULL)\
694 + (b&0x0303030303030303ULL)\
695 + 0x0202020202020202ULL;\
696 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
697 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
698 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
704 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
707 const uint64_t a= AV_RN64(pixels );\
708 const uint64_t b= AV_RN64(pixels+1);\
709 uint64_t l0= (a&0x0303030303030303ULL)\
710 + (b&0x0303030303030303ULL)\
711 + 0x0101010101010101ULL;\
712 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
713 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
717 for(i=0; i<h; i+=2){\
718 uint64_t a= AV_RN64(pixels );\
719 uint64_t b= AV_RN64(pixels+1);\
720 l1= (a&0x0303030303030303ULL)\
721 + (b&0x0303030303030303ULL);\
722 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
723 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
724 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
727 a= AV_RN64(pixels );\
728 b= AV_RN64(pixels+1);\
729 l0= (a&0x0303030303030303ULL)\
730 + (b&0x0303030303030303ULL)\
731 + 0x0101010101010101ULL;\
732 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
733 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
734 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
740 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
741 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
742 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
743 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
744 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
745 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
746 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
748 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
749 #else // 64 bit variant
751 #define PIXOP2(OPNAME, OP) \
752 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
755 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
760 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
763 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
768 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
771 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
772 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
777 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
778 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
781 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
782 int src_stride1, int src_stride2, int h){\
786 a= AV_RN32(&src1[i*src_stride1 ]);\
787 b= AV_RN32(&src2[i*src_stride2 ]);\
788 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
789 a= AV_RN32(&src1[i*src_stride1+4]);\
790 b= AV_RN32(&src2[i*src_stride2+4]);\
791 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
795 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
796 int src_stride1, int src_stride2, int h){\
800 a= AV_RN32(&src1[i*src_stride1 ]);\
801 b= AV_RN32(&src2[i*src_stride2 ]);\
802 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
803 a= AV_RN32(&src1[i*src_stride1+4]);\
804 b= AV_RN32(&src2[i*src_stride2+4]);\
805 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
809 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
810 int src_stride1, int src_stride2, int h){\
814 a= AV_RN32(&src1[i*src_stride1 ]);\
815 b= AV_RN32(&src2[i*src_stride2 ]);\
816 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
820 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
821 int src_stride1, int src_stride2, int h){\
825 a= AV_RN16(&src1[i*src_stride1 ]);\
826 b= AV_RN16(&src2[i*src_stride2 ]);\
827 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
831 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
832 int src_stride1, int src_stride2, int h){\
833 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
834 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
837 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
838 int src_stride1, int src_stride2, int h){\
839 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
840 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
843 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
844 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
847 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
848 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
851 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
852 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
855 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
856 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
859 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
860 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
863 uint32_t a, b, c, d, l0, l1, h0, h1;\
864 a= AV_RN32(&src1[i*src_stride1]);\
865 b= AV_RN32(&src2[i*src_stride2]);\
866 c= AV_RN32(&src3[i*src_stride3]);\
867 d= AV_RN32(&src4[i*src_stride4]);\
868 l0= (a&0x03030303UL)\
871 h0= ((a&0xFCFCFCFCUL)>>2)\
872 + ((b&0xFCFCFCFCUL)>>2);\
873 l1= (c&0x03030303UL)\
875 h1= ((c&0xFCFCFCFCUL)>>2)\
876 + ((d&0xFCFCFCFCUL)>>2);\
877 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
878 a= AV_RN32(&src1[i*src_stride1+4]);\
879 b= AV_RN32(&src2[i*src_stride2+4]);\
880 c= AV_RN32(&src3[i*src_stride3+4]);\
881 d= AV_RN32(&src4[i*src_stride4+4]);\
882 l0= (a&0x03030303UL)\
885 h0= ((a&0xFCFCFCFCUL)>>2)\
886 + ((b&0xFCFCFCFCUL)>>2);\
887 l1= (c&0x03030303UL)\
889 h1= ((c&0xFCFCFCFCUL)>>2)\
890 + ((d&0xFCFCFCFCUL)>>2);\
891 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
895 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
899 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
900 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
903 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
904 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
907 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
908 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
911 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
912 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
915 uint32_t a, b, c, d, l0, l1, h0, h1;\
916 a= AV_RN32(&src1[i*src_stride1]);\
917 b= AV_RN32(&src2[i*src_stride2]);\
918 c= AV_RN32(&src3[i*src_stride3]);\
919 d= AV_RN32(&src4[i*src_stride4]);\
920 l0= (a&0x03030303UL)\
923 h0= ((a&0xFCFCFCFCUL)>>2)\
924 + ((b&0xFCFCFCFCUL)>>2);\
925 l1= (c&0x03030303UL)\
927 h1= ((c&0xFCFCFCFCUL)>>2)\
928 + ((d&0xFCFCFCFCUL)>>2);\
929 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
930 a= AV_RN32(&src1[i*src_stride1+4]);\
931 b= AV_RN32(&src2[i*src_stride2+4]);\
932 c= AV_RN32(&src3[i*src_stride3+4]);\
933 d= AV_RN32(&src4[i*src_stride4+4]);\
934 l0= (a&0x03030303UL)\
937 h0= ((a&0xFCFCFCFCUL)>>2)\
938 + ((b&0xFCFCFCFCUL)>>2);\
939 l1= (c&0x03030303UL)\
941 h1= ((c&0xFCFCFCFCUL)>>2)\
942 + ((d&0xFCFCFCFCUL)>>2);\
943 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
946 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
947 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
948 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
949 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
951 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
952 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
953 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
954 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
957 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
959 int i, a0, b0, a1, b1;\
966 for(i=0; i<h; i+=2){\
972 block[0]= (a1+a0)>>2; /* FIXME non put */\
973 block[1]= (b1+b0)>>2;\
983 block[0]= (a1+a0)>>2;\
984 block[1]= (b1+b0)>>2;\
990 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
993 const uint32_t a= AV_RN32(pixels );\
994 const uint32_t b= AV_RN32(pixels+1);\
995 uint32_t l0= (a&0x03030303UL)\
998 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
999 + ((b&0xFCFCFCFCUL)>>2);\
1003 for(i=0; i<h; i+=2){\
1004 uint32_t a= AV_RN32(pixels );\
1005 uint32_t b= AV_RN32(pixels+1);\
1006 l1= (a&0x03030303UL)\
1007 + (b&0x03030303UL);\
1008 h1= ((a&0xFCFCFCFCUL)>>2)\
1009 + ((b&0xFCFCFCFCUL)>>2);\
1010 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1013 a= AV_RN32(pixels );\
1014 b= AV_RN32(pixels+1);\
1015 l0= (a&0x03030303UL)\
1018 h0= ((a&0xFCFCFCFCUL)>>2)\
1019 + ((b&0xFCFCFCFCUL)>>2);\
1020 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1026 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1029 for(j=0; j<2; j++){\
1031 const uint32_t a= AV_RN32(pixels );\
1032 const uint32_t b= AV_RN32(pixels+1);\
1033 uint32_t l0= (a&0x03030303UL)\
1036 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1037 + ((b&0xFCFCFCFCUL)>>2);\
1041 for(i=0; i<h; i+=2){\
1042 uint32_t a= AV_RN32(pixels );\
1043 uint32_t b= AV_RN32(pixels+1);\
1044 l1= (a&0x03030303UL)\
1045 + (b&0x03030303UL);\
1046 h1= ((a&0xFCFCFCFCUL)>>2)\
1047 + ((b&0xFCFCFCFCUL)>>2);\
1048 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1051 a= AV_RN32(pixels );\
1052 b= AV_RN32(pixels+1);\
1053 l0= (a&0x03030303UL)\
1056 h0= ((a&0xFCFCFCFCUL)>>2)\
1057 + ((b&0xFCFCFCFCUL)>>2);\
1058 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1062 pixels+=4-line_size*(h+1);\
1063 block +=4-line_size*h;\
1067 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1070 for(j=0; j<2; j++){\
1072 const uint32_t a= AV_RN32(pixels );\
1073 const uint32_t b= AV_RN32(pixels+1);\
1074 uint32_t l0= (a&0x03030303UL)\
1077 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1078 + ((b&0xFCFCFCFCUL)>>2);\
1082 for(i=0; i<h; i+=2){\
1083 uint32_t a= AV_RN32(pixels );\
1084 uint32_t b= AV_RN32(pixels+1);\
1085 l1= (a&0x03030303UL)\
1086 + (b&0x03030303UL);\
1087 h1= ((a&0xFCFCFCFCUL)>>2)\
1088 + ((b&0xFCFCFCFCUL)>>2);\
1089 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1092 a= AV_RN32(pixels );\
1093 b= AV_RN32(pixels+1);\
1094 l0= (a&0x03030303UL)\
1097 h0= ((a&0xFCFCFCFCUL)>>2)\
1098 + ((b&0xFCFCFCFCUL)>>2);\
1099 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1103 pixels+=4-line_size*(h+1);\
1104 block +=4-line_size*h;\
1108 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1109 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1110 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1111 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1112 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1113 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1114 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1115 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1117 #define op_avg(a, b) a = rnd_avg32(a, b)
1119 #define op_put(a, b) a = b
/* Scalar rounded averages of two and four values. NOTE: the arguments are
 * deliberately not parenthesized in the historical style — callers pass
 * simple expressions only. */
1126 #define avg2(a,b) ((a+b+1)>>1)
1127 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Convenience wrapper: no-rounding 2-source average, 16 wide, with all
 * three strides equal. */
1129 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1130 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
/* Convenience wrapper: no-rounding 2-source average, 8 wide, with all
 * three strides equal. */
1133 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1134 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/* Bilinear global motion compensation at 1/16-pel precision: x16/y16 are
 * the fractional positions (0..16); A..D are the four bilinear weights
 * (they sum to 256, hence the >>8 after adding rounder). 8 output pixels
 * per visible row. */
1137 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1139 const int A=(16-x16)*(16-y16);
1140 const int B=( x16)*(16-y16);
1141 const int C=(16-x16)*( y16);
1142 const int D=( x16)*( y16);
1147 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1148 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1149 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1150 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1151 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1152 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1153 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1154 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General (affine) global motion compensation. For each destination pixel
 * the source position is derived from (ox,oy) and the dxx/dxy/dyx/dyy
 * gradients at sub-pel scale s = 1<<shift; bilinear interpolation between
 * the four neighbours. When the source position falls outside the picture
 * the coordinates are clamped with av_clip and the interpolation collapses
 * to 1-D or plain edge replication. Interior lines are elided here. */
1160 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1161 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1164 const int s= 1<<shift;
1174 for(x=0; x<8; x++){ //XXX FIXME optimize
1175 int src_x, src_y, frac_x, frac_y, index;
1179 frac_x= src_x&(s-1);
1180 frac_y= src_y&(s-1);
1184 if((unsigned)src_x < width){
1185 if((unsigned)src_y < height){
/* fully inside: 2-D bilinear interpolation */
1186 index= src_x + src_y*stride;
1187 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1188 + src[index +1]* frac_x )*(s-frac_y)
1189 + ( src[index+stride ]*(s-frac_x)
1190 + src[index+stride+1]* frac_x )* frac_y
/* y outside: clamp y, interpolate horizontally only */
1193 index= src_x + av_clip(src_y, 0, height)*stride;
1194 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1195 + src[index +1]* frac_x )*s
1199 if((unsigned)src_y < height){
/* x outside: clamp x, interpolate vertically only */
1200 index= av_clip(src_x, 0, width) + src_y*stride;
1201 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1202 + src[index+stride ]* frac_y )*s
/* both outside: replicate the nearest edge pixel */
1205 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1206 dst[y*stride + x]= src[index ];
/* Integer-position (no interpolation) copy: dispatches on block width to
 * the matching put_pixelsN_c. */
1218 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1220 case 2: put_pixels2_c (dst, src, stride, height); break;
1221 case 4: put_pixels4_c (dst, src, stride, height); break;
1222 case 8: put_pixels8_c (dst, src, stride, height); break;
1223 case 16:put_pixels16_c(dst, src, stride, height); break;
/**
 * Thirdpel MC, horizontal phase 1/3: dst[j] ~= (2*src[j] + src[j+1])/3,
 * using the fixed-point factor 683/2048 ~= 1/3.
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC, horizontal phase 2/3: dst[j] ~= (src[j] + 2*src[j+1])/3
 * (683/2048 ~= 1/3).
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC, vertical phase 1/3: dst[j] ~= (2*src[j] + src[j+stride])/3
 * (683/2048 ~= 1/3).
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC, diagonal phase (1/3, 1/3): bilinear weights 4,3,3,2 over 12,
 * via 2731/32768 ~= 1/12.
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC, diagonal phase (1/3, 2/3): bilinear weights 3,2,4,3 over 12
 * (2731/32768 ~= 1/12).
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC, vertical phase 2/3: dst[j] ~= (src[j] + 2*src[j+stride])/3
 * (683/2048 ~= 1/3).
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC, diagonal phase (2/3, 1/3): bilinear weights 3,4,2,3 over 12
 * (2731/32768 ~= 1/12).
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC, diagonal phase (2/3, 2/3): bilinear weights 2,3,3,4 over 12
 * (2731/32768 ~= 1/12).
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC, integer position, averaging variant: dispatches to the
 * fixed-width averaging copy helpers.  Widths other than 2/4/8/16 are
 * silently ignored, matching the visible dispatch table.
 * NOTE(review): the switch header/braces were missing from the extraction;
 * restored around the visible case lines.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/**
 * Averaging thirdpel MC, horizontal phase 1/3: rounding average of dst with
 * the put_ mc10 interpolation result.
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging thirdpel MC, horizontal phase 2/3: rounding average of dst with
 * the put_ mc20 interpolation result.
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging thirdpel MC, vertical phase 1/3: rounding average of dst with
 * the put_ mc01 interpolation result.
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging thirdpel MC, diagonal phase (1/3, 1/3): rounding average of dst
 * with the put_ mc11 interpolation result (weights 4,3,3,2 over 12).
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging thirdpel MC, diagonal phase (1/3, 2/3): rounding average of dst
 * with the put_ mc12 interpolation result (weights 3,2,4,3 over 12).
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging thirdpel MC, vertical phase 2/3: rounding average of dst with
 * the put_ mc02 interpolation result.
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging thirdpel MC, diagonal phase (2/3, 1/3): rounding average of dst
 * with the put_ mc21 interpolation result (weights 3,4,2,3 over 12).
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Averaging thirdpel MC, diagonal phase (2/3, 2/3): rounding average of dst
 * with the put_ mc22 interpolation result (weights 2,3,3,4 over 12).
 * NOTE(review): loop/increment lines restored (dropped by extraction).
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * TPEL_WIDTH(): generates fixed-width thirdpel MC wrappers around the
 * generic variable-width put_tpel_pixels_mcXY_c() helpers.
 *
 * Bug fixed: every wrapper body began with a stray `void` return type, which
 * made the statement a K&R-style local function *declaration* instead of a
 * call — the wrappers compiled to empty functions that never invoked the
 * helpers.  The `void` tokens are removed so the calls actually happen.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/**
 * H264_CHROMA_MC(): template generating the 2-, 4- and 8-pixel-wide H.264
 * chroma MC functions.  A/B/C/D are the bilinear weights for the fractional
 * position (x,y) in 1/8-pel units (they sum to 64); when D==0 one fraction
 * is zero and the 2D filter degenerates to a cheaper 1D filter with weight
 * E = B+C along `step` (stride if vertical, 1 if horizontal).  OP is one of
 * the op_put/op_avg macros, which normalize by >>6 and store or average.
 * NOTE(review): the `int i;`, if(D)/else scaffolding, pointer increments and
 * closing braces were missing from the extraction; restored to the canonical
 * structure implied by the visible kernel lines.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1534 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1535 #define op_put(a, b) a = (((b) + 32)>>6)
1537 H264_CHROMA_MC(put_ , op_put)
1538 H264_CHROMA_MC(avg_ , op_avg)
/**
 * 8-pixel-wide H.264-style chroma MC, no-rounding variant: identical to the
 * put_ template output except the bias is 32-4 = 28, i.e. it rounds slightly
 * down instead of to nearest.
 * NOTE(review): the `int i;`, loop header, pointer increments and closing
 * braces were missing from the extraction; restored around the fully-visible
 * kernel lines.
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
/*
 * QPEL_MC(): template generating the complete MPEG-4 quarter-pel MC function
 * set for one output operation.  Parameters: r selects the rounding variant,
 * OPNAME is the generated-name prefix (put_/put_no_rnd_/avg_), RND the
 * rounding infix used when calling the put_* helpers, and OP the per-pixel
 * output macro (op_put / op_avg / *_no_rnd).  It produces the 8- and 16-wide
 * horizontal and vertical 8-tap lowpass filters plus the qpelN_mcXY
 * dispatchers (X/Y = quarter-pel offset right/down), which combine the
 * lowpass passes via the pixelsN_l2/l4 averaging helpers.
 * NOTE(review): this extraction has dropped interior lines of the macro
 * (loop headers, `int i;` declarations, temporary-buffer declarations such
 * as half[]/halfH[]/halfV[]/halfHV[] in the qpel8 paths, pointer increments,
 * closing braces and the original indentation), and every line carries a
 * stray line-number prefix.  The text below is kept byte-for-byte as found;
 * consult the canonical libavcodec dsputil.c before editing.
 */
1566 #define QPEL_MC(r, OPNAME, RND, OP) \
1567 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1568 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1572 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1573 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1574 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1575 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1576 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1577 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1578 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1579 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1585 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1587 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1591 const int src0= src[0*srcStride];\
1592 const int src1= src[1*srcStride];\
1593 const int src2= src[2*srcStride];\
1594 const int src3= src[3*srcStride];\
1595 const int src4= src[4*srcStride];\
1596 const int src5= src[5*srcStride];\
1597 const int src6= src[6*srcStride];\
1598 const int src7= src[7*srcStride];\
1599 const int src8= src[8*srcStride];\
1600 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1601 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1602 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1603 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1604 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1605 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1606 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1607 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1613 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1614 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1619 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1620 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1621 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1622 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1623 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1624 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1625 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1626 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1627 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1628 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1629 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1630 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1631 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1632 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1633 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1634 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1640 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1641 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1646 const int src0= src[0*srcStride];\
1647 const int src1= src[1*srcStride];\
1648 const int src2= src[2*srcStride];\
1649 const int src3= src[3*srcStride];\
1650 const int src4= src[4*srcStride];\
1651 const int src5= src[5*srcStride];\
1652 const int src6= src[6*srcStride];\
1653 const int src7= src[7*srcStride];\
1654 const int src8= src[8*srcStride];\
1655 const int src9= src[9*srcStride];\
1656 const int src10= src[10*srcStride];\
1657 const int src11= src[11*srcStride];\
1658 const int src12= src[12*srcStride];\
1659 const int src13= src[13*srcStride];\
1660 const int src14= src[14*srcStride];\
1661 const int src15= src[15*srcStride];\
1662 const int src16= src[16*srcStride];\
1663 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1664 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1665 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1666 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1667 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1668 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1669 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1670 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1671 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1672 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1673 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1674 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1675 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1676 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1677 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1678 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* Dispatch functions for 8x8 blocks: qpel8_mcXY = X quarter-pel steps right, Y steps down. */\
1684 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1685 OPNAME ## pixels8_c(dst, src, stride, 8);\
1688 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1690 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1691 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1694 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1695 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1698 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1700 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1701 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1704 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1705 uint8_t full[16*9];\
1707 copy_block9(full, src, 16, stride, 9);\
1708 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1709 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1712 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1713 uint8_t full[16*9];\
1714 copy_block9(full, src, 16, stride, 9);\
1715 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1718 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1719 uint8_t full[16*9];\
1721 copy_block9(full, src, 16, stride, 9);\
1722 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1723 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1725 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1726 uint8_t full[16*9];\
1729 uint8_t halfHV[64];\
1730 copy_block9(full, src, 16, stride, 9);\
1731 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1732 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1733 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1734 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1736 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1737 uint8_t full[16*9];\
1739 uint8_t halfHV[64];\
1740 copy_block9(full, src, 16, stride, 9);\
1741 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1742 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1743 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1744 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1746 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1747 uint8_t full[16*9];\
1750 uint8_t halfHV[64];\
1751 copy_block9(full, src, 16, stride, 9);\
1752 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1753 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1754 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1755 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1757 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1758 uint8_t full[16*9];\
1760 uint8_t halfHV[64];\
1761 copy_block9(full, src, 16, stride, 9);\
1762 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1763 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1764 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1765 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1767 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1768 uint8_t full[16*9];\
1771 uint8_t halfHV[64];\
1772 copy_block9(full, src, 16, stride, 9);\
1773 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1774 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1775 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1776 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1778 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1779 uint8_t full[16*9];\
1781 uint8_t halfHV[64];\
1782 copy_block9(full, src, 16, stride, 9);\
1783 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1784 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1785 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1786 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1788 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1789 uint8_t full[16*9];\
1792 uint8_t halfHV[64];\
1793 copy_block9(full, src, 16, stride, 9);\
1794 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1795 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1796 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1797 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1799 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1800 uint8_t full[16*9];\
1802 uint8_t halfHV[64];\
1803 copy_block9(full, src, 16, stride, 9);\
1804 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1805 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1806 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1807 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1809 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1811 uint8_t halfHV[64];\
1812 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1813 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1814 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1816 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1818 uint8_t halfHV[64];\
1819 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1820 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1821 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1823 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1824 uint8_t full[16*9];\
1827 uint8_t halfHV[64];\
1828 copy_block9(full, src, 16, stride, 9);\
1829 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1830 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1831 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1832 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1834 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1835 uint8_t full[16*9];\
1837 copy_block9(full, src, 16, stride, 9);\
1838 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1839 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1840 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1842 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1843 uint8_t full[16*9];\
1846 uint8_t halfHV[64];\
1847 copy_block9(full, src, 16, stride, 9);\
1848 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1849 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1850 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1851 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1853 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1854 uint8_t full[16*9];\
1856 copy_block9(full, src, 16, stride, 9);\
1857 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1858 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1859 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1861 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1863 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1864 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* Same dispatch set for 16x16 blocks. */\
1866 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1867 OPNAME ## pixels16_c(dst, src, stride, 16);\
1870 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1872 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1873 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1876 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1877 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1880 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1882 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1883 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1886 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1887 uint8_t full[24*17];\
1889 copy_block17(full, src, 24, stride, 17);\
1890 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1891 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1894 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1895 uint8_t full[24*17];\
1896 copy_block17(full, src, 24, stride, 17);\
1897 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1900 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1901 uint8_t full[24*17];\
1903 copy_block17(full, src, 24, stride, 17);\
1904 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1905 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
1907 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1908 uint8_t full[24*17];\
1909 uint8_t halfH[272];\
1910 uint8_t halfV[256];\
1911 uint8_t halfHV[256];\
1912 copy_block17(full, src, 24, stride, 17);\
1913 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1914 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1915 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1916 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1918 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1919 uint8_t full[24*17];\
1920 uint8_t halfH[272];\
1921 uint8_t halfHV[256];\
1922 copy_block17(full, src, 24, stride, 17);\
1923 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1924 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1925 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1926 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1928 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1929 uint8_t full[24*17];\
1930 uint8_t halfH[272];\
1931 uint8_t halfV[256];\
1932 uint8_t halfHV[256];\
1933 copy_block17(full, src, 24, stride, 17);\
1934 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1935 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1936 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1937 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1939 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1940 uint8_t full[24*17];\
1941 uint8_t halfH[272];\
1942 uint8_t halfHV[256];\
1943 copy_block17(full, src, 24, stride, 17);\
1944 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1945 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1946 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1947 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1949 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1950 uint8_t full[24*17];\
1951 uint8_t halfH[272];\
1952 uint8_t halfV[256];\
1953 uint8_t halfHV[256];\
1954 copy_block17(full, src, 24, stride, 17);\
1955 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1956 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1957 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1958 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1960 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1961 uint8_t full[24*17];\
1962 uint8_t halfH[272];\
1963 uint8_t halfHV[256];\
1964 copy_block17(full, src, 24, stride, 17);\
1965 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1966 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1967 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1968 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1970 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1971 uint8_t full[24*17];\
1972 uint8_t halfH[272];\
1973 uint8_t halfV[256];\
1974 uint8_t halfHV[256];\
1975 copy_block17(full, src, 24, stride, 17);\
1976 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1977 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1978 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1979 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1981 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1982 uint8_t full[24*17];\
1983 uint8_t halfH[272];\
1984 uint8_t halfHV[256];\
1985 copy_block17(full, src, 24, stride, 17);\
1986 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1987 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1988 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1989 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1991 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1992 uint8_t halfH[272];\
1993 uint8_t halfHV[256];\
1994 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1995 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1996 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1998 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1999 uint8_t halfH[272];\
2000 uint8_t halfHV[256];\
2001 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2002 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2003 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2005 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2006 uint8_t full[24*17];\
2007 uint8_t halfH[272];\
2008 uint8_t halfV[256];\
2009 uint8_t halfHV[256];\
2010 copy_block17(full, src, 24, stride, 17);\
2011 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2012 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2013 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2014 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2016 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2017 uint8_t full[24*17];\
2018 uint8_t halfH[272];\
2019 copy_block17(full, src, 24, stride, 17);\
2020 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2021 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2022 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2024 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2025 uint8_t full[24*17];\
2026 uint8_t halfH[272];\
2027 uint8_t halfV[256];\
2028 uint8_t halfHV[256];\
2029 copy_block17(full, src, 24, stride, 17);\
2030 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2031 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2032 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2033 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2035 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2036 uint8_t full[24*17];\
2037 uint8_t halfH[272];\
2038 copy_block17(full, src, 24, stride, 17);\
2039 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2040 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2041 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2043 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2044 uint8_t halfH[272];\
2045 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2046 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Rounding operators for the MPEG-4 qpel functions generated by QPEL_MC above.
 * 'b' is a raw filter sum scaled by 32; 'cm' is the clip-table pointer local
 * to each generated function, clipping (b+bias)>>5 to 0..255.  The *_no_rnd
 * variants use a +15 bias instead of +16, i.e. rounding toward zero. */
2049 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2050 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2051 #define op_put(a, b) a = cm[((b) + 16)>>5]
2052 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the put, put_no_rnd and avg MPEG-4 qpel function families. */
2054 QPEL_MC(0, put_ , _ , op_put)
2055 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2056 QPEL_MC(0, avg_ , _ , op_avg)
2057 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* NOTE(review): only the *_no_rnd operators are #undef'd here; the matching
 * #undef op_avg / #undef op_put lines appear to have been stripped from this
 * listing (the stale numbering skips 2058/2060) — confirm against the full
 * file. */
2059 #undef op_avg_no_rnd
2061 #undef op_put_no_rnd
/* H264_LOWPASS: expands to the family of H.264 six-tap (1,-5,20,20,-5,1)
 * half-pel interpolation primitives for 2-, 4-, 8- and 16-pixel-wide blocks:
 * *_h_lowpass (horizontal), *_v_lowpass (vertical) and *_hv_lowpass
 * (horizontal pass into the int16_t 'tmp' plane, then a vertical pass fed
 * through OP2, which uses the wider two-pass rounding).  The 16-wide
 * variants delegate to the 8-wide ones in four quadrant calls.
 * NOTE(review): this listing has been stripped of structural lines (loop
 * headers, braces, dst/src advances, the 'h'/'i' declarations); the
 * remaining macro lines are kept verbatim and each still carries a stale
 * line-number prefix — restore against a complete copy of the file before
 * compiling. */
2064 #define H264_LOWPASS(OPNAME, OP, OP2) \
2065 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2067 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2071 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2072 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2078 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2080 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2084 const int srcB= src[-2*srcStride];\
2085 const int srcA= src[-1*srcStride];\
2086 const int src0= src[0 *srcStride];\
2087 const int src1= src[1 *srcStride];\
2088 const int src2= src[2 *srcStride];\
2089 const int src3= src[3 *srcStride];\
2090 const int src4= src[4 *srcStride];\
2091 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2092 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2098 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2101 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2103 src -= 2*srcStride;\
2104 for(i=0; i<h+5; i++)\
2106 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2107 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2111 tmp -= tmpStride*(h+5-2);\
2114 const int tmpB= tmp[-2*tmpStride];\
2115 const int tmpA= tmp[-1*tmpStride];\
2116 const int tmp0= tmp[0 *tmpStride];\
2117 const int tmp1= tmp[1 *tmpStride];\
2118 const int tmp2= tmp[2 *tmpStride];\
2119 const int tmp3= tmp[3 *tmpStride];\
2120 const int tmp4= tmp[4 *tmpStride];\
2121 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2122 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2127 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2129 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2133 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2134 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2135 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2136 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2142 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2144 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2148 const int srcB= src[-2*srcStride];\
2149 const int srcA= src[-1*srcStride];\
2150 const int src0= src[0 *srcStride];\
2151 const int src1= src[1 *srcStride];\
2152 const int src2= src[2 *srcStride];\
2153 const int src3= src[3 *srcStride];\
2154 const int src4= src[4 *srcStride];\
2155 const int src5= src[5 *srcStride];\
2156 const int src6= src[6 *srcStride];\
2157 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2158 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2159 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2160 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2166 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2169 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2171 src -= 2*srcStride;\
2172 for(i=0; i<h+5; i++)\
2174 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2175 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2176 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2177 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2181 tmp -= tmpStride*(h+5-2);\
2184 const int tmpB= tmp[-2*tmpStride];\
2185 const int tmpA= tmp[-1*tmpStride];\
2186 const int tmp0= tmp[0 *tmpStride];\
2187 const int tmp1= tmp[1 *tmpStride];\
2188 const int tmp2= tmp[2 *tmpStride];\
2189 const int tmp3= tmp[3 *tmpStride];\
2190 const int tmp4= tmp[4 *tmpStride];\
2191 const int tmp5= tmp[5 *tmpStride];\
2192 const int tmp6= tmp[6 *tmpStride];\
2193 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2194 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2195 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2196 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2202 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2204 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2208 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2209 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2210 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2211 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2212 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2213 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2214 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2215 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2221 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2223 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2227 const int srcB= src[-2*srcStride];\
2228 const int srcA= src[-1*srcStride];\
2229 const int src0= src[0 *srcStride];\
2230 const int src1= src[1 *srcStride];\
2231 const int src2= src[2 *srcStride];\
2232 const int src3= src[3 *srcStride];\
2233 const int src4= src[4 *srcStride];\
2234 const int src5= src[5 *srcStride];\
2235 const int src6= src[6 *srcStride];\
2236 const int src7= src[7 *srcStride];\
2237 const int src8= src[8 *srcStride];\
2238 const int src9= src[9 *srcStride];\
2239 const int src10=src[10*srcStride];\
2240 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2241 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2242 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2243 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2244 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2245 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2246 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2247 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2253 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2256 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2258 src -= 2*srcStride;\
2259 for(i=0; i<h+5; i++)\
2261 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2262 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2263 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2264 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2265 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2266 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2267 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2268 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2272 tmp -= tmpStride*(h+5-2);\
2275 const int tmpB= tmp[-2*tmpStride];\
2276 const int tmpA= tmp[-1*tmpStride];\
2277 const int tmp0= tmp[0 *tmpStride];\
2278 const int tmp1= tmp[1 *tmpStride];\
2279 const int tmp2= tmp[2 *tmpStride];\
2280 const int tmp3= tmp[3 *tmpStride];\
2281 const int tmp4= tmp[4 *tmpStride];\
2282 const int tmp5= tmp[5 *tmpStride];\
2283 const int tmp6= tmp[6 *tmpStride];\
2284 const int tmp7= tmp[7 *tmpStride];\
2285 const int tmp8= tmp[8 *tmpStride];\
2286 const int tmp9= tmp[9 *tmpStride];\
2287 const int tmp10=tmp[10*tmpStride];\
2288 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2289 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2290 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2291 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2292 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2293 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2294 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2295 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2301 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2302 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2303 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2304 src += 8*srcStride;\
2305 dst += 8*dstStride;\
2306 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2307 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2310 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2311 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2312 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2313 src += 8*srcStride;\
2314 dst += 8*dstStride;\
2315 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2316 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2319 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2320 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2321 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2322 src += 8*srcStride;\
2323 dst += 8*dstStride;\
2324 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2325 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC: expands to the 16 quarter-pel motion-compensation entry points
 * (mc00..mc33) for one block SIZE.  Each position picks the combination of
 * lowpass intermediates it needs — halfH (horizontal), halfV (vertical on
 * the edge-extended 'full' copy), halfHV (two-pass via the int16_t 'tmp'
 * plane) — and merges them with the pixels*_l2 pairwise average, following
 * the H.264 quarter-sample interpolation scheme.  full_mid points two rows
 * into 'full' so the vertical filter sees its two rows of top context.
 * NOTE(review): structural lines (closing braces) are stripped from this
 * listing and every line carries a stale number prefix; body lines are kept
 * verbatim. */
2328 #define H264_MC(OPNAME, SIZE) \
2329 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2330 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2333 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2334 uint8_t half[SIZE*SIZE];\
2335 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2336 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2339 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2340 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2343 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2344 uint8_t half[SIZE*SIZE];\
2345 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2346 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2349 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2350 uint8_t full[SIZE*(SIZE+5)];\
2351 uint8_t * const full_mid= full + SIZE*2;\
2352 uint8_t half[SIZE*SIZE];\
2353 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2354 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2355 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2358 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2359 uint8_t full[SIZE*(SIZE+5)];\
2360 uint8_t * const full_mid= full + SIZE*2;\
2361 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2362 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2365 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2366 uint8_t full[SIZE*(SIZE+5)];\
2367 uint8_t * const full_mid= full + SIZE*2;\
2368 uint8_t half[SIZE*SIZE];\
2369 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2370 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2371 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2374 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2375 uint8_t full[SIZE*(SIZE+5)];\
2376 uint8_t * const full_mid= full + SIZE*2;\
2377 uint8_t halfH[SIZE*SIZE];\
2378 uint8_t halfV[SIZE*SIZE];\
2379 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2380 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2381 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2382 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2385 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2386 uint8_t full[SIZE*(SIZE+5)];\
2387 uint8_t * const full_mid= full + SIZE*2;\
2388 uint8_t halfH[SIZE*SIZE];\
2389 uint8_t halfV[SIZE*SIZE];\
2390 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2391 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2392 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2393 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2396 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2397 uint8_t full[SIZE*(SIZE+5)];\
2398 uint8_t * const full_mid= full + SIZE*2;\
2399 uint8_t halfH[SIZE*SIZE];\
2400 uint8_t halfV[SIZE*SIZE];\
2401 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2402 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2403 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2404 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2407 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2408 uint8_t full[SIZE*(SIZE+5)];\
2409 uint8_t * const full_mid= full + SIZE*2;\
2410 uint8_t halfH[SIZE*SIZE];\
2411 uint8_t halfV[SIZE*SIZE];\
2412 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2413 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2414 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2415 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2418 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2419 int16_t tmp[SIZE*(SIZE+5)];\
2420 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2423 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2424 int16_t tmp[SIZE*(SIZE+5)];\
2425 uint8_t halfH[SIZE*SIZE];\
2426 uint8_t halfHV[SIZE*SIZE];\
2427 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2428 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2429 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2432 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2433 int16_t tmp[SIZE*(SIZE+5)];\
2434 uint8_t halfH[SIZE*SIZE];\
2435 uint8_t halfHV[SIZE*SIZE];\
2436 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2437 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2438 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2441 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2442 uint8_t full[SIZE*(SIZE+5)];\
2443 uint8_t * const full_mid= full + SIZE*2;\
2444 int16_t tmp[SIZE*(SIZE+5)];\
2445 uint8_t halfV[SIZE*SIZE];\
2446 uint8_t halfHV[SIZE*SIZE];\
2447 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2448 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2449 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2450 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2453 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2454 uint8_t full[SIZE*(SIZE+5)];\
2455 uint8_t * const full_mid= full + SIZE*2;\
2456 int16_t tmp[SIZE*(SIZE+5)];\
2457 uint8_t halfV[SIZE*SIZE];\
2458 uint8_t halfHV[SIZE*SIZE];\
2459 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2460 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2461 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2462 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Rounding operators for the H264_LOWPASS instantiations below.  OP
 * (op_put/op_avg) clips a single-pass six-tap sum, scale 32: (b+16)>>5.
 * OP2 (op2_put/op2_avg) clips the two-pass hv sum, scale 1024: (b+512)>>10.
 * 'cm' is the clip table local to each generated function. */
2465 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2466 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2467 #define op_put(a, b) a = cm[((b) + 16)>>5]
2468 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2469 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put and avg H.264 lowpass families.
 * NOTE(review): the matching #undef lines for these operators are not
 * present in this listing — confirm against the full file. */
2471 H264_LOWPASS(put_ , op_put, op2_put)
2472 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* H.264 (bi)weighted prediction kernels.
 * op_scale1 rescales one pixel in place: (pix*weight + offset) >> log2_denom,
 * clipped to 0..255.  op_scale2 blends src into dst with two weights and a
 * shared offset, shifted by log2_denom+1.
 * H264_WEIGHT(W,H) expands to weight_h264_pixelsWxH_c (in-place, explicit
 * weighting) and biweight_h264_pixelsWxH_c (dst = blend of dst and src);
 * both precompute the rounding term into 'offset' before the row loop, and
 * use 'if(W==n) continue;' to cut each row short for the narrower widths.
 * NOTE(review): this macro body is truncated in the listing — the per-pixel
 * op_scale1/op_scale2 applications, the W==16 columns, the closing braces
 * and the H264_WEIGHT(W,H) instantiations are missing; lines kept verbatim. */
2487 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2488 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2489 #define H264_WEIGHT(W,H) \
2490 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2492 offset <<= log2_denom; \
2493 if(log2_denom) offset += 1<<(log2_denom-1); \
2494 for(y=0; y<H; y++, block += stride){ \
2497 if(W==2) continue; \
2500 if(W==4) continue; \
2505 if(W==8) continue; \
2516 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2518 offset = ((offset + 1) | 1) << log2_denom; \
2519 for(y=0; y<H; y++, dst += stride, src += stride){ \
2522 if(W==2) continue; \
2525 if(W==4) continue; \
2530 if(W==8) continue; \
/* WMV2 horizontal half-pel filter: for each of the 8 pixels in a row,
 * dst[x] = clip((9*(src[x]+src[x+1]) - (src[x-1]+src[x+2]) + 8) >> 4),
 * i.e. a (-1, 9, 9, -1)/16 four-tap filter clipped through 'cm'.
 * NOTE(review): the loop over the 'h' rows and the dst/src stride advances
 * are stripped from this listing; only one row's statements remain. */
2557 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2558 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2562 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2563 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2564 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2565 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2566 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2567 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2568 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2569 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* CAVS decoder glue: exported full-pel (mc00) put/avg wrappers that forward
 * directly to the plain 8x8 / 16x16 pixel copy and average helpers.
 * NOTE(review): the wrapper bodies' closing braces are stripped from this
 * listing. */
2575 #ifdef CONFIG_CAVS_DECODER
2577 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2579 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2580 put_pixels8_c(dst, src, stride, 8);
2582 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2583 avg_pixels8_c(dst, src, stride, 8);
2585 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2586 put_pixels16_c(dst, src, stride, 16);
2588 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2589 avg_pixels16_c(dst, src, stride, 16);
2591 #endif /* CONFIG_CAVS_DECODER */
/* VC-1/WMV3 glue: full-pel wrapper (the 'rnd' argument is unused by the
 * exact-copy case) plus external init prototypes for the intrax8 and H.264
 * encoder DSP sub-modules.
 * NOTE(review): the wrapper's closing brace is stripped from this listing. */
2593 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2595 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2597 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2598 put_pixels8_c(dst, src, stride, 8);
2600 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2602 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2605 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
/* WMV2 vertical half-pel filter: same (-1, 9, 9, -1)/16 four-tap kernel as
 * wmv2_mspel8_h_lowpass but applied down a column; processes 8 output rows
 * per column using one row of context above (src_1) and below (src9).
 * NOTE(review): the loop over the 'w' columns and the dst++/src++ advances
 * are stripped from this listing; only one column's statements remain. */
2607 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2608 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2612 const int src_1= src[ -srcStride];
2613 const int src0 = src[0 ];
2614 const int src1 = src[ srcStride];
2615 const int src2 = src[2*srcStride];
2616 const int src3 = src[3*srcStride];
2617 const int src4 = src[4*srcStride];
2618 const int src5 = src[5*srcStride];
2619 const int src6 = src[6*srcStride];
2620 const int src7 = src[7*srcStride];
2621 const int src8 = src[8*srcStride];
2622 const int src9 = src[9*srcStride];
2623 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2624 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2625 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2626 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2627 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2628 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2629 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2630 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* MSPEL (WMV2) half-pel motion-compensation entry points.
 * mc00 is a plain copy; mc10/mc30 average the source (or its right
 * neighbour) with the horizontal filter output; mc20/mc02 write the h/v
 * filter result directly; mc12/mc32/mc22 first run the horizontal filter on
 * an 11-row band starting one row above (src-stride), then feed halfH+8
 * (skipping that context row) into the vertical filter.
 * NOTE(review): the local buffer declarations (half[64], halfH[88],
 * halfV[64], halfHV[64]) and all closing braces are stripped from this
 * listing; lines kept verbatim. */
2636 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2637 put_pixels8_c(dst, src, stride, 8);
2640 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2642 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2643 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2646 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2647 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2650 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2652 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2653 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2656 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2657 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2660 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2664 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2665 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2666 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2667 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2669 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2673 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2674 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2675 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2676 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2678 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2680 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2681 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking filter across a horizontal block edge: for each column x
 * it reads p0..p3 at src[x-2*stride .. x+stride], computes the gradient
 * d = (p0 - p3 + 4*(p2 - p1))/8 and maps it through the strength-dependent
 * ramp into d1 (zero outside +/-2*strength, full inside +/-strength).
 * p1/p2 are adjusted by -/+ d1; the '&256' test with '~(p>>31)' clamps a
 * value that left 0..255 to 0 or 255 without a braninto the clip table.
 * The outer pixels p0/p3 get a secondary correction d2 clipped to +/-ad1.
 * NOTE(review): the column loop header, the d1/d2/ad1 declarations and the
 * p1/p2 update statements are stripped from this listing; hedge accordingly. */
2684 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2685 if(ENABLE_ANY_H263) {
2687 const int strength= ff_h263_loop_filter_strength[qscale];
2691 int p0= src[x-2*stride];
2692 int p1= src[x-1*stride];
2693 int p2= src[x+0*stride];
2694 int p3= src[x+1*stride];
2695 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2697 if (d<-2*strength) d1= 0;
2698 else if(d<- strength) d1=-2*strength - d;
2699 else if(d< strength) d1= d;
2700 else if(d< 2*strength) d1= 2*strength - d;
2705 if(p1&256) p1= ~(p1>>31);
2706 if(p2&256) p2= ~(p2>>31);
2708 src[x-1*stride] = p1;
2709 src[x+0*stride] = p2;
2713 d2= av_clip((p0-p3)/4, -ad1, ad1);
2715 src[x-2*stride] = p0 - d2;
2716 src[x+ stride] = p3 + d2;
/* H.263 deblocking filter across a vertical block edge: identical arithmetic
 * to h263_v_loop_filter_c, but p0..p3 are the four horizontally adjacent
 * pixels src[y*stride-2 .. y*stride+1] for each row y.
 * NOTE(review): the row loop header and the d1/d2/ad1 declarations and
 * p1/p2 update statements are stripped from this listing. */
2721 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2722 if(ENABLE_ANY_H263) {
2724 const int strength= ff_h263_loop_filter_strength[qscale];
2728 int p0= src[y*stride-2];
2729 int p1= src[y*stride-1];
2730 int p2= src[y*stride+0];
2731 int p3= src[y*stride+1];
2732 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2734 if (d<-2*strength) d1= 0;
2735 else if(d<- strength) d1=-2*strength - d;
2736 else if(d< strength) d1= d;
2737 else if(d< 2*strength) d1= 2*strength - d;
2742 if(p1&256) p1= ~(p1>>31);
2743 if(p2&256) p2= ~(p2>>31);
2745 src[y*stride-1] = p1;
2746 src[y*stride+0] = p2;
2750 d2= av_clip((p0-p3)/4, -ad1, ad1);
2752 src[y*stride-2] = p0 - d2;
2753 src[y*stride+1] = p3 + d2;
/* H.261 in-loop smoothing of an 8x8 block: a separable (1,2,1) filter.
 * The vertical pass writes into the 8x8 'temp' scratch — the top and bottom
 * rows are copied with weight 4 so the final normalization leaves them
 * unchanged — then the horizontal pass writes back with (…+8)>>4; the first
 * and last columns are written back with only the >>2 normalization.
 * NOTE(review): the temp[] declaration, the x/y/yz loop headers and the
 * closing braces are stripped from this listing. */
2758 static void h261_loop_filter_c(uint8_t *src, int stride){
2763 temp[x ] = 4*src[x ];
2764 temp[x + 7*8] = 4*src[x + 7*stride];
2768 xy = y * stride + x;
2770 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2775 src[ y*stride] = (temp[ y*8] + 2)>>2;
2776 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2778 xy = y * stride + x;
2780 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 luma deblocking core, shared by the _v/_h wrappers below: xstride
 * steps across the edge, ystride along it.  The edge is handled as four
 * 4-pixel segments, each with its own tc0[i] threshold.  When the
 * |p0-q0|<alpha and |p1-p0|,|q1-q0|<beta activity tests pass, p1'/q1' are
 * conditionally updated (each secondary |p2-p0|/|q2-q0|<beta test also
 * widens the clip range used for p0'/q0'), then p0'/q0' get the standard
 * +/-i_delta correction clipped to +/-tc.
 * NOTE(review): the tc accumulation, the tc0[i]<0 skip and the pix advance/
 * brace lines are stripped from this listing; lines kept verbatim. */
2785 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2788 for( i = 0; i < 4; i++ ) {
2793 for( d = 0; d < 4; d++ ) {
2794 const int p0 = pix[-1*xstride];
2795 const int p1 = pix[-2*xstride];
2796 const int p2 = pix[-3*xstride];
2797 const int q0 = pix[0];
2798 const int q1 = pix[1*xstride];
2799 const int q2 = pix[2*xstride];
2801 if( FFABS( p0 - q0 ) < alpha &&
2802 FFABS( p1 - p0 ) < beta &&
2803 FFABS( q1 - q0 ) < beta ) {
2808 if( FFABS( p2 - p0 ) < beta ) {
2809 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2812 if( FFABS( q2 - q0 ) < beta ) {
2813 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2817 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2818 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2819 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
/* Edge-direction wrappers: a vertical edge steps across with 'stride' and
 * along with 1; a horizontal edge swaps the two strides.
 * NOTE(review): opening/closing braces are stripped from this listing. */
2825 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2827 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2829 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2831 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 chroma deblocking core: like the luma filter but on four 2-pixel
 * segments (one tc0[i] each), touching only p0/q0 — chroma never updates
 * p1/q1 and the clip range is not widened.
 * NOTE(review): the tc<0 skip and pix advance/brace lines are stripped from
 * this listing. */
2834 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2837 for( i = 0; i < 4; i++ ) {
2838 const int tc = tc0[i];
2843 for( d = 0; d < 2; d++ ) {
2844 const int p0 = pix[-1*xstride];
2845 const int p1 = pix[-2*xstride];
2846 const int q0 = pix[0];
2847 const int q1 = pix[1*xstride];
2849 if( FFABS( p0 - q0 ) < alpha &&
2850 FFABS( p1 - p0 ) < beta &&
2851 FFABS( q1 - q0 ) < beta ) {
2853 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2855 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2856 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
/* Edge-direction wrappers for the chroma deblocker (stride layout as for
 * the luma wrappers above).
 * NOTE(review): braces stripped from this listing. */
2862 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2864 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2866 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2868 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 chroma deblocking, intra (strong) mode: no tc0 thresholds — when the
 * activity tests pass, p0/q0 are replaced by fixed (2,1,1)/4 averages of
 * their neighbours across the whole 8-pixel edge.
 * NOTE(review): the pix advance and brace lines are stripped from this
 * listing. */
2871 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2874 for( d = 0; d < 8; d++ ) {
2875 const int p0 = pix[-1*xstride];
2876 const int p1 = pix[-2*xstride];
2877 const int q0 = pix[0];
2878 const int q1 = pix[1*xstride];
2880 if( FFABS( p0 - q0 ) < alpha &&
2881 FFABS( p1 - p0 ) < beta &&
2882 FFABS( q1 - q0 ) < beta ) {
2884 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
2885 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
/* Edge-direction wrappers for the intra chroma deblocker.
 * NOTE(review): braces stripped from this listing. */
2890 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2892 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
2894 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
2896 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/**
 * Sum of absolute differences (SAD) between two 16-pixel-wide blocks.
 *
 * @param v         unused context pointer (kept for the me_cmp signature)
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size byte stride between rows of both blocks
 * @param h         number of rows to compare
 * @return sum over h rows of |pix1[i] - pix2[i]| for i = 0..15
 *
 * NOTE(review): the accumulator declaration, row loop, pointer advances,
 * return statement and stale line-number prefixes of this listing were
 * restored/removed; the 16 per-pixel terms are unchanged.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        /* advance both blocks to the next row */
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* Half-pel average of two horizontally adjacent pixels, rounding up;
 * guarded so independent restorations of this stripped listing can each
 * carry an identical definition. */
#ifndef avg2
#define avg2(a,b) ((a+b+1)>>1)
#endif

/**
 * SAD between pix1 and the horizontal half-pel interpolation of pix2
 * (each reference pixel averaged with its right neighbour) over a
 * 16-pixel-wide block of h rows.  Reads pix2[16] on each row.
 *
 * NOTE(review): loop, locals, return and the avg2 helper (all stripped from
 * this listing) were restored; the per-pixel terms are unchanged.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* Half-pel average of two vertically adjacent pixels, rounding up. */
#ifndef avg2
#define avg2(a,b) ((a+b+1)>>1)
#endif

/**
 * SAD between pix1 and the vertical half-pel interpolation of pix2 (each
 * reference pixel averaged with the pixel one row below, via pix3) over a
 * 16-pixel-wide block of h rows.  Reads one extra row below the block.
 *
 * NOTE(review): loop, locals, return and the avg2 helper (all stripped from
 * this listing) were restored; the per-pixel terms are unchanged.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* Quarter-pel average of a 2x2 neighbourhood, rounding up. */
#ifndef avg4
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
#endif

/**
 * SAD between pix1 and the diagonal half-pel interpolation of pix2 (average
 * of each 2x2 neighbourhood spanning the current and next row, via pix3)
 * over a 16-pixel-wide block of h rows.  Reads pix2[16]/pix3[16] and one
 * extra row below the block.
 *
 * NOTE(review): loop, locals, return and the avg4 helper (all stripped from
 * this listing) were restored; the per-pixel terms are unchanged.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* pix_abs8_c: plain SAD of an 8-pixel-wide block (unrolled across the row).
 * NOTE(review): extraction fragment — row loop and return elided. */
3015 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3021 s += abs(pix1[0] - pix2[0]);
3022 s += abs(pix1[1] - pix2[1]);
3023 s += abs(pix1[2] - pix2[2]);
3024 s += abs(pix1[3] - pix2[3]);
3025 s += abs(pix1[4] - pix2[4]);
3026 s += abs(pix1[5] - pix2[5]);
3027 s += abs(pix1[6] - pix2[6]);
3028 s += abs(pix1[7] - pix2[7]);
/* pix_abs8_x2_c: SAD of an 8-wide block vs. the reference shifted half a pel
 * horizontally (avg2 of each pixel with its right neighbour).
 * NOTE(review): extraction fragment — row loop and return elided. */
3035 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3041 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3042 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3043 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3044 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3045 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3046 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3047 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3048 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* pix_abs8_y2_c: SAD of an 8-wide block vs. the reference shifted half a pel
 * vertically (avg2 with the row below, pix3).
 * NOTE(review): extraction fragment — row loop and return elided. */
3055 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3058 uint8_t *pix3 = pix2 + line_size;
3062 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3063 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3064 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3065 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3066 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3067 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3068 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3069 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* pix_abs8_xy2_c: SAD of an 8-wide block vs. the reference interpolated half
 * a pel in both directions (avg4 over the 2x2 neighbourhood).
 * NOTE(review): extraction fragment — row loop and return elided. */
3077 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3080 uint8_t *pix3 = pix2 + line_size;
3084 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3085 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3086 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3087 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3088 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3089 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3090 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3091 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* nsse16_c: "noise shaping" SSE for a 16-wide block — plain SSE (score1) plus
 * a weighted difference of horizontal gradients (score2), so blocks whose
 * residual preserves source texture are penalized less. Weight comes from
 * avctx->nsse_weight when a context is supplied, else 8.
 * NOTE(review): extraction fragment — the outer per-row loop, declarations of
 * score1/score2/x/y and closing braces were elided. */
3099 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3100 MpegEncContext *c = v;
3106 for(x=0; x<16; x++){
3107 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3110 for(x=0; x<15; x++){
3111 score2+= FFABS( s1[x ] - s1[x +stride]
3112 - s1[x+1] + s1[x+1+stride])
3113 -FFABS( s2[x ] - s2[x +stride]
3114 - s2[x+1] + s2[x+1+stride]);
3121 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3122 else return score1 + FFABS(score2)*8;
/* nsse8_c: 8-wide variant of nsse16_c above — SSE plus weighted gradient
 * difference. NOTE(review): extraction fragment — the x/y loop headers and
 * declarations were elided (loops presumably run to 8 and 7; confirm against
 * the full source). */
3125 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3126 MpegEncContext *c = v;
3133 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3137 score2+= FFABS( s1[x ] - s1[x +stride]
3138 - s1[x+1] + s1[x+1+stride])
3139 -FFABS( s2[x ] - s2[x +stride]
3140 - s2[x+1] + s2[x+1+stride]);
3147 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3148 else return score1 + FFABS(score2)*8;
/* try_8x8basis_c: evaluates the weighted squared error that would remain if
 * `scale` times a basis function were added to the residual `rem` (used by
 * the trellis/basis search in the encoder). The shift converts from
 * BASIS_SHIFT to RECON_SHIFT precision with rounding.
 * NOTE(review): extraction fragment — declarations of i/w/sum, the weight
 * lookup and the final return were elided. */
3151 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3155 for(i=0; i<8*8; i++){
3156 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3159 assert(-512<b && b<512);
3161 sum += (w*b)*(w*b)>>4;
/* add_8x8basis_c: adds `scale` times a basis function into the residual,
 * with the same rounding/shift as try_8x8basis_c above.
 * NOTE(review): extraction fragment — closing braces elided. */
3166 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3169 for(i=0; i<8*8; i++){
3170 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
/* NOTE(review): the doxygen comment below lost its opening marker in this
 * extraction; it documents ff_block_permute. */
3175 * permutes an 8x8 block.
3176 * @param block the block which will be permuted according to the given permutation vector
3177 * @param permutation the permutation vector
3178 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3179 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3180 * (inverse) permutated to scantable order!
3182 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3188 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* First pass: copy the used coefficients out of `block` (the copy into a
 * `temp` array was elided by the extraction); second pass writes them back
 * at their permuted positions. */
3190 for(i=0; i<=last; i++){
3191 const int j= scantable[i];
3196 for(i=0; i<=last; i++){
3197 const int j= scantable[i];
3198 const int perm_j= permutation[j];
3199 block[perm_j]= temp[j];
/* zero_cmp: trivial me_cmp_func used when comparison cost should be ignored.
 * NOTE(review): body elided by the extraction — presumably `return 0;`,
 * confirm against the full source. */
3203 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* ff_set_cmp: fills the 5-entry cmp[] function-pointer array from the
 * DSPContext according to the FF_CMP_* `type`. Only some of the switch
 * cases survived this extraction; logging a selection error is the fallback.
 * NOTE(review): switch statement, case labels and loop over i were elided. */
3207 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3210 memset(cmp, 0, sizeof(void*)*5);
3218 cmp[i]= c->hadamard8_diff[i];
3224 cmp[i]= c->dct_sad[i];
3227 cmp[i]= c->dct264_sad[i];
3230 cmp[i]= c->dct_max[i];
3233 cmp[i]= c->quant_psnr[i];
3253 #ifdef CONFIG_SNOW_ENCODER
3262 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* clear_blocks_c: zeroes six 8x8 DCT coefficient blocks in one call. */
3268 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3270 static void clear_blocks_c(DCTELEM *blocks)
3272 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* add_bytes_c: dst[i] += src[i] for i in [0,w): unrolled by 8, with a
 * scalar tail loop (tail loop header elided by the extraction). */
3275 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3277 for(i=0; i+7<w; i+=8){
3278 dst[i+0] += src[i+0];
3279 dst[i+1] += src[i+1];
3280 dst[i+2] += src[i+2];
3281 dst[i+3] += src[i+3];
3282 dst[i+4] += src[i+4];
3283 dst[i+5] += src[i+5];
3284 dst[i+6] += src[i+6];
3285 dst[i+7] += src[i+7];
3288 dst[i+0] += src[i+0];
/* diff_bytes_c: dst[i] = src1[i] - src2[i] for i in [0,w): unrolled by 8,
 * with a scalar tail loop (tail loop header elided by the extraction). */
3291 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3293 for(i=0; i+7<w; i+=8){
3294 dst[i+0] = src1[i+0]-src2[i+0];
3295 dst[i+1] = src1[i+1]-src2[i+1];
3296 dst[i+2] = src1[i+2]-src2[i+2];
3297 dst[i+3] = src1[i+3]-src2[i+3];
3298 dst[i+4] = src1[i+4]-src2[i+4];
3299 dst[i+5] = src1[i+5]-src2[i+5];
3300 dst[i+6] = src1[i+6]-src2[i+6];
3301 dst[i+7] = src1[i+7]-src2[i+7];
3304 dst[i+0] = src1[i+0]-src2[i+0];
/* sub_hfyu_median_prediction_c: HuffYUV median-prediction subtraction —
 * dst gets src1 minus the median of (left, above, left+above-aboveleft).
 * NOTE(review): body largely elided; only the median predictor expression
 * survives. l/lt track left and left-top, carried back via *left/*left_top. */
3307 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3315 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Butterfly helpers for the 8x8 Hadamard transform below.
 * NOTE(review): the continuation lines of BUTTERFLY2/BUTTERFLY1 were elided;
 * BUTTERFLYA folds the final stage and absolute-value sum into one step. */
3325 #define BUTTERFLY2(o1,o2,i1,i2) \
3329 #define BUTTERFLY1(x,y) \
3338 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* hadamard8_diff8x8_c: SATD of src-dst over an 8x8 block — applies a full
 * 8x8 Hadamard transform (rows first, then columns) to the difference and
 * sums absolute coefficients. The trailing printf is debug code guarded by
 * an (elided) #if in the full source.
 * NOTE(review): extraction fragment — the temp[] declaration, loop headers
 * and return were elided. */
3340 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3348 //FIXME try pointer walks
3349 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3350 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3351 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3352 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3354 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3355 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3356 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3357 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3359 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3360 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3361 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3362 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3366 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3367 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3368 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3369 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3371 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3372 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3373 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3374 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3377 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3378 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3379 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3380 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3386 printf("MAX:%d\n", maxi);
/* hadamard8_intra8x8_c: same 8x8 Hadamard SATD as hadamard8_diff8x8_c but on
 * the source block itself (no reference), with the DC term subtracted at the
 * end so the score measures AC energy only.
 * NOTE(review): extraction fragment — temp[] declaration, loop headers and
 * return elided. */
3392 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3400 //FIXME try pointer walks
3401 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3402 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3403 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3404 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3406 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3407 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3408 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3409 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3411 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3412 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3413 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3414 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3418 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3419 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3420 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3421 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3423 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3424 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3425 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3426 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3429 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3430 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3431 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3432 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3435 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* dct_sad8x8_c: DCT-domain SAD — computes the pixel difference, forward
 * DCTs it (fdct call elided by the extraction) and returns the sum of
 * absolute coefficients via sum_abs_dctelem. */
3440 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3441 MpegEncContext * const s= (MpegEncContext *)c;
3442 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3443 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3447 s->dsp.diff_pixels(temp, src1, src2, stride);
3449 return s->dsp.sum_abs_dctelem(temp);
/* NOTE(review): body of the DCT8_1D macro (the `#define DCT8_1D` line itself
 * and DST(0,...)/DST(4,...) were elided by the extraction). This is the
 * H.264/AVC 8x8 forward integer transform: s*/d* are symmetric/antisymmetric
 * sums, a0..a7 the intermediate stages, DST() emits the 8 outputs. */
3454 const int s07 = SRC(0) + SRC(7);\
3455 const int s16 = SRC(1) + SRC(6);\
3456 const int s25 = SRC(2) + SRC(5);\
3457 const int s34 = SRC(3) + SRC(4);\
3458 const int a0 = s07 + s34;\
3459 const int a1 = s16 + s25;\
3460 const int a2 = s07 - s34;\
3461 const int a3 = s16 - s25;\
3462 const int d07 = SRC(0) - SRC(7);\
3463 const int d16 = SRC(1) - SRC(6);\
3464 const int d25 = SRC(2) - SRC(5);\
3465 const int d34 = SRC(3) - SRC(4);\
3466 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3467 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3468 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3469 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3471 DST(1, a4 + (a7>>2)) ;\
3472 DST(2, a2 + (a3>>1)) ;\
3473 DST(3, a5 + (a6>>2)) ;\
3475 DST(5, a6 - (a5>>2)) ;\
3476 DST(6, (a2>>1) - a3 ) ;\
3477 DST(7, (a4>>2) - a7 ) ;\
/* dct264_sad8x8_c: SATD using the H.264 8x8 transform — DCT8_1D is applied
 * per row (SRC/DST redefined to index rows), then per column with DST
 * accumulating absolute values into sum.
 * NOTE(review): dct[][] declaration, #undef lines and return elided. */
3480 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3481 MpegEncContext * const s= (MpegEncContext *)c;
3486 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3488 #define SRC(x) dct[i][x]
3489 #define DST(x,v) dct[i][x]= v
3490 for( i = 0; i < 8; i++ )
3495 #define SRC(x) dct[x][i]
3496 #define DST(x,v) sum += FFABS(v)
3497 for( i = 0; i < 8; i++ )
/* dct_max8x8_c: forward-DCTs the pixel difference (fdct call elided) and
 * returns the maximum absolute coefficient rather than a sum.
 * NOTE(review): loop header over i and return elided. */
3505 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3506 MpegEncContext * const s= (MpegEncContext *)c;
3507 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3508 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3513 s->dsp.diff_pixels(temp, src1, src2, stride);
3517 sum= FFMAX(sum, FFABS(temp[i]));
/* quant_psnr8x8_c: measures quantization distortion — DCTs the difference,
 * keeps an unquantized copy in `bak`, quantizes + dequantizes + IDCTs the
 * working copy, and sums the squared coefficient error against the copy.
 * NOTE(review): fdct call, loop header and return elided. */
3522 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3523 MpegEncContext * const s= (MpegEncContext *)c;
3524 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3525 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3526 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3532 s->dsp.diff_pixels(temp, src1, src2, stride);
3534 memcpy(bak, temp, 64*sizeof(DCTELEM));
3536 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3537 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3538 ff_simple_idct(temp); //FIXME
3541 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* rd8x8_c: rate-distortion score for an 8x8 block. Saves the reference rows
 * into `bak`, DCT+quantizes the difference, counts the VLC bits of the
 * quantized coefficients (intra vs. inter tables chosen by an elided
 * mb_intra test), reconstructs into bak and returns
 * distortion + lambda-weighted bits. Note the existing spelling
 * "distoration" is kept as-is (local variable, behavior-neutral).
 * NOTE(review): extraction fragment — several conditionals, loop headers,
 * esc_length fallbacks and run/level bookkeeping lines were elided. */
3546 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3547 MpegEncContext * const s= (MpegEncContext *)c;
3548 const uint8_t *scantable= s->intra_scantable.permutated;
3549 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3550 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3551 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3552 uint8_t * const bak= (uint8_t*)aligned_bak;
3553 int i, last, run, bits, level, distoration, start_i;
3554 const int esc_length= s->ac_esc_length;
3556 uint8_t * last_length;
/* save the 8 reference rows (two 32-bit copies per row) so we can
 * reconstruct into them and SSE against src1 afterwards */
3561 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3562 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3565 s->dsp.diff_pixels(temp, src1, src2, stride);
3567 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3573 length = s->intra_ac_vlc_length;
3574 last_length= s->intra_ac_vlc_last_length;
3575 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3578 length = s->inter_ac_vlc_length;
3579 last_length= s->inter_ac_vlc_last_length;
/* count bits for all coefficients up to (but excluding) the last one;
 * levels are biased by +64 to index the UNI_AC tables, escapes cost
 * esc_length (escape branch elided) */
3584 for(i=start_i; i<last; i++){
3585 int j= scantable[i];
3590 if((level&(~127)) == 0){
3591 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3600 level= temp[i] + 64;
3604 if((level&(~127)) == 0){
3605 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3613 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3615 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3618 s->dsp.idct_add(bak, stride, temp);
3620 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3622 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* bit8x8_c: like rd8x8_c but returns only the VLC bit count for the
 * quantized 8x8 difference block (no reconstruction / distortion term).
 * NOTE(review): extraction fragment — the mb_intra conditional, run/level
 * bookkeeping, escape branches and final return of `bits` were elided. */
3625 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3626 MpegEncContext * const s= (MpegEncContext *)c;
3627 const uint8_t *scantable= s->intra_scantable.permutated;
3628 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3629 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3630 int i, last, run, bits, level, start_i;
3631 const int esc_length= s->ac_esc_length;
3633 uint8_t * last_length;
3637 s->dsp.diff_pixels(temp, src1, src2, stride);
3639 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3645 length = s->intra_ac_vlc_length;
3646 last_length= s->intra_ac_vlc_last_length;
3647 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3650 length = s->inter_ac_vlc_length;
3651 last_length= s->inter_ac_vlc_last_length;
3656 for(i=start_i; i<last; i++){
3657 int j= scantable[i];
3662 if((level&(~127)) == 0){
3663 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3672 level= temp[i] + 64;
3676 if((level&(~127)) == 0){
3677 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* vsad_intra16_c: vertical SAD of the source with itself — sums absolute
 * differences between vertically adjacent pixels across 16 columns
 * (outer row loop and return elided by the extraction). */
3685 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3690 for(x=0; x<16; x+=4){
3691 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride])
3692 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
/* vsad16_c: vertical SAD of the residual — absolute vertical gradient of
 * (s1 - s2) across 16 columns (outer row loop and return elided). */
3700 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3705 for(x=0; x<16; x++){
3706 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ: square helper; vsse_intra16_c is the squared-error analogue of
 * vsad_intra16_c (outer row loop and return elided). */
3715 #define SQ(a) ((a)*(a))
3716 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3721 for(x=0; x<16; x+=4){
3722 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3723 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* vsse16_c: squared-error analogue of vsad16_c — squared vertical gradient
 * of the residual (outer row loop and return elided). */
3731 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3736 for(x=0; x<16; x++){
3737 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* ssd_int8_vs_int16_c: sum of squared differences between an int8 and an
 * int16 array of `size` elements (parameter list continues on an elided
 * line; return of `score` also elided). */
3746 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3750 for(i=0; i<size; i++)
3751 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Generate the 16x16 versions of the 8x8 comparison functions above by
 * summing the 8x8 function over the four quadrants (WRAPPER8_16_SQ macro
 * is defined outside this extract). dct264 variant is #if-guarded in the
 * full source (guard lines elided). */
3755 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3756 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3757 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3759 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3761 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3762 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3763 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3764 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* vector_fmul_c: in-place elementwise float multiply, dst[i] *= src[i]
 * (the multiply statement itself was elided by the extraction). */
3766 static void vector_fmul_c(float *dst, const float *src, int len){
3768 for(i=0; i<len; i++)
/* vector_fmul_reverse_c: dst[i] = src0[i] * src1 traversed backwards.
 * NOTE(review): the src1[-i] indexing only makes sense with an elided
 * `src1 += len-1;` before the loop — confirm against the full source. */
3772 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3775 for(i=0; i<len; i++)
3776 dst[i] = src0[i] * src1[-i];
/* ff_vector_fmul_add_add_c: dst[i*step] = src0[i]*src1[i] + src2[i] + src3,
 * i.e. fused multiply-add with an extra scalar bias and strided output. */
3779 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3781 for(i=0; i<len; i++)
3782 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
/* ff_float_to_int16_c: converts floats (pre-biased into a fixed range) to
 * int16 by reinterpreting the IEEE-754 bits and clipping via the 0x43c0ffff
 * threshold trick, then removing the 0x8000 bias.
 * NOTE(review): reinterpreting float bits through an int32_t* violates
 * strict aliasing (UB in modern compilers — memcpy would be the safe form);
 * the clipping branch between lines 3790 and 3795 was elided. */
3785 void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
3787 for(i=0; i<len; i++) {
3788 int_fast32_t tmp = ((int32_t*)src)[i];
3790 tmp = (0x43c0ffff - tmp)>>31;
3791 // is this faster on some gcc/cpu combinations?
3792 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3795 dst[i] = tmp - 0x8000;
/* Fixed-point IDCT constants: round(2048*sqrt(2)*cos(k*pi/16)) for k=1..7,
 * used by the WMV2 IDCT below. NOTE(review): the companion `#define W0`
 * referenced by wmv2_idct_row/col was elided by this extraction. */
3800 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3801 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3802 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3803 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3804 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3805 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3806 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/* wmv2_idct_row: one row of the WMV2 8-point inverse DCT. a0..a7 are the
 * even/odd butterfly terms, s1/s2 the rotated odd-part combinations; each
 * output is rounded (+1<<7) and scaled back by >>8.
 * NOTE(review): opening brace and the s1/s2 declarations were elided. */
3808 static void wmv2_idct_row(short * b)
3811 int a0,a1,a2,a3,a4,a5,a6,a7;
3813 a1 = W1*b[1]+W7*b[7];
3814 a7 = W7*b[1]-W1*b[7];
3815 a5 = W5*b[5]+W3*b[3];
3816 a3 = W3*b[5]-W5*b[3];
3817 a2 = W2*b[2]+W6*b[6];
3818 a6 = W6*b[2]-W2*b[6];
3819 a0 = W0*b[0]+W0*b[4];
3820 a4 = W0*b[0]-W0*b[4];
3822 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3823 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3825 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3826 b[1] = (a4+a6 +s1 + (1<<7))>>8;
3827 b[2] = (a4-a6 +s2 + (1<<7))>>8;
3828 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3829 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3830 b[5] = (a4-a6 -s2 + (1<<7))>>8;
3831 b[6] = (a4+a6 -s1 + (1<<7))>>8;
3832 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* wmv2_idct_col: one column (stride 8) of the WMV2 inverse DCT. Same
 * structure as wmv2_idct_row but the first stage keeps 3 extra bits of
 * precision (+4 >>3) and the final rounding is (+1<<13) >>14.
 * NOTE(review): opening brace and s1/s2 declarations were elided. */
3834 static void wmv2_idct_col(short * b)
3837 int a0,a1,a2,a3,a4,a5,a6,a7;
3838 /*step 1, with extended precision*/
3839 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3840 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3841 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3842 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3843 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3844 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3845 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
3846 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
3848 s1 = (181*(a1-a5+a7-a3)+128)>>8;
3849 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3851 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3852 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
3853 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
3854 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3856 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3857 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
3858 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
3859 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* ff_wmv2_idct_c: full 2D WMV2 IDCT — rows first, then columns (the two
 * loop headers stepping i by 8 and by 1 were elided by the extraction). */
3861 void ff_wmv2_idct_c(short * block){
3865 wmv2_idct_row(block+i);
3868 wmv2_idct_col(block+i);
/* ff_wmv2_idct_put_c: IDCT then store clamped pixels (put = overwrite). */
3871 /* XXX: those functions should be suppressed ASAP when all IDCTs are
3873 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
3875 ff_wmv2_idct_c(block);
3876 put_pixels_clamped_c(block, dest, line_size);
/* ff_wmv2_idct_add_c: IDCT then add clamped result onto the prediction. */
3878 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
3880 ff_wmv2_idct_c(block);
3881 add_pixels_clamped_c(block, dest, line_size);
/* jref IDCT put/add wrappers at full, 1/2 (idct4), 1/4 (idct2) and 1/8
 * (idct1) resolution for lowres decoding. Each calls the matching j_rev_dct*
 * (calls elided by the extraction) then stores/adds clamped pixels; the
 * 1x1 versions reduce to clamping the rounded DC term directly. */
3883 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3886 put_pixels_clamped_c(block, dest, line_size);
3888 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3891 add_pixels_clamped_c(block, dest, line_size);
3894 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3897 put_pixels_clamped4_c(block, dest, line_size);
3899 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3902 add_pixels_clamped4_c(block, dest, line_size);
3905 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3908 put_pixels_clamped2_c(block, dest, line_size);
3910 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3913 add_pixels_clamped2_c(block, dest, line_size);
3916 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3918 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3920 dest[0] = cm[(block[0] + 4)>>3];
3922 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3924 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3926 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
/* just_return: no-op placeholder installed as the default prefetch hook. */
3929 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
/* dsputil_static_init: fills the global lookup tables — ff_cropTbl (clamp
 * to [0,255] with MAX_NEG_CROP guard bands; the low-side 0 fill at line
 * 3938 was elided), ff_squareTbl ((i-256)^2), and inv_zigzag_direct16. */
3931 /* init static data */
3932 void dsputil_static_init(void)
3936 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3937 for(i=0;i<MAX_NEG_CROP;i++) {
3939 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3942 for(i=0;i<512;i++) {
3943 ff_squareTbl[i] = (i - 256) * (i - 256);
3946 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* ff_check_alignment: verifies the compiler honours 16-byte stack alignment
 * of DECLARE_ALIGNED_16 locals; logs a one-time warning on MMX/AltiVec
 * builds if not. `did_fail` rate-limits the message (its set/test lines and
 * the return were elided by the extraction). */
3949 int ff_check_alignment(void){
3950 static int did_fail=0;
3951 DECLARE_ALIGNED_16(int, aligned);
3953 if((long)&aligned & 15){
3955 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3956 av_log(NULL, AV_LOG_ERROR,
3957 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3958 "and may be very slow or crash. This is not a bug in libavcodec,\n"
3959 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
3960 "Do not report crashes to FFmpeg developers.\n");
3969 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3973 ff_check_alignment();
3975 #ifdef CONFIG_ENCODERS
3976 if(avctx->dct_algo==FF_DCT_FASTINT) {
3977 c->fdct = fdct_ifast;
3978 c->fdct248 = fdct_ifast248;
3980 else if(avctx->dct_algo==FF_DCT_FAAN) {
3981 c->fdct = ff_faandct;
3982 c->fdct248 = ff_faandct248;
3985 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3986 c->fdct248 = ff_fdct248_islow;
3988 #endif //CONFIG_ENCODERS
3990 if(avctx->lowres==1){
3991 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
3992 c->idct_put= ff_jref_idct4_put;
3993 c->idct_add= ff_jref_idct4_add;
3995 c->idct_put= ff_h264_lowres_idct_put_c;
3996 c->idct_add= ff_h264_lowres_idct_add_c;
3998 c->idct = j_rev_dct4;
3999 c->idct_permutation_type= FF_NO_IDCT_PERM;
4000 }else if(avctx->lowres==2){
4001 c->idct_put= ff_jref_idct2_put;
4002 c->idct_add= ff_jref_idct2_add;
4003 c->idct = j_rev_dct2;
4004 c->idct_permutation_type= FF_NO_IDCT_PERM;
4005 }else if(avctx->lowres==3){
4006 c->idct_put= ff_jref_idct1_put;
4007 c->idct_add= ff_jref_idct1_add;
4008 c->idct = j_rev_dct1;
4009 c->idct_permutation_type= FF_NO_IDCT_PERM;
4011 if(avctx->idct_algo==FF_IDCT_INT){
4012 c->idct_put= ff_jref_idct_put;
4013 c->idct_add= ff_jref_idct_add;
4014 c->idct = j_rev_dct;
4015 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4016 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4017 avctx->idct_algo==FF_IDCT_VP3){
4018 c->idct_put= ff_vp3_idct_put_c;
4019 c->idct_add= ff_vp3_idct_add_c;
4020 c->idct = ff_vp3_idct_c;
4021 c->idct_permutation_type= FF_NO_IDCT_PERM;
4022 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4023 c->idct_put= ff_wmv2_idct_put_c;
4024 c->idct_add= ff_wmv2_idct_add_c;
4025 c->idct = ff_wmv2_idct_c;
4026 c->idct_permutation_type= FF_NO_IDCT_PERM;
4027 }else{ //accurate/default
4028 c->idct_put= ff_simple_idct_put;
4029 c->idct_add= ff_simple_idct_add;
4030 c->idct = ff_simple_idct;
4031 c->idct_permutation_type= FF_NO_IDCT_PERM;
4035 if (ENABLE_H264_DECODER) {
4036 c->h264_idct_add= ff_h264_idct_add_c;
4037 c->h264_idct8_add= ff_h264_idct8_add_c;
4038 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4039 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4042 c->get_pixels = get_pixels_c;
4043 c->diff_pixels = diff_pixels_c;
4044 c->put_pixels_clamped = put_pixels_clamped_c;
4045 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4046 c->add_pixels_clamped = add_pixels_clamped_c;
4047 c->add_pixels8 = add_pixels8_c;
4048 c->add_pixels4 = add_pixels4_c;
4049 c->sum_abs_dctelem = sum_abs_dctelem_c;
4052 c->clear_blocks = clear_blocks_c;
4053 c->pix_sum = pix_sum_c;
4054 c->pix_norm1 = pix_norm1_c;
4056 /* TODO [0] 16 [1] 8 */
4057 c->pix_abs[0][0] = pix_abs16_c;
4058 c->pix_abs[0][1] = pix_abs16_x2_c;
4059 c->pix_abs[0][2] = pix_abs16_y2_c;
4060 c->pix_abs[0][3] = pix_abs16_xy2_c;
4061 c->pix_abs[1][0] = pix_abs8_c;
4062 c->pix_abs[1][1] = pix_abs8_x2_c;
4063 c->pix_abs[1][2] = pix_abs8_y2_c;
4064 c->pix_abs[1][3] = pix_abs8_xy2_c;
4066 #define dspfunc(PFX, IDX, NUM) \
4067 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4068 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4069 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4070 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4072 dspfunc(put, 0, 16);
4073 dspfunc(put_no_rnd, 0, 16);
4075 dspfunc(put_no_rnd, 1, 8);
4079 dspfunc(avg, 0, 16);
4080 dspfunc(avg_no_rnd, 0, 16);
4082 dspfunc(avg_no_rnd, 1, 8);
4087 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4088 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4090 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4091 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4092 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4093 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4094 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4095 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4096 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4097 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4098 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4100 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4101 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4102 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4103 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4104 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4105 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4106 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4107 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4108 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4110 #define dspfunc(PFX, IDX, NUM) \
4111 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4112 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4113 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4114 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4115 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4116 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4117 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4118 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4119 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4120 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4121 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4122 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4123 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4124 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4125 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4126 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4128 dspfunc(put_qpel, 0, 16);
4129 dspfunc(put_no_rnd_qpel, 0, 16);
4131 dspfunc(avg_qpel, 0, 16);
4132 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4134 dspfunc(put_qpel, 1, 8);
4135 dspfunc(put_no_rnd_qpel, 1, 8);
4137 dspfunc(avg_qpel, 1, 8);
4138 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4140 dspfunc(put_h264_qpel, 0, 16);
4141 dspfunc(put_h264_qpel, 1, 8);
4142 dspfunc(put_h264_qpel, 2, 4);
4143 dspfunc(put_h264_qpel, 3, 2);
4144 dspfunc(avg_h264_qpel, 0, 16);
4145 dspfunc(avg_h264_qpel, 1, 8);
4146 dspfunc(avg_h264_qpel, 2, 4);
4149 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4150 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4151 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4152 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4153 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4154 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4155 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4157 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4158 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4159 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4160 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4161 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4162 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4163 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4164 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4165 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4166 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4167 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4168 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4169 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4170 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4171 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4172 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4173 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4174 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4175 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4176 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4178 #ifdef CONFIG_CAVS_DECODER
4179 ff_cavsdsp_init(c,avctx);
4181 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4182 ff_vc1dsp_init(c,avctx);
4184 #if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4185 ff_intrax8dsp_init(c,avctx);
4187 #if defined(CONFIG_H264_ENCODER)
4188 ff_h264dspenc_init(c,avctx);
4191 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4192 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4193 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4194 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4195 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4196 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4197 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4198 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4200 #define SET_CMP_FUNC(name) \
4201 c->name[0]= name ## 16_c;\
4202 c->name[1]= name ## 8x8_c;
4204 SET_CMP_FUNC(hadamard8_diff)
4205 c->hadamard8_diff[4]= hadamard8_intra16_c;
4206 SET_CMP_FUNC(dct_sad)
4207 SET_CMP_FUNC(dct_max)
4209 SET_CMP_FUNC(dct264_sad)
4211 c->sad[0]= pix_abs16_c;
4212 c->sad[1]= pix_abs8_c;
4216 SET_CMP_FUNC(quant_psnr)
4219 c->vsad[0]= vsad16_c;
4220 c->vsad[4]= vsad_intra16_c;
4221 c->vsse[0]= vsse16_c;
4222 c->vsse[4]= vsse_intra16_c;
4223 c->nsse[0]= nsse16_c;
4224 c->nsse[1]= nsse8_c;
4225 #ifdef CONFIG_SNOW_ENCODER
4226 c->w53[0]= w53_16_c;
4228 c->w97[0]= w97_16_c;
4232 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4234 c->add_bytes= add_bytes_c;
4235 c->diff_bytes= diff_bytes_c;
4236 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4237 c->bswap_buf= bswap_buf;
4239 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4240 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4241 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4242 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4243 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4244 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4245 c->h264_loop_filter_strength= NULL;
4247 if (ENABLE_ANY_H263) {
4248 c->h263_h_loop_filter= h263_h_loop_filter_c;
4249 c->h263_v_loop_filter= h263_v_loop_filter_c;
4252 c->h261_loop_filter= h261_loop_filter_c;
4254 c->try_8x8basis= try_8x8basis_c;
4255 c->add_8x8basis= add_8x8basis_c;
4257 #ifdef CONFIG_SNOW_DECODER
4258 c->vertical_compose97i = ff_snow_vertical_compose97i;
4259 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4260 c->inner_add_yblock = ff_snow_inner_add_yblock;
4263 #ifdef CONFIG_VORBIS_DECODER
4264 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4266 #ifdef CONFIG_FLAC_ENCODER
4267 c->flac_compute_autocorr = ff_flac_compute_autocorr;
4269 c->vector_fmul = vector_fmul_c;
4270 c->vector_fmul_reverse = vector_fmul_reverse_c;
4271 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4272 c->float_to_int16 = ff_float_to_int16_c;
4274 c->shrink[0]= ff_img_copy_plane;
4275 c->shrink[1]= ff_shrink22;
4276 c->shrink[2]= ff_shrink44;
4277 c->shrink[3]= ff_shrink88;
4279 c->prefetch= just_return;
4281 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4282 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4284 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4285 if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
4286 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4287 if (ENABLE_VIS) dsputil_init_vis (c, avctx);
4288 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4289 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4290 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4291 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4292 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
4294 for(i=0; i<64; i++){
4295 if(!c->put_2tap_qpel_pixels_tab[0][i])
4296 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4297 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4298 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4301 switch(c->idct_permutation_type){
4302 case FF_NO_IDCT_PERM:
4304 c->idct_permutation[i]= i;
4306 case FF_LIBMPEG2_IDCT_PERM:
4308 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4310 case FF_SIMPLE_IDCT_PERM:
4312 c->idct_permutation[i]= simple_mmx_permutation[i];
4314 case FF_TRANSPOSE_IDCT_PERM:
4316 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4318 case FF_PARTTRANS_IDCT_PERM:
4320 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4323 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");