3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "mpegvideo.h"
33 #include "simple_idct.h"
/* Prototypes for DSP routines implemented in other translation units and
 * wired into DSPContext here (snow.c, vorbis.c, flac code respectively).
 * NOTE(review): this listing is truncated — other includes are not visible. */
39 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
42 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
45 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
/* Lookup tables zero-initialized here and filled at init time elsewhere:
 * ff_cropTbl   - byte clamping table, used below via cm = ff_cropTbl + MAX_NEG_CROP
 * ff_squareTbl - squares table, used below via sq = ff_squareTbl + 256 so that
 *                sq[d] gives d*d for signed differences d in [-256, 255]. */
47 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
48 uint32_t ff_squareTbl[512] = {0, };
/* Classic zig-zag scan order: maps scan position -> coefficient index in an
 * 8x8 block. NOTE(review): the closing "};" lies outside this excerpt. */
50 const uint8_t ff_zigzag_direct[64] = {
51 0, 1, 8, 16, 9, 2, 3, 10,
52 17, 24, 32, 25, 18, 11, 4, 5,
53 12, 19, 26, 33, 40, 48, 41, 34,
54 27, 20, 13, 6, 7, 14, 21, 28,
55 35, 42, 49, 56, 57, 50, 43, 36,
56 29, 22, 15, 23, 30, 37, 44, 51,
57 58, 59, 52, 45, 38, 31, 39, 46,
58 53, 60, 61, 54, 47, 55, 62, 63
61 /* Specific zigzag scan for 248 idct. NOTE that unlike the
62 specification, we interleave the fields */
/* Zig-zag scan for the 2-4-8 IDCT; per the comment above, fields are
 * interleaved unlike the specification's ordering.
 * NOTE(review): the closing "};" lies outside this excerpt. */
63 const uint8_t ff_zigzag248_direct[64] = {
64 0, 8, 1, 9, 16, 24, 2, 10,
65 17, 25, 32, 40, 48, 56, 33, 41,
66 18, 26, 3, 11, 4, 12, 19, 27,
67 34, 42, 49, 57, 50, 58, 35, 43,
68 20, 28, 5, 13, 6, 14, 21, 29,
69 36, 44, 51, 59, 52, 60, 37, 45,
70 22, 30, 7, 15, 23, 31, 38, 46,
71 53, 61, 54, 62, 39, 47, 55, 63,
74 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* Filled at runtime; 8-byte alignment is required by the MMX quantizer. */
75 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate horizontal scan order (scan position -> coefficient index).
 * NOTE(review): the closing "};" lies outside this excerpt. */
77 const uint8_t ff_alternate_horizontal_scan[64] = {
78 0, 1, 2, 3, 8, 9, 16, 17,
79 10, 11, 4, 5, 6, 7, 15, 14,
80 13, 12, 19, 18, 24, 25, 32, 33,
81 26, 27, 20, 21, 22, 23, 28, 29,
82 30, 31, 34, 35, 40, 41, 48, 49,
83 42, 43, 36, 37, 38, 39, 44, 45,
84 46, 47, 50, 51, 56, 57, 58, 59,
85 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan order (scan position -> coefficient index),
 * typically used for interlaced/field-coded macroblocks.
 * NOTE(review): the closing "};" lies outside this excerpt. */
88 const uint8_t ff_alternate_vertical_scan[64] = {
89 0, 8, 16, 24, 1, 9, 2, 10,
90 17, 25, 32, 40, 48, 56, 57, 49,
91 41, 33, 26, 18, 3, 11, 4, 12,
92 19, 27, 34, 42, 50, 58, 35, 43,
93 51, 59, 20, 28, 5, 13, 6, 14,
94 21, 29, 36, 44, 52, 60, 37, 45,
95 53, 61, 22, 30, 7, 15, 23, 31,
96 38, 46, 54, 62, 39, 47, 55, 63,
99 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* Reciprocal table for replacing integer division by multiply + shift, per
 * the identity above. Entry 0 is unused; entry 1 is saturated to UINT32_MAX. */
100 const uint32_t ff_inverse[256]={
101 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
102 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
103 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
104 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
105 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
106 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
107 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
108 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
109 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
110 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
111 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
112 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
113 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
114 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
115 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
116 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
117 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
118 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
119 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
120 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
121 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
122 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
123 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
124 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
125 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
126 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
127 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
128 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
129 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
130 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
131 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
132 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
135 /* Input permutation for the simple_idct_mmx */
/* Each entry gives the permuted coefficient position (hex) for the MMX
 * simple IDCT. NOTE(review): the closing "};" lies outside this excerpt. */
136 static const uint8_t simple_mmx_permutation[64]={
137 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
138 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
139 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
140 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
141 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
142 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
143 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
144 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Sum of all pixel values of a 16x16 block (rows advance by line_size).
 * NOTE(review): listing is truncated — locals, the unrolled 8-pixel inner
 * accumulation, closing braces and the return are not visible here. */
147 static int pix_sum_c(uint8_t * pix, int line_size)
152 for (i = 0; i < 16; i++) {
153 for (j = 0; j < 16; j += 8) {
/* step from the end of one 16-pixel row to the start of the next */
164 pix += line_size - 16;
/* Sum of squared pixel values over a 16x16 block, via the squares table
 * (sq[v] == v*v for v in range). Two code paths: a single 64-bit load per
 * 8 pixels where long exceeds 32 bits, else two 32-bit loads.
 * NOTE(review): truncated — locals, sq[x&0xff] lines, #else/#endif and the
 * return are missing from this excerpt. */
169 static int pix_norm1_c(uint8_t * pix, int line_size)
172 uint32_t *sq = ff_squareTbl + 256;
175 for (i = 0; i < 16; i++) {
176 for (j = 0; j < 16; j += 8) {
187 #if LONG_MAX > 2147483647
/* 64-bit path: one load, square each byte lane via the table */
188 register uint64_t x=*(uint64_t*)pix;
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 s += sq[(x>>32)&0xff];
194 s += sq[(x>>40)&0xff];
195 s += sq[(x>>48)&0xff];
196 s += sq[(x>>56)&0xff];
/* 32-bit path: two loads of 4 bytes each */
198 register uint32_t x=*(uint32_t*)pix;
200 s += sq[(x>>8)&0xff];
201 s += sq[(x>>16)&0xff];
202 s += sq[(x>>24)&0xff];
203 x=*(uint32_t*)(pix+4);
205 s += sq[(x>>8)&0xff];
206 s += sq[(x>>16)&0xff];
207 s += sq[(x>>24)&0xff];
/* step to the next row */
212 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst, unrolled 8 words at a time;
 * the trailing line handles leftover words (<8).
 * NOTE(review): the tail loop header and closing braces are not visible. */
217 static void bswap_buf(uint32_t *dst, uint32_t *src, int w){
220 for(i=0; i+8<=w; i+=8){
221 dst[i+0]= bswap_32(src[i+0]);
222 dst[i+1]= bswap_32(src[i+1]);
223 dst[i+2]= bswap_32(src[i+2]);
224 dst[i+3]= bswap_32(src[i+3]);
225 dst[i+4]= bswap_32(src[i+4]);
226 dst[i+5]= bswap_32(src[i+5]);
227 dst[i+6]= bswap_32(src[i+6]);
228 dst[i+7]= bswap_32(src[i+7]);
/* remainder loop body (its for-header is missing from this excerpt) */
231 dst[i+0]= bswap_32(src[i+0]);
/* Sum of squared errors between two 4-pixel-wide blocks over h rows,
 * using the offset squares table so negative differences index correctly.
 * NOTE(review): locals, pointer advances, the return and the unused first
 * parameter's context are outside this excerpt. */
235 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
238 uint32_t *sq = ff_squareTbl + 256;
241 for (i = 0; i < h; i++) {
242 s += sq[pix1[0] - pix2[0]];
243 s += sq[pix1[1] - pix2[1]];
244 s += sq[pix1[2] - pix2[2]];
245 s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors between two 8-pixel-wide blocks over h rows.
 * Same pattern as sse4_c, unrolled for 8 pixels per row.
 * NOTE(review): locals, pointer advances and return not visible here. */
252 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
255 uint32_t *sq = ff_squareTbl + 256;
258 for (i = 0; i < h; i++) {
259 s += sq[pix1[0] - pix2[0]];
260 s += sq[pix1[1] - pix2[1]];
261 s += sq[pix1[2] - pix2[2]];
262 s += sq[pix1[3] - pix2[3]];
263 s += sq[pix1[4] - pix2[4]];
264 s += sq[pix1[5] - pix2[5]];
265 s += sq[pix1[6] - pix2[6]];
266 s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors between two 16-pixel-wide blocks over h rows,
 * fully unrolled. NOTE(review): locals, pointer advances and the return
 * are outside this excerpt. */
273 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
276 uint32_t *sq = ff_squareTbl + 256;
279 for (i = 0; i < h; i++) {
280 s += sq[pix1[ 0] - pix2[ 0]];
281 s += sq[pix1[ 1] - pix2[ 1]];
282 s += sq[pix1[ 2] - pix2[ 2]];
283 s += sq[pix1[ 3] - pix2[ 3]];
284 s += sq[pix1[ 4] - pix2[ 4]];
285 s += sq[pix1[ 5] - pix2[ 5]];
286 s += sq[pix1[ 6] - pix2[ 6]];
287 s += sq[pix1[ 7] - pix2[ 7]];
288 s += sq[pix1[ 8] - pix2[ 8]];
289 s += sq[pix1[ 9] - pix2[ 9]];
290 s += sq[pix1[10] - pix2[10]];
291 s += sq[pix1[11] - pix2[11]];
292 s += sq[pix1[12] - pix2[12]];
293 s += sq[pix1[13] - pix2[13]];
294 s += sq[pix1[14] - pix2[14]];
295 s += sq[pix1[15] - pix2[15]];
304 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/* Wavelet-domain distortion metric for the Snow encoder: loads the pixel
 * differences (scaled <<4) into a 32-wide tmp buffer, runs ff_spatial_dwt,
 * then accumulates coefficients weighted per subband by the scale table
 * (type selects 9/7 vs 5/3 per the wrappers below; dec_count is 3 for w==8,
 * else 4). NOTE(review): heavily truncated — most of the scale table rows,
 * the tmp declaration, the accumulation after computing v, and the return
 * expression are missing from this excerpt. */
305 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
307 const int dec_count= w==8 ? 3 : 4;
310 static const int scale[2][2][4][4]={
314 {268, 239, 239, 213},
318 // 9/7 16x16 or 32x32 dec=4
319 {344, 310, 310, 280},
327 {275, 245, 245, 218},
331 // 5/3 16x16 or 32x32 dec=4
332 {352, 317, 317, 286},
340 for (i = 0; i < h; i++) {
341 for (j = 0; j < w; j+=4) {
342 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
343 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
344 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
345 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
351 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
/* walk each decomposition level and orientation (LL only at level 0) */
355 for(level=0; level<dec_count; level++){
356 for(ori= level ? 1 : 0; ori<4; ori++){
357 int size= w>>(dec_count-level);
358 int sx= (ori&1) ? size : 0;
359 int stride= 32<<(dec_count-level);
360 int sy= (ori&2) ? stride>>1 : 0;
362 for(i=0; i<size; i++){
363 for(j=0; j<size; j++){
364 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* Thin wrappers around w_c selecting the wavelet (type 1 = 5/3, type 0 = 9/7,
 * matching the scale-table comments above) and block width (8/16/32).
 * The 32-wide variants are non-static (referenced from elsewhere).
 * NOTE(review): each closing "}" is outside this excerpt. */
374 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
375 return w_c(v, pix1, pix2, line_size, 8, h, 1);
378 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
379 return w_c(v, pix1, pix2, line_size, 8, h, 0);
382 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
383 return w_c(v, pix1, pix2, line_size, 16, h, 1);
386 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
387 return w_c(v, pix1, pix2, line_size, 16, h, 0);
390 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
391 return w_c(v, pix1, pix2, line_size, 32, h, 1);
394 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
395 return w_c(v, pix1, pix2, line_size, 32, h, 0);
/* Copy one 8-pixel row of bytes into DCT coefficients (widening to DCTELEM).
 * NOTE(review): the enclosing row loop and pointer advances are missing. */
399 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
403 /* read the pixels */
405 block[0] = pixels[0];
406 block[1] = pixels[1];
407 block[2] = pixels[2];
408 block[3] = pixels[3];
409 block[4] = pixels[4];
410 block[5] = pixels[5];
411 block[6] = pixels[6];
412 block[7] = pixels[7];
/* Store the per-pixel difference s1 - s2 of one 8-pixel row into block.
 * NOTE(review): the enclosing row loop and pointer advances are missing. */
418 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
419 const uint8_t *s2, int stride){
422 /* read the pixels */
424 block[0] = s1[0] - s2[0];
425 block[1] = s1[1] - s2[1];
426 block[2] = s1[2] - s2[2];
427 block[3] = s1[3] - s2[3];
428 block[4] = s1[4] - s2[4];
429 block[5] = s1[5] - s2[5];
430 block[6] = s1[6] - s2[6];
431 block[7] = s1[7] - s2[7];
/* Write one 8-coefficient row to pixels, clamped to [0,255] via the crop
 * table. NOTE(review): remaining parameters, the row loop and pointer
 * advances are outside this excerpt. */
439 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
443 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
445 /* read the pixels */
447 pixels[0] = cm[block[0]];
448 pixels[1] = cm[block[1]];
449 pixels[2] = cm[block[2]];
450 pixels[3] = cm[block[3]];
451 pixels[4] = cm[block[4]];
452 pixels[5] = cm[block[5]];
453 pixels[6] = cm[block[6]];
454 pixels[7] = cm[block[7]];
/* 4-pixel-wide variant of put_pixels_clamped_c (clamped store per row).
 * NOTE(review): row loop and pointer advances not visible. */
461 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
465 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
467 /* read the pixels */
469 pixels[0] = cm[block[0]];
470 pixels[1] = cm[block[1]];
471 pixels[2] = cm[block[2]];
472 pixels[3] = cm[block[3]];
/* 2-pixel-wide variant of put_pixels_clamped_c (clamped store per row).
 * NOTE(review): row loop and pointer advances not visible. */
479 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
483 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
485 /* read the pixels */
487 pixels[0] = cm[block[0]];
488 pixels[1] = cm[block[1]];
/* Store signed coefficients as pixels biased by +128, clamping to [0,255]:
 * values below the low bound / above 127 are saturated (those branches are
 * partially missing from this excerpt), otherwise *pixels = *block + 128. */
495 static void put_signed_pixels_clamped_c(const DCTELEM *block,
496 uint8_t *restrict pixels,
501 for (i = 0; i < 8; i++) {
502 for (j = 0; j < 8; j++) {
505 else if (*block > 127)
508 *pixels = (uint8_t)(*block + 128);
/* advance to the next row (8 pixels written per row) */
512 pixels += (line_size - 8);
/* Add one 8-coefficient row to existing pixels, clamping via the crop table.
 * NOTE(review): remaining parameters, row loop and advances not visible. */
516 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
520 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
522 /* read the pixels */
524 pixels[0] = cm[pixels[0] + block[0]];
525 pixels[1] = cm[pixels[1] + block[1]];
526 pixels[2] = cm[pixels[2] + block[2]];
527 pixels[3] = cm[pixels[3] + block[3]];
528 pixels[4] = cm[pixels[4] + block[4]];
529 pixels[5] = cm[pixels[5] + block[5]];
530 pixels[6] = cm[pixels[6] + block[6]];
531 pixels[7] = cm[pixels[7] + block[7]];
/* 4-pixel-wide variant of add_pixels_clamped_c.
 * NOTE(review): row loop and pointer advances not visible. */
537 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
541 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
543 /* read the pixels */
545 pixels[0] = cm[pixels[0] + block[0]];
546 pixels[1] = cm[pixels[1] + block[1]];
547 pixels[2] = cm[pixels[2] + block[2]];
548 pixels[3] = cm[pixels[3] + block[3]];
/* 2-pixel-wide variant of add_pixels_clamped_c.
 * NOTE(review): row loop and pointer advances not visible. */
554 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
558 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
560 /* read the pixels */
562 pixels[0] = cm[pixels[0] + block[0]];
563 pixels[1] = cm[pixels[1] + block[1]];
/* Add 8 coefficients per row to pixels WITHOUT clamping (caller guarantees
 * range). NOTE(review): row loop and pointer advances not visible. */
569 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
573 pixels[0] += block[0];
574 pixels[1] += block[1];
575 pixels[2] += block[2];
576 pixels[3] += block[3];
577 pixels[4] += block[4];
578 pixels[5] += block[5];
579 pixels[6] += block[6];
580 pixels[7] += block[7];
/* 4-wide variant of add_pixels8_c (unclamped add).
 * NOTE(review): row loop and pointer advances not visible. */
586 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
590 pixels[0] += block[0];
591 pixels[1] += block[1];
592 pixels[2] += block[2];
593 pixels[3] += block[3];
/* Sum of absolute values of DCT coefficients.
 * NOTE(review): the loop header, locals and return are not visible here. */
599 static int sum_abs_dctelem_c(DCTELEM *block)
603 sum+= FFABS(block[i]);
609 #define PIXOP2(OPNAME, OP) \
610 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
614 OP(*((uint64_t*)block), AV_RN64(pixels));\
620 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
624 const uint64_t a= AV_RN64(pixels );\
625 const uint64_t b= AV_RN64(pixels+1);\
626 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
632 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
636 const uint64_t a= AV_RN64(pixels );\
637 const uint64_t b= AV_RN64(pixels+1);\
638 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
644 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
648 const uint64_t a= AV_RN64(pixels );\
649 const uint64_t b= AV_RN64(pixels+line_size);\
650 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
656 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
660 const uint64_t a= AV_RN64(pixels );\
661 const uint64_t b= AV_RN64(pixels+line_size);\
662 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
668 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
671 const uint64_t a= AV_RN64(pixels );\
672 const uint64_t b= AV_RN64(pixels+1);\
673 uint64_t l0= (a&0x0303030303030303ULL)\
674 + (b&0x0303030303030303ULL)\
675 + 0x0202020202020202ULL;\
676 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
677 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
681 for(i=0; i<h; i+=2){\
682 uint64_t a= AV_RN64(pixels );\
683 uint64_t b= AV_RN64(pixels+1);\
684 l1= (a&0x0303030303030303ULL)\
685 + (b&0x0303030303030303ULL);\
686 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
687 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
688 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
691 a= AV_RN64(pixels );\
692 b= AV_RN64(pixels+1);\
693 l0= (a&0x0303030303030303ULL)\
694 + (b&0x0303030303030303ULL)\
695 + 0x0202020202020202ULL;\
696 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
697 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
698 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
704 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
707 const uint64_t a= AV_RN64(pixels );\
708 const uint64_t b= AV_RN64(pixels+1);\
709 uint64_t l0= (a&0x0303030303030303ULL)\
710 + (b&0x0303030303030303ULL)\
711 + 0x0101010101010101ULL;\
712 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
713 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
717 for(i=0; i<h; i+=2){\
718 uint64_t a= AV_RN64(pixels );\
719 uint64_t b= AV_RN64(pixels+1);\
720 l1= (a&0x0303030303030303ULL)\
721 + (b&0x0303030303030303ULL);\
722 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
723 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
724 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
727 a= AV_RN64(pixels );\
728 b= AV_RN64(pixels+1);\
729 l0= (a&0x0303030303030303ULL)\
730 + (b&0x0303030303030303ULL)\
731 + 0x0101010101010101ULL;\
732 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
733 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
734 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
740 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
741 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
742 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
743 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
744 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
745 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
746 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
748 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
749 #else // 64 bit variant
751 #define PIXOP2(OPNAME, OP) \
752 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
755 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
760 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
763 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
768 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
771 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
772 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
777 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
778 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
781 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
782 int src_stride1, int src_stride2, int h){\
786 a= AV_RN32(&src1[i*src_stride1 ]);\
787 b= AV_RN32(&src2[i*src_stride2 ]);\
788 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
789 a= AV_RN32(&src1[i*src_stride1+4]);\
790 b= AV_RN32(&src2[i*src_stride2+4]);\
791 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
795 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
796 int src_stride1, int src_stride2, int h){\
800 a= AV_RN32(&src1[i*src_stride1 ]);\
801 b= AV_RN32(&src2[i*src_stride2 ]);\
802 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
803 a= AV_RN32(&src1[i*src_stride1+4]);\
804 b= AV_RN32(&src2[i*src_stride2+4]);\
805 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
809 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
810 int src_stride1, int src_stride2, int h){\
814 a= AV_RN32(&src1[i*src_stride1 ]);\
815 b= AV_RN32(&src2[i*src_stride2 ]);\
816 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
820 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
821 int src_stride1, int src_stride2, int h){\
825 a= AV_RN16(&src1[i*src_stride1 ]);\
826 b= AV_RN16(&src2[i*src_stride2 ]);\
827 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
831 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
832 int src_stride1, int src_stride2, int h){\
833 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
834 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
837 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
838 int src_stride1, int src_stride2, int h){\
839 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
840 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
843 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
844 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
847 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
848 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
851 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
852 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
855 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
856 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
859 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
860 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
863 uint32_t a, b, c, d, l0, l1, h0, h1;\
864 a= AV_RN32(&src1[i*src_stride1]);\
865 b= AV_RN32(&src2[i*src_stride2]);\
866 c= AV_RN32(&src3[i*src_stride3]);\
867 d= AV_RN32(&src4[i*src_stride4]);\
868 l0= (a&0x03030303UL)\
871 h0= ((a&0xFCFCFCFCUL)>>2)\
872 + ((b&0xFCFCFCFCUL)>>2);\
873 l1= (c&0x03030303UL)\
875 h1= ((c&0xFCFCFCFCUL)>>2)\
876 + ((d&0xFCFCFCFCUL)>>2);\
877 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
878 a= AV_RN32(&src1[i*src_stride1+4]);\
879 b= AV_RN32(&src2[i*src_stride2+4]);\
880 c= AV_RN32(&src3[i*src_stride3+4]);\
881 d= AV_RN32(&src4[i*src_stride4+4]);\
882 l0= (a&0x03030303UL)\
885 h0= ((a&0xFCFCFCFCUL)>>2)\
886 + ((b&0xFCFCFCFCUL)>>2);\
887 l1= (c&0x03030303UL)\
889 h1= ((c&0xFCFCFCFCUL)>>2)\
890 + ((d&0xFCFCFCFCUL)>>2);\
891 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
895 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
896 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
899 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
900 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
903 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
904 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
907 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
908 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
911 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
912 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
915 uint32_t a, b, c, d, l0, l1, h0, h1;\
916 a= AV_RN32(&src1[i*src_stride1]);\
917 b= AV_RN32(&src2[i*src_stride2]);\
918 c= AV_RN32(&src3[i*src_stride3]);\
919 d= AV_RN32(&src4[i*src_stride4]);\
920 l0= (a&0x03030303UL)\
923 h0= ((a&0xFCFCFCFCUL)>>2)\
924 + ((b&0xFCFCFCFCUL)>>2);\
925 l1= (c&0x03030303UL)\
927 h1= ((c&0xFCFCFCFCUL)>>2)\
928 + ((d&0xFCFCFCFCUL)>>2);\
929 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
930 a= AV_RN32(&src1[i*src_stride1+4]);\
931 b= AV_RN32(&src2[i*src_stride2+4]);\
932 c= AV_RN32(&src3[i*src_stride3+4]);\
933 d= AV_RN32(&src4[i*src_stride4+4]);\
934 l0= (a&0x03030303UL)\
937 h0= ((a&0xFCFCFCFCUL)>>2)\
938 + ((b&0xFCFCFCFCUL)>>2);\
939 l1= (c&0x03030303UL)\
941 h1= ((c&0xFCFCFCFCUL)>>2)\
942 + ((d&0xFCFCFCFCUL)>>2);\
943 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
946 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
947 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
948 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
949 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
951 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
952 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
953 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
954 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
957 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
959 int i, a0, b0, a1, b1;\
966 for(i=0; i<h; i+=2){\
972 block[0]= (a1+a0)>>2; /* FIXME non put */\
973 block[1]= (b1+b0)>>2;\
983 block[0]= (a1+a0)>>2;\
984 block[1]= (b1+b0)>>2;\
990 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
993 const uint32_t a= AV_RN32(pixels );\
994 const uint32_t b= AV_RN32(pixels+1);\
995 uint32_t l0= (a&0x03030303UL)\
998 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
999 + ((b&0xFCFCFCFCUL)>>2);\
1003 for(i=0; i<h; i+=2){\
1004 uint32_t a= AV_RN32(pixels );\
1005 uint32_t b= AV_RN32(pixels+1);\
1006 l1= (a&0x03030303UL)\
1007 + (b&0x03030303UL);\
1008 h1= ((a&0xFCFCFCFCUL)>>2)\
1009 + ((b&0xFCFCFCFCUL)>>2);\
1010 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1013 a= AV_RN32(pixels );\
1014 b= AV_RN32(pixels+1);\
1015 l0= (a&0x03030303UL)\
1018 h0= ((a&0xFCFCFCFCUL)>>2)\
1019 + ((b&0xFCFCFCFCUL)>>2);\
1020 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1026 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1029 for(j=0; j<2; j++){\
1031 const uint32_t a= AV_RN32(pixels );\
1032 const uint32_t b= AV_RN32(pixels+1);\
1033 uint32_t l0= (a&0x03030303UL)\
1036 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1037 + ((b&0xFCFCFCFCUL)>>2);\
1041 for(i=0; i<h; i+=2){\
1042 uint32_t a= AV_RN32(pixels );\
1043 uint32_t b= AV_RN32(pixels+1);\
1044 l1= (a&0x03030303UL)\
1045 + (b&0x03030303UL);\
1046 h1= ((a&0xFCFCFCFCUL)>>2)\
1047 + ((b&0xFCFCFCFCUL)>>2);\
1048 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1051 a= AV_RN32(pixels );\
1052 b= AV_RN32(pixels+1);\
1053 l0= (a&0x03030303UL)\
1056 h0= ((a&0xFCFCFCFCUL)>>2)\
1057 + ((b&0xFCFCFCFCUL)>>2);\
1058 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1062 pixels+=4-line_size*(h+1);\
1063 block +=4-line_size*h;\
1067 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1070 for(j=0; j<2; j++){\
1072 const uint32_t a= AV_RN32(pixels );\
1073 const uint32_t b= AV_RN32(pixels+1);\
1074 uint32_t l0= (a&0x03030303UL)\
1077 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1078 + ((b&0xFCFCFCFCUL)>>2);\
1082 for(i=0; i<h; i+=2){\
1083 uint32_t a= AV_RN32(pixels );\
1084 uint32_t b= AV_RN32(pixels+1);\
1085 l1= (a&0x03030303UL)\
1086 + (b&0x03030303UL);\
1087 h1= ((a&0xFCFCFCFCUL)>>2)\
1088 + ((b&0xFCFCFCFCUL)>>2);\
1089 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1092 a= AV_RN32(pixels );\
1093 b= AV_RN32(pixels+1);\
1094 l0= (a&0x03030303UL)\
1097 h0= ((a&0xFCFCFCFCUL)>>2)\
1098 + ((b&0xFCFCFCFCUL)>>2);\
1099 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1103 pixels+=4-line_size*(h+1);\
1104 block +=4-line_size*h;\
1108 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1109 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1110 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1111 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1112 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1113 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1114 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1115 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1117 #define op_avg(a, b) a = rnd_avg32(a, b)
/* OP kernels for the PIXOP2 instantiations above: op_avg rounds-averages the
 * new value with the current destination (rnd_avg32); op_put stores directly. */
1119 #define op_put(a, b) a = b
/* avg2/avg4: rounding averages of 2 and 4 samples respectively. */
1126 #define avg2(a,b) ((a+b+1)>>1)
1127 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Adapters giving the generic _l2 averaging helpers a single common stride
 * for dst and both sources. NOTE(review): closing braces not visible. */
1129 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1130 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
1133 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1134 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/* Bilinear interpolation with fixed 1/16-pel fractional offsets (x16, y16):
 * weights A..D sum to 256, hence the >>8 after adding the rounder.
 * NOTE(review): the row loop, pointer advances and closing braces are
 * outside this excerpt. */
1137 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1139 const int A=(16-x16)*(16-y16);
1140 const int B=( x16)*(16-y16);
1141 const int C=(16-x16)*( y16);
1142 const int D=( x16)*( y16);
1147 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1148 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1149 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1150 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1151 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1152 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1153 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1154 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* Global motion compensation: per destination pixel, an affine transform
 * (dxx/dxy/dyx/dyy, offset ox/oy, fixed-point scale s = 1<<shift) yields a
 * sub-pel source position; bilinear interpolation is applied when the
 * integer source coordinate is inside the image, degrading to 1-D
 * interpolation or plain clamped fetch at the edges (av_clip).
 * NOTE(review): truncated — the y loop, src_x/src_y computation, rounding
 * (+r)>>(shift*2) terms and the coordinate updates are missing here. */
1160 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1161 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1164 const int s= 1<<shift;
1174 for(x=0; x<8; x++){ //XXX FIXME optimize
1175 int src_x, src_y, frac_x, frac_y, index;
1179 frac_x= src_x&(s-1);
1180 frac_y= src_y&(s-1);
/* unsigned compare doubles as a >=0 && <limit range check */
1184 if((unsigned)src_x < width){
1185 if((unsigned)src_y < height){
1186 index= src_x + src_y*stride;
1187 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1188 + src[index +1]* frac_x )*(s-frac_y)
1189 + ( src[index+stride ]*(s-frac_x)
1190 + src[index+stride+1]* frac_x )* frac_y
/* vertical out-of-range: clip y, interpolate horizontally only */
1193 index= src_x + av_clip(src_y, 0, height)*stride;
1194 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1195 + src[index +1]* frac_x )*s
/* horizontal out-of-range: clip x, interpolate vertically only */
1199 if((unsigned)src_y < height){
1200 index= av_clip(src_x, 0, width) + src_y*stride;
1201 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1202 + src[index+stride ]* frac_y )*s
/* both out of range: clamped nearest fetch */
1205 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1206 dst[y*stride + x]= src[index ];
/* Third-pel MC, zero offset: plain block copy dispatched on width.
 * NOTE(review): the switch keyword line and closing braces are missing. */
1218 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1220 case 2: put_pixels2_c (dst, src, stride, height); break;
1221 case 4: put_pixels4_c (dst, src, stride, height); break;
1222 case 8: put_pixels8_c (dst, src, stride, height); break;
1223 case 16:put_pixels16_c(dst, src, stride, height); break;
/* 1/3-pel horizontal (left third): (2*a + b + 1)/3 computed as
 * *683 >> 11, since 683/2048 ~= 1/3 (rounding division by multiply-shift). */
1227 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1229 for (i=0; i < height; i++) {
1230 for (j=0; j < width; j++) {
1231 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* 1/3-pel horizontal (right third): (a + 2*b + 1)/3 via *683 >> 11. */
1238 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1240 for (i=0; i < height; i++) {
1241 for (j=0; j < width; j++) {
1242 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* 1/3-pel vertical (upper third): (2*a + below + 1)/3 via *683 >> 11. */
1249 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1251 for (i=0; i < height; i++) {
1252 for (j=0; j < width; j++) {
1253 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* 1/3-pel diagonal: 2x2 blend with weights 4/3/3/2 (sum 12);
 * 2731/32768 ~= 1/12, so *2731 >> 15 divides by 12 with rounding. */
1260 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1262 for (i=0; i < height; i++) {
1263 for (j=0; j < width; j++) {
1264 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* 1/3-pel diagonal: 2x2 blend with weights 3/2/4/3 (sum 12), /12 via *2731 >> 15. */
1271 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1273 for (i=0; i < height; i++) {
1274 for (j=0; j < width; j++) {
1275 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* 1/3-pel vertical (lower third): (a + 2*below + 1)/3 via *683 >> 11. */
1282 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1284 for (i=0; i < height; i++) {
1285 for (j=0; j < width; j++) {
1286 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* 1/3-pel diagonal: 2x2 blend with weights 3/4/2/3 (sum 12), /12 via *2731 >> 15. */
1293 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1295 for (i=0; i < height; i++) {
1296 for (j=0; j < width; j++) {
1297 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* 1/3-pel diagonal: 2x2 blend with weights 2/3/3/4 (sum 12), /12 via *2731 >> 15. */
1304 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1306 for (i=0; i < height; i++) {
1307 for (j=0; j < width; j++) {
1308 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* avg_tpel_pixels_mc00_c(): averaging variant of the integer-position
 * third-pel MC; dispatches on width to the avg_pixelsN_c routines.
 * NOTE(review): the switch(width) line is not visible in this excerpt. */
1315 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1317 case 2: avg_pixels2_c (dst, src, stride, height); break;
1318 case 4: avg_pixels4_c (dst, src, stride, height); break;
1319 case 8: avg_pixels8_c (dst, src, stride, height); break;
1320 case 16:avg_pixels16_c(dst, src, stride, height); break;
/* Same 1/3-pel horizontal filter as put_..._mc10_c, then a rounded
 * average with the existing dst pixel ((old + new + 1) >> 1). */
1324 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1326 for (i=0; i < height; i++) {
1327 for (j=0; j < width; j++) {
1328 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* put_..._mc20_c filter followed by a rounded average with dst. */
1335 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1337 for (i=0; i < height; i++) {
1338 for (j=0; j < width; j++) {
1339 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* put_..._mc01_c filter followed by a rounded average with dst. */
1346 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1348 for (i=0; i < height; i++) {
1349 for (j=0; j < width; j++) {
1350 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* put_..._mc11_c filter (2x2, weights 4/3/3/2) then rounded average with dst. */
1357 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1359 for (i=0; i < height; i++) {
1360 for (j=0; j < width; j++) {
1361 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* put_..._mc12_c filter (2x2, weights 3/2/4/3) then rounded average with dst. */
1368 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1370 for (i=0; i < height; i++) {
1371 for (j=0; j < width; j++) {
1372 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* put_..._mc02_c filter followed by a rounded average with dst. */
1379 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1381 for (i=0; i < height; i++) {
1382 for (j=0; j < width; j++) {
1383 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride]) + 1) >> 11) + 1) >> 1;
/* put_..._mc21_c filter (2x2, weights 3/4/2/3) then rounded average with dst. */
1390 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1392 for (i=0; i < height; i++) {
1393 for (j=0; j < width; j++) {
1394 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* put_..._mc22_c filter (2x2, weights 2/3/3/4) then rounded average with dst. */
1401 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1403 for (i=0; i < height; i++) {
1404 for (j=0; j < width; j++) {
1405 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* TPEL_WIDTH(width): generates fixed-width wrappers around the generic
 * third-pel MC routines above.
 * Fix: each wrapper body was written as
 *     void put_tpel_pixels_mcXY_c(dst, src, stride, width, height);
 * The leading 'void' turns the intended call into a local old-style
 * function *declaration*, so every generated wrapper compiled to a
 * no-op. The stray 'void' is removed so the wrappers actually forward
 * to the workers. */
1412 #define TPEL_WIDTH(width)\
1413 static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1414     put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
1415 static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1416     put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
1417 static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1418     put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
1419 static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1420     put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
1421 static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1422     put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
1423 static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1424     put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
1425 static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1426     put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
1427 static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1428     put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
1429 static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
1430     put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H264_CHROMA_MC(): generates the 2/4/8-wide H.264 chroma MC functions
 * for one op (put or avg). A..D are 1/8-pel bilinear weights (sum 64).
 * NOTE(review): the h-loops, else-branches (D==0 fast path with
 * E = B+C) and closing braces fall in gaps of this excerpt. */
1433 #define H264_CHROMA_MC(OPNAME, OP)\
1434 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1435     const int A=(8-x)*(8-y);\
1436     const int B=(    x)*(8-y);\
1437     const int C=(8-x)*(    y);\
1438     const int D=(    x)*(    y);\
1441     assert(x<8 && y<8 && x>=0 && y>=0);\
/* general case: full 2x2 bilinear blend */\
1446             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1447             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
/* degenerate case (x==0 or y==0): 1-D blend along 'step' */\
1453         const int step= C ? stride : 1;\
1456             OP(dst[0], (A*src[0] + E*src[step+0]));\
1457             OP(dst[1], (A*src[1] + E*src[step+1]));\
1464 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1465     const int A=(8-x)*(8-y);\
1466     const int B=(    x)*(8-y);\
1467     const int C=(8-x)*(    y);\
1468     const int D=(    x)*(    y);\
1471     assert(x<8 && y<8 && x>=0 && y>=0);\
1476             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1477             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1478             OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1479             OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1485         const int step= C ? stride : 1;\
1488             OP(dst[0], (A*src[0] + E*src[step+0]));\
1489             OP(dst[1], (A*src[1] + E*src[step+1]));\
1490             OP(dst[2], (A*src[2] + E*src[step+2]));\
1491             OP(dst[3], (A*src[3] + E*src[step+3]));\
1498 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1499     const int A=(8-x)*(8-y);\
1500     const int B=(    x)*(8-y);\
1501     const int C=(8-x)*(    y);\
1502     const int D=(    x)*(    y);\
1505     assert(x<8 && y<8 && x>=0 && y>=0);\
1510             OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1511             OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1512             OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1513             OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1514             OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1515             OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1516             OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1517             OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1523         const int step= C ? stride : 1;\
1526             OP(dst[0], (A*src[0] + E*src[step+0]));\
1527             OP(dst[1], (A*src[1] + E*src[step+1]));\
1528             OP(dst[2], (A*src[2] + E*src[step+2]));\
1529             OP(dst[3], (A*src[3] + E*src[step+3]));\
1530             OP(dst[4], (A*src[4] + E*src[step+4]));\
1531             OP(dst[5], (A*src[5] + E*src[step+5]));\
1532             OP(dst[6], (A*src[6] + E*src[step+6]));\
1533             OP(dst[7], (A*src[7] + E*src[step+7]));\
/* +32 >> 6 rounds the 6-bit weighted sum back to pixel range;
 * op_avg additionally does a rounded average with dst */
1540 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1541 #define op_put(a, b) a = (((b) + 32)>>6)
1543 H264_CHROMA_MC(put_       , op_put)
1544 H264_CHROMA_MC(avg_       , op_avg)
/* put_no_rnd_h264_chroma_mc8_c(): 8-wide chroma MC without full
 * rounding — the bias is 32-4 = 28 instead of 32, i.e. it rounds low
 * relative to the rounded variant generated by H264_CHROMA_MC.
 * NOTE(review): the h-loop and closing braces fall in a gap here. */
1548 static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1549     const int A=(8-x)*(8-y);
1550     const int B=(    x)*(8-y);
1551     const int C=(8-x)*(    y);
1552     const int D=(    x)*(    y);
1555     assert(x<8 && y<8 && x>=0 && y>=0);
1559         dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1560         dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1561         dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1562         dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1563         dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1564         dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1565         dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1566         dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
/* QPEL_MC(): generates the full family of MPEG-4 quarter-pel MC
 * functions for one op/rounding variant (put, put_no_rnd, avg).
 * First worker: 8-wide horizontal lowpass with taps 20,-6,3,-1;
 * taps that would read past column 8 are mirrored back into the
 * block (MPEG-4 edge mirroring). 'cm' is the clip table used by OP. */
1572 #define QPEL_MC(r, OPNAME, RND, OP) \
1573 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1574     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1578         OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1579         OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1580         OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1581         OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1582         OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1583         OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1584         OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1585         OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
/* vertical lowpass: same 20,-6,3,-1 filter down one column, fully\
 * unrolled with the column pre-loaded into src0..src8; rows past the\
 * bottom are edge-mirrored just like the horizontal variant. */\
1591 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1593     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1597         const int src0= src[0*srcStride];\
1598         const int src1= src[1*srcStride];\
1599         const int src2= src[2*srcStride];\
1600         const int src3= src[3*srcStride];\
1601         const int src4= src[4*srcStride];\
1602         const int src5= src[5*srcStride];\
1603         const int src6= src[6*srcStride];\
1604         const int src7= src[7*srcStride];\
1605         const int src8= src[8*srcStride];\
1606         OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1607         OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1608         OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1609         OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1610         OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1611         OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1612         OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1613         OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
/* 16-wide horizontal lowpass; identical filter, edge mirroring\
 * kicks in for taps past column 16 (dst[13..15]). */\
1619 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1620     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1625         OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1626         OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1627         OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1628         OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1629         OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1630         OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1631         OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1632         OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1633         OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1634         OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1635         OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1636         OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1637         OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1638         OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1639         OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1640         OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
/* 16-tall vertical lowpass; column pre-loaded into src0..src16,\
 * bottom rows edge-mirrored. */\
1646 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1647     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1652         const int src0= src[0*srcStride];\
1653         const int src1= src[1*srcStride];\
1654         const int src2= src[2*srcStride];\
1655         const int src3= src[3*srcStride];\
1656         const int src4= src[4*srcStride];\
1657         const int src5= src[5*srcStride];\
1658         const int src6= src[6*srcStride];\
1659         const int src7= src[7*srcStride];\
1660         const int src8= src[8*srcStride];\
1661         const int src9= src[9*srcStride];\
1662         const int src10= src[10*srcStride];\
1663         const int src11= src[11*srcStride];\
1664         const int src12= src[12*srcStride];\
1665         const int src13= src[13*srcStride];\
1666         const int src14= src[14*srcStride];\
1667         const int src15= src[15*srcStride];\
1668         const int src16= src[16*srcStride];\
1669         OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1670         OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1671         OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1672         OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1673         OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1674         OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1675         OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1676         OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1677         OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1678         OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1679         OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1680         OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1681         OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1682         OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1683         OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1684         OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
/* 8x8 quarter-pel MC entry points, named mcXY for sub-pel position\
 * (X,Y) in quarter-pel units. mc00 = plain copy; mc10/mc30 average\
 * the horizontal halfpel with the left/right full pel; mc20 is the\
 * pure horizontal halfpel; mc01/mc02/mc03 are the vertical analogues\
 * (they first copy a 9-row block into 'full' to get the extra row). */\
1690 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1691     OPNAME ## pixels8_c(dst, src, stride, 8);\
1694 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1696     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1697     OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1700 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1701     OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1704 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1706     put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1707     OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1710 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1711     uint8_t full[16*9];\
1713     copy_block9(full, src, 16, stride, 9);\
1714     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1715     OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1718 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1719     uint8_t full[16*9];\
1720     copy_block9(full, src, 16, stride, 9);\
1721     OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1724 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1725     uint8_t full[16*9];\
1727     copy_block9(full, src, 16, stride, 9);\
1728     put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1729     OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
/* Diagonal 8x8 positions. The ff_..._old_c variants are the original\
 * 4-way (full/halfH/halfV/halfHV) averages kept for reference; the\
 * current mcXY_c versions fold the full-pel contribution into halfH\
 * via pixels8_l2 first, then need only a 2-way average. */\
1731 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1732     uint8_t full[16*9];\
1735     uint8_t halfHV[64];\
1736     copy_block9(full, src, 16, stride, 9);\
1737     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1738     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1739     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1740     OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1742 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1743     uint8_t full[16*9];\
1745     uint8_t halfHV[64];\
1746     copy_block9(full, src, 16, stride, 9);\
1747     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1748     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1749     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1750     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1752 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1753     uint8_t full[16*9];\
1756     uint8_t halfHV[64];\
1757     copy_block9(full, src, 16, stride, 9);\
1758     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1759     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1760     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1761     OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1763 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1764     uint8_t full[16*9];\
1766     uint8_t halfHV[64];\
1767     copy_block9(full, src, 16, stride, 9);\
1768     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1769     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1770     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1771     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1773 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1774     uint8_t full[16*9];\
1777     uint8_t halfHV[64];\
1778     copy_block9(full, src, 16, stride, 9);\
1779     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1780     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1781     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1782     OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1784 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1785     uint8_t full[16*9];\
1787     uint8_t halfHV[64];\
1788     copy_block9(full, src, 16, stride, 9);\
1789     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1790     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1791     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1792     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1794 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1795     uint8_t full[16*9];\
1798     uint8_t halfHV[64];\
1799     copy_block9(full, src, 16, stride, 9);\
1800     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full  , 8, 16, 9);\
1801     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1802     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1803     OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1805 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1806     uint8_t full[16*9];\
1808     uint8_t halfHV[64];\
1809     copy_block9(full, src, 16, stride, 9);\
1810     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1811     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1812     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1813     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
/* Remaining 8x8 positions: mc21/mc23 average halfH with the 2-D\
 * (halfHV) filter; mc12/mc32 pre-blend halfH with the full-pel\
 * column, then run the vertical filter; mc22 is the pure 2-D\
 * halfpel (h then v lowpass). */\
1815 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1817     uint8_t halfHV[64];\
1818     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1819     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1820     OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1822 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1824     uint8_t halfHV[64];\
1825     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1826     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1827     OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1829 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1830     uint8_t full[16*9];\
1833     uint8_t halfHV[64];\
1834     copy_block9(full, src, 16, stride, 9);\
1835     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1836     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1837     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1838     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1840 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1841     uint8_t full[16*9];\
1843     copy_block9(full, src, 16, stride, 9);\
1844     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1845     put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1846     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1848 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1849     uint8_t full[16*9];\
1852     uint8_t halfHV[64];\
1853     copy_block9(full, src, 16, stride, 9);\
1854     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1855     put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1856     put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1857     OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1859 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1860     uint8_t full[16*9];\
1862     copy_block9(full, src, 16, stride, 9);\
1863     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1864     put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1865     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1867 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1869     put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1870     OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
/* 16x16 quarter-pel entry points; same structure as the 8x8 set but\
 * with a 24-stride 17-row 'full' buffer for the vertical cases. */\
1872 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1873     OPNAME ## pixels16_c(dst, src, stride, 16);\
1876 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1878     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1879     OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
1882 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1883     OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1886 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1888     put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1889     OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
1892 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1893     uint8_t full[24*17];\
1895     copy_block17(full, src, 24, stride, 17);\
1896     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1897     OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
1900 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1901     uint8_t full[24*17];\
1902     copy_block17(full, src, 24, stride, 17);\
1903     OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1906 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1907     uint8_t full[24*17];\
1909     copy_block17(full, src, 24, stride, 17);\
1910     put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1911     OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
/* 16x16 diagonal positions; mirrors the 8x8 set (ff_..._old_c = 4-way\
 * reference averaging, current versions pre-blend into halfH). */\
1913 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1914     uint8_t full[24*17];\
1915     uint8_t halfH[272];\
1916     uint8_t halfV[256];\
1917     uint8_t halfHV[256];\
1918     copy_block17(full, src, 24, stride, 17);\
1919     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1920     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1921     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1922     OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1924 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1925     uint8_t full[24*17];\
1926     uint8_t halfH[272];\
1927     uint8_t halfHV[256];\
1928     copy_block17(full, src, 24, stride, 17);\
1929     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1930     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1931     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1932     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1934 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1935     uint8_t full[24*17];\
1936     uint8_t halfH[272];\
1937     uint8_t halfV[256];\
1938     uint8_t halfHV[256];\
1939     copy_block17(full, src, 24, stride, 17);\
1940     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1941     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1942     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1943     OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1945 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1946     uint8_t full[24*17];\
1947     uint8_t halfH[272];\
1948     uint8_t halfHV[256];\
1949     copy_block17(full, src, 24, stride, 17);\
1950     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1951     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1952     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1953     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
1955 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1956     uint8_t full[24*17];\
1957     uint8_t halfH[272];\
1958     uint8_t halfV[256];\
1959     uint8_t halfHV[256];\
1960     copy_block17(full, src, 24, stride, 17);\
1961     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1962     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1963     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1964     OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1966 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1967     uint8_t full[24*17];\
1968     uint8_t halfH[272];\
1969     uint8_t halfHV[256];\
1970     copy_block17(full, src, 24, stride, 17);\
1971     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1972     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
1973     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1974     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1976 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1977     uint8_t full[24*17];\
1978     uint8_t halfH[272];\
1979     uint8_t halfV[256];\
1980     uint8_t halfHV[256];\
1981     copy_block17(full, src, 24, stride, 17);\
1982     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
1983     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1984     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1985     OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1987 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1988     uint8_t full[24*17];\
1989     uint8_t halfH[272];\
1990     uint8_t halfHV[256];\
1991     copy_block17(full, src, 24, stride, 17);\
1992     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1993     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
1994     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1995     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
/* Remaining 16x16 positions; same decomposition as the 8x8 set. */\
1997 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1998     uint8_t halfH[272];\
1999     uint8_t halfHV[256];\
2000     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2001     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2002     OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2004 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2005     uint8_t halfH[272];\
2006     uint8_t halfHV[256];\
2007     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2008     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2009     OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2011 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2012     uint8_t full[24*17];\
2013     uint8_t halfH[272];\
2014     uint8_t halfV[256];\
2015     uint8_t halfHV[256];\
2016     copy_block17(full, src, 24, stride, 17);\
2017     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2018     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2019     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2020     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2022 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2023     uint8_t full[24*17];\
2024     uint8_t halfH[272];\
2025     copy_block17(full, src, 24, stride, 17);\
2026     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2027     put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2028     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2030 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2031     uint8_t full[24*17];\
2032     uint8_t halfH[272];\
2033     uint8_t halfV[256];\
2034     uint8_t halfHV[256];\
2035     copy_block17(full, src, 24, stride, 17);\
2036     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2037     put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2038     put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2039     OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2041 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2042     uint8_t full[24*17];\
2043     uint8_t halfH[272];\
2044     copy_block17(full, src, 24, stride, 17);\
2045     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2046     put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2047     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2049 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2050     uint8_t halfH[272];\
2051     put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2052     OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2055 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2056 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2057 #define op_put(a, b) a = cm[((b) + 16)>>5]
2058 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2060 QPEL_MC(0, put_ , _ , op_put)
2061 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2062 QPEL_MC(0, avg_ , _ , op_avg)
2063 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2065 #undef op_avg_no_rnd
2067 #undef op_put_no_rnd
/*
 * H.264 six-tap (1,-5,20,20,-5,1) half-pel interpolation filters for
 * block widths 2/4/8/16.  _h filters horizontally, _v vertically, _hv
 * filters horizontally into a 16-bit temp then vertically (OP2 uses the
 * wider +512>>10 rounding since the intermediate is unnormalized).
 * NOTE(review): local declarations, loop headers, pointer increments and
 * closing braces were dropped by this listing (gaps in the embedded line
 * numbers); restored from upstream dsputil.c.
 */
#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}
/*
 * H.264 quarter-pel motion compensation: the sixteen mcXY variants
 * (X = horizontal, Y = vertical quarter-pel phase) are built from the
 * H264_LOWPASS half-pel filters plus pixel averaging (pixels*_l2).
 * NOTE(review): the closing braces of each generated function were dropped
 * by this listing (gaps in the embedded line numbers); restored from
 * upstream dsputil.c.
 */
#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE,  stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}
2471 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2472 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2473 #define op_put(a, b) a = cm[((b) + 16)>>5]
2474 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2475 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2477 H264_LOWPASS(put_ , op_put, op2_put)
2478 H264_LOWPASS(avg_ , op_avg, op2_avg)
/*
 * H.264 explicit/implicit weighted prediction (one- and two-reference).
 * op_scale1 scales a single reference in place; op_scale2 blends two
 * references into dst.  The W==2/4/8 `continue`s cut each row short for
 * the narrower block widths.
 * NOTE(review): the interior op_scale rows, closing braces and the
 * H264_WEIGHT(W,H) instantiations were dropped by this listing; restored
 * from upstream dsputil.c -- verify against the project tree.
 */
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
2563 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2564 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2568 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2569 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2570 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2571 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2572 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2573 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2574 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2575 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
#ifdef CONFIG_CAVS_DECODER
/* CAVS-specific DSP init lives in cavsdsp.c */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel (mc00) copy/average wrappers exported to the CAVS decoder.
 * Closing braces restored -- this listing dropped them. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */
#if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
/* VC-1/WMV3 DSP init lives in vc1dsp.c */
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

/* Full-pel copy wrapper for VC-1; `rnd` is unused in the full-pel case.
 * Closing brace restored -- this listing dropped it. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2608 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2611 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2613 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2614 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2618 const int src_1= src[ -srcStride];
2619 const int src0 = src[0 ];
2620 const int src1 = src[ srcStride];
2621 const int src2 = src[2*srcStride];
2622 const int src3 = src[3*srcStride];
2623 const int src4 = src[4*srcStride];
2624 const int src5 = src[5*srcStride];
2625 const int src6 = src[6*srcStride];
2626 const int src7 = src[7*srcStride];
2627 const int src8 = src[8*srcStride];
2628 const int src9 = src[9*srcStride];
2629 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2630 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2631 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2632 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2633 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2634 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2635 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2636 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* Full-pel (0,0) phase: plain 8x8 copy.  Closing brace restored. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
/* Quarter-pel (1,0): average of the source and its horizontal half-pel.
 * NOTE(review): `half` buffer declaration and closing brace restored from
 * upstream dsputil.c. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
/* Half-pel (2,0): horizontal filter straight into dst.  Brace restored. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
/* Quarter-pel (3,0): average of src+1 and the horizontal half-pel.
 * NOTE(review): `half` declaration and closing brace restored. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
/* Half-pel (0,2): vertical filter straight into dst.  Brace restored. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
/* (1,2): average of the vertical half-pel of src and the vertical half-pel
 * of the horizontally filtered plane (halfH holds 11 filtered rows starting
 * one row above src; halfH+8 skips that top row).
 * NOTE(review): buffer declarations and closing brace restored from
 * upstream dsputil.c. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* (3,2): like mc12 but the pure-vertical term is taken at src+1.
 * NOTE(review): buffer declarations and closing brace restored. */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* (2,2): horizontal then vertical half-pel filtering.
 * NOTE(review): halfH declaration and closing brace restored. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2690 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2691 if(ENABLE_ANY_H263) {
2693 const int strength= ff_h263_loop_filter_strength[qscale];
2697 int p0= src[x-2*stride];
2698 int p1= src[x-1*stride];
2699 int p2= src[x+0*stride];
2700 int p3= src[x+1*stride];
2701 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2703 if (d<-2*strength) d1= 0;
2704 else if(d<- strength) d1=-2*strength - d;
2705 else if(d< strength) d1= d;
2706 else if(d< 2*strength) d1= 2*strength - d;
2711 if(p1&256) p1= ~(p1>>31);
2712 if(p2&256) p2= ~(p2>>31);
2714 src[x-1*stride] = p1;
2715 src[x+0*stride] = p2;
2719 d2= av_clip((p0-p3)/4, -ad1, ad1);
2721 src[x-2*stride] = p0 - d2;
2722 src[x+ stride] = p3 + d2;
2727 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2728 if(ENABLE_ANY_H263) {
2730 const int strength= ff_h263_loop_filter_strength[qscale];
2734 int p0= src[y*stride-2];
2735 int p1= src[y*stride-1];
2736 int p2= src[y*stride+0];
2737 int p3= src[y*stride+1];
2738 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2740 if (d<-2*strength) d1= 0;
2741 else if(d<- strength) d1=-2*strength - d;
2742 else if(d< strength) d1= d;
2743 else if(d< 2*strength) d1= 2*strength - d;
2748 if(p1&256) p1= ~(p1>>31);
2749 if(p2&256) p2= ~(p2>>31);
2751 src[y*stride-1] = p1;
2752 src[y*stride+0] = p2;
2756 d2= av_clip((p0-p3)/4, -ad1, ad1);
2758 src[y*stride-2] = p0 - d2;
2759 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block.
 * Vertical pass into temp[] (edge rows are passed through, scaled by 4 to
 * keep a common normalization), then horizontal pass back into src with
 * edge columns passed through.
 * NOTE(review): locals and loop structure restored from upstream dsputil.c
 * (this listing dropped them). */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* vertical (1,2,1) pass; rows 0 and 7 are copied (x4) unfiltered */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal (1,2,1) pass; columns 0 and 7 only renormalized */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/* H.264 normal (bS<4) luma deblocking of one 16-sample edge, processed as
 * 4 groups of 4 lines, each group with its own tc0[] threshold; a negative
 * tc0 skips the group.  xstride steps across the edge, ystride along it.
 * NOTE(review): the per-group skip, tc/i_delta locals, tc++ increments and
 * the ystride advance were restored from upstream dsputil.c (dropped by
 * this listing). */
static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                /* p1/q1 are additionally filtered when the 2nd neighbour is
                 * close enough; each such filter widens the p0/q0 clip range */
                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Horizontal edge (vertical filtering): step across with `stride`, along with 1. */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Vertical edge (horizontal filtering): step across with 1, along with `stride`. */
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
/* H.264 normal chroma deblocking of one 8-sample edge: 4 groups of 2 lines,
 * each group gated by tc0[i] (<=0 skips).  Only p0/q0 are modified.
 * NOTE(review): the group-skip, loop advance and closing braces were
 * restored from upstream dsputil.c (dropped by this listing). */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
/* Horizontal chroma edge: step across with `stride`, along with 1. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
/* Vertical chroma edge: step across with 1, along with `stride`. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
/* H.264 strong (intra, bS=4) chroma deblocking: unconditional (2,1,1)/4
 * replacement of p0/q0 on each of the 8 edge lines that pass the
 * alpha/beta activity test.
 * NOTE(review): the loop-variable declaration, ystride advance and closing
 * braces were restored from upstream dsputil.c (dropped by this listing). */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
/* Horizontal intra chroma edge wrapper. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
/* Vertical intra chroma edge wrapper. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
/* Sum of absolute differences of two 16-wide pixel rows over h lines
 * (16xh SAD).  `v` is an unused context pointer kept for the me_cmp
 * function-pointer signature.
 * NOTE(review): accumulator/loop/row-advance/return restored from upstream
 * dsputil.c (dropped by this listing). */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++){
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* 16xh SAD against the horizontal half-pel interpolation of pix2
 * (avg2 = rounded average of two neighbours).
 * NOTE(review): accumulator/loop/row-advance/return restored from upstream
 * dsputil.c (dropped by this listing). */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++){
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* 16xh SAD against the vertical half-pel interpolation of pix2 (pix3 is
 * the row below).
 * NOTE(review): accumulator/loop/row-advance/return restored from upstream
 * dsputil.c (dropped by this listing). */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++){
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* 16xh SAD against the 2x2 (half-pel diagonal) interpolation of pix2
 * (avg4 = rounded average of four neighbours).
 * NOTE(review): accumulator/loop/row-advance/return restored from upstream
 * dsputil.c (dropped by this listing). */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++){
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/* Plain SAD over an 8-wide block: sum of |pix1[i] - pix2[i]| per row. */
3021 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3027 s += abs(pix1[0] - pix2[0]);
3028 s += abs(pix1[1] - pix2[1]);
3029 s += abs(pix1[2] - pix2[2]);
3030 s += abs(pix1[3] - pix2[3]);
3031 s += abs(pix1[4] - pix2[4]);
3032 s += abs(pix1[5] - pix2[5]);
3033 s += abs(pix1[6] - pix2[6]);
3034 s += abs(pix1[7] - pix2[7]);
/* SAD of an 8-wide block vs. the horizontal half-pel interpolation of pix2
 * (avg2() of horizontally adjacent pixels; reads pix2[8]). */
3041 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3047 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3048 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3049 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3050 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3051 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3052 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3053 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3054 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* SAD of an 8-wide block vs. the vertical half-pel interpolation of pix2
 * (avg2() with the line below, pix3). */
3061 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3064 uint8_t *pix3 = pix2 + line_size;
3068 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3069 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3070 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3071 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3072 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3073 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3074 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3075 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* SAD of an 8-wide block vs. the diagonal half-pel interpolation of pix2
 * (avg4() over the 2x2 neighbourhood; reads pix2[8]/pix3[8]). */
3083 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3086 uint8_t *pix3 = pix2 + line_size;
3090 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3091 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3092 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3093 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3094 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3095 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3096 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3097 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16 wide: score1 is the plain SSE, score2 is the
 * difference in local 2x2 gradient energy between s1 and s2. The final cost
 * weights |score2| by avctx->nsse_weight (8 when no context is given), so
 * blocks that merely differ in noise texture are penalised less. */
3105 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3106 MpegEncContext *c = v;
3112 for(x=0; x<16; x++){
3113 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3116 for(x=0; x<15; x++){
3117 score2+= FFABS( s1[x ] - s1[x +stride]
3118 - s1[x+1] + s1[x+1+stride])
3119 -FFABS( s2[x ] - s2[x +stride]
3120 - s2[x+1] + s2[x+1+stride]);
3127 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3128 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c; same SSE + gradient-difference weighting.
 * NOTE(review): the x-loop headers are elided in this chunk — presumably
 * x<8 / x<7 in the original; confirm against upstream. */
3131 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3132 MpegEncContext *c = v;
3139 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3143 score2+= FFABS( s1[x ] - s1[x +stride]
3144 - s1[x+1] + s1[x+1+stride])
3145 -FFABS( s2[x ] - s2[x +stride]
3146 - s2[x+1] + s2[x+1+stride]);
3153 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3154 else return score1 + FFABS(score2)*8;
/* Evaluate the weighted squared error that would remain if 'basis' scaled by
 * 'scale' were added to the residual 'rem'. The shift converts from
 * BASIS_SHIFT to RECON_SHIFT precision with rounding; assert bounds b to
 * (-512,512) so w*b fits the fixed-point product below. */
3157 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3161 for(i=0; i<8*8; i++){
3162 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3165 assert(-512<b && b<512);
3167 sum += (w*b)*(w*b)>>4;
/* Commit the scaled basis into the residual: rem += round(basis*scale),
 * using the same BASIS_SHIFT -> RECON_SHIFT rounding as try_8x8basis_c. */
3172 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3175 for(i=0; i<8*8; i++){
3176 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3181 * permutes an 8x8 block.
3182 * @param block the block which will be permuted according to the given permutation vector
3183 * @param permutation the permutation vector
3184 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3185 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3186 * (inverse) permutated to scantable order!
/* Two passes over scantable order: first (elided here) presumably copies the
 * relevant coefficients into 'temp', then this pass scatters them back to
 * their permuted positions. Only positions up to 'last' are touched. */
3188 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3194 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3196 for(i=0; i<=last; i++){
3197 const int j= scantable[i];
3202 for(i=0; i<=last; i++){
3203 const int j= scantable[i];
3204 const int perm_j= permutation[j];
3205 block[perm_j]= temp[j];
3209 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fill the 5-entry cmp[] table (16x16, 8x8, ... sizes) with the compare
 * functions selected by 'type' (FF_CMP_*). The switch arms are elided in
 * this chunk; the visible assignments copy from the corresponding DSPContext
 * tables, falling through to an error log for unknown types. */
3213 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3216 memset(cmp, 0, sizeof(void*)*5);
3224 cmp[i]= c->hadamard8_diff[i];
3230 cmp[i]= c->dct_sad[i];
3233 cmp[i]= c->dct264_sad[i];
3236 cmp[i]= c->dct_max[i];
3239 cmp[i]= c->quant_psnr[i];
3259 #ifdef CONFIG_SNOW_ENCODER
3268 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3274 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* Zero all six 64-coefficient blocks of a macroblock in one call. */
3276 static void clear_blocks_c(DCTELEM *blocks)
3278 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for w bytes; manually unrolled by 8 with a scalar
 * tail loop (the tail loop header is elided in this chunk). Addition wraps
 * modulo 256 (uint8_t). */
3281 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3283 for(i=0; i+7<w; i+=8){
3284 dst[i+0] += src[i+0];
3285 dst[i+1] += src[i+1];
3286 dst[i+2] += src[i+2];
3287 dst[i+3] += src[i+3];
3288 dst[i+4] += src[i+4];
3289 dst[i+5] += src[i+5];
3290 dst[i+6] += src[i+6];
3291 dst[i+7] += src[i+7];
3294 dst[i+0] += src[i+0];
/* dst[i] = src1[i] - src2[i] for w bytes (mod-256); unrolled by 8 with a
 * scalar tail. Inverse of add_bytes_c, used for byte-wise prediction. */
3297 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3299 for(i=0; i+7<w; i+=8){
3300 dst[i+0] = src1[i+0]-src2[i+0];
3301 dst[i+1] = src1[i+1]-src2[i+1];
3302 dst[i+2] = src1[i+2]-src2[i+2];
3303 dst[i+3] = src1[i+3]-src2[i+3];
3304 dst[i+4] = src1[i+4]-src2[i+4];
3305 dst[i+5] = src1[i+5]-src2[i+5];
3306 dst[i+6] = src1[i+6]-src2[i+6];
3307 dst[i+7] = src1[i+7]-src2[i+7];
3310 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median-prediction subtraction: predictor is the median of left
 * sample l, the sample above, and (l + above - above-left) & 0xFF.
 * left/left_top carry prediction state across calls. */
3313 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3321 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Butterfly helpers for the 8x8 Hadamard transform below:
 * BUTTERFLY2 writes sum/difference of two inputs into two outputs,
 * BUTTERFLY1 does the same in place, BUTTERFLYA returns |x+y| + |x-y|.
 * (Macro bodies are elided in this chunk.) */
3331 #define BUTTERFLY2(o1,o2,i1,i2) \
3335 #define BUTTERFLY1(x,y) \
3344 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD: 8x8 Hadamard transform of the difference (src - dst), summing
 * absolute transform coefficients. Rows are transformed first (three
 * butterfly stages over temp[8*i+...]), then columns, with the last column
 * stage folded into the BUTTERFLYA accumulation. */
3346 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3354 //FIXME try pointer walks
3355 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3356 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3357 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3358 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3360 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3361 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3362 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3363 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3365 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3366 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3367 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3368 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3372 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3373 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3374 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3375 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3377 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3378 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3379 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3380 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3383 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3384 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3385 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3386 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3392 printf("MAX:%d\n", maxi);
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but on the source
 * block itself (no reference subtraction); the DC term |temp[0]+temp[32]|
 * is subtracted at the end so the score ignores the block mean. */
3398 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3406 //FIXME try pointer walks
3407 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3408 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3409 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3410 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3412 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3413 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3414 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3415 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3417 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3418 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3419 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3420 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3424 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3425 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3426 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3427 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3429 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3430 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3431 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3432 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3435 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3436 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3437 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3438 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3441 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-DCT the 8x8 pixel difference and return the sum
 * of absolute coefficients via dsp.sum_abs_dctelem. (The fdct call between
 * diff_pixels and sum_abs_dctelem is elided in this chunk.) */
3446 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3447 MpegEncContext * const s= (MpegEncContext *)c;
3448 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3449 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3453 s->dsp.diff_pixels(temp, src1, src2, stride);
3455 return s->dsp.sum_abs_dctelem(temp);
/* Body of the H.264-style integer 8-point 1-D DCT macro (the #define line
 * itself is elided in this chunk): even part via s/a butterflies, odd part
 * via the d07..d34 differences with >>1 scaling, results written through
 * DST(). DST(0) and DST(4) lines are also elided — verify upstream. */
3460 const int s07 = SRC(0) + SRC(7);\
3461 const int s16 = SRC(1) + SRC(6);\
3462 const int s25 = SRC(2) + SRC(5);\
3463 const int s34 = SRC(3) + SRC(4);\
3464 const int a0 = s07 + s34;\
3465 const int a1 = s16 + s25;\
3466 const int a2 = s07 - s34;\
3467 const int a3 = s16 - s25;\
3468 const int d07 = SRC(0) - SRC(7);\
3469 const int d16 = SRC(1) - SRC(6);\
3470 const int d25 = SRC(2) - SRC(5);\
3471 const int d34 = SRC(3) - SRC(4);\
3472 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3473 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3474 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3475 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3477 DST(1, a4 + (a7>>2)) ;\
3478 DST(2, a2 + (a3>>1)) ;\
3479 DST(3, a5 + (a6>>2)) ;\
3481 DST(5, a6 - (a5>>2)) ;\
3482 DST(6, (a2>>1) - a3 ) ;\
3483 DST(7, (a4>>2) - a7 ) ;\
/* H.264 transform SAD: apply the integer DCT8_1D to rows of the pixel
 * difference, then to columns, accumulating |coef| through the second
 * DST() redefinition. SRC/DST macros are rebound for each pass. */
3486 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3487 MpegEncContext * const s= (MpegEncContext *)c;
3492 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3494 #define SRC(x) dct[i][x]
3495 #define DST(x,v) dct[i][x]= v
3496 for( i = 0; i < 8; i++ )
3501 #define SRC(x) dct[x][i]
3502 #define DST(x,v) sum += FFABS(v)
3503 for( i = 0; i < 8; i++ )
/* DCT-domain max metric: forward-DCT the 8x8 difference and return the
 * largest absolute coefficient (loop header elided in this chunk). */
3511 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3512 MpegEncContext * const s= (MpegEncContext *)c;
3513 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3514 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3519 s->dsp.diff_pixels(temp, src1, src2, stride);
3523 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-noise metric: DCT the difference, keep a copy in 'bak',
 * quantize + dequantize + IDCT it, and return the squared error between the
 * round-tripped coefficients and the original — i.e. the distortion the
 * quantizer at s->qscale would introduce. */
3528 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3529 MpegEncContext * const s= (MpegEncContext *)c;
3530 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3531 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3532 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3538 s->dsp.diff_pixels(temp, src1, src2, stride);
3540 memcpy(bak, temp, 64*sizeof(DCTELEM));
3542 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3543 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3544 ff_simple_idct(temp); //FIXME
3547 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion cost of coding the 8x8 block at s->qscale:
 * backs up src2 into 'bak', DCTs + quantizes the difference, counts the VLC
 * bits of the resulting run/level pairs (esc_length for out-of-table
 * levels), reconstructs into bak and measures SSE against src1. Returned
 * cost = distortion + lambda-ish bit term (qscale^2 * 109/128).
 * NOTE(review): 'distoration' is a long-standing typo for 'distortion';
 * also aligned_bak is a VLA sized by 'stride'. Left untouched here since
 * interior lines of this chunk are elided. */
3552 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3553 MpegEncContext * const s= (MpegEncContext *)c;
3554 const uint8_t *scantable= s->intra_scantable.permutated;
3555 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3556 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3557 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3558 uint8_t * const bak= (uint8_t*)aligned_bak;
3559 int i, last, run, bits, level, distoration, start_i;
3560 const int esc_length= s->ac_esc_length;
3562 uint8_t * last_length;
3567 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3568 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3571 s->dsp.diff_pixels(temp, src1, src2, stride);
3573 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3579 length = s->intra_ac_vlc_length;
3580 last_length= s->intra_ac_vlc_last_length;
3581 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3584 length = s->inter_ac_vlc_length;
3585 last_length= s->inter_ac_vlc_last_length;
3590 for(i=start_i; i<last; i++){
3591 int j= scantable[i];
3596 if((level&(~127)) == 0){
3597 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3606 level= temp[i] + 64;
3610 if((level&(~127)) == 0){
3611 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3619 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3621 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3624 s->dsp.idct_add(bak, stride, temp);
3626 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3628 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Pure rate metric: same DCT + quantize + VLC bit-counting as rd8x8_c but
 * without the reconstruction/distortion part — returns only the bit count. */
3631 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3632 MpegEncContext * const s= (MpegEncContext *)c;
3633 const uint8_t *scantable= s->intra_scantable.permutated;
3634 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3635 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3636 int i, last, run, bits, level, start_i;
3637 const int esc_length= s->ac_esc_length;
3639 uint8_t * last_length;
3643 s->dsp.diff_pixels(temp, src1, src2, stride);
3645 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3651 length = s->intra_ac_vlc_length;
3652 last_length= s->intra_ac_vlc_last_length;
3653 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3656 length = s->inter_ac_vlc_length;
3657 last_length= s->inter_ac_vlc_last_length;
3662 for(i=start_i; i<last; i++){
3663 int j= scantable[i];
3668 if((level&(~127)) == 0){
3669 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3678 level= temp[i] + 64;
3682 if((level&(~127)) == 0){
3683 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical SAD, intra: sum |s[x] - s[x+stride]| over a 16-wide block —
 * measures vertical activity of the source itself (no reference). */
3691 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3696 for(x=0; x<16; x+=4){
3697 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride])
3698 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
/* Vertical SAD of the difference signal: sum of absolute vertical gradients
 * of (s1 - s2) over a 16-wide block. */
3706 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3711 for(x=0; x<16; x++){
3712 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ(a) = a squared; used by the vertical-SSE metrics below. */
3721 #define SQ(a) ((a)*(a))
/* Vertical SSE, intra: squared vertical gradients of the source block. */
3722 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3727 for(x=0; x<16; x+=4){
3728 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3729 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
/* Vertical SSE of the difference signal (squared counterpart of vsad16_c). */
3737 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3742 for(x=0; x<16; x++){
3743 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 and an int16 array of 'size'
 * elements (used by the snow encoder's residual search). */
3752 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3756 for(i=0; i<size; i++)
3757 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* Instantiate 16x16 wrappers for each 8x8 metric: WARPER8_16_SQ calls the
 * 8x8 function on the four quadrants and sums the results. */
3761 WARPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3762 WARPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3763 WARPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3765 WARPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3767 WARPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3768 WARPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3769 WARPER8_16_SQ(rd8x8_c, rd16_c)
3770 WARPER8_16_SQ(bit8x8_c, bit16_c)
/* In-place element-wise float multiply: dst[i] *= src[i] (body line elided). */
3772 static void vector_fmul_c(float *dst, const float *src, int len){
3774 for(i=0; i<len; i++)
/* dst[i] = src0[i] * src1 read backwards. The visible body indexes src1[-i],
 * which matches upstream where src1 is pre-advanced to its last element
 * before the loop (that adjustment line is elided here). */
3778 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3781 for(i=0; i<len; i++)
3782 dst[i] = src0[i] * src1[-i];
/* Fused multiply-add with stride: dst[i*step] = src0[i]*src1[i] + src2[i]
 * + src3 (src3 is a scalar int bias added to every element). */
3785 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
3787 for(i=0; i<len; i++)
3788 dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
/* Float -> int16 conversion using IEEE-754 bit tricks: reads each float's
 * raw bits via an int32_t* cast (a strict-aliasing violation by modern C
 * rules — kept as-is; it relies on compiler behavior of the era) and clamps
 * against the bit pattern 0x43c0ffff before biasing by 0x8000. */
3791 void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
3793 for(i=0; i<len; i++) {
3794 int_fast32_t tmp = ((int32_t*)src)[i];
3796 tmp = (0x43c0ffff - tmp)>>31;
3797 // is this faster on some gcc/cpu combinations?
3798 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3801 dst[i] = tmp - 0x8000;
/* Fixed-point cosine constants (2048*sqrt(2)*cos(k*pi/16)) for the WMV2
 * IDCT below. NOTE(review): the rows/cols code also uses W0, whose #define
 * is not visible in this chunk (presumably 2048, elided) — verify upstream. */
3806 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3807 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3808 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3809 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3810 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3811 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3812 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/* One row of the WMV2 8-point IDCT in 2048-scaled fixed point: odd-part
 * (a1,a3,a5,a7) and even-part (a0,a2,a4,a6) butterflies, s1/s2 are the
 * sqrt(1/2) cross terms (181/256 ~= 1/sqrt(2)), outputs rounded >>8. */
3814 static void wmv2_idct_row(short * b)
3817 int a0,a1,a2,a3,a4,a5,a6,a7;
3819 a1 = W1*b[1]+W7*b[7];
3820 a7 = W7*b[1]-W1*b[7];
3821 a5 = W5*b[5]+W3*b[3];
3822 a3 = W3*b[5]-W5*b[3];
3823 a2 = W2*b[2]+W6*b[6];
3824 a6 = W6*b[2]-W2*b[6];
3825 a0 = W0*b[0]+W0*b[4];
3826 a4 = W0*b[0]-W0*b[4];
3828 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3829 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3831 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3832 b[1] = (a4+a6 +s1 + (1<<7))>>8;
3833 b[2] = (a4-a6 +s2 + (1<<7))>>8;
3834 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3835 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3836 b[5] = (a4-a6 -s2 + (1<<7))>>8;
3837 b[6] = (a4+a6 -s1 + (1<<7))>>8;
3838 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One column (stride 8) of the WMV2 IDCT: same butterfly structure as the
 * row pass but with +4>>3 pre-rounding for extended precision and a final
 * >>14 to undo the combined row+column scaling. */
3840 static void wmv2_idct_col(short * b)
3843 int a0,a1,a2,a3,a4,a5,a6,a7;
3844 /*step 1, with extended precision*/
3845 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3846 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3847 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3848 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3849 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3850 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3851 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
3852 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
3854 s1 = (181*(a1-a5+a7-a3)+128)>>8;
3855 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3857 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3858 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
3859 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
3860 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
3862 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
3863 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
3864 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
3865 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 8x8 WMV2 IDCT: all 8 rows, then all 8 columns (loop headers elided
 * in this chunk — presumably i stepping by 8 for rows, by 1 for columns). */
3867 void ff_wmv2_idct_c(short * block){
3871 wmv2_idct_row(block+i);
3874 wmv2_idct_col(block+i);
3877 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* idct_put / idct_add adapters for the WMV2 IDCT: transform, then either
 * store clamped pixels or add them to the destination. */
3879 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
3881 ff_wmv2_idct_c(block);
3882 put_pixels_clamped_c(block, dest, line_size);
3884 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
3886 ff_wmv2_idct_c(block);
3887 add_pixels_clamped_c(block, dest, line_size);
/* put/add adapters for the jpeg-reference IDCT at full, 1/2, 1/4 and 1/8
 * resolution (the j_rev_dct* calls between each signature and its
 * put/add_pixels_clamped* call are elided in this chunk). The idct1
 * variants handle the single-DC-coefficient case directly, with +4>>3
 * rounding through the crop table. */
3889 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
3892 put_pixels_clamped_c(block, dest, line_size);
3894 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
3897 add_pixels_clamped_c(block, dest, line_size);
3900 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
3903 put_pixels_clamped4_c(block, dest, line_size);
3905 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
3908 add_pixels_clamped4_c(block, dest, line_size);
3911 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
3914 put_pixels_clamped2_c(block, dest, line_size);
3916 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
3919 add_pixels_clamped2_c(block, dest, line_size);
3922 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
3924 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3926 dest[0] = cm[(block[0] + 4)>>3];
3928 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
3930 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
3932 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
3935 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
3937 /* init static data */
/* One-time init of global lookup tables: the clamping table ff_cropTbl
 * (identity in the middle, 0/255 saturation at the ends), the squared-
 * difference table ff_squareTbl, and the inverse zigzag map (1-based). */
3938 void dsputil_static_init(void)
3942 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
3943 for(i=0;i<MAX_NEG_CROP;i++) {
3945 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
3948 for(i=0;i<512;i++) {
3949 ff_squareTbl[i] = (i - 256) * (i - 256);
3952 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Verify the compiler honours 16-byte stack alignment (needed by SSE/
 * AltiVec code): if a DECLARE_ALIGNED_16 local is misaligned, warn once.
 * did_fail latches so the message is not repeated. */
3955 int ff_check_alignment(void){
3956 static int did_fail=0;
3957 DECLARE_ALIGNED_16(int, aligned);
3959 if((long)&aligned & 15){
3961 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
3962 av_log(NULL, AV_LOG_ERROR,
3963 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
3964 "and may be very slow or crash. This is not a bug in libavcodec,\n"
3965 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
3966 "Do not report crashes to FFmpeg developers.\n");
/* Populate a DSPContext with the C reference implementations, then let each
 * enabled architecture (MMX, ARMv4l, VIS, PPC, ...) override entries with
 * optimized versions, and finally build the IDCT coefficient permutation
 * table matching the selected IDCT. Many branches/lines are elided in this
 * chunk; section comments below mark the visible groups. */
3975 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
3979 ff_check_alignment();
/* --- forward DCT selection (encoder only) --- */
3981 #ifdef CONFIG_ENCODERS
3982 if(avctx->dct_algo==FF_DCT_FASTINT) {
3983 c->fdct = fdct_ifast;
3984 c->fdct248 = fdct_ifast248;
3986 else if(avctx->dct_algo==FF_DCT_FAAN) {
3987 c->fdct = ff_faandct;
3988 c->fdct248 = ff_faandct248;
3991 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
3992 c->fdct248 = ff_fdct248_islow;
3994 #endif //CONFIG_ENCODERS
/* --- inverse DCT selection; lowres 1..3 use downscaled IDCTs --- */
3996 if(avctx->lowres==1){
3997 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
3998 c->idct_put= ff_jref_idct4_put;
3999 c->idct_add= ff_jref_idct4_add;
4001 c->idct_put= ff_h264_lowres_idct_put_c;
4002 c->idct_add= ff_h264_lowres_idct_add_c;
4004 c->idct = j_rev_dct4;
4005 c->idct_permutation_type= FF_NO_IDCT_PERM;
4006 }else if(avctx->lowres==2){
4007 c->idct_put= ff_jref_idct2_put;
4008 c->idct_add= ff_jref_idct2_add;
4009 c->idct = j_rev_dct2;
4010 c->idct_permutation_type= FF_NO_IDCT_PERM;
4011 }else if(avctx->lowres==3){
4012 c->idct_put= ff_jref_idct1_put;
4013 c->idct_add= ff_jref_idct1_add;
4014 c->idct = j_rev_dct1;
4015 c->idct_permutation_type= FF_NO_IDCT_PERM;
4017 if(avctx->idct_algo==FF_IDCT_INT){
4018 c->idct_put= ff_jref_idct_put;
4019 c->idct_add= ff_jref_idct_add;
4020 c->idct = j_rev_dct;
4021 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4022 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4023 avctx->idct_algo==FF_IDCT_VP3){
4024 c->idct_put= ff_vp3_idct_put_c;
4025 c->idct_add= ff_vp3_idct_add_c;
4026 c->idct = ff_vp3_idct_c;
4027 c->idct_permutation_type= FF_NO_IDCT_PERM;
4028 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4029 c->idct_put= ff_wmv2_idct_put_c;
4030 c->idct_add= ff_wmv2_idct_add_c;
4031 c->idct = ff_wmv2_idct_c;
4032 c->idct_permutation_type= FF_NO_IDCT_PERM;
4033 }else{ //accurate/default
4034 c->idct_put= ff_simple_idct_put;
4035 c->idct_add= ff_simple_idct_add;
4036 c->idct = ff_simple_idct;
4037 c->idct_permutation_type= FF_NO_IDCT_PERM;
4041 if (ENABLE_H264_DECODER) {
4042 c->h264_idct_add= ff_h264_idct_add_c;
4043 c->h264_idct8_add= ff_h264_idct8_add_c;
4044 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4045 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
/* --- pixel access / block helpers --- */
4048 c->get_pixels = get_pixels_c;
4049 c->diff_pixels = diff_pixels_c;
4050 c->put_pixels_clamped = put_pixels_clamped_c;
4051 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4052 c->add_pixels_clamped = add_pixels_clamped_c;
4053 c->add_pixels8 = add_pixels8_c;
4054 c->add_pixels4 = add_pixels4_c;
4055 c->sum_abs_dctelem = sum_abs_dctelem_c;
4058 c->clear_blocks = clear_blocks_c;
4059 c->pix_sum = pix_sum_c;
4060 c->pix_norm1 = pix_norm1_c;
/* --- half-pel SAD table: [0]=16x16, [1]=8x8; columns 0..3 = full/x2/y2/xy2 --- */
4062 /* TODO [0] 16 [1] 8 */
4063 c->pix_abs[0][0] = pix_abs16_c;
4064 c->pix_abs[0][1] = pix_abs16_x2_c;
4065 c->pix_abs[0][2] = pix_abs16_y2_c;
4066 c->pix_abs[0][3] = pix_abs16_xy2_c;
4067 c->pix_abs[1][0] = pix_abs8_c;
4068 c->pix_abs[1][1] = pix_abs8_x2_c;
4069 c->pix_abs[1][2] = pix_abs8_y2_c;
4070 c->pix_abs[1][3] = pix_abs8_xy2_c;
/* --- half-pel put/avg function tables --- */
4072 #define dspfunc(PFX, IDX, NUM) \
4073 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4074 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4075 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4076 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4078 dspfunc(put, 0, 16);
4079 dspfunc(put_no_rnd, 0, 16);
4081 dspfunc(put_no_rnd, 1, 8);
4085 dspfunc(avg, 0, 16);
4086 dspfunc(avg_no_rnd, 0, 16);
4088 dspfunc(avg_no_rnd, 1, 8);
4093 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4094 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
/* --- third-pel (SVQ3) motion compensation --- */
4096 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4097 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4098 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4099 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4100 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4101 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4102 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4103 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4104 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4106 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4107 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4108 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4109 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4110 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4111 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4112 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4113 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4114 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
/* --- quarter-pel motion compensation, 16 positions per table --- */
4116 #define dspfunc(PFX, IDX, NUM) \
4117 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4118 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4119 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4120 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4121 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4122 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4123 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4124 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4125 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4126 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4127 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4128 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4129 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4130 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4131 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4132 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4134 dspfunc(put_qpel, 0, 16);
4135 dspfunc(put_no_rnd_qpel, 0, 16);
4137 dspfunc(avg_qpel, 0, 16);
4138 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4140 dspfunc(put_qpel, 1, 8);
4141 dspfunc(put_no_rnd_qpel, 1, 8);
4143 dspfunc(avg_qpel, 1, 8);
4144 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4146 dspfunc(put_h264_qpel, 0, 16);
4147 dspfunc(put_h264_qpel, 1, 8);
4148 dspfunc(put_h264_qpel, 2, 4);
4149 dspfunc(put_h264_qpel, 3, 2);
4150 dspfunc(avg_h264_qpel, 0, 16);
4151 dspfunc(avg_h264_qpel, 1, 8);
4152 dspfunc(avg_h264_qpel, 2, 4);
/* --- H.264 chroma MC and weighted prediction --- */
4155 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4156 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4157 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4158 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4159 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4160 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4161 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4163 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4164 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4165 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4166 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4167 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4168 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4169 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4170 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4171 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4172 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4173 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4174 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4175 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4176 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4177 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4178 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4179 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4180 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4181 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4182 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
/* --- codec-specific sub-inits --- */
4184 #ifdef CONFIG_CAVS_DECODER
4185 ff_cavsdsp_init(c,avctx);
4187 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4188 ff_vc1dsp_init(c,avctx);
4190 #if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4191 ff_intrax8dsp_init(c,avctx);
4193 #if defined(CONFIG_H264_ENCODER)
4194 ff_h264dspenc_init(c,avctx);
4197 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4198 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4199 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4200 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4201 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4202 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4203 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4204 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* --- compare-function tables ([0]=16x16, [1]=8x8, [4]=intra 16) --- */
4206 #define SET_CMP_FUNC(name) \
4207 c->name[0]= name ## 16_c;\
4208 c->name[1]= name ## 8x8_c;
4210 SET_CMP_FUNC(hadamard8_diff)
4211 c->hadamard8_diff[4]= hadamard8_intra16_c;
4212 SET_CMP_FUNC(dct_sad)
4213 SET_CMP_FUNC(dct_max)
4215 SET_CMP_FUNC(dct264_sad)
4217 c->sad[0]= pix_abs16_c;
4218 c->sad[1]= pix_abs8_c;
4222 SET_CMP_FUNC(quant_psnr)
4225 c->vsad[0]= vsad16_c;
4226 c->vsad[4]= vsad_intra16_c;
4227 c->vsse[0]= vsse16_c;
4228 c->vsse[4]= vsse_intra16_c;
4229 c->nsse[0]= nsse16_c;
4230 c->nsse[1]= nsse8_c;
4231 #ifdef CONFIG_SNOW_ENCODER
4232 c->w53[0]= w53_16_c;
4234 c->w97[0]= w97_16_c;
4238 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4240 c->add_bytes= add_bytes_c;
4241 c->diff_bytes= diff_bytes_c;
4242 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4243 c->bswap_buf= bswap_buf;
/* --- loop filters --- */
4245 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4246 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4247 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4248 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4249 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4250 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4251 c->h264_loop_filter_strength= NULL;
4253 if (ENABLE_ANY_H263) {
4254 c->h263_h_loop_filter= h263_h_loop_filter_c;
4255 c->h263_v_loop_filter= h263_v_loop_filter_c;
4258 c->h261_loop_filter= h261_loop_filter_c;
4260 c->try_8x8basis= try_8x8basis_c;
4261 c->add_8x8basis= add_8x8basis_c;
4263 #ifdef CONFIG_SNOW_DECODER
4264 c->vertical_compose97i = ff_snow_vertical_compose97i;
4265 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4266 c->inner_add_yblock = ff_snow_inner_add_yblock;
4269 #ifdef CONFIG_VORBIS_DECODER
4270 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4272 #ifdef CONFIG_FLAC_ENCODER
4273 c->flac_compute_autocorr = ff_flac_compute_autocorr;
4275 c->vector_fmul = vector_fmul_c;
4276 c->vector_fmul_reverse = vector_fmul_reverse_c;
4277 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4278 c->float_to_int16 = ff_float_to_int16_c;
4280 c->shrink[0]= ff_img_copy_plane;
4281 c->shrink[1]= ff_shrink22;
4282 c->shrink[2]= ff_shrink44;
4283 c->shrink[3]= ff_shrink88;
4285 c->prefetch= just_return;
4287 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4288 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
/* --- per-architecture overrides; each may replace any entry above --- */
4290 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4291 if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
4292 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4293 if (ENABLE_VIS) dsputil_init_vis (c, avctx);
4294 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4295 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4296 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4297 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4298 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
/* 2tap qpel entries not provided by the arch code fall back to h264 qpel */
4300 for(i=0; i<64; i++){
4301 if(!c->put_2tap_qpel_pixels_tab[0][i])
4302 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4303 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4304 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
/* --- build idct_permutation[] for the chosen permutation type --- */
4307 switch(c->idct_permutation_type){
4308 case FF_NO_IDCT_PERM:
4310 c->idct_permutation[i]= i;
4312 case FF_LIBMPEG2_IDCT_PERM:
4314 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4316 case FF_SIMPLE_IDCT_PERM:
4318 c->idct_permutation[i]= simple_mmx_permutation[i];
4320 case FF_TRANSPOSE_IDCT_PERM:
4322 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4324 case FF_PARTTRANS_IDCT_PERM:
4326 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4329 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");